Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
385 lines
14 KiB
Python
"""
|
|
GT 2.0 Summarization Service
|
|
|
|
Provides AI-powered summarization capabilities for documents and datasets.
|
|
Uses the same pattern as conversation title generation with Llama 3.1 8B.
|
|
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime
|
|
|
|
from app.core.postgresql_client import get_postgresql_client
|
|
from app.core.resource_client import ResourceClusterClient
|
|
from app.core.config import get_settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class SummarizationService:
    """
    Service for generating AI summaries of documents and datasets.

    Uses the same approach as conversation title generation:
    - Llama 3.1 8B instant model
    - Low temperature for consistency
    - Resource cluster for AI responses
    """

    def __init__(self, tenant_domain: str, user_id: str):
        self.tenant_domain = tenant_domain
        self.user_id = user_id
        self.resource_client = ResourceClusterClient()
        self.summarization_model = "llama-3.1-8b-instant"
        self.settings = get_settings()
    async def generate_document_summary(
        self,
        document_id: str,
        document_content: str,
        document_name: str
    ) -> Optional[str]:
        """
        Generate AI summary for a document using Llama 3.1 8B.

        Args:
            document_id: UUID of the document
            document_content: Full text content of the document
            document_name: Original filename/name of the document

        Returns:
            Generated summary string or None if failed
        """
        try:
            # Truncate content to first 3000 chars (like conversation title generation)
            content_preview = document_content[:3000]

            # Create summarization prompt
            prompt = f"""Summarize this document '{document_name}' in 2-3 sentences.
Focus on the main topics, key information, and purpose of the document.

Document content:
{content_preview}

Summary:"""

            logger.info(f"Generating summary for document {document_id} ({document_name})")

            # Call Resource Cluster with same pattern as conversation titles
            summary = await self._call_ai_for_summary(
                prompt=prompt,
                context_type="document",
                max_tokens=150
            )

            if summary:
                # Store summary in database
                await self._store_document_summary(document_id, summary)
                logger.info(f"Generated summary for document {document_id}: {summary[:100]}...")
                return summary
            else:
                logger.warning(f"Failed to generate summary for document {document_id}")
                return None

        except Exception as e:
            logger.error(f"Error generating document summary for {document_id}: {e}")
            return None
    async def generate_dataset_summary(self, dataset_id: str) -> Optional[str]:
        """
        Generate AI summary for a dataset based on its document summaries.

        Args:
            dataset_id: UUID of the dataset

        Returns:
            Generated dataset summary or None if failed
        """
        try:
            # Get all document summaries in this dataset
            document_summaries = await self._get_document_summaries_for_dataset(dataset_id)

            if not document_summaries:
                logger.info(f"No document summaries found for dataset {dataset_id}")
                return None

            # Get dataset name for context
            dataset_info = await self._get_dataset_info(dataset_id)
            dataset_name = dataset_info.get('name', 'Unknown Dataset') if dataset_info else 'Unknown Dataset'

            # Combine summaries for LLM context
            combined_summaries = "\n".join([
                f"- {doc['filename']}: {doc['summary']}"
                for doc in document_summaries
                if doc['summary']  # Only include docs that have summaries
            ])

            if not combined_summaries.strip():
                logger.info(f"No valid document summaries for dataset {dataset_id}")
                return None

            # Create dataset summarization prompt
            prompt = f"""Based on these document summaries, create a comprehensive 3-4 sentence summary describing what the dataset '{dataset_name}' contains and its purpose:

Documents in dataset:
{combined_summaries}

Dataset summary:"""

            logger.info(f"Generating summary for dataset {dataset_id} ({dataset_name})")

            # Call AI for dataset summary
            summary = await self._call_ai_for_summary(
                prompt=prompt,
                context_type="dataset",
                max_tokens=200
            )

            if summary:
                # Store dataset summary in database
                await self._store_dataset_summary(dataset_id, summary)
                logger.info(f"Generated dataset summary for {dataset_id}: {summary[:100]}...")
                return summary
            else:
                logger.warning(f"Failed to generate summary for dataset {dataset_id}")
                return None

        except Exception as e:
            logger.error(f"Error generating dataset summary for {dataset_id}: {e}")
            return None
    async def update_dataset_summary_on_change(self, dataset_id: str) -> bool:
        """
        Regenerate dataset summary when documents are added/removed.

        Args:
            dataset_id: UUID of the dataset to update

        Returns:
            True if summary was updated successfully
        """
        try:
            summary = await self.generate_dataset_summary(dataset_id)
            return summary is not None
        except Exception as e:
            logger.error(f"Error updating dataset summary for {dataset_id}: {e}")
            return False
    async def _call_ai_for_summary(
        self,
        prompt: str,
        context_type: str,
        max_tokens: int = 150
    ) -> Optional[str]:
        """
        Call Resource Cluster for AI summary generation.
        Uses ResourceClusterClient for consistent service discovery.

        Args:
            prompt: The summarization prompt
            context_type: Type of summary (document, dataset)
            max_tokens: Maximum tokens to generate

        Returns:
            Generated summary text or None if failed
        """
        try:
            # Prepare request payload (same format as conversation service)
            request_data = {
                "model": self.summarization_model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.3,  # Lower temperature for consistent summaries
                "max_tokens": max_tokens,
                "top_p": 1.0
            }

            logger.info(f"Calling Resource Cluster for {context_type} summary generation")

            # Use ResourceClusterClient for consistent service discovery and auth
            result = await self.resource_client.call_inference_endpoint(
                tenant_id=self.tenant_domain,
                user_id=self.user_id,
                endpoint="chat/completions",
                data=request_data
            )
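
            # The call above is expected to return an OpenAI-style chat
            # completion, which the parsing below relies on. Illustrative
            # shape (field values are examples, not real output):
            #     {"choices": [{"message": {"role": "assistant",
            #                               "content": "This document covers..."}}]}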
            if result and "choices" in result and len(result["choices"]) > 0:
                summary = result["choices"][0]["message"]["content"].strip()
                logger.info(f"✅ AI {context_type} summary generated successfully: {summary[:50]}...")
                return summary
            else:
                logger.error(f"❌ Invalid AI response format for {context_type} summary: {result}")
                return None

        except Exception as e:
            logger.error(f"❌ Error calling Resource Cluster for {context_type} summary: {e}", exc_info=True)
            return None
    async def _store_document_summary(self, document_id: str, summary: str) -> None:
        """Store document summary in database"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                await conn.execute(f"""
                    UPDATE {schema_name}.documents
                    SET summary = $1,
                        summary_generated_at = $2,
                        summary_model = $3
                    WHERE id = $4
                """, summary, datetime.now(), self.summarization_model, document_id)

        except Exception as e:
            logger.error(f"Error storing document summary for {document_id}: {e}")
            raise
    async def _store_dataset_summary(self, dataset_id: str, summary: str) -> None:
        """Store dataset summary in database"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                await conn.execute(f"""
                    UPDATE {schema_name}.datasets
                    SET summary = $1,
                        summary_generated_at = $2,
                        summary_model = $3
                    WHERE id = $4
                """, summary, datetime.now(), self.summarization_model, dataset_id)

        except Exception as e:
            logger.error(f"Error storing dataset summary for {dataset_id}: {e}")
            raise
    async def _get_document_summaries_for_dataset(self, dataset_id: str) -> List[Dict[str, Any]]:
        """Get all document summaries for a dataset"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                rows = await conn.fetch(f"""
                    SELECT id, filename, original_filename, summary, summary_generated_at
                    FROM {schema_name}.documents
                    WHERE dataset_id = $1
                      AND summary IS NOT NULL
                      AND summary != ''
                    ORDER BY created_at ASC
                """, dataset_id)

                return [dict(row) for row in rows]

        except Exception as e:
            logger.error(f"Error getting document summaries for dataset {dataset_id}: {e}")
            return []
    async def _get_dataset_info(self, dataset_id: str) -> Optional[Dict[str, Any]]:
        """Get basic dataset information"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                row = await conn.fetchrow(f"""
                    SELECT id, name, description
                    FROM {schema_name}.datasets
                    WHERE id = $1
                """, dataset_id)

                return dict(row) if row else None

        except Exception as e:
            logger.error(f"Error getting dataset info for {dataset_id}: {e}")
            return None
    async def get_datasets_with_summaries(self, user_id: str) -> List[Dict[str, Any]]:
        """
        Get all user-accessible datasets with their summaries.
        Used for context injection in chat.

        Args:
            user_id: UUID of the user

        Returns:
            List of datasets with summaries
        """
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                rows = await conn.fetch(f"""
                    SELECT id, name, description, summary, summary_generated_at,
                           document_count, total_size_bytes
                    FROM {schema_name}.datasets
                    WHERE (created_by = $1::uuid
                           OR access_group IN ('team', 'organization'))
                      AND is_active = true
                    ORDER BY name ASC
                """, user_id)
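
                # Each fetched row maps to a dict keyed by the SELECT columns
                # above, e.g. (illustrative values only):
                #     {"id": ..., "name": "Contracts", "description": "...",
                #      "summary": "...", "summary_generated_at": ...,
                #      "document_count": 12, "total_size_bytes": 3481600}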
                return [dict(row) for row in rows]

        except Exception as e:
            logger.error(f"Error getting datasets with summaries for user {user_id}: {e}")
            return []
    async def get_filtered_datasets_with_summaries(
        self,
        user_id: str,
        allowed_dataset_ids: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Get datasets with summaries filtered by allowed dataset IDs.
        Used for agent-aware context injection in chat.

        Args:
            user_id: User UUID string
            allowed_dataset_ids: List of dataset IDs the agent/user should see

        Returns:
            List of dataset dictionaries with summaries, filtered by allowed IDs
        """
        if not allowed_dataset_ids:
            logger.info(f"No allowed dataset IDs provided for user {user_id} - returning empty list")
            return []

        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                # Build one positional uuid-cast placeholder per allowed ID
                # ($2, $3, ...); $1 is reserved for the user ID
                placeholders = ",".join(f"${i+2}::uuid" for i in range(len(allowed_dataset_ids)))
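
                # For example, with three allowed IDs this yields
                # "$2::uuid,$3::uuid,$4::uuid", so the filter below becomes
                # id = ANY(ARRAY[$2::uuid,$3::uuid,$4::uuid]).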
                query = f"""
                    SELECT id, name, description, summary, summary_generated_at,
                           document_count, total_size_bytes
                    FROM {schema_name}.datasets
                    WHERE (created_by = $1::uuid
                           OR access_group IN ('team', 'organization'))
                      AND is_active = true
                      AND id = ANY(ARRAY[{placeholders}])
                    ORDER BY name ASC
                """

                params = [user_id] + allowed_dataset_ids
                rows = await conn.fetch(query, *params)

                filtered_datasets = [dict(row) for row in rows]
                logger.info(f"Filtered datasets for user {user_id}: {len(filtered_datasets)} out of {len(allowed_dataset_ids)} requested")

                return filtered_datasets

        except Exception as e:
            logger.error(f"Error getting filtered datasets with summaries for user {user_id}: {e}")
            return []


# Factory function for dependency injection
def get_summarization_service(tenant_domain: str, user_id: str) -> SummarizationService:
    """Factory function to create SummarizationService instance"""
    return SummarizationService(tenant_domain, user_id)
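

# Usage sketch (illustrative; not part of the service): a minimal example of
# wiring this service up, assuming the app.core modules are importable and
# configured for a running deployment. The tenant domain and IDs below are
# placeholders, not real values.
if __name__ == "__main__":
    async def _demo() -> None:
        service = get_summarization_service(
            "tenant.example.com",  # placeholder tenant domain
            "00000000-0000-0000-0000-000000000000",  # placeholder user UUID
        )
        summary = await service.generate_document_summary(
            document_id="00000000-0000-0000-0000-000000000001",  # placeholder
            document_content="Quarterly revenue grew 12% year over year...",
            document_name="q3-report.pdf",
        )
        print(summary)

    asyncio.run(_demo())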