gt-ai-os-community/apps/tenant-backend/app/services/document_summarizer.py

"""
Document Summarization Service for GT 2.0

Generates AI-powered summaries for uploaded documents using the Resource Cluster.
Provides both quick summaries and detailed analysis for RAG visualization.
"""

import logging
import asyncio
import httpx
from typing import Dict, Any, Optional, List
from datetime import datetime

from app.core.database import get_db_session, execute_command, fetch_one

logger = logging.getLogger(__name__)


class DocumentSummarizer:
    """
    Service for generating document summaries using Resource Cluster LLM.

    Features:
    - Quick document summaries (2-3 sentences)
    - Detailed analysis with key topics and themes
    - Metadata extraction (document type, language, etc.)
    - Integration with document processor workflow
    """

    def __init__(self):
        self.resource_cluster_url = "http://gentwo-resource-backend:8000"
        self.max_content_length = 4000  # Max chars to send for summarization

    async def generate_document_summary(
        self,
        document_id: str,
        content: str,
        filename: str,
        tenant_domain: str,
        user_id: str
    ) -> Dict[str, Any]:
        """
        Generate a comprehensive summary for a document.

        Args:
            document_id: Document ID in the database
            content: Document text content
            filename: Original filename
            tenant_domain: Tenant domain for context
            user_id: User who uploaded the document

        Returns:
            Dictionary with summary data including quick_summary, detailed_analysis,
            topics, metadata, and confidence scores
        """
        try:
            # Truncate content if too long
            truncated_content = content[:self.max_content_length]
            if len(content) > self.max_content_length:
                truncated_content += "... [content truncated]"

            # Generate summary using Resource Cluster LLM
            summary_data = await self._call_llm_for_summary(
                content=truncated_content,
                filename=filename,
                document_type=self._detect_document_type(filename)
            )

            # Store summary in database
            await self._store_document_summary(
                document_id=document_id,
                summary_data=summary_data,
                tenant_domain=tenant_domain,
                user_id=user_id
            )

            logger.info(f"Generated summary for document {document_id}: {filename}")
            return summary_data

        except Exception as e:
            logger.error(f"Failed to generate summary for document {document_id}: {e}")
            # Return basic fallback summary
            return {
                "quick_summary": f"Document: {filename}",
                "detailed_analysis": "Summary generation failed",
                "topics": [],
                "metadata": {
                    "document_type": self._detect_document_type(filename),
                    "estimated_read_time": len(content) // 200,  # ~200 words per minute
                    "character_count": len(content),
                    "language": "unknown"
                },
                "confidence": 0.0,
                "error": str(e)
            }

    async def _call_llm_for_summary(
        self,
        content: str,
        filename: str,
        document_type: str
    ) -> Dict[str, Any]:
        """Call Resource Cluster LLM to generate document summary"""

        prompt = f"""Analyze this {document_type} document and provide a comprehensive summary.

Document: {filename}
Content:
{content}

Please provide:
1. A concise 2-3 sentence summary
2. Key topics and themes (list)
3. Document analysis including tone, purpose, and target audience
4. Estimated language and reading level

Format your response as JSON with these keys:
- quick_summary: Brief 2-3 sentence overview
- detailed_analysis: Paragraph with deeper insights
- topics: Array of key topics/themes
- metadata: Object with language, tone, purpose, target_audience
- confidence: Float 0-1 indicating analysis confidence"""

        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    f"{self.resource_cluster_url}/api/v1/ai/chat/completions",
                    json={
                        "model": "llama-3.1-8b-instant",
                        "messages": [
                            {
                                "role": "system",
                                "content": "You are a document analysis expert. Provide accurate, concise summaries in valid JSON format."
                            },
                            {
                                "role": "user",
                                "content": prompt
                            }
                        ],
                        "temperature": 0.3,
                        "max_tokens": 1000
                    },
                    headers={
                        "X-Tenant-ID": "default",
                        "Content-Type": "application/json"
                    }
                )

                if response.status_code == 200:
                    llm_response = response.json()
                    content_text = llm_response["choices"][0]["message"]["content"]

                    # Try to parse JSON response
                    try:
                        import json
                        summary_data = json.loads(content_text)

                        # Validate required fields and add defaults if missing
                        return {
                            "quick_summary": summary_data.get("quick_summary", f"Analysis of {filename}"),
                            "detailed_analysis": summary_data.get("detailed_analysis", "Detailed analysis not available"),
                            "topics": summary_data.get("topics", []),
                            "metadata": {
                                **summary_data.get("metadata", {}),
                                "document_type": document_type,
                                "generated_at": datetime.utcnow().isoformat()
                            },
                            "confidence": min(1.0, max(0.0, summary_data.get("confidence", 0.7)))
                        }

                    except json.JSONDecodeError:
                        # Fallback if LLM doesn't return valid JSON
                        return {
                            "quick_summary": content_text[:200] + "..." if len(content_text) > 200 else content_text,
                            "detailed_analysis": content_text,
                            "topics": [],
                            "metadata": {
                                "document_type": document_type,
                                "generated_at": datetime.utcnow().isoformat(),
                                "note": "Summary extracted from free-form LLM response"
                            },
                            "confidence": 0.5
                        }
                else:
                    raise Exception(f"Resource Cluster API error: {response.status_code}")

        except Exception as e:
            logger.error(f"LLM summarization failed: {e}")
            raise

    async def _store_document_summary(
        self,
        document_id: str,
        summary_data: Dict[str, Any],
        tenant_domain: str,
        user_id: str
    ):
        """Store generated summary in database"""

        # Use the same database session pattern as document processor
        async with get_db_session(tenant_domain) as session:
            try:
                # Insert or update document summary
                query = """
                INSERT INTO document_summaries (
                    document_id, user_id, quick_summary, detailed_analysis,
                    topics, metadata, confidence, created_at, updated_at
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                ON CONFLICT (document_id)
                DO UPDATE SET
                    quick_summary = EXCLUDED.quick_summary,
                    detailed_analysis = EXCLUDED.detailed_analysis,
                    topics = EXCLUDED.topics,
                    metadata = EXCLUDED.metadata,
                    confidence = EXCLUDED.confidence,
                    updated_at = EXCLUDED.updated_at
                """

                import json
                await execute_command(
                    session,
                    query,
                    document_id,
                    user_id,
                    summary_data["quick_summary"],
                    summary_data["detailed_analysis"],
                    json.dumps(summary_data["topics"]),
                    json.dumps(summary_data["metadata"]),
                    summary_data["confidence"],
                    datetime.utcnow(),
                    datetime.utcnow()
                )

                logger.info(f"Stored summary for document {document_id}")

            except Exception as e:
                logger.error(f"Failed to store document summary: {e}")
                raise

    def _detect_document_type(self, filename: str) -> str:
        """Detect document type from filename extension"""
        import pathlib

        extension = pathlib.Path(filename).suffix.lower()

        type_mapping = {
            '.pdf': 'PDF document',
            '.docx': 'Word document',
            '.doc': 'Word document',
            '.txt': 'Text file',
            '.md': 'Markdown document',
            '.csv': 'CSV data file',
            '.json': 'JSON data file',
            '.html': 'HTML document',
            '.htm': 'HTML document',
            '.rtf': 'Rich text document'
        }

        return type_mapping.get(extension, 'Unknown document type')

    async def get_document_summary(
        self,
        document_id: str,
        tenant_domain: str
    ) -> Optional[Dict[str, Any]]:
        """Retrieve stored document summary"""

        async with get_db_session(tenant_domain) as session:
            try:
                query = """
                SELECT quick_summary, detailed_analysis, topics, metadata,
                       confidence, created_at, updated_at
                FROM document_summaries
                WHERE document_id = $1
                """

                result = await fetch_one(session, query, document_id)

                if result:
                    import json
                    return {
                        "quick_summary": result["quick_summary"],
                        "detailed_analysis": result["detailed_analysis"],
                        "topics": json.loads(result["topics"]) if result["topics"] else [],
                        "metadata": json.loads(result["metadata"]) if result["metadata"] else {},
                        "confidence": result["confidence"],
                        "created_at": result["created_at"].isoformat(),
                        "updated_at": result["updated_at"].isoformat()
                    }

                return None

            except Exception as e:
                logger.error(f"Failed to retrieve document summary: {e}")
                return None


# Global instance
document_summarizer = DocumentSummarizer()


async def generate_document_summary(
    document_id: str,
    content: str,
    filename: str,
    tenant_domain: str,
    user_id: str
) -> Dict[str, Any]:
    """Convenience function for document summary generation"""
    return await document_summarizer.generate_document_summary(
        document_id, content, filename, tenant_domain, user_id
    )


async def get_document_summary(document_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
    """Convenience function for retrieving document summary"""
    return await document_summarizer.get_document_summary(document_id, tenant_domain)