GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts: - Fix stack trace exposure in error responses - Add SSRF protection with DNS resolution checking - Implement proper URL hostname validation (replaces substring matching) - Add centralized path sanitization to prevent path traversal - Fix ReDoS vulnerability in email validation regex - Improve HTML sanitization in validation utilities - Fix capability wildcard matching in auth utilities - Update glob dependency to address CVE - Add CodeQL suppression comments for verified false positives 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00
commit b9dfb86260
746 changed files with 232071 additions and 0 deletions
--- a/apps/tenant-backend/app/services/document_summarizer.py
+++ b/apps/tenant-backend/app/services/document_summarizer.py
@@ -0,0 +1,317 @@
+"""
+Document Summarization Service for GT 2.0
+
+Generates AI-powered summaries for uploaded documents using the Resource Cluster.
+Provides both quick summaries and detailed analysis for RAG visualization.
+"""
+
+import logging
+import asyncio
+import httpx
+from typing import Dict, Any, Optional, List
+from datetime import datetime
+
+from app.core.database import get_db_session, execute_command, fetch_one
+
+# Module-level logger, named after this module via ``__name__``.
+logger = logging.getLogger(__name__)
+
+
+class DocumentSummarizer:
+    """
+    Service for generating document summaries using Resource Cluster LLM.
+
+    Features:
+    - Quick document summaries (2-3 sentences)
+    - Detailed analysis with key topics and themes
+    - Metadata extraction (document type, language, etc.)
+    - Integration with document processor workflow
+    """
+
+    def __init__(
+        self,
+        resource_cluster_url: str = "http://gentwo-resource-backend:8000",
+        max_content_length: int = 4000,
+    ):
+        """
+        Configure the summarizer.
+
+        Both parameters are optional; calling ``DocumentSummarizer()`` keeps
+        the previous hard-coded values.
+
+        Args:
+            resource_cluster_url: Base URL of the Resource Cluster backend.
+                A trailing "/" is stripped because request paths are appended
+                with their own leading "/".
+            max_content_length: Max chars of document content to send for
+                summarization.
+
+        Raises:
+            ValueError: If max_content_length is not a positive number.
+        """
+        if max_content_length <= 0:
+            raise ValueError("max_content_length must be positive")
+
+        self.resource_cluster_url = resource_cluster_url.rstrip("/")
+        self.max_content_length = max_content_length  # Max chars to send for summarization
+
+    async def generate_document_summary(
+        self,
+        document_id: str,
+        content: str,
+        filename: str,
+        tenant_domain: str,
+        user_id: str
+    ) -> Dict[str, Any]:
+        """
+        Generate a summary for a document and store it.
+
+        The content is cut to ``self.max_content_length`` characters (with a
+        truncation marker appended when it was longer), summarized through
+        ``_call_llm_for_summary`` and persisted through
+        ``_store_document_summary``. Any exception raised by those steps is
+        logged and converted into a fallback summary; this method does not
+        re-raise it.
+
+        Args:
+            document_id: Document ID in the database
+            content: Document text content
+            filename: Original filename
+            tenant_domain: Tenant domain used to open the database session
+            user_id: User who uploaded the document
+
+        Returns:
+            On success, the dictionary returned by ``_call_llm_for_summary``
+            (quick_summary, detailed_analysis, topics, metadata, confidence).
+            On failure, a fallback dictionary with the same keys, a
+            confidence of 0.0 and an additional "error" string.
+        """
+        try:
+            # Truncate content if too long
+            truncated_content = content[:self.max_content_length]
+            if len(content) > self.max_content_length:
+                truncated_content += "... [content truncated]"
+
+            # Generate summary using Resource Cluster LLM
+            summary_data = await self._call_llm_for_summary(
+                content=truncated_content,
+                filename=filename,
+                document_type=self._detect_document_type(filename)
+            )
+
+            # Store summary in database
+            await self._store_document_summary(
+                document_id=document_id,
+                summary_data=summary_data,
+                tenant_domain=tenant_domain,
+                user_id=user_id
+            )
+
+            logger.info(f"Generated summary for document {document_id}: {filename}")
+            return summary_data
+
+        except Exception as e:
+            # Outermost boundary for summarization errors: keep the traceback
+            # in the log, since the caller only receives str(e).
+            logger.error(
+                f"Failed to generate summary for document {document_id}: {e}",
+                exc_info=True
+            )
+            return self._build_fallback_summary(content, filename, e)
+
+    def _build_fallback_summary(
+        self,
+        content: Optional[str],
+        filename: str,
+        error: Exception
+    ) -> Dict[str, Any]:
+        """Build the basic summary returned when summarization fails."""
+        # A missing (None) content must not make the fallback itself raise.
+        text = content or ""
+        return {
+            "quick_summary": f"Document: {filename}",
+            "detailed_analysis": "Summary generation failed",
+            "topics": [],
+            "metadata": {
+                "document_type": self._detect_document_type(filename),
+                # Whole minutes at ~200 words per minute; counts words, not
+                # characters.
+                "estimated_read_time": len(text.split()) // 200,
+                "character_count": len(text),
+                "language": "unknown"
+            },
+            "confidence": 0.0,
+            "error": str(error)
+        }
+
+    async def _call_llm_for_summary(
+        self,
+        content: str,
+        filename: str,
+        document_type: str
+    ) -> Dict[str, Any]:
+        """
+        Call Resource Cluster LLM to generate document summary.
+
+        Posts a chat-completion request to
+        ``{self.resource_cluster_url}/api/v1/ai/chat/completions`` and turns
+        the first choice's message content into a summary dictionary.
+
+        Args:
+            content: Document text to summarize (already truncated by the caller)
+            filename: Original filename, included in the prompt
+            document_type: Human-readable document type, included in the
+                prompt and in the returned metadata
+
+        Returns:
+            Dictionary with quick_summary, detailed_analysis, topics,
+            metadata and confidence. When the model's reply contains no JSON
+            object, the raw reply is used as the analysis with confidence 0.5.
+
+        Raises:
+            Exception: Any failure (non-200 status, transport error,
+                unexpected response shape) is logged and re-raised.
+        """
+
+        prompt = f"""Analyze this {document_type} document and provide a comprehensive summary.
+
+Document: {filename}
+Content:
+{content}
+
+Please provide:
+1. A concise 2-3 sentence summary
+2. Key topics and themes (list)
+3. Document analysis including tone, purpose, and target audience
+4. Estimated language and reading level
+
+Format your response as JSON with these keys:
+- quick_summary: Brief 2-3 sentence overview
+- detailed_analysis: Paragraph with deeper insights
+- topics: Array of key topics/themes
+- metadata: Object with language, tone, purpose, target_audience
+- confidence: Float 0-1 indicating analysis confidence"""
+
+        try:
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.post(
+                    f"{self.resource_cluster_url}/api/v1/ai/chat/completions",
+                    json={
+                        "model": "llama-3.1-8b-instant",
+                        "messages": [
+                            {
+                                "role": "system",
+                                "content": "You are a document analysis expert. Provide accurate, concise summaries in valid JSON format."
+                            },
+                            {
+                                "role": "user",
+                                "content": prompt
+                            }
+                        ],
+                        "temperature": 0.3,
+                        "max_tokens": 1000
+                    },
+                    headers={
+                        "X-Tenant-ID": "default",
+                        "Content-Type": "application/json"
+                    }
+                )
+
+                if response.status_code != 200:
+                    raise RuntimeError(f"Resource Cluster API error: {response.status_code}")
+
+                llm_response = response.json()
+                content_text = llm_response["choices"][0]["message"]["content"]
+                return self._build_summary_from_llm_text(
+                    content_text, filename, document_type
+                )
+
+        except Exception as e:
+            logger.error(f"LLM summarization failed: {e}")
+            raise
+
+    def _build_summary_from_llm_text(
+        self,
+        content_text: Any,
+        filename: str,
+        document_type: str
+    ) -> Dict[str, Any]:
+        """Convert the LLM's reply text into a normalized summary dictionary."""
+        if not isinstance(content_text, str):
+            raise RuntimeError("Resource Cluster returned no message content")
+
+        generated_at = datetime.utcnow().isoformat()
+        summary_data = self._extract_json_object(content_text)
+
+        if summary_data is None:
+            # Fallback if LLM doesn't return a usable JSON object
+            return {
+                "quick_summary": content_text[:200] + "..." if len(content_text) > 200 else content_text,
+                "detailed_analysis": content_text,
+                "topics": [],
+                "metadata": {
+                    "document_type": document_type,
+                    "generated_at": generated_at,
+                    "note": "Summary extracted from free-form LLM response"
+                },
+                "confidence": 0.5
+            }
+
+        # The model is only asked for this schema, not forced to follow it,
+        # so every field is coerced to the expected type before use.
+        raw_metadata = summary_data.get("metadata")
+        metadata = raw_metadata if isinstance(raw_metadata, dict) else {}
+
+        raw_topics = summary_data.get("topics")
+        if isinstance(raw_topics, list):
+            topics = raw_topics
+        elif isinstance(raw_topics, str) and raw_topics:
+            topics = [raw_topics]
+        else:
+            topics = []
+
+        try:
+            confidence = float(summary_data.get("confidence", 0.7))
+        except (TypeError, ValueError):
+            confidence = 0.7
+
+        return {
+            "quick_summary": summary_data.get("quick_summary") or f"Analysis of {filename}",
+            "detailed_analysis": summary_data.get("detailed_analysis") or "Detailed analysis not available",
+            "topics": topics,
+            "metadata": {
+                **metadata,
+                "document_type": document_type,
+                "generated_at": generated_at
+            },
+            "confidence": min(1.0, max(0.0, confidence))
+        }
+
+    @staticmethod
+    def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
+        """Return the JSON object contained in *text*, or None if there is none."""
+        import json
+
+        # Try the whole reply first, then the outermost {...} span, which
+        # covers replies wrapped in markdown code fences or surrounding prose.
+        candidates = [text]
+        start, end = text.find("{"), text.rfind("}")
+        if 0 <= start < end:
+            candidates.append(text[start:end + 1])
+
+        for candidate in candidates:
+            try:
+                parsed = json.loads(candidate)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(parsed, dict):
+                return parsed
+        return None
+
+    async def _store_document_summary(
+        self,
+        document_id: str,
+        summary_data: Dict[str, Any],
+        tenant_domain: str,
+        user_id: str
+    ):
+        """
+        Store generated summary in database.
+
+        Inserts a row into ``document_summaries``; when a row for the same
+        ``document_id`` already exists, its summary fields and ``updated_at``
+        are overwritten while ``user_id`` and ``created_at`` are left as
+        stored. ``topics`` and ``metadata`` are written as JSON strings.
+
+        Args:
+            document_id: Document ID the summary belongs to
+            summary_data: Dictionary with quick_summary, detailed_analysis,
+                topics, metadata and confidence keys
+            tenant_domain: Tenant domain passed to ``get_db_session``
+            user_id: User recorded on a newly inserted row
+
+        Raises:
+            Exception: Any error while building or executing the statement
+                is logged and re-raised.
+        """
+        import json
+
+        # Use the same database session pattern as document processor
+        async with get_db_session(tenant_domain) as session:
+            try:
+                # Insert or update document summary
+                query = """
+                INSERT INTO document_summaries (
+                    document_id, user_id, quick_summary, detailed_analysis,
+                    topics, metadata, confidence, created_at, updated_at
+                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
+                ON CONFLICT (document_id)
+                DO UPDATE SET
+                    quick_summary = EXCLUDED.quick_summary,
+                    detailed_analysis = EXCLUDED.detailed_analysis,
+                    topics = EXCLUDED.topics,
+                    metadata = EXCLUDED.metadata,
+                    confidence = EXCLUDED.confidence,
+                    updated_at = EXCLUDED.updated_at
+                """
+
+                # Take one timestamp for both columns so a freshly inserted
+                # row has created_at == updated_at.
+                now = datetime.utcnow()
+
+                await execute_command(
+                    session,
+                    query,
+                    document_id,
+                    user_id,
+                    summary_data["quick_summary"],
+                    summary_data["detailed_analysis"],
+                    json.dumps(summary_data["topics"]),
+                    json.dumps(summary_data["metadata"]),
+                    summary_data["confidence"],
+                    now,
+                    now
+                )
+
+                logger.info(f"Stored summary for document {document_id}")
+
+            except Exception as e:
+                logger.error(f"Failed to store document summary: {e}")
+                raise
+
+    def _detect_document_type(self, filename: str) -> str:
+        """
+        Map a filename's extension to a human-readable document type.
+
+        Only the final suffix is considered and it is compared
+        case-insensitively. A missing or unrecognized extension yields
+        'Unknown document type'.
+        """
+        import pathlib
+
+        # Each label paired with the extensions that map to it.
+        labelled_extensions = (
+            ('PDF document', ('.pdf',)),
+            ('Word document', ('.docx', '.doc')),
+            ('Text file', ('.txt',)),
+            ('Markdown document', ('.md',)),
+            ('CSV data file', ('.csv',)),
+            ('JSON data file', ('.json',)),
+            ('HTML document', ('.html', '.htm')),
+            ('Rich text document', ('.rtf',)),
+        )
+
+        suffix = pathlib.Path(filename).suffix.lower()
+        for label, extensions in labelled_extensions:
+            if suffix in extensions:
+                return label
+
+        return 'Unknown document type'
+
+    async def get_document_summary(
+        self,
+        document_id: str,
+        tenant_domain: str
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve stored document summary.
+
+        Args:
+            document_id: Document ID to look up in ``document_summaries``
+            tenant_domain: Tenant domain passed to ``get_db_session``
+
+        Returns:
+            Dictionary with quick_summary, detailed_analysis, topics,
+            metadata, confidence and ISO-formatted created_at / updated_at,
+            or None when no row is found or the lookup raises (the error is
+            logged, not re-raised).
+        """
+
+        async with get_db_session(tenant_domain) as session:
+            try:
+                sql = """
+                SELECT quick_summary, detailed_analysis, topics, metadata,
+                       confidence, created_at, updated_at
+                FROM document_summaries
+                WHERE document_id = $1
+                """
+
+                row = await fetch_one(session, sql, document_id)
+
+                # Guard clause: nothing stored for this document.
+                if not row:
+                    return None
+
+                import json
+
+                # topics / metadata are decoded from JSON text; empty values
+                # fall back to an empty list / dict.
+                return {
+                    "quick_summary": row["quick_summary"],
+                    "detailed_analysis": row["detailed_analysis"],
+                    "topics": [] if not row["topics"] else json.loads(row["topics"]),
+                    "metadata": {} if not row["metadata"] else json.loads(row["metadata"]),
+                    "confidence": row["confidence"],
+                    "created_at": row["created_at"].isoformat(),
+                    "updated_at": row["updated_at"].isoformat()
+                }
+
+            except Exception as e:
+                logger.error(f"Failed to retrieve document summary: {e}")
+                return None
+
+
+# Global instance, created when this module is imported with the default
+# constructor arguments; the module-level convenience functions below
+# delegate to it.
+document_summarizer = DocumentSummarizer()
+
+
+async def generate_document_summary(
+    document_id: str,
+    content: str,
+    filename: str,
+    tenant_domain: str,
+    user_id: str
+) -> Dict[str, Any]:
+    """
+    Generate a document summary via the shared ``document_summarizer``.
+
+    Thin module-level wrapper: forwards all arguments unchanged to
+    ``DocumentSummarizer.generate_document_summary`` and returns its result.
+    """
+    summary = await document_summarizer.generate_document_summary(
+        document_id=document_id,
+        content=content,
+        filename=filename,
+        tenant_domain=tenant_domain,
+        user_id=user_id,
+    )
+    return summary
+
+
+async def get_document_summary(document_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
+    """
+    Retrieve a stored document summary via the shared ``document_summarizer``.
+
+    Thin module-level wrapper around
+    ``DocumentSummarizer.get_document_summary``; returns whatever it returns.
+    """
+    stored_summary = await document_summarizer.get_document_summary(
+        document_id=document_id,
+        tenant_domain=tenant_domain,
+    )
+    return stored_summary