GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
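The SSRF item deserves a note: validating the URL string alone is insufficient, because an attacker-controlled hostname can still resolve to an internal address. A minimal sketch of DNS-resolution checking in that spirit, using only the Python standard library (illustrative only; is_safe_url is a hypothetical name and the actual guard shipped in this release may differ in placement and details):

import ipaddress
import socket
from urllib.parse import urlparse

def is_safe_url(url: str) -> bool:
    """Reject URLs whose hostname resolves to any non-public address."""
    hostname = urlparse(url).hostname
    if not hostname:
        return False
    try:
        # Resolve every A/AAAA record; an attacker may point only some at internal hosts.
        addr_infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for info in addr_infos:
        ip = ipaddress.ip_address(info[4][0])
        if not ip.is_global:  # private, loopback, link-local, reserved, etc.
            return False
    return True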
apps/tenant-backend/app/services/summarization_service.py (new file, 385 lines added)
@@ -0,0 +1,385 @@
"""
GT 2.0 Summarization Service

Provides AI-powered summarization capabilities for documents and datasets.
Uses the same pattern as conversation title generation with Llama 3.1 8B.
"""

import logging
import asyncio
from typing import Dict, Any, List, Optional
from datetime import datetime

from app.core.postgresql_client import get_postgresql_client
from app.core.resource_client import ResourceClusterClient
from app.core.config import get_settings

logger = logging.getLogger(__name__)

class SummarizationService:
    """
    Service for generating AI summaries of documents and datasets.

    Uses the same approach as conversation title generation:
    - Llama 3.1 8B instant model
    - Low temperature for consistency
    - Resource cluster for AI responses
    """

    def __init__(self, tenant_domain: str, user_id: str):
        self.tenant_domain = tenant_domain
        self.user_id = user_id
        self.resource_client = ResourceClusterClient()
        self.summarization_model = "llama-3.1-8b-instant"
        self.settings = get_settings()

    async def generate_document_summary(
        self,
        document_id: str,
        document_content: str,
        document_name: str
    ) -> Optional[str]:
        """
        Generate AI summary for a document using Llama 3.1 8B.

        Args:
            document_id: UUID of the document
            document_content: Full text content of the document
            document_name: Original filename/name of the document

        Returns:
            Generated summary string or None if failed
        """
        try:
            # Truncate content to first 3000 chars (like conversation title generation)
            content_preview = document_content[:3000]

            # Create summarization prompt
            prompt = f"""Summarize this document '{document_name}' in 2-3 sentences.
Focus on the main topics, key information, and purpose of the document.

Document content:
{content_preview}

Summary:"""

            logger.info(f"Generating summary for document {document_id} ({document_name})")

            # Call Resource Cluster with same pattern as conversation titles
            summary = await self._call_ai_for_summary(
                prompt=prompt,
                context_type="document",
                max_tokens=150
            )

            if summary:
                # Store summary in database
                await self._store_document_summary(document_id, summary)
                logger.info(f"Generated summary for document {document_id}: {summary[:100]}...")
                return summary
            else:
                logger.warning(f"Failed to generate summary for document {document_id}")
                return None

        except Exception as e:
            logger.error(f"Error generating document summary for {document_id}: {e}")
            return None

    async def generate_dataset_summary(self, dataset_id: str) -> Optional[str]:
        """
        Generate AI summary for a dataset based on its document summaries.

        Args:
            dataset_id: UUID of the dataset

        Returns:
            Generated dataset summary or None if failed
        """
        try:
            # Get all document summaries in this dataset
            document_summaries = await self._get_document_summaries_for_dataset(dataset_id)

            if not document_summaries:
                logger.info(f"No document summaries found for dataset {dataset_id}")
                return None

            # Get dataset name for context
            dataset_info = await self._get_dataset_info(dataset_id)
            dataset_name = dataset_info.get('name', 'Unknown Dataset') if dataset_info else 'Unknown Dataset'

            # Combine summaries for LLM context
            combined_summaries = "\n".join([
                f"- {doc['filename']}: {doc['summary']}"
                for doc in document_summaries
                if doc['summary']  # Only include docs that have summaries
            ])

            if not combined_summaries.strip():
                logger.info(f"No valid document summaries for dataset {dataset_id}")
                return None

            # Create dataset summarization prompt
            prompt = f"""Based on these document summaries, create a comprehensive 3-4 sentence summary describing what the dataset '{dataset_name}' contains and its purpose:

Documents in dataset:
{combined_summaries}

Dataset summary:"""

            logger.info(f"Generating summary for dataset {dataset_id} ({dataset_name})")

            # Call AI for dataset summary
            summary = await self._call_ai_for_summary(
                prompt=prompt,
                context_type="dataset",
                max_tokens=200
            )

            if summary:
                # Store dataset summary in database
                await self._store_dataset_summary(dataset_id, summary)
                logger.info(f"Generated dataset summary for {dataset_id}: {summary[:100]}...")
                return summary
            else:
                logger.warning(f"Failed to generate summary for dataset {dataset_id}")
                return None

        except Exception as e:
            logger.error(f"Error generating dataset summary for {dataset_id}: {e}")
            return None

    async def update_dataset_summary_on_change(self, dataset_id: str) -> bool:
        """
        Regenerate dataset summary when documents are added/removed.

        Args:
            dataset_id: UUID of the dataset to update

        Returns:
            True if summary was updated successfully
        """
        try:
            summary = await self.generate_dataset_summary(dataset_id)
            return summary is not None
        except Exception as e:
            logger.error(f"Error updating dataset summary for {dataset_id}: {e}")
            return False

    async def _call_ai_for_summary(
        self,
        prompt: str,
        context_type: str,
        max_tokens: int = 150
    ) -> Optional[str]:
        """
        Call Resource Cluster for AI summary generation.
        Uses ResourceClusterClient for consistent service discovery.

        Args:
            prompt: The summarization prompt
            context_type: Type of summary (document, dataset)
            max_tokens: Maximum tokens to generate

        Returns:
            Generated summary text or None if failed
        """
        try:
            # Prepare request payload (same format as conversation service)
            request_data = {
                "model": self.summarization_model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.3,  # Lower temperature for consistent summaries
                "max_tokens": max_tokens,
                "top_p": 1.0
            }

            logger.info(f"Calling Resource Cluster for {context_type} summary generation")

            # Use ResourceClusterClient for consistent service discovery and auth
            result = await self.resource_client.call_inference_endpoint(
                tenant_id=self.tenant_domain,
                user_id=self.user_id,
                endpoint="chat/completions",
                data=request_data
            )
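            # The response is expected to be OpenAI-compatible, roughly:
            #   {"choices": [{"message": {"content": "<summary text>"}}], ...}
            # which is the shape the parsing below relies on.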
            if result and "choices" in result and len(result["choices"]) > 0:
                summary = result["choices"][0]["message"]["content"].strip()
                logger.info(f"✅ AI {context_type} summary generated successfully: {summary[:50]}...")
                return summary
            else:
                logger.error(f"❌ Invalid AI response format for {context_type} summary: {result}")
                return None

        except Exception as e:
            logger.error(f"❌ Error calling Resource Cluster for {context_type} summary: {e}", exc_info=True)
            return None

    async def _store_document_summary(self, document_id: str, summary: str) -> None:
        """Store document summary in database"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema
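                # Note: schema_name comes from application settings, not user
                # input, so interpolating it into the SQL string is safe here;
                # all user-supplied values are bound via $n parameters.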
                await conn.execute(f"""
                    UPDATE {schema_name}.documents
                    SET summary = $1,
                        summary_generated_at = $2,
                        summary_model = $3
                    WHERE id = $4
                """, summary, datetime.now(), self.summarization_model, document_id)

        except Exception as e:
            logger.error(f"Error storing document summary for {document_id}: {e}")
            raise

    async def _store_dataset_summary(self, dataset_id: str, summary: str) -> None:
        """Store dataset summary in database"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                await conn.execute(f"""
                    UPDATE {schema_name}.datasets
                    SET summary = $1,
                        summary_generated_at = $2,
                        summary_model = $3
                    WHERE id = $4
                """, summary, datetime.now(), self.summarization_model, dataset_id)

        except Exception as e:
            logger.error(f"Error storing dataset summary for {dataset_id}: {e}")
            raise

    async def _get_document_summaries_for_dataset(self, dataset_id: str) -> List[Dict[str, Any]]:
        """Get all document summaries for a dataset"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                rows = await conn.fetch(f"""
                    SELECT id, filename, original_filename, summary, summary_generated_at
                    FROM {schema_name}.documents
                    WHERE dataset_id = $1
                      AND summary IS NOT NULL
                      AND summary != ''
                    ORDER BY created_at ASC
                """, dataset_id)

                return [dict(row) for row in rows]

        except Exception as e:
            logger.error(f"Error getting document summaries for dataset {dataset_id}: {e}")
            return []

    async def _get_dataset_info(self, dataset_id: str) -> Optional[Dict[str, Any]]:
        """Get basic dataset information"""
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                row = await conn.fetchrow(f"""
                    SELECT id, name, description
                    FROM {schema_name}.datasets
                    WHERE id = $1
                """, dataset_id)

                return dict(row) if row else None

        except Exception as e:
            logger.error(f"Error getting dataset info for {dataset_id}: {e}")
            return None

    async def get_datasets_with_summaries(self, user_id: str) -> List[Dict[str, Any]]:
        """
        Get all user-accessible datasets with their summaries.
        Used for context injection in chat.

        Args:
            user_id: UUID of the user

        Returns:
            List of datasets with summaries
        """
        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                rows = await conn.fetch(f"""
                    SELECT id, name, description, summary, summary_generated_at,
                           document_count, total_size_bytes
                    FROM {schema_name}.datasets
                    WHERE (created_by = $1::uuid
                           OR access_group IN ('team', 'organization'))
                      AND is_active = true
                    ORDER BY name ASC
                """, user_id)

                return [dict(row) for row in rows]

        except Exception as e:
            logger.error(f"Error getting datasets with summaries for user {user_id}: {e}")
            return []

    async def get_filtered_datasets_with_summaries(
        self,
        user_id: str,
        allowed_dataset_ids: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Get datasets with summaries filtered by allowed dataset IDs.
        Used for agent-aware context injection in chat.

        Args:
            user_id: User UUID string
            allowed_dataset_ids: List of dataset IDs the agent/user should see

        Returns:
            List of dataset dictionaries with summaries, filtered by allowed IDs
        """
        if not allowed_dataset_ids:
            logger.info(f"No allowed dataset IDs provided for user {user_id} - returning empty list")
            return []

        try:
            client = await get_postgresql_client()
            async with client.get_connection() as conn:
                schema_name = self.settings.postgres_schema

                # Convert dataset IDs to UUID format for query
                placeholders = ",".join(f"${i+2}::uuid" for i in range(len(allowed_dataset_ids)))
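                # For three IDs this yields "$2::uuid,$3::uuid,$4::uuid", so the
                # clause below becomes: AND id = ANY(ARRAY[$2::uuid,$3::uuid,$4::uuid]).
                # Only the numbered placeholders are interpolated, never the IDs
                # themselves, so the query stays fully parameterized.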
                query = f"""
                    SELECT id, name, description, summary, summary_generated_at,
                           document_count, total_size_bytes
                    FROM {schema_name}.datasets
                    WHERE (created_by = $1::uuid
                           OR access_group IN ('team', 'organization'))
                      AND is_active = true
                      AND id = ANY(ARRAY[{placeholders}])
                    ORDER BY name ASC
                """

                params = [user_id] + allowed_dataset_ids
                rows = await conn.fetch(query, *params)

                filtered_datasets = [dict(row) for row in rows]
                logger.info(f"Filtered datasets for user {user_id}: {len(filtered_datasets)} out of {len(allowed_dataset_ids)} requested")

                return filtered_datasets

        except Exception as e:
            logger.error(f"Error getting filtered datasets with summaries for user {user_id}: {e}")
            return []


# Factory function for dependency injection
def get_summarization_service(tenant_domain: str, user_id: str) -> SummarizationService:
    """Factory function to create SummarizationService instance"""
    return SummarizationService(tenant_domain, user_id)
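

# Example usage (illustrative sketch; the tenant domain, user ID, and document
# values below are hypothetical, and callers would normally obtain the service
# through the factory above via dependency injection):
#
#     service = get_summarization_service("acme.gt.example", current_user_id)
#     summary = await service.generate_document_summary(
#         document_id=doc_id,
#         document_content=extracted_text,
#         document_name="quarterly-report.pdf",
#     )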