GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching; see the sketch after this list)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
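
For illustration, a minimal sketch of the hostname and DNS checks described above, using only the Python standard library. The function names and the `ALLOWED_HOSTS` allowlist are placeholders, not the actual GT AI OS implementation:

```python
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # placeholder allowlist

def is_allowed_url(url: str) -> bool:
    """Exact hostname match: 'https://evil.com/api.example.com' no longer passes."""
    hostname = urlparse(url).hostname
    return hostname is not None and hostname.lower() in ALLOWED_HOSTS

def resolves_to_public_ip(hostname: str) -> bool:
    """SSRF guard: reject hostnames that resolve to private, loopback, or reserved ranges."""
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    addresses = [ipaddress.ip_address(info[4][0]) for info in infos]
    return bool(addresses) and all(addr.is_global for addr in addresses)
```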

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
HackWeasel committed 2025-12-12 17:04:45 -05:00
commit b9dfb86260
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,563 @@
"""
Conversation File Service for GT 2.0
Handles conversation-scoped file attachments as a simpler alternative to dataset-based uploads.
Preserves all existing dataset infrastructure while providing direct conversation file storage.
"""
import os
import uuid
import logging
import asyncio
import json
from pathlib import Path
from typing import Dict, Any, List, Optional
from datetime import datetime
from fastapi import UploadFile, HTTPException
from app.core.config import get_settings
from app.core.postgresql_client import get_postgresql_client
from app.core.path_security import sanitize_tenant_domain
from app.services.embedding_client import BGE_M3_EmbeddingClient
from app.services.document_processor import DocumentProcessor
logger = logging.getLogger(__name__)
class ConversationFileService:
"""Service for managing conversation-scoped file attachments"""
def __init__(self, tenant_domain: str, user_id: str):
self.tenant_domain = tenant_domain
self.user_id = user_id
self.settings = get_settings()
self.schema_name = f"tenant_{tenant_domain.replace('.', '_').replace('-', '_')}"
# File storage configuration
# Sanitize tenant_domain to prevent path traversal
safe_tenant = sanitize_tenant_domain(tenant_domain)
# codeql[py/path-injection] safe_tenant validated by sanitize_tenant_domain()
self.storage_root = Path(self.settings.file_storage_path) / safe_tenant / "conversations"
self.storage_root.mkdir(parents=True, exist_ok=True)
logger.info(f"ConversationFileService initialized for {tenant_domain}/{user_id}")
def _get_conversation_storage_path(self, conversation_id: str) -> Path:
"""Get storage directory for conversation files"""
conv_path = self.storage_root / conversation_id
conv_path.mkdir(parents=True, exist_ok=True)
return conv_path
def _generate_safe_filename(self, original_filename: str, file_id: str) -> str:
"""Generate safe filename for storage"""
# Sanitize filename and prepend file ID
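# Note: the UUID prefix keeps the stored name unique even if sanitization strips every character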
safe_name = "".join(c for c in original_filename if c.isalnum() or c in ".-_")
return f"{file_id}-{safe_name}"
async def upload_files(
self,
conversation_id: str,
files: List[UploadFile],
user_id: str
) -> List[Dict[str, Any]]:
"""Upload files directly to conversation"""
try:
# Validate conversation access
await self._validate_conversation_access(conversation_id, user_id)
uploaded_files = []
for file in files:
if not file.filename:
raise HTTPException(status_code=400, detail="File must have a filename")
# Generate file metadata
file_id = str(uuid.uuid4())
safe_filename = self._generate_safe_filename(file.filename, file_id)
conversation_path = self._get_conversation_storage_path(conversation_id)
file_path = conversation_path / safe_filename
# Store file to disk
content = await file.read()
with open(file_path, "wb") as f:
f.write(content)
# Create database record
file_record = await self._create_file_record(
file_id=file_id,
conversation_id=conversation_id,
original_filename=file.filename,
safe_filename=safe_filename,
content_type=file.content_type or "application/octet-stream",
file_size=len(content),
file_path=str(file_path.relative_to(Path(self.settings.file_storage_path))),
uploaded_by=user_id
)
uploaded_files.append(file_record)
# Queue for background processing
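# Fire-and-forget: no task reference is retained, so failures surface only via processing_status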
asyncio.create_task(self._process_file_embeddings(file_id))
logger.info(f"Uploaded conversation file: {file.filename} -> {file_id}")
return uploaded_files
except HTTPException:
raise  # Preserve intended status codes (e.g. 400 for a missing filename)
except Exception as e:
logger.error(f"Failed to upload conversation files: {e}")
raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
async def _get_user_uuid(self, user_email: str) -> str:
"""Resolve user email to UUID"""
client = await get_postgresql_client()
query = f"SELECT id FROM {self.schema_name}.users WHERE email = $1 LIMIT 1"
result = await client.fetch_one(query, user_email)
if not result:
raise ValueError(f"User not found: {user_email}")
return str(result['id'])
async def _create_file_record(
self,
file_id: str,
conversation_id: str,
original_filename: str,
safe_filename: str,
content_type: str,
file_size: int,
file_path: str,
uploaded_by: str
) -> Dict[str, Any]:
"""Create conversation_files database record"""
client = await get_postgresql_client()
# Resolve user email to UUID if needed
user_uuid = uploaded_by
if '@' in uploaded_by: # Check if it's an email
user_uuid = await self._get_user_uuid(uploaded_by)
query = f"""
INSERT INTO {self.schema_name}.conversation_files (
id, conversation_id, filename, original_filename, content_type,
file_size_bytes, file_path, uploaded_by, processing_status
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'pending')
RETURNING id, filename, original_filename, content_type, file_size_bytes,
processing_status, uploaded_at
"""
result = await client.fetch_one(
query,
file_id, conversation_id, safe_filename, original_filename,
content_type, file_size, file_path, user_uuid
)
# Convert UUID fields to strings for JSON serialization
result_dict = dict(result)
if 'id' in result_dict and result_dict['id']:
result_dict['id'] = str(result_dict['id'])
return result_dict
async def _process_file_embeddings(self, file_id: str):
"""Background task to process file content and generate embeddings"""
try:
# Update status to processing
await self._update_processing_status(file_id, "processing")
# Get file record
file_record = await self._get_file_record(file_id)
if not file_record:
logger.error(f"File record not found: {file_id}")
return
# Read file content
file_path = Path(self.settings.file_storage_path) / file_record['file_path']
if not file_path.exists():
logger.error(f"File not found on disk: {file_path}")
await self._update_processing_status(file_id, "failed")
return
# Extract text content using DocumentProcessor public methods
processor = DocumentProcessor()
text_content = await processor.extract_text_from_path(
file_path,
file_record['content_type']
)
if not text_content:
logger.warning(f"No text content extracted from {file_record['original_filename']}")
await self._update_processing_status(file_id, "completed")
return
# Chunk content for RAG
chunks = await processor.chunk_text_simple(text_content)
# Generate embeddings for full document (single embedding for semantic search)
embedding_client = BGE_M3_EmbeddingClient()
embeddings = await embedding_client.generate_embeddings([text_content])
if not embeddings:
logger.error(f"Failed to generate embeddings for {file_id}")
await self._update_processing_status(file_id, "failed")
return
# Update record with processed content (chunks as JSONB, embedding as vector)
await self._update_file_processing_results(
file_id, chunks, embeddings[0], "completed"
)
logger.info(f"Successfully processed file: {file_record['original_filename']}")
except Exception as e:
logger.error(f"Failed to process file {file_id}: {e}")
await self._update_processing_status(file_id, "failed")
async def _update_processing_status(self, file_id: str, status: str):
"""Update file processing status"""
client = await get_postgresql_client()
query = f"""
UPDATE {self.schema_name}.conversation_files
SET processing_status = $1,
processed_at = CASE WHEN $1 IN ('completed', 'failed') THEN NOW() ELSE processed_at END
WHERE id = $2
"""
await client.execute_query(query, status, file_id)
async def _update_file_processing_results(
self,
file_id: str,
chunks: List[str],
embedding: List[float],
status: str
):
"""Update file with processing results"""
client = await get_postgresql_client()
# Sanitize chunks: remove null bytes and other control characters
# that PostgreSQL can't handle in JSONB
sanitized_chunks = [
chunk.replace('\u0000', '').replace('\x00', '')
for chunk in chunks
]
# Convert chunks list to JSONB-compatible format
chunks_json = json.dumps(sanitized_chunks)
# Convert embedding to PostgreSQL vector format
embedding_str = f"[{','.join(map(str, embedding))}]"
query = f"""
UPDATE {self.schema_name}.conversation_files
SET processed_chunks = $1::jsonb,
embeddings = $2::vector,
processing_status = $3,
processed_at = NOW()
WHERE id = $4
"""
await client.execute_query(query, chunks_json, embedding_str, status, file_id)
async def _get_file_record(self, file_id: str) -> Optional[Dict[str, Any]]:
"""Get file record by ID"""
client = await get_postgresql_client()
query = f"""
SELECT id, conversation_id, filename, original_filename, content_type,
file_size_bytes, file_path, processing_status, uploaded_at
FROM {self.schema_name}.conversation_files
WHERE id = $1
"""
result = await client.fetch_one(query, file_id)
return dict(result) if result else None
async def list_files(self, conversation_id: str) -> List[Dict[str, Any]]:
"""List files attached to conversation"""
try:
client = await get_postgresql_client()
query = f"""
SELECT id, filename, original_filename, content_type, file_size_bytes,
processing_status, uploaded_at, processed_at
FROM {self.schema_name}.conversation_files
WHERE conversation_id = $1
ORDER BY uploaded_at DESC
"""
rows = await client.execute_query(query, conversation_id)
return [dict(row) for row in rows]
except Exception as e:
logger.error(f"Failed to list conversation files: {e}")
return []
async def delete_file(self, conversation_id: str, file_id: str, user_id: str, allow_post_message_deletion: bool = False) -> bool:
"""Delete specific file from conversation
Args:
conversation_id: The conversation ID
file_id: The file ID to delete
user_id: The user requesting deletion
allow_post_message_deletion: If False, prevents deletion after messages exist (default: False)
"""
try:
logger.info(f"DELETE FILE CALLED: file_id={file_id}, conversation_id={conversation_id}, user_id={user_id}")
# Validate access
await self._validate_conversation_access(conversation_id, user_id)
logger.info(f"DELETE FILE: Access validated")
# Check if conversation has messages (unless explicitly allowed to delete post-message)
if not allow_post_message_deletion:
client = await get_postgresql_client()
message_check_query = f"""
SELECT COUNT(*) as count
FROM {self.schema_name}.messages
WHERE conversation_id = $1
"""
message_count_result = await client.fetch_one(message_check_query, conversation_id)
message_count = message_count_result['count'] if message_count_result else 0
if message_count > 0:
raise HTTPException(
status_code=400,
detail="Cannot delete files after conversation has started. Files are part of the conversation context."
)
# Get file record for cleanup
file_record = await self._get_file_record(file_id)
logger.info(f"DELETE FILE: file_record={file_record}")
if not file_record or str(file_record['conversation_id']) != conversation_id:
logger.warning(f"DELETE FILE FAILED: file not found or conversation mismatch. file_record={file_record}, expected_conv_id={conversation_id}")
return False
# Delete from database
client = await get_postgresql_client()
query = f"""
DELETE FROM {self.schema_name}.conversation_files
WHERE id = $1 AND conversation_id = $2
"""
rows_deleted = await client.execute_command(query, file_id, conversation_id)
if rows_deleted > 0:
# Delete file from disk
file_path = Path(self.settings.file_storage_path) / file_record['file_path']
if file_path.exists():
file_path.unlink()
logger.info(f"Deleted conversation file: {file_id}")
return True
return False
except HTTPException:
raise # Re-raise HTTPException to preserve status code and message
except Exception as e:
logger.error(f"Failed to delete conversation file: {e}")
return False
async def search_conversation_files(
self,
conversation_id: str,
query: str,
max_results: int = 5
) -> List[Dict[str, Any]]:
"""Search files within a conversation using vector similarity"""
try:
# Generate query embedding
embedding_client = BGE_M3_EmbeddingClient()
embeddings = await embedding_client.generate_embeddings([query])
if not embeddings:
return []
query_embedding = embeddings[0]
# Convert embedding to PostgreSQL vector format
embedding_str = '[' + ','.join(map(str, query_embedding)) + ']'
# Vector search against conversation files
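# pgvector's <=> operator is cosine distance; 1 - distance converts it to cosine similarity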
client = await get_postgresql_client()
search_query = f"""
SELECT id, filename, original_filename, processed_chunks,
1 - (embeddings <=> $1::vector) as similarity_score
FROM {self.schema_name}.conversation_files
WHERE conversation_id = $2
AND processing_status = 'completed'
AND embeddings IS NOT NULL
AND 1 - (embeddings <=> $1::vector) > 0.1
ORDER BY embeddings <=> $1::vector
LIMIT $3
"""
rows = await client.execute_query(
search_query, embedding_str, conversation_id, max_results
)
results = []
for row in rows:
processed_chunks = row.get('processed_chunks', [])
if not processed_chunks:
continue
# Handle case where processed_chunks might be returned as JSON string
if isinstance(processed_chunks, str):
processed_chunks = json.loads(processed_chunks)
for idx, chunk_text in enumerate(processed_chunks):
results.append({
'id': f"{row['id']}_chunk_{idx}",
'document_id': row['id'],
'document_name': row['original_filename'],
'original_filename': row['original_filename'],
'chunk_index': idx,
'content': chunk_text,
'similarity_score': row['similarity_score'],
'source': 'conversation_file',
'source_type': 'conversation_file'
})
if len(results) >= max_results:
results = results[:max_results]
break
logger.info(f"Found {len(results)} chunks from {len(rows)} matching conversation files")
return results
except Exception as e:
logger.error(f"Failed to search conversation files: {e}")
return []
async def get_all_chunks_for_conversation(
self,
conversation_id: str,
max_chunks_per_file: int = 50,
max_total_chunks: int = 100
) -> List[Dict[str, Any]]:
"""
Retrieve ALL chunks from files attached to conversation.
Non-query-dependent - returns everything up to limits.
Args:
conversation_id: UUID of conversation
max_chunks_per_file: Limit per file (enforces diversity)
max_total_chunks: Total chunk limit across all files
Returns:
List of chunks with metadata, grouped by file
"""
try:
client = await get_postgresql_client()
query = f"""
SELECT id, filename, original_filename, processed_chunks,
file_size_bytes, uploaded_at
FROM {self.schema_name}.conversation_files
WHERE conversation_id = $1
AND processing_status = 'completed'
AND processed_chunks IS NOT NULL
ORDER BY uploaded_at ASC
"""
rows = await client.execute_query(query, conversation_id)
results = []
total_chunks = 0
for row in rows:
if total_chunks >= max_total_chunks:
break
processed_chunks = row.get('processed_chunks', [])
# Handle JSON string if needed
if isinstance(processed_chunks, str):
processed_chunks = json.loads(processed_chunks)
# Limit chunks per file (diversity enforcement)
chunks_from_this_file = 0
for idx, chunk_text in enumerate(processed_chunks):
if chunks_from_this_file >= max_chunks_per_file:
break
if total_chunks >= max_total_chunks:
break
results.append({
'id': f"{row['id']}_chunk_{idx}",
'document_id': row['id'],
'document_name': row['original_filename'],
'original_filename': row['original_filename'],
'chunk_index': idx,
'total_chunks': len(processed_chunks),
'content': chunk_text,
'file_size_bytes': row['file_size_bytes'],
'source': 'conversation_file',
'source_type': 'conversation_file'
})
chunks_from_this_file += 1
total_chunks += 1
logger.info(f"Retrieved {len(results)} total chunks from {len(rows)} conversation files")
return results
except Exception as e:
logger.error(f"Failed to get all chunks for conversation: {e}")
return []
async def _validate_conversation_access(self, conversation_id: str, user_id: str):
"""Validate user has access to conversation"""
client = await get_postgresql_client()
query = f"""
SELECT id FROM {self.schema_name}.conversations
WHERE id = $1 AND user_id = (
SELECT id FROM {self.schema_name}.users WHERE email = $2 LIMIT 1
)
"""
result = await client.fetch_one(query, conversation_id, user_id)
if not result:
raise HTTPException(
status_code=403,
detail="Access denied: conversation not found or access denied"
)
async def get_file_content(self, file_id: str, user_id: str) -> Optional[bytes]:
"""Get file content for download"""
try:
file_record = await self._get_file_record(file_id)
if not file_record:
return None
# Validate access to conversation
await self._validate_conversation_access(file_record['conversation_id'], user_id)
# Read file content
file_path = Path(self.settings.file_storage_path) / file_record['file_path']
if file_path.exists():
with open(file_path, "rb") as f:
return f.read()
return None
except Exception as e:
logger.error(f"Failed to get file content: {e}")
return None
# Factory function for service instances
def get_conversation_file_service(tenant_domain: str, user_id: str) -> ConversationFileService:
"""Get conversation file service instance"""
return ConversationFileService(tenant_domain, user_id)
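# Example wiring (illustrative sketch only; the route path, tenant domain, and
# user email below are assumptions, not part of this module):
#
#     from fastapi import APIRouter, UploadFile
#
#     router = APIRouter()
#
#     @router.post("/conversations/{conversation_id}/files")
#     async def upload_conversation_files(conversation_id: str, files: list[UploadFile]):
#         service = get_conversation_file_service("acme.example.com", "user@acme.example.com")
#         return await service.upload_files(conversation_id, files, user_id="user@acme.example.com")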