""" Conversation File Service for GT 2.0 Handles conversation-scoped file attachments as a simpler alternative to dataset-based uploads. Preserves all existing dataset infrastructure while providing direct conversation file storage. """ import os import uuid import logging import asyncio from pathlib import Path from typing import Dict, Any, List, Optional from datetime import datetime from fastapi import UploadFile, HTTPException from app.core.config import get_settings from app.core.postgresql_client import get_postgresql_client from app.core.path_security import sanitize_tenant_domain from app.services.embedding_client import BGE_M3_EmbeddingClient from app.services.document_processor import DocumentProcessor logger = logging.getLogger(__name__) class ConversationFileService: """Service for managing conversation-scoped file attachments""" def __init__(self, tenant_domain: str, user_id: str): self.tenant_domain = tenant_domain self.user_id = user_id self.settings = get_settings() self.schema_name = f"tenant_{tenant_domain.replace('.', '_').replace('-', '_')}" # File storage configuration # Sanitize tenant_domain to prevent path traversal safe_tenant = sanitize_tenant_domain(tenant_domain) # codeql[py/path-injection] safe_tenant validated by sanitize_tenant_domain() self.storage_root = Path(self.settings.file_storage_path) / safe_tenant / "conversations" self.storage_root.mkdir(parents=True, exist_ok=True) logger.info(f"ConversationFileService initialized for {tenant_domain}/{user_id}") def _get_conversation_storage_path(self, conversation_id: str) -> Path: """Get storage directory for conversation files""" conv_path = self.storage_root / conversation_id conv_path.mkdir(parents=True, exist_ok=True) return conv_path def _generate_safe_filename(self, original_filename: str, file_id: str) -> str: """Generate safe filename for storage""" # Sanitize filename and prepend file ID safe_name = "".join(c for c in original_filename if c.isalnum() or c in ".-_") return f"{file_id}-{safe_name}" async def upload_files( self, conversation_id: str, files: List[UploadFile], user_id: str ) -> List[Dict[str, Any]]: """Upload files directly to conversation""" try: # Validate conversation access await self._validate_conversation_access(conversation_id, user_id) uploaded_files = [] for file in files: if not file.filename: raise HTTPException(status_code=400, detail="File must have a filename") # Generate file metadata file_id = str(uuid.uuid4()) safe_filename = self._generate_safe_filename(file.filename, file_id) conversation_path = self._get_conversation_storage_path(conversation_id) file_path = conversation_path / safe_filename # Store file to disk content = await file.read() with open(file_path, "wb") as f: f.write(content) # Create database record file_record = await self._create_file_record( file_id=file_id, conversation_id=conversation_id, original_filename=file.filename, safe_filename=safe_filename, content_type=file.content_type or "application/octet-stream", file_size=len(content), file_path=str(file_path.relative_to(Path(self.settings.file_storage_path))), uploaded_by=user_id ) uploaded_files.append(file_record) # Queue for background processing asyncio.create_task(self._process_file_embeddings(file_id)) logger.info(f"Uploaded conversation file: {file.filename} -> {file_id}") return uploaded_files except Exception as e: logger.error(f"Failed to upload conversation files: {e}") raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}") async def _get_user_uuid(self, user_email: str) -> str: """Resolve user email to UUID""" client = await get_postgresql_client() query = f"SELECT id FROM {self.schema_name}.users WHERE email = $1 LIMIT 1" result = await client.fetch_one(query, user_email) if not result: raise ValueError(f"User not found: {user_email}") return str(result['id']) async def _create_file_record( self, file_id: str, conversation_id: str, original_filename: str, safe_filename: str, content_type: str, file_size: int, file_path: str, uploaded_by: str ) -> Dict[str, Any]: """Create conversation_files database record""" client = await get_postgresql_client() # Resolve user email to UUID if needed user_uuid = uploaded_by if '@' in uploaded_by: # Check if it's an email user_uuid = await self._get_user_uuid(uploaded_by) query = f""" INSERT INTO {self.schema_name}.conversation_files ( id, conversation_id, filename, original_filename, content_type, file_size_bytes, file_path, uploaded_by, processing_status ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'pending') RETURNING id, filename, original_filename, content_type, file_size_bytes, processing_status, uploaded_at """ result = await client.fetch_one( query, file_id, conversation_id, safe_filename, original_filename, content_type, file_size, file_path, user_uuid ) # Convert UUID fields to strings for JSON serialization result_dict = dict(result) if 'id' in result_dict and result_dict['id']: result_dict['id'] = str(result_dict['id']) return result_dict async def _process_file_embeddings(self, file_id: str): """Background task to process file content and generate embeddings""" try: # Update status to processing await self._update_processing_status(file_id, "processing") # Get file record file_record = await self._get_file_record(file_id) if not file_record: logger.error(f"File record not found: {file_id}") return # Read file content file_path = Path(self.settings.file_storage_path) / file_record['file_path'] if not file_path.exists(): logger.error(f"File not found on disk: {file_path}") await self._update_processing_status(file_id, "failed") return # Extract text content using DocumentProcessor public methods processor = DocumentProcessor() text_content = await processor.extract_text_from_path( file_path, file_record['content_type'] ) if not text_content: logger.warning(f"No text content extracted from {file_record['original_filename']}") await self._update_processing_status(file_id, "completed") return # Chunk content for RAG chunks = await processor.chunk_text_simple(text_content) # Generate embeddings for full document (single embedding for semantic search) embedding_client = BGE_M3_EmbeddingClient() embeddings = await embedding_client.generate_embeddings([text_content]) if not embeddings: logger.error(f"Failed to generate embeddings for {file_id}") await self._update_processing_status(file_id, "failed") return # Update record with processed content (chunks as JSONB, embedding as vector) await self._update_file_processing_results( file_id, chunks, embeddings[0], "completed" ) logger.info(f"Successfully processed file: {file_record['original_filename']}") except Exception as e: logger.error(f"Failed to process file {file_id}: {e}") await self._update_processing_status(file_id, "failed") async def _update_processing_status(self, file_id: str, status: str): """Update file processing status""" client = await get_postgresql_client() query = f""" UPDATE {self.schema_name}.conversation_files SET processing_status = $1, processed_at = CASE WHEN $1 IN ('completed', 'failed') THEN NOW() ELSE processed_at END WHERE id = $2 """ await client.execute_query(query, status, file_id) async def _update_file_processing_results( self, file_id: str, chunks: List[str], embedding: List[float], status: str ): """Update file with processing results""" import json client = await get_postgresql_client() # Sanitize chunks: remove null bytes and other control characters # that PostgreSQL can't handle in JSONB sanitized_chunks = [ chunk.replace('\u0000', '').replace('\x00', '') for chunk in chunks ] # Convert chunks list to JSONB-compatible format chunks_json = json.dumps(sanitized_chunks) # Convert embedding to PostgreSQL vector format embedding_str = f"[{','.join(map(str, embedding))}]" query = f""" UPDATE {self.schema_name}.conversation_files SET processed_chunks = $1::jsonb, embeddings = $2::vector, processing_status = $3, processed_at = NOW() WHERE id = $4 """ await client.execute_query(query, chunks_json, embedding_str, status, file_id) async def _get_file_record(self, file_id: str) -> Optional[Dict[str, Any]]: """Get file record by ID""" client = await get_postgresql_client() query = f""" SELECT id, conversation_id, filename, original_filename, content_type, file_size_bytes, file_path, processing_status, uploaded_at FROM {self.schema_name}.conversation_files WHERE id = $1 """ result = await client.fetch_one(query, file_id) return dict(result) if result else None async def list_files(self, conversation_id: str) -> List[Dict[str, Any]]: """List files attached to conversation""" try: client = await get_postgresql_client() query = f""" SELECT id, filename, original_filename, content_type, file_size_bytes, processing_status, uploaded_at, processed_at FROM {self.schema_name}.conversation_files WHERE conversation_id = $1 ORDER BY uploaded_at DESC """ rows = await client.execute_query(query, conversation_id) return [dict(row) for row in rows] except Exception as e: logger.error(f"Failed to list conversation files: {e}") return [] async def delete_file(self, conversation_id: str, file_id: str, user_id: str, allow_post_message_deletion: bool = False) -> bool: """Delete specific file from conversation Args: conversation_id: The conversation ID file_id: The file ID to delete user_id: The user requesting deletion allow_post_message_deletion: If False, prevents deletion after messages exist (default: False) """ try: logger.info(f"DELETE FILE CALLED: file_id={file_id}, conversation_id={conversation_id}, user_id={user_id}") # Validate access await self._validate_conversation_access(conversation_id, user_id) logger.info(f"DELETE FILE: Access validated") # Check if conversation has messages (unless explicitly allowed to delete post-message) if not allow_post_message_deletion: client = await get_postgresql_client() message_check_query = f""" SELECT COUNT(*) as count FROM {self.schema_name}.messages WHERE conversation_id = $1 """ message_count_result = await client.fetch_one(message_check_query, conversation_id) message_count = message_count_result['count'] if message_count_result else 0 if message_count > 0: from fastapi import HTTPException raise HTTPException( status_code=400, detail="Cannot delete files after conversation has started. Files are part of the conversation context." ) # Get file record for cleanup file_record = await self._get_file_record(file_id) logger.info(f"DELETE FILE: file_record={file_record}") if not file_record or str(file_record['conversation_id']) != conversation_id: logger.warning(f"DELETE FILE FAILED: file not found or conversation mismatch. file_record={file_record}, expected_conv_id={conversation_id}") return False # Delete from database client = await get_postgresql_client() query = f""" DELETE FROM {self.schema_name}.conversation_files WHERE id = $1 AND conversation_id = $2 """ rows_deleted = await client.execute_command(query, file_id, conversation_id) if rows_deleted > 0: # Delete file from disk file_path = Path(self.settings.file_storage_path) / file_record['file_path'] if file_path.exists(): file_path.unlink() logger.info(f"Deleted conversation file: {file_id}") return True return False except HTTPException: raise # Re-raise HTTPException to preserve status code and message except Exception as e: logger.error(f"Failed to delete conversation file: {e}") return False async def search_conversation_files( self, conversation_id: str, query: str, max_results: int = 5 ) -> List[Dict[str, Any]]: """Search files within a conversation using vector similarity""" try: # Generate query embedding embedding_client = BGE_M3_EmbeddingClient() embeddings = await embedding_client.generate_embeddings([query]) if not embeddings: return [] query_embedding = embeddings[0] # Convert embedding to PostgreSQL vector format embedding_str = '[' + ','.join(map(str, query_embedding)) + ']' # Vector search against conversation files client = await get_postgresql_client() search_query = f""" SELECT id, filename, original_filename, processed_chunks, 1 - (embeddings <=> $1::vector) as similarity_score FROM {self.schema_name}.conversation_files WHERE conversation_id = $2 AND processing_status = 'completed' AND embeddings IS NOT NULL AND 1 - (embeddings <=> $1::vector) > 0.1 ORDER BY embeddings <=> $1::vector LIMIT $3 """ rows = await client.execute_query( search_query, embedding_str, conversation_id, max_results ) results = [] for row in rows: processed_chunks = row.get('processed_chunks', []) if not processed_chunks: continue # Handle case where processed_chunks might be returned as JSON string if isinstance(processed_chunks, str): import json processed_chunks = json.loads(processed_chunks) for idx, chunk_text in enumerate(processed_chunks): results.append({ 'id': f"{row['id']}_chunk_{idx}", 'document_id': row['id'], 'document_name': row['original_filename'], 'original_filename': row['original_filename'], 'chunk_index': idx, 'content': chunk_text, 'similarity_score': row['similarity_score'], 'source': 'conversation_file', 'source_type': 'conversation_file' }) if len(results) >= max_results: results = results[:max_results] break logger.info(f"Found {len(results)} chunks from {len(rows)} matching conversation files") return results except Exception as e: logger.error(f"Failed to search conversation files: {e}") return [] async def get_all_chunks_for_conversation( self, conversation_id: str, max_chunks_per_file: int = 50, max_total_chunks: int = 100 ) -> List[Dict[str, Any]]: """ Retrieve ALL chunks from files attached to conversation. Non-query-dependent - returns everything up to limits. Args: conversation_id: UUID of conversation max_chunks_per_file: Limit per file (enforces diversity) max_total_chunks: Total chunk limit across all files Returns: List of chunks with metadata, grouped by file """ try: client = await get_postgresql_client() query = f""" SELECT id, filename, original_filename, processed_chunks, file_size_bytes, uploaded_at FROM {self.schema_name}.conversation_files WHERE conversation_id = $1 AND processing_status = 'completed' AND processed_chunks IS NOT NULL ORDER BY uploaded_at ASC """ rows = await client.execute_query(query, conversation_id) results = [] total_chunks = 0 for row in rows: if total_chunks >= max_total_chunks: break processed_chunks = row.get('processed_chunks', []) # Handle JSON string if needed if isinstance(processed_chunks, str): import json processed_chunks = json.loads(processed_chunks) # Limit chunks per file (diversity enforcement) chunks_from_this_file = 0 for idx, chunk_text in enumerate(processed_chunks): if chunks_from_this_file >= max_chunks_per_file: break if total_chunks >= max_total_chunks: break results.append({ 'id': f"{row['id']}_chunk_{idx}", 'document_id': row['id'], 'document_name': row['original_filename'], 'original_filename': row['original_filename'], 'chunk_index': idx, 'total_chunks': len(processed_chunks), 'content': chunk_text, 'file_size_bytes': row['file_size_bytes'], 'source': 'conversation_file', 'source_type': 'conversation_file' }) chunks_from_this_file += 1 total_chunks += 1 logger.info(f"Retrieved {len(results)} total chunks from {len(rows)} conversation files") return results except Exception as e: logger.error(f"Failed to get all chunks for conversation: {e}") return [] async def _validate_conversation_access(self, conversation_id: str, user_id: str): """Validate user has access to conversation""" client = await get_postgresql_client() query = f""" SELECT id FROM {self.schema_name}.conversations WHERE id = $1 AND user_id = ( SELECT id FROM {self.schema_name}.users WHERE email = $2 LIMIT 1 ) """ result = await client.fetch_one(query, conversation_id, user_id) if not result: raise HTTPException( status_code=403, detail="Access denied: conversation not found or access denied" ) async def get_file_content(self, file_id: str, user_id: str) -> Optional[bytes]: """Get file content for download""" try: file_record = await self._get_file_record(file_id) if not file_record: return None # Validate access to conversation await self._validate_conversation_access(file_record['conversation_id'], user_id) # Read file content file_path = Path(self.settings.file_storage_path) / file_record['file_path'] if file_path.exists(): with open(file_path, "rb") as f: return f.read() return None except Exception as e: logger.error(f"Failed to get file content: {e}") return None # Factory function for service instances def get_conversation_file_service(tenant_domain: str, user_id: str) -> ConversationFileService: """Get conversation file service instance""" return ConversationFileService(tenant_domain, user_id)