""" GT 2.0 PostgreSQL File Storage Service Replaces MinIO with PostgreSQL-based file storage using: - BYTEA for small files (<10MB) - PostgreSQL Large Objects (LOBs) for large files (10MB-1GB) - Filesystem with metadata for massive files (>1GB) Provides perfect tenant isolation through PostgreSQL schemas. """ import asyncio import json import logging import os import hashlib import mimetypes from typing import Dict, Any, List, Optional, BinaryIO, AsyncIterator, Tuple from datetime import datetime, timedelta from pathlib import Path import aiofiles from fastapi import UploadFile from app.core.postgresql_client import get_postgresql_client from app.core.config import get_settings from app.core.permissions import ADMIN_ROLES from app.core.path_security import sanitize_tenant_domain, sanitize_filename, safe_join_path logger = logging.getLogger(__name__) class PostgreSQLFileService: """PostgreSQL-based file storage service with tenant isolation""" # Storage type thresholds SMALL_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB - use BYTEA LARGE_FILE_THRESHOLD = 1024 * 1024 * 1024 # 1GB - use LOBs def __init__(self, tenant_domain: str, user_id: str, user_role: str = "user"): self.tenant_domain = tenant_domain self.user_id = user_id self.user_role = user_role self.settings = get_settings() # Filesystem path for massive files (>1GB) # Sanitize tenant_domain to prevent path traversal safe_tenant = sanitize_tenant_domain(tenant_domain) self.filesystem_base = Path("/data") / safe_tenant / "files" # codeql[py/path-injection] sanitize_tenant_domain() validates input self.filesystem_base.mkdir(parents=True, exist_ok=True, mode=0o700) logger.info(f"PostgreSQL file service initialized for {tenant_domain}/{user_id} (role: {user_role})") async def store_file( self, file: UploadFile, dataset_id: Optional[str] = None, category: str = "documents" ) -> Dict[str, Any]: """Store file using appropriate PostgreSQL strategy""" try: logger.info(f"PostgreSQL file service: storing file {file.filename} for tenant {self.tenant_domain}, user {self.user_id}") logger.info(f"Dataset ID: {dataset_id}, Category: {category}") # Read file content content = await file.read() file_size = len(content) # Generate file metadata file_hash = hashlib.sha256(content).hexdigest()[:16] content_type = file.content_type or mimetypes.guess_type(file.filename)[0] or "application/octet-stream" # Handle different file types with appropriate processing if file_size <= self.SMALL_FILE_THRESHOLD and content_type.startswith('text/'): # Small text files stored directly storage_type = "text" storage_ref = "content_text" try: text_content = content.decode('utf-8') except UnicodeDecodeError: text_content = content.decode('latin-1') # Fallback encoding elif content_type == 'application/pdf': # PDF files: extract text content, store binary separately storage_type = "pdf_extracted" storage_ref = "content_text" text_content = await self._extract_pdf_text(content) else: # Other binary files: store as base64 for now import base64 storage_type = "base64" storage_ref = "content_text" text_content = base64.b64encode(content).decode('utf-8') # Get PostgreSQL client logger.info("Getting PostgreSQL client") pg_client = await get_postgresql_client() # Always expect user_id to be a UUID string - no email lookups logger.info(f"Using user UUID: {self.user_id}") # Validate user_id is a valid UUID format try: import uuid user_uuid = str(uuid.UUID(self.user_id)) except (ValueError, TypeError) as e: logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}") raise 
ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}") logger.info(f"Validated user UUID: {user_uuid}") # 1. Validate user_uuid is present if not user_uuid: raise ValueError("User UUID is required but not found") # 2. Validate and clean dataset_id dataset_uuid_param = None if dataset_id and dataset_id.strip() and dataset_id != "": try: import uuid dataset_uuid_param = str(uuid.UUID(dataset_id.strip())) logger.info(f"Dataset UUID validated: {dataset_uuid_param}") except ValueError as e: logger.error(f"Invalid dataset UUID: {dataset_id}, error: {e}") raise ValueError(f"Invalid dataset ID format: {dataset_id}") else: logger.info("No dataset_id provided, using NULL") # 3. Validate file content and metadata if not file.filename or not file.filename.strip(): raise ValueError("Filename cannot be empty") if not content: raise ValueError("File content cannot be empty") # 4. Generate and validate all string parameters safe_filename = f"{file_hash}_{file.filename}" safe_original_filename = file.filename.strip() safe_content_type = content_type or "application/octet-stream" safe_file_hash = file_hash safe_metadata = json.dumps({ "storage_type": storage_type, "storage_ref": storage_ref, "category": category }) logger.info(f"All parameters validated:") logger.info(f" user_uuid: {user_uuid}") logger.info(f" dataset_uuid: {dataset_uuid_param}") logger.info(f" filename: {safe_filename}") logger.info(f" original_filename: {safe_original_filename}") logger.info(f" file_type: {safe_content_type}") logger.info(f" file_size: {file_size}") logger.info(f" file_hash: {safe_file_hash}") # Store metadata in documents table (using existing schema) try: # Application user now has BYPASSRLS privilege - no RLS context needed logger.info("Storing document with BYPASSRLS privilege") # Require dataset_id for all document uploads if not dataset_uuid_param: raise ValueError("dataset_id is required for document uploads") logger.info(f"Storing document with dataset_id: {dataset_uuid_param}") logger.info(f"Document details: {safe_filename} ({file_size} bytes)") # Insert with dataset_id # Determine if content is searchable (under PostgreSQL tsvector size limit) is_searchable = text_content is None or len(text_content) < 1048575 async with pg_client.get_connection() as conn: # Get tenant_id for the document tenant_id = await conn.fetchval(""" SELECT id FROM tenants WHERE domain = $1 LIMIT 1 """, self.tenant_domain) if not tenant_id: raise ValueError(f"Tenant not found for domain: {self.tenant_domain}") document_id = await conn.fetchval(""" INSERT INTO documents ( tenant_id, user_id, dataset_id, filename, original_filename, file_type, file_size_bytes, file_hash, content_text, processing_status, metadata, is_searchable, created_at, updated_at ) VALUES ( $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, 'pending', $10, $11, NOW(), NOW() ) RETURNING id """, tenant_id, user_uuid, dataset_uuid_param, safe_filename, safe_original_filename, safe_content_type, file_size, safe_file_hash, text_content, safe_metadata, is_searchable ) logger.info(f"Document inserted successfully with ID: {document_id}") except Exception as db_error: logger.error(f"Database insertion failed: {db_error}") logger.error(f"Tenant domain: {self.tenant_domain}") logger.error(f"User ID: {self.user_id}") logger.error(f"Dataset ID: {dataset_id}") raise result = { "id": document_id, "filename": file.filename, "content_type": content_type, "file_size": file_size, "file_hash": file_hash, "storage_type": storage_type, "storage_ref": storage_ref, 
"upload_timestamp": datetime.utcnow().isoformat(), "download_url": f"/api/v1/files/{document_id}" } logger.info(f"Stored file {file.filename} ({file_size} bytes) as {storage_type} for user {self.user_id}") # Trigger document processing pipeline for RAG functionality try: await self._trigger_document_processing(document_id, dataset_id, user_uuid, file.filename) logger.info(f"Successfully triggered document processing for {document_id}") except Exception as process_error: logger.error(f"Failed to trigger document processing for {document_id}: {process_error}") # Update document status to show processing failed try: pg_client = await get_postgresql_client() await pg_client.execute_command( "UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2", f"Processing failed: {str(process_error)}", document_id ) except Exception as update_error: logger.error(f"Failed to update document status after processing error: {update_error}") # Don't fail the upload if processing trigger fails - user can retry manually return result except Exception as e: logger.error(f"Failed to store file {file.filename}: {e}") raise finally: # Ensure content is cleared from memory if 'content' in locals(): del content async def _store_as_bytea( self, content: bytes, filename: str, content_type: str, file_hash: str, dataset_id: Optional[str], category: str ) -> str: """Store small file as BYTEA in documents table""" pg_client = await get_postgresql_client() # Store file content directly in BYTEA column # This will be handled by the main insert in store_file return "bytea_column" async def _store_as_lob( self, content: bytes, filename: str, content_type: str, file_hash: str, dataset_id: Optional[str], category: str ) -> str: """Store large file as PostgreSQL Large Object""" pg_client = await get_postgresql_client() # Create Large Object and get OID async with pg_client.get_connection() as conn: # Start transaction for LOB operations async with conn.transaction(): # Create LOB and get OID lob_oid = await conn.fetchval("SELECT lo_create(0)") # Open LOB for writing lob_fd = await conn.fetchval("SELECT lo_open($1, 131072)", lob_oid) # INV_WRITE mode # Write content in chunks for memory efficiency chunk_size = 8192 offset = 0 for i in range(0, len(content), chunk_size): chunk = content[i:i + chunk_size] await conn.execute("SELECT lo_write($1, $2)", lob_fd, chunk) offset += len(chunk) # Close LOB await conn.execute("SELECT lo_close($1)", lob_fd) logger.info(f"Created PostgreSQL LOB with OID {lob_oid} for {filename}") return str(lob_oid) async def _store_as_filesystem( self, content: bytes, filename: str, content_type: str, file_hash: str, dataset_id: Optional[str], category: str ) -> str: """Store massive file on filesystem with PostgreSQL metadata""" # Create secure file path with user isolation user_dir = self.filesystem_base / self.user_id / category if dataset_id: user_dir = user_dir / dataset_id user_dir.mkdir(parents=True, exist_ok=True, mode=0o700) # Generate secure filename timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") secure_filename = f"{timestamp}_{file_hash}_{filename}" file_path = user_dir / secure_filename # Write file with secure permissions async with aiofiles.open(file_path, 'wb') as f: await f.write(content) # Set secure file permissions os.chmod(file_path, 0o600) logger.info(f"Stored large file on filesystem: {file_path}") return str(file_path) async def get_file(self, document_id: str) -> AsyncIterator[bytes]: """Stream file content by document ID""" try: pg_client = await 
            pg_client = await get_postgresql_client()

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Get document metadata using UUID directly
            # Admins can access any document in their tenant, regular users only their own
            if self.user_role in ADMIN_ROLES:
                doc_info = await pg_client.fetch_one("""
                    SELECT metadata, file_size_bytes, filename, content_text
                    FROM documents d
                    WHERE d.id = $1
                      AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                """, document_id, self.tenant_domain)
            else:
                doc_info = await pg_client.fetch_one("""
                    SELECT metadata, file_size_bytes, filename, content_text
                    FROM documents
                    WHERE id = $1 AND user_id = $2::uuid
                """, document_id, user_uuid)

            if not doc_info:
                raise FileNotFoundError(f"Document {document_id} not found")

            # Get storage info from metadata - handle JSON string or dict
            metadata_raw = doc_info["metadata"] or "{}"
            if isinstance(metadata_raw, str):
                metadata = json.loads(metadata_raw)
            else:
                metadata = metadata_raw or {}
            storage_type = metadata.get("storage_type", "text")

            if storage_type == "text":
                # Text content stored directly
                if doc_info["content_text"]:
                    content_bytes = doc_info["content_text"].encode('utf-8')
                    async for chunk in self._stream_from_bytea(content_bytes):
                        yield chunk
                else:
                    raise FileNotFoundError("Document content not found")

            elif storage_type == "base64":
                # Base64 encoded binary content
                if doc_info["content_text"]:
                    import base64
                    content_bytes = base64.b64decode(doc_info["content_text"])
                    async for chunk in self._stream_from_bytea(content_bytes):
                        yield chunk
                else:
                    raise FileNotFoundError("Document content not found")

            elif storage_type == "lob":
                # Stream from PostgreSQL LOB
                storage_ref = metadata.get("storage_ref", "")
                async for chunk in self._stream_from_lob(int(storage_ref)):
                    yield chunk

            elif storage_type == "filesystem":
                # Stream from filesystem
                storage_ref = metadata.get("storage_ref", "")
                async for chunk in self._stream_from_filesystem(storage_ref):
                    yield chunk

            else:
                # Default: treat as text content
                if doc_info["content_text"]:
                    content_bytes = doc_info["content_text"].encode('utf-8')
                    async for chunk in self._stream_from_bytea(content_bytes):
                        yield chunk
                else:
                    raise FileNotFoundError("Document content not found")

        except Exception as e:
            logger.error(f"Failed to get file {document_id}: {e}")
            raise

    async def _stream_from_bytea(self, content: bytes) -> AsyncIterator[bytes]:
        """Stream content from BYTEA in chunks"""
        chunk_size = 8192
        for i in range(0, len(content), chunk_size):
            yield content[i:i + chunk_size]

    async def _stream_from_lob(self, lob_oid: int) -> AsyncIterator[bytes]:
        """Stream content from PostgreSQL Large Object"""
        pg_client = await get_postgresql_client()

        async with pg_client.get_connection() as conn:
            async with conn.transaction():
                # Open LOB for reading
                lob_fd = await conn.fetchval("SELECT lo_open($1, 262144)", lob_oid)  # INV_READ mode

                # Stream in chunks
                chunk_size = 8192
                while True:
                    chunk = await conn.fetchval("SELECT lo_read($1, $2)", lob_fd, chunk_size)
                    if not chunk:
                        break
                    yield chunk

                # Close LOB
                await conn.execute("SELECT lo_close($1)", lob_fd)

    async def _stream_from_filesystem(self, file_path: str) -> AsyncIterator[bytes]:
        """Stream content from filesystem"""
        # Verify file belongs to tenant (security check); resolve the path so the
        # containment check cannot be bypassed with ".." segments or prefix collisions
        path_obj = Path(file_path).resolve()
        if not path_obj.is_relative_to(self.filesystem_base.resolve()):
PermissionError("Access denied to file") if not path_obj.exists(): raise FileNotFoundError(f"File not found: {file_path}") async with aiofiles.open(file_path, 'rb') as f: chunk_size = 8192 while True: chunk = await f.read(chunk_size) if not chunk: break yield chunk async def delete_file(self, document_id: str) -> bool: """Delete file and metadata""" try: pg_client = await get_postgresql_client() # Validate user_id is a valid UUID format try: import uuid user_uuid = str(uuid.UUID(self.user_id)) except (ValueError, TypeError) as e: logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}") raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}") # Get document info before deletion # Admins can delete any document in their tenant, regular users only their own if self.user_role in ADMIN_ROLES: doc_info = await pg_client.fetch_one(""" SELECT storage_type, storage_ref FROM documents d WHERE d.id = $1 AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2) """, document_id, self.tenant_domain) else: doc_info = await pg_client.fetch_one(""" SELECT storage_type, storage_ref FROM documents WHERE id = $1 AND user_id = $2::uuid """, document_id, user_uuid) if not doc_info: logger.warning(f"Document {document_id} not found for deletion") return False storage_type = doc_info["storage_type"] storage_ref = doc_info["storage_ref"] # Delete file content based on storage type if storage_type == "lob": # Delete LOB async with pg_client.get_connection() as conn: await conn.execute("SELECT lo_unlink($1)", int(storage_ref)) elif storage_type == "filesystem": # Delete filesystem file try: path_obj = Path(storage_ref) if path_obj.exists(): path_obj.unlink() except Exception as e: logger.warning(f"Failed to delete filesystem file {storage_ref}: {e}") # BYTEA files are deleted with the row # Delete metadata record if self.user_role in ADMIN_ROLES: deleted = await pg_client.execute_command(""" DELETE FROM documents d WHERE d.id = $1 AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2) """, document_id, self.tenant_domain) else: deleted = await pg_client.execute_command(""" DELETE FROM documents WHERE id = $1 AND user_id = $2::uuid """, document_id, user_uuid) if deleted > 0: logger.info(f"Deleted file {document_id} ({storage_type})") return True else: return False except Exception as e: logger.error(f"Failed to delete file {document_id}: {e}") return False async def get_file_info(self, document_id: str) -> Dict[str, Any]: """Get file metadata""" try: pg_client = await get_postgresql_client() # Validate user_id is a valid UUID format try: import uuid user_uuid = str(uuid.UUID(self.user_id)) except (ValueError, TypeError) as e: logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}") raise ValueError(f"Invalid user ID format. 
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Admins can access any document metadata in their tenant, regular users only their own
            # Note: ->> is used so storage_type/category come back as plain text, not JSON
            if self.user_role in ADMIN_ROLES:
                doc_info = await pg_client.fetch_one("""
                    SELECT id, filename, original_filename,
                           file_type as content_type,
                           file_size_bytes as file_size,
                           file_hash, dataset_id,
                           metadata->>'storage_type' as storage_type,
                           metadata->>'category' as category,
                           created_at
                    FROM documents d
                    WHERE d.id = $1
                      AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                """, document_id, self.tenant_domain)
            else:
                doc_info = await pg_client.fetch_one("""
                    SELECT id, filename, original_filename,
                           file_type as content_type,
                           file_size_bytes as file_size,
                           file_hash, dataset_id,
                           metadata->>'storage_type' as storage_type,
                           metadata->>'category' as category,
                           created_at
                    FROM documents
                    WHERE id = $1 AND user_id = $2::uuid
                """, document_id, user_uuid)

            if not doc_info:
                raise FileNotFoundError(f"Document {document_id} not found")

            return {
                "id": doc_info["id"],
                "filename": doc_info["filename"],
                "original_filename": doc_info["original_filename"],
                "content_type": doc_info["content_type"],
                "file_size": doc_info["file_size"],
                "file_hash": doc_info["file_hash"],
                "dataset_id": str(doc_info["dataset_id"]) if doc_info["dataset_id"] else None,
                "storage_type": doc_info["storage_type"],
                "category": doc_info["category"],
                "created_at": doc_info["created_at"].isoformat(),
                "download_url": f"/api/v1/files/{document_id}"
            }

        except Exception as e:
            logger.error(f"Failed to get file info for {document_id}: {e}")
            raise

    async def list_files(
        self,
        dataset_id: Optional[str] = None,
        category: str = "documents",
        limit: int = 50,
        offset: int = 0
    ) -> List[Dict[str, Any]]:
        """List user files with optional filtering"""
        try:
            pg_client = await get_postgresql_client()

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Build permission-aware query
            # Admins can list any documents in their tenant
            # Regular users can list documents they own OR documents in datasets they can access
            if self.user_role in ADMIN_ROLES:
                where_clauses = ["d.tenant_id = (SELECT id FROM tenants WHERE domain = $1)"]
                params = [self.tenant_domain]
                param_idx = 2
            else:
                # Non-admin users can see:
                #   1. Documents they own
                #   2. Documents in datasets with access_group = 'organization'
                #   3. Documents in datasets they're a member of (team access)
                where_clauses = ["""(
                    d.user_id = $1::uuid
                    OR EXISTS (
                        SELECT 1 FROM datasets ds
                        WHERE ds.id = d.dataset_id
                          AND ds.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                          AND (
                              ds.access_group = 'organization'
                              OR (ds.access_group = 'team' AND $1::uuid = ANY(ds.team_members))
                          )
                    )
                )"""]
                params = [user_uuid, self.tenant_domain]
                param_idx = 3

            if dataset_id:
                where_clauses.append(f"d.dataset_id = ${param_idx}::uuid")
                params.append(dataset_id)
                param_idx += 1

            if category:
                where_clauses.append(f"(d.metadata->>'category' = ${param_idx} OR d.metadata->>'category' IS NULL)")
                params.append(category)
                param_idx += 1

            query = f"""
                SELECT d.id, d.filename, d.original_filename,
                       d.file_type as content_type,
                       d.file_size_bytes as file_size,
                       d.metadata->>'storage_type' as storage_type,
                       d.metadata->>'category' as category,
                       d.created_at, d.updated_at, d.dataset_id, d.processing_status,
                       d.metadata, d.user_id,
                       COUNT(dc.id) as chunk_count,
                       ds.created_by as dataset_owner_id
                FROM documents d
                LEFT JOIN document_chunks dc ON d.id = dc.document_id
                LEFT JOIN datasets ds ON d.dataset_id = ds.id
                WHERE {' AND '.join(where_clauses)}
                GROUP BY d.id, d.filename, d.original_filename, d.file_type,
                         d.file_size_bytes, d.metadata, d.created_at, d.updated_at,
                         d.dataset_id, d.processing_status, d.user_id, ds.created_by
                ORDER BY d.created_at DESC
                LIMIT ${param_idx} OFFSET ${param_idx + 1}
            """
            params.extend([limit, offset])

            files = await pg_client.execute_query(query, *params)

            # Helper function to parse metadata
            def parse_metadata(metadata_value):
                if metadata_value is None:
                    return {}
                if isinstance(metadata_value, str):
                    try:
                        return json.loads(metadata_value)
                    except (json.JSONDecodeError, ValueError):
                        return {}
                return metadata_value if isinstance(metadata_value, dict) else {}

            return [
                {
                    "id": file["id"],
                    "filename": file["filename"],
                    "original_filename": file["original_filename"],
                    "content_type": file["content_type"],
                    "file_type": file["content_type"],
                    "file_size": file["file_size"],
                    "file_size_bytes": file["file_size"],
                    "dataset_id": file["dataset_id"],
                    "storage_type": file["storage_type"],
                    "category": file["category"],
                    "created_at": file["created_at"].isoformat(),
                    "updated_at": file["updated_at"].isoformat() if file.get("updated_at") else None,
                    "processing_status": file.get("processing_status", "pending"),
                    "chunk_count": file.get("chunk_count", 0),
                    "chunks_processed": parse_metadata(file.get("metadata")).get("chunks_processed", 0),
                    "total_chunks_expected": parse_metadata(file.get("metadata")).get("total_chunks_expected", 0),
                    "processing_progress": parse_metadata(file.get("metadata")).get("processing_progress", 0),
                    "processing_stage": parse_metadata(file.get("metadata")).get("processing_stage"),
                    "download_url": f"/api/v1/files/{file['id']}",
                    # Permission flags - user can delete if:
                    #   1. They are admin, OR
                    #   2. They uploaded the document, OR
                    #   3. They own the parent dataset
                    "can_delete": (
                        self.user_role in ADMIN_ROLES
                        or str(file["user_id"]) == user_uuid  # normalize UUID to str for comparison
                        or (file.get("dataset_owner_id") and str(file["dataset_owner_id"]) == user_uuid)
                    )
                }
                for file in files
            ]

        except Exception as e:
            logger.error(f"Failed to list files for user {self.user_id}: {e}")
            return []

    async def cleanup_orphaned_files(self) -> int:
        """Clean up orphaned files and LOBs"""
        try:
            pg_client = await get_postgresql_client()
            cleanup_count = 0

            # Find orphaned LOBs (LOBs without corresponding document records)
            async with pg_client.get_connection() as conn:
                async with conn.transaction():
                    # A LOB is orphaned when no document's metadata references its OID
                    orphaned_lobs = await conn.fetch("""
                        SELECT lo.oid
                        FROM pg_largeobject_metadata lo
                        LEFT JOIN documents d
                               ON d.metadata->>'storage_type' = 'lob'
                              AND d.metadata->>'storage_ref' = lo.oid::text
                        WHERE d.id IS NULL
                    """)

                    for lob in orphaned_lobs:
                        await conn.execute("SELECT lo_unlink($1)", lob["oid"])
                        cleanup_count += 1

            # Find orphaned filesystem files
            # Note: This would require more complex logic to safely identify orphans

            logger.info(f"Cleaned up {cleanup_count} orphaned files")
            return cleanup_count

        except Exception as e:
            logger.error(f"Failed to cleanup orphaned files: {e}")
            return 0

    async def _trigger_document_processing(
        self,
        document_id: str,
        dataset_id: Optional[str],
        user_uuid: str,
        filename: str
    ):
        """Trigger document processing pipeline for RAG functionality"""
        try:
            # Import here to avoid circular imports
            from app.services.document_processor import get_document_processor

            logger.info(f"Triggering document processing for {document_id}")

            # Get document processor instance
            processor = await get_document_processor(tenant_domain=self.tenant_domain)

            # For documents uploaded via PostgreSQL file service, the content is already stored
            # We need to process it from the database content rather than a file path
            await self._process_document_from_database(
                processor, document_id, dataset_id, user_uuid, filename
            )

        except Exception as e:
            logger.error(f"Document processing trigger failed for {document_id}: {e}")
            # Update document status to failed
            try:
                pg_client = await get_postgresql_client()
                await pg_client.execute_command(
                    "UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
                    f"Processing trigger failed: {str(e)}", document_id
                )
            except Exception as update_error:
                logger.error(f"Failed to update document status to failed: {update_error}")
            raise

    async def _process_document_from_database(
        self,
        processor,
        document_id: str,
        dataset_id: Optional[str],
        user_uuid: str,
        filename: str
    ):
        """Process document using content already stored in database"""
        try:
            import tempfile

            # Get document content from database
            pg_client = await get_postgresql_client()

            doc_info = await pg_client.fetch_one("""
                SELECT content_text, file_type, metadata
                FROM documents
                WHERE id = $1 AND user_id = $2::uuid
            """, document_id, user_uuid)

            if not doc_info or not doc_info["content_text"]:
                raise ValueError("Document content not found in database")

            # Create temporary file with the content
            # Sanitize the file extension to prevent path injection
            safe_suffix = sanitize_filename(filename)
            safe_suffix = Path(safe_suffix).suffix if safe_suffix else ".tmp"
            # codeql[py/path-injection] safe_suffix is sanitized via sanitize_filename()
            with tempfile.NamedTemporaryFile(mode='w', suffix=safe_suffix, delete=False) as temp_file:
                # Handle different storage types - metadata might be JSON string or dict
                metadata_raw = doc_info["metadata"] or "{}"
                if isinstance(metadata_raw, str):
                    metadata = json.loads(metadata_raw)
                else:
                    metadata = metadata_raw or {}

                storage_type = metadata.get("storage_type", "text")

                if storage_type == "text":
                    temp_file.write(doc_info["content_text"])
                elif storage_type == "base64":
                    import base64
                    content_bytes = base64.b64decode(doc_info["content_text"])
                    temp_file.close()
                    with open(temp_file.name, 'wb') as binary_file:
                        binary_file.write(content_bytes)
                elif storage_type == "pdf_extracted":
                    # For PDFs with extracted text, create a placeholder text file
                    # since the actual text content is already extracted
                    temp_file.write(doc_info["content_text"])
                else:
                    temp_file.write(doc_info["content_text"])

                temp_file_path = Path(temp_file.name)

            # The with-block has closed the handle here, so the content is
            # flushed to disk before the processor reads the file
            try:
                # Process the document using the existing document processor
                await processor.process_file(
                    file_path=temp_file_path,
                    dataset_id=dataset_id,  # Keep None as None - don't convert to empty string
                    user_id=user_uuid,
                    original_filename=filename,
                    document_id=document_id  # Use existing document instead of creating new one
                )
                logger.info(f"Successfully processed document {document_id} from database content")
            finally:
                # Clean up temporary file
                try:
                    os.unlink(temp_file_path)
                except Exception as cleanup_error:
                    logger.warning(f"Failed to cleanup temporary file {temp_file_path}: {cleanup_error}")

        except Exception as e:
            logger.error(f"Failed to process document from database content: {e}")
            raise

    async def _extract_pdf_text(self, content: bytes) -> str:
        """Extract text content from PDF bytes using pypdf"""
        import io
        import pypdf as PyPDF2  # pypdf is the maintained successor to PyPDF2

        try:
            # Create BytesIO object from content
            pdf_stream = io.BytesIO(content)
            pdf_reader = PyPDF2.PdfReader(pdf_stream)

            text_parts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
                except Exception as e:
                    logger.warning(f"Could not extract text from page {page_num + 1}: {e}")

            if not text_parts:
                # If no text could be extracted, return a placeholder
                return f"PDF document with {len(pdf_reader.pages)} pages (text extraction failed)"

            extracted_text = "\n\n".join(text_parts)
            logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF with {len(pdf_reader.pages)} pages")
            return extracted_text

        except Exception as e:
            logger.error(f"PDF text extraction failed: {e}")
            # Return a fallback description instead of failing completely
            return f"PDF document (text extraction failed: {str(e)})"
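
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not wired into the application): how a
# FastAPI route might construct and use PostgreSQLFileService. The dependency
# `get_current_user` and its attributes are assumptions standing in for the
# project's real auth dependency; adjust names to match the actual app.
#
#   from fastapi import APIRouter, Depends, File, UploadFile
#
#   router = APIRouter(prefix="/api/v1/files", tags=["files"])
#
#   @router.post("/upload")
#   async def upload_file(
#       file: UploadFile = File(...),
#       dataset_id: str = None,
#       current_user=Depends(get_current_user),  # hypothetical auth dependency
#   ):
#       service = PostgreSQLFileService(
#           tenant_domain=current_user.tenant_domain,
#           user_id=str(current_user.id),
#           user_role=current_user.role,
#       )
#       # store_file validates UUIDs, requires dataset_id, and triggers the
#       # RAG processing pipeline after the document row is inserted
#       return await service.store_file(file, dataset_id=dataset_id)
# ---------------------------------------------------------------------------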