GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (see the sketch after this list)
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
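The two URL-related items above (SSRF protection and hostname validation) follow a well-known pattern: parse the URL properly, compare the exact hostname instead of substring matching, and resolve it so private or loopback addresses are rejected. The sketch below is illustrative only and is not taken from this codebase; `is_safe_url` and `ALLOWED_SCHEMES` are hypothetical names.

```python
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_SCHEMES = {"http", "https"}

def is_safe_url(url: str) -> bool:
    """Reject URLs whose host resolves to private/loopback/link-local space."""
    parsed = urlparse(url)  # exact hostname parsing, not substring matching
    if parsed.scheme not in ALLOWED_SCHEMES or not parsed.hostname:
        return False
    try:
        # Check every address the hostname resolves to, not just the first.
        infos = socket.getaddrinfo(parsed.hostname, None)
    except socket.gaierror:
        return False
    for info in infos:
        try:
            ip = ipaddress.ip_address(info[4][0].split("%")[0])
        except ValueError:
            return False
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True
```

Note that a resolve-then-fetch check like this is still subject to DNS rebinding unless the resolved IP is pinned for the actual outbound request.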

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed by HackWeasel on 2025-12-12 17:04:45 -05:00 in commit b9dfb86260
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,883 @@
"""
GT 2.0 PostgreSQL File Storage Service
Replaces MinIO with PostgreSQL-based file storage using:
- BYTEA for small files (<10MB)
- PostgreSQL Large Objects (LOBs) for large files (10MB-1GB)
- Filesystem with metadata for massive files (>1GB)
Provides perfect tenant isolation through PostgreSQL schemas.
"""
import asyncio
import json
import logging
import os
import hashlib
import mimetypes
from typing import Dict, Any, List, Optional, BinaryIO, AsyncIterator, Tuple
from datetime import datetime, timedelta
from pathlib import Path
import aiofiles
from fastapi import UploadFile
from app.core.postgresql_client import get_postgresql_client
from app.core.config import get_settings
from app.core.permissions import ADMIN_ROLES
from app.core.path_security import sanitize_tenant_domain, sanitize_filename, safe_join_path
logger = logging.getLogger(__name__)
class PostgreSQLFileService:
"""PostgreSQL-based file storage service with tenant isolation"""
# Storage type thresholds
SMALL_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB - use BYTEA
LARGE_FILE_THRESHOLD = 1024 * 1024 * 1024 # 1GB - use LOBs
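# These thresholds map onto the storage helpers defined later in this class:
#   size <= SMALL_FILE_THRESHOLD  -> _store_as_bytea (inline in the documents table)
#   size <= LARGE_FILE_THRESHOLD  -> _store_as_lob (PostgreSQL Large Object)
#   anything larger               -> _store_as_filesystem (file on disk + metadata row)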
def __init__(self, tenant_domain: str, user_id: str, user_role: str = "user"):
self.tenant_domain = tenant_domain
self.user_id = user_id
self.user_role = user_role
self.settings = get_settings()
# Filesystem path for massive files (>1GB)
# Sanitize tenant_domain to prevent path traversal
safe_tenant = sanitize_tenant_domain(tenant_domain)
self.filesystem_base = Path("/data") / safe_tenant / "files" # codeql[py/path-injection] sanitize_tenant_domain() validates input
self.filesystem_base.mkdir(parents=True, exist_ok=True, mode=0o700)
logger.info(f"PostgreSQL file service initialized for {tenant_domain}/{user_id} (role: {user_role})")
async def store_file(
self,
file: UploadFile,
dataset_id: Optional[str] = None,
category: str = "documents"
) -> Dict[str, Any]:
"""Store file using appropriate PostgreSQL strategy"""
try:
logger.info(f"PostgreSQL file service: storing file {file.filename} for tenant {self.tenant_domain}, user {self.user_id}")
logger.info(f"Dataset ID: {dataset_id}, Category: {category}")
# Read file content
content = await file.read()
file_size = len(content)
# Generate file metadata
file_hash = hashlib.sha256(content).hexdigest()[:16]
content_type = file.content_type or mimetypes.guess_type(file.filename)[0] or "application/octet-stream"
# Handle different file types with appropriate processing
if file_size <= self.SMALL_FILE_THRESHOLD and content_type.startswith('text/'):
# Small text files stored directly
storage_type = "text"
storage_ref = "content_text"
try:
text_content = content.decode('utf-8')
except UnicodeDecodeError:
text_content = content.decode('latin-1') # Fallback encoding
elif content_type == 'application/pdf':
# PDF files: extract text content, store binary separately
storage_type = "pdf_extracted"
storage_ref = "content_text"
text_content = await self._extract_pdf_text(content)
else:
# Other binary files: store as base64 for now
import base64
storage_type = "base64"
storage_ref = "content_text"
text_content = base64.b64encode(content).decode('utf-8')
# Get PostgreSQL client
logger.info("Getting PostgreSQL client")
pg_client = await get_postgresql_client()
# Always expect user_id to be a UUID string - no email lookups
logger.info(f"Using user UUID: {self.user_id}")
# Validate user_id is a valid UUID format
try:
import uuid
user_uuid = str(uuid.UUID(self.user_id))
except (ValueError, TypeError) as e:
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
logger.info(f"Validated user UUID: {user_uuid}")
# 1. Validate user_uuid is present
if not user_uuid:
raise ValueError("User UUID is required but not found")
# 2. Validate and clean dataset_id
dataset_uuid_param = None
if dataset_id and dataset_id.strip() and dataset_id != "":
try:
import uuid
dataset_uuid_param = str(uuid.UUID(dataset_id.strip()))
logger.info(f"Dataset UUID validated: {dataset_uuid_param}")
except ValueError as e:
logger.error(f"Invalid dataset UUID: {dataset_id}, error: {e}")
raise ValueError(f"Invalid dataset ID format: {dataset_id}")
else:
logger.info("No dataset_id provided, using NULL")
# 3. Validate file content and metadata
if not file.filename or not file.filename.strip():
raise ValueError("Filename cannot be empty")
if not content:
raise ValueError("File content cannot be empty")
# 4. Generate and validate all string parameters
safe_filename = f"{file_hash}_{file.filename}"
safe_original_filename = file.filename.strip()
safe_content_type = content_type or "application/octet-stream"
safe_file_hash = file_hash
safe_metadata = json.dumps({
"storage_type": storage_type,
"storage_ref": storage_ref,
"category": category
})
logger.info(f"All parameters validated:")
logger.info(f" user_uuid: {user_uuid}")
logger.info(f" dataset_uuid: {dataset_uuid_param}")
logger.info(f" filename: {safe_filename}")
logger.info(f" original_filename: {safe_original_filename}")
logger.info(f" file_type: {safe_content_type}")
logger.info(f" file_size: {file_size}")
logger.info(f" file_hash: {safe_file_hash}")
# Store metadata in documents table (using existing schema)
try:
# Application user now has BYPASSRLS privilege - no RLS context needed
logger.info("Storing document with BYPASSRLS privilege")
# Require dataset_id for all document uploads
if not dataset_uuid_param:
raise ValueError("dataset_id is required for document uploads")
logger.info(f"Storing document with dataset_id: {dataset_uuid_param}")
logger.info(f"Document details: {safe_filename} ({file_size} bytes)")
# Insert with dataset_id
# Determine if content is searchable (under PostgreSQL tsvector size limit)
is_searchable = text_content is None or len(text_content) < 1048575
async with pg_client.get_connection() as conn:
# Get tenant_id for the document
tenant_id = await conn.fetchval("""
SELECT id FROM tenants WHERE domain = $1 LIMIT 1
""", self.tenant_domain)
if not tenant_id:
raise ValueError(f"Tenant not found for domain: {self.tenant_domain}")
document_id = await conn.fetchval("""
INSERT INTO documents (
tenant_id, user_id, dataset_id, filename, original_filename,
file_type, file_size_bytes, file_hash, content_text, processing_status,
metadata, is_searchable, created_at, updated_at
) VALUES (
$1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, 'pending', $10, $11, NOW(), NOW()
)
RETURNING id
""",
tenant_id, user_uuid, dataset_uuid_param, safe_filename, safe_original_filename,
safe_content_type, file_size, safe_file_hash, text_content,
safe_metadata, is_searchable
)
logger.info(f"Document inserted successfully with ID: {document_id}")
except Exception as db_error:
logger.error(f"Database insertion failed: {db_error}")
logger.error(f"Tenant domain: {self.tenant_domain}")
logger.error(f"User ID: {self.user_id}")
logger.error(f"Dataset ID: {dataset_id}")
raise
result = {
"id": document_id,
"filename": file.filename,
"content_type": content_type,
"file_size": file_size,
"file_hash": file_hash,
"storage_type": storage_type,
"storage_ref": storage_ref,
"upload_timestamp": datetime.utcnow().isoformat(),
"download_url": f"/api/v1/files/{document_id}"
}
logger.info(f"Stored file {file.filename} ({file_size} bytes) as {storage_type} for user {self.user_id}")
# Trigger document processing pipeline for RAG functionality
try:
await self._trigger_document_processing(document_id, dataset_id, user_uuid, file.filename)
logger.info(f"Successfully triggered document processing for {document_id}")
except Exception as process_error:
logger.error(f"Failed to trigger document processing for {document_id}: {process_error}")
# Update document status to show processing failed
try:
pg_client = await get_postgresql_client()
await pg_client.execute_command(
"UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
f"Processing failed: {str(process_error)}", document_id
)
except Exception as update_error:
logger.error(f"Failed to update document status after processing error: {update_error}")
# Don't fail the upload if processing trigger fails - user can retry manually
return result
except Exception as e:
logger.error(f"Failed to store file {file.filename}: {e}")
raise
finally:
# Ensure content is cleared from memory
if 'content' in locals():
del content
async def _store_as_bytea(
self,
content: bytes,
filename: str,
content_type: str,
file_hash: str,
dataset_id: Optional[str],
category: str
) -> str:
"""Store small file as BYTEA in documents table"""
pg_client = await get_postgresql_client()
# Store file content directly in BYTEA column
# This will be handled by the main insert in store_file
return "bytea_column"
async def _store_as_lob(
self,
content: bytes,
filename: str,
content_type: str,
file_hash: str,
dataset_id: Optional[str],
category: str
) -> str:
"""Store large file as PostgreSQL Large Object"""
pg_client = await get_postgresql_client()
# Create Large Object and get OID
async with pg_client.get_connection() as conn:
# Start transaction for LOB operations
async with conn.transaction():
# Create LOB and get OID
lob_oid = await conn.fetchval("SELECT lo_create(0)")
# Open LOB for writing
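# lo_open flag values come from libpq's libpq-fs.h:
# 131072 == 0x20000 (INV_WRITE); the read path below uses 262144 == 0x40000 (INV_READ)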
lob_fd = await conn.fetchval("SELECT lo_open($1, 131072)", lob_oid) # INV_WRITE mode
# Write content in chunks for memory efficiency
chunk_size = 8192
offset = 0
for i in range(0, len(content), chunk_size):
chunk = content[i:i + chunk_size]
await conn.execute("SELECT lo_write($1, $2)", lob_fd, chunk)
offset += len(chunk)
# Close LOB
await conn.execute("SELECT lo_close($1)", lob_fd)
logger.info(f"Created PostgreSQL LOB with OID {lob_oid} for {filename}")
return str(lob_oid)
async def _store_as_filesystem(
self,
content: bytes,
filename: str,
content_type: str,
file_hash: str,
dataset_id: Optional[str],
category: str
) -> str:
"""Store massive file on filesystem with PostgreSQL metadata"""
# Create secure file path with user isolation
user_dir = self.filesystem_base / self.user_id / category
if dataset_id:
user_dir = user_dir / dataset_id
user_dir.mkdir(parents=True, exist_ok=True, mode=0o700)
# Generate secure filename
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
secure_filename = f"{timestamp}_{file_hash}_{filename}"
file_path = user_dir / secure_filename
# Write file with secure permissions
async with aiofiles.open(file_path, 'wb') as f:
await f.write(content)
# Set secure file permissions
os.chmod(file_path, 0o600)
logger.info(f"Stored large file on filesystem: {file_path}")
return str(file_path)
async def get_file(self, document_id: str) -> AsyncIterator[bytes]:
"""Stream file content by document ID"""
try:
pg_client = await get_postgresql_client()
# Validate user_id is a valid UUID format
try:
import uuid
user_uuid = str(uuid.UUID(self.user_id))
except (ValueError, TypeError) as e:
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
# Get document metadata using UUID directly
# Admins can access any document in their tenant, regular users only their own
if self.user_role in ADMIN_ROLES:
doc_info = await pg_client.fetch_one("""
SELECT metadata, file_size_bytes, filename, content_text
FROM documents d
WHERE d.id = $1
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
""", document_id, self.tenant_domain)
else:
doc_info = await pg_client.fetch_one("""
SELECT metadata, file_size_bytes, filename, content_text
FROM documents
WHERE id = $1 AND user_id = $2::uuid
""", document_id, user_uuid)
if not doc_info:
raise FileNotFoundError(f"Document {document_id} not found")
# Get storage info from metadata - handle JSON string or dict
metadata_raw = doc_info["metadata"] or "{}"
if isinstance(metadata_raw, str):
import json
metadata = json.loads(metadata_raw)
else:
metadata = metadata_raw or {}
storage_type = metadata.get("storage_type", "text")
if storage_type == "text":
# Text content stored directly
if doc_info["content_text"]:
content_bytes = doc_info["content_text"].encode('utf-8')
async for chunk in self._stream_from_bytea(content_bytes):
yield chunk
else:
raise FileNotFoundError(f"Document content not found")
elif storage_type == "base64":
# Base64 encoded binary content
if doc_info["content_text"]:
import base64
content_bytes = base64.b64decode(doc_info["content_text"])
async for chunk in self._stream_from_bytea(content_bytes):
yield chunk
else:
raise FileNotFoundError(f"Document content not found")
elif storage_type == "lob":
# Stream from PostgreSQL LOB
storage_ref = metadata.get("storage_ref", "")
async for chunk in self._stream_from_lob(int(storage_ref)):
yield chunk
elif storage_type == "filesystem":
# Stream from filesystem
storage_ref = metadata.get("storage_ref", "")
async for chunk in self._stream_from_filesystem(storage_ref):
yield chunk
else:
# Default: treat as text content
if doc_info["content_text"]:
content_bytes = doc_info["content_text"].encode('utf-8')
async for chunk in self._stream_from_bytea(content_bytes):
yield chunk
else:
raise FileNotFoundError(f"Document content not found")
except Exception as e:
logger.error(f"Failed to get file {document_id}: {e}")
raise
async def _stream_from_bytea(self, content: bytes) -> AsyncIterator[bytes]:
"""Stream content from BYTEA in chunks"""
chunk_size = 8192
for i in range(0, len(content), chunk_size):
yield content[i:i + chunk_size]
async def _stream_from_lob(self, lob_oid: int) -> AsyncIterator[bytes]:
"""Stream content from PostgreSQL Large Object"""
pg_client = await get_postgresql_client()
async with pg_client.get_connection() as conn:
async with conn.transaction():
# Open LOB for reading
lob_fd = await conn.fetchval("SELECT lo_open($1, 262144)", lob_oid) # INV_READ mode
# Stream in chunks
chunk_size = 8192
while True:
chunk = await conn.fetchval("SELECT lo_read($1, $2)", lob_fd, chunk_size)
if not chunk:
break
yield chunk
# Close LOB
await conn.execute("SELECT lo_close($1)", lob_fd)
async def _stream_from_filesystem(self, file_path: str) -> AsyncIterator[bytes]:
"""Stream content from filesystem"""
# Verify the file lives under this tenant's storage root (security check).
# Resolve the path first so ".." segments or prefix tricks
# (e.g. /data/tenant-evil vs /data/tenant) cannot bypass the check.
path_obj = Path(file_path).resolve()
base = self.filesystem_base.resolve()
if base != path_obj and base not in path_obj.parents:
raise PermissionError("Access denied to file")
if not path_obj.exists():
raise FileNotFoundError(f"File not found: {file_path}")
async with aiofiles.open(file_path, 'rb') as f:
chunk_size = 8192
while True:
chunk = await f.read(chunk_size)
if not chunk:
break
yield chunk
async def delete_file(self, document_id: str) -> bool:
"""Delete file and metadata"""
try:
pg_client = await get_postgresql_client()
# Validate user_id is a valid UUID format
try:
import uuid
user_uuid = str(uuid.UUID(self.user_id))
except (ValueError, TypeError) as e:
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
# Get document info before deletion
# Admins can delete any document in their tenant, regular users only their own
if self.user_role in ADMIN_ROLES:
doc_info = await pg_client.fetch_one("""
SELECT metadata->>'storage_type' AS storage_type, metadata->>'storage_ref' AS storage_ref FROM documents d
WHERE d.id = $1
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
""", document_id, self.tenant_domain)
else:
doc_info = await pg_client.fetch_one("""
SELECT metadata->>'storage_type' AS storage_type, metadata->>'storage_ref' AS storage_ref FROM documents
WHERE id = $1 AND user_id = $2::uuid
""", document_id, user_uuid)
if not doc_info:
logger.warning(f"Document {document_id} not found for deletion")
return False
storage_type = doc_info["storage_type"]
storage_ref = doc_info["storage_ref"]
# Delete file content based on storage type
if storage_type == "lob":
# Delete LOB
async with pg_client.get_connection() as conn:
await conn.execute("SELECT lo_unlink($1)", int(storage_ref))
elif storage_type == "filesystem":
# Delete filesystem file
try:
path_obj = Path(storage_ref)
if path_obj.exists():
path_obj.unlink()
except Exception as e:
logger.warning(f"Failed to delete filesystem file {storage_ref}: {e}")
# BYTEA files are deleted with the row
# Delete metadata record
if self.user_role in ADMIN_ROLES:
deleted = await pg_client.execute_command("""
DELETE FROM documents d
WHERE d.id = $1
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
""", document_id, self.tenant_domain)
else:
deleted = await pg_client.execute_command("""
DELETE FROM documents WHERE id = $1 AND user_id = $2::uuid
""", document_id, user_uuid)
if deleted > 0:
logger.info(f"Deleted file {document_id} ({storage_type})")
return True
else:
return False
except Exception as e:
logger.error(f"Failed to delete file {document_id}: {e}")
return False
async def get_file_info(self, document_id: str) -> Dict[str, Any]:
"""Get file metadata"""
try:
pg_client = await get_postgresql_client()
# Validate user_id is a valid UUID format
try:
import uuid
user_uuid = str(uuid.UUID(self.user_id))
except (ValueError, TypeError) as e:
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
# Admins can access any document metadata in their tenant, regular users only their own
if self.user_role in ADMIN_ROLES:
doc_info = await pg_client.fetch_one("""
SELECT id, filename, original_filename, file_type as content_type, file_size_bytes as file_size,
file_hash, dataset_id, metadata->>'storage_type' as storage_type, metadata->>'category' as category, created_at
FROM documents d
WHERE d.id = $1
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
""", document_id, self.tenant_domain)
else:
doc_info = await pg_client.fetch_one("""
SELECT id, filename, original_filename, file_type as content_type, file_size_bytes as file_size,
file_hash, dataset_id, metadata->>'storage_type' as storage_type, metadata->>'category' as category, created_at
FROM documents
WHERE id = $1 AND user_id = $2::uuid
""", document_id, user_uuid)
if not doc_info:
raise FileNotFoundError(f"Document {document_id} not found")
return {
"id": doc_info["id"],
"filename": doc_info["filename"],
"original_filename": doc_info["original_filename"],
"content_type": doc_info["content_type"],
"file_size": doc_info["file_size"],
"file_hash": doc_info["file_hash"],
"dataset_id": str(doc_info["dataset_id"]) if doc_info["dataset_id"] else None,
"storage_type": doc_info["storage_type"],
"category": doc_info["category"],
"created_at": doc_info["created_at"].isoformat(),
"download_url": f"/api/v1/files/{document_id}"
}
except Exception as e:
logger.error(f"Failed to get file info for {document_id}: {e}")
raise
async def list_files(
self,
dataset_id: Optional[str] = None,
category: str = "documents",
limit: int = 50,
offset: int = 0
) -> List[Dict[str, Any]]:
"""List user files with optional filtering"""
try:
pg_client = await get_postgresql_client()
# Validate user_id is a valid UUID format
try:
import uuid
user_uuid = str(uuid.UUID(self.user_id))
except (ValueError, TypeError) as e:
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
# Build permission-aware query
# Admins can list any documents in their tenant
# Regular users can list documents they own OR documents in datasets they can access
if self.user_role in ADMIN_ROLES:
where_clauses = ["d.tenant_id = (SELECT id FROM tenants WHERE domain = $1)"]
params = [self.tenant_domain]
param_idx = 2
else:
# Non-admin users can see:
# 1. Documents they own
# 2. Documents in datasets with access_group = 'organization'
# 3. Documents in datasets they're a member of (team access)
where_clauses = ["""(
d.user_id = $1::uuid
OR EXISTS (
SELECT 1 FROM datasets ds
WHERE ds.id = d.dataset_id
AND ds.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
AND (
ds.access_group = 'organization'
OR (ds.access_group = 'team' AND $1::uuid = ANY(ds.team_members))
)
)
)"""]
params = [user_uuid, self.tenant_domain]
param_idx = 3
if dataset_id:
where_clauses.append(f"d.dataset_id = ${param_idx}::uuid")
params.append(dataset_id)
param_idx += 1
if category:
where_clauses.append(f"(d.metadata->>'category' = ${param_idx} OR d.metadata->>'category' IS NULL)")
params.append(category)
param_idx += 1
query = f"""
SELECT d.id, d.filename, d.original_filename, d.file_type as content_type, d.file_size_bytes as file_size,
d.metadata->>'storage_type' as storage_type, d.metadata->>'category' as category, d.created_at, d.updated_at, d.dataset_id,
d.processing_status, d.metadata, d.user_id, COUNT(dc.id) as chunk_count,
ds.created_by as dataset_owner_id
FROM documents d
LEFT JOIN document_chunks dc ON d.id = dc.document_id
LEFT JOIN datasets ds ON d.dataset_id = ds.id
WHERE {' AND '.join(where_clauses)}
GROUP BY d.id, d.filename, d.original_filename, d.file_type, d.file_size_bytes, d.metadata, d.created_at, d.updated_at, d.dataset_id, d.processing_status, d.user_id, ds.created_by
ORDER BY d.created_at DESC LIMIT ${param_idx} OFFSET ${param_idx + 1}
"""
params.extend([limit, offset])
files = await pg_client.execute_query(query, *params)
# Helper function to parse metadata
def parse_metadata(metadata_value):
if metadata_value is None:
return {}
if isinstance(metadata_value, str):
import json
try:
return json.loads(metadata_value)
except (json.JSONDecodeError, ValueError):
return {}
return metadata_value if isinstance(metadata_value, dict) else {}
return [
{
"id": file["id"],
"filename": file["filename"],
"original_filename": file["original_filename"],
"content_type": file["content_type"],
"file_type": file["content_type"],
"file_size": file["file_size"],
"file_size_bytes": file["file_size"],
"dataset_id": file["dataset_id"],
"storage_type": file["storage_type"],
"category": file["category"],
"created_at": file["created_at"].isoformat(),
"updated_at": file["updated_at"].isoformat() if file.get("updated_at") else None,
"processing_status": file.get("processing_status", "pending"),
"chunk_count": file.get("chunk_count", 0),
"chunks_processed": parse_metadata(file.get("metadata")).get("chunks_processed", 0),
"total_chunks_expected": parse_metadata(file.get("metadata")).get("total_chunks_expected", 0),
"processing_progress": parse_metadata(file.get("metadata")).get("processing_progress", 0),
"processing_stage": parse_metadata(file.get("metadata")).get("processing_stage"),
"download_url": f"/api/v1/files/{file['id']}",
# Permission flags - user can delete if:
# 1. They are admin, OR
# 2. They uploaded the document, OR
# 3. They own the parent dataset
"can_delete": (
self.user_role in ADMIN_ROLES or
file["user_id"] == user_uuid or
(file.get("dataset_owner_id") and str(file["dataset_owner_id"]) == user_uuid)
)
}
for file in files
]
except Exception as e:
logger.error(f"Failed to list files for user {self.user_id}: {e}")
return []
async def cleanup_orphaned_files(self) -> int:
"""Clean up orphaned files and LOBs"""
try:
pg_client = await get_postgresql_client()
cleanup_count = 0
# Find orphaned LOBs (LOBs without corresponding document records)
async with pg_client.get_connection() as conn:
async with conn.transaction():
orphaned_lobs = await conn.fetch("""
SELECT lo.oid FROM pg_largeobject_metadata lo
LEFT JOIN documents d
ON lo.oid::text = d.metadata->>'storage_ref'
AND d.metadata->>'storage_type' = 'lob'
WHERE d.id IS NULL
""")
for lob in orphaned_lobs:
await conn.execute("SELECT lo_unlink($1)", lob["oid"])
cleanup_count += 1
# Find orphaned filesystem files
# Note: This would require more complex logic to safely identify orphans
logger.info(f"Cleaned up {cleanup_count} orphaned files")
return cleanup_count
except Exception as e:
logger.error(f"Failed to cleanup orphaned files: {e}")
return 0
async def _trigger_document_processing(
self,
document_id: str,
dataset_id: Optional[str],
user_uuid: str,
filename: str
):
"""Trigger document processing pipeline for RAG functionality"""
try:
# Import here to avoid circular imports
from app.services.document_processor import get_document_processor
logger.info(f"Triggering document processing for {document_id}")
# Get document processor instance
processor = await get_document_processor(tenant_domain=self.tenant_domain)
# For documents uploaded via PostgreSQL file service, the content is already stored
# We need to process it from the database content rather than a file path
await self._process_document_from_database(
processor, document_id, dataset_id, user_uuid, filename
)
except Exception as e:
logger.error(f"Document processing trigger failed for {document_id}: {e}")
# Update document status to failed
try:
pg_client = await get_postgresql_client()
await pg_client.execute_command(
"UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
f"Processing trigger failed: {str(e)}", document_id
)
except Exception as update_error:
logger.error(f"Failed to update document status to failed: {update_error}")
raise
async def _process_document_from_database(
self,
processor,
document_id: str,
dataset_id: Optional[str],
user_uuid: str,
filename: str
):
"""Process document using content already stored in database"""
try:
import tempfile
import os
from pathlib import Path
# Get document content from database
pg_client = await get_postgresql_client()
doc_info = await pg_client.fetch_one("""
SELECT content_text, file_type, metadata
FROM documents
WHERE id = $1 AND user_id = $2::uuid
""", document_id, user_uuid)
if not doc_info or not doc_info["content_text"]:
raise ValueError("Document content not found in database")
# Create temporary file with the content
# Sanitize the file extension to prevent path injection
safe_suffix = sanitize_filename(filename)
safe_suffix = Path(safe_suffix).suffix if safe_suffix else ".tmp"
# codeql[py/path-injection] safe_suffix is sanitized via sanitize_filename()
with tempfile.NamedTemporaryFile(mode='w', suffix=safe_suffix, delete=False) as temp_file:
# Handle different storage types - metadata might be JSON string or dict
metadata_raw = doc_info["metadata"] or "{}"
if isinstance(metadata_raw, str):
import json
metadata = json.loads(metadata_raw)
else:
metadata = metadata_raw or {}
storage_type = metadata.get("storage_type", "text")
if storage_type == "text":
temp_file.write(doc_info["content_text"])
elif storage_type == "base64":
import base64
content_bytes = base64.b64decode(doc_info["content_text"])
temp_file.close()
with open(temp_file.name, 'wb') as binary_file:
binary_file.write(content_bytes)
elif storage_type == "pdf_extracted":
# For PDFs with extracted text, create a placeholder text file
# since the actual text content is already extracted
temp_file.write(doc_info["content_text"])
else:
temp_file.write(doc_info["content_text"])
temp_file_path = Path(temp_file.name)
try:
# Process the document using the existing document processor
await processor.process_file(
file_path=temp_file_path,
dataset_id=dataset_id, # Keep None as None - don't convert to empty string
user_id=user_uuid,
original_filename=filename,
document_id=document_id # Use existing document instead of creating new one
)
logger.info(f"Successfully processed document {document_id} from database content")
finally:
# Clean up temporary file
try:
os.unlink(temp_file_path)
except Exception as cleanup_error:
logger.warning(f"Failed to cleanup temporary file {temp_file_path}: {cleanup_error}")
except Exception as e:
logger.error(f"Failed to process document from database content: {e}")
raise
async def _extract_pdf_text(self, content: bytes) -> str:
"""Extract text content from PDF bytes using pypdf"""
import io
import pypdf as PyPDF2 # pypdf is the maintained successor to PyPDF2
try:
# Create BytesIO object from content
pdf_stream = io.BytesIO(content)
pdf_reader = PyPDF2.PdfReader(pdf_stream)
text_parts = []
for page_num, page in enumerate(pdf_reader.pages):
try:
page_text = page.extract_text()
if page_text.strip():
text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
except Exception as e:
logger.warning(f"Could not extract text from page {page_num + 1}: {e}")
if not text_parts:
# If no text could be extracted, return a placeholder
return f"PDF document with {len(pdf_reader.pages)} pages (text extraction failed)"
extracted_text = "\n\n".join(text_parts)
logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF with {len(pdf_reader.pages)} pages")
return extracted_text
except Exception as e:
logger.error(f"PDF text extraction failed: {e}")
# Return a fallback description instead of failing completely
return f"PDF document (text extraction failed: {str(e)})"