Security hardening release addressing CodeQL and Dependabot alerts: - Fix stack trace exposure in error responses - Add SSRF protection with DNS resolution checking - Implement proper URL hostname validation (replaces substring matching) - Add centralized path sanitization to prevent path traversal - Fix ReDoS vulnerability in email validation regex - Improve HTML sanitization in validation utilities - Fix capability wildcard matching in auth utilities - Update glob dependency to address CVE - Add CodeQL suppression comments for verified false positives 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
883 lines
39 KiB
Python
883 lines
39 KiB
Python
"""
|
|
GT 2.0 PostgreSQL File Storage Service
|
|
|
|
Replaces MinIO with PostgreSQL-based file storage using:
|
|
- BYTEA for small files (<10MB)
|
|
- PostgreSQL Large Objects (LOBs) for large files (10MB-1GB)
|
|
- Filesystem with metadata for massive files (>1GB)
|
|
|
|
Provides perfect tenant isolation through PostgreSQL schemas.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import hashlib
|
|
import mimetypes
|
|
from typing import Dict, Any, List, Optional, BinaryIO, AsyncIterator, Tuple
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
import aiofiles
|
|
from fastapi import UploadFile
|
|
|
|
from app.core.postgresql_client import get_postgresql_client
|
|
from app.core.config import get_settings
|
|
from app.core.permissions import ADMIN_ROLES
|
|
from app.core.path_security import sanitize_tenant_domain, sanitize_filename, safe_join_path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class PostgreSQLFileService:
|
|
"""PostgreSQL-based file storage service with tenant isolation"""
|
|
|
|
# Storage type thresholds
|
|
SMALL_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB - use BYTEA
|
|
LARGE_FILE_THRESHOLD = 1024 * 1024 * 1024 # 1GB - use LOBs
|
|
|
|
def __init__(self, tenant_domain: str, user_id: str, user_role: str = "user"):
|
|
self.tenant_domain = tenant_domain
|
|
self.user_id = user_id
|
|
self.user_role = user_role
|
|
self.settings = get_settings()
|
|
|
|
# Filesystem path for massive files (>1GB)
|
|
# Sanitize tenant_domain to prevent path traversal
|
|
safe_tenant = sanitize_tenant_domain(tenant_domain)
|
|
self.filesystem_base = Path("/data") / safe_tenant / "files" # codeql[py/path-injection] sanitize_tenant_domain() validates input
|
|
self.filesystem_base.mkdir(parents=True, exist_ok=True, mode=0o700)
|
|
|
|
logger.info(f"PostgreSQL file service initialized for {tenant_domain}/{user_id} (role: {user_role})")
|
|
|
|
async def store_file(
|
|
self,
|
|
file: UploadFile,
|
|
dataset_id: Optional[str] = None,
|
|
category: str = "documents"
|
|
) -> Dict[str, Any]:
|
|
"""Store file using appropriate PostgreSQL strategy"""
|
|
|
|
try:
|
|
logger.info(f"PostgreSQL file service: storing file {file.filename} for tenant {self.tenant_domain}, user {self.user_id}")
|
|
logger.info(f"Dataset ID: {dataset_id}, Category: {category}")
|
|
# Read file content
|
|
content = await file.read()
|
|
file_size = len(content)
|
|
|
|
# Generate file metadata
|
|
file_hash = hashlib.sha256(content).hexdigest()[:16]
|
|
content_type = file.content_type or mimetypes.guess_type(file.filename)[0] or "application/octet-stream"
|
|
|
|
# Handle different file types with appropriate processing
|
|
if file_size <= self.SMALL_FILE_THRESHOLD and content_type.startswith('text/'):
|
|
# Small text files stored directly
|
|
storage_type = "text"
|
|
storage_ref = "content_text"
|
|
try:
|
|
text_content = content.decode('utf-8')
|
|
except UnicodeDecodeError:
|
|
text_content = content.decode('latin-1') # Fallback encoding
|
|
elif content_type == 'application/pdf':
|
|
# PDF files: extract text content, store binary separately
|
|
storage_type = "pdf_extracted"
|
|
storage_ref = "content_text"
|
|
text_content = await self._extract_pdf_text(content)
|
|
else:
|
|
# Other binary files: store as base64 for now
|
|
import base64
|
|
storage_type = "base64"
|
|
storage_ref = "content_text"
|
|
text_content = base64.b64encode(content).decode('utf-8')
|
|
|
|
# Get PostgreSQL client
|
|
logger.info("Getting PostgreSQL client")
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Always expect user_id to be a UUID string - no email lookups
|
|
logger.info(f"Using user UUID: {self.user_id}")
|
|
|
|
# Validate user_id is a valid UUID format
|
|
try:
|
|
import uuid
|
|
user_uuid = str(uuid.UUID(self.user_id))
|
|
except (ValueError, TypeError) as e:
|
|
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
|
|
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
|
|
|
|
logger.info(f"Validated user UUID: {user_uuid}")
|
|
|
|
# 1. Validate user_uuid is present
|
|
if not user_uuid:
|
|
raise ValueError("User UUID is required but not found")
|
|
|
|
# 2. Validate and clean dataset_id
|
|
dataset_uuid_param = None
|
|
if dataset_id and dataset_id.strip() and dataset_id != "":
|
|
try:
|
|
import uuid
|
|
dataset_uuid_param = str(uuid.UUID(dataset_id.strip()))
|
|
logger.info(f"Dataset UUID validated: {dataset_uuid_param}")
|
|
except ValueError as e:
|
|
logger.error(f"Invalid dataset UUID: {dataset_id}, error: {e}")
|
|
raise ValueError(f"Invalid dataset ID format: {dataset_id}")
|
|
else:
|
|
logger.info("No dataset_id provided, using NULL")
|
|
|
|
# 3. Validate file content and metadata
|
|
if not file.filename or not file.filename.strip():
|
|
raise ValueError("Filename cannot be empty")
|
|
|
|
if not content:
|
|
raise ValueError("File content cannot be empty")
|
|
|
|
# 4. Generate and validate all string parameters
|
|
safe_filename = f"{file_hash}_{file.filename}"
|
|
safe_original_filename = file.filename.strip()
|
|
safe_content_type = content_type or "application/octet-stream"
|
|
safe_file_hash = file_hash
|
|
safe_metadata = json.dumps({
|
|
"storage_type": storage_type,
|
|
"storage_ref": storage_ref,
|
|
"category": category
|
|
})
|
|
|
|
logger.info(f"All parameters validated:")
|
|
logger.info(f" user_uuid: {user_uuid}")
|
|
logger.info(f" dataset_uuid: {dataset_uuid_param}")
|
|
logger.info(f" filename: {safe_filename}")
|
|
logger.info(f" original_filename: {safe_original_filename}")
|
|
logger.info(f" file_type: {safe_content_type}")
|
|
logger.info(f" file_size: {file_size}")
|
|
logger.info(f" file_hash: {safe_file_hash}")
|
|
|
|
# Store metadata in documents table (using existing schema)
|
|
try:
|
|
# Application user now has BYPASSRLS privilege - no RLS context needed
|
|
logger.info("Storing document with BYPASSRLS privilege")
|
|
|
|
# Require dataset_id for all document uploads
|
|
if not dataset_uuid_param:
|
|
raise ValueError("dataset_id is required for document uploads")
|
|
|
|
logger.info(f"Storing document with dataset_id: {dataset_uuid_param}")
|
|
logger.info(f"Document details: {safe_filename} ({file_size} bytes)")
|
|
|
|
# Insert with dataset_id
|
|
# Determine if content is searchable (under PostgreSQL tsvector size limit)
|
|
is_searchable = text_content is None or len(text_content) < 1048575
|
|
|
|
async with pg_client.get_connection() as conn:
|
|
# Get tenant_id for the document
|
|
tenant_id = await conn.fetchval("""
|
|
SELECT id FROM tenants WHERE domain = $1 LIMIT 1
|
|
""", self.tenant_domain)
|
|
|
|
if not tenant_id:
|
|
raise ValueError(f"Tenant not found for domain: {self.tenant_domain}")
|
|
|
|
document_id = await conn.fetchval("""
|
|
INSERT INTO documents (
|
|
tenant_id, user_id, dataset_id, filename, original_filename,
|
|
file_type, file_size_bytes, file_hash, content_text, processing_status,
|
|
metadata, is_searchable, created_at, updated_at
|
|
) VALUES (
|
|
$1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, 'pending', $10, $11, NOW(), NOW()
|
|
)
|
|
RETURNING id
|
|
""",
|
|
tenant_id, user_uuid, dataset_uuid_param, safe_filename, safe_original_filename,
|
|
safe_content_type, file_size, safe_file_hash, text_content,
|
|
safe_metadata, is_searchable
|
|
)
|
|
logger.info(f"Document inserted successfully with ID: {document_id}")
|
|
|
|
except Exception as db_error:
|
|
logger.error(f"Database insertion failed: {db_error}")
|
|
logger.error(f"Tenant domain: {self.tenant_domain}")
|
|
logger.error(f"User ID: {self.user_id}")
|
|
logger.error(f"Dataset ID: {dataset_id}")
|
|
raise
|
|
|
|
result = {
|
|
"id": document_id,
|
|
"filename": file.filename,
|
|
"content_type": content_type,
|
|
"file_size": file_size,
|
|
"file_hash": file_hash,
|
|
"storage_type": storage_type,
|
|
"storage_ref": storage_ref,
|
|
"upload_timestamp": datetime.utcnow().isoformat(),
|
|
"download_url": f"/api/v1/files/{document_id}"
|
|
}
|
|
|
|
logger.info(f"Stored file {file.filename} ({file_size} bytes) as {storage_type} for user {self.user_id}")
|
|
|
|
# Trigger document processing pipeline for RAG functionality
|
|
try:
|
|
await self._trigger_document_processing(document_id, dataset_id, user_uuid, file.filename)
|
|
logger.info(f"Successfully triggered document processing for {document_id}")
|
|
except Exception as process_error:
|
|
logger.error(f"Failed to trigger document processing for {document_id}: {process_error}")
|
|
# Update document status to show processing failed
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
await pg_client.execute_command(
|
|
"UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
|
|
f"Processing failed: {str(process_error)}", document_id
|
|
)
|
|
except Exception as update_error:
|
|
logger.error(f"Failed to update document status after processing error: {update_error}")
|
|
# Don't fail the upload if processing trigger fails - user can retry manually
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to store file {file.filename}: {e}")
|
|
raise
|
|
finally:
|
|
# Ensure content is cleared from memory
|
|
if 'content' in locals():
|
|
del content
|
|
|
|
async def _store_as_bytea(
|
|
self,
|
|
content: bytes,
|
|
filename: str,
|
|
content_type: str,
|
|
file_hash: str,
|
|
dataset_id: Optional[str],
|
|
category: str
|
|
) -> str:
|
|
"""Store small file as BYTEA in documents table"""
|
|
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Store file content directly in BYTEA column
|
|
# This will be handled by the main insert in store_file
|
|
return "bytea_column"
|
|
|
|
async def _store_as_lob(
|
|
self,
|
|
content: bytes,
|
|
filename: str,
|
|
content_type: str,
|
|
file_hash: str,
|
|
dataset_id: Optional[str],
|
|
category: str
|
|
) -> str:
|
|
"""Store large file as PostgreSQL Large Object"""
|
|
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Create Large Object and get OID
|
|
async with pg_client.get_connection() as conn:
|
|
# Start transaction for LOB operations
|
|
async with conn.transaction():
|
|
# Create LOB and get OID
|
|
lob_oid = await conn.fetchval("SELECT lo_create(0)")
|
|
|
|
# Open LOB for writing
|
|
lob_fd = await conn.fetchval("SELECT lo_open($1, 131072)", lob_oid) # INV_WRITE mode
|
|
|
|
# Write content in chunks for memory efficiency
|
|
chunk_size = 8192
|
|
offset = 0
|
|
for i in range(0, len(content), chunk_size):
|
|
chunk = content[i:i + chunk_size]
|
|
await conn.execute("SELECT lo_write($1, $2)", lob_fd, chunk)
|
|
offset += len(chunk)
|
|
|
|
# Close LOB
|
|
await conn.execute("SELECT lo_close($1)", lob_fd)
|
|
|
|
logger.info(f"Created PostgreSQL LOB with OID {lob_oid} for {filename}")
|
|
return str(lob_oid)
|
|
|
|
async def _store_as_filesystem(
|
|
self,
|
|
content: bytes,
|
|
filename: str,
|
|
content_type: str,
|
|
file_hash: str,
|
|
dataset_id: Optional[str],
|
|
category: str
|
|
) -> str:
|
|
"""Store massive file on filesystem with PostgreSQL metadata"""
|
|
|
|
# Create secure file path with user isolation
|
|
user_dir = self.filesystem_base / self.user_id / category
|
|
if dataset_id:
|
|
user_dir = user_dir / dataset_id
|
|
|
|
user_dir.mkdir(parents=True, exist_ok=True, mode=0o700)
|
|
|
|
# Generate secure filename
|
|
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
|
secure_filename = f"{timestamp}_{file_hash}_{filename}"
|
|
file_path = user_dir / secure_filename
|
|
|
|
# Write file with secure permissions
|
|
async with aiofiles.open(file_path, 'wb') as f:
|
|
await f.write(content)
|
|
|
|
# Set secure file permissions
|
|
os.chmod(file_path, 0o600)
|
|
|
|
logger.info(f"Stored large file on filesystem: {file_path}")
|
|
return str(file_path)
|
|
|
|
async def get_file(self, document_id: str) -> AsyncIterator[bytes]:
|
|
"""Stream file content by document ID"""
|
|
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Validate user_id is a valid UUID format
|
|
try:
|
|
import uuid
|
|
user_uuid = str(uuid.UUID(self.user_id))
|
|
except (ValueError, TypeError) as e:
|
|
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
|
|
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
|
|
|
|
# Get document metadata using UUID directly
|
|
# Admins can access any document in their tenant, regular users only their own
|
|
if self.user_role in ADMIN_ROLES:
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT metadata, file_size_bytes, filename, content_text
|
|
FROM documents d
|
|
WHERE d.id = $1
|
|
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
|
|
""", document_id, self.tenant_domain)
|
|
else:
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT metadata, file_size_bytes, filename, content_text
|
|
FROM documents
|
|
WHERE id = $1 AND user_id = $2::uuid
|
|
""", document_id, user_uuid)
|
|
|
|
if not doc_info:
|
|
raise FileNotFoundError(f"Document {document_id} not found")
|
|
|
|
# Get storage info from metadata - handle JSON string or dict
|
|
metadata_raw = doc_info["metadata"] or "{}"
|
|
if isinstance(metadata_raw, str):
|
|
import json
|
|
metadata = json.loads(metadata_raw)
|
|
else:
|
|
metadata = metadata_raw or {}
|
|
storage_type = metadata.get("storage_type", "text")
|
|
|
|
if storage_type == "text":
|
|
# Text content stored directly
|
|
if doc_info["content_text"]:
|
|
content_bytes = doc_info["content_text"].encode('utf-8')
|
|
async for chunk in self._stream_from_bytea(content_bytes):
|
|
yield chunk
|
|
else:
|
|
raise FileNotFoundError(f"Document content not found")
|
|
|
|
elif storage_type == "base64":
|
|
# Base64 encoded binary content
|
|
if doc_info["content_text"]:
|
|
import base64
|
|
content_bytes = base64.b64decode(doc_info["content_text"])
|
|
async for chunk in self._stream_from_bytea(content_bytes):
|
|
yield chunk
|
|
else:
|
|
raise FileNotFoundError(f"Document content not found")
|
|
|
|
elif storage_type == "lob":
|
|
# Stream from PostgreSQL LOB
|
|
storage_ref = metadata.get("storage_ref", "")
|
|
async for chunk in self._stream_from_lob(int(storage_ref)):
|
|
yield chunk
|
|
|
|
elif storage_type == "filesystem":
|
|
# Stream from filesystem
|
|
storage_ref = metadata.get("storage_ref", "")
|
|
async for chunk in self._stream_from_filesystem(storage_ref):
|
|
yield chunk
|
|
else:
|
|
# Default: treat as text content
|
|
if doc_info["content_text"]:
|
|
content_bytes = doc_info["content_text"].encode('utf-8')
|
|
async for chunk in self._stream_from_bytea(content_bytes):
|
|
yield chunk
|
|
else:
|
|
raise FileNotFoundError(f"Document content not found")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get file {document_id}: {e}")
|
|
raise
|
|
|
|
async def _stream_from_bytea(self, content: bytes) -> AsyncIterator[bytes]:
|
|
"""Stream content from BYTEA in chunks"""
|
|
chunk_size = 8192
|
|
for i in range(0, len(content), chunk_size):
|
|
yield content[i:i + chunk_size]
|
|
|
|
async def _stream_from_lob(self, lob_oid: int) -> AsyncIterator[bytes]:
|
|
"""Stream content from PostgreSQL Large Object"""
|
|
|
|
pg_client = await get_postgresql_client()
|
|
|
|
async with pg_client.get_connection() as conn:
|
|
async with conn.transaction():
|
|
# Open LOB for reading
|
|
lob_fd = await conn.fetchval("SELECT lo_open($1, 262144)", lob_oid) # INV_READ mode
|
|
|
|
# Stream in chunks
|
|
chunk_size = 8192
|
|
while True:
|
|
chunk = await conn.fetchval("SELECT lo_read($1, $2)", lob_fd, chunk_size)
|
|
if not chunk:
|
|
break
|
|
yield chunk
|
|
|
|
# Close LOB
|
|
await conn.execute("SELECT lo_close($1)", lob_fd)
|
|
|
|
async def _stream_from_filesystem(self, file_path: str) -> AsyncIterator[bytes]:
|
|
"""Stream content from filesystem"""
|
|
|
|
# Verify file belongs to tenant (security check)
|
|
path_obj = Path(file_path)
|
|
if not str(path_obj).startswith(str(self.filesystem_base)):
|
|
raise PermissionError("Access denied to file")
|
|
|
|
if not path_obj.exists():
|
|
raise FileNotFoundError(f"File not found: {file_path}")
|
|
|
|
async with aiofiles.open(file_path, 'rb') as f:
|
|
chunk_size = 8192
|
|
while True:
|
|
chunk = await f.read(chunk_size)
|
|
if not chunk:
|
|
break
|
|
yield chunk
|
|
|
|
async def delete_file(self, document_id: str) -> bool:
|
|
"""Delete file and metadata"""
|
|
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Validate user_id is a valid UUID format
|
|
try:
|
|
import uuid
|
|
user_uuid = str(uuid.UUID(self.user_id))
|
|
except (ValueError, TypeError) as e:
|
|
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
|
|
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
|
|
|
|
# Get document info before deletion
|
|
# Admins can delete any document in their tenant, regular users only their own
|
|
if self.user_role in ADMIN_ROLES:
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT storage_type, storage_ref FROM documents d
|
|
WHERE d.id = $1
|
|
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
|
|
""", document_id, self.tenant_domain)
|
|
else:
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT storage_type, storage_ref FROM documents
|
|
WHERE id = $1 AND user_id = $2::uuid
|
|
""", document_id, user_uuid)
|
|
|
|
if not doc_info:
|
|
logger.warning(f"Document {document_id} not found for deletion")
|
|
return False
|
|
|
|
storage_type = doc_info["storage_type"]
|
|
storage_ref = doc_info["storage_ref"]
|
|
|
|
# Delete file content based on storage type
|
|
if storage_type == "lob":
|
|
# Delete LOB
|
|
async with pg_client.get_connection() as conn:
|
|
await conn.execute("SELECT lo_unlink($1)", int(storage_ref))
|
|
elif storage_type == "filesystem":
|
|
# Delete filesystem file
|
|
try:
|
|
path_obj = Path(storage_ref)
|
|
if path_obj.exists():
|
|
path_obj.unlink()
|
|
except Exception as e:
|
|
logger.warning(f"Failed to delete filesystem file {storage_ref}: {e}")
|
|
# BYTEA files are deleted with the row
|
|
|
|
# Delete metadata record
|
|
if self.user_role in ADMIN_ROLES:
|
|
deleted = await pg_client.execute_command("""
|
|
DELETE FROM documents d
|
|
WHERE d.id = $1
|
|
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
|
|
""", document_id, self.tenant_domain)
|
|
else:
|
|
deleted = await pg_client.execute_command("""
|
|
DELETE FROM documents WHERE id = $1 AND user_id = $2::uuid
|
|
""", document_id, user_uuid)
|
|
|
|
if deleted > 0:
|
|
logger.info(f"Deleted file {document_id} ({storage_type})")
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to delete file {document_id}: {e}")
|
|
return False
|
|
|
|
async def get_file_info(self, document_id: str) -> Dict[str, Any]:
|
|
"""Get file metadata"""
|
|
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Validate user_id is a valid UUID format
|
|
try:
|
|
import uuid
|
|
user_uuid = str(uuid.UUID(self.user_id))
|
|
except (ValueError, TypeError) as e:
|
|
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
|
|
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
|
|
|
|
# Admins can access any document metadata in their tenant, regular users only their own
|
|
if self.user_role in ADMIN_ROLES:
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT id, filename, original_filename, file_type as content_type, file_size_bytes as file_size,
|
|
file_hash, dataset_id, metadata->'storage_type' as storage_type, metadata->'category' as category, created_at
|
|
FROM documents d
|
|
WHERE d.id = $1
|
|
AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
|
|
""", document_id, self.tenant_domain)
|
|
else:
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT id, filename, original_filename, file_type as content_type, file_size_bytes as file_size,
|
|
file_hash, dataset_id, metadata->'storage_type' as storage_type, metadata->'category' as category, created_at
|
|
FROM documents
|
|
WHERE id = $1 AND user_id = $2::uuid
|
|
""", document_id, user_uuid)
|
|
|
|
if not doc_info:
|
|
raise FileNotFoundError(f"Document {document_id} not found")
|
|
|
|
return {
|
|
"id": doc_info["id"],
|
|
"filename": doc_info["filename"],
|
|
"original_filename": doc_info["original_filename"],
|
|
"content_type": doc_info["content_type"],
|
|
"file_size": doc_info["file_size"],
|
|
"file_hash": doc_info["file_hash"],
|
|
"dataset_id": str(doc_info["dataset_id"]) if doc_info["dataset_id"] else None,
|
|
"storage_type": doc_info["storage_type"],
|
|
"category": doc_info["category"],
|
|
"created_at": doc_info["created_at"].isoformat(),
|
|
"download_url": f"/api/v1/files/{document_id}"
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get file info for {document_id}: {e}")
|
|
raise
|
|
|
|
async def list_files(
|
|
self,
|
|
dataset_id: Optional[str] = None,
|
|
category: str = "documents",
|
|
limit: int = 50,
|
|
offset: int = 0
|
|
) -> List[Dict[str, Any]]:
|
|
"""List user files with optional filtering"""
|
|
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
|
|
# Validate user_id is a valid UUID format
|
|
try:
|
|
import uuid
|
|
user_uuid = str(uuid.UUID(self.user_id))
|
|
except (ValueError, TypeError) as e:
|
|
logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
|
|
raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")
|
|
|
|
# Build permission-aware query
|
|
# Admins can list any documents in their tenant
|
|
# Regular users can list documents they own OR documents in datasets they can access
|
|
if self.user_role in ADMIN_ROLES:
|
|
where_clauses = ["d.tenant_id = (SELECT id FROM tenants WHERE domain = $1)"]
|
|
params = [self.tenant_domain]
|
|
param_idx = 2
|
|
else:
|
|
# Non-admin users can see:
|
|
# 1. Documents they own
|
|
# 2. Documents in datasets with access_group = 'organization'
|
|
# 3. Documents in datasets they're a member of (team access)
|
|
where_clauses = ["""(
|
|
d.user_id = $1::uuid
|
|
OR EXISTS (
|
|
SELECT 1 FROM datasets ds
|
|
WHERE ds.id = d.dataset_id
|
|
AND ds.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
|
|
AND (
|
|
ds.access_group = 'organization'
|
|
OR (ds.access_group = 'team' AND $1::uuid = ANY(ds.team_members))
|
|
)
|
|
)
|
|
)"""]
|
|
params = [user_uuid, self.tenant_domain]
|
|
param_idx = 3
|
|
|
|
if dataset_id:
|
|
where_clauses.append(f"d.dataset_id = ${param_idx}::uuid")
|
|
params.append(dataset_id)
|
|
param_idx += 1
|
|
|
|
if category:
|
|
where_clauses.append(f"(d.metadata->>'category' = ${param_idx} OR d.metadata->>'category' IS NULL)")
|
|
params.append(category)
|
|
param_idx += 1
|
|
|
|
query = f"""
|
|
SELECT d.id, d.filename, d.original_filename, d.file_type as content_type, d.file_size_bytes as file_size,
|
|
d.metadata->>'storage_type' as storage_type, d.metadata->>'category' as category, d.created_at, d.updated_at, d.dataset_id,
|
|
d.processing_status, d.metadata, d.user_id, COUNT(dc.id) as chunk_count,
|
|
ds.created_by as dataset_owner_id
|
|
FROM documents d
|
|
LEFT JOIN document_chunks dc ON d.id = dc.document_id
|
|
LEFT JOIN datasets ds ON d.dataset_id = ds.id
|
|
WHERE {' AND '.join(where_clauses)}
|
|
GROUP BY d.id, d.filename, d.original_filename, d.file_type, d.file_size_bytes, d.metadata, d.created_at, d.updated_at, d.dataset_id, d.processing_status, d.user_id, ds.created_by
|
|
ORDER BY d.created_at DESC LIMIT ${param_idx} OFFSET ${param_idx + 1}
|
|
"""
|
|
params.extend([limit, offset])
|
|
|
|
files = await pg_client.execute_query(query, *params)
|
|
|
|
# Helper function to parse metadata
|
|
def parse_metadata(metadata_value):
|
|
if metadata_value is None:
|
|
return {}
|
|
if isinstance(metadata_value, str):
|
|
import json
|
|
try:
|
|
return json.loads(metadata_value)
|
|
except (json.JSONDecodeError, ValueError):
|
|
return {}
|
|
return metadata_value if isinstance(metadata_value, dict) else {}
|
|
|
|
return [
|
|
{
|
|
"id": file["id"],
|
|
"filename": file["filename"],
|
|
"original_filename": file["original_filename"],
|
|
"content_type": file["content_type"],
|
|
"file_type": file["content_type"],
|
|
"file_size": file["file_size"],
|
|
"file_size_bytes": file["file_size"],
|
|
"dataset_id": file["dataset_id"],
|
|
"storage_type": file["storage_type"],
|
|
"category": file["category"],
|
|
"created_at": file["created_at"].isoformat(),
|
|
"updated_at": file["updated_at"].isoformat() if file.get("updated_at") else None,
|
|
"processing_status": file.get("processing_status", "pending"),
|
|
"chunk_count": file.get("chunk_count", 0),
|
|
"chunks_processed": parse_metadata(file.get("metadata")).get("chunks_processed", 0),
|
|
"total_chunks_expected": parse_metadata(file.get("metadata")).get("total_chunks_expected", 0),
|
|
"processing_progress": parse_metadata(file.get("metadata")).get("processing_progress", 0),
|
|
"processing_stage": parse_metadata(file.get("metadata")).get("processing_stage"),
|
|
"download_url": f"/api/v1/files/{file['id']}",
|
|
# Permission flags - user can delete if:
|
|
# 1. They are admin, OR
|
|
# 2. They uploaded the document, OR
|
|
# 3. They own the parent dataset
|
|
"can_delete": (
|
|
self.user_role in ADMIN_ROLES or
|
|
file["user_id"] == user_uuid or
|
|
(file.get("dataset_owner_id") and str(file["dataset_owner_id"]) == user_uuid)
|
|
)
|
|
}
|
|
for file in files
|
|
]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to list files for user {self.user_id}: {e}")
|
|
return []
|
|
|
|
async def cleanup_orphaned_files(self) -> int:
|
|
"""Clean up orphaned files and LOBs"""
|
|
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
cleanup_count = 0
|
|
|
|
# Find orphaned LOBs (LOBs without corresponding document records)
|
|
async with pg_client.get_connection() as conn:
|
|
async with conn.transaction():
|
|
orphaned_lobs = await conn.fetch("""
|
|
SELECT lo.oid FROM pg_largeobject_metadata lo
|
|
LEFT JOIN documents d ON lo.oid::text = d.storage_ref
|
|
WHERE d.storage_ref IS NULL AND d.storage_type = 'lob'
|
|
""")
|
|
|
|
for lob in orphaned_lobs:
|
|
await conn.execute("SELECT lo_unlink($1)", lob["oid"])
|
|
cleanup_count += 1
|
|
|
|
# Find orphaned filesystem files
|
|
# Note: This would require more complex logic to safely identify orphans
|
|
|
|
logger.info(f"Cleaned up {cleanup_count} orphaned files")
|
|
return cleanup_count
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to cleanup orphaned files: {e}")
|
|
return 0
|
|
|
|
async def _trigger_document_processing(
|
|
self,
|
|
document_id: str,
|
|
dataset_id: Optional[str],
|
|
user_uuid: str,
|
|
filename: str
|
|
):
|
|
"""Trigger document processing pipeline for RAG functionality"""
|
|
try:
|
|
# Import here to avoid circular imports
|
|
from app.services.document_processor import get_document_processor
|
|
|
|
logger.info(f"Triggering document processing for {document_id}")
|
|
|
|
# Get document processor instance
|
|
processor = await get_document_processor(tenant_domain=self.tenant_domain)
|
|
|
|
# For documents uploaded via PostgreSQL file service, the content is already stored
|
|
# We need to process it from the database content rather than a file path
|
|
await self._process_document_from_database(
|
|
processor, document_id, dataset_id, user_uuid, filename
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Document processing trigger failed for {document_id}: {e}")
|
|
# Update document status to failed
|
|
try:
|
|
pg_client = await get_postgresql_client()
|
|
await pg_client.execute_command(
|
|
"UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
|
|
f"Processing trigger failed: {str(e)}", document_id
|
|
)
|
|
except Exception as update_error:
|
|
logger.error(f"Failed to update document status to failed: {update_error}")
|
|
raise
|
|
|
|
async def _process_document_from_database(
|
|
self,
|
|
processor,
|
|
document_id: str,
|
|
dataset_id: Optional[str],
|
|
user_uuid: str,
|
|
filename: str
|
|
):
|
|
"""Process document using content already stored in database"""
|
|
try:
|
|
import tempfile
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Get document content from database
|
|
pg_client = await get_postgresql_client()
|
|
doc_info = await pg_client.fetch_one("""
|
|
SELECT content_text, file_type, metadata
|
|
FROM documents
|
|
WHERE id = $1 AND user_id = $2::uuid
|
|
""", document_id, user_uuid)
|
|
|
|
if not doc_info or not doc_info["content_text"]:
|
|
raise ValueError("Document content not found in database")
|
|
|
|
# Create temporary file with the content
|
|
# Sanitize the file extension to prevent path injection
|
|
safe_suffix = sanitize_filename(filename)
|
|
safe_suffix = Path(safe_suffix).suffix if safe_suffix else ".tmp"
|
|
# codeql[py/path-injection] safe_suffix is sanitized via sanitize_filename()
|
|
with tempfile.NamedTemporaryFile(mode='w', suffix=safe_suffix, delete=False) as temp_file:
|
|
# Handle different storage types - metadata might be JSON string or dict
|
|
metadata_raw = doc_info["metadata"] or "{}"
|
|
if isinstance(metadata_raw, str):
|
|
import json
|
|
metadata = json.loads(metadata_raw)
|
|
else:
|
|
metadata = metadata_raw or {}
|
|
storage_type = metadata.get("storage_type", "text")
|
|
|
|
if storage_type == "text":
|
|
temp_file.write(doc_info["content_text"])
|
|
elif storage_type == "base64":
|
|
import base64
|
|
content_bytes = base64.b64decode(doc_info["content_text"])
|
|
temp_file.close()
|
|
with open(temp_file.name, 'wb') as binary_file:
|
|
binary_file.write(content_bytes)
|
|
elif storage_type == "pdf_extracted":
|
|
# For PDFs with extracted text, create a placeholder text file
|
|
# since the actual text content is already extracted
|
|
temp_file.write(doc_info["content_text"])
|
|
else:
|
|
temp_file.write(doc_info["content_text"])
|
|
|
|
temp_file_path = Path(temp_file.name)
|
|
|
|
try:
|
|
# Process the document using the existing document processor
|
|
await processor.process_file(
|
|
file_path=temp_file_path,
|
|
dataset_id=dataset_id, # Keep None as None - don't convert to empty string
|
|
user_id=user_uuid,
|
|
original_filename=filename,
|
|
document_id=document_id # Use existing document instead of creating new one
|
|
)
|
|
|
|
logger.info(f"Successfully processed document {document_id} from database content")
|
|
|
|
finally:
|
|
# Clean up temporary file
|
|
try:
|
|
os.unlink(temp_file_path)
|
|
except Exception as cleanup_error:
|
|
logger.warning(f"Failed to cleanup temporary file {temp_file_path}: {cleanup_error}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to process document from database content: {e}")
|
|
raise
|
|
|
|
async def _extract_pdf_text(self, content: bytes) -> str:
|
|
"""Extract text content from PDF bytes using pypdf"""
|
|
import io
|
|
import pypdf as PyPDF2 # pypdf is the maintained successor to PyPDF2
|
|
|
|
try:
|
|
# Create BytesIO object from content
|
|
pdf_stream = io.BytesIO(content)
|
|
pdf_reader = PyPDF2.PdfReader(pdf_stream)
|
|
|
|
text_parts = []
|
|
for page_num, page in enumerate(pdf_reader.pages):
|
|
try:
|
|
page_text = page.extract_text()
|
|
if page_text.strip():
|
|
text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
|
|
except Exception as e:
|
|
logger.warning(f"Could not extract text from page {page_num + 1}: {e}")
|
|
|
|
if not text_parts:
|
|
# If no text could be extracted, return a placeholder
|
|
return f"PDF document with {len(pdf_reader.pages)} pages (text extraction failed)"
|
|
|
|
extracted_text = "\n\n".join(text_parts)
|
|
logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF with {len(pdf_reader.pages)} pages")
|
|
return extracted_text
|
|
|
|
except Exception as e:
|
|
logger.error(f"PDF text extraction failed: {e}")
|
|
# Return a fallback description instead of failing completely
|
|
return f"PDF document (text extraction failed: {str(e)})" |