GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
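
For context on the hostname-validation item above: the general technique is to parse the URL and compare the extracted hostname against an allowlist (exact match, or a dot-delimited suffix), rather than substring matching, which a URL such as `https://api.example.com.attacker.net` would bypass. The sketch below illustrates that pattern only; the helper name and allowlist are assumptions, not the code shipped in this release.

    from urllib.parse import urlparse

    ALLOWED_HOSTS = {"api.example.com"}  # illustrative allowlist, not from this repo

    def is_allowed_url(url: str) -> bool:
        """Exact-hostname check; rejects look-alikes such as api.example.com.evil.net."""
        hostname = urlparse(url).hostname  # None for malformed URLs
        if not hostname:
            return False
        hostname = hostname.lower().rstrip(".")
        # Allow the host itself or a true dot-delimited subdomain of it
        return hostname in ALLOWED_HOSTS or any(
            hostname.endswith("." + allowed) for allowed in ALLOWED_HOSTS
        )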
883  apps/tenant-backend/app/services/postgresql_file_service.py  (new file)
@@ -0,0 +1,883 @@
"""
GT 2.0 PostgreSQL File Storage Service

Replaces MinIO with PostgreSQL-based file storage using:
- BYTEA for small files (<10MB)
- PostgreSQL Large Objects (LOBs) for large files (10MB-1GB)
- Filesystem with metadata for massive files (>1GB)

Provides perfect tenant isolation through PostgreSQL schemas.
"""

import asyncio
import json
import logging
import os
import hashlib
import mimetypes
from typing import Dict, Any, List, Optional, BinaryIO, AsyncIterator, Tuple
from datetime import datetime, timedelta
from pathlib import Path
import aiofiles
from fastapi import UploadFile

from app.core.postgresql_client import get_postgresql_client
from app.core.config import get_settings
from app.core.permissions import ADMIN_ROLES
from app.core.path_security import sanitize_tenant_domain, sanitize_filename, safe_join_path

logger = logging.getLogger(__name__)

class PostgreSQLFileService:
    """PostgreSQL-based file storage service with tenant isolation"""

    # Storage type thresholds
    SMALL_FILE_THRESHOLD = 10 * 1024 * 1024  # 10MB - use BYTEA
    LARGE_FILE_THRESHOLD = 1024 * 1024 * 1024  # 1GB - use LOBs

    def __init__(self, tenant_domain: str, user_id: str, user_role: str = "user"):
        self.tenant_domain = tenant_domain
        self.user_id = user_id
        self.user_role = user_role
        self.settings = get_settings()

        # Filesystem path for massive files (>1GB)
        # Sanitize tenant_domain to prevent path traversal
        safe_tenant = sanitize_tenant_domain(tenant_domain)
        self.filesystem_base = Path("/data") / safe_tenant / "files"  # codeql[py/path-injection] sanitize_tenant_domain() validates input
        self.filesystem_base.mkdir(parents=True, exist_ok=True, mode=0o700)

        logger.info(f"PostgreSQL file service initialized for {tenant_domain}/{user_id} (role: {user_role})")

    async def store_file(
        self,
        file: UploadFile,
        dataset_id: Optional[str] = None,
        category: str = "documents"
    ) -> Dict[str, Any]:
        """Store file using appropriate PostgreSQL strategy"""

        try:
            logger.info(f"PostgreSQL file service: storing file {file.filename} for tenant {self.tenant_domain}, user {self.user_id}")
            logger.info(f"Dataset ID: {dataset_id}, Category: {category}")
            # Read file content
            content = await file.read()
            file_size = len(content)

            # Generate file metadata
            file_hash = hashlib.sha256(content).hexdigest()[:16]
            content_type = file.content_type or mimetypes.guess_type(file.filename)[0] or "application/octet-stream"

            # Handle different file types with appropriate processing
            if file_size <= self.SMALL_FILE_THRESHOLD and content_type.startswith('text/'):
                # Small text files stored directly
                storage_type = "text"
                storage_ref = "content_text"
                try:
                    text_content = content.decode('utf-8')
                except UnicodeDecodeError:
                    text_content = content.decode('latin-1')  # Fallback encoding
            elif content_type == 'application/pdf':
                # PDF files: extract text content, store binary separately
                storage_type = "pdf_extracted"
                storage_ref = "content_text"
                text_content = await self._extract_pdf_text(content)
            else:
                # Other binary files: store as base64 for now
                import base64
                storage_type = "base64"
                storage_ref = "content_text"
                text_content = base64.b64encode(content).decode('utf-8')

            # Get PostgreSQL client
            logger.info("Getting PostgreSQL client")
            pg_client = await get_postgresql_client()

            # Always expect user_id to be a UUID string - no email lookups
            logger.info(f"Using user UUID: {self.user_id}")

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            logger.info(f"Validated user UUID: {user_uuid}")

            # 1. Validate user_uuid is present
            if not user_uuid:
                raise ValueError("User UUID is required but not found")

            # 2. Validate and clean dataset_id
            dataset_uuid_param = None
            if dataset_id and dataset_id.strip() and dataset_id != "":
                try:
                    import uuid
                    dataset_uuid_param = str(uuid.UUID(dataset_id.strip()))
                    logger.info(f"Dataset UUID validated: {dataset_uuid_param}")
                except ValueError as e:
                    logger.error(f"Invalid dataset UUID: {dataset_id}, error: {e}")
                    raise ValueError(f"Invalid dataset ID format: {dataset_id}")
            else:
                logger.info("No dataset_id provided, using NULL")

            # 3. Validate file content and metadata
            if not file.filename or not file.filename.strip():
                raise ValueError("Filename cannot be empty")

            if not content:
                raise ValueError("File content cannot be empty")

            # 4. Generate and validate all string parameters
            safe_filename = f"{file_hash}_{file.filename}"
            safe_original_filename = file.filename.strip()
            safe_content_type = content_type or "application/octet-stream"
            safe_file_hash = file_hash
            safe_metadata = json.dumps({
                "storage_type": storage_type,
                "storage_ref": storage_ref,
                "category": category
            })

            logger.info(f"All parameters validated:")
            logger.info(f"  user_uuid: {user_uuid}")
            logger.info(f"  dataset_uuid: {dataset_uuid_param}")
            logger.info(f"  filename: {safe_filename}")
            logger.info(f"  original_filename: {safe_original_filename}")
            logger.info(f"  file_type: {safe_content_type}")
            logger.info(f"  file_size: {file_size}")
            logger.info(f"  file_hash: {safe_file_hash}")

            # Store metadata in documents table (using existing schema)
            try:
                # Application user now has BYPASSRLS privilege - no RLS context needed
                logger.info("Storing document with BYPASSRLS privilege")

                # Require dataset_id for all document uploads
                if not dataset_uuid_param:
                    raise ValueError("dataset_id is required for document uploads")

                logger.info(f"Storing document with dataset_id: {dataset_uuid_param}")
                logger.info(f"Document details: {safe_filename} ({file_size} bytes)")

                # Insert with dataset_id
                # Determine if content is searchable (under PostgreSQL tsvector size limit)
                is_searchable = text_content is None or len(text_content) < 1048575

                async with pg_client.get_connection() as conn:
                    # Get tenant_id for the document
                    tenant_id = await conn.fetchval("""
                        SELECT id FROM tenants WHERE domain = $1 LIMIT 1
                    """, self.tenant_domain)

                    if not tenant_id:
                        raise ValueError(f"Tenant not found for domain: {self.tenant_domain}")

                    document_id = await conn.fetchval("""
                        INSERT INTO documents (
                            tenant_id, user_id, dataset_id, filename, original_filename,
                            file_type, file_size_bytes, file_hash, content_text, processing_status,
                            metadata, is_searchable, created_at, updated_at
                        ) VALUES (
                            $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, 'pending', $10, $11, NOW(), NOW()
                        )
                        RETURNING id
                    """,
                        tenant_id, user_uuid, dataset_uuid_param, safe_filename, safe_original_filename,
                        safe_content_type, file_size, safe_file_hash, text_content,
                        safe_metadata, is_searchable
                    )
                    logger.info(f"Document inserted successfully with ID: {document_id}")

            except Exception as db_error:
                logger.error(f"Database insertion failed: {db_error}")
                logger.error(f"Tenant domain: {self.tenant_domain}")
                logger.error(f"User ID: {self.user_id}")
                logger.error(f"Dataset ID: {dataset_id}")
                raise

            result = {
                "id": document_id,
                "filename": file.filename,
                "content_type": content_type,
                "file_size": file_size,
                "file_hash": file_hash,
                "storage_type": storage_type,
                "storage_ref": storage_ref,
                "upload_timestamp": datetime.utcnow().isoformat(),
                "download_url": f"/api/v1/files/{document_id}"
            }

            logger.info(f"Stored file {file.filename} ({file_size} bytes) as {storage_type} for user {self.user_id}")

            # Trigger document processing pipeline for RAG functionality
            try:
                await self._trigger_document_processing(document_id, dataset_id, user_uuid, file.filename)
                logger.info(f"Successfully triggered document processing for {document_id}")
            except Exception as process_error:
                logger.error(f"Failed to trigger document processing for {document_id}: {process_error}")
                # Update document status to show processing failed
                try:
                    pg_client = await get_postgresql_client()
                    await pg_client.execute_command(
                        "UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
                        f"Processing failed: {str(process_error)}", document_id
                    )
                except Exception as update_error:
                    logger.error(f"Failed to update document status after processing error: {update_error}")
                # Don't fail the upload if processing trigger fails - user can retry manually

            return result

        except Exception as e:
            logger.error(f"Failed to store file {file.filename}: {e}")
            raise
        finally:
            # Ensure content is cleared from memory
            if 'content' in locals():
                del content

    async def _store_as_bytea(
        self,
        content: bytes,
        filename: str,
        content_type: str,
        file_hash: str,
        dataset_id: Optional[str],
        category: str
    ) -> str:
        """Store small file as BYTEA in documents table"""

        pg_client = await get_postgresql_client()

        # Store file content directly in BYTEA column
        # This will be handled by the main insert in store_file
        return "bytea_column"

    async def _store_as_lob(
        self,
        content: bytes,
        filename: str,
        content_type: str,
        file_hash: str,
        dataset_id: Optional[str],
        category: str
    ) -> str:
        """Store large file as PostgreSQL Large Object"""

        pg_client = await get_postgresql_client()

        # Create Large Object and get OID
        async with pg_client.get_connection() as conn:
            # Start transaction for LOB operations
            async with conn.transaction():
                # Create LOB and get OID
                lob_oid = await conn.fetchval("SELECT lo_create(0)")

                # Open LOB for writing
                lob_fd = await conn.fetchval("SELECT lo_open($1, 131072)", lob_oid)  # INV_WRITE mode

                # Write content in chunks for memory efficiency
                chunk_size = 8192
                offset = 0
                for i in range(0, len(content), chunk_size):
                    chunk = content[i:i + chunk_size]
                    await conn.execute("SELECT lo_write($1, $2)", lob_fd, chunk)
                    offset += len(chunk)

                # Close LOB
                await conn.execute("SELECT lo_close($1)", lob_fd)

        logger.info(f"Created PostgreSQL LOB with OID {lob_oid} for {filename}")
        return str(lob_oid)
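
    # NOTE: PostgreSQL large-object descriptors returned by lo_open() are only
    # valid for the duration of the surrounding transaction, which is why
    # _store_as_lob() and _stream_from_lob() wrap every lo_* call inside
    # conn.transaction(); outside a transaction the server would typically
    # reject lo_read()/lo_write() with an "invalid large-object descriptor" error.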

    async def _store_as_filesystem(
        self,
        content: bytes,
        filename: str,
        content_type: str,
        file_hash: str,
        dataset_id: Optional[str],
        category: str
    ) -> str:
        """Store massive file on filesystem with PostgreSQL metadata"""

        # Create secure file path with user isolation
        user_dir = self.filesystem_base / self.user_id / category
        if dataset_id:
            user_dir = user_dir / dataset_id

        user_dir.mkdir(parents=True, exist_ok=True, mode=0o700)

        # Generate secure filename
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        secure_filename = f"{timestamp}_{file_hash}_{filename}"
        file_path = user_dir / secure_filename

        # Write file with secure permissions
        async with aiofiles.open(file_path, 'wb') as f:
            await f.write(content)

        # Set secure file permissions
        os.chmod(file_path, 0o600)

        logger.info(f"Stored large file on filesystem: {file_path}")
        return str(file_path)

    async def get_file(self, document_id: str) -> AsyncIterator[bytes]:
        """Stream file content by document ID"""

        try:
            pg_client = await get_postgresql_client()

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Get document metadata using UUID directly
            # Admins can access any document in their tenant, regular users only their own
            if self.user_role in ADMIN_ROLES:
                doc_info = await pg_client.fetch_one("""
                    SELECT metadata, file_size_bytes, filename, content_text
                    FROM documents d
                    WHERE d.id = $1
                    AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                """, document_id, self.tenant_domain)
            else:
                doc_info = await pg_client.fetch_one("""
                    SELECT metadata, file_size_bytes, filename, content_text
                    FROM documents
                    WHERE id = $1 AND user_id = $2::uuid
                """, document_id, user_uuid)

            if not doc_info:
                raise FileNotFoundError(f"Document {document_id} not found")

            # Get storage info from metadata - handle JSON string or dict
            metadata_raw = doc_info["metadata"] or "{}"
            if isinstance(metadata_raw, str):
                import json
                metadata = json.loads(metadata_raw)
            else:
                metadata = metadata_raw or {}
            storage_type = metadata.get("storage_type", "text")

            if storage_type == "text":
                # Text content stored directly
                if doc_info["content_text"]:
                    content_bytes = doc_info["content_text"].encode('utf-8')
                    async for chunk in self._stream_from_bytea(content_bytes):
                        yield chunk
                else:
                    raise FileNotFoundError(f"Document content not found")

            elif storage_type == "base64":
                # Base64 encoded binary content
                if doc_info["content_text"]:
                    import base64
                    content_bytes = base64.b64decode(doc_info["content_text"])
                    async for chunk in self._stream_from_bytea(content_bytes):
                        yield chunk
                else:
                    raise FileNotFoundError(f"Document content not found")

            elif storage_type == "lob":
                # Stream from PostgreSQL LOB
                storage_ref = metadata.get("storage_ref", "")
                async for chunk in self._stream_from_lob(int(storage_ref)):
                    yield chunk

            elif storage_type == "filesystem":
                # Stream from filesystem
                storage_ref = metadata.get("storage_ref", "")
                async for chunk in self._stream_from_filesystem(storage_ref):
                    yield chunk
            else:
                # Default: treat as text content
                if doc_info["content_text"]:
                    content_bytes = doc_info["content_text"].encode('utf-8')
                    async for chunk in self._stream_from_bytea(content_bytes):
                        yield chunk
                else:
                    raise FileNotFoundError(f"Document content not found")

        except Exception as e:
            logger.error(f"Failed to get file {document_id}: {e}")
            raise
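
    # Illustrative only: get_file() yields an async byte iterator, so a route can
    # hand it straight to FastAPI/Starlette's StreamingResponse. A minimal sketch
    # (the endpoint path, router, and get_file_service dependency are assumptions,
    # not part of this module):
    #
    #     from fastapi.responses import StreamingResponse
    #
    #     @router.get("/files/{document_id}")
    #     async def download(document_id: str,
    #                        svc: PostgreSQLFileService = Depends(get_file_service)):
    #         info = await svc.get_file_info(document_id)
    #         return StreamingResponse(svc.get_file(document_id),
    #                                  media_type=info["content_type"])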

    async def _stream_from_bytea(self, content: bytes) -> AsyncIterator[bytes]:
        """Stream content from BYTEA in chunks"""
        chunk_size = 8192
        for i in range(0, len(content), chunk_size):
            yield content[i:i + chunk_size]

    async def _stream_from_lob(self, lob_oid: int) -> AsyncIterator[bytes]:
        """Stream content from PostgreSQL Large Object"""

        pg_client = await get_postgresql_client()

        async with pg_client.get_connection() as conn:
            async with conn.transaction():
                # Open LOB for reading
                lob_fd = await conn.fetchval("SELECT lo_open($1, 262144)", lob_oid)  # INV_READ mode

                # Stream in chunks
                chunk_size = 8192
                while True:
                    chunk = await conn.fetchval("SELECT lo_read($1, $2)", lob_fd, chunk_size)
                    if not chunk:
                        break
                    yield chunk

                # Close LOB
                await conn.execute("SELECT lo_close($1)", lob_fd)

    async def _stream_from_filesystem(self, file_path: str) -> AsyncIterator[bytes]:
        """Stream content from filesystem"""

        # Verify file belongs to tenant (security check)
        path_obj = Path(file_path)
        if not str(path_obj).startswith(str(self.filesystem_base)):
            raise PermissionError("Access denied to file")

        if not path_obj.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        async with aiofiles.open(file_path, 'rb') as f:
            chunk_size = 8192
            while True:
                chunk = await f.read(chunk_size)
                if not chunk:
                    break
                yield chunk
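
    # Illustrative only: the prefix check above assumes file_path is already an
    # absolute, normalised path under filesystem_base (as produced by
    # _store_as_filesystem). A sketch of an equivalent containment check that
    # also resolves symlinks and ".." segments, shown for reference and not what
    # this method currently does:
    #
    #     resolved = Path(file_path).resolve()
    #     resolved.relative_to(self.filesystem_base.resolve())  # raises ValueError if outside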

    async def delete_file(self, document_id: str) -> bool:
        """Delete file and metadata"""

        try:
            pg_client = await get_postgresql_client()

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Get document info before deletion
            # Admins can delete any document in their tenant, regular users only their own
            if self.user_role in ADMIN_ROLES:
                doc_info = await pg_client.fetch_one("""
                    SELECT storage_type, storage_ref FROM documents d
                    WHERE d.id = $1
                    AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                """, document_id, self.tenant_domain)
            else:
                doc_info = await pg_client.fetch_one("""
                    SELECT storage_type, storage_ref FROM documents
                    WHERE id = $1 AND user_id = $2::uuid
                """, document_id, user_uuid)

            if not doc_info:
                logger.warning(f"Document {document_id} not found for deletion")
                return False

            storage_type = doc_info["storage_type"]
            storage_ref = doc_info["storage_ref"]

            # Delete file content based on storage type
            if storage_type == "lob":
                # Delete LOB
                async with pg_client.get_connection() as conn:
                    await conn.execute("SELECT lo_unlink($1)", int(storage_ref))
            elif storage_type == "filesystem":
                # Delete filesystem file
                try:
                    path_obj = Path(storage_ref)
                    if path_obj.exists():
                        path_obj.unlink()
                except Exception as e:
                    logger.warning(f"Failed to delete filesystem file {storage_ref}: {e}")
            # BYTEA files are deleted with the row

            # Delete metadata record
            if self.user_role in ADMIN_ROLES:
                deleted = await pg_client.execute_command("""
                    DELETE FROM documents d
                    WHERE d.id = $1
                    AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                """, document_id, self.tenant_domain)
            else:
                deleted = await pg_client.execute_command("""
                    DELETE FROM documents WHERE id = $1 AND user_id = $2::uuid
                """, document_id, user_uuid)

            if deleted > 0:
                logger.info(f"Deleted file {document_id} ({storage_type})")
                return True
            else:
                return False

        except Exception as e:
            logger.error(f"Failed to delete file {document_id}: {e}")
            return False

    async def get_file_info(self, document_id: str) -> Dict[str, Any]:
        """Get file metadata"""

        try:
            pg_client = await get_postgresql_client()

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Admins can access any document metadata in their tenant, regular users only their own
            if self.user_role in ADMIN_ROLES:
                doc_info = await pg_client.fetch_one("""
                    SELECT id, filename, original_filename, file_type as content_type, file_size_bytes as file_size,
                           file_hash, dataset_id, metadata->'storage_type' as storage_type, metadata->'category' as category, created_at
                    FROM documents d
                    WHERE d.id = $1
                    AND d.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                """, document_id, self.tenant_domain)
            else:
                doc_info = await pg_client.fetch_one("""
                    SELECT id, filename, original_filename, file_type as content_type, file_size_bytes as file_size,
                           file_hash, dataset_id, metadata->'storage_type' as storage_type, metadata->'category' as category, created_at
                    FROM documents
                    WHERE id = $1 AND user_id = $2::uuid
                """, document_id, user_uuid)

            if not doc_info:
                raise FileNotFoundError(f"Document {document_id} not found")

            return {
                "id": doc_info["id"],
                "filename": doc_info["filename"],
                "original_filename": doc_info["original_filename"],
                "content_type": doc_info["content_type"],
                "file_size": doc_info["file_size"],
                "file_hash": doc_info["file_hash"],
                "dataset_id": str(doc_info["dataset_id"]) if doc_info["dataset_id"] else None,
                "storage_type": doc_info["storage_type"],
                "category": doc_info["category"],
                "created_at": doc_info["created_at"].isoformat(),
                "download_url": f"/api/v1/files/{document_id}"
            }

        except Exception as e:
            logger.error(f"Failed to get file info for {document_id}: {e}")
            raise

    async def list_files(
        self,
        dataset_id: Optional[str] = None,
        category: str = "documents",
        limit: int = 50,
        offset: int = 0
    ) -> List[Dict[str, Any]]:
        """List user files with optional filtering"""

        try:
            pg_client = await get_postgresql_client()

            # Validate user_id is a valid UUID format
            try:
                import uuid
                user_uuid = str(uuid.UUID(self.user_id))
            except (ValueError, TypeError) as e:
                logger.error(f"Invalid user UUID format: {self.user_id}, error: {e}")
                raise ValueError(f"Invalid user ID format. Expected UUID, got: {self.user_id}")

            # Build permission-aware query
            # Admins can list any documents in their tenant
            # Regular users can list documents they own OR documents in datasets they can access
            if self.user_role in ADMIN_ROLES:
                where_clauses = ["d.tenant_id = (SELECT id FROM tenants WHERE domain = $1)"]
                params = [self.tenant_domain]
                param_idx = 2
            else:
                # Non-admin users can see:
                # 1. Documents they own
                # 2. Documents in datasets with access_group = 'organization'
                # 3. Documents in datasets they're a member of (team access)
                where_clauses = ["""(
                    d.user_id = $1::uuid
                    OR EXISTS (
                        SELECT 1 FROM datasets ds
                        WHERE ds.id = d.dataset_id
                        AND ds.tenant_id = (SELECT id FROM tenants WHERE domain = $2)
                        AND (
                            ds.access_group = 'organization'
                            OR (ds.access_group = 'team' AND $1::uuid = ANY(ds.team_members))
                        )
                    )
                )"""]
                params = [user_uuid, self.tenant_domain]
                param_idx = 3

            if dataset_id:
                where_clauses.append(f"d.dataset_id = ${param_idx}::uuid")
                params.append(dataset_id)
                param_idx += 1

            if category:
                where_clauses.append(f"(d.metadata->>'category' = ${param_idx} OR d.metadata->>'category' IS NULL)")
                params.append(category)
                param_idx += 1

            query = f"""
                SELECT d.id, d.filename, d.original_filename, d.file_type as content_type, d.file_size_bytes as file_size,
                       d.metadata->>'storage_type' as storage_type, d.metadata->>'category' as category, d.created_at, d.updated_at, d.dataset_id,
                       d.processing_status, d.metadata, d.user_id, COUNT(dc.id) as chunk_count,
                       ds.created_by as dataset_owner_id
                FROM documents d
                LEFT JOIN document_chunks dc ON d.id = dc.document_id
                LEFT JOIN datasets ds ON d.dataset_id = ds.id
                WHERE {' AND '.join(where_clauses)}
                GROUP BY d.id, d.filename, d.original_filename, d.file_type, d.file_size_bytes, d.metadata, d.created_at, d.updated_at, d.dataset_id, d.processing_status, d.user_id, ds.created_by
                ORDER BY d.created_at DESC LIMIT ${param_idx} OFFSET ${param_idx + 1}
            """
            params.extend([limit, offset])

            files = await pg_client.execute_query(query, *params)

            # Helper function to parse metadata
            def parse_metadata(metadata_value):
                if metadata_value is None:
                    return {}
                if isinstance(metadata_value, str):
                    import json
                    try:
                        return json.loads(metadata_value)
                    except (json.JSONDecodeError, ValueError):
                        return {}
                return metadata_value if isinstance(metadata_value, dict) else {}

            return [
                {
                    "id": file["id"],
                    "filename": file["filename"],
                    "original_filename": file["original_filename"],
                    "content_type": file["content_type"],
                    "file_type": file["content_type"],
                    "file_size": file["file_size"],
                    "file_size_bytes": file["file_size"],
                    "dataset_id": file["dataset_id"],
                    "storage_type": file["storage_type"],
                    "category": file["category"],
                    "created_at": file["created_at"].isoformat(),
                    "updated_at": file["updated_at"].isoformat() if file.get("updated_at") else None,
                    "processing_status": file.get("processing_status", "pending"),
                    "chunk_count": file.get("chunk_count", 0),
                    "chunks_processed": parse_metadata(file.get("metadata")).get("chunks_processed", 0),
                    "total_chunks_expected": parse_metadata(file.get("metadata")).get("total_chunks_expected", 0),
                    "processing_progress": parse_metadata(file.get("metadata")).get("processing_progress", 0),
                    "processing_stage": parse_metadata(file.get("metadata")).get("processing_stage"),
                    "download_url": f"/api/v1/files/{file['id']}",
                    # Permission flags - user can delete if:
                    # 1. They are admin, OR
                    # 2. They uploaded the document, OR
                    # 3. They own the parent dataset
                    "can_delete": (
                        self.user_role in ADMIN_ROLES or
                        file["user_id"] == user_uuid or
                        (file.get("dataset_owner_id") and str(file["dataset_owner_id"]) == user_uuid)
                    )
                }
                for file in files
            ]

        except Exception as e:
            logger.error(f"Failed to list files for user {self.user_id}: {e}")
            return []

    async def cleanup_orphaned_files(self) -> int:
        """Clean up orphaned files and LOBs"""

        try:
            pg_client = await get_postgresql_client()
            cleanup_count = 0

            # Find orphaned LOBs (LOBs without corresponding document records)
            async with pg_client.get_connection() as conn:
                async with conn.transaction():
                    orphaned_lobs = await conn.fetch("""
                        SELECT lo.oid FROM pg_largeobject_metadata lo
                        LEFT JOIN documents d ON lo.oid::text = d.storage_ref
                        WHERE d.storage_ref IS NULL AND d.storage_type = 'lob'
                    """)

                    for lob in orphaned_lobs:
                        await conn.execute("SELECT lo_unlink($1)", lob["oid"])
                        cleanup_count += 1

            # Find orphaned filesystem files
            # Note: This would require more complex logic to safely identify orphans

            logger.info(f"Cleaned up {cleanup_count} orphaned files")
            return cleanup_count

        except Exception as e:
            logger.error(f"Failed to cleanup orphaned files: {e}")
            return 0

    async def _trigger_document_processing(
        self,
        document_id: str,
        dataset_id: Optional[str],
        user_uuid: str,
        filename: str
    ):
        """Trigger document processing pipeline for RAG functionality"""
        try:
            # Import here to avoid circular imports
            from app.services.document_processor import get_document_processor

            logger.info(f"Triggering document processing for {document_id}")

            # Get document processor instance
            processor = await get_document_processor(tenant_domain=self.tenant_domain)

            # For documents uploaded via PostgreSQL file service, the content is already stored
            # We need to process it from the database content rather than a file path
            await self._process_document_from_database(
                processor, document_id, dataset_id, user_uuid, filename
            )

        except Exception as e:
            logger.error(f"Document processing trigger failed for {document_id}: {e}")
            # Update document status to failed
            try:
                pg_client = await get_postgresql_client()
                await pg_client.execute_command(
                    "UPDATE documents SET processing_status = 'failed', error_message = $1 WHERE id = $2",
                    f"Processing trigger failed: {str(e)}", document_id
                )
            except Exception as update_error:
                logger.error(f"Failed to update document status to failed: {update_error}")
            raise

    async def _process_document_from_database(
        self,
        processor,
        document_id: str,
        dataset_id: Optional[str],
        user_uuid: str,
        filename: str
    ):
        """Process document using content already stored in database"""
        try:
            import tempfile
            import os
            from pathlib import Path

            # Get document content from database
            pg_client = await get_postgresql_client()
            doc_info = await pg_client.fetch_one("""
                SELECT content_text, file_type, metadata
                FROM documents
                WHERE id = $1 AND user_id = $2::uuid
            """, document_id, user_uuid)

            if not doc_info or not doc_info["content_text"]:
                raise ValueError("Document content not found in database")

            # Create temporary file with the content
            # Sanitize the file extension to prevent path injection
            safe_suffix = sanitize_filename(filename)
            safe_suffix = Path(safe_suffix).suffix if safe_suffix else ".tmp"
            # codeql[py/path-injection] safe_suffix is sanitized via sanitize_filename()
            with tempfile.NamedTemporaryFile(mode='w', suffix=safe_suffix, delete=False) as temp_file:
                # Handle different storage types - metadata might be JSON string or dict
                metadata_raw = doc_info["metadata"] or "{}"
                if isinstance(metadata_raw, str):
                    import json
                    metadata = json.loads(metadata_raw)
                else:
                    metadata = metadata_raw or {}
                storage_type = metadata.get("storage_type", "text")

                if storage_type == "text":
                    temp_file.write(doc_info["content_text"])
                elif storage_type == "base64":
                    import base64
                    content_bytes = base64.b64decode(doc_info["content_text"])
                    temp_file.close()
                    with open(temp_file.name, 'wb') as binary_file:
                        binary_file.write(content_bytes)
                elif storage_type == "pdf_extracted":
                    # For PDFs with extracted text, create a placeholder text file
                    # since the actual text content is already extracted
                    temp_file.write(doc_info["content_text"])
                else:
                    temp_file.write(doc_info["content_text"])

                temp_file_path = Path(temp_file.name)

            try:
                # Process the document using the existing document processor
                await processor.process_file(
                    file_path=temp_file_path,
                    dataset_id=dataset_id,  # Keep None as None - don't convert to empty string
                    user_id=user_uuid,
                    original_filename=filename,
                    document_id=document_id  # Use existing document instead of creating new one
                )

                logger.info(f"Successfully processed document {document_id} from database content")

            finally:
                # Clean up temporary file
                try:
                    os.unlink(temp_file_path)
                except Exception as cleanup_error:
                    logger.warning(f"Failed to cleanup temporary file {temp_file_path}: {cleanup_error}")

        except Exception as e:
            logger.error(f"Failed to process document from database content: {e}")
            raise

    async def _extract_pdf_text(self, content: bytes) -> str:
        """Extract text content from PDF bytes using pypdf"""
        import io
        import pypdf as PyPDF2  # pypdf is the maintained successor to PyPDF2

        try:
            # Create BytesIO object from content
            pdf_stream = io.BytesIO(content)
            pdf_reader = PyPDF2.PdfReader(pdf_stream)

            text_parts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
                except Exception as e:
                    logger.warning(f"Could not extract text from page {page_num + 1}: {e}")

            if not text_parts:
                # If no text could be extracted, return a placeholder
                return f"PDF document with {len(pdf_reader.pages)} pages (text extraction failed)"

            extracted_text = "\n\n".join(text_parts)
            logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF with {len(pdf_reader.pages)} pages")
            return extracted_text

        except Exception as e:
            logger.error(f"PDF text extraction failed: {e}")
            # Return a fallback description instead of failing completely
            return f"PDF document (text extraction failed: {str(e)})"
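
# Illustrative usage sketch (not part of this module): an upload endpoint would
# typically construct the service per request from the authenticated context,
# store the upload, and later stream it back. `request_ctx` and `upload` are
# assumptions for the example only.
#
#     svc = PostgreSQLFileService(
#         tenant_domain=request_ctx.tenant_domain,
#         user_id=request_ctx.user_id,
#         user_role=request_ctx.role,
#     )
#     meta = await svc.store_file(upload, dataset_id=dataset_id)
#     async for chunk in svc.get_file(meta["id"]):
#         ...  # forward chunks to the client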