Security hardening release addressing CodeQL and Dependabot alerts: - Fix stack trace exposure in error responses - Add SSRF protection with DNS resolution checking - Implement proper URL hostname validation (replaces substring matching) - Add centralized path sanitization to prevent path traversal - Fix ReDoS vulnerability in email validation regex - Improve HTML sanitization in validation utilities - Fix capability wildcard matching in auth utilities - Update glob dependency to address CVE - Add CodeQL suppression comments for verified false positives 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
256 lines
9.4 KiB
Python
256 lines
9.4 KiB
Python
"""
|
|
GT 2.0 Documents API - Wrapper for Files API
|
|
|
|
Provides document-centric interface that wraps the underlying files API.
|
|
This maintains the document abstraction for the frontend while leveraging
|
|
the existing file storage infrastructure.
|
|
"""
|
|
|
|
import logging
|
|
from fastapi import APIRouter, HTTPException, Depends, Query, UploadFile, File, Form
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from app.core.security import get_current_user
|
|
from app.api.v1.files import (
|
|
get_file_info,
|
|
download_file,
|
|
delete_file,
|
|
list_files,
|
|
get_document_summary as get_file_summary,
|
|
upload_file
|
|
)
|
|
|
|
# Module-level logger; inherits handlers/level from the application's logging config.
logger = logging.getLogger(__name__)

# All endpoints below are mounted under /documents and grouped in OpenAPI docs.
router = APIRouter(prefix="/documents", tags=["documents"])
|
|
|
|
|
|
@router.post("", status_code=201)
|
|
@router.post("/", status_code=201) # Support both with and without trailing slash
|
|
async def upload_document(
|
|
file: UploadFile = File(...),
|
|
dataset_id: Optional[str] = Form(None, description="Associate with dataset"),
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Upload document (proxy to files API) - accepts dataset_id from FormData"""
|
|
try:
|
|
logger.info(f"Document upload requested - file: {file.filename}, dataset_id: {dataset_id}")
|
|
# Proxy to files upload endpoint with "documents" category
|
|
return await upload_file(file, dataset_id, "documents", current_user)
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Document upload failed: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
|
|
@router.get("/{document_id}")
|
|
async def get_document(
|
|
document_id: str,
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Get document details (proxy to files API)"""
|
|
try:
|
|
# Proxy to files API - documents are stored as files
|
|
return await get_file_info(document_id, current_user)
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Failed to get document {document_id}: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
|
|
@router.get("/{document_id}/summary")
|
|
async def get_document_summary(
|
|
document_id: str,
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Get AI-generated summary for a document (proxy to files API)"""
|
|
try:
|
|
# Proxy to files summary endpoint
|
|
# codeql[py/stack-trace-exposure] proxies to files API, returns summary dict
|
|
return await get_file_summary(document_id, current_user)
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Document summary generation failed for {document_id}: {e}", exc_info=True)
|
|
# Return a fallback response
|
|
return {
|
|
"summary": "Summary generation is currently unavailable. Please try again later.",
|
|
"key_topics": [],
|
|
"document_type": "unknown",
|
|
"language": "en",
|
|
"metadata": {}
|
|
}
|
|
|
|
|
|
@router.get("")
|
|
async def list_documents(
|
|
dataset_id: Optional[str] = Query(None, description="Filter by dataset"),
|
|
status: Optional[str] = Query(None, description="Filter by processing status"),
|
|
limit: int = Query(50, ge=1, le=100),
|
|
offset: int = Query(0, ge=0),
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""List documents with optional filtering (proxy to files API)"""
|
|
try:
|
|
# Map documents request to files API
|
|
# Documents are files in the "documents" category
|
|
result = await list_files(
|
|
dataset_id=dataset_id,
|
|
category="documents",
|
|
limit=limit,
|
|
offset=offset,
|
|
current_user=current_user
|
|
)
|
|
|
|
# Extract just the files array from the response object
|
|
# The list_files endpoint returns {files: [...], total: N, limit: N, offset: N}
|
|
# But frontend expects just the array
|
|
if isinstance(result, dict) and 'files' in result:
|
|
return result['files']
|
|
elif isinstance(result, list):
|
|
return result
|
|
else:
|
|
logger.warning(f"Unexpected response format from list_files: {type(result)}")
|
|
return []
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Failed to list documents: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
|
|
@router.delete("/{document_id}")
|
|
async def delete_document(
|
|
document_id: str,
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Delete document and its metadata (proxy to files API)"""
|
|
try:
|
|
# Proxy to files delete endpoint
|
|
return await delete_file(document_id, current_user)
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Failed to delete document {document_id}: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
|
|
@router.get("/{document_id}/download")
|
|
async def download_document(
|
|
document_id: str,
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Download document file (proxy to files API)"""
|
|
try:
|
|
# Proxy to files download endpoint
|
|
return await download_file(document_id, current_user)
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Failed to download document {document_id}: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
|
|
@router.post("/{document_id}/process")
|
|
async def process_document(
|
|
document_id: str,
|
|
chunking_strategy: Optional[str] = Query("hybrid", description="Chunking strategy"),
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Trigger document processing (chunking and embedding generation)"""
|
|
try:
|
|
from app.services.document_processor import get_document_processor
|
|
from app.core.user_resolver import resolve_user_uuid
|
|
|
|
logger.info(f"Manual processing requested for document {document_id}")
|
|
|
|
# Get user info
|
|
tenant_domain, user_email, user_uuid = await resolve_user_uuid(current_user)
|
|
|
|
# Get document processor
|
|
processor = await get_document_processor(tenant_domain=tenant_domain)
|
|
|
|
# Get document info to verify it exists and get metadata
|
|
from app.services.postgresql_file_service import PostgreSQLFileService
|
|
file_service = PostgreSQLFileService(tenant_domain=tenant_domain, user_id=user_uuid)
|
|
|
|
try:
|
|
doc_info = await file_service.get_file_info(document_id)
|
|
except FileNotFoundError:
|
|
raise HTTPException(status_code=404, detail="Document not found")
|
|
|
|
# Trigger processing using the file service's processing method
|
|
await file_service._process_document_from_database(
|
|
processor=processor,
|
|
document_id=document_id,
|
|
dataset_id=doc_info.get("dataset_id"),
|
|
user_uuid=user_uuid,
|
|
filename=doc_info["original_filename"]
|
|
)
|
|
|
|
return {
|
|
"status": "success",
|
|
"message": "Document processing started",
|
|
"document_id": document_id,
|
|
"chunking_strategy": chunking_strategy
|
|
}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Failed to process document {document_id}: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
|
|
@router.post("/processing-status")
|
|
async def get_processing_status(
|
|
request: Dict[str, Any],
|
|
current_user: Dict[str, Any] = Depends(get_current_user)
|
|
):
|
|
"""Get processing status for multiple documents"""
|
|
try:
|
|
from app.services.document_processor import get_document_processor
|
|
from app.core.user_resolver import resolve_user_uuid
|
|
|
|
# Get user info
|
|
tenant_domain, user_email, user_uuid = await resolve_user_uuid(current_user)
|
|
|
|
# Get document IDs from request
|
|
document_ids = request.get("document_ids", [])
|
|
if not document_ids:
|
|
raise HTTPException(status_code=400, detail="document_ids required")
|
|
|
|
# Get processor instance
|
|
processor = await get_document_processor(tenant_domain=tenant_domain)
|
|
|
|
# Get status for each document
|
|
status_results = {}
|
|
for doc_id in document_ids:
|
|
try:
|
|
status_info = await processor.get_processing_status(doc_id)
|
|
status_results[doc_id] = {
|
|
"status": status_info["status"],
|
|
"error_message": status_info["error_message"],
|
|
"progress": status_info["processing_progress"],
|
|
"stage": status_info["processing_stage"],
|
|
"chunks_processed": status_info["chunks_processed"],
|
|
"total_chunks_expected": status_info["total_chunks_expected"]
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Failed to get status for doc {doc_id}: {e}", exc_info=True)
|
|
status_results[doc_id] = {
|
|
"status": "error",
|
|
"error_message": "Failed to get processing status",
|
|
"progress": 0,
|
|
"stage": "unknown"
|
|
}
|
|
|
|
return status_results
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Failed to get processing status: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail="Internal server error") |