gt-ai-os-community/apps/tenant-backend/app/api/v1/documents.py

"""
GT 2.0 Documents API - Wrapper for Files API
Provides document-centric interface that wraps the underlying files API.
This maintains the document abstraction for the frontend while leveraging
the existing file storage infrastructure.
"""
import logging
from fastapi import APIRouter, HTTPException, Depends, Query, UploadFile, File, Form
from typing import Dict, Any, List, Optional
from app.core.security import get_current_user
from app.api.v1.files import (
    get_file_info,
    download_file,
    delete_file,
    list_files,
    get_document_summary as get_file_summary,
    upload_file,
)

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/documents", tags=["documents"])
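
# Route map: every handler below except the two processing endpoints is a thin
# proxy onto the files API imported above (documents are files stored in the
# "documents" category):
#
#   POST   /documents                    -> upload_file
#   GET    /documents                    -> list_files (category="documents")
#   GET    /documents/{id}               -> get_file_info
#   GET    /documents/{id}/summary       -> get_file_summary
#   GET    /documents/{id}/download      -> download_file
#   DELETE /documents/{id}               -> delete_file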
@router.post("", status_code=201)
@router.post("/", status_code=201) # Support both with and without trailing slash
async def upload_document(
file: UploadFile = File(...),
dataset_id: Optional[str] = Form(None, description="Associate with dataset"),
current_user: Dict[str, Any] = Depends(get_current_user)
):
"""Upload document (proxy to files API) - accepts dataset_id from FormData"""
try:
logger.info(f"Document upload requested - file: {file.filename}, dataset_id: {dataset_id}")
# Proxy to files upload endpoint with "documents" category
return await upload_file(file, dataset_id, "documents", current_user)
except HTTPException:
raise
except Exception as e:
logger.error(f"Document upload failed: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error")
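
# A minimal client sketch for the upload route. The "/api/v1" mount prefix is
# an assumption (only the "/documents" prefix is visible in this file), and the
# host, token, and file values are placeholders:
#
#   curl -X POST "https://<tenant-host>/api/v1/documents" \
#        -H "Authorization: Bearer <token>" \
#        -F "file=@report.pdf" \
#        -F "dataset_id=<dataset-uuid>"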

@router.get("/{document_id}")
async def get_document(
    document_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """Get document details (proxy to files API)"""
    try:
        # Proxy to files API - documents are stored as files
        return await get_file_info(document_id, current_user)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to get document {document_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")

@router.get("/{document_id}/summary")
async def get_document_summary(
    document_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """Get AI-generated summary for a document (proxy to files API)"""
    try:
        # Proxy to files summary endpoint
        # codeql[py/stack-trace-exposure] proxies to files API, returns summary dict
        return await get_file_summary(document_id, current_user)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Document summary generation failed for {document_id}: {e}", exc_info=True)
        # Return a fallback response
        return {
            "summary": "Summary generation is currently unavailable. Please try again later.",
            "key_topics": [],
            "document_type": "unknown",
            "language": "en",
            "metadata": {}
        }

@router.get("")
async def list_documents(
    dataset_id: Optional[str] = Query(None, description="Filter by dataset"),
    status: Optional[str] = Query(None, description="Filter by processing status"),
    limit: int = Query(50, ge=1, le=100),
    offset: int = Query(0, ge=0),
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """List documents with optional filtering (proxy to files API)"""
    try:
        # Map the documents request to the files API:
        # documents are files in the "documents" category.
        # NOTE: the status filter is accepted for API compatibility but is not
        # currently forwarded to list_files.
        result = await list_files(
            dataset_id=dataset_id,
            category="documents",
            limit=limit,
            offset=offset,
            current_user=current_user
        )
        # Extract just the files array from the response object.
        # list_files returns {files: [...], total: N, limit: N, offset: N},
        # but the frontend expects just the array.
        if isinstance(result, dict) and 'files' in result:
            return result['files']
        elif isinstance(result, list):
            return result
        else:
            logger.warning(f"Unexpected response format from list_files: {type(result)}")
            return []
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to list documents: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")
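
# Illustrative shapes for the list route (field values are invented; the keys
# follow the envelope described in the comments above):
#
#   from list_files: {"files": [{"id": "abc", ...}], "total": 1, "limit": 50, "offset": 0}
#   returned here:   [{"id": "abc", ...}]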

@router.delete("/{document_id}")
async def delete_document(
    document_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """Delete document and its metadata (proxy to files API)"""
    try:
        # Proxy to files delete endpoint
        return await delete_file(document_id, current_user)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to delete document {document_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")

@router.get("/{document_id}/download")
async def download_document(
    document_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """Download document file (proxy to files API)"""
    try:
        # Proxy to files download endpoint
        return await download_file(document_id, current_user)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to download document {document_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")

@router.post("/{document_id}/process")
async def process_document(
    document_id: str,
    chunking_strategy: Optional[str] = Query("hybrid", description="Chunking strategy"),
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """Trigger document processing (chunking and embedding generation)"""
    try:
        from app.services.document_processor import get_document_processor
        from app.core.user_resolver import resolve_user_uuid

        logger.info(f"Manual processing requested for document {document_id}")

        # Get user info
        tenant_domain, user_email, user_uuid = await resolve_user_uuid(current_user)

        # Get document processor
        processor = await get_document_processor(tenant_domain=tenant_domain)

        # Get document info to verify it exists and to read its metadata
        from app.services.postgresql_file_service import PostgreSQLFileService
        file_service = PostgreSQLFileService(tenant_domain=tenant_domain, user_id=user_uuid)
        try:
            doc_info = await file_service.get_file_info(document_id)
        except FileNotFoundError:
            raise HTTPException(status_code=404, detail="Document not found")

        # Trigger processing using the file service's processing method.
        # NOTE: chunking_strategy is echoed back in the response below but is
        # not forwarded to this call.
        await file_service._process_document_from_database(
            processor=processor,
            document_id=document_id,
            dataset_id=doc_info.get("dataset_id"),
            user_uuid=user_uuid,
            filename=doc_info["original_filename"]
        )

        return {
            "status": "success",
            "message": "Document processing started",
            "document_id": document_id,
            "chunking_strategy": chunking_strategy
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to process document {document_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")
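
# A sketch of triggering processing manually, assuming the same "/api/v1" mount
# prefix as in the upload example; "hybrid" is the declared default strategy:
#
#   curl -X POST "https://<tenant-host>/api/v1/documents/<document-id>/process?chunking_strategy=hybrid" \
#        -H "Authorization: Bearer <token>"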

@router.post("/processing-status")
async def get_processing_status(
    request: Dict[str, Any],
    current_user: Dict[str, Any] = Depends(get_current_user)
):
    """Get processing status for multiple documents"""
    try:
        from app.services.document_processor import get_document_processor
        from app.core.user_resolver import resolve_user_uuid

        # Get user info
        tenant_domain, user_email, user_uuid = await resolve_user_uuid(current_user)

        # Get document IDs from request
        document_ids = request.get("document_ids", [])
        if not document_ids:
            raise HTTPException(status_code=400, detail="document_ids required")

        # Get processor instance
        processor = await get_document_processor(tenant_domain=tenant_domain)

        # Get status for each document; a failure for one document must not
        # fail the whole batch, so errors are recorded per document ID
        status_results = {}
        for doc_id in document_ids:
            try:
                status_info = await processor.get_processing_status(doc_id)
                status_results[doc_id] = {
                    "status": status_info["status"],
                    "error_message": status_info["error_message"],
                    "progress": status_info["processing_progress"],
                    "stage": status_info["processing_stage"],
                    "chunks_processed": status_info["chunks_processed"],
                    "total_chunks_expected": status_info["total_chunks_expected"]
                }
            except Exception as e:
                logger.error(f"Failed to get status for doc {doc_id}: {e}", exc_info=True)
                status_results[doc_id] = {
                    "status": "error",
                    "error_message": "Failed to get processing status",
                    "progress": 0,
                    "stage": "unknown"
                }
        return status_results
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to get processing status: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")
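
# Illustrative request/response for the batch status route. The keys mirror the
# dicts built above; the example values are invented, and the actual status and
# stage vocabulary comes from the document processor, not this file:
#
#   request:  {"document_ids": ["doc-1", "doc-2"]}
#   response: {"doc-1": {"status": "processing", "error_message": null,
#                        "progress": 40, "stage": "embedding",
#                        "chunks_processed": 8, "total_chunks_expected": 20},
#              "doc-2": {"status": "error",
#                        "error_message": "Failed to get processing status",
#                        "progress": 0, "stage": "unknown"}}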