Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
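The commit's URL-validation item refers to code elsewhere in this release (not in this file): substring checks are replaced by comparison of the parsed hostname. A minimal sketch of that idea, assuming a hypothetical allowlist and helper name:

from urllib.parse import urlparse

ALLOWED_HOSTS = {"trusted.example.com"}  # hypothetical allowlist

def is_allowed_url(url: str) -> bool:
    """Compare the exact parsed hostname instead of substring matching."""
    hostname = urlparse(url).hostname or ""
    return hostname.lower() in ALLOWED_HOSTS

# Substring matching would accept this URL because it contains the trusted
# string; exact hostname comparison rejects it.
assert not is_allowed_url("https://trusted.example.com.attacker.io/x")
assert is_allowed_url("https://trusted.example.com/ok")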
145 lines
5.1 KiB
Python
"""
|
|
RAG (Retrieval-Augmented Generation) API endpoints
|
|
"""
|
|
|
|
from fastapi import APIRouter, HTTPException, Depends
|
|
from typing import Dict, Any, List
|
|
from pydantic import BaseModel, Field
|
|
import logging
|
|
|
|
from app.core.security import capability_validator, CapabilityToken
|
|
from app.api.auth import verify_capability
|
|
|
|
router = APIRouter()
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DocumentUploadRequest(BaseModel):
|
|
"""Document upload request"""
|
|
content: str = Field(..., description="Document content")
|
|
metadata: Dict[str, Any] = Field(default={}, description="Document metadata")
|
|
collection: str = Field(default="default", description="Collection name")
|
|
|
|
|
|
class SearchRequest(BaseModel):
|
|
"""Semantic search request"""
|
|
query: str = Field(..., description="Search query")
|
|
collection: str = Field(default="default", description="Collection to search")
|
|
top_k: int = Field(default=5, ge=1, le=100, description="Number of results")
|
|
|
|
|
|
@router.post("/upload")
|
|
async def upload_document(
|
|
request: DocumentUploadRequest,
|
|
token: CapabilityToken = Depends(verify_capability)
|
|
) -> Dict[str, Any]:
|
|
"""Upload document for RAG processing"""
|
|
|
|
try:
|
|
import uuid
|
|
import hashlib
|
|
|
|
# Generate document ID
|
|
doc_id = f"doc_{uuid.uuid4().hex[:8]}"
|
|
|
|
# Create content hash for deduplication
|
|
content_hash = hashlib.sha256(request.content.encode()).hexdigest()[:16]
|
|
|
|
# Process the document content
|
|
# In production, this would:
|
|
# 1. Split document into chunks
|
|
# 2. Generate embeddings using the embedding service
|
|
# 3. Store in ChromaDB collection
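        #
        # A rough sketch of steps 1-2 (illustrative only; `embedding_service` is a
        # hypothetical client, not something this module provides; the store call is
        # sketched further below):
        #
        #   words = request.content.split()
        #   chunks = [" ".join(words[i:i + 200]) for i in range(0, len(words), 200)]
        #   embeddings = await embedding_service.embed(chunks)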

        # For now, simulate document processing
        word_count = len(request.content.split())
        chunk_count = max(1, word_count // 200)  # Simulate ~200 words per chunk

        # Store metadata with content
        document_data = {
            "document_id": doc_id,
            "content_hash": content_hash,
            "content": request.content,
            "metadata": request.metadata,
            "collection": request.collection,
            "tenant_id": token.tenant_id,
            "user_id": token.user_id,
            "word_count": word_count,
            "chunk_count": chunk_count
        }

        # In production: Store in ChromaDB
        # collection = chromadb_client.get_or_create_collection(request.collection)
        # collection.add(documents=[request.content], ids=[doc_id], metadatas=[request.metadata])

        logger.info(f"Document uploaded: {doc_id} ({word_count} words, {chunk_count} chunks)")

        return {
            "success": True,
            "document_id": doc_id,
            "content_hash": content_hash,
            "collection": request.collection,
            "word_count": word_count,
            "chunk_count": chunk_count,
            "message": "Document processed and stored for RAG retrieval"
        }

    except Exception as e:
        logger.error(f"Document upload failed: {e}")
        raise HTTPException(status_code=500, detail=f"Document upload failed: {str(e)}")


@router.post("/search")
async def semantic_search(
    request: SearchRequest,
    token: CapabilityToken = Depends(verify_capability)
) -> Dict[str, Any]:
    """Perform semantic search"""
    try:
        # In production, this would:
        # 1. Generate embedding for the query using embedding service
        # 2. Search ChromaDB collection for similar vectors
        # 3. Return ranked results with metadata
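        #
        # A rough sketch of that flow (illustrative only; `embedding_service` and
        # `chromadb_client` are assumed helpers, not part of this module):
        #
        #   query_embedding = await embedding_service.embed([request.query])
        #   collection = chromadb_client.get_or_create_collection(request.collection)
        #   hits = collection.query(query_embeddings=query_embedding, n_results=request.top_k)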

        # For now, simulate semantic search with keyword matching
        import time
        search_start = time.time()

        # Simulate query processing
        query_terms = request.query.lower().split()

        # Mock search results
        mock_results = [
            {
                "document_id": f"doc_result_{i}",
                "content": f"Sample content matching '{request.query}' - result {i+1}",
                "metadata": {
                    "source": f"document_{i+1}.txt",
                    "author": "System",
                    "created_at": "2025-01-01T00:00:00Z"
                },
                "similarity_score": 0.9 - (i * 0.1),
                "chunk_id": f"chunk_{i+1}"
            }
            for i in range(min(request.top_k, 3))  # Return up to 3 mock results
        ]

        search_time = time.time() - search_start

        logger.info(f"Semantic search completed: query='{request.query}', results={len(mock_results)}, time={search_time:.3f}s")

        return {
            "success": True,
            "query": request.query,
            "collection": request.collection,
            "results": mock_results,
            "total_results": len(mock_results),
            "search_time_ms": int(search_time * 1000),
            "tenant_id": token.tenant_id,
            "user_id": token.user_id
        }

    except Exception as e:
        logger.error(f"Semantic search failed: {e}")
        raise HTTPException(status_code=500, detail=f"Semantic search failed: {str(e)}")