Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
671 lines
24 KiB
Python
"""
|
|
RAG Service for GT 2.0 Tenant Backend
|
|
|
|
Orchestrates document processing, embedding generation, and vector storage
|
|
with perfect tenant isolation and zero downtime compliance.
|
|
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import aiofiles
|
|
import os
|
|
import json
|
|
import gc
|
|
from typing import Dict, Any, List, Optional, BinaryIO
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import hashlib
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import select, and_, or_
|
|
from sqlalchemy.orm import selectinload
|
|
|
|
from app.models.document import Document, RAGDataset, DatasetDocument, DocumentChunk
|
|
from app.core.database import get_db_session
|
|
from app.core.config import get_settings
|
|
from app.core.resource_client import ResourceClusterClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class RAGService:
    """
    Comprehensive RAG service with perfect tenant isolation.

    GT 2.0 Security Principles:
    - Perfect tenant isolation (all operations user-scoped)
    - Stateless document processing (no data persistence in Resource Cluster)
    - Encrypted vector storage per tenant
    - Zero downtime compliance (async operations)
    """

    def __init__(self, db: AsyncSession):
        self.db = db
        self.settings = get_settings()
        self.resource_client = ResourceClusterClient()

        # Tenant-specific directories
        self.upload_directory = Path(self.settings.upload_directory)
        self.temp_directory = Path(self.settings.temp_directory)

        # Ensure directories exist with secure permissions
        self._ensure_directories()

        logger.info("RAG service initialized with tenant isolation")

    def _ensure_directories(self):
        """Ensure required directories exist with secure permissions"""
        for directory in [self.upload_directory, self.temp_directory]:
            directory.mkdir(parents=True, exist_ok=True, mode=0o700)

    async def create_rag_dataset(
        self,
        user_id: str,
        dataset_name: str,
        description: Optional[str] = None,
        chunking_strategy: str = "hybrid",
        chunk_size: int = 512,
        chunk_overlap: int = 128,
        embedding_model: str = "BAAI/bge-m3"
    ) -> RAGDataset:
        """Create a new RAG dataset with tenant isolation"""
        try:
            # Check if dataset already exists for this user
            existing = await self.db.execute(
                select(RAGDataset).where(
                    and_(
                        RAGDataset.user_id == user_id,
                        RAGDataset.dataset_name == dataset_name
                    )
                )
            )
            if existing.scalar_one_or_none():
                raise ValueError(f"Dataset '{dataset_name}' already exists for user")

            # Create dataset
            dataset = RAGDataset(
                user_id=user_id,
                dataset_name=dataset_name,
                description=description,
                chunking_strategy=chunking_strategy,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                embedding_model=embedding_model
            )

            self.db.add(dataset)
            await self.db.commit()
            await self.db.refresh(dataset)

            logger.info(f"Created RAG dataset '{dataset_name}' for user {user_id}")
            return dataset

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to create RAG dataset: {e}")
            raise

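    # Illustrative usage sketch (not part of the service API): how an async
    # endpoint might create a dataset, assuming an AsyncSession `db` is already
    # available via dependency injection. The identifiers are placeholders.
    #
    #     service = RAGService(db)
    #     dataset = await service.create_rag_dataset(
    #         user_id="user-123",
    #         dataset_name="research-notes",
    #         description="Personal research corpus",
    #         chunk_size=512,
    #         chunk_overlap=128,
    #     )
    #     # Raises ValueError if the user already has a dataset with that name.
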
    async def upload_document(
        self,
        user_id: str,
        file_content: bytes,
        filename: str,
        file_type: str,
        dataset_id: Optional[str] = None
    ) -> Document:
        """Upload and store document with tenant isolation"""
        try:
            # Validate file
            file_extension = Path(filename).suffix.lower()
            if not file_extension:
                raise ValueError("File must have an extension")

            # Generate secure filename
            file_hash = hashlib.sha256(file_content).hexdigest()[:16]
            secure_filename = f"{user_id}_{file_hash}_{filename}"

            # Tenant-specific file path
            user_upload_dir = self.upload_directory / user_id
            user_upload_dir.mkdir(exist_ok=True, mode=0o700)

            file_path = user_upload_dir / secure_filename

            # Save file with secure permissions
            async with aiofiles.open(file_path, 'wb') as f:
                await f.write(file_content)

            # Set file permissions (owner read/write only)
            os.chmod(file_path, 0o600)

            # Create document record
            document = Document(
                filename=secure_filename,
                original_filename=filename,
                file_path=str(file_path),
                file_type=file_type,
                file_extension=file_extension,
                file_size_bytes=len(file_content),
                uploaded_by=user_id,
                processing_status="pending"
            )

            self.db.add(document)
            await self.db.commit()
            await self.db.refresh(document)

            # Add to dataset if specified
            if dataset_id:
                await self.add_document_to_dataset(user_id, document.id, dataset_id)

            # Clear file content from memory
            del file_content
            gc.collect()

            logger.info(f"Uploaded document '{filename}' for user {user_id}")
            return document

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to upload document: {e}")
            # Clear sensitive data even on error
            if 'file_content' in locals():
                del file_content
            gc.collect()
            raise

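    # Illustrative usage sketch: uploading raw bytes received from an HTTP
    # request. The filename, MIME type, and identifiers are hypothetical; pass
    # dataset_id=None to upload without attaching the document to a dataset.
    #
    #     raw_bytes = await upload.read()   # e.g. an UploadFile-style object
    #     document = await service.upload_document(
    #         user_id="user-123",
    #         file_content=raw_bytes,
    #         filename="quarterly-report.pdf",
    #         file_type="application/pdf",
    #         dataset_id=None,
    #     )
    #     # document.processing_status stays "pending" until process_document() runs.
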
    async def process_document(
        self,
        user_id: str,
        document_id: int,
        tenant_id: str,
        chunking_strategy: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process document into chunks and generate embeddings"""
        try:
            # Get document with ownership check
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Check if already processed
            if document.is_processing_complete():
                logger.info(f"Document {document_id} already processed")
                return {"status": "already_processed", "chunk_count": document.chunk_count}

            # Mark as processing
            document.mark_processing_started()
            await self.db.commit()

            # Read document file
            file_content = await self._read_document_file(document)

            # Process document using Resource Cluster (stateless)
            chunks = await self.resource_client.process_document(
                content=file_content,
                document_type=document.file_extension,
                strategy_type=chunking_strategy or "hybrid",
                tenant_id=tenant_id,
                user_id=user_id
            )

            # Clear file content from memory immediately
            del file_content
            gc.collect()

            if not chunks:
                raise ValueError("Document processing returned no chunks")

            # Generate embeddings for chunks (stateless)
            chunk_texts = [chunk["text"] for chunk in chunks]
            embeddings = await self.resource_client.generate_document_embeddings(
                documents=chunk_texts,
                tenant_id=tenant_id,
                user_id=user_id
            )

            if len(embeddings) != len(chunk_texts):
                raise ValueError("Embedding count mismatch with chunk count")

            # Store vectors in ChromaDB via Resource Cluster
            dataset_name = f"doc_{document.id}"
            collection_created = await self.resource_client.create_vector_collection(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name
            )

            if not collection_created:
                raise RuntimeError("Failed to create vector collection")

            # Store vectors with metadata
            chunk_metadata = [chunk["metadata"] for chunk in chunks]
            vector_stored = await self.resource_client.store_vectors(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name,
                documents=chunk_texts,
                embeddings=embeddings,
                metadata=chunk_metadata
            )

            if not vector_stored:
                raise RuntimeError("Failed to store vectors")

            # Clear embedding data from memory
            del chunk_texts, embeddings
            gc.collect()

            # Update document record
            vector_store_ids = [f"{tenant_id}:{user_id}:{dataset_name}"]
            document.mark_processing_complete(
                chunk_count=len(chunks),
                vector_store_ids=vector_store_ids
            )

            await self.db.commit()

            logger.info(f"Processed document {document_id} into {len(chunks)} chunks")

            return {
                "status": "completed",
                "document_id": document_id,
                "chunk_count": len(chunks),
                "vector_store_ids": vector_store_ids
            }

        except Exception as e:
            # Mark document processing as failed
            if 'document' in locals() and document:
                document.mark_processing_failed({"error": str(e)})
                await self.db.commit()

            logger.error(f"Failed to process document {document_id}: {e}")
            # Ensure memory cleanup
            gc.collect()
            raise

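    # Shape assumptions behind process_document(), inferred from how its results
    # are consumed above (the Resource Cluster client is defined elsewhere, so
    # these are illustrative, not authoritative):
    #
    #     chunks = [
    #         {"text": "chunk text ...", "metadata": {...}},   # one dict per chunk
    #         ...
    #     ]
    #     embeddings = [[0.12, -0.03, ...], ...]               # one vector per chunk text
    #
    # A typical call, with hypothetical identifiers:
    #
    #     result = await service.process_document(
    #         user_id="user-123",
    #         document_id=document.id,
    #         tenant_id="tenant-abc",
    #     )
    #     assert result["status"] in ("completed", "already_processed")
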
    async def add_document_to_dataset(
        self,
        user_id: str,
        document_id: int,
        dataset_id: str
    ) -> DatasetDocument:
        """Add processed document to RAG dataset"""
        try:
            # Verify dataset ownership
            dataset = await self._get_user_dataset(user_id, dataset_id)
            if not dataset:
                raise PermissionError("Dataset not found or access denied")

            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Check if already in dataset
            existing = await self.db.execute(
                select(DatasetDocument).where(
                    and_(
                        DatasetDocument.dataset_id == dataset_id,
                        DatasetDocument.document_id == document_id
                    )
                )
            )
            if existing.scalar_one_or_none():
                raise ValueError("Document already in dataset")

            # Create dataset document relationship
            dataset_doc = DatasetDocument(
                dataset_id=dataset_id,
                document_id=document_id,
                user_id=user_id,
                chunk_count=document.chunk_count,
                vector_count=document.chunk_count  # Assuming 1 vector per chunk
            )

            self.db.add(dataset_doc)

            # Update dataset statistics
            dataset.document_count += 1
            dataset.chunk_count += document.chunk_count
            dataset.vector_count += document.chunk_count
            dataset.total_size_bytes += document.file_size_bytes

            await self.db.commit()
            await self.db.refresh(dataset_doc)

            logger.info(f"Added document {document_id} to dataset {dataset_id}")
            return dataset_doc

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to add document to dataset: {e}")
            raise

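    # Ordering note (sketch): add_document_to_dataset() copies chunk_count and
    # file_size_bytes from the Document row into the dataset statistics, so a
    # typical flow processes the document first. Identifiers are placeholders.
    #
    #     document = await service.upload_document(...)
    #     await service.process_document("user-123", document.id, "tenant-abc")
    #     await service.add_document_to_dataset("user-123", document.id, dataset.id)
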
    async def search_documents(
        self,
        user_id: str,
        tenant_id: str,
        query: str,
        dataset_ids: Optional[List[str]] = None,
        top_k: int = 5,
        similarity_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Search documents using RAG with tenant isolation"""
        try:
            # Generate query embedding
            query_embeddings = await self.resource_client.generate_query_embeddings(
                queries=[query],
                tenant_id=tenant_id,
                user_id=user_id
            )

            if not query_embeddings:
                raise ValueError("Failed to generate query embedding")

            query_embedding = query_embeddings[0]

            # Get user's datasets if not specified
            if not dataset_ids:
                datasets = await self.list_user_datasets(user_id)
                dataset_ids = [d.id for d in datasets]

            # Search across datasets
            all_results = []
            for dataset_id in dataset_ids:
                # Verify dataset ownership
                dataset = await self._get_user_dataset(user_id, dataset_id)
                if not dataset:
                    continue

                # Search in ChromaDB
                dataset_name = f"dataset_{dataset_id}"
                results = await self.resource_client.search_vectors(
                    tenant_id=tenant_id,
                    user_id=user_id,
                    dataset_name=dataset_name,
                    query_embedding=query_embedding,
                    top_k=top_k
                )

                # Filter by similarity threshold and add dataset context
                for result in results:
                    if result.get("similarity", 0) >= similarity_threshold:
                        result["dataset_id"] = dataset_id
                        result["dataset_name"] = dataset.dataset_name
                        all_results.append(result)

            # Sort by similarity and limit
            all_results.sort(key=lambda x: x.get("similarity", 0), reverse=True)
            final_results = all_results[:top_k]

            # Clear query embedding from memory
            del query_embedding, query_embeddings
            gc.collect()

            logger.info(f"Search found {len(final_results)} results for user {user_id}")
            return final_results

        except Exception as e:
            logger.error(f"Failed to search documents: {e}")
            gc.collect()
            raise

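    # Illustrative usage sketch: querying across all of a user's datasets. The
    # per-result keys shown ("similarity", "document") follow how results are
    # consumed in this file; the exact payload comes from the Resource Cluster.
    #
    #     hits = await service.search_documents(
    #         user_id="user-123",
    #         tenant_id="tenant-abc",
    #         query="What were the Q3 revenue drivers?",
    #         top_k=5,
    #         similarity_threshold=0.7,
    #     )
    #     for hit in hits:
    #         print(hit["dataset_name"], hit.get("similarity"), hit.get("document"))
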
    async def get_document_context(
        self,
        user_id: str,
        tenant_id: str,
        document_id: int,
        query: str,
        context_size: int = 3
    ) -> Dict[str, Any]:
        """Get relevant context from a specific document"""
        try:
            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            if not document.is_processing_complete():
                raise ValueError("Document not yet processed")

            # Generate query embedding
            query_embeddings = await self.resource_client.generate_query_embeddings(
                queries=[query],
                tenant_id=tenant_id,
                user_id=user_id
            )

            query_embedding = query_embeddings[0]

            # Search document's vectors
            dataset_name = f"doc_{document_id}"
            results = await self.resource_client.search_vectors(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name,
                query_embedding=query_embedding,
                top_k=context_size
            )

            context = {
                "document_id": document_id,
                "document_name": document.original_filename,
                "query": query,
                "relevant_chunks": results,
                "context_text": "\n\n".join([r["document"] for r in results])
            }

            # Clear query embedding from memory
            del query_embedding, query_embeddings
            gc.collect()

            return context

        except Exception as e:
            logger.error(f"Failed to get document context: {e}")
            gc.collect()
            raise

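    # Illustrative usage sketch: pulling context from a single document to build
    # an LLM prompt. The prompt template is a hypothetical example, not part of
    # this service.
    #
    #     context = await service.get_document_context(
    #         user_id="user-123",
    #         tenant_id="tenant-abc",
    #         document_id=document.id,
    #         query="Summarize the termination clause",
    #         context_size=3,
    #     )
    #     prompt = f"Context:\n{context['context_text']}\n\nQuestion: {context['query']}"
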
    async def list_user_documents(
        self,
        user_id: str,
        status_filter: Optional[str] = None,
        offset: int = 0,
        limit: int = 50
    ) -> List[Document]:
        """List user's documents with optional filtering"""
        try:
            query = select(Document).where(Document.uploaded_by == user_id)

            if status_filter:
                query = query.where(Document.processing_status == status_filter)

            query = query.order_by(Document.created_at.desc())
            query = query.offset(offset).limit(limit)

            result = await self.db.execute(query)
            documents = result.scalars().all()

            return list(documents)

        except Exception as e:
            logger.error(f"Failed to list user documents: {e}")
            raise

    async def list_user_datasets(
        self,
        user_id: str,
        include_stats: bool = True
    ) -> List[RAGDataset]:
        """List user's RAG datasets"""
        try:
            query = select(RAGDataset).where(RAGDataset.user_id == user_id)

            if include_stats:
                query = query.options(selectinload(RAGDataset.documents))

            query = query.order_by(RAGDataset.created_at.desc())

            result = await self.db.execute(query)
            datasets = result.scalars().all()

            return list(datasets)

        except Exception as e:
            logger.error(f"Failed to list user datasets: {e}")
            raise

    async def delete_document(
        self,
        user_id: str,
        tenant_id: str,
        document_id: int
    ) -> bool:
        """Delete document and associated vectors"""
        try:
            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Delete vectors from ChromaDB if processed
            if document.is_processing_complete():
                dataset_name = f"doc_{document_id}"
                await self.resource_client.delete_vector_collection(
                    tenant_id=tenant_id,
                    user_id=user_id,
                    dataset_name=dataset_name
                )

            # Delete physical file
            if document.file_exists():
                os.remove(document.get_absolute_file_path())

            # Delete from database (cascade will handle related records)
            await self.db.delete(document)
            await self.db.commit()

            logger.info(f"Deleted document {document_id} for user {user_id}")
            return True

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to delete document: {e}")
            raise

    async def delete_dataset(
        self,
        user_id: str,
        tenant_id: str,
        dataset_id: str
    ) -> bool:
        """Delete RAG dataset and associated vectors"""
        try:
            # Verify dataset ownership
            dataset = await self._get_user_dataset(user_id, dataset_id)
            if not dataset:
                raise PermissionError("Dataset not found or access denied")

            # Delete vectors from ChromaDB
            dataset_name = f"dataset_{dataset_id}"
            await self.resource_client.delete_vector_collection(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name
            )

            # Delete from database (cascade will handle related records)
            await self.db.delete(dataset)
            await self.db.commit()

            logger.info(f"Deleted dataset {dataset_id} for user {user_id}")
            return True

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to delete dataset: {e}")
            raise

    async def get_rag_statistics(
        self,
        user_id: str
    ) -> Dict[str, Any]:
        """Get RAG usage statistics for user"""
        try:
            # Document statistics
            doc_query = select(Document).where(Document.uploaded_by == user_id)
            doc_result = await self.db.execute(doc_query)
            documents = doc_result.scalars().all()

            # Dataset statistics
            dataset_query = select(RAGDataset).where(RAGDataset.user_id == user_id)
            dataset_result = await self.db.execute(dataset_query)
            datasets = dataset_result.scalars().all()

            total_size = sum(doc.file_size_bytes for doc in documents)
            total_chunks = sum(doc.chunk_count for doc in documents)

            stats = {
                "user_id": user_id,
                "document_count": len(documents),
                "dataset_count": len(datasets),
                "total_size_bytes": total_size,
                "total_size_mb": round(total_size / (1024 * 1024), 2),
                "total_chunks": total_chunks,
                "processed_documents": len([d for d in documents if d.is_processing_complete()]),
                "pending_documents": len([d for d in documents if d.is_pending_processing()]),
                "failed_documents": len([d for d in documents if d.is_processing_failed()])
            }

            return stats

        except Exception as e:
            logger.error(f"Failed to get RAG statistics: {e}")
            raise

    # Private helper methods

    async def _get_user_document(self, user_id: str, document_id: int) -> Optional[Document]:
        """Get document with ownership verification"""
        result = await self.db.execute(
            select(Document).where(
                and_(
                    Document.id == document_id,
                    Document.uploaded_by == user_id
                )
            )
        )
        return result.scalar_one_or_none()

    async def _get_user_dataset(self, user_id: str, dataset_id: str) -> Optional[RAGDataset]:
        """Get dataset with ownership verification"""
        result = await self.db.execute(
            select(RAGDataset).where(
                and_(
                    RAGDataset.id == dataset_id,
                    RAGDataset.user_id == user_id
                )
            )
        )
        return result.scalar_one_or_none()

    async def _read_document_file(self, document: Document) -> bytes:
        """Read document file content"""
        file_path = document.get_absolute_file_path()
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Document file not found: {file_path}")

        async with aiofiles.open(file_path, 'rb') as f:
            content = await f.read()

        return content


# Factory function for dependency injection
async def get_rag_service(db: Optional[AsyncSession] = None) -> RAGService:
    """Get RAG service instance"""
    if db is None:
        async with get_db_session() as session:
            return RAGService(session)
    return RAGService(db)
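

# Illustrative end-to-end sketch (assumptions: run from an async context, the
# "user-123"/"tenant-abc" identifiers and "report.pdf" are placeholders, and a
# tenant database is reachable via get_db_session()):
#
#     async def example_flow():
#         service = await get_rag_service()
#         dataset = await service.create_rag_dataset("user-123", "research-notes")
#         with open("report.pdf", "rb") as fh:
#             document = await service.upload_document(
#                 user_id="user-123",
#                 file_content=fh.read(),
#                 filename="report.pdf",
#                 file_type="application/pdf",
#             )
#         await service.process_document("user-123", document.id, "tenant-abc")
#         await service.add_document_to_dataset("user-123", document.id, dataset.id)
#         return await service.search_documents("user-123", "tenant-abc", "key findings")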