gt-ai-os-community/apps/tenant-backend/app/services/rag_service.py

"""
RAG Service for GT 2.0 Tenant Backend
Orchestrates document processing, embedding generation, and vector storage
with perfect tenant isolation and zero downtime compliance.
"""
import gc
import hashlib
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import aiofiles
from sqlalchemy import and_, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload

from app.models.document import Document, RAGDataset, DatasetDocument, DocumentChunk
from app.core.database import get_db_session
from app.core.config import get_settings
from app.core.resource_client import ResourceClusterClient

logger = logging.getLogger(__name__)


class RAGService:
    """
    Comprehensive RAG service with perfect tenant isolation.

    GT 2.0 Security Principles:
    - Perfect tenant isolation (all operations user-scoped)
    - Stateless document processing (no data persistence in Resource Cluster)
    - Encrypted vector storage per tenant
    - Zero downtime compliance (async operations)
    """

    def __init__(self, db: AsyncSession):
        self.db = db
        self.settings = get_settings()
        self.resource_client = ResourceClusterClient()

        # Tenant-specific directories
        self.upload_directory = Path(self.settings.upload_directory)
        self.temp_directory = Path(self.settings.temp_directory)

        # Ensure directories exist with secure permissions
        self._ensure_directories()

        logger.info("RAG service initialized with tenant isolation")

    def _ensure_directories(self):
        """Ensure required directories exist with secure permissions"""
        for directory in [self.upload_directory, self.temp_directory]:
            directory.mkdir(parents=True, exist_ok=True, mode=0o700)

    async def create_rag_dataset(
        self,
        user_id: str,
        dataset_name: str,
        description: Optional[str] = None,
        chunking_strategy: str = "hybrid",
        chunk_size: int = 512,
        chunk_overlap: int = 128,
        embedding_model: str = "BAAI/bge-m3"
    ) -> RAGDataset:
        """Create a new RAG dataset with tenant isolation"""
        try:
            # Check if dataset already exists for this user
            existing = await self.db.execute(
                select(RAGDataset).where(
                    and_(
                        RAGDataset.user_id == user_id,
                        RAGDataset.dataset_name == dataset_name
                    )
                )
            )
            if existing.scalar_one_or_none():
                raise ValueError(f"Dataset '{dataset_name}' already exists for user")

            # Create dataset
            dataset = RAGDataset(
                user_id=user_id,
                dataset_name=dataset_name,
                description=description,
                chunking_strategy=chunking_strategy,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                embedding_model=embedding_model
            )
            self.db.add(dataset)
            await self.db.commit()
            await self.db.refresh(dataset)

            logger.info(f"Created RAG dataset '{dataset_name}' for user {user_id}")
            return dataset
        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to create RAG dataset: {e}")
            raise

    async def upload_document(
        self,
        user_id: str,
        file_content: bytes,
        filename: str,
        file_type: str,
        dataset_id: Optional[str] = None
    ) -> Document:
        """Upload and store document with tenant isolation"""
        try:
            # Validate file
            file_extension = Path(filename).suffix.lower()
            if not file_extension:
                raise ValueError("File must have an extension")

            # Generate secure filename
            file_hash = hashlib.sha256(file_content).hexdigest()[:16]
            secure_filename = f"{user_id}_{file_hash}_{filename}"

            # Tenant-specific file path
            user_upload_dir = self.upload_directory / user_id
            user_upload_dir.mkdir(exist_ok=True, mode=0o700)
            file_path = user_upload_dir / secure_filename

            # Save file with secure permissions
            async with aiofiles.open(file_path, 'wb') as f:
                await f.write(file_content)

            # Set file permissions (owner read/write only)
            os.chmod(file_path, 0o600)

            # Create document record
            document = Document(
                filename=secure_filename,
                original_filename=filename,
                file_path=str(file_path),
                file_type=file_type,
                file_extension=file_extension,
                file_size_bytes=len(file_content),
                uploaded_by=user_id,
                processing_status="pending"
            )
            self.db.add(document)
            await self.db.commit()
            await self.db.refresh(document)

            # Add to dataset if specified
            if dataset_id:
                await self.add_document_to_dataset(user_id, document.id, dataset_id)

            # Clear file content from memory
            del file_content
            gc.collect()

            logger.info(f"Uploaded document '{filename}' for user {user_id}")
            return document
        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to upload document: {e}")
            # Clear sensitive data even on error
            if 'file_content' in locals():
                del file_content
            gc.collect()
            raise

    async def process_document(
        self,
        user_id: str,
        document_id: int,
        tenant_id: str,
        chunking_strategy: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process document into chunks and generate embeddings"""
        try:
            # Get document with ownership check
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Check if already processed
            if document.is_processing_complete():
                logger.info(f"Document {document_id} already processed")
                return {"status": "already_processed", "chunk_count": document.chunk_count}

            # Mark as processing
            document.mark_processing_started()
            await self.db.commit()

            # Read document file
            file_content = await self._read_document_file(document)

            # Process document using Resource Cluster (stateless)
            chunks = await self.resource_client.process_document(
                content=file_content,
                document_type=document.file_extension,
                strategy_type=chunking_strategy or "hybrid",
                tenant_id=tenant_id,
                user_id=user_id
            )

            # Clear file content from memory immediately
            del file_content
            gc.collect()

            if not chunks:
                raise ValueError("Document processing returned no chunks")

            # Generate embeddings for chunks (stateless)
            chunk_texts = [chunk["text"] for chunk in chunks]
            embeddings = await self.resource_client.generate_document_embeddings(
                documents=chunk_texts,
                tenant_id=tenant_id,
                user_id=user_id
            )
            if len(embeddings) != len(chunk_texts):
                raise ValueError("Embedding count mismatch with chunk count")

            # Store vectors in ChromaDB via Resource Cluster
            dataset_name = f"doc_{document.id}"
            collection_created = await self.resource_client.create_vector_collection(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name
            )
            if not collection_created:
                raise RuntimeError("Failed to create vector collection")

            # Store vectors with metadata
            chunk_metadata = [chunk["metadata"] for chunk in chunks]
            vector_stored = await self.resource_client.store_vectors(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name,
                documents=chunk_texts,
                embeddings=embeddings,
                metadata=chunk_metadata
            )
            if not vector_stored:
                raise RuntimeError("Failed to store vectors")

            # Clear embedding data from memory
            del chunk_texts, embeddings
            gc.collect()

            # Update document record
            vector_store_ids = [f"{tenant_id}:{user_id}:{dataset_name}"]
            document.mark_processing_complete(
                chunk_count=len(chunks),
                vector_store_ids=vector_store_ids
            )
            await self.db.commit()

            logger.info(f"Processed document {document_id} into {len(chunks)} chunks")
            return {
                "status": "completed",
                "document_id": document_id,
                "chunk_count": len(chunks),
                "vector_store_ids": vector_store_ids
            }
        except Exception as e:
            # Mark document processing as failed
            if 'document' in locals() and document:
                document.mark_processing_failed({"error": str(e)})
                await self.db.commit()
            logger.error(f"Failed to process document {document_id}: {e}")
            # Ensure memory cleanup
            gc.collect()
            raise
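
    # Note on collection naming: process_document() stores a document's vectors
    # in a per-document collection named "doc_<document_id>"; get_document_context()
    # and delete_document() below use that same convention, while dataset-level
    # search and deletion use collections named "dataset_<dataset_id>".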

    async def add_document_to_dataset(
        self,
        user_id: str,
        document_id: int,
        dataset_id: str
    ) -> DatasetDocument:
        """Add processed document to RAG dataset"""
        try:
            # Verify dataset ownership
            dataset = await self._get_user_dataset(user_id, dataset_id)
            if not dataset:
                raise PermissionError("Dataset not found or access denied")

            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Check if already in dataset
            existing = await self.db.execute(
                select(DatasetDocument).where(
                    and_(
                        DatasetDocument.dataset_id == dataset_id,
                        DatasetDocument.document_id == document_id
                    )
                )
            )
            if existing.scalar_one_or_none():
                raise ValueError("Document already in dataset")

            # Create dataset document relationship
            dataset_doc = DatasetDocument(
                dataset_id=dataset_id,
                document_id=document_id,
                user_id=user_id,
                chunk_count=document.chunk_count,
                vector_count=document.chunk_count  # Assuming 1 vector per chunk
            )
            self.db.add(dataset_doc)

            # Update dataset statistics
            dataset.document_count += 1
            dataset.chunk_count += document.chunk_count
            dataset.vector_count += document.chunk_count
            dataset.total_size_bytes += document.file_size_bytes

            await self.db.commit()
            await self.db.refresh(dataset_doc)

            logger.info(f"Added document {document_id} to dataset {dataset_id}")
            return dataset_doc
        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to add document to dataset: {e}")
            raise

    async def search_documents(
        self,
        user_id: str,
        tenant_id: str,
        query: str,
        dataset_ids: Optional[List[str]] = None,
        top_k: int = 5,
        similarity_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Search documents using RAG with tenant isolation"""
        try:
            # Generate query embedding
            query_embeddings = await self.resource_client.generate_query_embeddings(
                queries=[query],
                tenant_id=tenant_id,
                user_id=user_id
            )
            if not query_embeddings:
                raise ValueError("Failed to generate query embedding")
            query_embedding = query_embeddings[0]

            # Get user's datasets if not specified
            if not dataset_ids:
                datasets = await self.list_user_datasets(user_id)
                dataset_ids = [d.id for d in datasets]

            # Search across datasets
            all_results = []
            for dataset_id in dataset_ids:
                # Verify dataset ownership
                dataset = await self._get_user_dataset(user_id, dataset_id)
                if not dataset:
                    continue

                # Search in ChromaDB
                dataset_name = f"dataset_{dataset_id}"
                results = await self.resource_client.search_vectors(
                    tenant_id=tenant_id,
                    user_id=user_id,
                    dataset_name=dataset_name,
                    query_embedding=query_embedding,
                    top_k=top_k
                )

                # Filter by similarity threshold and add dataset context
                for result in results:
                    if result.get("similarity", 0) >= similarity_threshold:
                        result["dataset_id"] = dataset_id
                        result["dataset_name"] = dataset.dataset_name
                        all_results.append(result)

            # Sort by similarity and limit
            all_results.sort(key=lambda x: x.get("similarity", 0), reverse=True)
            final_results = all_results[:top_k]

            # Clear query embedding from memory
            del query_embedding, query_embeddings
            gc.collect()

            logger.info(f"Search found {len(final_results)} results for user {user_id}")
            return final_results
        except Exception as e:
            logger.error(f"Failed to search documents: {e}")
            gc.collect()
            raise

    async def get_document_context(
        self,
        user_id: str,
        tenant_id: str,
        document_id: int,
        query: str,
        context_size: int = 3
    ) -> Dict[str, Any]:
        """Get relevant context from a specific document"""
        try:
            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")
            if not document.is_processing_complete():
                raise ValueError("Document not yet processed")

            # Generate query embedding
            query_embeddings = await self.resource_client.generate_query_embeddings(
                queries=[query],
                tenant_id=tenant_id,
                user_id=user_id
            )
            if not query_embeddings:
                raise ValueError("Failed to generate query embedding")
            query_embedding = query_embeddings[0]

            # Search document's vectors
            dataset_name = f"doc_{document_id}"
            results = await self.resource_client.search_vectors(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name,
                query_embedding=query_embedding,
                top_k=context_size
            )

            context = {
                "document_id": document_id,
                "document_name": document.original_filename,
                "query": query,
                "relevant_chunks": results,
                "context_text": "\n\n".join([r["document"] for r in results])
            }

            # Clear query embedding from memory
            del query_embedding, query_embeddings
            gc.collect()

            return context
        except Exception as e:
            logger.error(f"Failed to get document context: {e}")
            gc.collect()
            raise

    async def list_user_documents(
        self,
        user_id: str,
        status_filter: Optional[str] = None,
        offset: int = 0,
        limit: int = 50
    ) -> List[Document]:
        """List user's documents with optional filtering"""
        try:
            query = select(Document).where(Document.uploaded_by == user_id)
            if status_filter:
                query = query.where(Document.processing_status == status_filter)
            query = query.order_by(Document.created_at.desc())
            query = query.offset(offset).limit(limit)

            result = await self.db.execute(query)
            documents = result.scalars().all()
            return list(documents)
        except Exception as e:
            logger.error(f"Failed to list user documents: {e}")
            raise

    async def list_user_datasets(
        self,
        user_id: str,
        include_stats: bool = True
    ) -> List[RAGDataset]:
        """List user's RAG datasets"""
        try:
            query = select(RAGDataset).where(RAGDataset.user_id == user_id)
            if include_stats:
                query = query.options(selectinload(RAGDataset.documents))
            query = query.order_by(RAGDataset.created_at.desc())

            result = await self.db.execute(query)
            datasets = result.scalars().all()
            return list(datasets)
        except Exception as e:
            logger.error(f"Failed to list user datasets: {e}")
            raise

    async def delete_document(
        self,
        user_id: str,
        tenant_id: str,
        document_id: int
    ) -> bool:
        """Delete document and associated vectors"""
        try:
            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Delete vectors from ChromaDB if processed
            if document.is_processing_complete():
                dataset_name = f"doc_{document_id}"
                await self.resource_client.delete_vector_collection(
                    tenant_id=tenant_id,
                    user_id=user_id,
                    dataset_name=dataset_name
                )

            # Delete physical file
            if document.file_exists():
                os.remove(document.get_absolute_file_path())

            # Delete from database (cascade will handle related records)
            await self.db.delete(document)
            await self.db.commit()

            logger.info(f"Deleted document {document_id} for user {user_id}")
            return True
        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to delete document: {e}")
            raise

    async def delete_dataset(
        self,
        user_id: str,
        tenant_id: str,
        dataset_id: str
    ) -> bool:
        """Delete RAG dataset and associated vectors"""
        try:
            # Verify dataset ownership
            dataset = await self._get_user_dataset(user_id, dataset_id)
            if not dataset:
                raise PermissionError("Dataset not found or access denied")

            # Delete vectors from ChromaDB
            dataset_name = f"dataset_{dataset_id}"
            await self.resource_client.delete_vector_collection(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name
            )

            # Delete from database (cascade will handle related records)
            await self.db.delete(dataset)
            await self.db.commit()

            logger.info(f"Deleted dataset {dataset_id} for user {user_id}")
            return True
        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to delete dataset: {e}")
            raise

    async def get_rag_statistics(
        self,
        user_id: str
    ) -> Dict[str, Any]:
        """Get RAG usage statistics for user"""
        try:
            # Document statistics
            doc_query = select(Document).where(Document.uploaded_by == user_id)
            doc_result = await self.db.execute(doc_query)
            documents = doc_result.scalars().all()

            # Dataset statistics
            dataset_query = select(RAGDataset).where(RAGDataset.user_id == user_id)
            dataset_result = await self.db.execute(dataset_query)
            datasets = dataset_result.scalars().all()

            total_size = sum(doc.file_size_bytes for doc in documents)
            total_chunks = sum(doc.chunk_count for doc in documents)

            stats = {
                "user_id": user_id,
                "document_count": len(documents),
                "dataset_count": len(datasets),
                "total_size_bytes": total_size,
                "total_size_mb": round(total_size / (1024 * 1024), 2),
                "total_chunks": total_chunks,
                "processed_documents": len([d for d in documents if d.is_processing_complete()]),
                "pending_documents": len([d for d in documents if d.is_pending_processing()]),
                "failed_documents": len([d for d in documents if d.is_processing_failed()])
            }
            return stats
        except Exception as e:
            logger.error(f"Failed to get RAG statistics: {e}")
            raise

    # Private helper methods

    async def _get_user_document(self, user_id: str, document_id: int) -> Optional[Document]:
        """Get document with ownership verification"""
        result = await self.db.execute(
            select(Document).where(
                and_(
                    Document.id == document_id,
                    Document.uploaded_by == user_id
                )
            )
        )
        return result.scalar_one_or_none()

    async def _get_user_dataset(self, user_id: str, dataset_id: str) -> Optional[RAGDataset]:
        """Get dataset with ownership verification"""
        result = await self.db.execute(
            select(RAGDataset).where(
                and_(
                    RAGDataset.id == dataset_id,
                    RAGDataset.user_id == user_id
                )
            )
        )
        return result.scalar_one_or_none()

    async def _read_document_file(self, document: Document) -> bytes:
        """Read document file content"""
        file_path = document.get_absolute_file_path()
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Document file not found: {file_path}")

        async with aiofiles.open(file_path, 'rb') as f:
            content = await f.read()
        return content


# Factory function for dependency injection
async def get_rag_service(db: Optional[AsyncSession] = None) -> RAGService:
    """Get RAG service instance"""
    if db is None:
        # Note: the session opened here is closed when the context manager exits,
        # so callers should normally provide their own session via dependency
        # injection rather than relying on this fallback.
        async with get_db_session() as session:
            return RAGService(session)
    return RAGService(db)
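

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; not part of the service). This assumes a
# caller that already holds an AsyncSession from the app's dependency
# injection and valid tenant/user identifiers; the literal IDs, filename, and
# query below are placeholders, not real values.
#
#   async def example(db: AsyncSession) -> None:
#       rag = await get_rag_service(db)
#       document = await rag.upload_document(
#           user_id="user-123",
#           file_content=b"%PDF-1.7 ...",
#           filename="handbook.pdf",
#           file_type="application/pdf",
#       )
#       await rag.process_document(
#           user_id="user-123",
#           document_id=document.id,
#           tenant_id="tenant-abc",
#       )
#       context = await rag.get_document_context(
#           user_id="user-123",
#           tenant_id="tenant-abc",
#           document_id=document.id,
#           query="What does the handbook say about refunds?",
#       )
# ---------------------------------------------------------------------------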