Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
671 lines
24 KiB
Python
"""
|
|
RAG Service for GT 2.0 Tenant Backend
|
|
|
|
Orchestrates document processing, embedding generation, and vector storage
|
|
with perfect tenant isolation and zero downtime compliance.
|
|
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import aiofiles
|
|
import os
|
|
import json
|
|
import gc
|
|
from typing import Dict, Any, List, Optional, BinaryIO
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import hashlib
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import select, and_, or_
|
|
from sqlalchemy.orm import selectinload
|
|
|
|
from app.models.document import Document, RAGDataset, DatasetDocument, DocumentChunk
|
|
from app.core.database import get_db_session
|
|
from app.core.config import get_settings
|
|
from app.core.resource_client import ResourceClusterClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class RAGService:
    """
    Comprehensive RAG service with perfect tenant isolation.

    GT 2.0 Security Principles:
    - Perfect tenant isolation (all operations user-scoped)
    - Stateless document processing (no data persistence in Resource Cluster)
    - Encrypted vector storage per tenant
    - Zero downtime compliance (async operations)
    """

    def __init__(self, db: AsyncSession):
        self.db = db
        self.settings = get_settings()
        self.resource_client = ResourceClusterClient()

        # Tenant-specific directories
        self.upload_directory = Path(self.settings.upload_directory)
        self.temp_directory = Path(self.settings.temp_directory)

        # Ensure directories exist with secure permissions
        self._ensure_directories()

        logger.info("RAG service initialized with tenant isolation")

    def _ensure_directories(self):
        """Ensure required directories exist with secure permissions"""
        for directory in [self.upload_directory, self.temp_directory]:
            directory.mkdir(parents=True, exist_ok=True, mode=0o700)

    async def create_rag_dataset(
        self,
        user_id: str,
        dataset_name: str,
        description: Optional[str] = None,
        chunking_strategy: str = "hybrid",
        chunk_size: int = 512,
        chunk_overlap: int = 128,
        embedding_model: str = "BAAI/bge-m3"
    ) -> RAGDataset:
        """Create a new RAG dataset with tenant isolation"""
        try:
            # Check if dataset already exists for this user
            existing = await self.db.execute(
                select(RAGDataset).where(
                    and_(
                        RAGDataset.user_id == user_id,
                        RAGDataset.dataset_name == dataset_name
                    )
                )
            )
            if existing.scalar_one_or_none():
                raise ValueError(f"Dataset '{dataset_name}' already exists for user")

            # Create dataset
            dataset = RAGDataset(
                user_id=user_id,
                dataset_name=dataset_name,
                description=description,
                chunking_strategy=chunking_strategy,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                embedding_model=embedding_model
            )

            self.db.add(dataset)
            await self.db.commit()
            await self.db.refresh(dataset)

            logger.info(f"Created RAG dataset '{dataset_name}' for user {user_id}")
            return dataset

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to create RAG dataset: {e}")
            raise

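    # Illustrative usage sketch (not part of the service API): how an async
    # endpoint might create a dataset, assuming an AsyncSession `db` is already
    # available via dependency injection. The identifiers are placeholders.
    #
    #     service = RAGService(db)
    #     dataset = await service.create_rag_dataset(
    #         user_id="user-123",
    #         dataset_name="research-notes",
    #         description="Personal research corpus",
    #         chunk_size=512,
    #         chunk_overlap=128,
    #     )
    #     # Raises ValueError if the user already has a dataset with that name.
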
    async def upload_document(
        self,
        user_id: str,
        file_content: bytes,
        filename: str,
        file_type: str,
        dataset_id: Optional[str] = None
    ) -> Document:
        """Upload and store document with tenant isolation"""
        try:
            # Validate file
            file_extension = Path(filename).suffix.lower()
            if not file_extension:
                raise ValueError("File must have an extension")

            # Generate secure filename
            file_hash = hashlib.sha256(file_content).hexdigest()[:16]
            secure_filename = f"{user_id}_{file_hash}_{filename}"

            # Tenant-specific file path
            user_upload_dir = self.upload_directory / user_id
            user_upload_dir.mkdir(exist_ok=True, mode=0o700)

            file_path = user_upload_dir / secure_filename

            # Save file with secure permissions
            async with aiofiles.open(file_path, 'wb') as f:
                await f.write(file_content)

            # Set file permissions (owner read/write only)
            os.chmod(file_path, 0o600)

            # Create document record
            document = Document(
                filename=secure_filename,
                original_filename=filename,
                file_path=str(file_path),
                file_type=file_type,
                file_extension=file_extension,
                file_size_bytes=len(file_content),
                uploaded_by=user_id,
                processing_status="pending"
            )

            self.db.add(document)
            await self.db.commit()
            await self.db.refresh(document)

            # Add to dataset if specified
            if dataset_id:
                await self.add_document_to_dataset(user_id, document.id, dataset_id)

            # Clear file content from memory
            del file_content
            gc.collect()

            logger.info(f"Uploaded document '{filename}' for user {user_id}")
            return document

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to upload document: {e}")
            # Clear sensitive data even on error
            if 'file_content' in locals():
                del file_content
            gc.collect()
            raise

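    # Illustrative usage sketch: uploading raw bytes received from an HTTP
    # request. The filename, MIME type, and identifiers are hypothetical; pass
    # dataset_id=None to upload without attaching the document to a dataset.
    #
    #     raw_bytes = await upload.read()   # e.g. an UploadFile-style object
    #     document = await service.upload_document(
    #         user_id="user-123",
    #         file_content=raw_bytes,
    #         filename="quarterly-report.pdf",
    #         file_type="application/pdf",
    #         dataset_id=None,
    #     )
    #     # document.processing_status stays "pending" until process_document() runs.
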
    async def process_document(
        self,
        user_id: str,
        document_id: int,
        tenant_id: str,
        chunking_strategy: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process document into chunks and generate embeddings"""
        try:
            # Get document with ownership check
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Check if already processed
            if document.is_processing_complete():
                logger.info(f"Document {document_id} already processed")
                return {"status": "already_processed", "chunk_count": document.chunk_count}

            # Mark as processing
            document.mark_processing_started()
            await self.db.commit()

            # Read document file
            file_content = await self._read_document_file(document)

            # Process document using Resource Cluster (stateless)
            chunks = await self.resource_client.process_document(
                content=file_content,
                document_type=document.file_extension,
                strategy_type=chunking_strategy or "hybrid",
                tenant_id=tenant_id,
                user_id=user_id
            )

            # Clear file content from memory immediately
            del file_content
            gc.collect()

            if not chunks:
                raise ValueError("Document processing returned no chunks")

            # Generate embeddings for chunks (stateless)
            chunk_texts = [chunk["text"] for chunk in chunks]
            embeddings = await self.resource_client.generate_document_embeddings(
                documents=chunk_texts,
                tenant_id=tenant_id,
                user_id=user_id
            )

            if len(embeddings) != len(chunk_texts):
                raise ValueError("Embedding count mismatch with chunk count")

            # Store vectors in ChromaDB via Resource Cluster
            dataset_name = f"doc_{document.id}"
            collection_created = await self.resource_client.create_vector_collection(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name
            )

            if not collection_created:
                raise RuntimeError("Failed to create vector collection")

            # Store vectors with metadata
            chunk_metadata = [chunk["metadata"] for chunk in chunks]
            vector_stored = await self.resource_client.store_vectors(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name,
                documents=chunk_texts,
                embeddings=embeddings,
                metadata=chunk_metadata
            )

            if not vector_stored:
                raise RuntimeError("Failed to store vectors")

            # Clear embedding data from memory
            del chunk_texts, embeddings
            gc.collect()

            # Update document record
            vector_store_ids = [f"{tenant_id}:{user_id}:{dataset_name}"]
            document.mark_processing_complete(
                chunk_count=len(chunks),
                vector_store_ids=vector_store_ids
            )

            await self.db.commit()

            logger.info(f"Processed document {document_id} into {len(chunks)} chunks")

            return {
                "status": "completed",
                "document_id": document_id,
                "chunk_count": len(chunks),
                "vector_store_ids": vector_store_ids
            }

        except Exception as e:
            # Mark document processing as failed
            if 'document' in locals() and document:
                document.mark_processing_failed({"error": str(e)})
                await self.db.commit()

            logger.error(f"Failed to process document {document_id}: {e}")
            # Ensure memory cleanup
            gc.collect()
            raise

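    # Shape assumptions behind process_document(), inferred from how its results
    # are consumed above (the Resource Cluster client is defined elsewhere, so
    # these are illustrative, not authoritative):
    #
    #     chunks = [
    #         {"text": "chunk text ...", "metadata": {...}},   # one dict per chunk
    #         ...
    #     ]
    #     embeddings = [[0.12, -0.03, ...], ...]               # one vector per chunk text
    #
    # A typical call, with hypothetical identifiers:
    #
    #     result = await service.process_document(
    #         user_id="user-123",
    #         document_id=document.id,
    #         tenant_id="tenant-abc",
    #     )
    #     assert result["status"] in ("completed", "already_processed")
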
    async def add_document_to_dataset(
        self,
        user_id: str,
        document_id: int,
        dataset_id: str
    ) -> DatasetDocument:
        """Add processed document to RAG dataset"""
        try:
            # Verify dataset ownership
            dataset = await self._get_user_dataset(user_id, dataset_id)
            if not dataset:
                raise PermissionError("Dataset not found or access denied")

            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Check if already in dataset
            existing = await self.db.execute(
                select(DatasetDocument).where(
                    and_(
                        DatasetDocument.dataset_id == dataset_id,
                        DatasetDocument.document_id == document_id
                    )
                )
            )
            if existing.scalar_one_or_none():
                raise ValueError("Document already in dataset")

            # Create dataset document relationship
            dataset_doc = DatasetDocument(
                dataset_id=dataset_id,
                document_id=document_id,
                user_id=user_id,
                chunk_count=document.chunk_count,
                vector_count=document.chunk_count  # Assuming 1 vector per chunk
            )

            self.db.add(dataset_doc)

            # Update dataset statistics
            dataset.document_count += 1
            dataset.chunk_count += document.chunk_count
            dataset.vector_count += document.chunk_count
            dataset.total_size_bytes += document.file_size_bytes

            await self.db.commit()
            await self.db.refresh(dataset_doc)

            logger.info(f"Added document {document_id} to dataset {dataset_id}")
            return dataset_doc

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to add document to dataset: {e}")
            raise

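    # Ordering note (sketch): add_document_to_dataset() copies chunk_count and
    # file_size_bytes from the Document row into the dataset statistics, so a
    # typical flow processes the document first. Identifiers are placeholders.
    #
    #     document = await service.upload_document(...)
    #     await service.process_document("user-123", document.id, "tenant-abc")
    #     await service.add_document_to_dataset("user-123", document.id, dataset.id)
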
    async def search_documents(
        self,
        user_id: str,
        tenant_id: str,
        query: str,
        dataset_ids: Optional[List[str]] = None,
        top_k: int = 5,
        similarity_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Search documents using RAG with tenant isolation"""
        try:
            # Generate query embedding
            query_embeddings = await self.resource_client.generate_query_embeddings(
                queries=[query],
                tenant_id=tenant_id,
                user_id=user_id
            )

            if not query_embeddings:
                raise ValueError("Failed to generate query embedding")

            query_embedding = query_embeddings[0]

            # Get user's datasets if not specified
            if not dataset_ids:
                datasets = await self.list_user_datasets(user_id)
                dataset_ids = [d.id for d in datasets]

            # Search across datasets
            all_results = []
            for dataset_id in dataset_ids:
                # Verify dataset ownership
                dataset = await self._get_user_dataset(user_id, dataset_id)
                if not dataset:
                    continue

                # Search in ChromaDB
                dataset_name = f"dataset_{dataset_id}"
                results = await self.resource_client.search_vectors(
                    tenant_id=tenant_id,
                    user_id=user_id,
                    dataset_name=dataset_name,
                    query_embedding=query_embedding,
                    top_k=top_k
                )

                # Filter by similarity threshold and add dataset context
                for result in results:
                    if result.get("similarity", 0) >= similarity_threshold:
                        result["dataset_id"] = dataset_id
                        result["dataset_name"] = dataset.dataset_name
                        all_results.append(result)

            # Sort by similarity and limit
            all_results.sort(key=lambda x: x.get("similarity", 0), reverse=True)
            final_results = all_results[:top_k]

            # Clear query embedding from memory
            del query_embedding, query_embeddings
            gc.collect()

            logger.info(f"Search found {len(final_results)} results for user {user_id}")
            return final_results

        except Exception as e:
            logger.error(f"Failed to search documents: {e}")
            gc.collect()
            raise

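    # Illustrative usage sketch: querying across all of a user's datasets. The
    # per-result keys shown ("similarity", "document") follow how results are
    # consumed in this file; the exact payload comes from the Resource Cluster.
    #
    #     hits = await service.search_documents(
    #         user_id="user-123",
    #         tenant_id="tenant-abc",
    #         query="What were the Q3 revenue drivers?",
    #         top_k=5,
    #         similarity_threshold=0.7,
    #     )
    #     for hit in hits:
    #         print(hit["dataset_name"], hit.get("similarity"), hit.get("document"))
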
    async def get_document_context(
        self,
        user_id: str,
        tenant_id: str,
        document_id: int,
        query: str,
        context_size: int = 3
    ) -> Dict[str, Any]:
        """Get relevant context from a specific document"""
        try:
            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            if not document.is_processing_complete():
                raise ValueError("Document not yet processed")

            # Generate query embedding
            query_embeddings = await self.resource_client.generate_query_embeddings(
                queries=[query],
                tenant_id=tenant_id,
                user_id=user_id
            )

            query_embedding = query_embeddings[0]

            # Search document's vectors
            dataset_name = f"doc_{document_id}"
            results = await self.resource_client.search_vectors(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name,
                query_embedding=query_embedding,
                top_k=context_size
            )

            context = {
                "document_id": document_id,
                "document_name": document.original_filename,
                "query": query,
                "relevant_chunks": results,
                "context_text": "\n\n".join([r["document"] for r in results])
            }

            # Clear query embedding from memory
            del query_embedding, query_embeddings
            gc.collect()

            return context

        except Exception as e:
            logger.error(f"Failed to get document context: {e}")
            gc.collect()
            raise

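    # Illustrative usage sketch: pulling context from a single document to build
    # an LLM prompt. The prompt template is a hypothetical example, not part of
    # this service.
    #
    #     context = await service.get_document_context(
    #         user_id="user-123",
    #         tenant_id="tenant-abc",
    #         document_id=document.id,
    #         query="Summarize the termination clause",
    #         context_size=3,
    #     )
    #     prompt = f"Context:\n{context['context_text']}\n\nQuestion: {context['query']}"
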
    async def list_user_documents(
        self,
        user_id: str,
        status_filter: Optional[str] = None,
        offset: int = 0,
        limit: int = 50
    ) -> List[Document]:
        """List user's documents with optional filtering"""
        try:
            query = select(Document).where(Document.uploaded_by == user_id)

            if status_filter:
                query = query.where(Document.processing_status == status_filter)

            query = query.order_by(Document.created_at.desc())
            query = query.offset(offset).limit(limit)

            result = await self.db.execute(query)
            documents = result.scalars().all()

            return list(documents)

        except Exception as e:
            logger.error(f"Failed to list user documents: {e}")
            raise

    async def list_user_datasets(
        self,
        user_id: str,
        include_stats: bool = True
    ) -> List[RAGDataset]:
        """List user's RAG datasets"""
        try:
            query = select(RAGDataset).where(RAGDataset.user_id == user_id)

            if include_stats:
                query = query.options(selectinload(RAGDataset.documents))

            query = query.order_by(RAGDataset.created_at.desc())

            result = await self.db.execute(query)
            datasets = result.scalars().all()

            return list(datasets)

        except Exception as e:
            logger.error(f"Failed to list user datasets: {e}")
            raise

    async def delete_document(
        self,
        user_id: str,
        tenant_id: str,
        document_id: int
    ) -> bool:
        """Delete document and associated vectors"""
        try:
            # Verify document ownership
            document = await self._get_user_document(user_id, document_id)
            if not document:
                raise PermissionError("Document not found or access denied")

            # Delete vectors from ChromaDB if processed
            if document.is_processing_complete():
                dataset_name = f"doc_{document_id}"
                await self.resource_client.delete_vector_collection(
                    tenant_id=tenant_id,
                    user_id=user_id,
                    dataset_name=dataset_name
                )

            # Delete physical file
            if document.file_exists():
                os.remove(document.get_absolute_file_path())

            # Delete from database (cascade will handle related records)
            await self.db.delete(document)
            await self.db.commit()

            logger.info(f"Deleted document {document_id} for user {user_id}")
            return True

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to delete document: {e}")
            raise

    async def delete_dataset(
        self,
        user_id: str,
        tenant_id: str,
        dataset_id: str
    ) -> bool:
        """Delete RAG dataset and associated vectors"""
        try:
            # Verify dataset ownership
            dataset = await self._get_user_dataset(user_id, dataset_id)
            if not dataset:
                raise PermissionError("Dataset not found or access denied")

            # Delete vectors from ChromaDB
            dataset_name = f"dataset_{dataset_id}"
            await self.resource_client.delete_vector_collection(
                tenant_id=tenant_id,
                user_id=user_id,
                dataset_name=dataset_name
            )

            # Delete from database (cascade will handle related records)
            await self.db.delete(dataset)
            await self.db.commit()

            logger.info(f"Deleted dataset {dataset_id} for user {user_id}")
            return True

        except Exception as e:
            await self.db.rollback()
            logger.error(f"Failed to delete dataset: {e}")
            raise

    async def get_rag_statistics(
        self,
        user_id: str
    ) -> Dict[str, Any]:
        """Get RAG usage statistics for user"""
        try:
            # Document statistics
            doc_query = select(Document).where(Document.uploaded_by == user_id)
            doc_result = await self.db.execute(doc_query)
            documents = doc_result.scalars().all()

            # Dataset statistics
            dataset_query = select(RAGDataset).where(RAGDataset.user_id == user_id)
            dataset_result = await self.db.execute(dataset_query)
            datasets = dataset_result.scalars().all()

            total_size = sum(doc.file_size_bytes for doc in documents)
            total_chunks = sum(doc.chunk_count for doc in documents)

            stats = {
                "user_id": user_id,
                "document_count": len(documents),
                "dataset_count": len(datasets),
                "total_size_bytes": total_size,
                "total_size_mb": round(total_size / (1024 * 1024), 2),
                "total_chunks": total_chunks,
                "processed_documents": len([d for d in documents if d.is_processing_complete()]),
                "pending_documents": len([d for d in documents if d.is_pending_processing()]),
                "failed_documents": len([d for d in documents if d.is_processing_failed()])
            }

            return stats

        except Exception as e:
            logger.error(f"Failed to get RAG statistics: {e}")
            raise

    # Private helper methods

    async def _get_user_document(self, user_id: str, document_id: int) -> Optional[Document]:
        """Get document with ownership verification"""
        result = await self.db.execute(
            select(Document).where(
                and_(
                    Document.id == document_id,
                    Document.uploaded_by == user_id
                )
            )
        )
        return result.scalar_one_or_none()

    async def _get_user_dataset(self, user_id: str, dataset_id: str) -> Optional[RAGDataset]:
        """Get dataset with ownership verification"""
        result = await self.db.execute(
            select(RAGDataset).where(
                and_(
                    RAGDataset.id == dataset_id,
                    RAGDataset.user_id == user_id
                )
            )
        )
        return result.scalar_one_or_none()

    async def _read_document_file(self, document: Document) -> bytes:
        """Read document file content"""
        file_path = document.get_absolute_file_path()
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Document file not found: {file_path}")

        async with aiofiles.open(file_path, 'rb') as f:
            content = await f.read()

        return content


# Factory function for dependency injection
async def get_rag_service(db: Optional[AsyncSession] = None) -> RAGService:
    """Get RAG service instance"""
    if db is None:
        async with get_db_session() as session:
            return RAGService(session)
    return RAGService(db)
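

# Illustrative end-to-end sketch (assumptions: run from an async context, the
# "user-123"/"tenant-abc" identifiers and "report.pdf" are placeholders, and a
# tenant database is reachable via get_db_session()):
#
#     async def example_flow():
#         service = await get_rag_service()
#         dataset = await service.create_rag_dataset("user-123", "research-notes")
#         with open("report.pdf", "rb") as fh:
#             document = await service.upload_document(
#                 user_id="user-123",
#                 file_content=fh.read(),
#                 filename="report.pdf",
#                 file_type="application/pdf",
#             )
#         await service.process_document("user-123", document.id, "tenant-abc")
#         await service.add_document_to_dataset("user-123", document.id, dataset.id)
#         return await service.search_documents("user-123", "tenant-abc", "key findings")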