GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts: - Fix stack trace exposure in error responses - Add SSRF protection with DNS resolution checking - Implement proper URL hostname validation (replaces substring matching) - Add centralized path sanitization to prevent path traversal - Fix ReDoS vulnerability in email validation regex - Improve HTML sanitization in validation utilities - Fix capability wildcard matching in auth utilities - Update glob dependency to address CVE - Add CodeQL suppression comments for verified false positives 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00
commit b9dfb86260
746 changed files with 232071 additions and 0 deletions
--- a/apps/resource-cluster/app/api/rag.py
+++ b/apps/resource-cluster/app/api/rag.py
@@ -0,0 +1,145 @@
+"""
+RAG (Retrieval-Augmented Generation) API endpoints
+"""
+
+from fastapi import APIRouter, HTTPException, Depends
+from typing import Dict, Any, List
+from pydantic import BaseModel, Field
+import logging
+
+from app.core.security import capability_validator, CapabilityToken
+from app.api.auth import verify_capability
+
+router = APIRouter()
+logger = logging.getLogger(__name__)
+
+
+class DocumentUploadRequest(BaseModel):
+    """Document upload request"""
+    content: str = Field(..., description="Document content")
+    metadata: Dict[str, Any] = Field(default={}, description="Document metadata")
+    collection: str = Field(default="default", description="Collection name")
+
+
+class SearchRequest(BaseModel):
+    """Semantic search request"""
+    query: str = Field(..., description="Search query")
+    collection: str = Field(default="default", description="Collection to search")
+    top_k: int = Field(default=5, ge=1, le=100, description="Number of results")
+
+
+@router.post("/upload")
+async def upload_document(
+    request: DocumentUploadRequest,
+    token: CapabilityToken = Depends(verify_capability)
+) -> Dict[str, Any]:
+    """Upload document for RAG processing"""
+    
+    try:
+        import uuid
+        import hashlib
+        
+        # Generate document ID
+        doc_id = f"doc_{uuid.uuid4().hex[:8]}"
+        
+        # Create content hash for deduplication
+        content_hash = hashlib.sha256(request.content.encode()).hexdigest()[:16]
+        
+        # Process the document content
+        # In production, this would:
+        # 1. Split document into chunks
+        # 2. Generate embeddings using the embedding service
+        # 3. Store in ChromaDB collection
+        
+        # For now, simulate document processing
+        word_count = len(request.content.split())
+        chunk_count = max(1, word_count // 200)  # Simulate ~200 words per chunk
+        
+        # Store metadata with content
+        document_data = {
+            "document_id": doc_id,
+            "content_hash": content_hash,
+            "content": request.content,
+            "metadata": request.metadata,
+            "collection": request.collection,
+            "tenant_id": token.tenant_id,
+            "user_id": token.user_id,
+            "word_count": word_count,
+            "chunk_count": chunk_count
+        }
+        
+        # In production: Store in ChromaDB
+        # collection = chromadb_client.get_or_create_collection(request.collection)
+        # collection.add(documents=[request.content], ids=[doc_id], metadatas=[request.metadata])
+        
+        logger.info(f"Document uploaded: {doc_id} ({word_count} words, {chunk_count} chunks)")
+        
+        return {
+            "success": True,
+            "document_id": doc_id,
+            "content_hash": content_hash,
+            "collection": request.collection,
+            "word_count": word_count,
+            "chunk_count": chunk_count,
+            "message": "Document processed and stored for RAG retrieval"
+        }
+        
+    except Exception as e:
+        logger.error(f"Document upload failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Document upload failed: {str(e)}")
+
+
+@router.post("/search")
+async def semantic_search(
+    request: SearchRequest,
+    token: CapabilityToken = Depends(verify_capability)
+) -> Dict[str, Any]:
+    """Perform semantic search"""
+    
+    try:
+        # In production, this would:
+        # 1. Generate embedding for the query using embedding service
+        # 2. Search ChromaDB collection for similar vectors
+        # 3. Return ranked results with metadata
+        
+        # For now, simulate semantic search with keyword matching
+        import time
+        search_start = time.time()
+        
+        # Simulate query processing
+        query_terms = request.query.lower().split()
+        
+        # Mock search results
+        mock_results = [
+            {
+                "document_id": f"doc_result_{i}",
+                "content": f"Sample content matching '{request.query}' - result {i+1}",
+                "metadata": {
+                    "source": f"document_{i+1}.txt",
+                    "author": "System",
+                    "created_at": "2025-01-01T00:00:00Z"
+                },
+                "similarity_score": 0.9 - (i * 0.1),
+                "chunk_id": f"chunk_{i+1}"
+            }
+            for i in range(min(request.top_k, 3))  # Return up to 3 mock results
+        ]
+        
+        search_time = time.time() - search_start
+        
+        logger.info(f"Semantic search completed: query='{request.query}', results={len(mock_results)}, time={search_time:.3f}s")
+        
+        return {
+            "success": True,
+            "query": request.query,
+            "collection": request.collection,
+            "results": mock_results,
+            "total_results": len(mock_results),
+            "search_time_ms": int(search_time * 1000),
+            "tenant_id": token.tenant_id,
+            "user_id": token.user_id
+        }
+        
+    except Exception as e:
+        logger.error(f"Semantic search failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Semantic search failed: {str(e)}")