GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (see the sketch after this list)
- Implement exact URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
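
The DNS-resolution check and exact-hostname matching could look roughly like the following minimal sketch (a hand-written illustration, not the code shipped in this release; the ALLOWED_HOSTS allowlist and is_safe_url helper are hypothetical names):

import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allowlist


def is_safe_url(url: str) -> bool:
    """Allow only exact allowlisted hostnames that resolve to public addresses."""
    hostname = urlparse(url).hostname
    # Exact hostname comparison, not substring matching
    if hostname is None or hostname not in ALLOWED_HOSTS:
        return False
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for info in infos:
        # Reject hostnames that resolve to private, loopback, link-local,
        # or reserved addresses (the classic SSRF rebinding targets)
        addr = ipaddress.ip_address(info[4][0].split("%")[0])
        if addr.is_private or addr.is_loopback or addr.is_link_local or addr.is_reserved:
            return False
    return True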

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit b9dfb86260 by HackWeasel, 2025-12-12 17:04:45 -05:00
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,145 @@
"""
RAG (Retrieval-Augmented Generation) API endpoints
"""
import hashlib
import logging
import time
import uuid

from fastapi import APIRouter, HTTPException, Depends
from typing import Dict, Any
from pydantic import BaseModel, Field

from app.core.security import CapabilityToken
from app.api.auth import verify_capability

router = APIRouter()
logger = logging.getLogger(__name__)

class DocumentUploadRequest(BaseModel):
    """Document upload request"""
    content: str = Field(..., description="Document content")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")
    collection: str = Field(default="default", description="Collection name")

class SearchRequest(BaseModel):
    """Semantic search request"""
    query: str = Field(..., description="Search query")
    collection: str = Field(default="default", description="Collection to search")
    top_k: int = Field(default=5, ge=1, le=100, description="Number of results")
@router.post("/upload")
async def upload_document(
request: DocumentUploadRequest,
token: CapabilityToken = Depends(verify_capability)
) -> Dict[str, Any]:
"""Upload document for RAG processing"""
try:
import uuid
import hashlib
# Generate document ID
doc_id = f"doc_{uuid.uuid4().hex[:8]}"
# Create content hash for deduplication
content_hash = hashlib.sha256(request.content.encode()).hexdigest()[:16]
# Process the document content
# In production, this would:
# 1. Split document into chunks
# 2. Generate embeddings using the embedding service
# 3. Store in ChromaDB collection
# For now, simulate document processing
word_count = len(request.content.split())
chunk_count = max(1, word_count // 200) # Simulate ~200 words per chunk
# Store metadata with content
document_data = {
"document_id": doc_id,
"content_hash": content_hash,
"content": request.content,
"metadata": request.metadata,
"collection": request.collection,
"tenant_id": token.tenant_id,
"user_id": token.user_id,
"word_count": word_count,
"chunk_count": chunk_count
}
# In production: Store in ChromaDB
# collection = chromadb_client.get_or_create_collection(request.collection)
# collection.add(documents=[request.content], ids=[doc_id], metadatas=[request.metadata])
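        # A hedged sketch of that production path (assumes a module-level
        # `chromadb_client` and an `embed_texts()` helper, both hypothetical —
        # neither is defined in this file):
        #
        #     chunks = [request.content[i:i + 1000]
        #               for i in range(0, len(request.content), 1000)]
        #     embeddings = embed_texts(chunks)  # hypothetical embedding-service call
        #     collection = chromadb_client.get_or_create_collection(request.collection)
        #     collection.add(
        #         documents=chunks,
        #         embeddings=embeddings,
        #         ids=[f"{doc_id}_chunk_{i}" for i in range(len(chunks))],
        #         metadatas=[{**request.metadata, "document_id": doc_id}] * len(chunks),
        #     )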
logger.info(f"Document uploaded: {doc_id} ({word_count} words, {chunk_count} chunks)")
return {
"success": True,
"document_id": doc_id,
"content_hash": content_hash,
"collection": request.collection,
"word_count": word_count,
"chunk_count": chunk_count,
"message": "Document processed and stored for RAG retrieval"
}
except Exception as e:
logger.error(f"Document upload failed: {e}")
raise HTTPException(status_code=500, detail=f"Document upload failed: {str(e)}")
@router.post("/search")
async def semantic_search(
request: SearchRequest,
token: CapabilityToken = Depends(verify_capability)
) -> Dict[str, Any]:
"""Perform semantic search"""
try:
# In production, this would:
# 1. Generate embedding for the query using embedding service
# 2. Search ChromaDB collection for similar vectors
# 3. Return ranked results with metadata
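        # A hedged sketch of that production path (again assuming the
        # hypothetical `chromadb_client` and `embed_texts()` from above):
        #
        #     collection = chromadb_client.get_or_create_collection(request.collection)
        #     hits = collection.query(
        #         query_embeddings=embed_texts([request.query]),
        #         n_results=request.top_k,
        #     )
        #     # hits["ids"], hits["documents"], hits["metadatas"], and
        #     # hits["distances"] hold the ranked results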
        # For now, simulate semantic search with mock results
        search_start = time.time()

        # Simulate query processing (the mock results below do not
        # actually use these terms)
        query_terms = request.query.lower().split()

        # Mock search results
        mock_results = [
            {
                "document_id": f"doc_result_{i}",
                "content": f"Sample content matching '{request.query}' - result {i+1}",
                "metadata": {
                    "source": f"document_{i+1}.txt",
                    "author": "System",
                    "created_at": "2025-01-01T00:00:00Z",
                },
                "similarity_score": 0.9 - (i * 0.1),
                "chunk_id": f"chunk_{i+1}",
            }
            for i in range(min(request.top_k, 3))  # Return up to 3 mock results
        ]
        search_time = time.time() - search_start

        logger.info(
            f"Semantic search completed: query='{request.query}', "
            f"results={len(mock_results)}, time={search_time:.3f}s"
        )
        return {
            "success": True,
            "query": request.query,
            "collection": request.collection,
            "results": mock_results,
            "total_results": len(mock_results),
            "search_time_ms": int(search_time * 1000),
            "tenant_id": token.tenant_id,
            "user_id": token.user_id,
        }
    except Exception as e:
        # Log the full error server-side; return a generic message so internal
        # details are not exposed in the response
        logger.error(f"Semantic search failed: {e}")
        raise HTTPException(status_code=500, detail="Semantic search failed")
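
As a usage sketch (assuming the router is mounted under a /rag prefix and that verify_capability accepts a bearer token — neither the prefix nor the auth scheme is shown in this file):

import httpx

BASE = "http://localhost:8000/rag"  # hypothetical mount prefix
HEADERS = {"Authorization": "Bearer <capability-token>"}  # hypothetical scheme

with httpx.Client() as client:
    # Upload a document, then search the same collection
    uploaded = client.post(f"{BASE}/upload", headers=HEADERS, json={
        "content": "GT AI OS uses capability tokens for per-tenant access control.",
        "metadata": {"source": "notes.txt"},
    }).json()
    print(uploaded["document_id"], uploaded["chunk_count"])

    found = client.post(f"{BASE}/search", headers=HEADERS, json={
        "query": "capability tokens",
        "top_k": 3,
    }).json()
    print(found["total_results"], [r["chunk_id"] for r in found["results"]])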