GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
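The hostname fix is worth illustrating: substring checks such as `"trusted.example.com" in url` also accept attacker-controlled hosts like `https://trusted.example.com.evil.net/`. A minimal sketch of the parse-then-compare approach described above (the helper name and allowlist are illustrative, not the shipped code):

```python
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # illustrative allowlist


def is_allowed_url(url: str) -> bool:
    # Compare the parsed hostname exactly; substring matching on the raw URL
    # would also accept "https://api.example.com.evil.net/".
    hostname = urlparse(url).hostname
    return hostname is not None and hostname.lower() in ALLOWED_HOSTS
```

The SSRF protection pairs this with DNS resolution checking; a sketch of that idea using the stdlib `socket` and `ipaddress` modules (again illustrative, not the shipped implementation):

```python
import ipaddress
import socket


def resolves_to_public_ip(hostname: str) -> bool:
    # Resolve the name and reject private, loopback, and link-local targets.
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    return all(ipaddress.ip_address(info[4][0]).is_global for info in infos)
```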
371 apps/tenant-backend/app/services/resource_cluster_client.py (new file)
@@ -0,0 +1,371 @@
"""
Resource Cluster Client for GT 2.0 Tenant Backend

Handles communication with the Resource Cluster for AI/ML operations.
Manages capability token generation and LLM inference requests.
"""

import httpx
import json
import logging
from typing import Dict, Any, Optional, AsyncIterator, List, Union
from datetime import datetime, timedelta, timezone

from jose import jwt

from app.core.config import get_settings

logger = logging.getLogger(__name__)


async def fetch_model_rate_limit(
    tenant_id: str,
    model_id: str,
    control_panel_url: str
) -> int:
    """
    Fetch rate limit for a model from Control Panel API.

    Returns requests_per_minute (converted from max_requests_per_hour in database).
    Fails fast if Control Panel is unreachable (GT 2.0 principle: no fallbacks).

    Args:
        tenant_id: Tenant identifier
        model_id: Model identifier
        control_panel_url: Control Panel API base URL

    Returns:
        Requests per minute limit

    Raises:
        RuntimeError: If Control Panel API is unreachable
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            url = f"{control_panel_url}/api/v1/tenant-models/tenants/{tenant_id}/models/{model_id}"
            logger.debug(f"Fetching rate limit from Control Panel: {url}")

            response = await client.get(url)

            if response.status_code == 404:
                logger.warning(f"Model {model_id} not configured for tenant {tenant_id}, using default")
                return 1000  # Default: 1000 requests/minute

            response.raise_for_status()
            config = response.json()

            # API now returns requests_per_minute directly (translated from DB per-hour)
            rate_limits = config.get("rate_limits", {})
            requests_per_minute = rate_limits.get("requests_per_minute", 1000)

            logger.info(f"Model {model_id} rate limit: {requests_per_minute} requests/minute")
            return requests_per_minute

    except httpx.HTTPStatusError as e:
        logger.error(f"Control Panel API error: {e.response.status_code}")
        raise RuntimeError(f"Failed to fetch rate limit: HTTP {e.response.status_code}")
    except httpx.RequestError as e:
        logger.error(f"Control Panel API unreachable: {e}")
        raise RuntimeError(f"Control Panel unreachable at {control_panel_url}")
    except Exception as e:
        logger.error(f"Unexpected error fetching rate limit: {e}")
        raise RuntimeError(f"Failed to fetch rate limit: {e}")


class ResourceClusterClient:
    """Client for communicating with GT 2.0 Resource Cluster"""

    def __init__(self):
        self.settings = get_settings()
        self.resource_cluster_url = self.settings.resource_cluster_url
        self.secret_key = self.settings.secret_key
        self.algorithm = "HS256"
        # One shared client per instance; closed via close()/__aexit__
        self.client = httpx.AsyncClient(timeout=60.0)

    async def generate_capability_token(
        self,
        user_id: str,
        tenant_id: str,
        assistant_config: Dict[str, Any],
        expires_minutes: int = 30
    ) -> str:
        """
        Generate capability token for resource access.

        Fetches real rate limits from Control Panel (single source of truth).
        Fails fast if Control Panel is unreachable.
        """

        # Extract capabilities from agent configuration
        capabilities = []

        # Add LLM capability with real rate limit from Control Panel
        model = assistant_config.get("resource_preferences", {}).get("primary_llm", "llama-3.1-70b-versatile")

        # Fetch real rate limit from Control Panel API
        requests_per_minute = await fetch_model_rate_limit(
            tenant_id=tenant_id,
            model_id=model,
            control_panel_url=self.settings.control_panel_url
        )

        capabilities.append({
            "resource": "llm:groq",
            "actions": ["inference", "streaming"],
            "constraints": {  # "constraints" (not "limits") to match LLM gateway expectations
                "max_tokens_per_request": assistant_config.get("resource_preferences", {}).get("max_tokens", 4000),
                "max_requests_per_minute": requests_per_minute  # Real limit from database (converted from per-hour)
            }
        })

        # Add RAG capabilities if configured
        if assistant_config.get("capabilities", {}).get("rag_enabled"):
            capabilities.append({
                "resource": "rag:semantic_search",
                "actions": ["search", "retrieve"],
                "limits": {
                    "max_results": 10
                }
            })

        # Add embedding capability if embeddings are enabled
        if assistant_config.get("capabilities", {}).get("embeddings_enabled"):
            capabilities.append({
                "resource": "embedding:text-embedding-3-small",
                "actions": ["generate"],
                "limits": {
                    "max_texts_per_request": 100
                }
            })

        # Create token payload. exp/iat must be wall-clock epoch values
        # (jose converts datetimes); event-loop time is monotonic and
        # unrelated to real time, so it cannot be used here.
        now = datetime.now(timezone.utc)
        payload = {
            "sub": user_id,
            "tenant_id": tenant_id,
            "capabilities": capabilities,
            "exp": now + timedelta(minutes=expires_minutes),
            "iat": now
        }

        # Sign token
        token = jwt.encode(payload, self.secret_key, algorithm=self.algorithm)

        return token
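
    # Note: the capability token is a plain JWT signed with the shared secret;
    # the Resource Cluster side is assumed to verify it along the lines of
    #   claims = jwt.decode(token, secret_key, algorithms=["HS256"])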

    async def execute_inference(
        self,
        prompt: str,
        assistant_config: Dict[str, Any],
        user_id: str,
        tenant_id: str,
        stream: bool = False,
        conversation_context: Optional[List[Dict[str, str]]] = None
    ) -> Union[Dict[str, Any], AsyncIterator[str]]:
        """Execute LLM inference via Resource Cluster.

        Returns the JSON response dict, or an async iterator of text chunks
        when stream=True.
        """

        # Generate capability token (async - fetches real rate limits)
        token = await self.generate_capability_token(user_id, tenant_id, assistant_config)

        # Prepare request
        model = assistant_config.get("resource_preferences", {}).get("primary_llm", "llama-3.1-70b-versatile")
        temperature = assistant_config.get("resource_preferences", {}).get("temperature", 0.7)
        max_tokens = assistant_config.get("resource_preferences", {}).get("max_tokens", 4000)

        # Build messages array with system prompt
        messages = []

        # Add system prompt from agent
        system_prompt = assistant_config.get("prompt", "You are a helpful AI agent.")
        messages.append({"role": "system", "content": system_prompt})

        # Add conversation context if provided
        if conversation_context:
            messages.extend(conversation_context)

        # Add current user message
        messages.append({"role": "user", "content": prompt})

        # Prepare request payload
        request_data = {
            "messages": messages,
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": stream,
            "user_id": user_id,
            "tenant_id": tenant_id
        }

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

        try:
            if stream:
                # _stream_inference is an async generator; calling it returns
                # the iterator directly and must not be awaited
                return self._stream_inference(request_data, headers)
            else:
                response = await self.client.post(
                    f"{self.resource_cluster_url}/api/v1/inference/",
                    json=request_data,
                    headers=headers
                )
                response.raise_for_status()
                return response.json()
        except httpx.HTTPError as e:
            logger.error(f"HTTP error during inference: {e}")
            raise
        except Exception as e:
            logger.error(f"Error during inference: {e}")
            raise

    async def _stream_inference(
        self,
        request_data: Dict[str, Any],
        headers: Dict[str, str]
    ) -> AsyncIterator[str]:
        """Stream inference responses"""

        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST",
                f"{self.resource_cluster_url}/api/v1/inference/stream",
                json=request_data,
                headers=headers,
                timeout=60.0
            ) as response:
                response.raise_for_status()

                async for line in response.aiter_lines():
                    if line.startswith("data: "):
                        data = line[6:]  # Remove "data: " prefix
                        if data == "[DONE]":
                            break
                        try:
                            chunk = json.loads(data)
                            if "content" in chunk:
                                yield chunk["content"]
                        except json.JSONDecodeError:
                            logger.warning(f"Failed to parse streaming chunk: {data}")
                            continue
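
    # The stream endpoint is assumed to emit Server-Sent-Events-style frames,
    #   data: {"content": "..."}   ...   data: [DONE]
    # which is what the "data: " prefix / "[DONE]" sentinel handling above parses.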

    async def generate_embeddings(
        self,
        texts: List[str],
        user_id: str,
        tenant_id: str,
        model: str = "text-embedding-3-small"
    ) -> List[List[float]]:
        """Generate embeddings for texts"""

        # Generate capability token with embedding permission
        assistant_config = {"capabilities": {"embeddings_enabled": True}}
        token = await self.generate_capability_token(user_id, tenant_id, assistant_config)

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

        request_data = {
            "texts": texts,
            "model": model
        }

        try:
            response = await self.client.post(
                f"{self.resource_cluster_url}/api/v1/embeddings/",
                json=request_data,
                headers=headers
            )
            response.raise_for_status()
            result = response.json()
            return result.get("embeddings", [])
        except httpx.HTTPError as e:
            logger.error(f"HTTP error during embedding generation: {e}")
            raise
        except Exception as e:
            logger.error(f"Error generating embeddings: {e}")
            raise

    async def search_rag(
        self,
        query: str,
        collection: str,
        user_id: str,
        tenant_id: str,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """Search RAG collection for relevant documents"""

        # Generate capability token with RAG permission
        assistant_config = {"capabilities": {"rag_enabled": True}}
        token = await self.generate_capability_token(user_id, tenant_id, assistant_config)

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

        request_data = {
            "query": query,
            "collection": collection,
            "top_k": top_k
        }

        try:
            response = await self.client.post(
                f"{self.resource_cluster_url}/api/v1/rag/search",
                json=request_data,
                headers=headers
            )
            response.raise_for_status()
            result = response.json()
            return result.get("results", [])
        except httpx.HTTPError as e:
            logger.error(f"HTTP error during RAG search: {e}")
            # Return empty results on error for now
            return []
        except Exception as e:
            logger.error(f"Error searching RAG: {e}")
            return []

    async def get_agent_templates(
        self,
        user_id: str,
        tenant_id: str
    ) -> List[Dict[str, Any]]:
        """Get available agent templates from Resource Cluster"""

        # Generate basic capability token
        token = await self.generate_capability_token(user_id, tenant_id, {})

        headers = {
            "Authorization": f"Bearer {token}"
        }

        try:
            response = await self.client.get(
                f"{self.resource_cluster_url}/api/v1/templates/",
                headers=headers
            )
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching templates: {e}")
            return []
        except Exception as e:
            logger.error(f"Error fetching templates: {e}")
            return []

    async def close(self):
        """Close the HTTP client"""
        await self.client.aclose()

    async def __aenter__(self):
        """Async context manager entry"""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        await self.close()
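
# Example usage (sketch; cluster URL and secret come from settings):
#
#     async with ResourceClusterClient() as rc:
#         result = await rc.execute_inference(
#             prompt="Hello",
#             assistant_config={},
#             user_id="user-1",      # illustrative IDs
#             tenant_id="tenant-1",
#         )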