""" Resource Cluster Client for GT 2.0 Tenant Backend Handles communication with the Resource Cluster for AI/ML operations. Manages capability token generation and LLM inference requests. """ import httpx import json import logging from typing import Dict, Any, Optional, AsyncIterator, List from datetime import timedelta import asyncio from jose import jwt from app.core.config import get_settings logger = logging.getLogger(__name__) async def fetch_model_rate_limit( tenant_id: str, model_id: str, control_panel_url: str ) -> int: """ Fetch rate limit for a model from Control Panel API. Returns requests_per_minute (converted from max_requests_per_hour in database). Fails fast if Control Panel is unreachable (GT 2.0 principle: no fallbacks). Args: tenant_id: Tenant identifier model_id: Model identifier control_panel_url: Control Panel API base URL Returns: Requests per minute limit Raises: RuntimeError: If Control Panel API is unreachable """ try: async with httpx.AsyncClient(timeout=10.0) as client: url = f"{control_panel_url}/api/v1/tenant-models/tenants/{tenant_id}/models/{model_id}" logger.debug(f"Fetching rate limit from Control Panel: {url}") response = await client.get(url) if response.status_code == 404: logger.warning(f"Model {model_id} not configured for tenant {tenant_id}, using default") return 1000 # Default: 1000 requests/minute response.raise_for_status() config = response.json() # API now returns requests_per_minute directly (translated from DB per-hour) rate_limits = config.get("rate_limits", {}) requests_per_minute = rate_limits.get("requests_per_minute", 1000) logger.info(f"Model {model_id} rate limit: {requests_per_minute} requests/minute") return requests_per_minute except httpx.HTTPStatusError as e: logger.error(f"Control Panel API error: {e.response.status_code}") raise RuntimeError(f"Failed to fetch rate limit: HTTP {e.response.status_code}") except httpx.RequestError as e: logger.error(f"Control Panel API unreachable: {e}") raise RuntimeError(f"Control Panel unreachable at {control_panel_url}") except Exception as e: logger.error(f"Unexpected error fetching rate limit: {e}") raise RuntimeError(f"Failed to fetch rate limit: {e}") class ResourceClusterClient: """Client for communicating with GT 2.0 Resource Cluster""" def __init__(self): self.settings = get_settings() self.resource_cluster_url = self.settings.resource_cluster_url self.secret_key = self.settings.secret_key self.algorithm = "HS256" self.client = httpx.AsyncClient(timeout=60.0) async def generate_capability_token( self, user_id: str, tenant_id: str, assistant_config: Dict[str, Any], expires_minutes: int = 30 ) -> str: """ Generate capability token for resource access. Fetches real rate limits from Control Panel (single source of truth). Fails fast if Control Panel is unreachable. 
""" # Extract capabilities from agent configuration capabilities = [] # Add LLM capability with real rate limit from Control Panel model = assistant_config.get("resource_preferences", {}).get("primary_llm", "llama-3.1-70b-versatile") # Fetch real rate limit from Control Panel API requests_per_minute = await fetch_model_rate_limit( tenant_id=tenant_id, model_id=model, control_panel_url=self.settings.control_panel_url ) capabilities.append({ "resource": f"llm:groq", "actions": ["inference", "streaming"], "constraints": { # Changed from "limits" to match LLM gateway expectations "max_tokens_per_request": assistant_config.get("resource_preferences", {}).get("max_tokens", 4000), "max_requests_per_minute": requests_per_minute # Real limit from database (converted from per-hour) } }) # Add RAG capabilities if configured if assistant_config.get("capabilities", {}).get("rag_enabled"): capabilities.append({ "resource": "rag:semantic_search", "actions": ["search", "retrieve"], "limits": { "max_results": 10 } }) # Add embedding capability if RAG is enabled if assistant_config.get("capabilities", {}).get("embeddings_enabled"): capabilities.append({ "resource": "embedding:text-embedding-3-small", "actions": ["generate"], "limits": { "max_texts_per_request": 100 } }) # Create token payload payload = { "sub": user_id, "tenant_id": tenant_id, "capabilities": capabilities, "exp": asyncio.get_event_loop().time() + (expires_minutes * 60), "iat": asyncio.get_event_loop().time() } # Sign token token = jwt.encode(payload, self.secret_key, algorithm=self.algorithm) return token async def execute_inference( self, prompt: str, assistant_config: Dict[str, Any], user_id: str, tenant_id: str, stream: bool = False, conversation_context: Optional[List[Dict[str, str]]] = None ) -> Dict[str, Any]: """Execute LLM inference via Resource Cluster""" # Generate capability token (now async - fetches real rate limits) token = await self.generate_capability_token(user_id, tenant_id, assistant_config) # Prepare request model = assistant_config.get("resource_preferences", {}).get("primary_llm", "llama-3.1-70b-versatile") temperature = assistant_config.get("resource_preferences", {}).get("temperature", 0.7) max_tokens = assistant_config.get("resource_preferences", {}).get("max_tokens", 4000) # Build messages array with system prompt messages = [] # Add system prompt from agent system_prompt = assistant_config.get("prompt", "You are a helpful AI agent.") messages.append({"role": "system", "content": system_prompt}) # Add conversation context if provided if conversation_context: messages.extend(conversation_context) # Add current user message messages.append({"role": "user", "content": prompt}) # Prepare request payload request_data = { "messages": messages, "model": model, "temperature": temperature, "max_tokens": max_tokens, "stream": stream, "user_id": user_id, "tenant_id": tenant_id } headers = { "Authorization": f"Bearer {token}", "Content-Type": "application/json" } try: if stream: return await self._stream_inference(request_data, headers) else: response = await self.client.post( f"{self.resource_cluster_url}/api/v1/inference/", json=request_data, headers=headers ) response.raise_for_status() return response.json() except httpx.HTTPError as e: logger.error(f"HTTP error during inference: {e}") raise except Exception as e: logger.error(f"Error during inference: {e}") raise async def _stream_inference( self, request_data: Dict[str, Any], headers: Dict[str, str] ) -> AsyncIterator[str]: """Stream inference responses""" async with 
    async def _stream_inference(
        self,
        request_data: Dict[str, Any],
        headers: Dict[str, str]
    ) -> AsyncIterator[str]:
        """Stream inference responses as server-sent events."""
        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST",
                f"{self.resource_cluster_url}/api/v1/inference/stream",
                json=request_data,
                headers=headers,
                timeout=60.0
            ) as response:
                response.raise_for_status()

                async for line in response.aiter_lines():
                    if line.startswith("data: "):
                        data = line[6:]  # Remove the "data: " prefix
                        if data == "[DONE]":
                            break
                        try:
                            chunk = json.loads(data)
                            if "content" in chunk:
                                yield chunk["content"]
                        except json.JSONDecodeError:
                            logger.warning(f"Failed to parse streaming chunk: {data}")
                            continue

    async def generate_embeddings(
        self,
        texts: List[str],
        user_id: str,
        tenant_id: str,
        model: str = "text-embedding-3-small"
    ) -> List[List[float]]:
        """Generate embeddings for texts."""
        # Generate a capability token with embedding permission (must be
        # awaited: token generation is async).
        assistant_config = {"capabilities": {"embeddings_enabled": True}}
        token = await self.generate_capability_token(user_id, tenant_id, assistant_config)

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

        request_data = {
            "texts": texts,
            "model": model
        }

        try:
            response = await self.client.post(
                f"{self.resource_cluster_url}/api/v1/embeddings/",
                json=request_data,
                headers=headers
            )
            response.raise_for_status()
            result = response.json()
            return result.get("embeddings", [])

        except httpx.HTTPError as e:
            logger.error(f"HTTP error during embedding generation: {e}")
            raise
        except Exception as e:
            logger.error(f"Error generating embeddings: {e}")
            raise

    async def search_rag(
        self,
        query: str,
        collection: str,
        user_id: str,
        tenant_id: str,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """Search a RAG collection for relevant documents."""
        # Generate a capability token with RAG permission
        assistant_config = {"capabilities": {"rag_enabled": True}}
        token = await self.generate_capability_token(user_id, tenant_id, assistant_config)

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

        request_data = {
            "query": query,
            "collection": collection,
            "top_k": top_k
        }

        try:
            response = await self.client.post(
                f"{self.resource_cluster_url}/api/v1/rag/search",
                json=request_data,
                headers=headers
            )
            response.raise_for_status()
            result = response.json()
            return result.get("results", [])

        except httpx.HTTPError as e:
            logger.error(f"HTTP error during RAG search: {e}")
            # Return empty results on error for now
            return []
        except Exception as e:
            logger.error(f"Error searching RAG: {e}")
            return []

    async def get_agent_templates(
        self,
        user_id: str,
        tenant_id: str
    ) -> List[Dict[str, Any]]:
        """Get available agent templates from the Resource Cluster."""
        # Generate a basic capability token
        token = await self.generate_capability_token(user_id, tenant_id, {})

        headers = {
            "Authorization": f"Bearer {token}"
        }

        try:
            response = await self.client.get(
                f"{self.resource_cluster_url}/api/v1/templates/",
                headers=headers
            )
            response.raise_for_status()
            return response.json()

        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching templates: {e}")
            return []
        except Exception as e:
            logger.error(f"Error fetching templates: {e}")
            return []

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()
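
# Hedged end-to-end sketch: runnable only against a live deployment, and the
# IDs, collection name, and query below are illustrative assumptions rather
# than values defined in this codebase.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        async with ResourceClusterClient() as client:
            results = await client.search_rag(
                query="quarterly revenue",   # hypothetical query
                collection="finance-docs",   # hypothetical collection
                user_id="user-1",
                tenant_id="acme-corp",
                top_k=3,
            )
            logger.info(f"RAG search returned {len(results)} results")

    asyncio.run(_demo())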