gt-ai-os-community/apps/tenant-backend/app/api/v1/models.py
HackWeasel 310491a557 GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
  - Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
2025-12-12 17:47:14 -05:00

"""
Tenant Models API - Interface to Resource Cluster Model Management
Provides tenant-scoped access to available AI models from the Resource Cluster.
"""
from typing import Dict, Any, List, Optional
from fastapi import APIRouter, HTTPException, status, Depends
import httpx
import logging
from app.core.security import get_current_user
from app.core.config import get_settings
from app.core.cache import get_cache
from app.services.resource_cluster_client import ResourceClusterClient

logger = logging.getLogger(__name__)
settings = get_settings()
cache = get_cache()

router = APIRouter(prefix="/api/v1/models", tags=["Models"])
@router.get("/", summary="List available models for tenant")
async def list_available_models(
current_user: Dict = Depends(get_current_user)
) -> Dict[str, Any]:
"""Get list of AI models available to the current tenant"""
try:
# Get tenant domain from current user
tenant_domain = current_user.get("tenant_domain", "default")
# Check cache first (5-minute TTL)
cache_key = f"models_list_{tenant_domain}"
cached_models = cache.get(cache_key, ttl=300)
if cached_models:
logger.debug(f"Returning cached model list for tenant {tenant_domain}")
return {**cached_models, "cached": True}
# Call Resource Cluster models API - use Docker service name if in container
import os
if os.path.exists('/.dockerenv'):
resource_cluster_url = "http://resource-cluster:8000"
else:
resource_cluster_url = settings.resource_cluster_url
async with httpx.AsyncClient() as client:
response = await client.get(
f"{resource_cluster_url}/api/v1/models/",
headers={
"X-Tenant-Domain": tenant_domain
},
timeout=30.0
)
if response.status_code == 200:
models_data = response.json()
models = models_data.get("models", [])
# Filter models by health and deployment status
available_models = [
{
"value": model["id"], # model_id string for backwards compatibility
"uuid": model.get("uuid"), # Database UUID for unique identification
"label": model["name"],
"description": model["description"],
"provider": model["provider"],
"model_type": model["model_type"],
"max_tokens": model["performance"]["max_tokens"],
"context_window": model["performance"]["context_window"],
"cost_per_1k_tokens": model["performance"]["cost_per_1k_tokens"],
"latency_p50_ms": model["performance"]["latency_p50_ms"],
"health_status": model["status"]["health"],
"deployment_status": model["status"]["deployment"]
}
for model in models
if (model["status"]["deployment"] == "available" and
model["status"]["health"] in ["healthy", "unknown"] and
model["model_type"] != "embedding")
]
# Sort by provider preference (NVIDIA first, then Groq) and then by performance
provider_order = {"nvidia": 0, "groq": 1}
available_models.sort(key=lambda x: (
provider_order.get(x["provider"], 99), # NVIDIA first, then Groq
x["latency_p50_ms"] or 999 # Lower latency first
))
result = {
"models": available_models,
"total": len(available_models),
"tenant_domain": tenant_domain,
"last_updated": models_data.get("last_updated"),
"cached": False
}
# Cache the result for 5 minutes
cache.set(cache_key, result)
logger.debug(f"Cached model list for tenant {tenant_domain}")
return result
else:
# Resource Cluster unavailable - return empty list
logger.warning(f"Resource Cluster unavailable (HTTP {response.status_code})")
return {
"models": [],
"total": 0,
"tenant_domain": tenant_domain,
"message": "No models available - resource cluster unavailable"
}
except Exception as e:
logger.error(f"Error fetching models from Resource Cluster: {e}")
# Return empty list in case of error
return {
"models": [],
"total": 0,
"tenant_domain": current_user.get("tenant_domain", "default"),
"message": "No models available - service error"
}
@router.get("/{model_id}", summary="Get model details")
async def get_model_details(
model_id: str,
current_user: Dict = Depends(get_current_user)
) -> Dict[str, Any]:
"""Get detailed information about a specific model"""
try:
tenant_domain = current_user.get("tenant_domain", "default")
# Call Resource Cluster for model details - use Docker service name if in container
import os
if os.path.exists('/.dockerenv'):
resource_cluster_url = "http://resource-cluster:8000"
else:
resource_cluster_url = settings.resource_cluster_url
async with httpx.AsyncClient() as client:
response = await client.get(
f"{resource_cluster_url}/api/v1/models/{model_id}",
headers={
"X-Tenant-Domain": tenant_domain
},
timeout=15.0
)
if response.status_code == 200:
return response.json()
elif response.status_code == 404:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Model {model_id} not found"
)
else:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Resource Cluster unavailable"
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error fetching model {model_id} details: {e}")
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Failed to get model details"
)
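
Note on caching: the handlers above call cache.get(key, ttl=300) but cache.set(key, value) without a TTL, which suggests a cache that checks expiry at read time. The real app.core.cache implementation is not part of this file; the following is only a minimal in-memory sketch consistent with those two calls, and the class name SimpleTTLCache is hypothetical.

# Hypothetical TTL-on-read cache matching the calls made in models.py:
# get(key, ttl=300) returns None once an entry is older than ttl seconds,
# and set(key, value) simply records the value with its insertion time.
import time
from typing import Any, Dict, Optional, Tuple

class SimpleTTLCache:
    def __init__(self) -> None:
        self._store: Dict[str, Tuple[float, Any]] = {}

    def get(self, key: str, ttl: int = 300) -> Optional[Any]:
        entry = self._store.get(key)
        if entry is None:
            return None
        stored_at, value = entry
        if time.time() - stored_at > ttl:
            # Entry expired; drop it so the caller refetches from the Resource Cluster
            del self._store[key]
            return None
        return value

    def set(self, key: str, value: Any) -> None:
        self._store[key] = (time.time(), value)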
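
For context, a minimal sketch of how a tenant client might exercise these two endpoints with httpx. The base URL and bearer token are placeholders (assumptions), since neither is defined in this module; get_current_user is expected to resolve the token to a user with a tenant_domain.

# Usage sketch (assumed base URL and token; not part of models.py)
import asyncio
import httpx

TENANT_API = "http://localhost:8001"   # hypothetical tenant-backend address
TOKEN = "<tenant-user-jwt>"            # placeholder credential for get_current_user

async def main() -> None:
    headers = {"Authorization": f"Bearer {TOKEN}"}
    async with httpx.AsyncClient(base_url=TENANT_API, headers=headers) as client:
        # GET /api/v1/models/ returns the filtered, provider-sorted model list
        listing = (await client.get("/api/v1/models/")).json()
        print(listing["total"], "models for tenant", listing["tenant_domain"])

        # GET /api/v1/models/{model_id} proxies model details from the Resource Cluster
        if listing["models"]:
            first_id = listing["models"][0]["value"]
            details = (await client.get(f"/api/v1/models/{first_id}")).json()
            print(details)

asyncio.run(main())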