- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
- Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
172 lines · 6.7 KiB · Python
"""
|
|
Tenant Models API - Interface to Resource Cluster Model Management
|
|
|
|
Provides tenant-scoped access to available AI models from the Resource Cluster.
|
|
"""

from typing import Dict, Any, List, Optional
from fastapi import APIRouter, HTTPException, status, Depends
import httpx
import logging

from app.core.security import get_current_user
from app.core.config import get_settings
from app.core.cache import get_cache
from app.services.resource_cluster_client import ResourceClusterClient

logger = logging.getLogger(__name__)
settings = get_settings()
cache = get_cache()

router = APIRouter(prefix="/api/v1/models", tags=["Models"])

@router.get("/", summary="List available models for tenant")
|
|
async def list_available_models(
|
|
current_user: Dict = Depends(get_current_user)
|
|
) -> Dict[str, Any]:
|
|
"""Get list of AI models available to the current tenant"""
|
|
|
|
try:
|
|
# Get tenant domain from current user
|
|
tenant_domain = current_user.get("tenant_domain", "default")
|
|
|
|
# Check cache first (5-minute TTL)
|
|
cache_key = f"models_list_{tenant_domain}"
|
|
cached_models = cache.get(cache_key, ttl=300)
|
|
if cached_models:
|
|
logger.debug(f"Returning cached model list for tenant {tenant_domain}")
|
|
return {**cached_models, "cached": True}
|
|
|
|
# Call Resource Cluster models API - use Docker service name if in container
|
|
import os
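        # '/.dockerenv' is a marker file the Docker runtime creates at the
        # container root; checking for it is a common, if informal, heuristic
        # for "running inside a container".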
        if os.path.exists('/.dockerenv'):
            resource_cluster_url = "http://resource-cluster:8000"
        else:
            resource_cluster_url = settings.resource_cluster_url

        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{resource_cluster_url}/api/v1/models/",
                headers={"X-Tenant-Domain": tenant_domain},
                timeout=30.0
            )

            if response.status_code == 200:
                models_data = response.json()
                models = models_data.get("models", [])

                # Filter models by health and deployment status
                available_models = [
                    {
                        "value": model["id"],  # model_id string for backwards compatibility
                        "uuid": model.get("uuid"),  # Database UUID for unique identification
                        "label": model["name"],
                        "description": model["description"],
                        "provider": model["provider"],
                        "model_type": model["model_type"],
                        "max_tokens": model["performance"]["max_tokens"],
                        "context_window": model["performance"]["context_window"],
                        "cost_per_1k_tokens": model["performance"]["cost_per_1k_tokens"],
                        "latency_p50_ms": model["performance"]["latency_p50_ms"],
                        "health_status": model["status"]["health"],
                        "deployment_status": model["status"]["deployment"]
                    }
                    for model in models
                    if (model["status"]["deployment"] == "available" and
                        model["status"]["health"] in ["healthy", "unknown"] and
                        model["model_type"] != "embedding")
                ]

                # Sort by provider preference (NVIDIA first, then Groq) and then by performance
                provider_order = {"nvidia": 0, "groq": 1}
                available_models.sort(key=lambda x: (
                    provider_order.get(x["provider"], 99),  # NVIDIA first, then Groq
                    x["latency_p50_ms"] or 999  # Lower latency first
                ))

                result = {
                    "models": available_models,
                    "total": len(available_models),
                    "tenant_domain": tenant_domain,
                    "last_updated": models_data.get("last_updated"),
                    "cached": False
                }

                # Cache the result for 5 minutes
                cache.set(cache_key, result)
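                # (Expiry appears to be enforced at read time via the ttl=300
                # argument on cache.get() above, so no TTL is passed to set().)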
                logger.debug(f"Cached model list for tenant {tenant_domain}")

                return result
            else:
                # Resource Cluster unavailable - return empty list
                logger.warning(f"Resource Cluster unavailable (HTTP {response.status_code})")
                return {
                    "models": [],
                    "total": 0,
                    "tenant_domain": tenant_domain,
                    "message": "No models available - resource cluster unavailable"
                }

    except Exception as e:
        logger.error(f"Error fetching models from Resource Cluster: {e}")
        # Return empty list in case of error
        return {
            "models": [],
            "total": 0,
            "tenant_domain": current_user.get("tenant_domain", "default"),
            "message": "No models available - service error"
        }
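
# For reference, a successful response from list_available_models is shaped
# like this (values illustrative, keys taken from the result dict above):
#   {"models": [{"value": "...", "uuid": "...", "label": "...", ...}],
#    "total": 3, "tenant_domain": "default", "last_updated": "...", "cached": false}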


@router.get("/{model_id}", summary="Get model details")
async def get_model_details(
    model_id: str,
    current_user: Dict = Depends(get_current_user)
) -> Dict[str, Any]:
    """Get detailed information about a specific model"""
    try:
        tenant_domain = current_user.get("tenant_domain", "default")

        # Call Resource Cluster for model details - use Docker service name if in container
        import os
        if os.path.exists('/.dockerenv'):
            resource_cluster_url = "http://resource-cluster:8000"
        else:
            resource_cluster_url = settings.resource_cluster_url

        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{resource_cluster_url}/api/v1/models/{model_id}",
                headers={"X-Tenant-Domain": tenant_domain},
                timeout=15.0
            )

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 404:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Model {model_id} not found"
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                    detail="Resource Cluster unavailable"
                )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error fetching model {model_id} details: {e}")
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Failed to get model details"
        )
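
A minimal sketch of how a client might call the list endpoint, assuming the gateway is reachable at http://localhost:8000 and that get_current_user sits behind the usual bearer-token scheme (the URL and token below are placeholders, not part of this module):

import asyncio
import httpx

async def main() -> None:
    base_url = "http://localhost:8000"  # placeholder deployment URL
    token = "YOUR_JWT_HERE"             # placeholder credential

    async with httpx.AsyncClient() as client:
        resp = await client.get(
            f"{base_url}/api/v1/models/",
            headers={"Authorization": f"Bearer {token}"},
            timeout=30.0,
        )
        resp.raise_for_status()
        data = resp.json()
        # Models arrive pre-filtered (available + healthy, no embeddings) and
        # pre-sorted: NVIDIA before Groq, then ascending p50 latency.
        for model in data["models"]:
            print(model["value"], model["provider"], model["latency_p50_ms"])

asyncio.run(main())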