gt-ai-os-community/apps/control-panel-backend/app/services/default_models.py
HackWeasel 310491a557 GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
  - Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
2025-12-12 17:47:14 -05:00


"""
Default Model Configurations for GT 2.0
This module contains the default configuration for all 19 Groq models
plus the BGE-M3 embedding model on GT Edge network.
"""
from typing import List, Dict, Any
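
# Each default model configuration is a plain dict; most entries share this shape:
#   model_id / name / version  - identifiers surfaced in the control panel
#   provider                   - "groq", "external", or "ollama"
#   model_type                 - "llm", "audio", "embedding", or "tts"
#   endpoint / api_key_name    - API base URL and the env var holding its key
#                                (omitted or None for local/on-prem models)
#   specifications             - context_window / max_tokens (plus embedding
#                                dimensions for BGE-M3)
#   capabilities               - boolean feature flags (streaming, reasoning,
#                                function_calling, multilingual, ...)
#   cost                       - per_1k_input / per_1k_output pricing
#                                (0.0 for local models)
#   description / is_active    - human-readable summary and enablement flag
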
def get_default_models() -> List[Dict[str, Any]]:
"""Get list of all default model configurations"""
# Groq LLM Models (11 models)
groq_llm_models = [
{
"model_id": "llama-3.3-70b-versatile",
"name": "Llama 3.3 70B Versatile",
"version": "3.3",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 128000,
"max_tokens": 32768,
},
"capabilities": {
"reasoning": True,
"function_calling": True,
"streaming": True,
"multilingual": True
},
"cost": {
"per_1k_input": 0.59,
"per_1k_output": 0.79
},
"description": "Latest Llama 3.3 70B model optimized for versatile tasks with large context window",
"is_active": True
},
{
"model_id": "llama-3.3-70b-specdec",
"name": "Llama 3.3 70B Speculative Decoding",
"version": "3.3",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 8192,
"max_tokens": 8192,
},
"capabilities": {
"reasoning": True,
"function_calling": True,
"streaming": True
},
"cost": {
"per_1k_input": 0.59,
"per_1k_output": 0.79
},
"description": "Llama 3.3 70B with speculative decoding for faster inference",
"is_active": True
},
{
"model_id": "llama-3.2-90b-text-preview",
"name": "Llama 3.2 90B Text Preview",
"version": "3.2",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 128000,
"max_tokens": 8000,
},
"capabilities": {
"reasoning": True,
"function_calling": True,
"streaming": True
},
"cost": {
"per_1k_input": 0.2,
"per_1k_output": 0.2
},
"description": "Large Llama 3.2 model with enhanced text processing capabilities",
"is_active": True
},
{
"model_id": "llama-3.1-405b-reasoning",
"name": "Llama 3.1 405B Reasoning",
"version": "3.1",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 131072,
"max_tokens": 32768,
},
"capabilities": {
"reasoning": True,
"function_calling": True,
"streaming": True,
"multilingual": True
},
"cost": {
"per_1k_input": 2.5,
"per_1k_output": 2.5
},
"description": "Largest Llama model optimized for complex reasoning tasks",
"is_active": True
},
{
"model_id": "llama-3.1-70b-versatile",
"name": "Llama 3.1 70B Versatile",
"version": "3.1",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 131072,
"max_tokens": 32768,
},
"capabilities": {
"reasoning": True,
"function_calling": True,
"streaming": True,
"multilingual": True
},
"cost": {
"per_1k_input": 0.59,
"per_1k_output": 0.79
},
"description": "Balanced Llama model for general-purpose tasks with large context",
"is_active": True
},
{
"model_id": "llama-3.1-8b-instant",
"name": "Llama 3.1 8B Instant",
"version": "3.1",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 131072,
"max_tokens": 8192,
},
"capabilities": {
"streaming": True,
"multilingual": True
},
"cost": {
"per_1k_input": 0.05,
"per_1k_output": 0.08
},
"description": "Fast and efficient Llama model for quick responses",
"is_active": True
},
{
"model_id": "llama3-groq-70b-8192-tool-use-preview",
"name": "Llama 3 Groq 70B Tool Use Preview",
"version": "3.0",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 8192,
"max_tokens": 8192,
},
"capabilities": {
"function_calling": True,
"streaming": True
},
"cost": {
"per_1k_input": 0.89,
"per_1k_output": 0.89
},
"description": "Llama 3 70B optimized for tool use and function calling",
"is_active": True
},
{
"model_id": "llama3-groq-8b-8192-tool-use-preview",
"name": "Llama 3 Groq 8B Tool Use Preview",
"version": "3.0",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 8192,
"max_tokens": 8192,
},
"capabilities": {
"function_calling": True,
"streaming": True
},
"cost": {
"per_1k_input": 0.19,
"per_1k_output": 0.19
},
"description": "Compact Llama 3 model optimized for tool use and function calling",
"is_active": True
},
{
"model_id": "mixtral-8x7b-32768",
"name": "Mixtral 8x7B",
"version": "1.0",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 32768,
"max_tokens": 32768,
},
"capabilities": {
"reasoning": True,
"streaming": True,
"multilingual": True
},
"cost": {
"per_1k_input": 0.24,
"per_1k_output": 0.24
},
"description": "Mixture of experts model with strong multilingual capabilities",
"is_active": True
},
{
"model_id": "gemma2-9b-it",
"name": "Gemma 2 9B Instruction Tuned",
"version": "2.0",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 8192,
"max_tokens": 8192,
},
"capabilities": {
"streaming": True,
"multilingual": False
},
"cost": {
"per_1k_input": 0.2,
"per_1k_output": 0.2
},
"description": "Google's Gemma 2 model optimized for instruction following",
"is_active": True
},
{
"model_id": "llama-guard-3-8b",
"name": "Llama Guard 3 8B",
"version": "3.0",
"provider": "groq",
"model_type": "llm",
"endpoint": "https://api.groq.com/openai/v1",
"api_key_name": "GROQ_API_KEY",
"specifications": {
"context_window": 8192,
"max_tokens": 8192,
},
"capabilities": {
"streaming": False,
"safety_classification": True
},
"cost": {
"per_1k_input": 0.2,
"per_1k_output": 0.2
},
"description": "Safety classification model for content moderation",
"is_active": True
}
]
    # Groq Audio Models (3 models)
    groq_audio_models = [
        {
            "model_id": "whisper-large-v3",
            "name": "Whisper Large v3",
            "version": "3.0",
            "provider": "groq",
            "model_type": "audio",
            "endpoint": "https://api.groq.com/openai/v1",
            "api_key_name": "GROQ_API_KEY",
            "capabilities": {
                "transcription": True,
                "multilingual": True
            },
            "cost": {
                "per_1k_input": 0.111,
                "per_1k_output": 0.111
            },
            "description": "High-quality speech transcription with multilingual support",
            "is_active": True
        },
        {
            "model_id": "whisper-large-v3-turbo",
            "name": "Whisper Large v3 Turbo",
            "version": "3.0",
            "provider": "groq",
            "model_type": "audio",
            "endpoint": "https://api.groq.com/openai/v1",
            "api_key_name": "GROQ_API_KEY",
            "capabilities": {
                "transcription": True,
                "multilingual": True
            },
            "cost": {
                "per_1k_input": 0.04,
                "per_1k_output": 0.04
            },
            "description": "Fast speech transcription optimized for speed",
            "is_active": True
        },
        {
            "model_id": "distil-whisper-large-v3-en",
            "name": "Distil-Whisper Large v3 English",
            "version": "3.0",
            "provider": "groq",
            "model_type": "audio",
            "endpoint": "https://api.groq.com/openai/v1",
            "api_key_name": "GROQ_API_KEY",
            "capabilities": {
                "transcription": True,
                "multilingual": False
            },
            "cost": {
                "per_1k_input": 0.02,
                "per_1k_output": 0.02
            },
            "description": "Compact English-only transcription model",
            "is_active": True
        }
    ]
    # BGE-M3 Embedding Model (External on GT Edge)
    external_models = [
        {
            "model_id": "bge-m3",
            "name": "BAAI BGE-M3 Multilingual Embeddings",
            "version": "1.0",
            "provider": "external",
            "model_type": "embedding",
            "endpoint": "http://10.0.1.50:8080",  # GT Edge local network
            "specifications": {
                "dimensions": 1024,
                "max_tokens": 8192,
            },
            "capabilities": {
                "multilingual": True,
                "dense_retrieval": True,
                "sparse_retrieval": True,
                "colbert": True
            },
            "cost": {
                "per_1k_input": 0.0,
                "per_1k_output": 0.0
            },
            "description": "State-of-the-art multilingual embedding model running on GT Edge local network",
            "config": {
                "batch_size": 32,
                "normalize": True,
                "pooling_method": "mean"
            },
            "is_active": True
        }
    ]
    # Local Ollama Models (for on-premise deployments)
    ollama_models = [
        {
            "model_id": "ollama-local-dgx-x86",
            "name": "Local Ollama (DGX/X86)",
            "version": "1.0",
            "provider": "ollama",
            "model_type": "llm",
            "endpoint": "http://ollama-host:11434/v1/chat/completions",
            "api_key_name": None,  # No API key needed for local Ollama
            "specifications": {
                "context_window": 131072,
                "max_tokens": 4096,
            },
            "capabilities": {
                "streaming": True,
                "function_calling": False
            },
            "cost": {
                "per_1k_input": 0.0,
                "per_1k_output": 0.0
            },
            "description": "Local Ollama instance for DGX and x86 Linux deployments. Uses ollama-host DNS resolution.",
            "is_active": True
        },
        {
            "model_id": "ollama-local-macos",
            "name": "Local Ollama (MacOS)",
            "version": "1.0",
            "provider": "ollama",
            "model_type": "llm",
            "endpoint": "http://host.docker.internal:11434/v1/chat/completions",
            "api_key_name": None,  # No API key needed for local Ollama
            "specifications": {
                "context_window": 131072,
                "max_tokens": 4096,
            },
            "capabilities": {
                "streaming": True,
                "function_calling": False
            },
            "cost": {
                "per_1k_input": 0.0,
                "per_1k_output": 0.0
            },
            "description": "Local Ollama instance for macOS deployments. Uses host.docker.internal for Docker-to-host networking.",
            "is_active": True
        }
    ]
    # TTS Models (placeholder - will be added when available)
    tts_models = [
        # Future TTS models from Groq/PlayAI
    ]

    # Combine all models
    all_models = groq_llm_models + groq_audio_models + external_models + ollama_models + tts_models
    return all_models

def get_groq_models() -> List[Dict[str, Any]]:
    """Get only Groq models"""
    return [model for model in get_default_models() if model["provider"] == "groq"]


def get_external_models() -> List[Dict[str, Any]]:
    """Get only external models (BGE-M3, etc.)"""
    return [model for model in get_default_models() if model["provider"] == "external"]


def get_ollama_models() -> List[Dict[str, Any]]:
    """Get only Ollama models (local inference)"""
    return [model for model in get_default_models() if model["provider"] == "ollama"]


def get_models_by_type(model_type: str) -> List[Dict[str, Any]]:
    """Get models by type (llm, embedding, audio, tts)"""
    return [model for model in get_default_models() if model["model_type"] == model_type]
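

# Minimal usage sketch: summarize the default catalog with the helpers above.
# Illustrative only; in the control-panel backend this module is imported
# rather than run directly.
if __name__ == "__main__":
    models = get_default_models()
    print(f"Total default models: {len(models)}")
    # Counts per provider, using the same provider keys defined above
    for provider in ("groq", "external", "ollama"):
        count = sum(1 for m in models if m["provider"] == provider)
        print(f"  provider={provider}: {count}")
    # Counts per model type, via the type filter helper
    for model_type in ("llm", "audio", "embedding", "tts"):
        names = [m["name"] for m in get_models_by_type(model_type)]
        print(f"  type={model_type}: {len(names)}")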