-- Changelog (from commit message):
--   - Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
--   - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
--     - Made more general-purpose (flexible targets, expanded tools)
--   - Added nemotron-mini-agent.csv for fast local inference via Ollama
--   - Added nemotron-agent.csv for advanced reasoning via Ollama
--   - Added wiki page: Projects for NVIDIA NIMs and Nemotron
-- Migration: 021_add_nvidia_models.sql
-- Description: Add NVIDIA NIM models to model_configs table
-- Date: 2025-12-08
-- Issue: #266 - Add NVIDIA API endpoint support
-- Reference: https://build.nvidia.com/models

-- NVIDIA NIM Models (build.nvidia.com)
-- Pricing: Estimated based on third-party providers and model size (Dec 2025)
-- Models selected: SOTA reasoning, coding, and general-purpose LLMs
-- Seed the NVIDIA NIM model catalog.
-- Runtime stats columns (request_count, error_count, success_rate,
-- avg_latency_ms, health_status) are seeded with neutral defaults and are
-- deliberately NOT overwritten by the ON CONFLICT clause, so re-running this
-- migration never clobbers live telemetry.
INSERT INTO model_configs (
    model_id,
    name,
    version,
    provider,
    model_type,
    endpoint,
    context_window,
    max_tokens,
    cost_per_million_input,
    cost_per_million_output,
    capabilities,
    is_active,
    description,
    created_at,
    updated_at,
    request_count,
    error_count,
    success_rate,
    avg_latency_ms,
    health_status
)
VALUES
-- ==========================================
-- NVIDIA Llama Nemotron Family (Flagship)
-- ==========================================

-- Llama 3.3 Nemotron Super 49B v1 - Latest flagship reasoning model
(
    'nvidia/llama-3.3-nemotron-super-49b-v1',
    'NVIDIA Llama 3.3 Nemotron Super 49B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.5,
    1.5,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'NVIDIA flagship reasoning model - best accuracy/throughput on single GPU',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 Nemotron Ultra 253B - Maximum accuracy
(
    'nvidia/llama-3.1-nemotron-ultra-253b-v1',
    'NVIDIA Llama 3.1 Nemotron Ultra 253B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.6,
    1.8,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Maximum agentic accuracy for scientific reasoning, math, and coding',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Nemotron Nano 8B - Edge/PC deployment
(
    'nvidia/llama-3.1-nemotron-nano-8b-v1',
    'NVIDIA Llama 3.1 Nemotron Nano 8B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.02,
    0.06,
    '{"streaming": true, "function_calling": true}',
    true,
    'Cost-effective model optimized for edge devices and low latency',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Meta Llama 3.3 (via NVIDIA NIM)
-- ==========================================

-- Llama 3.3 70B Instruct - Latest Llama
(
    'nvidia/meta-llama-3.3-70b-instruct',
    'NVIDIA Meta Llama 3.3 70B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.13,
    0.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'Latest Meta Llama 3.3 - excellent for instruction following',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- DeepSeek Models (via NVIDIA NIM)
-- ==========================================

-- DeepSeek V3 - Hybrid inference with Think/Non-Think modes
(
    'nvidia/deepseek-ai-deepseek-v3',
    'NVIDIA DeepSeek V3',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.5,
    1.5,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Hybrid LLM with Think/Non-Think modes, 128K context, strong tool use',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- DeepSeek R1 - Enhanced reasoning
(
    'nvidia/deepseek-ai-deepseek-r1',
    'NVIDIA DeepSeek R1',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.6,
    2.4,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Enhanced reasoning model - reduced hallucination, strong math/coding',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Kimi K2 (Moonshot AI via NVIDIA NIM)
-- ==========================================

(
    'nvidia/moonshot-ai-kimi-k2-instruct',
    'NVIDIA Kimi K2 Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.4,
    1.2,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Long context window with enhanced reasoning capabilities',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Mistral Models (via NVIDIA NIM)
-- ==========================================

-- Mistral Large 3 - State-of-the-art MoE
(
    'nvidia/mistralai-mistral-large-3-instruct',
    'NVIDIA Mistral Large 3 Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.8,
    2.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'State-of-the-art general purpose MoE model',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Qwen Models (via NVIDIA NIM)
-- ==========================================

-- Qwen 3 - Ultra-long context (131K with YaRN extension)
(
    'nvidia/qwen-qwen3-235b-a22b-fp8-instruct',
    'NVIDIA Qwen 3 235B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.7,
    2.1,
    '{"streaming": true, "function_calling": true}',
    true,
    'Ultra-long context AI with strong multilingual support',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Meta Llama 3.1 (via NVIDIA NIM)
-- ==========================================

-- Llama 3.1 405B - Largest open model
(
    'nvidia/meta-llama-3.1-405b-instruct',
    'NVIDIA Meta Llama 3.1 405B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    1.0,
    3.0,
    '{"streaming": true, "function_calling": true}',
    true,
    'Largest open-source LLM - exceptional quality across all tasks',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 70B
(
    'nvidia/meta-llama-3.1-70b-instruct',
    'NVIDIA Meta Llama 3.1 70B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.13,
    0.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'Excellent balance of quality and speed',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 8B - Fast and efficient
(
    'nvidia/meta-llama-3.1-8b-instruct',
    'NVIDIA Meta Llama 3.1 8B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.02,
    0.06,
    '{"streaming": true, "function_calling": true}',
    true,
    'Fast and cost-effective for simpler tasks',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- OpenAI GPT-OSS Models (via NVIDIA NIM)
-- Released August 2025 - Apache 2.0 License
-- ==========================================

-- GPT-OSS 120B via NVIDIA NIM - Production flagship, MoE architecture (117B params, 5.7B active)
(
    'nvidia/openai-gpt-oss-120b',
    'NVIDIA OpenAI GPT-OSS 120B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.7,
    2.1,
    '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
    true,
    'OpenAI flagship open model via NVIDIA NIM - production-grade reasoning, fits single H100 GPU',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- GPT-OSS 20B via NVIDIA NIM - Lightweight MoE for edge/local (21B params, 4B active)
(
    'nvidia/openai-gpt-oss-20b',
    'NVIDIA OpenAI GPT-OSS 20B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.15,
    0.45,
    '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
    true,
    'OpenAI lightweight open model via NVIDIA NIM - low latency, runs in 16GB VRAM',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
)

-- Idempotent upsert: refresh every descriptive column on re-run.
-- FIX: model_type was previously missing from this SET list, so a changed
-- model_type would silently stay stale on re-migration.
-- created_at and the runtime stats columns are intentionally left untouched.
ON CONFLICT (model_id) DO UPDATE SET
    name = EXCLUDED.name,
    version = EXCLUDED.version,
    provider = EXCLUDED.provider,
    model_type = EXCLUDED.model_type,
    endpoint = EXCLUDED.endpoint,
    context_window = EXCLUDED.context_window,
    max_tokens = EXCLUDED.max_tokens,
    cost_per_million_input = EXCLUDED.cost_per_million_input,
    cost_per_million_output = EXCLUDED.cost_per_million_output,
    capabilities = EXCLUDED.capabilities,
    is_active = EXCLUDED.is_active,
    description = EXCLUDED.description,
    updated_at = NOW();
-- Assign NVIDIA models to all existing tenants with 1000 RPM rate limits.
-- Note: model_config_id (UUID) is the foreign key; model_id (string) is kept
-- alongside it for easier ad-hoc queries.
INSERT INTO tenant_model_configs (tenant_id, model_config_id, model_id, is_enabled, priority, rate_limits, created_at, updated_at)
SELECT
    t.id,
    m.id,       -- UUID foreign key (auto-generated in model_configs)
    m.model_id, -- String identifier (kept for easier queries)
    true,
    5,
    '{"max_requests_per_hour": 1000, "max_tokens_per_request": 4000, "concurrent_requests": 5, "max_cost_per_hour": 10.0, "requests_per_minute": 1000, "tokens_per_minute": 100000, "max_concurrent": 10}'::json,
    NOW(),
    NOW()
FROM tenants t
CROSS JOIN model_configs m
WHERE m.provider = 'nvidia'
-- Idempotent upsert: refresh the rate limits on re-run.
-- FIX: also bump updated_at; previously only rate_limits was set, leaving a
-- stale modification timestamp after the limits changed.
ON CONFLICT (tenant_id, model_config_id) DO UPDATE SET
    rate_limits = EXCLUDED.rate_limits,
    updated_at = NOW();
-- Emit a completion notice so the migration runner's log shows what was added.
DO $migration_021$
BEGIN
    RAISE NOTICE 'Migration 021: Added NVIDIA NIM models (Nemotron, Llama 3.3, DeepSeek, Kimi K2, Mistral, Qwen, OpenAI GPT-OSS) to model_configs and assigned to tenants';
END;
$migration_021$;