Files
gt-ai-os-community/scripts/migrations/021_add_nvidia_models.sql
HackWeasel 310491a557 GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
  - Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
2025-12-12 17:47:14 -05:00

433 lines
11 KiB
SQL

-- Migration: 021_add_nvidia_models.sql
-- Description: Add NVIDIA NIM models to model_configs table
-- Date: 2025-12-08
-- Issue: #266 - Add NVIDIA API endpoint support
-- Reference: https://build.nvidia.com/models
-- NVIDIA NIM Models (build.nvidia.com)
-- Pricing: Estimated based on third-party providers and model size (Dec 2025)
-- Models selected: SOTA reasoning, coding, and general-purpose LLMs
-- Upsert the NVIDIA NIM model catalog into model_configs.
--
-- Every row below uses provider 'nvidia', model_type 'llm' and the same chat
-- completions endpoint. New rows start with zeroed request/error counters,
-- a success_rate of 100.0 and health_status 'unknown'.
--
-- Re-running the migration is safe: on a model_id conflict the descriptive
-- columns (including model_type) are refreshed from this file, while
-- created_at and the usage/health columns of the existing row are left as-is.
INSERT INTO model_configs (
    model_id,
    name,
    version,
    provider,
    model_type,
    endpoint,
    context_window,
    max_tokens,
    cost_per_million_input,
    cost_per_million_output,
    capabilities,
    is_active,
    description,
    created_at,
    updated_at,
    request_count,
    error_count,
    success_rate,
    avg_latency_ms,
    health_status
)
VALUES
    -- ==========================================
    -- NVIDIA Llama Nemotron Family (Flagship)
    -- ==========================================

    -- Llama 3.3 Nemotron Super 49B v1 - Latest flagship reasoning model
    (
        'nvidia/llama-3.3-nemotron-super-49b-v1',
        'NVIDIA Llama 3.3 Nemotron Super 49B',
        '1.0', 'nvidia', 'llm',                 -- version, provider, model_type
        'https://integrate.api.nvidia.com/v1/chat/completions',
        131072, 8192,                           -- context_window, max_tokens
        0.5, 1.5,                               -- cost_per_million_input, cost_per_million_output
        '{"streaming": true, "function_calling": true, "reasoning": true}',
        true,
        'NVIDIA flagship reasoning model - best accuracy/throughput on single GPU',
        NOW(), NOW(),                           -- created_at, updated_at
        0, 0, 100.0, 0, 'unknown'               -- request_count, error_count, success_rate, avg_latency_ms, health_status
    ),

    -- Llama 3.1 Nemotron Ultra 253B - Maximum accuracy
    (
        'nvidia/llama-3.1-nemotron-ultra-253b-v1',
        'NVIDIA Llama 3.1 Nemotron Ultra 253B',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        131072, 8192,
        0.6, 1.8,
        '{"streaming": true, "function_calling": true, "reasoning": true}',
        true,
        'Maximum agentic accuracy for scientific reasoning, math, and coding',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- Nemotron Nano 8B - Edge/PC deployment
    (
        'nvidia/llama-3.1-nemotron-nano-8b-v1',
        'NVIDIA Llama 3.1 Nemotron Nano 8B',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        131072, 8192,
        0.02, 0.06,
        '{"streaming": true, "function_calling": true}',
        true,
        'Cost-effective model optimized for edge devices and low latency',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- Meta Llama 3.3 (via NVIDIA NIM)
    -- ==========================================

    -- Llama 3.3 70B Instruct - Latest Llama
    (
        'nvidia/meta-llama-3.3-70b-instruct',
        'NVIDIA Meta Llama 3.3 70B Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 4096,
        0.13, 0.4,
        '{"streaming": true, "function_calling": true}',
        true,
        'Latest Meta Llama 3.3 - excellent for instruction following',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- DeepSeek Models (via NVIDIA NIM)
    -- ==========================================

    -- DeepSeek V3 - Hybrid inference with Think/Non-Think modes
    (
        'nvidia/deepseek-ai-deepseek-v3',
        'NVIDIA DeepSeek V3',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 8192,
        0.5, 1.5,
        '{"streaming": true, "function_calling": true, "reasoning": true}',
        true,
        'Hybrid LLM with Think/Non-Think modes, 128K context, strong tool use',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- DeepSeek R1 - Enhanced reasoning
    (
        'nvidia/deepseek-ai-deepseek-r1',
        'NVIDIA DeepSeek R1',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 8192,
        0.6, 2.4,
        '{"streaming": true, "function_calling": true, "reasoning": true}',
        true,
        'Enhanced reasoning model - reduced hallucination, strong math/coding',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- Kimi K2 (Moonshot AI via NVIDIA NIM)
    -- ==========================================
    (
        'nvidia/moonshot-ai-kimi-k2-instruct',
        'NVIDIA Kimi K2 Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 8192,
        0.4, 1.2,
        '{"streaming": true, "function_calling": true, "reasoning": true}',
        true,
        'Long context window with enhanced reasoning capabilities',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- Mistral Models (via NVIDIA NIM)
    -- ==========================================

    -- Mistral Large 3 - State-of-the-art MoE
    (
        'nvidia/mistralai-mistral-large-3-instruct',
        'NVIDIA Mistral Large 3 Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 8192,
        0.8, 2.4,
        '{"streaming": true, "function_calling": true}',
        true,
        'State-of-the-art general purpose MoE model',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- Qwen Models (via NVIDIA NIM)
    -- ==========================================

    -- Qwen 3 - Ultra-long context (131K with YaRN extension)
    (
        'nvidia/qwen-qwen3-235b-a22b-fp8-instruct',
        'NVIDIA Qwen 3 235B Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        131072, 8192,
        0.7, 2.1,
        '{"streaming": true, "function_calling": true}',
        true,
        'Ultra-long context AI with strong multilingual support',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- Meta Llama 3.1 (via NVIDIA NIM)
    -- ==========================================

    -- Llama 3.1 405B - Largest open model
    (
        'nvidia/meta-llama-3.1-405b-instruct',
        'NVIDIA Meta Llama 3.1 405B Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 4096,
        1.0, 3.0,
        '{"streaming": true, "function_calling": true}',
        true,
        'Largest open-source LLM - exceptional quality across all tasks',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- Llama 3.1 70B
    (
        'nvidia/meta-llama-3.1-70b-instruct',
        'NVIDIA Meta Llama 3.1 70B Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 4096,
        0.13, 0.4,
        '{"streaming": true, "function_calling": true}',
        true,
        'Excellent balance of quality and speed',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- Llama 3.1 8B - Fast and efficient
    (
        'nvidia/meta-llama-3.1-8b-instruct',
        'NVIDIA Meta Llama 3.1 8B Instruct',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 4096,
        0.02, 0.06,
        '{"streaming": true, "function_calling": true}',
        true,
        'Fast and cost-effective for simpler tasks',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- ==========================================
    -- OpenAI GPT-OSS Models (via NVIDIA NIM)
    -- Released August 2025 - Apache 2.0 License
    -- ==========================================

    -- GPT-OSS 120B via NVIDIA NIM - Production flagship, MoE architecture (117B params, 5.7B active)
    (
        'nvidia/openai-gpt-oss-120b',
        'NVIDIA OpenAI GPT-OSS 120B',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 8192,
        0.7, 2.1,
        '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
        true,
        'OpenAI flagship open model via NVIDIA NIM - production-grade reasoning, fits single H100 GPU',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    ),

    -- GPT-OSS 20B via NVIDIA NIM - Lightweight MoE for edge/local (21B params, 4B active)
    (
        'nvidia/openai-gpt-oss-20b',
        'NVIDIA OpenAI GPT-OSS 20B',
        '1.0', 'nvidia', 'llm',
        'https://integrate.api.nvidia.com/v1/chat/completions',
        128000, 8192,
        0.15, 0.45,
        '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
        true,
        'OpenAI lightweight open model via NVIDIA NIM - low latency, runs in 16GB VRAM',
        NOW(), NOW(),
        0, 0, 100.0, 0, 'unknown'
    )
ON CONFLICT (model_id) DO UPDATE SET
    name = EXCLUDED.name,
    version = EXCLUDED.version,
    provider = EXCLUDED.provider,
    -- Keep model_type in step with provider/endpoint so a pre-existing row
    -- with the same model_id cannot retain a stale type after the refresh.
    model_type = EXCLUDED.model_type,
    endpoint = EXCLUDED.endpoint,
    context_window = EXCLUDED.context_window,
    max_tokens = EXCLUDED.max_tokens,
    cost_per_million_input = EXCLUDED.cost_per_million_input,
    cost_per_million_output = EXCLUDED.cost_per_million_output,
    capabilities = EXCLUDED.capabilities,
    is_active = EXCLUDED.is_active,
    description = EXCLUDED.description,
    -- created_at and the usage/health columns are deliberately not listed:
    -- the existing row keeps its history.
    updated_at = NOW();
-- Assign NVIDIA models to all existing tenants (requests_per_minute: 1000).
--
-- The CROSS JOIN pairs every row of tenants with every model_configs row
-- whose provider is 'nvidia', producing one assignment per (tenant, model).
-- Note: model_config_id (UUID) is the foreign key; model_id is stored as well
-- for convenience.
--
-- Re-running is safe: an existing (tenant_id, model_config_id) assignment
-- keeps its is_enabled and priority values; only rate_limits is rewritten,
-- and updated_at is bumped to record that change.
INSERT INTO tenant_model_configs (
    tenant_id,
    model_config_id,
    model_id,
    is_enabled,
    priority,
    rate_limits,
    created_at,
    updated_at
)
SELECT
    t.id,
    m.id,        -- UUID foreign key (auto-generated in model_configs)
    m.model_id,  -- String identifier (kept for easier queries)
    true,
    5,
    '{"max_requests_per_hour": 1000, "max_tokens_per_request": 4000, "concurrent_requests": 5, "max_cost_per_hour": 10.0, "requests_per_minute": 1000, "tokens_per_minute": 100000, "max_concurrent": 10}'::json,
    NOW(),
    NOW()
FROM tenants AS t
CROSS JOIN model_configs AS m
WHERE m.provider = 'nvidia'
ON CONFLICT (tenant_id, model_config_id) DO UPDATE SET
    rate_limits = EXCLUDED.rate_limits,
    -- Without this the row would report its original modification time even
    -- though its rate limits were just overwritten.
    updated_at = NOW();
-- Log migration completion: emit a NOTICE from an anonymous code block.
DO $migration_021$
BEGIN
    RAISE NOTICE 'Migration 021: Added NVIDIA NIM models (Nemotron, Llama 3.3, DeepSeek, Kimi K2, Mistral, Qwen, OpenAI GPT-OSS) to model_configs and assigned to tenants';
END
$migration_021$;