-- Changelog (from commit message):
--   - Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
--   - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
--     - Made more general-purpose (flexible targets, expanded tools)
--   - Added nemotron-mini-agent.csv for fast local inference via Ollama
--   - Added nemotron-agent.csv for advanced reasoning via Ollama
--   - Added wiki page: Projects for NVIDIA NIMs and Nemotron
-- Migration: 021_add_nvidia_models.sql
-- Description: Add NVIDIA NIM models to model_configs table
-- Date: 2025-12-08
-- Issue: #266 - Add NVIDIA API endpoint support
-- Reference: https://build.nvidia.com/models

-- NVIDIA NIM Models (build.nvidia.com)
-- Pricing: Estimated based on third-party providers and model size (Dec 2025)
-- Models selected: SOTA reasoning, coding, and general-purpose LLMs
-- Seed the NVIDIA NIM model catalog.
-- Runtime stats columns (request_count, error_count, success_rate,
-- avg_latency_ms, health_status) are seeded with neutral defaults and are
-- deliberately NOT overwritten by the ON CONFLICT clause, so re-running this
-- migration never clobbers live telemetry.
INSERT INTO model_configs (
    model_id,
    name,
    version,
    provider,
    model_type,
    endpoint,
    context_window,
    max_tokens,
    cost_per_million_input,
    cost_per_million_output,
    capabilities,
    is_active,
    description,
    created_at,
    updated_at,
    request_count,
    error_count,
    success_rate,
    avg_latency_ms,
    health_status
)
VALUES
-- ==========================================
-- NVIDIA Llama Nemotron Family (Flagship)
-- ==========================================

-- Llama 3.3 Nemotron Super 49B v1 - Latest flagship reasoning model
(
    'nvidia/llama-3.3-nemotron-super-49b-v1',
    'NVIDIA Llama 3.3 Nemotron Super 49B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.5,
    1.5,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'NVIDIA flagship reasoning model - best accuracy/throughput on single GPU',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 Nemotron Ultra 253B - Maximum accuracy
(
    'nvidia/llama-3.1-nemotron-ultra-253b-v1',
    'NVIDIA Llama 3.1 Nemotron Ultra 253B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.6,
    1.8,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Maximum agentic accuracy for scientific reasoning, math, and coding',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Nemotron Nano 8B - Edge/PC deployment
(
    'nvidia/llama-3.1-nemotron-nano-8b-v1',
    'NVIDIA Llama 3.1 Nemotron Nano 8B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.02,
    0.06,
    '{"streaming": true, "function_calling": true}',
    true,
    'Cost-effective model optimized for edge devices and low latency',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Meta Llama 3.3 (via NVIDIA NIM)
-- ==========================================

-- Llama 3.3 70B Instruct - Latest Llama
(
    'nvidia/meta-llama-3.3-70b-instruct',
    'NVIDIA Meta Llama 3.3 70B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.13,
    0.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'Latest Meta Llama 3.3 - excellent for instruction following',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- DeepSeek Models (via NVIDIA NIM)
-- ==========================================

-- DeepSeek V3 - Hybrid inference with Think/Non-Think modes
(
    'nvidia/deepseek-ai-deepseek-v3',
    'NVIDIA DeepSeek V3',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.5,
    1.5,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Hybrid LLM with Think/Non-Think modes, 128K context, strong tool use',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- DeepSeek R1 - Enhanced reasoning
(
    'nvidia/deepseek-ai-deepseek-r1',
    'NVIDIA DeepSeek R1',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.6,
    2.4,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Enhanced reasoning model - reduced hallucination, strong math/coding',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Kimi K2 (Moonshot AI via NVIDIA NIM)
-- ==========================================

(
    'nvidia/moonshot-ai-kimi-k2-instruct',
    'NVIDIA Kimi K2 Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.4,
    1.2,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Long context window with enhanced reasoning capabilities',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Mistral Models (via NVIDIA NIM)
-- ==========================================

-- Mistral Large 3 - State-of-the-art MoE
(
    'nvidia/mistralai-mistral-large-3-instruct',
    'NVIDIA Mistral Large 3 Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.8,
    2.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'State-of-the-art general purpose MoE model',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Qwen Models (via NVIDIA NIM)
-- ==========================================

-- Qwen 3 - Ultra-long context (131K with YaRN extension)
(
    'nvidia/qwen-qwen3-235b-a22b-fp8-instruct',
    'NVIDIA Qwen 3 235B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.7,
    2.1,
    '{"streaming": true, "function_calling": true}',
    true,
    'Ultra-long context AI with strong multilingual support',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Meta Llama 3.1 (via NVIDIA NIM)
-- ==========================================

-- Llama 3.1 405B - Largest open model
(
    'nvidia/meta-llama-3.1-405b-instruct',
    'NVIDIA Meta Llama 3.1 405B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    1.0,
    3.0,
    '{"streaming": true, "function_calling": true}',
    true,
    'Largest open-source LLM - exceptional quality across all tasks',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 70B
(
    'nvidia/meta-llama-3.1-70b-instruct',
    'NVIDIA Meta Llama 3.1 70B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.13,
    0.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'Excellent balance of quality and speed',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 8B - Fast and efficient
(
    'nvidia/meta-llama-3.1-8b-instruct',
    'NVIDIA Meta Llama 3.1 8B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.02,
    0.06,
    '{"streaming": true, "function_calling": true}',
    true,
    'Fast and cost-effective for simpler tasks',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- OpenAI GPT-OSS Models (via NVIDIA NIM)
-- Released August 2025 - Apache 2.0 License
-- ==========================================

-- GPT-OSS 120B via NVIDIA NIM - Production flagship, MoE architecture (117B params, 5.7B active)
(
    'nvidia/openai-gpt-oss-120b',
    'NVIDIA OpenAI GPT-OSS 120B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.7,
    2.1,
    '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
    true,
    'OpenAI flagship open model via NVIDIA NIM - production-grade reasoning, fits single H100 GPU',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- GPT-OSS 20B via NVIDIA NIM - Lightweight MoE for edge/local (21B params, 4B active)
(
    'nvidia/openai-gpt-oss-20b',
    'NVIDIA OpenAI GPT-OSS 20B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.15,
    0.45,
    '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
    true,
    'OpenAI lightweight open model via NVIDIA NIM - low latency, runs in 16GB VRAM',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
)

-- Idempotent upsert: refresh every descriptive column on re-run.
-- FIX: model_type was previously missing from this SET list, so a changed
-- model_type would silently stay stale on re-migration.
-- created_at and the runtime stats columns are intentionally left untouched.
ON CONFLICT (model_id) DO UPDATE SET
    name = EXCLUDED.name,
    version = EXCLUDED.version,
    provider = EXCLUDED.provider,
    model_type = EXCLUDED.model_type,
    endpoint = EXCLUDED.endpoint,
    context_window = EXCLUDED.context_window,
    max_tokens = EXCLUDED.max_tokens,
    cost_per_million_input = EXCLUDED.cost_per_million_input,
    cost_per_million_output = EXCLUDED.cost_per_million_output,
    capabilities = EXCLUDED.capabilities,
    is_active = EXCLUDED.is_active,
    description = EXCLUDED.description,
    updated_at = NOW();
-- Assign NVIDIA models to all existing tenants with 1000 RPM rate limits.
-- Note: model_config_id (UUID) is the foreign key; model_id (string) is kept
-- alongside it for easier ad-hoc queries.
INSERT INTO tenant_model_configs (tenant_id, model_config_id, model_id, is_enabled, priority, rate_limits, created_at, updated_at)
SELECT
    t.id,
    m.id,       -- UUID foreign key (auto-generated in model_configs)
    m.model_id, -- String identifier (kept for easier queries)
    true,
    5,
    '{"max_requests_per_hour": 1000, "max_tokens_per_request": 4000, "concurrent_requests": 5, "max_cost_per_hour": 10.0, "requests_per_minute": 1000, "tokens_per_minute": 100000, "max_concurrent": 10}'::json,
    NOW(),
    NOW()
FROM tenants t
CROSS JOIN model_configs m
WHERE m.provider = 'nvidia'
-- Idempotent upsert: refresh the rate limits on re-run.
-- FIX: also bump updated_at; previously only rate_limits was set, leaving a
-- stale modification timestamp after the limits changed.
ON CONFLICT (tenant_id, model_config_id) DO UPDATE SET
    rate_limits = EXCLUDED.rate_limits,
    updated_at = NOW();
-- Emit a completion notice so the migration runner's log shows what was added.
DO $migration_021$
BEGIN
    RAISE NOTICE 'Migration 021: Added NVIDIA NIM models (Nemotron, Llama 3.3, DeepSeek, Kimi K2, Mistral, Qwen, OpenAI GPT-OSS) to model_configs and assigned to tenants';
END;
$migration_021$;