GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2 - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2 - Made more general-purpose (flexible targets, expanded tools) - Added nemotron-mini-agent.csv for fast local inference via Ollama - Added nemotron-agent.csv for advanced reasoning via Ollama - Added wiki page: Projects for NVIDIA NIMs and Nemotron
This commit is contained in:
432
scripts/migrations/021_add_nvidia_models.sql
Normal file
432
scripts/migrations/021_add_nvidia_models.sql
Normal file
@@ -0,0 +1,432 @@
|
||||
-- Migration: 021_add_nvidia_models.sql
-- Description: Add NVIDIA NIM models to model_configs table
-- Date: 2025-12-08
-- Issue: #266 - Add NVIDIA API endpoint support
-- Reference: https://build.nvidia.com/models

-- NVIDIA NIM Models (build.nvidia.com)
-- Pricing: Estimated based on third-party providers and model size (Dec 2025)
-- Models selected: SOTA reasoning, coding, and general-purpose LLMs
--
-- Upsert: re-running this migration refreshes every mutable column of an
-- existing row (keyed on model_id) so pricing/description updates propagate.
INSERT INTO model_configs (
    model_id,
    name,
    version,
    provider,
    model_type,
    endpoint,
    context_window,
    max_tokens,
    cost_per_million_input,
    cost_per_million_output,
    capabilities,
    is_active,
    description,
    created_at,
    updated_at,
    request_count,
    error_count,
    success_rate,
    avg_latency_ms,
    health_status
)
VALUES
-- ==========================================
-- NVIDIA Llama Nemotron Family (Flagship)
-- ==========================================

-- Llama 3.3 Nemotron Super 49B v1 - Latest flagship reasoning model
(
    'nvidia/llama-3.3-nemotron-super-49b-v1',
    'NVIDIA Llama 3.3 Nemotron Super 49B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.5,
    1.5,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'NVIDIA flagship reasoning model - best accuracy/throughput on single GPU',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 Nemotron Ultra 253B - Maximum accuracy
(
    'nvidia/llama-3.1-nemotron-ultra-253b-v1',
    'NVIDIA Llama 3.1 Nemotron Ultra 253B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.6,
    1.8,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Maximum agentic accuracy for scientific reasoning, math, and coding',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Nemotron Nano 8B - Edge/PC deployment
(
    'nvidia/llama-3.1-nemotron-nano-8b-v1',
    'NVIDIA Llama 3.1 Nemotron Nano 8B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.02,
    0.06,
    '{"streaming": true, "function_calling": true}',
    true,
    'Cost-effective model optimized for edge devices and low latency',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Meta Llama 3.3 (via NVIDIA NIM)
-- ==========================================

-- Llama 3.3 70B Instruct - Latest Llama
(
    'nvidia/meta-llama-3.3-70b-instruct',
    'NVIDIA Meta Llama 3.3 70B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.13,
    0.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'Latest Meta Llama 3.3 - excellent for instruction following',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- DeepSeek Models (via NVIDIA NIM)
-- ==========================================

-- DeepSeek V3 - Hybrid inference with Think/Non-Think modes
(
    'nvidia/deepseek-ai-deepseek-v3',
    'NVIDIA DeepSeek V3',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.5,
    1.5,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Hybrid LLM with Think/Non-Think modes, 128K context, strong tool use',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- DeepSeek R1 - Enhanced reasoning
(
    'nvidia/deepseek-ai-deepseek-r1',
    'NVIDIA DeepSeek R1',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.6,
    2.4,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Enhanced reasoning model - reduced hallucination, strong math/coding',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Kimi K2 (Moonshot AI via NVIDIA NIM)
-- ==========================================

(
    'nvidia/moonshot-ai-kimi-k2-instruct',
    'NVIDIA Kimi K2 Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.4,
    1.2,
    '{"streaming": true, "function_calling": true, "reasoning": true}',
    true,
    'Long context window with enhanced reasoning capabilities',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Mistral Models (via NVIDIA NIM)
-- ==========================================

-- Mistral Large 3 - State-of-the-art MoE
(
    'nvidia/mistralai-mistral-large-3-instruct',
    'NVIDIA Mistral Large 3 Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.8,
    2.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'State-of-the-art general purpose MoE model',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Qwen Models (via NVIDIA NIM)
-- ==========================================

-- Qwen 3 - Ultra-long context (131K with YaRN extension)
(
    'nvidia/qwen-qwen3-235b-a22b-fp8-instruct',
    'NVIDIA Qwen 3 235B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    131072,
    8192,
    0.7,
    2.1,
    '{"streaming": true, "function_calling": true}',
    true,
    'Ultra-long context AI with strong multilingual support',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- Meta Llama 3.1 (via NVIDIA NIM)
-- ==========================================

-- Llama 3.1 405B - Largest open model
(
    'nvidia/meta-llama-3.1-405b-instruct',
    'NVIDIA Meta Llama 3.1 405B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    1.0,
    3.0,
    '{"streaming": true, "function_calling": true}',
    true,
    'Largest open-source LLM - exceptional quality across all tasks',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 70B
(
    'nvidia/meta-llama-3.1-70b-instruct',
    'NVIDIA Meta Llama 3.1 70B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.13,
    0.4,
    '{"streaming": true, "function_calling": true}',
    true,
    'Excellent balance of quality and speed',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- Llama 3.1 8B - Fast and efficient
(
    'nvidia/meta-llama-3.1-8b-instruct',
    'NVIDIA Meta Llama 3.1 8B Instruct',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    4096,
    0.02,
    0.06,
    '{"streaming": true, "function_calling": true}',
    true,
    'Fast and cost-effective for simpler tasks',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),

-- ==========================================
-- OpenAI GPT-OSS Models (via NVIDIA NIM)
-- Released August 2025 - Apache 2.0 License
-- ==========================================

-- GPT-OSS 120B via NVIDIA NIM - Production flagship, MoE architecture (117B params, 5.7B active)
(
    'nvidia/openai-gpt-oss-120b',
    'NVIDIA OpenAI GPT-OSS 120B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.7,
    2.1,
    '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
    true,
    'OpenAI flagship open model via NVIDIA NIM - production-grade reasoning, fits single H100 GPU',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
),
-- GPT-OSS 20B via NVIDIA NIM - Lightweight MoE for edge/local (21B params, 4B active)
(
    'nvidia/openai-gpt-oss-20b',
    'NVIDIA OpenAI GPT-OSS 20B',
    '1.0',
    'nvidia',
    'llm',
    'https://integrate.api.nvidia.com/v1/chat/completions',
    128000,
    8192,
    0.15,
    0.45,
    '{"streaming": true, "function_calling": true, "reasoning": true, "tool_use": true}',
    true,
    'OpenAI lightweight open model via NVIDIA NIM - low latency, runs in 16GB VRAM',
    NOW(),
    NOW(),
    0,
    0,
    100.0,
    0,
    'unknown'
)

ON CONFLICT (model_id) DO UPDATE SET
    name = EXCLUDED.name,
    version = EXCLUDED.version,
    provider = EXCLUDED.provider,
    -- FIX: model_type was missing from the update list, so a re-run could not
    -- repair a drifted model_type on an existing row; keep the upsert complete.
    model_type = EXCLUDED.model_type,
    endpoint = EXCLUDED.endpoint,
    context_window = EXCLUDED.context_window,
    max_tokens = EXCLUDED.max_tokens,
    cost_per_million_input = EXCLUDED.cost_per_million_input,
    cost_per_million_output = EXCLUDED.cost_per_million_output,
    capabilities = EXCLUDED.capabilities,
    is_active = EXCLUDED.is_active,
    description = EXCLUDED.description,
    updated_at = NOW();
|
||||
|
||||
-- Assign NVIDIA models to all existing tenants with 1000 RPM rate limits
-- Note: model_config_id (UUID) is the foreign key, model_id kept for convenience
INSERT INTO tenant_model_configs (tenant_id, model_config_id, model_id, is_enabled, priority, rate_limits, created_at, updated_at)
SELECT
    t.id,
    m.id,        -- UUID foreign key (auto-generated in model_configs)
    m.model_id,  -- String identifier (kept for easier queries)
    true,
    5,           -- default priority for newly assigned providers
    '{"max_requests_per_hour": 1000, "max_tokens_per_request": 4000, "concurrent_requests": 5, "max_cost_per_hour": 10.0, "requests_per_minute": 1000, "tokens_per_minute": 100000, "max_concurrent": 10}'::json,
    NOW(),
    NOW()
FROM tenants t
CROSS JOIN model_configs m
WHERE m.provider = 'nvidia'
ON CONFLICT (tenant_id, model_config_id) DO UPDATE SET
    rate_limits = EXCLUDED.rate_limits,
    -- FIX: refresh the audit timestamp when rate limits are re-applied on a
    -- migration re-run; previously only rate_limits was updated, leaving
    -- updated_at stale.
    updated_at = NOW();
|
||||
|
||||
-- Emit a server-log notice so operators can confirm this migration ran.
DO LANGUAGE plpgsql $migration_021$
BEGIN
    RAISE NOTICE 'Migration 021: Added NVIDIA NIM models (Nemotron, Llama 3.3, DeepSeek, Kimi K2, Mistral, Qwen, OpenAI GPT-OSS) to model_configs and assigned to tenants';
END
$migration_021$;
|
||||
Reference in New Issue
Block a user