GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
  - Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama (see the sketch below)
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
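Both Nemotron agents talk to a locally running Ollama instance over its OpenAI-compatible API. The sketch below shows the general call shape only; the base URL, model tag, and prompt are illustrative assumptions, not values taken from the agent CSVs.

# Hedged sketch: chat call against a local Ollama endpoint serving a Nemotron model.
# Assumes Ollama is listening on its default port and that the model tag is "nemotron-mini"
# (e.g. after `ollama pull nemotron-mini`); the real agents may configure different values.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")  # Ollama ignores the API key

response = client.chat.completions.create(
    model="nemotron-mini",
    messages=[{"role": "user", "content": "Outline a small Python microproject."}],
)
print(response.choices[0].message.content)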
.deployment/docker/embedding_server_dgx.py (new file, 464 lines added)
@@ -0,0 +1,464 @@
#!/usr/bin/env python3
"""
DGX-Optimized BGE-M3 Embedding Server for GT 2.0
Optimized for NVIDIA DGX Spark with 20-core Grace ARM architecture
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
import psutil
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with DGX Grace optimizations
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info("ONNX Runtime available for DGX Grace ARM64 optimization")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"

def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)

    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)

    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)

    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)

    return mean_embeddings

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with DGX Grace optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode
    logger.info("Loading BGE-M3 model with DGX Grace ARM64 optimization...")

    # Log system information
    logger.info(f"CPU cores: {psutil.cpu_count(logical=True)}")
    logger.info(f"Memory: {psutil.virtual_memory().total / (1024**3):.1f}GB")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    # Check if ONNX Runtime should be used and is available
    use_onnx_env = os.environ.get('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum DGX Grace performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache - check multiple possible locations
                import glob
                onnx_locations = [
                    f'{cache_dir}/{model_id}/onnx/model.onnx',  # Our export location
                    f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx',  # HF cache location
                ]
                onnx_files = []
                for pattern in onnx_locations:
                    onnx_files = glob.glob(pattern)
                    if onnx_files:
                        break

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options for DGX Grace ARM64
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)
                    sess_options.intra_op_num_threads = 20  # DGX Grace 20 cores
                    sess_options.inter_op_num_threads = 4
                    sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                    # Create ONNX session with DGX optimized settings
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CPUExecutionProvider']
                    )

                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session - DGX)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session (DGX optimized)")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # Define export path within the huggingface cache structure
                    onnx_export_path = os.path.expanduser('~/.cache/huggingface/hub/models--BAAI--bge-m3/onnx')
                    os.makedirs(onnx_export_path, exist_ok=True)

                    logger.info(f"Exporting ONNX model to: {onnx_export_path}")

                    # Export and save the ONNX model
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=True,
                        provider="CPUExecutionProvider"
                    )
                    temp_model.save_pretrained(onnx_export_path)
                    logger.info(f"ONNX model saved to: {onnx_export_path}")
                    del temp_model

                    # Look for the exported model in the new location
                    onnx_export_pattern = f'{onnx_export_path}/model.onnx'
                    onnx_files = glob.glob(onnx_export_pattern)

                    # Also check the original cache locations in case it was saved there instead
                    if not onnx_files:
                        for pattern in onnx_locations:
                            onnx_files = glob.glob(pattern)
                            if onnx_files:
                                break
                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3
                        sess_options.intra_op_num_threads = 20
                        sess_options.inter_op_num_threads = 4
                        sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CPUExecutionProvider']
                        )

                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - DGX Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session (DGX optimized)")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")

            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")

    except Exception:
        # Fallback to SentenceTransformers if ONNX fails or is disabled
        logger.info("Loading BGE-M3 with SentenceTransformers (DGX Grace optimized)...")
        try:
            # Configure PyTorch for DGX Grace
            torch.set_num_threads(20)  # DGX Grace 20 cores
            torch.set_num_interop_threads(4)

            # Load model with DGX optimizations
            model = SentenceTransformer(
                'BAAI/bge-m3',
                device='cpu',
                trust_remote_code=True,
                model_kwargs={
                    'torch_dtype': torch.float16,  # Memory optimization for large models
                    'low_cpu_mem_usage': False  # Use full memory for performance
                }
            )

            # Enable optimizations
            model._modules['0'].auto_model.eval()

            use_onnx = False
            model_mode = "SentenceTransformers (DGX Grace)"
            logger.info("✅ BGE-M3 loaded successfully with SentenceTransformers (DGX Grace optimized)")

        except Exception as e:
            logger.error(f"❌ Failed to load BGE-M3 model: {e}")
            raise e

    # Log model configuration
    logger.info(f"Model mode: {model_mode}")
    logger.info(f"Using ONNX: {use_onnx}")
    logger.info(f"OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"PYTORCH_NUM_THREADS: {os.environ.get('PYTORCH_NUM_THREADS', 'not set')}")

    yield

    # Cleanup
    logger.info("Shutting down BGE-M3 embedding server...")
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# FastAPI app with lifespan
app = FastAPI(
    title="GT 2.0 DGX BGE-M3 Embedding Server",
    description="DGX Grace ARM optimized BGE-M3 embedding service for GT 2.0",
    version="2.0.0-dgx",
    lifespan=lifespan
)

# Pydantic models for OpenAI compatibility
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")

class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int

class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int

class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage

@app.get("/health")
async def health_check():
    """Health check endpoint with DGX system metrics"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Include system metrics for DGX monitoring
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()

    return {
        "status": "healthy",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "using_onnx": use_onnx,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "cpu_usage": cpu_percent,
        "memory_total_gb": round(memory.total / (1024**3), 1),
        "memory_used_gb": round(memory.used / (1024**3), 1),
        "memory_available_gb": round(memory.available / (1024**3), 1),
        "omp_threads": os.environ.get('OMP_NUM_THREADS', 'not set'),
        "pytorch_threads": os.environ.get('PYTORCH_NUM_THREADS', 'not set'),
        "timestamp": datetime.utcnow().isoformat()
    }

@app.post("/v1/embeddings", response_model=EmbeddingResponse)
|
||||
async def create_embeddings(request: EmbeddingRequest):
|
||||
"""Create embeddings using BGE-M3 model (OpenAI compatible)"""
|
||||
if not model and not onnx_session:
|
||||
raise HTTPException(status_code=503, detail="Model not loaded")
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
input_texts = request.input
|
||||
|
||||
# Validate input
|
||||
if not input_texts or len(input_texts) == 0:
|
||||
raise HTTPException(status_code=400, detail="Input texts cannot be empty")
|
||||
|
||||
# Log processing info for DGX monitoring
|
||||
logger.info(f"Processing {len(input_texts)} texts with {model_mode}")
|
||||
|
||||
# DGX optimized batch processing
|
||||
if use_onnx and onnx_session:
|
||||
# Direct ONNX Runtime path for maximum DGX Grace performance
|
||||
batch_size = min(len(input_texts), 128) # Larger batches for DGX Grace
|
||||
embeddings = []
|
||||
|
||||
for i in range(0, len(input_texts), batch_size):
|
||||
batch_texts = input_texts[i:i + batch_size]
|
||||
|
||||
# Tokenize
|
||||
inputs = tokenizer(
|
||||
batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="np",
|
||||
max_length=512
|
||||
)
|
||||
|
||||
# Run ONNX inference
|
||||
# BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
|
||||
outputs = onnx_session.run(
|
||||
None, # Get all outputs
|
||||
{
|
||||
'input_ids': inputs['input_ids'].astype(np.int64),
|
||||
'attention_mask': inputs['attention_mask'].astype(np.int64)
|
||||
}
|
||||
)
|
||||
|
||||
# Get token embeddings (first output)
|
||||
token_embeddings = outputs[0]
|
||||
|
||||
# Mean pooling with attention mask
|
||||
batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])
|
||||
|
||||
# Normalize embeddings
|
||||
norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
|
||||
batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)
|
||||
|
||||
embeddings.extend(batch_embeddings)
|
||||
|
||||
embeddings = np.array(embeddings)
|
||||
else:
|
||||
# SentenceTransformers path with DGX optimization
|
||||
with torch.no_grad():
|
||||
embeddings = model.encode(
|
||||
input_texts,
|
||||
convert_to_numpy=True,
|
||||
normalize_embeddings=True,
|
||||
batch_size=32, # Optimal for DGX Grace
|
||||
show_progress_bar=False
|
||||
)
|
||||
|
||||
# Convert to list format for OpenAI compatibility
|
||||
if hasattr(embeddings, 'tolist'):
|
||||
embeddings = embeddings.tolist()
|
||||
elif isinstance(embeddings, list) and len(embeddings) > 0:
|
||||
if hasattr(embeddings[0], 'tolist'):
|
||||
embeddings = [emb.tolist() for emb in embeddings]
|
||||
|
||||
# Create response in OpenAI format
|
||||
embedding_data = [
|
||||
EmbeddingData(
|
||||
embedding=embedding,
|
||||
index=i
|
||||
)
|
||||
for i, embedding in enumerate(embeddings)
|
||||
]
|
||||
|
||||
processing_time = time.time() - start_time
|
||||
|
||||
# Calculate token usage (rough estimation)
|
||||
total_tokens = sum(len(text.split()) for text in input_texts)
|
||||
|
||||
# Log performance metrics for DGX monitoring
|
||||
texts_per_second = len(input_texts) / processing_time
|
||||
logger.info(f"Processed {len(input_texts)} texts in {processing_time:.2f}s ({texts_per_second:.1f} texts/sec)")
|
||||
|
||||
return EmbeddingResponse(
|
||||
data=embedding_data,
|
||||
model=request.model,
|
||||
usage=EmbeddingUsage(
|
||||
prompt_tokens=total_tokens,
|
||||
total_tokens=total_tokens
|
||||
)
|
||||
)
|
||||
|
||||
    except HTTPException:
        # Re-raise intentional HTTP errors (e.g. the 400 for empty input)
        # instead of converting them into a generic 500 below
        raise
    except Exception as e:
        logger.error(f"❌ Embedding generation failed: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")

@app.get("/v1/models")
|
||||
@app.get("/models")
|
||||
async def list_models():
|
||||
"""List available models (OpenAI compatible)"""
|
||||
return {
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "BAAI/bge-m3",
|
||||
"object": "model",
|
||||
"created": int(time.time()),
|
||||
"owned_by": "gt2-dgx",
|
||||
"permission": [],
|
||||
"root": "BAAI/bge-m3",
|
||||
"parent": None
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""Root endpoint with DGX info"""
|
||||
return {
|
||||
"service": "GT 2.0 DGX BGE-M3 Embedding Server",
|
||||
"version": "2.0.0-dgx",
|
||||
"model": "BAAI/bge-m3",
|
||||
"mode": model_mode,
|
||||
"platform": os.environ.get('GT2_PLATFORM', 'unknown'),
|
||||
"architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
|
||||
"cpu_cores": psutil.cpu_count(logical=True),
|
||||
"openai_compatible": True,
|
||||
"endpoints": {
|
||||
"embeddings": "/v1/embeddings",
|
||||
"models": "/models",
|
||||
"health": "/health"
|
||||
}
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger.info("Starting GT 2.0 DGX BGE-M3 Embedding Server...")
|
||||
logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
|
||||
logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")
|
||||
|
||||
uvicorn.run(
|
||||
app,
|
||||
host="0.0.0.0",
|
||||
port=8000,
|
||||
workers=1, # Single worker for model memory efficiency
|
||||
loop="asyncio",
|
||||
access_log=True
|
||||
)
|
||||
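For reference, the new server exposes the standard OpenAI embeddings route on port 8000, so any OpenAI-style client can call it. A minimal usage sketch, assuming the server is running locally with the defaults configured above:

# Hedged usage sketch for the embedding server above; host and port match the uvicorn
# settings in __main__, everything else is illustrative.
import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={"input": ["What is BGE-M3?", "DGX Grace has 20 ARM cores."], "model": "BAAI/bge-m3"},
    timeout=60,
)
resp.raise_for_status()
payload = resp.json()

# The response follows the EmbeddingResponse schema defined in the file:
# {"object": "list", "data": [{"object": "embedding", "embedding": [...], "index": 0}, ...],
#  "model": "BAAI/bge-m3", "usage": {"prompt_tokens": ..., "total_tokens": ...}}
vectors = [item["embedding"] for item in payload["data"]]
print(len(vectors), len(vectors[0]))  # number of inputs, embedding dimension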