GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents

- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
  - Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
Author: HackWeasel
Date: 2025-12-12 17:47:14 -05:00
Commit: 310491a557
750 changed files with 232701 additions and 0 deletions


@@ -0,0 +1,56 @@
FROM python:3.11-slim

# Install system dependencies for ARM64 with optimized BLAS libraries
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only for ARM with optimized BLAS
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Install optimized dependencies for ARM64
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime \
    "optimum[onnxruntime]"

# Set comprehensive ARM64 environment variables for maximum performance
ENV OMP_NUM_THREADS=8
ENV MKL_NUM_THREADS=8
ENV BLIS_NUM_THREADS=8
ENV VECLIB_MAXIMUM_THREADS=8
ENV PYTORCH_NUM_THREADS=8
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
ENV CUDA_VISIBLE_DEVICES=""
ENV USE_ONNX_RUNTIME=true
ENV CFLAGS="-march=armv8-a+simd+fp16 -O3"
ENV CXXFLAGS="-march=armv8-a+simd+fp16 -O3"

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]


@@ -0,0 +1,73 @@
FROM python:3.11-slim

# Install system dependencies for DGX Grace ARM with optimized libraries
# Note: Removed libatlas-base-dev as it's not available in Debian Trixie ARM64
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    build-essential \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only for ARM with optimized BLAS
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Install optimized dependencies for DGX Grace ARM64
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime \
    "optimum[onnxruntime]" \
    psutil

# Set comprehensive DGX Grace ARM64 environment variables for maximum performance
ENV OMP_NUM_THREADS=20
ENV MKL_NUM_THREADS=20
ENV BLIS_NUM_THREADS=20
ENV OPENBLAS_NUM_THREADS=20
ENV VECLIB_MAXIMUM_THREADS=20
ENV PYTORCH_NUM_THREADS=20
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
ENV CUDA_VISIBLE_DEVICES=""
ENV USE_ONNX_RUNTIME=true
ENV MALLOC_ARENA_MAX=8

# DGX Grace architecture optimizations
ENV CFLAGS="-march=armv8.2-a+fp16+rcpc+dotprod -O3 -ffast-math"
ENV CXXFLAGS="-march=armv8.2-a+fp16+rcpc+dotprod -O3 -ffast-math"

# Memory optimization for 128GB system
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
ENV OMP_STACKSIZE=2M
ENV KMP_STACKSIZE=2M

# Platform identification
ENV GT2_PLATFORM=dgx
ENV GT2_ARCHITECTURE=grace-arm

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server optimized for DGX
COPY .deployment/docker/embedding_server_dgx.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check with longer timeout for DGX startup
HEALTHCHECK --interval=30s --timeout=60s --start-period=600s --retries=5 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]


@@ -0,0 +1,56 @@
FROM python:3.11-slim

# Install system dependencies for x86_64 with optimized BLAS libraries
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch with CUDA support for x86_64 (auto-falls back to CPU if no GPU)
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install optimized dependencies for x86_64
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime-gpu \
    "optimum[onnxruntime-gpu]"

# Set comprehensive x86_64 environment variables for maximum performance
ENV OMP_NUM_THREADS=16
ENV BLIS_NUM_THREADS=16
ENV OPENBLAS_NUM_THREADS=16
ENV PYTORCH_NUM_THREADS=16
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

# GPU auto-detection: ONNX Runtime will use CUDAExecutionProvider if available, else CPU
ENV USE_ONNX_RUNTIME=true

# x86_64 specific compiler optimization flags
ENV CFLAGS="-march=native -O3 -mavx2 -mfma"
ENV CXXFLAGS="-march=native -O3 -mavx2 -mfma"

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]


@@ -0,0 +1,381 @@
#!/usr/bin/env python3
"""
OpenAI-Compatible BGE-M3 Embedding Server for GT 2.0
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with ONNX Runtime optimization
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# Limit VRAM usage if GPU is available (BGE-M3 needs ~2.5GB)
if torch.cuda.is_available():
    memory_fraction = float(os.environ.get('CUDA_MEMORY_FRACTION', '0.25'))
    torch.cuda.set_per_process_memory_fraction(memory_fraction)
    logger.info(f"CUDA memory limited to {memory_fraction*100:.0f}% of available VRAM")

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info(f"ONNX Runtime available (providers: {ort.get_available_providers()})")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"


def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)
    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)
    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)
    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)
    return mean_embeddings
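
# Shape example (illustrative, not used by the server): for token_embeddings of
# shape (batch=2, seq_len=4, hidden=1024) and an attention_mask of shape (2, 4),
# mean_pooling averages only the positions where the mask is 1 and returns a
# (2, 1024) array; the np.maximum(sum_mask, 1e-9) guard prevents division by
# zero for fully padded rows.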


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with ONNX optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode

    logger.info("Loading BGE-M3 model with ARM64 optimization...")

    # Check if ONNX Runtime should be used
    use_onnx_env = os.getenv('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum ARM64 performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache
                import glob
                onnx_pattern = f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx'
                onnx_files = glob.glob(onnx_pattern)

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options to suppress ARM64 warnings
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)

                    # Create ONNX session with GPU auto-detection (falls back to CPU)
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                    )
                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # This will cache the ONNX model for future use
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=False,
                        provider="CPUExecutionProvider"
                    )
                    del temp_model

                    # Now find the newly exported model
                    onnx_files = glob.glob(onnx_pattern)
                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session (GPU auto-detection)
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3
                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                        )
                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")
            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")
    except Exception:
        # Fallback to SentenceTransformers with GPU auto-detection
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Loading BGE-M3 with SentenceTransformers (fallback mode) on {device}...")
        model = SentenceTransformer(
            'BAAI/bge-m3',
            device=device,
            trust_remote_code=True
        )
        use_onnx = False
        model_mode = f"SentenceTransformers ({device.upper()})"
        logger.info(f"✅ BGE-M3 model loaded with SentenceTransformers on {device}")

    logger.info(f"Model mode: {model_mode}")
    logger.info(f"PyTorch threads: {torch.get_num_threads()}")
    logger.info(f"OMP threads: {os.getenv('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")

    yield

    # Cleanup
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


app = FastAPI(
    title="BGE-M3 Embedding Service",
    description="OpenAI-compatible BGE-M3 embedding API for GT 2.0",
    version="1.0.0",
    lifespan=lifespan
)


# OpenAI-compatible request models
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")


class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def create_embeddings(request: EmbeddingRequest):
    """Generate embeddings using BGE-M3 model"""
    if not model and not onnx_session:
        raise HTTPException(status_code=500, detail="BGE-M3 model not loaded")
    if not request.input:
        raise HTTPException(status_code=400, detail="No input texts provided")

    start_time = time.time()
    try:
        logger.info(f"Generating embeddings for {len(request.input)} texts using {model_mode}")

        # Generate embeddings with mode-specific logic
        if use_onnx and onnx_session:
            # Direct ONNX Runtime path for maximum performance
            batch_size = min(len(request.input), 64)
            embeddings = []
            for i in range(0, len(request.input), batch_size):
                batch_texts = request.input[i:i + batch_size]

                # Tokenize
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="np",
                    max_length=512
                )

                # Run ONNX inference
                # BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
                outputs = onnx_session.run(
                    None,  # Get all outputs
                    {
                        'input_ids': inputs['input_ids'].astype(np.int64),
                        'attention_mask': inputs['attention_mask'].astype(np.int64)
                    }
                )

                # Get token embeddings (first output)
                token_embeddings = outputs[0]

                # Mean pooling with attention mask
                batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])

                # Normalize embeddings
                norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
                batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)

                embeddings.extend(batch_embeddings)
            embeddings = np.array(embeddings)
        else:
            # SentenceTransformers fallback path
            embeddings = model.encode(
                request.input,
                batch_size=min(len(request.input), 64),
                show_progress_bar=False,
                convert_to_tensor=False,
                normalize_embeddings=True
            )

        # Convert to list format
        if hasattr(embeddings, 'tolist'):
            embeddings = embeddings.tolist()
        elif isinstance(embeddings, list) and len(embeddings) > 0:
            if hasattr(embeddings[0], 'tolist'):
                embeddings = [emb.tolist() for emb in embeddings]

        # Create response in OpenAI format
        embedding_data = [
            EmbeddingData(
                embedding=embedding,
                index=i
            )
            for i, embedding in enumerate(embeddings)
        ]

        # Calculate token usage (rough estimation)
        total_tokens = sum(len(text.split()) for text in request.input)

        processing_time_ms = int((time.time() - start_time) * 1000)
        logger.info(f"Generated {len(embeddings)} embeddings in {processing_time_ms}ms")

        return EmbeddingResponse(
            data=embedding_data,
            model=request.model,
            usage=EmbeddingUsage(
                prompt_tokens=total_tokens,
                total_tokens=total_tokens
            )
        )
    except Exception as e:
        logger.error(f"Error generating embeddings: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy" if (model or onnx_session) else "unhealthy",
        "model": "BAAI/bge-m3",
        "service": "bge-m3-embeddings",
        "mode": model_mode,
        "onnx_enabled": use_onnx,
        "gpu_available": torch.cuda.is_available(),
        "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
        "pytorch_threads": torch.get_num_threads(),
        "timestamp": datetime.utcnow().isoformat()
    }


@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI-compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "BAAI/bge-m3",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "gt2"
            }
        ]
    }


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "service": "BGE-M3 Embedding Service",
        "model": "BAAI/bge-m3",
        "version": "1.0.0",
        "api": "OpenAI-compatible",
        "status": "ready" if (model or onnx_session) else "loading"
    }


if __name__ == "__main__":
    uvicorn.run(
        "embedding_server:app",
        host="0.0.0.0",
        port=8000,
        log_level="info"
    )
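
Because the server implements the OpenAI embeddings contract (/v1/embeddings and /v1/models), it can also be exercised with the official openai Python client by pointing base_url at the container. The snippet below is a usage sketch under that assumption; the client package is not installed by the Dockerfiles above, and the API key value is arbitrary because this server never checks it.

# Usage sketch: call the server above through the openai>=1.x client.
# Assumes the server is reachable at localhost:8000; the key is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

resp = client.embeddings.create(
    model="BAAI/bge-m3",
    input=["GT 2.0 retrieval test", "second example sentence"],
)
for item in resp.data:
    print(item.index, len(item.embedding))
print(resp.usage.total_tokens, "tokens (rough whitespace estimate from the server)")

The returned vectors are L2-normalized by the server (both the ONNX path and the SentenceTransformers path), so cosine similarity between them reduces to a dot product.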


@@ -0,0 +1,464 @@
#!/usr/bin/env python3
"""
DGX-Optimized BGE-M3 Embedding Server for GT 2.0
Optimized for NVIDIA DGX Spark with 20-core Grace ARM architecture
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
import psutil
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with DGX Grace optimizations
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info("ONNX Runtime available for DGX Grace ARM64 optimization")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"


def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)
    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)
    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)
    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)
    return mean_embeddings


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with DGX Grace optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode

    logger.info("Loading BGE-M3 model with DGX Grace ARM64 optimization...")

    # Log system information
    logger.info(f"CPU cores: {psutil.cpu_count(logical=True)}")
    logger.info(f"Memory: {psutil.virtual_memory().total / (1024**3):.1f}GB")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    # Check if ONNX Runtime should be used and is available
    use_onnx_env = os.environ.get('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum DGX Grace performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache - check multiple possible locations
                import glob
                onnx_locations = [
                    f'{cache_dir}/{model_id}/onnx/model.onnx',  # Our export location
                    f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx',  # HF cache location
                ]
                onnx_files = []
                for pattern in onnx_locations:
                    onnx_files = glob.glob(pattern)
                    if onnx_files:
                        break

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options for DGX Grace ARM64
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)
                    sess_options.intra_op_num_threads = 20  # DGX Grace 20 cores
                    sess_options.inter_op_num_threads = 4
                    sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                    # Create ONNX session with DGX optimized settings
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CPUExecutionProvider']
                    )
                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session - DGX)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session (DGX optimized)")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # Define export path within the huggingface cache structure
                    onnx_export_path = os.path.expanduser('~/.cache/huggingface/hub/models--BAAI--bge-m3/onnx')
                    os.makedirs(onnx_export_path, exist_ok=True)
                    logger.info(f"Exporting ONNX model to: {onnx_export_path}")

                    # Export and save the ONNX model
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=True,
                        provider="CPUExecutionProvider"
                    )
                    temp_model.save_pretrained(onnx_export_path)
                    logger.info(f"ONNX model saved to: {onnx_export_path}")
                    del temp_model

                    # Look for the exported model in the new location
                    onnx_export_pattern = f'{onnx_export_path}/model.onnx'
                    onnx_files = glob.glob(onnx_export_pattern)

                    # Also check the HF snapshot location in case it was cached differently
                    if not onnx_files:
                        onnx_files = glob.glob(onnx_locations[1])

                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3
                        sess_options.intra_op_num_threads = 20
                        sess_options.inter_op_num_threads = 4
                        sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CPUExecutionProvider']
                        )
                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - DGX Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session (DGX optimized)")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")
            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")
    except Exception:
        # Fallback to SentenceTransformers if ONNX fails or is disabled
        logger.info("Loading BGE-M3 with SentenceTransformers (DGX Grace optimized)...")
        try:
            # Configure PyTorch for DGX Grace
            torch.set_num_threads(20)  # DGX Grace 20 cores
            torch.set_num_interop_threads(4)

            # Load model with DGX optimizations
            model = SentenceTransformer(
                'BAAI/bge-m3',
                device='cpu',
                trust_remote_code=True,
                model_kwargs={
                    'torch_dtype': torch.float16,  # Memory optimization for large models
                    'low_cpu_mem_usage': False  # Use full memory for performance
                }
            )

            # Enable optimizations
            model._modules['0'].auto_model.eval()

            use_onnx = False
            model_mode = "SentenceTransformers (DGX Grace)"
            logger.info("✅ BGE-M3 loaded successfully with SentenceTransformers (DGX Grace optimized)")
        except Exception as e:
            logger.error(f"❌ Failed to load BGE-M3 model: {e}")
            raise e

    # Log model configuration
    logger.info(f"Model mode: {model_mode}")
    logger.info(f"Using ONNX: {use_onnx}")
    logger.info(f"OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"PYTORCH_NUM_THREADS: {os.environ.get('PYTORCH_NUM_THREADS', 'not set')}")

    yield

    # Cleanup
    logger.info("Shutting down BGE-M3 embedding server...")
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# FastAPI app with lifespan
app = FastAPI(
    title="GT 2.0 DGX BGE-M3 Embedding Server",
    description="DGX Grace ARM optimized BGE-M3 embedding service for GT 2.0",
    version="2.0.0-dgx",
    lifespan=lifespan
)


# Pydantic models for OpenAI compatibility
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")


class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage


@app.get("/health")
async def health_check():
    """Health check endpoint with DGX system metrics"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Include system metrics for DGX monitoring
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()

    return {
        "status": "healthy",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "using_onnx": use_onnx,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "cpu_usage": cpu_percent,
        "memory_total_gb": round(memory.total / (1024**3), 1),
        "memory_used_gb": round(memory.used / (1024**3), 1),
        "memory_available_gb": round(memory.available / (1024**3), 1),
        "omp_threads": os.environ.get('OMP_NUM_THREADS', 'not set'),
        "pytorch_threads": os.environ.get('PYTORCH_NUM_THREADS', 'not set'),
        "timestamp": datetime.utcnow().isoformat()
    }


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def create_embeddings(request: EmbeddingRequest):
    """Create embeddings using BGE-M3 model (OpenAI compatible)"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        start_time = time.time()
        input_texts = request.input

        # Validate input
        if not input_texts or len(input_texts) == 0:
            raise HTTPException(status_code=400, detail="Input texts cannot be empty")

        # Log processing info for DGX monitoring
        logger.info(f"Processing {len(input_texts)} texts with {model_mode}")

        # DGX optimized batch processing
        if use_onnx and onnx_session:
            # Direct ONNX Runtime path for maximum DGX Grace performance
            batch_size = min(len(input_texts), 128)  # Larger batches for DGX Grace
            embeddings = []
            for i in range(0, len(input_texts), batch_size):
                batch_texts = input_texts[i:i + batch_size]

                # Tokenize
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="np",
                    max_length=512
                )

                # Run ONNX inference
                # BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
                outputs = onnx_session.run(
                    None,  # Get all outputs
                    {
                        'input_ids': inputs['input_ids'].astype(np.int64),
                        'attention_mask': inputs['attention_mask'].astype(np.int64)
                    }
                )

                # Get token embeddings (first output)
                token_embeddings = outputs[0]

                # Mean pooling with attention mask
                batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])

                # Normalize embeddings
                norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
                batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)

                embeddings.extend(batch_embeddings)
            embeddings = np.array(embeddings)
        else:
            # SentenceTransformers path with DGX optimization
            with torch.no_grad():
                embeddings = model.encode(
                    input_texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    batch_size=32,  # Optimal for DGX Grace
                    show_progress_bar=False
                )

        # Convert to list format for OpenAI compatibility
        if hasattr(embeddings, 'tolist'):
            embeddings = embeddings.tolist()
        elif isinstance(embeddings, list) and len(embeddings) > 0:
            if hasattr(embeddings[0], 'tolist'):
                embeddings = [emb.tolist() for emb in embeddings]

        # Create response in OpenAI format
        embedding_data = [
            EmbeddingData(
                embedding=embedding,
                index=i
            )
            for i, embedding in enumerate(embeddings)
        ]

        processing_time = time.time() - start_time

        # Calculate token usage (rough estimation)
        total_tokens = sum(len(text.split()) for text in input_texts)

        # Log performance metrics for DGX monitoring
        texts_per_second = len(input_texts) / processing_time
        logger.info(f"Processed {len(input_texts)} texts in {processing_time:.2f}s ({texts_per_second:.1f} texts/sec)")

        return EmbeddingResponse(
            data=embedding_data,
            model=request.model,
            usage=EmbeddingUsage(
                prompt_tokens=total_tokens,
                total_tokens=total_tokens
            )
        )
    except HTTPException:
        # Preserve intended HTTP status codes (e.g. the 400 above) instead of converting them to 500
        raise
    except Exception as e:
        logger.error(f"❌ Embedding generation failed: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")


@app.get("/v1/models")
@app.get("/models")
async def list_models():
    """List available models (OpenAI compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "BAAI/bge-m3",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "gt2-dgx",
                "permission": [],
                "root": "BAAI/bge-m3",
                "parent": None
            }
        ]
    }


@app.get("/")
async def root():
    """Root endpoint with DGX info"""
    return {
        "service": "GT 2.0 DGX BGE-M3 Embedding Server",
        "version": "2.0.0-dgx",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "openai_compatible": True,
        "endpoints": {
            "embeddings": "/v1/embeddings",
            "models": "/models",
            "health": "/health"
        }
    }


if __name__ == "__main__":
    logger.info("Starting GT 2.0 DGX BGE-M3 Embedding Server...")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        workers=1,  # Single worker for model memory efficiency
        loop="asyncio",
        access_log=True
    )
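
The DGX variant logs a texts-per-second figure for every request. The client-side probe below, which is illustrative and assumes the container is reachable on localhost:8000, measures the same quantity so the two numbers can be compared; it uses only the Python standard library.

# Client-side throughput probe for the DGX server above (illustrative; assumes
# the container is published on localhost:8000). Mirrors the texts/sec metric
# that create_embeddings() logs on the server side.
import json
import time
import urllib.request

BASE = "http://localhost:8000"
texts = [f"benchmark sentence {i}" for i in range(256)]

payload = json.dumps({"model": "BAAI/bge-m3", "input": texts}).encode("utf-8")
req = urllib.request.Request(
    f"{BASE}/v1/embeddings",
    data=payload,
    headers={"Content-Type": "application/json"},
)

start = time.time()
with urllib.request.urlopen(req, timeout=300) as resp:
    body = json.load(resp)
elapsed = time.time() - start

print(f"{len(body['data'])} embeddings in {elapsed:.2f}s "
      f"({len(texts) / elapsed:.1f} texts/sec client-side)")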