GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
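The validation and auth changes listed above touch utilities that are not part of the files shown in this diff, which only adds the new embedding-service images and servers. As a rough illustration of two of those fixes (exact hostname matching instead of substring checks, and DNS resolution checking against SSRF), a minimal Python sketch follows; the function names, allow-list, and blocked address classes are illustrative assumptions, not the code shipped in this commit.

# Illustrative sketch only -- not the code shipped in this commit.
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allow-list

def hostname_is_allowed(url: str) -> bool:
    # Compare the parsed hostname exactly instead of using substring matching,
    # so "api.example.com.evil.net" no longer passes the check.
    hostname = urlparse(url).hostname
    return hostname is not None and hostname.lower() in ALLOWED_HOSTS

def resolves_to_public_address(hostname: str) -> bool:
    # Resolve the hostname and reject private, loopback, link-local, and
    # reserved addresses to reduce SSRF exposure.
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for info in infos:
        addr = ipaddress.ip_address(info[4][0])
        if addr.is_private or addr.is_loopback or addr.is_link_local or addr.is_reserved:
            return False
    return True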
.deployment/docker/Dockerfile.vllm-arm (new file, 56 lines)
@@ -0,0 +1,56 @@
FROM python:3.11-slim

# Install system dependencies for ARM64 with optimized BLAS libraries
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only for ARM with optimized BLAS
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Install optimized dependencies for ARM64 (version spec quoted so the shell does not treat >= as a redirect)
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime \
    "optimum[onnxruntime]"

# Set comprehensive ARM64 environment variables for maximum performance
ENV OMP_NUM_THREADS=8
ENV MKL_NUM_THREADS=8
ENV BLIS_NUM_THREADS=8
ENV VECLIB_MAXIMUM_THREADS=8
ENV PYTORCH_NUM_THREADS=8
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
ENV CUDA_VISIBLE_DEVICES=""
ENV USE_ONNX_RUNTIME=true
ENV CFLAGS="-march=armv8-a+simd+fp16 -O3"
ENV CXXFLAGS="-march=armv8-a+simd+fp16 -O3"

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]
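The HEALTHCHECK above uses a long start period because the model is downloaded and loaded on first startup. For tooling that needs to wait for the container, a small polling sketch is shown below; the URL and timeout values are assumptions, not part of this commit.

# Illustrative sketch: wait for the embedding container to report healthy.
import time
import urllib.request

def wait_for_health(url: str = "http://localhost:8000/health", timeout_s: int = 600) -> bool:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # server not up yet; keep polling
        time.sleep(5)
    return False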
.deployment/docker/Dockerfile.vllm-dgx (new file, 73 lines)
@@ -0,0 +1,73 @@
FROM python:3.11-slim

# Install system dependencies for DGX Grace ARM with optimized libraries
# Note: Removed libatlas-base-dev as it's not available in Debian Trixie ARM64
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    build-essential \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only for ARM with optimized BLAS
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Install optimized dependencies for DGX Grace ARM64 (version spec quoted so the shell does not treat >= as a redirect)
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime \
    "optimum[onnxruntime]" \
    psutil

# Set comprehensive DGX Grace ARM64 environment variables for maximum performance
ENV OMP_NUM_THREADS=20
ENV MKL_NUM_THREADS=20
ENV BLIS_NUM_THREADS=20
ENV OPENBLAS_NUM_THREADS=20
ENV VECLIB_MAXIMUM_THREADS=20
ENV PYTORCH_NUM_THREADS=20
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
ENV CUDA_VISIBLE_DEVICES=""
ENV USE_ONNX_RUNTIME=true
ENV MALLOC_ARENA_MAX=8

# DGX Grace architecture optimizations
ENV CFLAGS="-march=armv8.2-a+fp16+rcpc+dotprod -O3 -ffast-math"
ENV CXXFLAGS="-march=armv8.2-a+fp16+rcpc+dotprod -O3 -ffast-math"

# Memory optimization for 128GB system
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
ENV OMP_STACKSIZE=2M
ENV KMP_STACKSIZE=2M

# Platform identification
ENV GT2_PLATFORM=dgx
ENV GT2_ARCHITECTURE=grace-arm

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server optimized for DGX
COPY .deployment/docker/embedding_server_dgx.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check with longer timeout for DGX startup
HEALTHCHECK --interval=30s --timeout=60s --start-period=600s --retries=5 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]
.deployment/docker/Dockerfile.vllm-x86 (new file, 56 lines)
@@ -0,0 +1,56 @@
FROM python:3.11-slim

# Install system dependencies for x86_64 with optimized BLAS libraries
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch with CUDA support for x86_64 (auto-falls back to CPU if no GPU)
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install optimized dependencies for x86_64 (version spec quoted so the shell does not treat >= as a redirect)
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime-gpu \
    "optimum[onnxruntime-gpu]"

# Set comprehensive x86_64 environment variables for maximum performance
ENV OMP_NUM_THREADS=16
ENV BLIS_NUM_THREADS=16
ENV OPENBLAS_NUM_THREADS=16
ENV PYTORCH_NUM_THREADS=16
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
# GPU auto-detection: ONNX Runtime will use CUDAExecutionProvider if available, else CPU
ENV USE_ONNX_RUNTIME=true
# x86_64 specific compiler optimization flags
ENV CFLAGS="-march=native -O3 -mavx2 -mfma"
ENV CXXFLAGS="-march=native -O3 -mavx2 -mfma"

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]
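All three images rely on ONNX Runtime selecting CUDAExecutionProvider when a GPU is present and CPUExecutionProvider otherwise, controlled by the USE_ONNX_RUNTIME flag and the provider lists in the servers below. A quick way to confirm which providers a given image actually exposes is a diagnostic one-off like the sketch below; it is an aid for operators, not part of the commit.

# Diagnostic sketch: report which ONNX Runtime execution providers are available
# inside the container (e.g. CUDAExecutionProvider on the x86 GPU image).
import onnxruntime as ort

if __name__ == "__main__":
    providers = ort.get_available_providers()
    print("Available providers:", providers)
    print("GPU acceleration:", "yes" if "CUDAExecutionProvider" in providers else "no (CPU only)")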
.deployment/docker/embedding_server.py (new file, 381 lines)
@@ -0,0 +1,381 @@
#!/usr/bin/env python3
"""
OpenAI-Compatible BGE-M3 Embedding Server for GT 2.0
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with ONNX Runtime optimization
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# Limit VRAM usage if GPU is available (BGE-M3 needs ~2.5GB)
if torch.cuda.is_available():
    memory_fraction = float(os.environ.get('CUDA_MEMORY_FRACTION', '0.25'))
    torch.cuda.set_per_process_memory_fraction(memory_fraction)
    logger.info(f"CUDA memory limited to {memory_fraction*100:.0f}% of available VRAM")

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info(f"ONNX Runtime available (providers: {ort.get_available_providers()})")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"


def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)

    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)

    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)

    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)

    return mean_embeddings


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with ONNX optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode
    logger.info("Loading BGE-M3 model with ARM64 optimization...")

    # Check if ONNX Runtime should be used
    use_onnx_env = os.getenv('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum ARM64 performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache
                import glob
                onnx_pattern = f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx'
                onnx_files = glob.glob(onnx_pattern)

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options to suppress ARM64 warnings
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)

                    # Create ONNX session with GPU auto-detection (falls back to CPU)
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                    )

                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # This will cache the ONNX model for future use
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=False,
                        provider="CPUExecutionProvider"
                    )
                    del temp_model

                    # Now find the newly exported model
                    onnx_files = glob.glob(onnx_pattern)
                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session (GPU auto-detection)
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3

                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                        )

                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")

            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")

    except Exception:
        # Fallback to SentenceTransformers with GPU auto-detection
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Loading BGE-M3 with SentenceTransformers (fallback mode) on {device}...")
        model = SentenceTransformer(
            'BAAI/bge-m3',
            device=device,
            trust_remote_code=True
        )
        use_onnx = False
        model_mode = f"SentenceTransformers ({device.upper()})"
        logger.info(f"✅ BGE-M3 model loaded with SentenceTransformers on {device}")

    logger.info(f"Model mode: {model_mode}")
    logger.info(f"PyTorch threads: {torch.get_num_threads()}")
    logger.info(f"OMP threads: {os.getenv('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")

    yield

    # Cleanup
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


app = FastAPI(
    title="BGE-M3 Embedding Service",
    description="OpenAI-compatible BGE-M3 embedding API for GT 2.0",
    version="1.0.0",
    lifespan=lifespan
)


# OpenAI-compatible request models
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")


class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def create_embeddings(request: EmbeddingRequest):
    """Generate embeddings using BGE-M3 model"""

    if not model and not onnx_session:
        raise HTTPException(status_code=500, detail="BGE-M3 model not loaded")

    if not request.input:
        raise HTTPException(status_code=400, detail="No input texts provided")

    start_time = time.time()

    try:
        logger.info(f"Generating embeddings for {len(request.input)} texts using {model_mode}")

        # Generate embeddings with mode-specific logic
        if use_onnx and onnx_session:
            # Direct ONNX Runtime path for maximum performance
            batch_size = min(len(request.input), 64)
            embeddings = []

            for i in range(0, len(request.input), batch_size):
                batch_texts = request.input[i:i + batch_size]

                # Tokenize
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="np",
                    max_length=512
                )

                # Run ONNX inference
                # BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
                outputs = onnx_session.run(
                    None,  # Get all outputs
                    {
                        'input_ids': inputs['input_ids'].astype(np.int64),
                        'attention_mask': inputs['attention_mask'].astype(np.int64)
                    }
                )

                # Get token embeddings (first output)
                token_embeddings = outputs[0]

                # Mean pooling with attention mask
                batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])

                # Normalize embeddings
                norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
                batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)

                embeddings.extend(batch_embeddings)

            embeddings = np.array(embeddings)
        else:
            # SentenceTransformers fallback path
            embeddings = model.encode(
                request.input,
                batch_size=min(len(request.input), 64),
                show_progress_bar=False,
                convert_to_tensor=False,
                normalize_embeddings=True
            )

        # Convert to list format
        if hasattr(embeddings, 'tolist'):
            embeddings = embeddings.tolist()
        elif isinstance(embeddings, list) and len(embeddings) > 0:
            if hasattr(embeddings[0], 'tolist'):
                embeddings = [emb.tolist() for emb in embeddings]

        # Create response in OpenAI format
        embedding_data = [
            EmbeddingData(
                embedding=embedding,
                index=i
            )
            for i, embedding in enumerate(embeddings)
        ]

        # Calculate token usage (rough estimation)
        total_tokens = sum(len(text.split()) for text in request.input)

        processing_time_ms = int((time.time() - start_time) * 1000)

        logger.info(f"Generated {len(embeddings)} embeddings in {processing_time_ms}ms")

        return EmbeddingResponse(
            data=embedding_data,
            model=request.model,
            usage=EmbeddingUsage(
                prompt_tokens=total_tokens,
                total_tokens=total_tokens
            )
        )

    except Exception as e:
        logger.error(f"Error generating embeddings: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy" if (model or onnx_session) else "unhealthy",
        "model": "BAAI/bge-m3",
        "service": "bge-m3-embeddings",
        "mode": model_mode,
        "onnx_enabled": use_onnx,
        "gpu_available": torch.cuda.is_available(),
        "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
        "pytorch_threads": torch.get_num_threads(),
        "timestamp": datetime.utcnow().isoformat()
    }


@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI-compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "BAAI/bge-m3",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "gt2"
            }
        ]
    }


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "service": "BGE-M3 Embedding Service",
        "model": "BAAI/bge-m3",
        "version": "1.0.0",
        "api": "OpenAI-compatible",
        "status": "ready" if (model or onnx_session) else "loading"
    }


if __name__ == "__main__":
    uvicorn.run(
        "embedding_server:app",
        host="0.0.0.0",
        port=8000,
        log_level="info"
    )
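Because the server above is OpenAI-compatible, any OpenAI embeddings client pointed at port 8000 should work against it. A dependency-free sketch using only the standard library is shown below; the base URL and example inputs are assumptions, not part of the commit.

# Minimal client sketch for the /v1/embeddings endpoint; assumes the server
# above is reachable at http://localhost:8000. Not part of the commit itself.
import json
import urllib.request

def embed(texts, base_url="http://localhost:8000"):
    payload = json.dumps({"input": texts, "model": "BAAI/bge-m3"}).encode("utf-8")
    req = urllib.request.Request(
        f"{base_url}/v1/embeddings",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        body = json.load(resp)
    return [item["embedding"] for item in body["data"]]

if __name__ == "__main__":
    vectors = embed(["hello world", "bonjour le monde"])
    print(len(vectors), "embeddings of dimension", len(vectors[0]))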
.deployment/docker/embedding_server_dgx.py (new file, 464 lines)
@@ -0,0 +1,464 @@
#!/usr/bin/env python3
"""
DGX-Optimized BGE-M3 Embedding Server for GT 2.0
Optimized for NVIDIA DGX Spark with 20-core Grace ARM architecture
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
import psutil
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with DGX Grace optimizations
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info("ONNX Runtime available for DGX Grace ARM64 optimization")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"


def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)

    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)

    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)

    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)

    return mean_embeddings


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with DGX Grace optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode
    logger.info("Loading BGE-M3 model with DGX Grace ARM64 optimization...")

    # Log system information
    logger.info(f"CPU cores: {psutil.cpu_count(logical=True)}")
    logger.info(f"Memory: {psutil.virtual_memory().total / (1024**3):.1f}GB")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    # Check if ONNX Runtime should be used and is available
    use_onnx_env = os.environ.get('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum DGX Grace performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache - check multiple possible locations
                import glob
                onnx_locations = [
                    f'{cache_dir}/{model_id}/onnx/model.onnx',  # Our export location
                    f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx',  # HF cache location
                ]
                onnx_files = []
                for pattern in onnx_locations:
                    onnx_files = glob.glob(pattern)
                    if onnx_files:
                        break

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options for DGX Grace ARM64
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)
                    sess_options.intra_op_num_threads = 20  # DGX Grace 20 cores
                    sess_options.inter_op_num_threads = 4
                    sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                    # Create ONNX session with DGX optimized settings
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CPUExecutionProvider']
                    )

                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session - DGX)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session (DGX optimized)")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # Define export path within the huggingface cache structure
                    onnx_export_path = os.path.expanduser('~/.cache/huggingface/hub/models--BAAI--bge-m3/onnx')
                    os.makedirs(onnx_export_path, exist_ok=True)

                    logger.info(f"Exporting ONNX model to: {onnx_export_path}")

                    # Export and save the ONNX model
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=True,
                        provider="CPUExecutionProvider"
                    )
                    temp_model.save_pretrained(onnx_export_path)
                    logger.info(f"ONNX model saved to: {onnx_export_path}")
                    del temp_model

                    # Look for the exported model in the new location
                    onnx_export_pattern = f'{onnx_export_path}/model.onnx'
                    onnx_files = glob.glob(onnx_export_pattern)

                    # Also check the other candidate locations in case it was cached differently
                    if not onnx_files:
                        for pattern in onnx_locations:
                            onnx_files = glob.glob(pattern)
                            if onnx_files:
                                break
                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3
                        sess_options.intra_op_num_threads = 20
                        sess_options.inter_op_num_threads = 4
                        sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CPUExecutionProvider']
                        )

                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - DGX Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session (DGX optimized)")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")

            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")

    except Exception:
        # Fallback to SentenceTransformers if ONNX fails or is disabled
        logger.info("Loading BGE-M3 with SentenceTransformers (DGX Grace optimized)...")
        try:
            # Configure PyTorch for DGX Grace
            torch.set_num_threads(20)  # DGX Grace 20 cores
            torch.set_num_interop_threads(4)

            # Load model with DGX optimizations
            model = SentenceTransformer(
                'BAAI/bge-m3',
                device='cpu',
                trust_remote_code=True,
                model_kwargs={
                    'torch_dtype': torch.float16,  # Memory optimization for large models
                    'low_cpu_mem_usage': False  # Use full memory for performance
                }
            )

            # Enable optimizations
            model._modules['0'].auto_model.eval()

            use_onnx = False
            model_mode = "SentenceTransformers (DGX Grace)"
            logger.info("✅ BGE-M3 loaded successfully with SentenceTransformers (DGX Grace optimized)")

        except Exception as e:
            logger.error(f"❌ Failed to load BGE-M3 model: {e}")
            raise e

    # Log model configuration
    logger.info(f"Model mode: {model_mode}")
    logger.info(f"Using ONNX: {use_onnx}")
    logger.info(f"OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"PYTORCH_NUM_THREADS: {os.environ.get('PYTORCH_NUM_THREADS', 'not set')}")

    yield

    # Cleanup
    logger.info("Shutting down BGE-M3 embedding server...")
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


# FastAPI app with lifespan
app = FastAPI(
    title="GT 2.0 DGX BGE-M3 Embedding Server",
    description="DGX Grace ARM optimized BGE-M3 embedding service for GT 2.0",
    version="2.0.0-dgx",
    lifespan=lifespan
)


# Pydantic models for OpenAI compatibility
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")


class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage


@app.get("/health")
async def health_check():
    """Health check endpoint with DGX system metrics"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Include system metrics for DGX monitoring
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()

    return {
        "status": "healthy",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "using_onnx": use_onnx,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "cpu_usage": cpu_percent,
        "memory_total_gb": round(memory.total / (1024**3), 1),
        "memory_used_gb": round(memory.used / (1024**3), 1),
        "memory_available_gb": round(memory.available / (1024**3), 1),
        "omp_threads": os.environ.get('OMP_NUM_THREADS', 'not set'),
        "pytorch_threads": os.environ.get('PYTORCH_NUM_THREADS', 'not set'),
        "timestamp": datetime.utcnow().isoformat()
    }


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def create_embeddings(request: EmbeddingRequest):
    """Create embeddings using BGE-M3 model (OpenAI compatible)"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        start_time = time.time()
        input_texts = request.input

        # Validate input
        if not input_texts or len(input_texts) == 0:
            raise HTTPException(status_code=400, detail="Input texts cannot be empty")

        # Log processing info for DGX monitoring
        logger.info(f"Processing {len(input_texts)} texts with {model_mode}")

        # DGX optimized batch processing
        if use_onnx and onnx_session:
            # Direct ONNX Runtime path for maximum DGX Grace performance
            batch_size = min(len(input_texts), 128)  # Larger batches for DGX Grace
            embeddings = []

            for i in range(0, len(input_texts), batch_size):
                batch_texts = input_texts[i:i + batch_size]

                # Tokenize
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="np",
                    max_length=512
                )

                # Run ONNX inference
                # BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
                outputs = onnx_session.run(
                    None,  # Get all outputs
                    {
                        'input_ids': inputs['input_ids'].astype(np.int64),
                        'attention_mask': inputs['attention_mask'].astype(np.int64)
                    }
                )

                # Get token embeddings (first output)
                token_embeddings = outputs[0]

                # Mean pooling with attention mask
                batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])

                # Normalize embeddings
                norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
                batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)

                embeddings.extend(batch_embeddings)

            embeddings = np.array(embeddings)
        else:
            # SentenceTransformers path with DGX optimization
            with torch.no_grad():
                embeddings = model.encode(
                    input_texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    batch_size=32,  # Optimal for DGX Grace
                    show_progress_bar=False
                )

        # Convert to list format for OpenAI compatibility
        if hasattr(embeddings, 'tolist'):
            embeddings = embeddings.tolist()
        elif isinstance(embeddings, list) and len(embeddings) > 0:
            if hasattr(embeddings[0], 'tolist'):
                embeddings = [emb.tolist() for emb in embeddings]

        # Create response in OpenAI format
        embedding_data = [
            EmbeddingData(
                embedding=embedding,
                index=i
            )
            for i, embedding in enumerate(embeddings)
        ]

        processing_time = time.time() - start_time

        # Calculate token usage (rough estimation)
        total_tokens = sum(len(text.split()) for text in input_texts)

        # Log performance metrics for DGX monitoring
        texts_per_second = len(input_texts) / processing_time
        logger.info(f"Processed {len(input_texts)} texts in {processing_time:.2f}s ({texts_per_second:.1f} texts/sec)")

        return EmbeddingResponse(
            data=embedding_data,
            model=request.model,
            usage=EmbeddingUsage(
                prompt_tokens=total_tokens,
                total_tokens=total_tokens
            )
        )

    except HTTPException:
        # Preserve intended status codes (e.g. 400 for empty input) instead of collapsing them to 500
        raise
    except Exception as e:
        logger.error(f"❌ Embedding generation failed: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")


@app.get("/v1/models")
@app.get("/models")
async def list_models():
    """List available models (OpenAI compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "BAAI/bge-m3",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "gt2-dgx",
                "permission": [],
                "root": "BAAI/bge-m3",
                "parent": None
            }
        ]
    }


@app.get("/")
async def root():
    """Root endpoint with DGX info"""
    return {
        "service": "GT 2.0 DGX BGE-M3 Embedding Server",
        "version": "2.0.0-dgx",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "openai_compatible": True,
        "endpoints": {
            "embeddings": "/v1/embeddings",
            "models": "/models",
            "health": "/health"
        }
    }


if __name__ == "__main__":
    logger.info("Starting GT 2.0 DGX BGE-M3 Embedding Server...")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        workers=1,  # Single worker for model memory efficiency
        loop="asyncio",
        access_log=True
    )
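Both servers share the same mean_pooling plus L2-normalization path on the ONNX branch. The toy example below uses synthetic numbers, not model output, to show the expected behavior: padded positions do not contribute to the mean, and each resulting vector has unit norm.

# Toy check of the mean_pooling + normalization logic used above.
import numpy as np

token_embeddings = np.array([[[1.0, 0.0], [3.0, 4.0], [9.0, 9.0]]])  # [1, 3, 2]
attention_mask = np.array([[1, 1, 0]])  # third token is padding

masked = token_embeddings * np.expand_dims(attention_mask, -1)
pooled = masked.sum(axis=1) / np.maximum(attention_mask.sum(axis=1, keepdims=True), 1e-9)
normalized = pooled / np.maximum(np.linalg.norm(pooled, axis=1, keepdims=True), 1e-9)

print(pooled)      # [[2. 2.]] -- the padded token does not contribute
print(normalized)  # [[0.7071 0.7071]] -- unit-length vector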