GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
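The validation and auth changes listed above touch utilities that are not part of the files shown in this diff, which only adds the new embedding-service images and servers. As a rough illustration of two of those fixes (exact hostname matching instead of substring checks, and DNS resolution checking against SSRF), a minimal Python sketch follows; the function names, allow-list, and blocked address classes are illustrative assumptions, not the code shipped in this commit.

# Illustrative sketch only -- not the code shipped in this commit.
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allow-list

def hostname_is_allowed(url: str) -> bool:
    # Compare the parsed hostname exactly instead of using substring matching,
    # so "api.example.com.evil.net" no longer passes the check.
    hostname = urlparse(url).hostname
    return hostname is not None and hostname.lower() in ALLOWED_HOSTS

def resolves_to_public_address(hostname: str) -> bool:
    # Resolve the hostname and reject private, loopback, link-local, and
    # reserved addresses to reduce SSRF exposure.
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for info in infos:
        addr = ipaddress.ip_address(info[4][0])
        if addr.is_private or addr.is_loopback or addr.is_link_local or addr.is_reserved:
            return False
    return True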
.deployment/docker/Dockerfile.vllm-arm (new file, 56 lines)
@@ -0,0 +1,56 @@
FROM python:3.11-slim

# Install system dependencies for ARM64 with optimized BLAS libraries
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only for ARM with optimized BLAS
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Install optimized dependencies for ARM64 (version spec quoted so the shell does not treat >= as a redirect)
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime \
    "optimum[onnxruntime]"

# Set comprehensive ARM64 environment variables for maximum performance
ENV OMP_NUM_THREADS=8
ENV MKL_NUM_THREADS=8
ENV BLIS_NUM_THREADS=8
ENV VECLIB_MAXIMUM_THREADS=8
ENV PYTORCH_NUM_THREADS=8
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
ENV CUDA_VISIBLE_DEVICES=""
ENV USE_ONNX_RUNTIME=true
ENV CFLAGS="-march=armv8-a+simd+fp16 -O3"
ENV CXXFLAGS="-march=armv8-a+simd+fp16 -O3"

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]
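The HEALTHCHECK above uses a long start period because the model is downloaded and loaded on first startup. For tooling that needs to wait for the container, a small polling sketch is shown below; the URL and timeout values are assumptions, not part of this commit.

# Illustrative sketch: wait for the embedding container to report healthy.
import time
import urllib.request

def wait_for_health(url: str = "http://localhost:8000/health", timeout_s: int = 600) -> bool:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # server not up yet; keep polling
        time.sleep(5)
    return False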
.deployment/docker/Dockerfile.vllm-dgx (new file, 73 lines)
@@ -0,0 +1,73 @@
FROM python:3.11-slim

# Install system dependencies for DGX Grace ARM with optimized libraries
# Note: Removed libatlas-base-dev as it's not available in Debian Trixie ARM64
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    build-essential \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only for ARM with optimized BLAS
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Install optimized dependencies for DGX Grace ARM64 (version spec quoted so the shell does not treat >= as a redirect)
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime \
    "optimum[onnxruntime]" \
    psutil

# Set comprehensive DGX Grace ARM64 environment variables for maximum performance
ENV OMP_NUM_THREADS=20
ENV MKL_NUM_THREADS=20
ENV BLIS_NUM_THREADS=20
ENV OPENBLAS_NUM_THREADS=20
ENV VECLIB_MAXIMUM_THREADS=20
ENV PYTORCH_NUM_THREADS=20
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
ENV CUDA_VISIBLE_DEVICES=""
ENV USE_ONNX_RUNTIME=true
ENV MALLOC_ARENA_MAX=8

# DGX Grace architecture optimizations
ENV CFLAGS="-march=armv8.2-a+fp16+rcpc+dotprod -O3 -ffast-math"
ENV CXXFLAGS="-march=armv8.2-a+fp16+rcpc+dotprod -O3 -ffast-math"

# Memory optimization for 128GB system
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
ENV OMP_STACKSIZE=2M
ENV KMP_STACKSIZE=2M

# Platform identification
ENV GT2_PLATFORM=dgx
ENV GT2_ARCHITECTURE=grace-arm

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server optimized for DGX
COPY .deployment/docker/embedding_server_dgx.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check with longer timeout for DGX startup
HEALTHCHECK --interval=30s --timeout=60s --start-period=600s --retries=5 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]
.deployment/docker/Dockerfile.vllm-x86 (new file, 56 lines)
@@ -0,0 +1,56 @@
FROM python:3.11-slim

# Install system dependencies for x86_64 with optimized BLAS libraries
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    gfortran \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch with CUDA support for x86_64 (auto-falls back to CPU if no GPU)
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install optimized dependencies for x86_64 (version spec quoted so the shell does not treat >= as a redirect)
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime-gpu \
    "optimum[onnxruntime-gpu]"

# Set comprehensive x86_64 environment variables for maximum performance
ENV OMP_NUM_THREADS=16
ENV BLIS_NUM_THREADS=16
ENV OPENBLAS_NUM_THREADS=16
ENV PYTORCH_NUM_THREADS=16
ENV PYTORCH_ENABLE_MPS_FALLBACK=1
ENV PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
# GPU auto-detection: ONNX Runtime will use CUDAExecutionProvider if available, else CPU
ENV USE_ONNX_RUNTIME=true
# x86_64 specific compiler optimization flags
ENV CFLAGS="-march=native -O3 -mavx2 -mfma"
ENV CXXFLAGS="-march=native -O3 -mavx2 -mfma"

# Create app directory
WORKDIR /app

# Copy the custom OpenAI-compatible BGE-M3 server
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the embedding server
CMD ["python", "embedding_server.py"]
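All three images rely on ONNX Runtime selecting CUDAExecutionProvider when a GPU is present and CPUExecutionProvider otherwise, controlled by the USE_ONNX_RUNTIME flag and the provider lists in the servers below. A quick way to confirm which providers a given image actually exposes is a diagnostic one-off like the sketch below; it is an aid for operators, not part of the commit.

# Diagnostic sketch: report which ONNX Runtime execution providers are available
# inside the container (e.g. CUDAExecutionProvider on the x86 GPU image).
import onnxruntime as ort

if __name__ == "__main__":
    providers = ort.get_available_providers()
    print("Available providers:", providers)
    print("GPU acceleration:", "yes" if "CUDAExecutionProvider" in providers else "no (CPU only)")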
.deployment/docker/embedding_server.py (new file, 381 lines)
@@ -0,0 +1,381 @@
#!/usr/bin/env python3
"""
OpenAI-Compatible BGE-M3 Embedding Server for GT 2.0
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with ONNX Runtime optimization
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# Limit VRAM usage if GPU is available (BGE-M3 needs ~2.5GB)
if torch.cuda.is_available():
    memory_fraction = float(os.environ.get('CUDA_MEMORY_FRACTION', '0.25'))
    torch.cuda.set_per_process_memory_fraction(memory_fraction)
    logger.info(f"CUDA memory limited to {memory_fraction*100:.0f}% of available VRAM")

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info(f"ONNX Runtime available (providers: {ort.get_available_providers()})")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"


def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)

    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)

    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)

    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)

    return mean_embeddings


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with ONNX optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode
    logger.info("Loading BGE-M3 model with ARM64 optimization...")

    # Check if ONNX Runtime should be used
    use_onnx_env = os.getenv('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum ARM64 performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache
                import glob
                onnx_pattern = f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx'
                onnx_files = glob.glob(onnx_pattern)

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options to suppress ARM64 warnings
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)

                    # Create ONNX session with GPU auto-detection (falls back to CPU)
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                    )

                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # This will cache the ONNX model for future use
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=False,
                        provider="CPUExecutionProvider"
                    )
                    del temp_model

                    # Now find the newly exported model
                    onnx_files = glob.glob(onnx_pattern)
                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session (GPU auto-detection)
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3

                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
                        )

                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")

            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")

    except Exception:
        # Fallback to SentenceTransformers with GPU auto-detection
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Loading BGE-M3 with SentenceTransformers (fallback mode) on {device}...")
        model = SentenceTransformer(
            'BAAI/bge-m3',
            device=device,
            trust_remote_code=True
        )
        use_onnx = False
        model_mode = f"SentenceTransformers ({device.upper()})"
        logger.info(f"✅ BGE-M3 model loaded with SentenceTransformers on {device}")

    logger.info(f"Model mode: {model_mode}")
    logger.info(f"PyTorch threads: {torch.get_num_threads()}")
    logger.info(f"OMP threads: {os.getenv('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")

    yield

    # Cleanup
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


app = FastAPI(
    title="BGE-M3 Embedding Service",
    description="OpenAI-compatible BGE-M3 embedding API for GT 2.0",
    version="1.0.0",
    lifespan=lifespan
)


# OpenAI-compatible request models
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")


class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def create_embeddings(request: EmbeddingRequest):
    """Generate embeddings using BGE-M3 model"""

    if not model and not onnx_session:
        raise HTTPException(status_code=500, detail="BGE-M3 model not loaded")

    if not request.input:
        raise HTTPException(status_code=400, detail="No input texts provided")

    start_time = time.time()

    try:
        logger.info(f"Generating embeddings for {len(request.input)} texts using {model_mode}")

        # Generate embeddings with mode-specific logic
        if use_onnx and onnx_session:
            # Direct ONNX Runtime path for maximum performance
            batch_size = min(len(request.input), 64)
            embeddings = []

            for i in range(0, len(request.input), batch_size):
                batch_texts = request.input[i:i + batch_size]

                # Tokenize
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="np",
                    max_length=512
                )

                # Run ONNX inference
                # BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
                outputs = onnx_session.run(
                    None,  # Get all outputs
                    {
                        'input_ids': inputs['input_ids'].astype(np.int64),
                        'attention_mask': inputs['attention_mask'].astype(np.int64)
                    }
                )

                # Get token embeddings (first output)
                token_embeddings = outputs[0]

                # Mean pooling with attention mask
                batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])

                # Normalize embeddings
                norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
                batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)

                embeddings.extend(batch_embeddings)

            embeddings = np.array(embeddings)
        else:
            # SentenceTransformers fallback path
            embeddings = model.encode(
                request.input,
                batch_size=min(len(request.input), 64),
                show_progress_bar=False,
                convert_to_tensor=False,
                normalize_embeddings=True
            )

        # Convert to list format
        if hasattr(embeddings, 'tolist'):
            embeddings = embeddings.tolist()
        elif isinstance(embeddings, list) and len(embeddings) > 0:
            if hasattr(embeddings[0], 'tolist'):
                embeddings = [emb.tolist() for emb in embeddings]

        # Create response in OpenAI format
        embedding_data = [
            EmbeddingData(
                embedding=embedding,
                index=i
            )
            for i, embedding in enumerate(embeddings)
        ]

        # Calculate token usage (rough estimation)
        total_tokens = sum(len(text.split()) for text in request.input)

        processing_time_ms = int((time.time() - start_time) * 1000)

        logger.info(f"Generated {len(embeddings)} embeddings in {processing_time_ms}ms")

        return EmbeddingResponse(
            data=embedding_data,
            model=request.model,
            usage=EmbeddingUsage(
                prompt_tokens=total_tokens,
                total_tokens=total_tokens
            )
        )

    except Exception as e:
        logger.error(f"Error generating embeddings: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy" if (model or onnx_session) else "unhealthy",
        "model": "BAAI/bge-m3",
        "service": "bge-m3-embeddings",
        "mode": model_mode,
        "onnx_enabled": use_onnx,
        "gpu_available": torch.cuda.is_available(),
        "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
        "pytorch_threads": torch.get_num_threads(),
        "timestamp": datetime.utcnow().isoformat()
    }


@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI-compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "BAAI/bge-m3",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "gt2"
            }
        ]
    }


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "service": "BGE-M3 Embedding Service",
        "model": "BAAI/bge-m3",
        "version": "1.0.0",
        "api": "OpenAI-compatible",
        "status": "ready" if (model or onnx_session) else "loading"
    }


if __name__ == "__main__":
    uvicorn.run(
        "embedding_server:app",
        host="0.0.0.0",
        port=8000,
        log_level="info"
    )
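Because the server above is OpenAI-compatible, any OpenAI embeddings client pointed at port 8000 should work against it. A dependency-free sketch using only the standard library is shown below; the base URL and example inputs are assumptions, not part of the commit.

# Minimal client sketch for the /v1/embeddings endpoint; assumes the server
# above is reachable at http://localhost:8000. Not part of the commit itself.
import json
import urllib.request

def embed(texts, base_url="http://localhost:8000"):
    payload = json.dumps({"input": texts, "model": "BAAI/bge-m3"}).encode("utf-8")
    req = urllib.request.Request(
        f"{base_url}/v1/embeddings",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        body = json.load(resp)
    return [item["embedding"] for item in body["data"]]

if __name__ == "__main__":
    vectors = embed(["hello world", "bonjour le monde"])
    print(len(vectors), "embeddings of dimension", len(vectors[0]))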
.deployment/docker/embedding_server_dgx.py (new file, 464 lines)
@@ -0,0 +1,464 @@
#!/usr/bin/env python3
"""
DGX-Optimized BGE-M3 Embedding Server for GT 2.0
Optimized for NVIDIA DGX Spark with 20-core Grace ARM architecture
Provides real BGE-M3 embeddings via OpenAI-compatible API - NO FALLBACKS
"""

import asyncio
import logging
import time
import uvicorn
import psutil
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager

# Setup logging first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# BGE-M3 Model with DGX Grace optimizations
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np

# ONNX Runtime imports with direct session support
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
    logger.info("ONNX Runtime available for DGX Grace ARM64 optimization")
except ImportError as e:
    ONNX_AVAILABLE = False
    logger.warning(f"ONNX Runtime not available, falling back to SentenceTransformers: {e}")

# Global model instances
model = None
tokenizer = None
onnx_session = None
use_onnx = False
model_mode = "unknown"


def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """
    Perform mean pooling on token embeddings using attention mask.

    Args:
        token_embeddings: Token-level embeddings [batch_size, seq_len, hidden_dim]
        attention_mask: Attention mask [batch_size, seq_len]

    Returns:
        Pooled embeddings [batch_size, hidden_dim]
    """
    # Expand attention mask to match embeddings dimensions
    attention_mask_expanded = np.expand_dims(attention_mask, -1)

    # Sum embeddings where attention mask is 1
    sum_embeddings = np.sum(token_embeddings * attention_mask_expanded, axis=1)

    # Sum attention mask to get actual sequence lengths
    sum_mask = np.sum(attention_mask_expanded, axis=1)

    # Divide to get mean (avoid division by zero)
    mean_embeddings = sum_embeddings / np.maximum(sum_mask, 1e-9)

    return mean_embeddings


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load BGE-M3 model on startup with DGX Grace optimization"""
    global model, tokenizer, onnx_session, use_onnx, model_mode
    logger.info("Loading BGE-M3 model with DGX Grace ARM64 optimization...")

    # Log system information
    logger.info(f"CPU cores: {psutil.cpu_count(logical=True)}")
    logger.info(f"Memory: {psutil.virtual_memory().total / (1024**3):.1f}GB")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    # Check if ONNX Runtime should be used and is available
    use_onnx_env = os.environ.get('USE_ONNX_RUNTIME', 'true').lower() == 'true'

    try:
        if ONNX_AVAILABLE and use_onnx_env:
            # Try ONNX Runtime with direct session for maximum DGX Grace performance
            logger.info("Attempting to load BGE-M3 with direct ONNX Runtime session...")
            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')

                # Check for cached ONNX model
                cache_dir = os.path.expanduser('~/.cache/huggingface/hub')
                model_id = 'models--BAAI--bge-m3'

                # Find ONNX model in cache - check multiple possible locations
                import glob
                onnx_locations = [
                    f'{cache_dir}/{model_id}/onnx/model.onnx',  # Our export location
                    f'{cache_dir}/{model_id}/snapshots/*/onnx/model.onnx',  # HF cache location
                ]
                onnx_files = []
                for pattern in onnx_locations:
                    onnx_files = glob.glob(pattern)
                    if onnx_files:
                        break

                if onnx_files:
                    onnx_path = onnx_files[0]
                    logger.info(f"Found cached ONNX model at: {onnx_path}")

                    # Configure ONNX session options for DGX Grace ARM64
                    sess_options = ort.SessionOptions()
                    sess_options.log_severity_level = 3  # 3=ERROR (suppresses warnings)
                    sess_options.intra_op_num_threads = 20  # DGX Grace 20 cores
                    sess_options.inter_op_num_threads = 4
                    sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                    # Create ONNX session with DGX optimized settings
                    onnx_session = ort.InferenceSession(
                        onnx_path,
                        sess_options=sess_options,
                        providers=['CPUExecutionProvider']
                    )

                    use_onnx = True
                    model_mode = "ONNX Runtime (Direct Session - DGX)"
                    logger.info("✅ BGE-M3 model loaded with direct ONNX Runtime session (DGX optimized)")

                    # Log ONNX model outputs for debugging
                    logger.info("ONNX model outputs:")
                    for output in onnx_session.get_outputs():
                        logger.info(f" - {output.name}: {output.shape}")
                else:
                    logger.warning("No cached ONNX model found, need to export first...")
                    logger.info("Attempting ONNX export via optimum...")

                    # Try to export ONNX model using optimum
                    from optimum.onnxruntime import ORTModelForFeatureExtraction

                    # Define export path within the huggingface cache structure
                    onnx_export_path = os.path.expanduser('~/.cache/huggingface/hub/models--BAAI--bge-m3/onnx')
                    os.makedirs(onnx_export_path, exist_ok=True)

                    logger.info(f"Exporting ONNX model to: {onnx_export_path}")

                    # Export and save the ONNX model
                    temp_model = ORTModelForFeatureExtraction.from_pretrained(
                        'BAAI/bge-m3',
                        export=True,
                        provider="CPUExecutionProvider"
                    )
                    temp_model.save_pretrained(onnx_export_path)
                    logger.info(f"ONNX model saved to: {onnx_export_path}")
                    del temp_model

                    # Look for the exported model in the new location
                    onnx_export_pattern = f'{onnx_export_path}/model.onnx'
                    onnx_files = glob.glob(onnx_export_pattern)

                    # Also check the other candidate locations in case it was cached differently
                    if not onnx_files:
                        for pattern in onnx_locations:
                            onnx_files = glob.glob(pattern)
                            if onnx_files:
                                break
                    if onnx_files:
                        onnx_path = onnx_files[0]
                        logger.info(f"ONNX model exported to: {onnx_path}")

                        # Load with direct session
                        sess_options = ort.SessionOptions()
                        sess_options.log_severity_level = 3
                        sess_options.intra_op_num_threads = 20
                        sess_options.inter_op_num_threads = 4
                        sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
                        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

                        onnx_session = ort.InferenceSession(
                            onnx_path,
                            sess_options=sess_options,
                            providers=['CPUExecutionProvider']
                        )

                        use_onnx = True
                        model_mode = "ONNX Runtime (Direct Session - DGX Exported)"
                        logger.info("✅ BGE-M3 model exported and loaded with direct ONNX Runtime session (DGX optimized)")
                    else:
                        raise FileNotFoundError("ONNX export completed but model file not found")

            except Exception as onnx_error:
                logger.warning(f"ONNX Runtime setup failed: {onnx_error}")
                logger.warning(f"Error type: {type(onnx_error).__name__}")
                logger.info("Falling back to SentenceTransformers...")
                raise onnx_error
        else:
            logger.info("ONNX Runtime disabled or unavailable, using SentenceTransformers...")
            raise ImportError("ONNX disabled")

    except Exception:
        # Fallback to SentenceTransformers if ONNX fails or is disabled
        logger.info("Loading BGE-M3 with SentenceTransformers (DGX Grace optimized)...")
        try:
            # Configure PyTorch for DGX Grace
            torch.set_num_threads(20)  # DGX Grace 20 cores
            torch.set_num_interop_threads(4)

            # Load model with DGX optimizations
            model = SentenceTransformer(
                'BAAI/bge-m3',
                device='cpu',
                trust_remote_code=True,
                model_kwargs={
                    'torch_dtype': torch.float16,  # Memory optimization for large models
                    'low_cpu_mem_usage': False  # Use full memory for performance
                }
            )

            # Enable optimizations
            model._modules['0'].auto_model.eval()

            use_onnx = False
            model_mode = "SentenceTransformers (DGX Grace)"
            logger.info("✅ BGE-M3 loaded successfully with SentenceTransformers (DGX Grace optimized)")

        except Exception as e:
            logger.error(f"❌ Failed to load BGE-M3 model: {e}")
            raise e

    # Log model configuration
    logger.info(f"Model mode: {model_mode}")
    logger.info(f"Using ONNX: {use_onnx}")
    logger.info(f"OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'not set')}")
    logger.info(f"PYTORCH_NUM_THREADS: {os.environ.get('PYTORCH_NUM_THREADS', 'not set')}")

    yield

    # Cleanup
    logger.info("Shutting down BGE-M3 embedding server...")
    if model:
        del model
    if tokenizer:
        del tokenizer
    if onnx_session:
        del onnx_session
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


# FastAPI app with lifespan
app = FastAPI(
    title="GT 2.0 DGX BGE-M3 Embedding Server",
    description="DGX Grace ARM optimized BGE-M3 embedding service for GT 2.0",
    version="2.0.0-dgx",
    lifespan=lifespan
)


# Pydantic models for OpenAI compatibility
class EmbeddingRequest(BaseModel):
    input: List[str] = Field(..., description="Input texts to embed")
    model: str = Field(default="BAAI/bge-m3", description="Model name")
    encoding_format: str = Field(default="float", description="Encoding format")
    dimensions: Optional[int] = Field(None, description="Number of dimensions")
    user: Optional[str] = Field(None, description="User identifier")


class EmbeddingData(BaseModel):
    object: str = "embedding"
    embedding: List[float]
    index: int


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
    usage: EmbeddingUsage


@app.get("/health")
async def health_check():
    """Health check endpoint with DGX system metrics"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Include system metrics for DGX monitoring
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()

    return {
        "status": "healthy",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "using_onnx": use_onnx,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "cpu_usage": cpu_percent,
        "memory_total_gb": round(memory.total / (1024**3), 1),
        "memory_used_gb": round(memory.used / (1024**3), 1),
        "memory_available_gb": round(memory.available / (1024**3), 1),
        "omp_threads": os.environ.get('OMP_NUM_THREADS', 'not set'),
        "pytorch_threads": os.environ.get('PYTORCH_NUM_THREADS', 'not set'),
        "timestamp": datetime.utcnow().isoformat()
    }


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def create_embeddings(request: EmbeddingRequest):
    """Create embeddings using BGE-M3 model (OpenAI compatible)"""
    if not model and not onnx_session:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        start_time = time.time()
        input_texts = request.input

        # Validate input
        if not input_texts or len(input_texts) == 0:
            raise HTTPException(status_code=400, detail="Input texts cannot be empty")

        # Log processing info for DGX monitoring
        logger.info(f"Processing {len(input_texts)} texts with {model_mode}")

        # DGX optimized batch processing
        if use_onnx and onnx_session:
            # Direct ONNX Runtime path for maximum DGX Grace performance
            batch_size = min(len(input_texts), 128)  # Larger batches for DGX Grace
            embeddings = []

            for i in range(0, len(input_texts), batch_size):
                batch_texts = input_texts[i:i + batch_size]

                # Tokenize
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="np",
                    max_length=512
                )

                # Run ONNX inference
                # BGE-M3 ONNX model outputs: [token_embeddings, sentence_embedding]
                outputs = onnx_session.run(
                    None,  # Get all outputs
                    {
                        'input_ids': inputs['input_ids'].astype(np.int64),
                        'attention_mask': inputs['attention_mask'].astype(np.int64)
                    }
                )

                # Get token embeddings (first output)
                token_embeddings = outputs[0]

                # Mean pooling with attention mask
                batch_embeddings = mean_pooling(token_embeddings, inputs['attention_mask'])

                # Normalize embeddings
                norms = np.linalg.norm(batch_embeddings, axis=1, keepdims=True)
                batch_embeddings = batch_embeddings / np.maximum(norms, 1e-9)

                embeddings.extend(batch_embeddings)

            embeddings = np.array(embeddings)
        else:
            # SentenceTransformers path with DGX optimization
            with torch.no_grad():
                embeddings = model.encode(
                    input_texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    batch_size=32,  # Optimal for DGX Grace
                    show_progress_bar=False
                )

        # Convert to list format for OpenAI compatibility
        if hasattr(embeddings, 'tolist'):
            embeddings = embeddings.tolist()
        elif isinstance(embeddings, list) and len(embeddings) > 0:
            if hasattr(embeddings[0], 'tolist'):
                embeddings = [emb.tolist() for emb in embeddings]

        # Create response in OpenAI format
        embedding_data = [
            EmbeddingData(
                embedding=embedding,
                index=i
            )
            for i, embedding in enumerate(embeddings)
        ]

        processing_time = time.time() - start_time

        # Calculate token usage (rough estimation)
        total_tokens = sum(len(text.split()) for text in input_texts)

        # Log performance metrics for DGX monitoring
        texts_per_second = len(input_texts) / processing_time
        logger.info(f"Processed {len(input_texts)} texts in {processing_time:.2f}s ({texts_per_second:.1f} texts/sec)")

        return EmbeddingResponse(
            data=embedding_data,
            model=request.model,
            usage=EmbeddingUsage(
                prompt_tokens=total_tokens,
                total_tokens=total_tokens
            )
        )

    except HTTPException:
        # Preserve intended status codes (e.g. 400 for empty input) instead of collapsing them to 500
        raise
    except Exception as e:
        logger.error(f"❌ Embedding generation failed: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")


@app.get("/v1/models")
@app.get("/models")
async def list_models():
    """List available models (OpenAI compatible)"""
    return {
        "object": "list",
        "data": [
            {
                "id": "BAAI/bge-m3",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "gt2-dgx",
                "permission": [],
                "root": "BAAI/bge-m3",
                "parent": None
            }
        ]
    }


@app.get("/")
async def root():
    """Root endpoint with DGX info"""
    return {
        "service": "GT 2.0 DGX BGE-M3 Embedding Server",
        "version": "2.0.0-dgx",
        "model": "BAAI/bge-m3",
        "mode": model_mode,
        "platform": os.environ.get('GT2_PLATFORM', 'unknown'),
        "architecture": os.environ.get('GT2_ARCHITECTURE', 'unknown'),
        "cpu_cores": psutil.cpu_count(logical=True),
        "openai_compatible": True,
        "endpoints": {
            "embeddings": "/v1/embeddings",
            "models": "/models",
            "health": "/health"
        }
    }


if __name__ == "__main__":
    logger.info("Starting GT 2.0 DGX BGE-M3 Embedding Server...")
    logger.info(f"Platform: {os.environ.get('GT2_PLATFORM', 'unknown')}")
    logger.info(f"Architecture: {os.environ.get('GT2_ARCHITECTURE', 'unknown')}")

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        workers=1,  # Single worker for model memory efficiency
        loop="asyncio",
        access_log=True
    )
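Both servers share the same mean_pooling plus L2-normalization path on the ONNX branch. The toy example below uses synthetic numbers, not model output, to show the expected behavior: padded positions do not contribute to the mean, and each resulting vector has unit norm.

# Toy check of the mean_pooling + normalization logic used above.
import numpy as np

token_embeddings = np.array([[[1.0, 0.0], [3.0, 4.0], [9.0, 9.0]]])  # [1, 3, 2]
attention_mask = np.array([[1, 1, 0]])  # third token is padding

masked = token_embeddings * np.expand_dims(attention_mask, -1)
pooled = masked.sum(axis=1) / np.maximum(attention_mask.sum(axis=1, keepdims=True), 1e-9)
normalized = pooled / np.maximum(np.linalg.norm(pooled, axis=1, keepdims=True), 1e-9)

print(pooled)      # [[2. 2.]] -- the padded token does not contribute
print(normalized)  # [[0.7071 0.7071]] -- unit-length vector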