# syntax=docker/dockerfile:1
FROM python:3.11-slim

# System toolchain + optimized BLAS/LAPACK stacks needed to build and run
# numerical wheels on x86_64. Cleanup happens in the same layer so the apt
# lists never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        g++ \
        gcc \
        gfortran \
        libblas-dev \
        liblapack-dev \
        libopenblas-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# PyTorch with CUDA 12.1 support for x86_64 (falls back to CPU at runtime
# when no GPU is present).
RUN pip install --no-cache-dir torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cu121

# Python dependencies for the embedding server.
# NOTE: the version specifier MUST be quoted — unquoted, /bin/sh treats
# ">=4.36.0" as an output redirection (creating a file named "=4.36.0")
# and installs an unpinned "transformers". Brackets in the optimum extra
# are quoted for the same reason (shell glob characters).
RUN pip install --no-cache-dir \
        "transformers>=4.36.0" \
        sentence-transformers \
        fastapi \
        uvicorn \
        numpy \
        accelerate \
        onnxruntime-gpu \
        "optimum[onnxruntime-gpu]"

# Thread-count tuning for the OpenMP/BLAS backends on a 16-core x86_64 host.
ENV OMP_NUM_THREADS=16 \
    BLIS_NUM_THREADS=16 \
    OPENBLAS_NUM_THREADS=16 \
    PYTORCH_NUM_THREADS=16

# NOTE(review): the MPS variables below target Apple-silicon Metal backends
# and are no-ops in a Linux/x86_64 container; kept for parity with the
# original image — confirm they can be dropped.
ENV PYTORCH_ENABLE_MPS_FALLBACK=1 \
    PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

# GPU auto-detection: ONNX Runtime uses CUDAExecutionProvider if available,
# otherwise falls back to CPU.
ENV USE_ONNX_RUNTIME=true

# Compiler flags for any extensions compiled at runtime inside the container.
# NOTE(review): -march=native encodes the *build host's* CPU features; on a
# different deployment CPU this can produce illegal-instruction crashes —
# consider a fixed baseline such as -march=x86-64-v3 for portable images.
ENV CFLAGS="-march=native -O3 -mavx2 -mfma" \
    CXXFLAGS="-march=native -O3 -mavx2 -mfma"

WORKDIR /app

# Custom OpenAI-compatible BGE-M3 embedding server.
COPY .deployment/docker/embedding_server.py /app/embedding_server.py

# Drop root: run the server as a dedicated unprivileged user. A real home
# directory is created so model downloads can land under $HOME/.cache.
RUN useradd --system --create-home --home-dir /home/app \
        --shell /usr/sbin/nologin app \
    && chown -R app:app /app
USER app

# Documentation only — the port still has to be published at `docker run`.
EXPOSE 8000

# Long start period: first startup may trigger a multi-GB model download.
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Exec form: python is PID 1 and receives SIGTERM from `docker stop`.
CMD ["python", "embedding_server.py"]