- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
- Made the simulator more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
56 lines
1.6 KiB
Docker
56 lines
1.6 KiB
Docker
# syntax=docker/dockerfile:1
# Slim Debian-based Python base: small surface, apt available for BLAS deps.
# Pinned to a minor version; pin by digest for fully reproducible builds.
FROM python:3.11-slim
# Install the build toolchain and optimized BLAS/LAPACK libraries for x86_64.
# curl is required at runtime by the HEALTHCHECK probe.
# --no-install-recommends keeps the layer minimal (hadolint DL3015); the apt
# list cache is removed in the same layer so it never persists in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    g++ \
    gcc \
    gfortran \
    libblas-dev \
    liblapack-dev \
    libopenblas-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
# Install PyTorch with CUDA 12.1 wheels for x86_64 from the official PyTorch
# index. The CUDA build falls back to CPU execution at runtime when no GPU is
# visible, so the same image works on GPU and CPU hosts.
# NOTE(review): versions are unpinned — consider pinning torch/torchvision/
# torchaudio for reproducible builds.
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Install the serving stack (FastAPI/uvicorn), model libraries, and ONNX
# Runtime GPU acceleration.
# Version specifiers and extras MUST be quoted: unquoted, the shell parses
# ">" in "transformers>=4.36.0" as output redirection (creating a file named
# "=4.36.0" and installing transformers unconstrained), and "[...]" in
# "optimum[onnxruntime-gpu]" is subject to glob expansion.
RUN pip install --no-cache-dir \
    "transformers>=4.36.0" \
    sentence-transformers \
    fastapi \
    uvicorn \
    numpy \
    accelerate \
    onnxruntime-gpu \
    "optimum[onnxruntime-gpu]"
# Thread-pool sizing: allow up to 16 threads in each of the common numeric
# backends (OpenMP, BLIS, OpenBLAS, PyTorch).
ENV OMP_NUM_THREADS=16 \
    BLIS_NUM_THREADS=16 \
    OPENBLAS_NUM_THREADS=16 \
    PYTORCH_NUM_THREADS=16

# NOTE(review): the MPS variables configure Apple-silicon Metal fallback and
# appear inert on a Linux/x86_64 CUDA host — kept unchanged for parity;
# confirm whether anything still reads them before removing.
ENV PYTORCH_ENABLE_MPS_FALLBACK=1 \
    PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

# GPU auto-detection: ONNX Runtime selects CUDAExecutionProvider when a GPU
# is available, otherwise falls back to CPU.
ENV USE_ONNX_RUNTIME=true

# x86_64 compiler flags (AVX2/FMA) for any native code compiled inside the
# running container. NOTE(review): every pip install above runs before these
# are set, so they do not affect the image build itself.
ENV CFLAGS="-march=native -O3 -mavx2 -mfma" \
    CXXFLAGS="-march=native -O3 -mavx2 -mfma"
# Application directory; WORKDIR creates the path if it does not exist,
# so no separate "RUN mkdir" is needed.
WORKDIR /app
# Copy the custom OpenAI-compatible BGE-M3 embedding server. Copied last so
# source edits only invalidate this layer, not the dependency layers above.
COPY .deployment/docker/embedding_server.py /app/embedding_server.py
# Document the server port (EXPOSE does not publish it; use -p at run time).
EXPOSE 8000
# Liveness probe against the server's /health endpoint (curl is installed in
# the apt layer). -fsS: fail on HTTP errors, stay silent except on error so
# container health logs remain clean. The long 300s start period covers model
# download/load on first boot.
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD curl -fsS http://localhost:8000/health || exit 1
# Launch the embedding server. Exec (JSON-array) form makes python PID 1 so
# it receives SIGTERM directly from `docker stop`.
# NOTE(review): no USER directive anywhere in the file — the server runs as
# root; consider adding a non-root user after the install layers.
CMD ["python", "embedding_server.py"]