Files
gt-ai-os-community/docker-compose.yml
HackWeasel 310491a557 GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
  - Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
2025-12-12 17:47:14 -05:00

431 lines
15 KiB
YAML

# GT 2.0 Unified Production Deployment
# Complete three-cluster architecture with Simple HA PostgreSQL
# Synthesized from full-stack and unified-deploy configurations
# NOTE: No explicit project name - Docker Compose derives it from directory name
# This ensures existing volumes (gt-20_*, gt2_*, etc.) continue to be used
networks:
  # Every network pins an explicit `name`, so the Docker network name does not
  # depend on the Compose project name.

  # Control-panel cluster: postgres, rabbitmq and both control-panel services.
  gt2-admin:
    name: gt2-admin-network
    driver: bridge
  # Tenant cluster: tenant-postgres-primary, tenant-backend and tenant-app.
  gt2-tenant:
    name: gt2-tenant-network
    driver: bridge
  # Resource cluster: consul, resource-cluster and vllm-embeddings.
  gt2-resource:
    name: gt2-resource-network
    driver: bridge
  # Cross-cluster network joined by all ten cluster services.
  gt2-shared:
    name: gt2-shared-network
    driver: bridge
  # Dedicated network for the "test" tenant; only tenant-backend joins it.
  tenant-test-network:
    name: tenant-test-network
    driver: bridge
volumes:
# ==============================================
# ADMIN CLUSTER VOLUMES
# No explicit names - Docker Compose derives from project/directory name
# This ensures existing volumes continue to be used
# ==============================================
  # Data directory of the control-panel PostgreSQL (service `postgres`).
  admin_postgres_data:
    driver: local
  # RabbitMQ state (mounted at /var/lib/rabbitmq).
  rabbitmq_data:
    driver: local

# ==============================================
# TENANT CLUSTER VOLUMES (PostgreSQL)
# ==============================================
  # Data directory of tenant-postgres-primary.
  tenant_postgres_primary_data:
    driver: local

  # Per-Tenant Persistent Storage
  # Both volumes below are bind-backed: the `local` driver with `o: bind`
  # exposes a host directory under a named volume.
  # NOTE(review): the host directories presumably have to exist before
  # `docker compose up` - confirm the install scripts create them.
  tenant_test_data:
    driver: local
    driver_opts:
      device: ./volumes/tenants/test/tablespaces
      o: bind
      type: none
  tenant_test_files:
    driver: local
    driver_opts:
      device: ./volumes/tenants/test/files
      o: bind
      type: none

  # Resource Cluster volumes
  consul_data:
    driver: local
  resource_cluster_data:
    driver: local
services:
# ==============================================
# ADMIN CLUSTER - Control Panel Infrastructure
# ==============================================
# Control Panel PostgreSQL
# Holds the gt2_admin database used by control-panel-backend.
  postgres:
    image: postgres:15-alpine
    container_name: gentwo-controlpanel-postgres
    restart: unless-stopped
    # The wrapper script is bind-mounted (read-only) in `volumes` below and
    # is started with "postgres" as its argument.
    entrypoint:
      - /usr/local/bin/admin-entrypoint-wrapper.sh
    command:
      - postgres
    environment:
      POSTGRES_DB: gt2_admin
      POSTGRES_USER: postgres
      # Falls back to a development password when ADMIN_POSTGRES_PASSWORD is unset.
      POSTGRES_PASSWORD: ${ADMIN_POSTGRES_PASSWORD:-dev_password_change_in_prod}
      POSTGRES_HOST_AUTH_METHOD: md5
      POSTGRES_INITDB_ARGS: "--auth-host=md5"
    volumes:
      - admin_postgres_data:/var/lib/postgresql/data
      - ./scripts/postgresql/admin-entrypoint-wrapper.sh:/usr/local/bin/admin-entrypoint-wrapper.sh:ro
      # Init scripts, renamed on mount to 00, 00a, 01, 02, 03.
      # NOTE(review): presumably run in file-name order on first start only -
      # confirm the wrapper delegates to the stock image entrypoint.
      - ./scripts/postgresql/unified/00-create-databases.sql:/docker-entrypoint-initdb.d/00-create-databases.sql
      - ./scripts/postgresql/admin-extensions.sql:/docker-entrypoint-initdb.d/00a-init-extensions.sql
      - ./scripts/postgresql/unified/01-create-admin-roles.sql:/docker-entrypoint-initdb.d/01-create-roles.sql
      - ./scripts/postgresql/unified/01-init-control-panel-schema-complete.sql:/docker-entrypoint-initdb.d/02-init-schema.sql
      - ./scripts/postgresql/unified/05-create-test-data.sql:/docker-entrypoint-initdb.d/03-create-test-data.sql
    networks:
      - gt2-admin
      - gt2-shared
    ports:
      - "5432:5432"
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U postgres -d gt2_admin
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 3
# RabbitMQ for inter-cluster messaging
# control-panel-backend connects to it through RABBITMQ_URL.
  rabbitmq:
    image: rabbitmq:3-management-alpine
    container_name: gentwo-controlpanel-rabbitmq
    restart: unless-stopped
    environment:
      RABBITMQ_DEFAULT_USER: gt2_admin
      # Falls back to a development password when RABBITMQ_PASSWORD is unset.
      RABBITMQ_DEFAULT_PASS: ${RABBITMQ_PASSWORD:-dev_password_change_in_prod}
    volumes:
      - rabbitmq_data:/var/lib/rabbitmq
    networks:
      - gt2-admin
      - gt2-shared
    ports:
      - "5672:5672"  # AMQP
      - "15672:15672"  # Management UI
    healthcheck:
      test:
        - CMD
        - rabbitmq-diagnostics
        - ping
      interval: 10s
      timeout: 5s
      retries: 5
# Control Panel Backend
# Admin API. Starts only after `postgres` and `rabbitmq` report healthy.
  control-panel-backend:
    image: ${IMAGE_REGISTRY:-ghcr.io/gt-edge-ai-internal/gt-ai-os-community}/control-panel-backend:${IMAGE_TAG:-latest}
    build:
      context: ./apps/control-panel-backend
      dockerfile: Dockerfile
    container_name: gentwo-controlpanel-backend
    environment:
      DATABASE_URL: postgresql+asyncpg://postgres:${ADMIN_POSTGRES_PASSWORD:-dev_password_change_in_prod}@postgres:5432/gt2_admin
      RABBITMQ_URL: amqp://gt2_admin:${RABBITMQ_PASSWORD:-dev_password_change_in_prod}@rabbitmq:5672/
      SECRET_KEY: ${SECRET_KEY:-production-secret-key}
      # No default: Compose substitutes an empty string (with a warning) when
      # JWT_SECRET is not set in the environment or .env file.
      JWT_SECRET: ${JWT_SECRET}
      ENVIRONMENT: ${ENVIRONMENT:-production}
      DEBUG: "${DEBUG:-false}"
      # API Key Encryption (for tenant API keys stored in DB)
      API_KEY_ENCRYPTION_KEY: ${API_KEY_ENCRYPTION_KEY:-}
      SERVICE_AUTH_TOKEN: ${SERVICE_AUTH_TOKEN:-internal-service-token}
      # SMTP Configuration (Brevo) - Enterprise Only
      SMTP_HOST: ${SMTP_HOST:-}
      SMTP_PORT: ${SMTP_PORT:-}
      SMTP_USERNAME: ${SMTP_USERNAME:-}
      SMTP_PASSWORD: ${SMTP_PASSWORD:-}
      SMTP_FROM_EMAIL: ${SMTP_FROM_EMAIL:-}
      SMTP_FROM_NAME: ${SMTP_FROM_NAME:-}
      SMTP_USE_TLS: ${SMTP_USE_TLS:-}
      # Two-Factor Authentication Configuration
      TFA_ENCRYPTION_KEY: ${TFA_ENCRYPTION_KEY:-}
      TFA_ISSUER_NAME: ${TFA_ISSUER_NAME:-}
      TFA_TEMP_TOKEN_EXPIRY_MINUTES: ${TFA_TEMP_TOKEN_EXPIRY_MINUTES:-}
      TFA_RATE_LIMIT_ATTEMPTS: ${TFA_RATE_LIMIT_ATTEMPTS:-}
      TFA_RATE_LIMIT_WINDOW_MINUTES: ${TFA_RATE_LIMIT_WINDOW_MINUTES:-}
      # Tenant Database Connection (for user sync)
      # Uses the same fallback as tenant-postgres-primary and tenant-backend,
      # so all three agree on the password when TENANT_USER_PASSWORD is unset.
      TENANT_POSTGRES_PASSWORD: ${TENANT_USER_PASSWORD:-gt2_tenant_dev_password}
    ports:
      - "8001:8000"
    networks:
      - gt2-admin
      - gt2-shared
    restart: unless-stopped
    depends_on:
      postgres:
        condition: service_healthy
      rabbitmq:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
# Control Panel Frontend
# Web UI for the control panel; waits for control-panel-backend to be healthy.
  control-panel-frontend:
    image: ${IMAGE_REGISTRY:-ghcr.io/gt-edge-ai-internal/gt-ai-os-community}/control-panel-frontend:${IMAGE_TAG:-latest}
    container_name: gentwo-controlpanel-frontend
    restart: unless-stopped
    build:
      context: ./apps/control-panel-frontend
      dockerfile: Dockerfile
      # Build args: localhost:8001 is the host port published by
      # control-panel-backend; the INTERNAL_* URL uses the Compose service name.
      args:
        INTERNAL_API_URL: http://control-panel-backend:8000
        NEXT_PUBLIC_API_URL: http://localhost:8001
        NEXT_PUBLIC_WS_URL: ws://localhost:8001
    environment:
      NODE_ENV: production
      NEXT_PUBLIC_ENVIRONMENT: ${ENVIRONMENT:-production}
      NEXT_PUBLIC_API_URL: http://localhost:8001
      INTERNAL_API_URL: http://control-panel-backend:8000
    depends_on:
      control-panel-backend:
        condition: service_healthy
    networks:
      - gt2-admin
      - gt2-shared
    ports:
      - "3001:3000"
# ==============================================
# TENANT CLUSTER - User-Facing Services
# ==============================================
# Tenant PostgreSQL Primary (with PGVector)
# Holds the gt2_tenants database used by tenant-backend.
  tenant-postgres-primary:
    image: pgvector/pgvector:pg15
    container_name: gentwo-tenant-postgres-primary
    # The wrapper script is bind-mounted (read-only) in `volumes` below and
    # is started with "postgres" as its argument.
    entrypoint: ["/usr/local/bin/docker-entrypoint-wrapper.sh"]
    command: ["postgres"]
    environment:
      POSTGRES_DB: gt2_tenants
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: ${TENANT_POSTGRES_PASSWORD:-gt2_tenant_dev_password}
      POSTGRES_REPLICATION_USER: replicator
      POSTGRES_REPLICATION_PASSWORD: ${TENANT_REPLICATOR_PASSWORD:-tenant_replicator_dev_password}
      # User password for gt2_tenant_user (used by wrapper to sync passwords)
      TENANT_USER_PASSWORD: ${TENANT_USER_PASSWORD:-gt2_tenant_dev_password}
      POSTGRES_HOST_AUTH_METHOD: md5
      POSTGRES_INITDB_ARGS: "--auth-host=md5"
      # Performance settings
      # NOTE(review): presumably consumed by the wrapper script - confirm.
      POSTGRES_SHARED_BUFFERS: 256MB
      POSTGRES_EFFECTIVE_CACHE_SIZE: 1GB
      POSTGRES_MAINTENANCE_WORK_MEM: 64MB
      # Quoted so the value is passed as a string, like the other env values.
      POSTGRES_MAX_CONNECTIONS: "200"
    volumes:
      - tenant_postgres_primary_data:/var/lib/postgresql/data
      - tenant_test_data:/var/lib/postgresql/tablespaces/tenant_test
      - tenant_test_files:/var/lib/postgresql/files/tenant_test
      - ./scripts/postgresql/docker-entrypoint-wrapper.sh:/usr/local/bin/docker-entrypoint-wrapper.sh:ro
      # Init scripts, listed in the order of their mounted file names
      # (00, 00a, 00b, 01, 02, 04).
      # NOTE(review): presumably run in that order on first start only -
      # confirm the wrapper delegates to the stock image entrypoint.
      - ./scripts/postgresql/unified/00-create-tenant-database.sql:/docker-entrypoint-initdb.d/00-create-database.sql
      - ./scripts/postgresql/tenant-extensions.sql:/docker-entrypoint-initdb.d/00a-init-extensions.sql
      - ./scripts/postgresql/unified/01-create-tenant-roles.sql:/docker-entrypoint-initdb.d/00b-create-roles.sql
      - ./scripts/postgresql/unified/04-init-tenant-schema-complete.sql:/docker-entrypoint-initdb.d/01-init-tenant-schema.sql
      - ./scripts/postgresql/setup-tenant-tablespaces.sql:/docker-entrypoint-initdb.d/02-setup-tablespaces.sql
      - ./scripts/postgresql/unified/05-create-tenant-test-data.sql:/docker-entrypoint-initdb.d/04-create-test-data.sql
    ports:
      # Host port 5433; host port 5432 is published by the control-panel postgres.
      - "5433:5432"
    networks:
      - gt2-tenant
      - gt2-shared
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d gt2_tenants"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s
    labels:
      - "gt2.service=postgres"
      - "gt2.cluster=tenant"
      - "gt2.role=primary"
      - "gt2.tenant_schema=tenant_test_company"
# Tenant Backend
# API for the "test" tenant; waits for tenant-postgres-primary to be healthy.
  tenant-backend:
    image: ${IMAGE_REGISTRY:-ghcr.io/gt-edge-ai-internal/gt-ai-os-community}/tenant-backend:${IMAGE_TAG:-latest}
    container_name: gentwo-tenant-backend
    restart: unless-stopped
    build:
      context: ./apps/tenant-backend
      dockerfile: Dockerfile
    environment:
      DATABASE_URL: postgresql://gt2_tenant_user:${TENANT_USER_PASSWORD:-gt2_tenant_dev_password}@tenant-postgres-primary:5432/gt2_tenants
      # Peer services, addressed by Compose service name.
      RESOURCE_CLUSTER_URL: http://resource-cluster:8000
      CONTROL_PANEL_URL: http://control-panel-backend:8000
      REQUIRE_OAUTH2_AUTH: "false"
      SECRET_KEY: ${SECRET_KEY:-production-secret-key}
      # No default: Compose substitutes an empty string (with a warning) when
      # JWT_SECRET is not set.
      JWT_SECRET: ${JWT_SECRET}
      # Tenant identity
      TENANT_ID: "test"
      TENANT_DOMAIN: test-company
      POSTGRES_SCHEMA: tenant_test_company
      ENVIRONMENT: ${ENVIRONMENT:-production}
      DEBUG: "${DEBUG:-false}"
      # Control Panel Database connection for billing logs
      # (the host is the control-panel postgres container_name)
      CONTROL_PANEL_DB_HOST: gentwo-controlpanel-postgres
      CONTROL_PANEL_DB_NAME: gt2_admin
      CONTROL_PANEL_DB_USER: postgres
      CONTROL_PANEL_DB_PASSWORD: ${ADMIN_POSTGRES_PASSWORD:-dev_password_change_in_prod}
      # Two-Factor Authentication Configuration
      TFA_ENCRYPTION_KEY: ${TFA_ENCRYPTION_KEY:-}
      TFA_ISSUER_NAME: ${TFA_ISSUER_NAME:-}
      TFA_TEMP_TOKEN_EXPIRY_MINUTES: ${TFA_TEMP_TOKEN_EXPIRY_MINUTES:-}
      TFA_RATE_LIMIT_ATTEMPTS: ${TFA_RATE_LIMIT_ATTEMPTS:-}
      TFA_RATE_LIMIT_WINDOW_MINUTES: ${TFA_RATE_LIMIT_WINDOW_MINUTES:-}
    depends_on:
      tenant-postgres-primary:
        condition: service_healthy
    networks:
      - gt2-tenant
      - gt2-shared
      - tenant-test-network
    ports:
      - "8002:8000"
    healthcheck:
      test:
        - CMD
        - curl
        - -f
        - http://localhost:8000/health
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 3
# Tenant App (Frontend)
# Web UI for the "test-company" tenant; waits for tenant-backend to be healthy.
  tenant-app:
    image: ${IMAGE_REGISTRY:-ghcr.io/gt-edge-ai-internal/gt-ai-os-community}/tenant-app:${IMAGE_TAG:-latest}
    container_name: gentwo-tenant-frontend
    restart: unless-stopped
    build:
      context: ./apps/tenant-app
      dockerfile: Dockerfile
      # Build args: localhost:8002 is the host port published by tenant-backend;
      # the INTERNAL_* URL uses the Compose service name.
      args:
        INTERNAL_BACKEND_URL: http://tenant-backend:8000
        NEXT_PUBLIC_API_URL: http://localhost:8002
        NEXT_PUBLIC_WS_URL: ws://localhost:8002
        NEXT_PUBLIC_TENANT_DOMAIN: test-company
    environment:
      NODE_ENV: production
      NEXT_PUBLIC_ENVIRONMENT: ${ENVIRONMENT:-production}
      # Tenant identity
      TENANT_DOMAIN: test-company
      NEXT_PUBLIC_TENANT_DOMAIN: test-company
      # URLs resolved inside the Compose networks
      CONTROL_PANEL_URL: http://control-panel-backend:8000
      TENANT_BACKEND_URL: http://tenant-backend:8000
      INTERNAL_BACKEND_URL: http://tenant-backend:8000
      # URLs that point at the published host port
      NEXT_PUBLIC_TENANT_BACKEND_URL: http://localhost:8002
      NEXT_PUBLIC_API_URL: http://localhost:8002
      NEXT_PUBLIC_WS_URL: ws://localhost:8002
    depends_on:
      tenant-backend:
        condition: service_healthy
    networks:
      - gt2-tenant
      - gt2-shared
    ports:
      # NOTE(review): container port is 3001 here, while control-panel-frontend
      # publishes container port 3000 - confirm this app listens on 3001.
      - "3002:3001"
# ==============================================
# RESOURCE CLUSTER - AI/ML Services
# ==============================================
# Consul for service discovery
# resource-cluster reaches it through CONSUL_URL and waits for it to be healthy.
  consul:
    image: hashicorp/consul:1.16
    container_name: gentwo-resource-consul
    restart: unless-stopped
    # NOTE(review): `-dev` starts a development-mode agent - confirm that is
    # intended here and that consul_data is actually written to in this mode.
    command: agent -dev -ui -client=0.0.0.0
    volumes:
      - consul_data:/consul/data
    networks:
      - gt2-resource
      - gt2-shared
    ports:
      - "8500:8500"  # HTTP API
      # NOTE(review): published as TCP only - confirm whether DNS over UDP is
      # needed from the host.
      - "8600:8600"  # DNS
    healthcheck:
      test:
        - CMD
        - consul
        - members
      interval: 10s
      timeout: 5s
      retries: 5
# Resource Backend (MCP Orchestration)
# Starts only after `consul` and `vllm-embeddings` report healthy.
  resource-cluster:
    image: ${IMAGE_REGISTRY:-ghcr.io/gt-edge-ai-internal/gt-ai-os-community}/resource-cluster:${IMAGE_TAG:-latest}
    container_name: gentwo-resource-backend
    restart: unless-stopped
    build:
      context: ./apps/resource-cluster
      dockerfile: Dockerfile
    environment:
      # DEPRECATED: GROQ_API_KEY now comes from Control Panel DB (#158, #219)
      # Keep temporarily for backwards compatibility during migration
      GROQ_API_KEY: ${GROQ_API_KEY:-}
      OPENAI_API_KEY: ${OPENAI_API_KEY:-}
      # Control Panel Integration for API key retrieval
      CONTROL_PANEL_URL: http://control-panel-backend:8000
      SERVICE_AUTH_TOKEN: ${SERVICE_AUTH_TOKEN:-internal-service-token}
      # Service configuration
      CONSUL_URL: http://consul:8500
      SECRET_KEY: ${SECRET_KEY:-production-secret-key}
      CAPABILITY_JWT_SECRET: ${CAPABILITY_JWT_SECRET:-production-capability-jwt-secret}
      ENVIRONMENT: ${ENVIRONMENT:-production}
      DEBUG: "${DEBUG:-false}"
      # Control Panel Database connection for billing logs
      # (the host is the control-panel postgres container_name)
      CONTROL_PANEL_DB_HOST: gentwo-controlpanel-postgres
      CONTROL_PANEL_DB_NAME: gt2_admin
      CONTROL_PANEL_DB_USER: postgres
      CONTROL_PANEL_DB_PASSWORD: ${ADMIN_POSTGRES_PASSWORD:-dev_password_change_in_prod}
    volumes:
      - resource_cluster_data:/data
    depends_on:
      consul:
        condition: service_healthy
      vllm-embeddings:
        condition: service_healthy
    networks:
      - gt2-resource
      - gt2-shared
    ports:
      - "8004:8000"
    healthcheck:
      test:
        - CMD
        - curl
        - -f
        - http://localhost:8000/health
      interval: 10s
      timeout: 5s
      retries: 5
# VLLM Embeddings Service for RAG
# Platform-specific settings in overlay files (arm64.yml, x86.yml, dgx.yml)
  vllm-embeddings:
    container_name: gentwo-vllm-embeddings
    restart: unless-stopped
    # Default build context - overridden by platform-specific overlays
    build:
      context: .
      dockerfile: .deployment/docker/Dockerfile.vllm-arm
    volumes:
      # Mounts the host user's Hugging Face cache directory into the container.
      - ~/.cache/huggingface:/root/.cache/huggingface
    networks:
      - gt2-resource
      - gt2-shared
    ports:
      - "8005:8000"
    # Longest grace period in this file: 300s start_period, then up to
    # 10 retries at 30s intervals.
    healthcheck:
      test:
        - CMD
        - curl
        - -f
        - http://localhost:8000/health
      start_period: 300s
      interval: 30s
      timeout: 15s
      retries: 10
    labels:
      - "gt2.service=vllm-embeddings"
      - "gt2.cluster=resource"
      - "gt2.component=embedding"
# ==============================================
# DEVELOPMENT UTILITIES
# ==============================================