gt-ai-os-community/apps/resource-cluster/app/main.py

"""
GT 2.0 Resource Cluster - Main Application

Air-gapped resource management hub for AI/ML resources, RAG engines,
agentic workflows, app integrations, external services, and AI literacy.
"""

from contextlib import asynccontextmanager
from datetime import datetime
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.responses import JSONResponse
from prometheus_client import make_asgi_app
import logging

from app.core.config import get_settings
from app.api import inference, embeddings, rag, agents, templates, health, internal
from app.api.v1 import services, models, ai_inference, mcp_registry, mcp_executor
from app.core.backends import initialize_backends
from app.services.consul_registry import ConsulRegistry
from app.services.config_sync import get_config_sync_service
from app.api.v1.mcp_registry import initialize_mcp_servers

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

settings = get_settings()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifecycle"""
    # Startup
    logger.info("Starting GT 2.0 Resource Cluster")

    # Initialize resource backends
    await initialize_backends()

    # Initialize MCP servers (RAG and Conversation)
    try:
        await initialize_mcp_servers()
        logger.info("MCP servers initialized")
    except Exception as e:
        logger.error(f"MCP server initialization failed: {e}")

    # Start configuration sync from admin cluster
    if settings.config_sync_enabled:
        config_sync = get_config_sync_service()

        # Perform initial sync before starting background loop
        try:
            await config_sync.sync_configurations()
            logger.info("Initial configuration sync completed")

            # Give config sync time to complete provider updates
            import asyncio
            await asyncio.sleep(0.5)

            # Verify BGE-M3 model is loaded in registry before refreshing embedding backend
            try:
                from app.services.model_service import default_model_service
                from app.core.backends import get_embedding_backend

                # Retry logic to wait for BGE-M3 to appear in registry
                max_retries = 3
                retry_delay = 1.0  # seconds
                bge_m3_found = False

                for attempt in range(max_retries):
                    bge_m3_config = default_model_service.model_registry.get("BAAI/bge-m3")

                    if bge_m3_config:
                        endpoint = bge_m3_config.get("endpoint_url")
                        config = bge_m3_config.get("parameters", {})
                        is_local_mode = config.get("is_local_mode", True)

                        logger.info(f"BGE-M3 found in registry on attempt {attempt + 1}: endpoint={endpoint}, is_local_mode={is_local_mode}")
                        bge_m3_found = True
                        break
                    else:
                        logger.debug(f"BGE-M3 not yet in registry (attempt {attempt + 1}/{max_retries}), retrying...")
                        if attempt < max_retries - 1:
                            await asyncio.sleep(retry_delay)

                if not bge_m3_found:
                    logger.warning("BGE-M3 not found in registry after initial sync - will use defaults until next sync")

                # Refresh embedding backend with database configuration
                embedding_backend = get_embedding_backend()
                embedding_backend.refresh_endpoint_from_registry()
                logger.info(f"Embedding backend refreshed with database configuration: {embedding_backend.embedding_endpoint}")
            except Exception as e:
                logger.warning(f"Failed to refresh embedding backend on startup: {e}")
        except Exception as e:
            logger.warning(f"Initial configuration sync failed: {e}")

        # Start sync loop in background
        asyncio.create_task(config_sync.start_sync_loop())
        logger.info("Started configuration sync from admin cluster")

    # Register with Consul for service discovery
    if settings.environment == "production":
        consul = ConsulRegistry()
        await consul.register_service(
            name="resource-cluster",
            service_id=f"resource-cluster-{settings.cluster_name}",
            address="localhost",
            port=settings.service_port,
            tags=["ai", "resource", "cluster"],
            check_interval="10s"
        )

    logger.info(f"Resource Cluster started on port {settings.service_port}")

    yield

    # Shutdown
    logger.info("Shutting down Resource Cluster")

    # Deregister from Consul
    if settings.environment == "production":
        await consul.deregister_service(f"resource-cluster-{settings.cluster_name}")


# Create FastAPI application
app = FastAPI(
    title="GT 2.0 Resource Cluster",
    description="Centralized AI resource management with high availability",
    version="1.0.0",
    lifespan=lifespan
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Add trusted host middleware with configurable hosts
app.add_middleware(
    TrustedHostMiddleware,
    allowed_hosts=settings.trusted_hosts
)

# Include API routers
app.include_router(health.router, prefix="/health", tags=["health"])
app.include_router(inference.router, prefix="/api/v1/inference", tags=["inference"])
app.include_router(embeddings.router, prefix="/api/v1/embeddings", tags=["embeddings"])
app.include_router(rag.router, prefix="/api/v1/rag", tags=["rag"])
app.include_router(agents.router, prefix="/api/v1/agents", tags=["agents"])
app.include_router(templates.router, prefix="/api/v1/templates", tags=["templates"])
app.include_router(services.router, prefix="/api/v1/services", tags=["services"])
app.include_router(models.router, tags=["models"])
app.include_router(ai_inference.router, prefix="/api/v1", tags=["ai"])  # Add AI inference router
app.include_router(mcp_registry.router, prefix="/api/v1", tags=["mcp"])
app.include_router(mcp_executor.router, prefix="/api/v1", tags=["mcp"])
app.include_router(internal.router, tags=["internal"])  # Internal service-to-service APIs

# Mount Prometheus metrics endpoint
if settings.prometheus_enabled:
    metrics_app = make_asgi_app()
    app.mount("/metrics", metrics_app)


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "service": "GT 2.0 Resource Cluster",
        "version": "1.0.0",
        "status": "operational",
        "environment": settings.environment,
        "capabilities": {
            "ai_ml": ["llm", "embeddings", "image_generation"],
            "rag_engine": ["vector_search", "document_processing"],
            "agentic_workflows": ["single_agent", "multi_agent"],
            "app_integrations": ["oauth2", "webhooks"],
            "external_services": ["ctfd", "canvas", "guacamole", "iframe_embed", "sso"],
            "ai_literacy": ["games", "puzzles", "education"]
        }
    }


@app.get("/health")
async def health_check():
    """Docker health check endpoint (without trailing slash)"""
    return {
        "status": "healthy",
        "service": "resource-cluster",
        "timestamp": datetime.utcnow()
    }


@app.get("/ready")
async def ready_check():
    """Kubernetes readiness probe endpoint"""
    return {
        "status": "ready",
        "service": "resource-cluster",
        "timestamp": datetime.utcnow(),
        "health": "ok"
    }


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Global exception handler"""
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={
            "error": "Internal server error",
            "message": str(exc) if settings.debug else "An error occurred processing your request"
        }
    )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=settings.service_port,
        reload=settings.debug,
        log_level="info" if not settings.debug else "debug"
    )