- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2 - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2 - Made more general-purpose (flexible targets, expanded tools) - Added nemotron-mini-agent.csv for fast local inference via Ollama - Added nemotron-agent.csv for advanced reasoning via Ollama - Added wiki page: Projects for NVIDIA NIMs and Nemotron
581 lines
19 KiB
Python
581 lines
19 KiB
Python
"""
|
|
System Management API Endpoints
|
|
"""
|
|
import asyncio
|
|
import subprocess
|
|
import json
|
|
import shutil
|
|
import os
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Optional
|
|
from fastapi import APIRouter, Depends, HTTPException, status, Query
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import select, desc, text
|
|
from pydantic import BaseModel, Field
|
|
import structlog
|
|
|
|
from app.core.database import get_db
|
|
from app.core.auth import get_current_user
|
|
from app.models.user import User
|
|
from app.models.system import SystemVersion
|
|
from app.services.update_service import UpdateService
|
|
from app.services.backup_service import BackupService
|
|
|
|
logger = structlog.get_logger()
|
|
|
|
router = APIRouter(prefix="/api/v1/system", tags=["System Management"])
|
|
|
|
|
|
# Request/Response Models
|
|
class VersionResponse(BaseModel):
|
|
"""Response model for version information"""
|
|
version: str
|
|
installed_at: str
|
|
installed_by: Optional[str]
|
|
is_current: bool
|
|
git_commit: Optional[str]
|
|
|
|
|
|
class SystemInfoResponse(BaseModel):
|
|
"""Response model for system information"""
|
|
current_version: str
|
|
version: str = "" # Alias for frontend compatibility - will be set from current_version
|
|
installation_date: str
|
|
container_count: Optional[int] = None
|
|
database_status: str = "healthy"
|
|
|
|
|
|
class CheckUpdateResponse(BaseModel):
|
|
"""Response model for update check"""
|
|
update_available: bool
|
|
available: bool = False # Alias for frontend compatibility
|
|
current_version: str
|
|
latest_version: Optional[str]
|
|
update_type: Optional[str] = None # "major", "minor", or "patch"
|
|
release_notes: Optional[str]
|
|
published_at: Optional[str]
|
|
released_at: Optional[str] = None # Alias for frontend compatibility
|
|
download_url: Optional[str]
|
|
checked_at: str # Timestamp when the check was performed
|
|
|
|
|
|
class ValidationCheckResult(BaseModel):
|
|
"""Individual validation check result"""
|
|
name: str
|
|
passed: bool
|
|
message: str
|
|
details: Dict[str, Any] = {}
|
|
|
|
|
|
class ValidateUpdateResponse(BaseModel):
|
|
"""Response model for update validation"""
|
|
valid: bool
|
|
checks: List[ValidationCheckResult]
|
|
warnings: List[str] = []
|
|
errors: List[str] = []
|
|
|
|
|
|
class ValidateUpdateRequest(BaseModel):
|
|
"""Request model for validating an update"""
|
|
target_version: str = Field(..., description="Target version to validate")
|
|
|
|
|
|
class StartUpdateRequest(BaseModel):
|
|
"""Request model for starting an update"""
|
|
target_version: str = Field(..., description="Version to update to")
|
|
create_backup: bool = Field(default=True, description="Create backup before update")
|
|
|
|
|
|
class StartUpdateResponse(BaseModel):
|
|
"""Response model for starting an update"""
|
|
update_id: str
|
|
target_version: str
|
|
message: str = "Update initiated"
|
|
|
|
|
|
class UpdateStatusResponse(BaseModel):
|
|
"""Response model for update status"""
|
|
update_id: str
|
|
target_version: str
|
|
status: str
|
|
started_at: str
|
|
completed_at: Optional[str]
|
|
current_stage: Optional[str]
|
|
logs: List[Dict[str, Any]] = []
|
|
error_message: Optional[str]
|
|
backup_id: Optional[int]
|
|
|
|
|
|
class RollbackRequest(BaseModel):
|
|
"""Request model for rollback"""
|
|
reason: Optional[str] = Field(None, description="Reason for rollback")
|
|
|
|
|
|
class BackupResponse(BaseModel):
|
|
"""Response model for backup information"""
|
|
id: int
|
|
uuid: str
|
|
backup_type: str
|
|
created_at: str
|
|
size_mb: Optional[float] # Keep for backward compatibility
|
|
size: Optional[int] = None # Size in bytes for frontend
|
|
version: Optional[str]
|
|
description: Optional[str]
|
|
is_valid: bool
|
|
download_url: Optional[str] = None # Download URL if available
|
|
|
|
|
|
class CreateBackupRequest(BaseModel):
|
|
"""Request model for creating a backup"""
|
|
backup_type: str = Field(default="manual", description="Type of backup")
|
|
description: Optional[str] = Field(None, description="Backup description")
|
|
|
|
|
|
class RestoreBackupRequest(BaseModel):
|
|
"""Request model for restoring a backup"""
|
|
backup_id: str = Field(..., description="UUID of backup to restore")
|
|
components: Optional[List[str]] = Field(None, description="Components to restore")
|
|
|
|
|
|
class ContainerStatus(BaseModel):
|
|
"""Container status from Docker"""
|
|
name: str
|
|
cluster: str # "admin", "tenant", "resource"
|
|
state: str # "running", "exited", "paused"
|
|
health: str # "healthy", "unhealthy", "starting", "none"
|
|
uptime: str
|
|
ports: List[str] = []
|
|
|
|
|
|
class DatabaseStats(BaseModel):
|
|
"""PostgreSQL database statistics"""
|
|
connections_active: int
|
|
connections_max: int
|
|
cache_hit_ratio: float
|
|
database_size: str
|
|
transactions_committed: int
|
|
|
|
|
|
class ClusterSummary(BaseModel):
|
|
"""Cluster health summary"""
|
|
name: str
|
|
healthy: int
|
|
unhealthy: int
|
|
total: int
|
|
|
|
|
|
class SystemHealthDetailedResponse(BaseModel):
|
|
"""Detailed system health response"""
|
|
overall_status: str
|
|
containers: List[ContainerStatus]
|
|
clusters: List[ClusterSummary]
|
|
database: DatabaseStats
|
|
version: str
|
|
|
|
|
|
# Helper Functions
|
|
async def _get_container_status() -> List[ContainerStatus]:
|
|
"""Get container status from Docker Compose"""
|
|
try:
|
|
# Run docker compose ps with JSON format
|
|
process = await asyncio.create_subprocess_exec(
|
|
"docker", "compose", "ps", "--format", "json",
|
|
stdout=asyncio.subprocess.PIPE,
|
|
stderr=asyncio.subprocess.PIPE,
|
|
cwd="/Users/hackweasel/Documents/GT-2.0"
|
|
)
|
|
|
|
stdout, stderr = await process.communicate()
|
|
|
|
if process.returncode != 0:
|
|
logger.error("docker_compose_ps_failed", stderr=stderr.decode())
|
|
return []
|
|
|
|
# Parse JSON output (one JSON object per line)
|
|
containers = []
|
|
for line in stdout.decode().strip().split('\n'):
|
|
if not line:
|
|
continue
|
|
|
|
try:
|
|
container_data = json.loads(line)
|
|
name = container_data.get("Name", "")
|
|
state = container_data.get("State", "unknown")
|
|
health = container_data.get("Health", "none")
|
|
|
|
# Map container name to cluster
|
|
cluster = "unknown"
|
|
if "controlpanel" in name.lower():
|
|
cluster = "admin"
|
|
elif "tenant" in name.lower() and "controlpanel" not in name.lower():
|
|
cluster = "tenant"
|
|
elif "resource" in name.lower() or "vllm" in name.lower():
|
|
cluster = "resource"
|
|
|
|
# Extract ports
|
|
ports = []
|
|
publishers = container_data.get("Publishers", [])
|
|
if publishers:
|
|
for pub in publishers:
|
|
if pub.get("PublishedPort"):
|
|
ports.append(f"{pub.get('PublishedPort')}:{pub.get('TargetPort')}")
|
|
|
|
# Get uptime from status
|
|
status_text = container_data.get("Status", "")
|
|
uptime = status_text if status_text else "unknown"
|
|
|
|
containers.append(ContainerStatus(
|
|
name=name,
|
|
cluster=cluster,
|
|
state=state,
|
|
health=health if health else "none",
|
|
uptime=uptime,
|
|
ports=ports
|
|
))
|
|
except json.JSONDecodeError as e:
|
|
logger.warning("failed_to_parse_container_json", line=line, error=str(e))
|
|
continue
|
|
|
|
return containers
|
|
|
|
except Exception as e:
|
|
# Docker is not available inside the container - this is expected behavior
|
|
logger.debug("docker_not_available", error=str(e))
|
|
return []
|
|
|
|
|
|
async def _get_database_stats(db: AsyncSession) -> DatabaseStats:
|
|
"""Get PostgreSQL database statistics"""
|
|
try:
|
|
# Get connection and transaction stats
|
|
stats_query = text("""
|
|
SELECT
|
|
numbackends as active_connections,
|
|
xact_commit as transactions_committed,
|
|
ROUND(100.0 * blks_hit / NULLIF(blks_read + blks_hit, 0), 1) as cache_hit_ratio
|
|
FROM pg_stat_database
|
|
WHERE datname = current_database()
|
|
""")
|
|
|
|
stats_result = await db.execute(stats_query)
|
|
stats = stats_result.fetchone()
|
|
|
|
# Get database size
|
|
size_query = text("SELECT pg_size_pretty(pg_database_size(current_database()))")
|
|
size_result = await db.execute(size_query)
|
|
size = size_result.scalar()
|
|
|
|
# Get max connections
|
|
max_conn_query = text("SELECT current_setting('max_connections')::int")
|
|
max_conn_result = await db.execute(max_conn_query)
|
|
max_connections = max_conn_result.scalar()
|
|
|
|
return DatabaseStats(
|
|
connections_active=stats[0] if stats else 0,
|
|
connections_max=max_connections if max_connections else 100,
|
|
cache_hit_ratio=float(stats[2]) if stats and stats[2] else 0.0,
|
|
database_size=size if size else "0 bytes",
|
|
transactions_committed=stats[1] if stats else 0
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error("failed_to_get_database_stats", error=str(e))
|
|
# Return default stats on error
|
|
return DatabaseStats(
|
|
connections_active=0,
|
|
connections_max=100,
|
|
cache_hit_ratio=0.0,
|
|
database_size="unknown",
|
|
transactions_committed=0
|
|
)
|
|
|
|
|
|
def _aggregate_clusters(containers: List[ContainerStatus]) -> List[ClusterSummary]:
|
|
"""Aggregate container health by cluster"""
|
|
cluster_data = {}
|
|
|
|
for container in containers:
|
|
cluster_name = container.cluster
|
|
|
|
if cluster_name not in cluster_data:
|
|
cluster_data[cluster_name] = {"healthy": 0, "unhealthy": 0, "total": 0}
|
|
|
|
cluster_data[cluster_name]["total"] += 1
|
|
|
|
# Consider container healthy if running and health is healthy/none
|
|
if container.state == "running" and container.health in ["healthy", "none"]:
|
|
cluster_data[cluster_name]["healthy"] += 1
|
|
else:
|
|
cluster_data[cluster_name]["unhealthy"] += 1
|
|
|
|
# Convert to ClusterSummary objects
|
|
summaries = []
|
|
for cluster_name, data in cluster_data.items():
|
|
summaries.append(ClusterSummary(
|
|
name=cluster_name,
|
|
healthy=data["healthy"],
|
|
unhealthy=data["unhealthy"],
|
|
total=data["total"]
|
|
))
|
|
|
|
return summaries
|
|
|
|
|
|
# Dependency for admin-only access
|
|
async def require_admin(current_user: User = Depends(get_current_user)):
|
|
"""Ensure user is a super admin"""
|
|
if current_user.user_type != "super_admin":
|
|
raise HTTPException(
|
|
status_code=status.HTTP_403_FORBIDDEN,
|
|
detail="Administrator access required"
|
|
)
|
|
return current_user
|
|
|
|
|
|
# Version Endpoints
|
|
@router.get("/version", response_model=SystemInfoResponse)
|
|
async def get_system_version(
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Get current system version and information"""
|
|
# Get current version
|
|
stmt = select(SystemVersion).where(
|
|
SystemVersion.is_current == True
|
|
).order_by(desc(SystemVersion.installed_at)).limit(1)
|
|
|
|
result = await db.execute(stmt)
|
|
current = result.scalar_one_or_none()
|
|
|
|
if not current:
|
|
raise HTTPException(
|
|
status_code=status.HTTP_404_NOT_FOUND,
|
|
detail="System version not found. Please run database migrations: alembic upgrade head"
|
|
)
|
|
|
|
return SystemInfoResponse(
|
|
current_version=current.version,
|
|
version=current.version, # Set version same as current_version for frontend compatibility
|
|
installation_date=current.installed_at.isoformat(),
|
|
database_status="healthy"
|
|
)
|
|
|
|
|
|
@router.get("/health-detailed", response_model=SystemHealthDetailedResponse)
|
|
async def get_detailed_health(
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Get comprehensive system health with real container and database metrics"""
|
|
# Get current version
|
|
stmt = select(SystemVersion).where(
|
|
SystemVersion.is_current == True
|
|
).order_by(desc(SystemVersion.installed_at)).limit(1)
|
|
|
|
result = await db.execute(stmt)
|
|
current_version = result.scalar_one_or_none()
|
|
version_str = current_version.version if current_version else "unknown"
|
|
|
|
# Gather system metrics concurrently
|
|
containers = await _get_container_status()
|
|
database_stats = await _get_database_stats(db)
|
|
cluster_summaries = _aggregate_clusters(containers)
|
|
|
|
# Determine overall status
|
|
unhealthy_count = sum(cluster.unhealthy for cluster in cluster_summaries)
|
|
overall_status = "healthy" if unhealthy_count == 0 else "degraded"
|
|
|
|
return SystemHealthDetailedResponse(
|
|
overall_status=overall_status,
|
|
containers=containers,
|
|
clusters=cluster_summaries,
|
|
database=database_stats,
|
|
version=version_str
|
|
)
|
|
|
|
|
|
# Update Endpoints
|
|
@router.get("/check-update", response_model=CheckUpdateResponse)
|
|
async def check_for_updates(
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Check for available system updates"""
|
|
service = UpdateService(db)
|
|
return await service.check_for_updates()
|
|
|
|
|
|
@router.post("/validate-update", response_model=ValidateUpdateResponse)
|
|
async def validate_update(
|
|
request: ValidateUpdateRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Run pre-update validation checks"""
|
|
service = UpdateService(db)
|
|
return await service.validate_update(request.target_version)
|
|
|
|
|
|
@router.post("/update", response_model=StartUpdateResponse)
|
|
async def start_update(
|
|
request: StartUpdateRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Start system update process"""
|
|
service = UpdateService(db)
|
|
update_id = await service.execute_update(
|
|
target_version=request.target_version,
|
|
create_backup=request.create_backup,
|
|
started_by=current_user.email
|
|
)
|
|
|
|
return StartUpdateResponse(
|
|
update_id=update_id,
|
|
target_version=request.target_version
|
|
)
|
|
|
|
|
|
@router.get("/update/{update_id}/status", response_model=UpdateStatusResponse)
|
|
async def get_update_status(
|
|
update_id: str,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Get status of an update job"""
|
|
service = UpdateService(db)
|
|
status_data = await service.get_update_status(update_id)
|
|
|
|
return UpdateStatusResponse(
|
|
update_id=status_data["uuid"],
|
|
target_version=status_data["target_version"],
|
|
status=status_data["status"],
|
|
started_at=status_data["started_at"],
|
|
completed_at=status_data.get("completed_at"),
|
|
current_stage=status_data.get("current_stage"),
|
|
logs=status_data.get("logs", []),
|
|
error_message=status_data.get("error_message"),
|
|
backup_id=status_data.get("backup_id")
|
|
)
|
|
|
|
|
|
@router.post("/update/{update_id}/rollback")
|
|
async def rollback_update(
|
|
update_id: str,
|
|
request: RollbackRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Rollback a failed update"""
|
|
service = UpdateService(db)
|
|
return await service.rollback(update_id, request.reason)
|
|
|
|
|
|
# Backup Endpoints
|
|
@router.get("/backups", response_model=Dict[str, Any])
|
|
async def list_backups(
|
|
limit: int = Query(default=50, ge=1, le=100),
|
|
offset: int = Query(default=0, ge=0),
|
|
backup_type: Optional[str] = Query(default=None, description="Filter by backup type"),
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""List available backups with storage information"""
|
|
service = BackupService(db)
|
|
backup_data = await service.list_backups(limit=limit, offset=offset, backup_type=backup_type)
|
|
|
|
# Add storage information
|
|
backup_dir = service.BACKUP_DIR
|
|
try:
|
|
# Create backup directory if it doesn't exist
|
|
os.makedirs(backup_dir, exist_ok=True)
|
|
disk_usage = shutil.disk_usage(backup_dir)
|
|
storage = {
|
|
"used": backup_data.get("storage_used", 0), # From service
|
|
"total": disk_usage.total,
|
|
"available": disk_usage.free
|
|
}
|
|
except Exception as e:
|
|
logger.debug("backup_dir_unavailable", error=str(e))
|
|
storage = {"used": 0, "total": 0, "available": 0}
|
|
|
|
backup_data["storage"] = storage
|
|
return backup_data
|
|
|
|
|
|
@router.post("/backups", response_model=BackupResponse)
|
|
async def create_backup(
|
|
request: CreateBackupRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Create a new system backup"""
|
|
service = BackupService(db)
|
|
backup_data = await service.create_backup(
|
|
backup_type=request.backup_type,
|
|
description=request.description,
|
|
created_by=current_user.email
|
|
)
|
|
|
|
return BackupResponse(
|
|
id=backup_data["id"],
|
|
uuid=backup_data["uuid"],
|
|
backup_type=backup_data["backup_type"],
|
|
created_at=backup_data["created_at"],
|
|
size_mb=backup_data.get("size_mb"),
|
|
size=backup_data.get("size"),
|
|
version=backup_data.get("version"),
|
|
description=backup_data.get("description"),
|
|
is_valid=backup_data["is_valid"],
|
|
download_url=backup_data.get("download_url")
|
|
)
|
|
|
|
|
|
@router.get("/backups/{backup_id}", response_model=BackupResponse)
|
|
async def get_backup(
|
|
backup_id: str,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Get details of a specific backup"""
|
|
service = BackupService(db)
|
|
backup_data = await service.get_backup(backup_id)
|
|
|
|
return BackupResponse(
|
|
id=backup_data["id"],
|
|
uuid=backup_data["uuid"],
|
|
backup_type=backup_data["backup_type"],
|
|
created_at=backup_data["created_at"],
|
|
size_mb=backup_data.get("size_mb"),
|
|
size=backup_data.get("size"),
|
|
version=backup_data.get("version"),
|
|
description=backup_data.get("description"),
|
|
is_valid=backup_data["is_valid"],
|
|
download_url=backup_data.get("download_url")
|
|
)
|
|
|
|
|
|
@router.delete("/backups/{backup_id}")
|
|
async def delete_backup(
|
|
backup_id: str,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Delete a backup"""
|
|
service = BackupService(db)
|
|
return await service.delete_backup(backup_id)
|
|
|
|
|
|
@router.post("/restore")
|
|
async def restore_backup(
|
|
request: RestoreBackupRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(require_admin)
|
|
):
|
|
"""Restore system from a backup"""
|
|
service = BackupService(db)
|
|
return await service.restore_backup(
|
|
backup_id=request.backup_id,
|
|
components=request.components
|
|
)
|