GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents

- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
- Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
2025-12-12 17:47:14 -05:00
commit 310491a557
750 changed files with 232701 additions and 0 deletions
--- a/apps/control-panel-backend/app/services/update_service.py
+++ b/apps/control-panel-backend/app/services/update_service.py
@@ -0,0 +1,525 @@
+"""
+Update Service - Manages system updates and version checking
+"""
+import os
+import json
+import asyncio
+import httpx
+from typing import Dict, Any, Optional, List
+from datetime import datetime
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, and_, desc
+from fastapi import HTTPException, status
+import structlog
+
+from app.models.system import SystemVersion, UpdateJob, UpdateStatus, BackupRecord
+from app.services.backup_service import BackupService
+
+logger = structlog.get_logger()
+
+
+class UpdateService:
+    """Service for checking and executing system updates"""
+
+    # GitHub REST endpoint and repository queried for the latest release.
+    # NOTE(review): check_for_updates sends this request without credentials;
+    # confirm the repository is publicly readable, otherwise GitHub will
+    # presumably answer 404 and the check will report "no releases".
+    GITHUB_API_BASE = "https://api.github.com"
+    REPO_OWNER = "GT-Edge-AI-Internal"
+    REPO_NAME = "gt-ai-os-community"
+    # Executables launched as subprocesses by the update / rollback tasks.
+    DEPLOY_SCRIPT = "/app/scripts/deploy.sh"
+    ROLLBACK_SCRIPT = "/app/scripts/rollback.sh"
+    # Minimum free space on "/" required by the disk-space validation check.
+    MIN_DISK_SPACE_GB = 5
+
+    def __init__(self, db: AsyncSession) -> None:
+        """Bind the service to the async database session its queries run on."""
+        self.db = db
+
+    async def check_for_updates(self) -> Dict[str, Any]:
+        """Check GitHub for available updates"""
+        try:
+            # Get current version
+            current_version = await self._get_current_version()
+
+            # Query GitHub releases API
+            url = f"{self.GITHUB_API_BASE}/repos/{self.REPO_OWNER}/{self.REPO_NAME}/releases/latest"
+
+            async with httpx.AsyncClient(timeout=httpx.Timeout(10.0)) as client:
+                response = await client.get(url)
+                if response.status_code == 404:
+                    logger.warning("No releases found in repository")
+                    return {
+                        "update_available": False,
+                        "current_version": current_version,
+                        "latest_version": None,
+                        "release_notes": None,
+                        "published_at": None,
+                        "download_url": None,
+                        "checked_at": datetime.utcnow().isoformat()
+                    }
+
+                if response.status_code != 200:
+                    logger.error(f"GitHub API error: {response.status_code}")
+                    raise HTTPException(
+                        status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                        detail="Unable to check for updates from GitHub"
+                    )
+
+                release_data = response.json()
+
+            latest_version = release_data.get("tag_name", "").lstrip("v")
+            release_notes = release_data.get("body", "")
+            published_at = release_data.get("published_at")
+
+            update_available = self._is_newer_version(latest_version, current_version)
+            update_type = self._determine_update_type(latest_version, current_version) if update_available else None
+
+            return {
+                "update_available": update_available,
+                "available": update_available,  # Alias for frontend compatibility
+                "current_version": current_version,
+                "latest_version": latest_version,
+                "update_type": update_type,
+                "release_notes": release_notes,
+                "published_at": published_at,
+                "released_at": published_at,  # Alias for frontend compatibility
+                "download_url": release_data.get("html_url"),
+                "checked_at": datetime.utcnow().isoformat()
+            }
+
+        except httpx.RequestError as e:
+            logger.error(f"Network error checking for updates: {str(e)}")
+            raise HTTPException(
+                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                detail="Network error while checking for updates"
+            )
+        except Exception as e:
+            logger.error(f"Error checking for updates: {str(e)}")
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Failed to check for updates: {str(e)}"
+            )
+
+    async def validate_update(self, target_version: str) -> Dict[str, Any]:
+        """Run pre-update validation checks"""
+        validation_results = {
+            "valid": True,
+            "checks": [],
+            "warnings": [],
+            "errors": []
+        }
+
+        # Check 1: Disk space
+        disk_check = await self._check_disk_space()
+        validation_results["checks"].append(disk_check)
+        if not disk_check["passed"]:
+            validation_results["valid"] = False
+            validation_results["errors"].append(disk_check["message"])
+
+        # Check 2: Container health
+        container_check = await self._check_container_health()
+        validation_results["checks"].append(container_check)
+        if not container_check["passed"]:
+            validation_results["valid"] = False
+            validation_results["errors"].append(container_check["message"])
+
+        # Check 3: Database connectivity
+        db_check = await self._check_database_connectivity()
+        validation_results["checks"].append(db_check)
+        if not db_check["passed"]:
+            validation_results["valid"] = False
+            validation_results["errors"].append(db_check["message"])
+
+        # Check 4: Recent backup exists
+        backup_check = await self._check_recent_backup()
+        validation_results["checks"].append(backup_check)
+        if not backup_check["passed"]:
+            validation_results["warnings"].append(backup_check["message"])
+
+        # Check 5: No running updates
+        running_update = await self._check_running_updates()
+        if running_update:
+            validation_results["valid"] = False
+            validation_results["errors"].append(
+                f"Update job {running_update} is already in progress"
+            )
+
+        return validation_results
+
+    async def execute_update(
+        self,
+        target_version: str,
+        create_backup: bool = True,
+        started_by: str = None
+    ) -> str:
+        """Execute system update"""
+        # Create update job
+        update_job = UpdateJob(
+            target_version=target_version,
+            status=UpdateStatus.pending,
+            started_by=started_by
+        )
+        update_job.add_log(f"Update to version {target_version} initiated", "info")
+
+        self.db.add(update_job)
+        await self.db.commit()
+        await self.db.refresh(update_job)
+
+        job_uuid = update_job.uuid
+
+        # Start update in background
+        asyncio.create_task(self._run_update_process(job_uuid, target_version, create_backup))
+
+        logger.info(f"Update job {job_uuid} created for version {target_version}")
+
+        return job_uuid
+
+    async def get_update_status(self, update_id: str) -> Dict[str, Any]:
+        """Get current status of an update job"""
+        stmt = select(UpdateJob).where(UpdateJob.uuid == update_id)
+        result = await self.db.execute(stmt)
+        update_job = result.scalar_one_or_none()
+
+        if not update_job:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail=f"Update job {update_id} not found"
+            )
+
+        return update_job.to_dict()
+
+    async def rollback(self, update_id: str, reason: str = None) -> Dict[str, Any]:
+        """Rollback a failed update"""
+        stmt = select(UpdateJob).where(UpdateJob.uuid == update_id)
+        result = await self.db.execute(stmt)
+        update_job = result.scalar_one_or_none()
+
+        if not update_job:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail=f"Update job {update_id} not found"
+            )
+
+        if update_job.status not in [UpdateStatus.failed, UpdateStatus.in_progress]:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Cannot rollback update in status: {update_job.status}"
+            )
+
+        update_job.rollback_reason = reason or "Manual rollback requested"
+        update_job.add_log(f"Rollback initiated: {update_job.rollback_reason}", "warning")
+
+        await self.db.commit()
+
+        # Execute rollback in background
+        asyncio.create_task(self._run_rollback_process(update_id))
+
+        return {"message": "Rollback initiated", "update_id": update_id}
+
+    async def _run_update_process(
+        self,
+        job_uuid: str,
+        target_version: str,
+        create_backup: bool
+    ):
+        """Background task to run update process"""
+        try:
+            # Reload job from database
+            stmt = select(UpdateJob).where(UpdateJob.uuid == job_uuid)
+            result = await self.db.execute(stmt)
+            update_job = result.scalar_one_or_none()
+
+            if not update_job:
+                logger.error(f"Update job {job_uuid} not found")
+                return
+
+            update_job.status = UpdateStatus.in_progress
+            await self.db.commit()
+
+            # Stage 1: Create pre-update backup
+            if create_backup:
+                update_job.current_stage = "creating_backup"
+                update_job.add_log("Creating pre-update backup", "info")
+                await self.db.commit()
+
+                backup_service = BackupService(self.db)
+                backup_result = await backup_service.create_backup(
+                    backup_type="pre_update",
+                    description=f"Pre-update backup before upgrading to {target_version}"
+                )
+                update_job.backup_id = backup_result["id"]
+                update_job.add_log(f"Backup created: {backup_result['uuid']}", "info")
+                await self.db.commit()
+
+            # Stage 2: Execute deploy script
+            update_job.current_stage = "executing_update"
+            update_job.add_log(f"Running deploy script for version {target_version}", "info")
+            await self.db.commit()
+
+            # Run deploy.sh script
+            process = await asyncio.create_subprocess_exec(
+                self.DEPLOY_SCRIPT,
+                target_version,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+
+            stdout, stderr = await process.communicate()
+
+            if process.returncode == 0:
+                # Success
+                update_job.status = UpdateStatus.completed
+                update_job.current_stage = "completed"
+                update_job.completed_at = datetime.utcnow()
+                update_job.add_log(f"Update to {target_version} completed successfully", "info")
+
+                # Record new version
+                await self._record_version(target_version, update_job.started_by)
+            else:
+                # Failure
+                update_job.status = UpdateStatus.failed
+                update_job.current_stage = "failed"
+                update_job.completed_at = datetime.utcnow()
+                error_msg = stderr.decode() if stderr else "Unknown error"
+                update_job.error_message = error_msg
+                update_job.add_log(f"Update failed: {error_msg}", "error")
+
+            await self.db.commit()
+
+        except Exception as e:
+            logger.error(f"Update process error: {str(e)}")
+            stmt = select(UpdateJob).where(UpdateJob.uuid == job_uuid)
+            result = await self.db.execute(stmt)
+            update_job = result.scalar_one_or_none()
+
+            if update_job:
+                update_job.status = UpdateStatus.failed
+                update_job.error_message = str(e)
+                update_job.completed_at = datetime.utcnow()
+                update_job.add_log(f"Update process exception: {str(e)}", "error")
+                await self.db.commit()
+
+    async def _run_rollback_process(self, job_uuid: str):
+        """Background task to run rollback process"""
+        try:
+            stmt = select(UpdateJob).where(UpdateJob.uuid == job_uuid)
+            result = await self.db.execute(stmt)
+            update_job = result.scalar_one_or_none()
+
+            if not update_job:
+                logger.error(f"Update job {job_uuid} not found")
+                return
+
+            update_job.current_stage = "rolling_back"
+            update_job.add_log("Executing rollback script", "warning")
+            await self.db.commit()
+
+            # Run rollback script
+            process = await asyncio.create_subprocess_exec(
+                self.ROLLBACK_SCRIPT,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+
+            stdout, stderr = await process.communicate()
+
+            if process.returncode == 0:
+                update_job.status = UpdateStatus.rolled_back
+                update_job.current_stage = "rolled_back"
+                update_job.completed_at = datetime.utcnow()
+                update_job.add_log("Rollback completed successfully", "info")
+            else:
+                error_msg = stderr.decode() if stderr else "Unknown error"
+                update_job.add_log(f"Rollback failed: {error_msg}", "error")
+
+            await self.db.commit()
+
+        except Exception as e:
+            logger.error(f"Rollback process error: {str(e)}")
+
+    async def _get_current_version(self) -> str:
+        """Get currently installed version"""
+        stmt = select(SystemVersion).where(
+            SystemVersion.is_current == True
+        ).order_by(desc(SystemVersion.installed_at)).limit(1)
+
+        result = await self.db.execute(stmt)
+        current = result.scalar_one_or_none()
+
+        return current.version if current else "unknown"
+
+    async def _record_version(self, version: str, installed_by: str):
+        """Record new system version"""
+        # Mark all versions as not current
+        stmt = select(SystemVersion).where(SystemVersion.is_current == True)
+        result = await self.db.execute(stmt)
+        old_versions = result.scalars().all()
+
+        for old_version in old_versions:
+            old_version.is_current = False
+
+        # Create new version record
+        new_version = SystemVersion(
+            version=version,
+            installed_by=installed_by,
+            is_current=True
+        )
+        self.db.add(new_version)
+        await self.db.commit()
+
+    def _is_newer_version(self, latest: str, current: str) -> bool:
+        """Compare version strings"""
+        try:
+            latest_parts = [int(x) for x in latest.split(".")]
+            current_parts = [int(x) for x in current.split(".")]
+
+            # Pad shorter version with zeros
+            max_len = max(len(latest_parts), len(current_parts))
+            latest_parts += [0] * (max_len - len(latest_parts))
+            current_parts += [0] * (max_len - len(current_parts))
+
+            return latest_parts > current_parts
+        except (ValueError, AttributeError):
+            return False
+
+    def _determine_update_type(self, latest: str, current: str) -> str:
+        """Determine if update is major, minor, or patch"""
+        try:
+            latest_parts = [int(x) for x in latest.split(".")]
+            current_parts = [int(x) for x in current.split(".")]
+
+            # Pad to at least 3 parts for comparison
+            while len(latest_parts) < 3:
+                latest_parts.append(0)
+            while len(current_parts) < 3:
+                current_parts.append(0)
+
+            if latest_parts[0] > current_parts[0]:
+                return "major"
+            elif latest_parts[1] > current_parts[1]:
+                return "minor"
+            else:
+                return "patch"
+        except (ValueError, IndexError, AttributeError):
+            return "patch"
+
+    async def _check_disk_space(self) -> Dict[str, Any]:
+        """Check available disk space"""
+        try:
+            stat = os.statvfs("/")
+            free_gb = (stat.f_bavail * stat.f_frsize) / (1024 ** 3)
+            passed = free_gb >= self.MIN_DISK_SPACE_GB
+
+            return {
+                "name": "disk_space",
+                "passed": passed,
+                "message": f"Available disk space: {free_gb:.2f} GB (minimum: {self.MIN_DISK_SPACE_GB} GB)",
+                "details": {"free_gb": round(free_gb, 2)}
+            }
+        except Exception as e:
+            return {
+                "name": "disk_space",
+                "passed": False,
+                "message": f"Failed to check disk space: {str(e)}",
+                "details": {}
+            }
+
+    async def _check_container_health(self) -> Dict[str, Any]:
+        """Check Docker container health"""
+        try:
+            # Run docker ps to check container status
+            process = await asyncio.create_subprocess_exec(
+                "docker", "ps", "--format", "{{.Names}}|{{.Status}}",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await process.communicate()
+
+            if process.returncode != 0:
+                return {
+                    "name": "container_health",
+                    "passed": False,
+                    "message": "Failed to check container status",
+                    "details": {"error": stderr.decode()}
+                }
+
+            containers = stdout.decode().strip().split("\n")
+            unhealthy = [c for c in containers if "unhealthy" in c.lower()]
+
+            return {
+                "name": "container_health",
+                "passed": len(unhealthy) == 0,
+                "message": f"Container health check: {len(containers)} running, {len(unhealthy)} unhealthy",
+                "details": {"total": len(containers), "unhealthy": len(unhealthy)}
+            }
+        except Exception as e:
+            return {
+                "name": "container_health",
+                "passed": False,
+                "message": f"Failed to check container health: {str(e)}",
+                "details": {}
+            }
+
+    async def _check_database_connectivity(self) -> Dict[str, Any]:
+        """Check database connection"""
+        try:
+            await self.db.execute(select(1))
+            return {
+                "name": "database_connectivity",
+                "passed": True,
+                "message": "Database connection healthy",
+                "details": {}
+            }
+        except Exception as e:
+            return {
+                "name": "database_connectivity",
+                "passed": False,
+                "message": f"Database connection failed: {str(e)}",
+                "details": {}
+            }
+
+    async def _check_recent_backup(self) -> Dict[str, Any]:
+        """Check if a recent backup exists"""
+        try:
+            from datetime import timedelta
+            from app.models.system import BackupRecord
+
+            one_day_ago = datetime.utcnow() - timedelta(days=1)
+            stmt = select(BackupRecord).where(
+                and_(
+                    BackupRecord.created_at >= one_day_ago,
+                    BackupRecord.is_valid == True
+                )
+            ).order_by(desc(BackupRecord.created_at)).limit(1)
+
+            result = await self.db.execute(stmt)
+            recent_backup = result.scalar_one_or_none()
+
+            if recent_backup:
+                return {
+                    "name": "recent_backup",
+                    "passed": True,
+                    "message": f"Recent backup found: {recent_backup.uuid}",
+                    "details": {"backup_id": recent_backup.id, "created_at": recent_backup.created_at.isoformat()}
+                }
+            else:
+                return {
+                    "name": "recent_backup",
+                    "passed": False,
+                    "message": "No backup found within last 24 hours",
+                    "details": {}
+                }
+        except Exception as e:
+            return {
+                "name": "recent_backup",
+                "passed": False,
+                "message": f"Failed to check for recent backups: {str(e)}",
+                "details": {}
+            }
+
+    async def _check_running_updates(self) -> Optional[str]:
+        """Check for running update jobs"""
+        stmt = select(UpdateJob.uuid).where(
+            UpdateJob.status == UpdateStatus.in_progress
+        ).limit(1)
+
+        result = await self.db.execute(stmt)
+        running = result.scalar_one_or_none()
+
+        return running