GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents

- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2 - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2 - Made more general-purpose (flexible targets, expanded tools) - Added nemotron-mini-agent.csv for fast local inference via Ollama - Added nemotron-agent.csv for advanced reasoning via Ollama - Added wiki page: Projects for NVIDIA NIMs and Nemotron
2025-12-12 17:47:14 -05:00
commit 310491a557
750 changed files with 232701 additions and 0 deletions
--- a/apps/control-panel-backend/app/services/resource_allocation.py
+++ b/apps/control-panel-backend/app/services/resource_allocation.py
@@ -0,0 +1,525 @@
+"""
+GT 2.0 Resource Allocation Management Service
+
+Manages CPU, memory, storage, and API quotas for tenants following GT 2.0 principles:
+- Granular resource control per tenant
+- Real-time usage monitoring
+- Automatic scaling within limits
+- Cost tracking and optimization
+"""
+
+import asyncio
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from typing import Dict, Any, List, Optional, Tuple
+from enum import Enum
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, update, func, and_
+
+from app.models.tenant import Tenant
+from app.models.resource_usage import ResourceUsage, ResourceQuota, ResourceAlert
+from app.core.config import get_settings
+
+logger = logging.getLogger(__name__)
+settings = get_settings()
+
+
+class ResourceType(Enum):
+    """
+    Types of resources that can be allocated.
+
+    Each member's string value is what this module writes to the
+    ``resource_type`` column of ResourceQuota and ResourceUsage rows.
+    """
+    CPU = "cpu"
+    MEMORY = "memory"
+    STORAGE = "storage"
+    API_CALLS = "api_calls"
+    # Only the "enterprise" template in this module allocates GPU time
+    GPU_TIME = "gpu_time"
+    # Not allocated by any of the built-in templates in this module
+    VECTOR_OPERATIONS = "vector_operations"
+    MODEL_INFERENCE = "model_inference"
+
+
+class AlertLevel(Enum):
+    """
+    Resource usage alert levels.
+
+    The string value is what gets stored in ``ResourceAlert.alert_level``.
+    """
+    # Defined for completeness; the threshold check in this module never emits it
+    INFO = "info"
+    # Emitted when usage reaches a quota's warning threshold
+    WARNING = "warning"
+    # Emitted when usage reaches a quota's critical threshold
+    CRITICAL = "critical"
+
+
+@dataclass
+class ResourceLimit:
+    """
+    Resource limit configuration.
+
+    Used as the per-resource entry of a template; its fields are copied onto a
+    ResourceQuota row when resources are allocated to a tenant.
+    """
+    # Which resource this limit applies to
+    resource_type: ResourceType
+    # Hard ceiling, expressed in ``unit``
+    max_value: float
+    # Thresholds are fractions of max_value (0-1), not percentages
+    warning_threshold: float = 0.8  # 80% of max
+    critical_threshold: float = 0.95  # 95% of max
+    # Free-form label for the unit of max_value (e.g. "cores", "MB")
+    unit: str = "units"
+    # Price charged per unit of usage
+    cost_per_unit: float = 0.0
+
+
+@dataclass
+class ResourceUsageData:
+    """
+    Current resource usage data.
+
+    Snapshot of one quota as returned by
+    ResourceAllocationService.get_tenant_resource_usage.
+    """
+    resource_type: ResourceType
+    # Usage currently recorded on the quota
+    current_usage: float
+    # The quota's max_value
+    max_allowed: float
+    # current_usage / max_allowed on a 0-100 scale (0 when max_allowed is not positive)
+    percentage_used: float
+    # current_usage multiplied by the quota's cost_per_unit
+    cost_accrued: float
+    # The quota's updated_at value
+    last_updated: datetime
+
+
+class ResourceAllocationService:
+    """
+    Service for managing resource allocation and monitoring usage across tenants.
+    
+    Features:
+    - Dynamic quota allocation
+    - Real-time usage tracking
+    - Automatic scaling policies
+    - Cost optimization
+    - Alert generation
+    """
+    
+    def __init__(self, db: AsyncSession):
+        self.db = db
+        
+        # Default resource templates
+        self.resource_templates = {
+            "startup": {
+                ResourceType.CPU: ResourceLimit(ResourceType.CPU, 2.0, unit="cores", cost_per_unit=0.10),
+                ResourceType.MEMORY: ResourceLimit(ResourceType.MEMORY, 4096, unit="MB", cost_per_unit=0.05),
+                ResourceType.STORAGE: ResourceLimit(ResourceType.STORAGE, 10240, unit="MB", cost_per_unit=0.01),
+                ResourceType.API_CALLS: ResourceLimit(ResourceType.API_CALLS, 10000, unit="calls/hour", cost_per_unit=0.001),
+                ResourceType.MODEL_INFERENCE: ResourceLimit(ResourceType.MODEL_INFERENCE, 1000, unit="tokens", cost_per_unit=0.002),
+            },
+            "standard": {
+                ResourceType.CPU: ResourceLimit(ResourceType.CPU, 4.0, unit="cores", cost_per_unit=0.10),
+                ResourceType.MEMORY: ResourceLimit(ResourceType.MEMORY, 8192, unit="MB", cost_per_unit=0.05),
+                ResourceType.STORAGE: ResourceLimit(ResourceType.STORAGE, 51200, unit="MB", cost_per_unit=0.01),
+                ResourceType.API_CALLS: ResourceLimit(ResourceType.API_CALLS, 50000, unit="calls/hour", cost_per_unit=0.001),
+                ResourceType.MODEL_INFERENCE: ResourceLimit(ResourceType.MODEL_INFERENCE, 10000, unit="tokens", cost_per_unit=0.002),
+            },
+            "enterprise": {
+                ResourceType.CPU: ResourceLimit(ResourceType.CPU, 16.0, unit="cores", cost_per_unit=0.10),
+                ResourceType.MEMORY: ResourceLimit(ResourceType.MEMORY, 32768, unit="MB", cost_per_unit=0.05),
+                ResourceType.STORAGE: ResourceLimit(ResourceType.STORAGE, 102400, unit="MB", cost_per_unit=0.01),
+                ResourceType.API_CALLS: ResourceLimit(ResourceType.API_CALLS, 200000, unit="calls/hour", cost_per_unit=0.001),
+                ResourceType.MODEL_INFERENCE: ResourceLimit(ResourceType.MODEL_INFERENCE, 100000, unit="tokens", cost_per_unit=0.002),
+                ResourceType.GPU_TIME: ResourceLimit(ResourceType.GPU_TIME, 1000, unit="minutes", cost_per_unit=0.50),
+            }
+        }
+    
+    async def allocate_resources(self, tenant_id: int, template: str = "standard") -> bool:
+        """
+        Allocate initial resources to a tenant based on template.
+        
+        Args:
+            tenant_id: Tenant database ID
+            template: Resource template name
+            
+        Returns:
+            True if allocation successful
+        """
+        try:
+            # Get tenant
+            result = await self.db.execute(select(Tenant).where(Tenant.id == tenant_id))
+            tenant = result.scalar_one_or_none()
+            
+            if not tenant:
+                logger.error(f"Tenant {tenant_id} not found")
+                return False
+            
+            # Get resource template
+            if template not in self.resource_templates:
+                logger.error(f"Unknown resource template: {template}")
+                return False
+            
+            resources = self.resource_templates[template]
+            
+            # Create resource quotas
+            for resource_type, limit in resources.items():
+                quota = ResourceQuota(
+                    tenant_id=tenant_id,
+                    resource_type=resource_type.value,
+                    max_value=limit.max_value,
+                    warning_threshold=limit.warning_threshold,
+                    critical_threshold=limit.critical_threshold,
+                    unit=limit.unit,
+                    cost_per_unit=limit.cost_per_unit,
+                    current_usage=0.0,
+                    is_active=True
+                )
+                
+                self.db.add(quota)
+            
+            await self.db.commit()
+            
+            logger.info(f"Allocated {template} resources to tenant {tenant.domain}")
+            return True
+            
+        except Exception as e:
+            logger.error(f"Failed to allocate resources to tenant {tenant_id}: {e}")
+            await self.db.rollback()
+            return False
+    
+    async def get_tenant_resource_usage(self, tenant_id: int) -> Dict[str, ResourceUsageData]:
+        """
+        Get current resource usage for a tenant.
+        
+        Args:
+            tenant_id: Tenant database ID
+            
+        Returns:
+            Dictionary of resource usage data
+        """
+        try:
+            # Get all quotas for tenant
+            result = await self.db.execute(
+                select(ResourceQuota).where(
+                    and_(ResourceQuota.tenant_id == tenant_id, ResourceQuota.is_active == True)
+                )
+            )
+            quotas = result.scalars().all()
+            
+            usage_data = {}
+            
+            for quota in quotas:
+                resource_type = ResourceType(quota.resource_type)
+                percentage_used = (quota.current_usage / quota.max_value) * 100 if quota.max_value > 0 else 0
+                
+                usage_data[quota.resource_type] = ResourceUsageData(
+                    resource_type=resource_type,
+                    current_usage=quota.current_usage,
+                    max_allowed=quota.max_value,
+                    percentage_used=percentage_used,
+                    cost_accrued=quota.current_usage * quota.cost_per_unit,
+                    last_updated=quota.updated_at
+                )
+            
+            return usage_data
+            
+        except Exception as e:
+            logger.error(f"Failed to get resource usage for tenant {tenant_id}: {e}")
+            return {}
+    
+    async def update_resource_usage(
+        self, 
+        tenant_id: int, 
+        resource_type: ResourceType, 
+        usage_delta: float
+    ) -> bool:
+        """
+        Update resource usage for a tenant.
+        
+        Args:
+            tenant_id: Tenant database ID
+            resource_type: Type of resource being used
+            usage_delta: Change in usage (positive for increase, negative for decrease)
+            
+        Returns:
+            True if update successful
+        """
+        try:
+            # Get resource quota
+            result = await self.db.execute(
+                select(ResourceQuota).where(
+                    and_(
+                        ResourceQuota.tenant_id == tenant_id,
+                        ResourceQuota.resource_type == resource_type.value,
+                        ResourceQuota.is_active == True
+                    )
+                )
+            )
+            quota = result.scalar_one_or_none()
+            
+            if not quota:
+                logger.warning(f"No quota found for {resource_type.value} for tenant {tenant_id}")
+                return False
+            
+            # Calculate new usage
+            new_usage = max(0, quota.current_usage + usage_delta)
+            
+            # Check if usage exceeds quota
+            if new_usage > quota.max_value:
+                logger.warning(
+                    f"Resource usage would exceed quota for tenant {tenant_id}: "
+                    f"{resource_type.value} {new_usage} > {quota.max_value}"
+                )
+                return False
+            
+            # Update usage
+            quota.current_usage = new_usage
+            quota.updated_at = datetime.utcnow()
+            
+            # Record usage history
+            usage_record = ResourceUsage(
+                tenant_id=tenant_id,
+                resource_type=resource_type.value,
+                usage_amount=usage_delta,
+                timestamp=datetime.utcnow(),
+                cost=usage_delta * quota.cost_per_unit
+            )
+            
+            self.db.add(usage_record)
+            await self.db.commit()
+            
+            # Check for alerts
+            await self._check_usage_alerts(tenant_id, quota)
+            
+            return True
+            
+        except Exception as e:
+            logger.error(f"Failed to update resource usage: {e}")
+            await self.db.rollback()
+            return False
+    
+    async def _check_usage_alerts(self, tenant_id: int, quota: ResourceQuota) -> None:
+        """Check if resource usage triggers alerts"""
+        try:
+            percentage_used = (quota.current_usage / quota.max_value) if quota.max_value > 0 else 0
+            
+            alert_level = None
+            message = None
+            
+            if percentage_used >= quota.critical_threshold:
+                alert_level = AlertLevel.CRITICAL
+                message = f"Critical: {quota.resource_type} usage at {percentage_used:.1f}%"
+            elif percentage_used >= quota.warning_threshold:
+                alert_level = AlertLevel.WARNING
+                message = f"Warning: {quota.resource_type} usage at {percentage_used:.1f}%"
+            
+            if alert_level:
+                # Check if we already have a recent alert
+                recent_alert = await self.db.execute(
+                    select(ResourceAlert).where(
+                        and_(
+                            ResourceAlert.tenant_id == tenant_id,
+                            ResourceAlert.resource_type == quota.resource_type,
+                            ResourceAlert.alert_level == alert_level.value,
+                            ResourceAlert.created_at >= datetime.utcnow() - timedelta(hours=1)
+                        )
+                    )
+                )
+                
+                if not recent_alert.scalar_one_or_none():
+                    # Create new alert
+                    alert = ResourceAlert(
+                        tenant_id=tenant_id,
+                        resource_type=quota.resource_type,
+                        alert_level=alert_level.value,
+                        message=message,
+                        current_usage=quota.current_usage,
+                        max_value=quota.max_value,
+                        percentage_used=percentage_used
+                    )
+                    
+                    self.db.add(alert)
+                    await self.db.commit()
+                    
+                    logger.warning(f"Resource alert for tenant {tenant_id}: {message}")
+        
+        except Exception as e:
+            logger.error(f"Failed to check usage alerts: {e}")
+    
+    async def get_tenant_costs(self, tenant_id: int, start_date: datetime, end_date: datetime) -> Dict[str, Any]:
+        """
+        Calculate costs for a tenant over a date range.
+        
+        Args:
+            tenant_id: Tenant database ID
+            start_date: Start of cost calculation period
+            end_date: End of cost calculation period
+            
+        Returns:
+            Cost breakdown by resource type
+        """
+        try:
+            # Get usage records for the period
+            result = await self.db.execute(
+                select(ResourceUsage).where(
+                    and_(
+                        ResourceUsage.tenant_id == tenant_id,
+                        ResourceUsage.timestamp >= start_date,
+                        ResourceUsage.timestamp <= end_date
+                    )
+                )
+            )
+            usage_records = result.scalars().all()
+            
+            # Calculate costs by resource type
+            costs_by_type = {}
+            total_cost = 0.0
+            
+            for record in usage_records:
+                if record.resource_type not in costs_by_type:
+                    costs_by_type[record.resource_type] = {
+                        "total_usage": 0.0,
+                        "total_cost": 0.0,
+                        "usage_events": 0
+                    }
+                
+                costs_by_type[record.resource_type]["total_usage"] += record.usage_amount
+                costs_by_type[record.resource_type]["total_cost"] += record.cost
+                costs_by_type[record.resource_type]["usage_events"] += 1
+                total_cost += record.cost
+            
+            return {
+                "tenant_id": tenant_id,
+                "period_start": start_date.isoformat(),
+                "period_end": end_date.isoformat(),
+                "total_cost": round(total_cost, 4),
+                "costs_by_resource": costs_by_type,
+                "currency": "USD"
+            }
+            
+        except Exception as e:
+            logger.error(f"Failed to calculate costs for tenant {tenant_id}: {e}")
+            return {}
+    
+    async def scale_tenant_resources(
+        self, 
+        tenant_id: int, 
+        resource_type: ResourceType, 
+        scale_factor: float
+    ) -> bool:
+        """
+        Scale tenant resources up or down.
+        
+        Args:
+            tenant_id: Tenant database ID
+            resource_type: Type of resource to scale
+            scale_factor: Scaling factor (1.5 = 50% increase, 0.8 = 20% decrease)
+            
+        Returns:
+            True if scaling successful
+        """
+        try:
+            # Get current quota
+            result = await self.db.execute(
+                select(ResourceQuota).where(
+                    and_(
+                        ResourceQuota.tenant_id == tenant_id,
+                        ResourceQuota.resource_type == resource_type.value,
+                        ResourceQuota.is_active == True
+                    )
+                )
+            )
+            quota = result.scalar_one_or_none()
+            
+            if not quota:
+                logger.error(f"No quota found for {resource_type.value} for tenant {tenant_id}")
+                return False
+            
+            # Calculate new limit
+            new_max_value = quota.max_value * scale_factor
+            
+            # Ensure we don't scale below current usage
+            if new_max_value < quota.current_usage:
+                logger.warning(
+                    f"Cannot scale {resource_type.value} below current usage: "
+                    f"{new_max_value} < {quota.current_usage}"
+                )
+                return False
+            
+            # Update quota
+            quota.max_value = new_max_value
+            quota.updated_at = datetime.utcnow()
+            
+            await self.db.commit()
+            
+            logger.info(
+                f"Scaled {resource_type.value} for tenant {tenant_id} by {scale_factor}x to {new_max_value}"
+            )
+            return True
+            
+        except Exception as e:
+            logger.error(f"Failed to scale resources for tenant {tenant_id}: {e}")
+            await self.db.rollback()
+            return False
+    
+    async def get_system_resource_overview(self) -> Dict[str, Any]:
+        """
+        Get system-wide resource usage overview.
+        
+        Returns:
+            System resource usage statistics
+        """
+        try:
+            # Get aggregate usage by resource type
+            result = await self.db.execute(
+                select(
+                    ResourceQuota.resource_type,
+                    func.sum(ResourceQuota.current_usage).label('total_usage'),
+                    func.sum(ResourceQuota.max_value).label('total_allocated'),
+                    func.count(ResourceQuota.tenant_id).label('tenant_count')
+                ).where(ResourceQuota.is_active == True)
+                .group_by(ResourceQuota.resource_type)
+            )
+            
+            overview = {}
+            
+            for row in result:
+                resource_type = row.resource_type
+                total_usage = float(row.total_usage or 0)
+                total_allocated = float(row.total_allocated or 0)
+                tenant_count = int(row.tenant_count or 0)
+                
+                utilization = (total_usage / total_allocated) * 100 if total_allocated > 0 else 0
+                
+                overview[resource_type] = {
+                    "total_usage": total_usage,
+                    "total_allocated": total_allocated,
+                    "utilization_percentage": round(utilization, 2),
+                    "tenant_count": tenant_count
+                }
+            
+            return {
+                "timestamp": datetime.utcnow().isoformat(),
+                "resource_overview": overview,
+                "total_tenants": len(set([row.tenant_count for row in result]))
+            }
+            
+        except Exception as e:
+            logger.error(f"Failed to get system resource overview: {e}")
+            return {}
+    
+    async def get_resource_alerts(self, tenant_id: Optional[int] = None, hours: int = 24) -> List[Dict[str, Any]]:
+        """
+        Get resource alerts for tenant(s).
+        
+        Args:
+            tenant_id: Specific tenant ID (None for all tenants)
+            hours: Hours back to look for alerts
+            
+        Returns:
+            List of alert dictionaries
+        """
+        try:
+            query = select(ResourceAlert).where(
+                ResourceAlert.created_at >= datetime.utcnow() - timedelta(hours=hours)
+            )
+            
+            if tenant_id:
+                query = query.where(ResourceAlert.tenant_id == tenant_id)
+            
+            query = query.order_by(ResourceAlert.created_at.desc())
+            
+            result = await self.db.execute(query)
+            alerts = result.scalars().all()
+            
+            return [
+                {
+                    "id": alert.id,
+                    "tenant_id": alert.tenant_id,
+                    "resource_type": alert.resource_type,
+                    "alert_level": alert.alert_level,
+                    "message": alert.message,
+                    "current_usage": alert.current_usage,
+                    "max_value": alert.max_value,
+                    "percentage_used": alert.percentage_used,
+                    "created_at": alert.created_at.isoformat()
+                }
+                for alert in alerts
+            ]
+            
+        except Exception as e:
+            logger.error(f"Failed to get resource alerts: {e}")
+            return []