# gt-ai-os-community/apps/control-panel-backend/app/services/resource_allocation.py

"""
GT 2.0 Resource Allocation Management Service

Manages CPU, memory, storage, and API quotas for tenants following GT 2.0 principles:
- Granular resource control per tenant
- Real-time usage monitoring
- Automatic scaling within limits
- Cost tracking and optimization
"""

import asyncio
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional, Tuple
from enum import Enum

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, update, func, and_

from app.models.tenant import Tenant
from app.models.resource_usage import ResourceUsage, ResourceQuota, ResourceAlert
from app.core.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class ResourceType(Enum):
    """
    Kinds of resources that can be metered and capped per tenant.

    The string value of each member is what the service writes to and reads
    from the ``resource_type`` column of quota, usage and alert rows.
    """
    # Metered in "cores" by the built-in templates.
    CPU = "cpu"
    # Metered in "MB" by the built-in templates.
    MEMORY = "memory"
    # Metered in "MB" by the built-in templates.
    STORAGE = "storage"
    # Metered in "calls/hour" by the built-in templates.
    API_CALLS = "api_calls"
    # Metered in "minutes"; only the "enterprise" template allocates it.
    GPU_TIME = "gpu_time"
    # No built-in template allocates this resource.
    VECTOR_OPERATIONS = "vector_operations"
    # Metered in "tokens" by the built-in templates.
    MODEL_INFERENCE = "model_inference"


class AlertLevel(Enum):
    """
    Severity attached to a resource usage alert.

    The string value of each member is what gets stored in the
    ``alert_level`` column of alert rows.
    """
    # Declared for completeness; the alert check in this module never emits it.
    INFO = "info"
    # Emitted when usage reaches a quota's warning threshold.
    WARNING = "warning"
    # Emitted when usage reaches a quota's critical threshold.
    CRITICAL = "critical"


@dataclass
class ResourceLimit:
    """
    Template entry describing the cap and pricing for one resource type.

    Instances are used as the values of the service's resource templates and
    are copied field-by-field into a quota row when a tenant is allocated.
    """
    # Which resource this limit applies to.
    resource_type: ResourceType
    # Hard cap, expressed in `unit`.
    max_value: float
    # Fraction of max_value (0-1) at which a warning alert is raised.
    warning_threshold: float = 0.8  # 80% of max
    # Fraction of max_value (0-1) at which a critical alert is raised.
    critical_threshold: float = 0.95  # 95% of max
    # Human-readable unit label, e.g. "cores" or "MB".
    unit: str = "units"
    # Price charged per unit of usage.
    cost_per_unit: float = 0.0


@dataclass
class ResourceUsageData:
    """
    Snapshot of a tenant's usage of a single resource.

    Built from an active quota row when a tenant's usage is reported.
    """
    # Which resource this snapshot describes.
    resource_type: ResourceType
    # Usage currently recorded on the quota.
    current_usage: float
    # The quota's max_value.
    max_allowed: float
    # current_usage / max_allowed on a 0-100 scale (0 when the cap is not positive).
    percentage_used: float
    # current_usage multiplied by the quota's cost_per_unit.
    cost_accrued: float
    # The quota row's updated_at timestamp.
    last_updated: datetime


class ResourceAllocationService:
    """
    Service for managing resource allocation and monitoring usage across tenants.

    Features:
    - Dynamic quota allocation
    - Real-time usage tracking
    - Automatic scaling policies
    - Cost optimization
    - Alert generation
    """

    def __init__(self, db: AsyncSession):
        """
        Bind the service to a database session and build the default templates.

        Args:
            db: Async SQLAlchemy session used for every query and commit.
        """
        self.db = db

        def tier(cpu, memory, storage, api_calls, inference, gpu_minutes=None):
            # Every tier shares the same units and unit prices; only the caps
            # differ, so the caps are the only inputs.
            limits = {
                ResourceType.CPU: ResourceLimit(
                    ResourceType.CPU, cpu, unit="cores", cost_per_unit=0.10
                ),
                ResourceType.MEMORY: ResourceLimit(
                    ResourceType.MEMORY, memory, unit="MB", cost_per_unit=0.05
                ),
                ResourceType.STORAGE: ResourceLimit(
                    ResourceType.STORAGE, storage, unit="MB", cost_per_unit=0.01
                ),
                ResourceType.API_CALLS: ResourceLimit(
                    ResourceType.API_CALLS, api_calls, unit="calls/hour", cost_per_unit=0.001
                ),
                ResourceType.MODEL_INFERENCE: ResourceLimit(
                    ResourceType.MODEL_INFERENCE, inference, unit="tokens", cost_per_unit=0.002
                ),
            }
            # GPU time is optional; a tier without it has no GPU_TIME entry at all.
            if gpu_minutes is not None:
                limits[ResourceType.GPU_TIME] = ResourceLimit(
                    ResourceType.GPU_TIME, gpu_minutes, unit="minutes", cost_per_unit=0.50
                )
            return limits

        # Default resource templates, keyed by template name
        self.resource_templates = {
            "startup": tier(2.0, 4096, 10240, 10000, 1000),
            "standard": tier(4.0, 8192, 51200, 50000, 10000),
            "enterprise": tier(16.0, 32768, 102400, 200000, 100000, gpu_minutes=1000),
        }

    async def allocate_resources(self, tenant_id: int, template: str = "standard") -> bool:
        """
        Allocate initial resources to a tenant based on template.

        One active quota row, starting at zero usage, is created for every
        resource type listed in the chosen template; all rows are committed
        together.

        NOTE(review): existing active quotas are not checked, so calling this
        twice for the same tenant adds a second set of rows. The quota lookups
        in update_resource_usage / scale_tenant_resources use
        scalar_one_or_none(), which raises when more than one row matches.

        Args:
            tenant_id: Tenant database ID
            template: Resource template name (a key of ``self.resource_templates``)

        Returns:
            True if allocation successful; False if the template or tenant is
            unknown, or if the database operation failed (the session is
            rolled back in that case).
        """
        try:
            # Validate the template first: it needs no database round-trip.
            resources = self.resource_templates.get(template)
            if resources is None:
                logger.error(f"Unknown resource template: {template}")
                return False

            # Get tenant
            result = await self.db.execute(select(Tenant).where(Tenant.id == tenant_id))
            tenant = result.scalar_one_or_none()

            if not tenant:
                logger.error(f"Tenant {tenant_id} not found")
                return False

            # Read the domain before committing. commit() expires loaded
            # attributes by default, and an async session cannot reload them
            # implicitly, so reading tenant.domain afterwards could raise and
            # make a successful allocation look like a failure.
            tenant_domain = tenant.domain

            # Create resource quotas
            self.db.add_all(
                ResourceQuota(
                    tenant_id=tenant_id,
                    resource_type=resource_type.value,
                    max_value=limit.max_value,
                    warning_threshold=limit.warning_threshold,
                    critical_threshold=limit.critical_threshold,
                    unit=limit.unit,
                    cost_per_unit=limit.cost_per_unit,
                    current_usage=0.0,
                    is_active=True
                )
                for resource_type, limit in resources.items()
            )

            await self.db.commit()

            logger.info(f"Allocated {template} resources to tenant {tenant_domain}")
            return True

        except Exception as e:
            logger.error(f"Failed to allocate resources to tenant {tenant_id}: {e}")
            await self.db.rollback()
            return False

    async def get_tenant_resource_usage(self, tenant_id: int) -> Dict[str, ResourceUsageData]:
        """
        Report how much of each active quota a tenant is currently using.

        Args:
            tenant_id: Tenant database ID

        Returns:
            Mapping from the quota's resource type string to a usage snapshot.
            An empty dict is returned when the tenant has no active quotas or
            when anything goes wrong while building the report.
        """
        try:
            # Active quotas only; deactivated rows are ignored
            active_quotas = select(ResourceQuota).where(
                and_(ResourceQuota.tenant_id == tenant_id, ResourceQuota.is_active == True)
            )
            rows = (await self.db.execute(active_quotas)).scalars().all()

            report = {}
            for row in rows:
                kind = ResourceType(row.resource_type)

                # A non-positive cap would make the ratio meaningless; report 0
                if row.max_value > 0:
                    percent = (row.current_usage / row.max_value) * 100
                else:
                    percent = 0

                report[row.resource_type] = ResourceUsageData(
                    resource_type=kind,
                    current_usage=row.current_usage,
                    max_allowed=row.max_value,
                    percentage_used=percent,
                    cost_accrued=row.current_usage * row.cost_per_unit,
                    last_updated=row.updated_at
                )

            return report

        except Exception as e:
            logger.error(f"Failed to get resource usage for tenant {tenant_id}: {e}")
            return {}

    async def update_resource_usage(
        self,
        tenant_id: int,
        resource_type: ResourceType,
        usage_delta: float
    ) -> bool:
        """
        Update resource usage for a tenant.

        Applies ``usage_delta`` to the tenant's active quota for
        ``resource_type``, appends a usage history record, commits, and then
        runs the alert check on a best-effort basis.

        Usage never drops below zero. An increase that would push usage above
        the quota's ``max_value`` is rejected; a decrease is always accepted,
        even if usage stays above the cap afterwards.

        Args:
            tenant_id: Tenant database ID
            resource_type: Type of resource being used
            usage_delta: Change in usage (positive for increase, negative for decrease)

        Returns:
            True if the usage change was committed. False if there is no
            active quota, the increase would exceed the quota, or the
            database operation failed (the session is rolled back then).
        """
        try:
            # Get resource quota
            result = await self.db.execute(
                select(ResourceQuota).where(
                    and_(
                        ResourceQuota.tenant_id == tenant_id,
                        ResourceQuota.resource_type == resource_type.value,
                        ResourceQuota.is_active == True
                    )
                )
            )
            quota = result.scalar_one_or_none()

            if not quota:
                logger.warning(f"No quota found for {resource_type.value} for tenant {tenant_id}")
                return False

            # Calculate new usage, clamped so it can never go negative
            unclamped_usage = quota.current_usage + usage_delta
            new_usage = max(0, unclamped_usage)

            # Only an increase can violate the quota. Rejecting a decrease
            # would make it impossible to release resources while over the cap.
            if usage_delta > 0 and new_usage > quota.max_value:
                logger.warning(
                    f"Resource usage would exceed quota for tenant {tenant_id}: "
                    f"{resource_type.value} {new_usage} > {quota.max_value}"
                )
                return False

            # Record what was actually applied: when the clamp kicked in, the
            # real change is smaller than the requested delta.
            if unclamped_usage >= 0:
                applied_delta = usage_delta
            else:
                applied_delta = new_usage - quota.current_usage

            # Use one timestamp so the quota and its history record agree
            now = datetime.utcnow()

            # Update usage
            quota.current_usage = new_usage
            quota.updated_at = now

            # Record usage history
            usage_record = ResourceUsage(
                tenant_id=tenant_id,
                resource_type=resource_type.value,
                usage_amount=applied_delta,
                timestamp=now,
                cost=applied_delta * quota.cost_per_unit
            )

            self.db.add(usage_record)
            await self.db.commit()

        except Exception as e:
            logger.error(f"Failed to update resource usage: {e}")
            await self.db.rollback()
            return False

        # Check for alerts. The usage change is already committed, so a
        # failure here must not turn the result into False.
        try:
            # commit() expires loaded attributes by default and an async
            # session cannot reload them implicitly; refresh explicitly so the
            # alert check can read the quota's fields.
            await self.db.refresh(quota)
            await self._check_usage_alerts(tenant_id, quota)
        except Exception as e:
            logger.error(f"Failed to check usage alerts: {e}")

        return True

    async def _check_usage_alerts(self, tenant_id: int, quota: ResourceQuota) -> None:
        """
        Create a warning or critical alert when a quota's usage crosses a threshold.

        At most one alert per tenant, resource type and level is created per
        hour; if a matching alert already exists in that window nothing is
        written. This check is best-effort: every failure is logged and
        swallowed so that it never propagates to the caller.

        Args:
            tenant_id: Tenant database ID
            quota: Quota row whose current usage should be evaluated
        """
        try:
            # Thresholds are fractions of max_value (e.g. 0.8), so usage is
            # compared as a 0-1 ratio, not as a 0-100 percentage.
            usage_ratio = (quota.current_usage / quota.max_value) if quota.max_value > 0 else 0

            if usage_ratio >= quota.critical_threshold:
                alert_level = AlertLevel.CRITICAL
                label = "Critical"
            elif usage_ratio >= quota.warning_threshold:
                alert_level = AlertLevel.WARNING
                label = "Warning"
            else:
                # Below both thresholds: nothing to report
                return

            # The "%" format type multiplies by 100, so 0.9 renders as "90.0%"
            message = f"{label}: {quota.resource_type} usage at {usage_ratio:.1%}"

            # Check if we already have a recent alert. Only existence matters,
            # so fetch at most one row instead of requiring exactly one.
            recent_alert = await self.db.execute(
                select(ResourceAlert).where(
                    and_(
                        ResourceAlert.tenant_id == tenant_id,
                        ResourceAlert.resource_type == quota.resource_type,
                        ResourceAlert.alert_level == alert_level.value,
                        ResourceAlert.created_at >= datetime.utcnow() - timedelta(hours=1)
                    )
                ).limit(1)
            )

            if recent_alert.scalars().first() is not None:
                return

            # Create new alert
            alert = ResourceAlert(
                tenant_id=tenant_id,
                resource_type=quota.resource_type,
                alert_level=alert_level.value,
                message=message,
                current_usage=quota.current_usage,
                max_value=quota.max_value,
                # NOTE(review): stored as a 0-1 ratio, unchanged from before,
                # although the column name suggests 0-100 - confirm with consumers.
                percentage_used=usage_ratio
            )

            self.db.add(alert)
            await self.db.commit()

            logger.warning(f"Resource alert for tenant {tenant_id}: {message}")

        except Exception as e:
            logger.error(f"Failed to check usage alerts: {e}")
            # Leave the session usable after a failed flush/commit, while
            # keeping this method best-effort.
            try:
                await self.db.rollback()
            except Exception as rollback_error:
                logger.error(f"Failed to roll back after alert check failure: {rollback_error}")

    async def get_tenant_costs(self, tenant_id: int, start_date: datetime, end_date: datetime) -> Dict[str, Any]:
        """
        Summarise a tenant's recorded usage costs between two timestamps.

        Args:
            tenant_id: Tenant database ID
            start_date: Inclusive start of the period
            end_date: Inclusive end of the period

        Returns:
            Dict with the period, the overall total (rounded to 4 decimals)
            and, per resource type, the summed usage, summed cost and number
            of usage records. An empty dict is returned on failure.
        """
        try:
            # Usage records for this tenant inside the (inclusive) window
            in_period = and_(
                ResourceUsage.tenant_id == tenant_id,
                ResourceUsage.timestamp >= start_date,
                ResourceUsage.timestamp <= end_date
            )
            fetched = await self.db.execute(select(ResourceUsage).where(in_period))

            breakdown = {}
            grand_total = 0.0

            for entry in fetched.scalars().all():
                # First record of a resource type opens its bucket
                bucket = breakdown.setdefault(
                    entry.resource_type,
                    {"total_usage": 0.0, "total_cost": 0.0, "usage_events": 0},
                )
                bucket["total_usage"] += entry.usage_amount
                bucket["total_cost"] += entry.cost
                bucket["usage_events"] += 1
                grand_total += entry.cost

            summary = {
                "tenant_id": tenant_id,
                "period_start": start_date.isoformat(),
                "period_end": end_date.isoformat(),
                "total_cost": round(grand_total, 4),
                "costs_by_resource": breakdown,
                "currency": "USD"
            }
            return summary

        except Exception as e:
            logger.error(f"Failed to calculate costs for tenant {tenant_id}: {e}")
            return {}

    async def scale_tenant_resources(
        self,
        tenant_id: int,
        resource_type: ResourceType,
        scale_factor: float
    ) -> bool:
        """
        Scale tenant resources up or down.

        Multiplies the ``max_value`` of the tenant's active quota for
        ``resource_type`` by ``scale_factor`` and commits the change.

        Args:
            tenant_id: Tenant database ID
            resource_type: Type of resource to scale
            scale_factor: Scaling factor (1.5 = 50% increase, 0.8 = 20% decrease).
                Must be a positive, finite number.

        Returns:
            True if scaling successful. False if the factor is invalid, no
            active quota exists, the new limit would fall below current
            usage, or the database operation failed (the session is rolled
            back in that case).
        """
        # Reject factors that cannot yield a usable limit. Zero would collapse
        # the quota to 0, which no later multiplication can undo, and NaN would
        # slip past the "below current usage" comparison and be stored. The
        # chained comparison is False for NaN, infinity and anything <= 0.
        if not (0 < scale_factor < float("inf")):
            logger.error(
                f"Invalid scale factor {scale_factor} for {resource_type.value} "
                f"for tenant {tenant_id}"
            )
            return False

        try:
            # Get current quota
            result = await self.db.execute(
                select(ResourceQuota).where(
                    and_(
                        ResourceQuota.tenant_id == tenant_id,
                        ResourceQuota.resource_type == resource_type.value,
                        ResourceQuota.is_active == True
                    )
                )
            )
            quota = result.scalar_one_or_none()

            if not quota:
                logger.error(f"No quota found for {resource_type.value} for tenant {tenant_id}")
                return False

            # Calculate new limit
            new_max_value = quota.max_value * scale_factor

            # Ensure we don't scale below current usage
            if new_max_value < quota.current_usage:
                logger.warning(
                    f"Cannot scale {resource_type.value} below current usage: "
                    f"{new_max_value} < {quota.current_usage}"
                )
                return False

            # Update quota
            quota.max_value = new_max_value
            quota.updated_at = datetime.utcnow()

            await self.db.commit()

            # Only locals are logged here, so no ORM attribute is read after commit
            logger.info(
                f"Scaled {resource_type.value} for tenant {tenant_id} by {scale_factor}x to {new_max_value}"
            )
            return True

        except Exception as e:
            logger.error(f"Failed to scale resources for tenant {tenant_id}: {e}")
            await self.db.rollback()
            return False

    async def get_system_resource_overview(self) -> Dict[str, Any]:
        """
        Get system-wide resource usage overview.

        Aggregates all active quotas by resource type and, separately, counts
        how many distinct tenants own at least one active quota.

        Returns:
            Dict with a UTC timestamp, a per-resource-type overview (total
            usage, total allocated, utilization percentage rounded to 2
            decimals, number of quota rows) and the distinct tenant count.
            An empty dict is returned on failure.
        """
        try:
            # Get aggregate usage by resource type
            result = await self.db.execute(
                select(
                    ResourceQuota.resource_type,
                    func.sum(ResourceQuota.current_usage).label('total_usage'),
                    func.sum(ResourceQuota.max_value).label('total_allocated'),
                    func.count(ResourceQuota.tenant_id).label('tenant_count')
                ).where(ResourceQuota.is_active == True)
                .group_by(ResourceQuota.resource_type)
            )

            overview = {}

            # A result can only be consumed once; .all() materialises the rows
            for row in result.all():
                total_usage = float(row.total_usage or 0)
                total_allocated = float(row.total_allocated or 0)

                utilization = (total_usage / total_allocated) * 100 if total_allocated > 0 else 0

                overview[row.resource_type] = {
                    "total_usage": total_usage,
                    "total_allocated": total_allocated,
                    "utilization_percentage": round(utilization, 2),
                    "tenant_count": int(row.tenant_count or 0)
                }

            # The per-type counts overlap (one tenant holds several resource
            # types), so the number of tenants needs its own DISTINCT count.
            tenant_total = await self.db.execute(
                select(func.count(ResourceQuota.tenant_id.distinct()))
                .where(ResourceQuota.is_active == True)
            )

            return {
                "timestamp": datetime.utcnow().isoformat(),
                "resource_overview": overview,
                "total_tenants": int(tenant_total.scalar() or 0)
            }

        except Exception as e:
            logger.error(f"Failed to get system resource overview: {e}")
            return {}

    async def get_resource_alerts(self, tenant_id: Optional[int] = None, hours: int = 24) -> List[Dict[str, Any]]:
        """
        Get resource alerts for tenant(s).

        Args:
            tenant_id: Specific tenant ID (None for all tenants)
            hours: Hours back to look for alerts

        Returns:
            List of alert dictionaries, newest first. An empty list is
            returned when there are no matching alerts or on failure.
        """
        try:
            cutoff = datetime.utcnow() - timedelta(hours=hours)
            query = select(ResourceAlert).where(ResourceAlert.created_at >= cutoff)

            # Compare against None explicitly: a plain truthiness test would
            # treat tenant ID 0 as "no filter" and return every tenant's alerts.
            if tenant_id is not None:
                query = query.where(ResourceAlert.tenant_id == tenant_id)

            query = query.order_by(ResourceAlert.created_at.desc())

            result = await self.db.execute(query)
            alerts = result.scalars().all()

            return [
                {
                    "id": alert.id,
                    "tenant_id": alert.tenant_id,
                    "resource_type": alert.resource_type,
                    "alert_level": alert.alert_level,
                    "message": alert.message,
                    "current_usage": alert.current_usage,
                    "max_value": alert.max_value,
                    "percentage_used": alert.percentage_used,
                    "created_at": alert.created_at.isoformat()
                }
                for alert in alerts
            ]

        except Exception as e:
            logger.error(f"Failed to get resource alerts: {e}")
            return []