GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (sketched below)
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
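
A minimal sketch of the URL and path hardening described above (helper names such as `is_safe_external_url`, `sanitize_relative_path`, and the allow-list are illustrative assumptions, not the actual GT AI OS utilities):

```python
# Illustrative sketch only - names and the allow-list are assumptions,
# not the shipped GT AI OS implementation.
import ipaddress
import socket
from pathlib import PurePosixPath
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allow-list


def is_safe_external_url(url: str) -> bool:
    """Reject URLs that fail exact hostname matching or resolve to internal addresses."""
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.hostname:
        return False
    # Exact hostname comparison instead of substring matching
    if parsed.hostname not in ALLOWED_HOSTS:
        return False
    # Resolve the hostname and block private/loopback/link-local/reserved targets (SSRF)
    try:
        infos = socket.getaddrinfo(parsed.hostname, parsed.port or 443)
    except socket.gaierror:
        return False
    for info in infos:
        ip = ipaddress.ip_address(info[4][0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True


def sanitize_relative_path(user_path: str, base_dir: str = "/data/uploads") -> str:
    """Join a user-supplied relative path onto base_dir, rejecting traversal attempts."""
    rel = PurePosixPath(user_path)
    if rel.is_absolute() or ".." in rel.parts:
        raise ValueError("Unsafe path rejected")
    return str(PurePosixPath(base_dir) / rel)
```

The DNS check rejects hostnames that resolve to private, loopback, link-local, or reserved ranges, and hostname matching is exact rather than substring-based.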

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed by HackWeasel, 2025-12-12 17:04:45 -05:00
commit b9dfb86260
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,931 @@
"""
GT 2.0 Resource Cluster - Service Manager
Orchestrates external web services (CTFd, Canvas LMS, Guacamole, JupyterHub)
with strict tenant isolation and security controls.
"""
import asyncio
import json
import logging
import subprocess
import uuid
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict, field
from pathlib import Path
try:
import docker
import kubernetes
from kubernetes import client, config
from kubernetes.client.rest import ApiException
DOCKER_AVAILABLE = True
KUBERNETES_AVAILABLE = True
except ImportError:
# For development containerization mode, these are optional
docker = None
kubernetes = None
client = None
config = None
ApiException = Exception
DOCKER_AVAILABLE = False
KUBERNETES_AVAILABLE = False
from app.core.config import get_settings
from app.core.security import verify_capability_token
from app.utils.encryption import encrypt_data, decrypt_data
logger = logging.getLogger(__name__)
@dataclass
class ServiceInstance:
"""Represents a deployed service instance"""
instance_id: str
tenant_id: str
service_type: str # 'ctfd', 'canvas', 'guacamole', 'jupyter'
status: str # 'starting', 'running', 'stopping', 'stopped', 'error'
endpoint_url: str
internal_port: int
external_port: int
namespace: str
deployment_name: str
service_name: str
ingress_name: str
sso_token: Optional[str] = None
created_at: datetime = field(default_factory=datetime.utcnow)
last_heartbeat: datetime = field(default_factory=datetime.utcnow)
resource_usage: Optional[Dict[str, Any]] = None
def to_dict(self) -> Dict[str, Any]:
data = asdict(self)
data['created_at'] = self.created_at.isoformat()
data['last_heartbeat'] = self.last_heartbeat.isoformat()
return data
@dataclass
class ServiceTemplate:
"""Service deployment template configuration"""
service_type: str
image: str
ports: Dict[str, int]
environment: Dict[str, str]
volumes: List[Dict[str, str]]
resource_limits: Dict[str, str]
security_context: Dict[str, Any]
health_check: Dict[str, Any]
sso_config: Dict[str, Any]
class ServiceManager:
"""Manages external web service instances with Kubernetes orchestration"""
def __init__(self):
# Initialize Docker client if available
if DOCKER_AVAILABLE:
try:
self.docker_client = docker.from_env()
except Exception as e:
logger.warning(f"Could not initialize Docker client: {e}")
self.docker_client = None
else:
self.docker_client = None
self.k8s_client = None
self.active_instances: Dict[str, ServiceInstance] = {}
self.service_templates: Dict[str, ServiceTemplate] = {}
self.base_namespace = "gt-services"
self.storage_path = Path("/tmp/resource-cluster/services")
self.storage_path.mkdir(parents=True, exist_ok=True)
# Initialize Kubernetes client if available
if KUBERNETES_AVAILABLE:
    config_loaded = True
    try:
        config.load_incluster_config()  # If running in cluster
    except Exception:
        try:
            config.load_kube_config()  # If running locally
        except Exception:
            config_loaded = False
            logger.warning("Could not load Kubernetes config - using mock mode")
    # Only build a real API client when a config was actually loaded;
    # otherwise leave k8s_client as None so the mock-mode paths are used
    self.k8s_client = client.ApiClient() if (client and config_loaded) else None
else:
logger.warning("Kubernetes not available - running in development containerization mode")
self._initialize_service_templates()
self._load_persistent_instances()
def _initialize_service_templates(self):
"""Initialize service deployment templates"""
# CTFd Template
self.service_templates['ctfd'] = ServiceTemplate(
service_type='ctfd',
image='ctfd/ctfd:3.6.0',
ports={'http': 8000},
environment={
'SECRET_KEY': '${TENANT_SECRET_KEY}',
'DATABASE_URL': 'sqlite:////data/ctfd.db',
'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants',
'UPLOAD_FOLDER': '/data/uploads',
'LOG_FOLDER': '/data/logs',
},
volumes=[
{'name': 'ctfd-data', 'mountPath': '/data', 'size': '5Gi'},
{'name': 'ctfd-uploads', 'mountPath': '/uploads', 'size': '2Gi'}
],
resource_limits={
'memory': '2Gi',
'cpu': '1000m'
},
security_context={
'runAsNonRoot': True,
'runAsUser': 1000,
'fsGroup': 1000,
'readOnlyRootFilesystem': False
},
health_check={
'path': '/health',
'port': 8000,
'initial_delay': 30,
'period': 10
},
sso_config={
'enabled': True,
'provider': 'oauth2',
'callback_path': '/auth/oauth/callback'
}
)
# Canvas LMS Template
self.service_templates['canvas'] = ServiceTemplate(
service_type='canvas',
image='instructure/canvas-lms:stable',
ports={'http': 3000},
environment={
'CANVAS_LMS_ADMIN_EMAIL': 'admin@${TENANT_DOMAIN}',
'CANVAS_LMS_ADMIN_PASSWORD': '${CANVAS_ADMIN_PASSWORD}',
'CANVAS_LMS_ACCOUNT_NAME': '${TENANT_NAME}',
'CANVAS_LMS_STATS_COLLECTION': 'opt_out',
'POSTGRES_PASSWORD': '${POSTGRES_PASSWORD}',
'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants'
},
volumes=[
{'name': 'canvas-data', 'mountPath': '/app/log', 'size': '10Gi'},
{'name': 'canvas-files', 'mountPath': '/app/public/files', 'size': '20Gi'}
],
resource_limits={
'memory': '4Gi',
'cpu': '2000m'
},
security_context={
'runAsNonRoot': True,
'runAsUser': 1000,
'fsGroup': 1000
},
health_check={
'path': '/health_check',
'port': 3000,
'initial_delay': 60,
'period': 15
},
sso_config={
'enabled': True,
'provider': 'saml',
'metadata_url': '/auth/saml/metadata'
}
)
# Guacamole Template
self.service_templates['guacamole'] = ServiceTemplate(
service_type='guacamole',
image='guacamole/guacamole:1.5.3',
ports={'http': 8080},
environment={
'GUACD_HOSTNAME': 'guacd',
'GUACD_PORT': '4822',
'MYSQL_HOSTNAME': 'mysql',
'MYSQL_PORT': '3306',
'MYSQL_DATABASE': 'guacamole_db',
'MYSQL_USER': 'guacamole_user',
'MYSQL_PASSWORD': '${MYSQL_PASSWORD}',
'GUAC_LOG_LEVEL': 'INFO'
},
volumes=[
{'name': 'guacamole-data', 'mountPath': '/config', 'size': '1Gi'},
{'name': 'guacamole-recordings', 'mountPath': '/recordings', 'size': '10Gi'}
],
resource_limits={
'memory': '1Gi',
'cpu': '500m'
},
security_context={
'runAsNonRoot': True,
'runAsUser': 1001,
'fsGroup': 1001
},
health_check={
'path': '/guacamole',
'port': 8080,
'initial_delay': 45,
'period': 10
},
sso_config={
'enabled': True,
'provider': 'openid',
'extension': 'guacamole-auth-openid'
}
)
# JupyterHub Template
self.service_templates['jupyter'] = ServiceTemplate(
service_type='jupyter',
image='jupyterhub/jupyterhub:4.0',
ports={'http': 8000},
environment={
'JUPYTERHUB_CRYPT_KEY': '${JUPYTERHUB_CRYPT_KEY}',
'CONFIGPROXY_AUTH_TOKEN': '${CONFIGPROXY_AUTH_TOKEN}',
'DOCKER_NETWORK_NAME': 'jupyterhub',
'DOCKER_NOTEBOOK_IMAGE': 'jupyter/datascience-notebook:lab-4.0.7'
},
volumes=[
{'name': 'jupyter-data', 'mountPath': '/srv/jupyterhub', 'size': '5Gi'},
{'name': 'docker-socket', 'mountPath': '/var/run/docker.sock', 'hostPath': '/var/run/docker.sock'}
],
resource_limits={
'memory': '2Gi',
'cpu': '1000m'
},
security_context={
'runAsNonRoot': False, # Needs Docker access
'runAsUser': 0,
'privileged': True
},
health_check={
'path': '/hub/health',
'port': 8000,
'initial_delay': 30,
'period': 15
},
sso_config={
'enabled': True,
'provider': 'oauth',
'authenticator_class': 'oauthenticator.generic.GenericOAuthenticator'
}
)
async def create_service_instance(
self,
tenant_id: str,
service_type: str,
config_overrides: Dict[str, Any] = None
) -> ServiceInstance:
"""Create a new service instance for a tenant"""
if service_type not in self.service_templates:
raise ValueError(f"Unsupported service type: {service_type}")
template = self.service_templates[service_type]
instance_id = f"{service_type}-{tenant_id}-{uuid.uuid4().hex[:8]}"
namespace = f"{self.base_namespace}-{tenant_id}"
# Generate unique ports
external_port = await self._get_available_port()
# Create service instance object
instance = ServiceInstance(
instance_id=instance_id,
tenant_id=tenant_id,
service_type=service_type,
status='starting',
endpoint_url=f"https://{service_type}.{tenant_id}.gt2.com",
internal_port=template.ports['http'],
external_port=external_port,
namespace=namespace,
deployment_name=f"{service_type}-{instance_id}",
service_name=f"{service_type}-service-{instance_id}",
ingress_name=f"{service_type}-ingress-{instance_id}",
resource_usage={'cpu': 0, 'memory': 0, 'storage': 0}
)
try:
# Create Kubernetes namespace if not exists
await self._create_namespace(namespace, tenant_id)
# Deploy the service
await self._deploy_service(instance, template, config_overrides)
# Generate SSO token
instance.sso_token = await self._generate_sso_token(instance)
# Store instance
self.active_instances[instance_id] = instance
await self._persist_instance(instance)
logger.info(f"Created {service_type} instance {instance_id} for tenant {tenant_id}")
return instance
except Exception as e:
logger.error(f"Failed to create service instance: {e}")
instance.status = 'error'
raise
async def _create_namespace(self, namespace: str, tenant_id: str):
"""Create Kubernetes namespace with proper labeling and network policies"""
if not self.k8s_client:
logger.info(f"Mock: Created namespace {namespace}")
return
v1 = client.CoreV1Api(self.k8s_client)
# Create namespace
namespace_manifest = client.V1Namespace(
metadata=client.V1ObjectMeta(
name=namespace,
labels={
'gt.tenant-id': tenant_id,
'gt.cluster': 'resource',
'gt.isolation': 'tenant'
},
annotations={
'gt.created-by': 'service-manager',
'gt.creation-time': datetime.utcnow().isoformat()
}
)
)
try:
v1.create_namespace(namespace_manifest)
logger.info(f"Created namespace: {namespace}")
except ApiException as e:
if e.status == 409: # Already exists
logger.info(f"Namespace {namespace} already exists")
else:
raise
# Apply network policy for tenant isolation
await self._apply_network_policy(namespace, tenant_id)
async def _apply_network_policy(self, namespace: str, tenant_id: str):
"""Apply network policy for tenant isolation"""
if not self.k8s_client:
logger.info(f"Mock: Applied network policy to {namespace}")
return
networking_v1 = client.NetworkingV1Api(self.k8s_client)
# Network policy that only allows:
# 1. Intra-namespace communication
# 2. Communication to system namespaces (DNS, etc.)
# 3. Egress to external services (for updates, etc.)
network_policy = client.V1NetworkPolicy(
metadata=client.V1ObjectMeta(
name=f"tenant-isolation-{tenant_id}",
namespace=namespace,
labels={'gt.tenant-id': tenant_id}
),
spec=client.V1NetworkPolicySpec(
pod_selector=client.V1LabelSelector(), # All pods in namespace
policy_types=['Ingress', 'Egress'],
ingress=[
# Allow ingress from same namespace
client.V1NetworkPolicyIngressRule(
_from=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': namespace}
)
)]
),
# Allow ingress from ingress controller
client.V1NetworkPolicyIngressRule(
_from=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': 'ingress-nginx'}
)
)]
)
],
egress=[
# Allow egress within namespace
client.V1NetworkPolicyEgressRule(
to=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': namespace}
)
)]
),
# Allow DNS
client.V1NetworkPolicyEgressRule(
to=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': 'kube-system'}
)
)],
ports=[client.V1NetworkPolicyPort(port=53, protocol='UDP')]
),
# Allow external HTTPS (for updates, etc.)
client.V1NetworkPolicyEgressRule(
ports=[
client.V1NetworkPolicyPort(port=443, protocol='TCP'),
client.V1NetworkPolicyPort(port=80, protocol='TCP')
]
)
]
)
)
try:
networking_v1.create_namespaced_network_policy(
namespace=namespace,
body=network_policy
)
logger.info(f"Applied network policy to namespace: {namespace}")
except ApiException as e:
if e.status == 409: # Already exists
logger.info(f"Network policy already exists in {namespace}")
else:
logger.error(f"Failed to create network policy: {e}")
raise
async def _deploy_service(
self,
instance: ServiceInstance,
template: ServiceTemplate,
config_overrides: Dict[str, Any] = None
):
"""Deploy service to Kubernetes cluster"""
if not self.k8s_client:
logger.info(f"Mock: Deployed {template.service_type} service")
instance.status = 'running'
return
# Prepare environment variables with tenant-specific values
environment = template.environment.copy()
if config_overrides:
environment.update(config_overrides.get('environment', {}))
# Substitute tenant-specific values
env_vars = []
for key, value in environment.items():
substituted_value = value.replace('${TENANT_ID}', instance.tenant_id)
substituted_value = substituted_value.replace('${TENANT_DOMAIN}', f"{instance.tenant_id}.gt2.com")
env_vars.append(client.V1EnvVar(name=key, value=substituted_value))
# Create volumes
volumes = []
volume_mounts = []
for vol_config in template.volumes:
vol_name = f"{vol_config['name']}-{instance.instance_id}"
volumes.append(client.V1Volume(
name=vol_name,
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
claim_name=vol_name
)
))
volume_mounts.append(client.V1VolumeMount(
name=vol_name,
mount_path=vol_config['mountPath']
))
# Create PVCs first
await self._create_persistent_volumes(instance, template)
# Create deployment
deployment = client.V1Deployment(
metadata=client.V1ObjectMeta(
name=instance.deployment_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id,
'gt.service-type': template.service_type
}
),
spec=client.V1DeploymentSpec(
replicas=1,
selector=client.V1LabelSelector(
match_labels={'instance': instance.instance_id}
),
template=client.V1PodTemplateSpec(
metadata=client.V1ObjectMeta(
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
}
),
spec=client.V1PodSpec(
containers=[client.V1Container(
name=template.service_type,
image=template.image,
ports=[client.V1ContainerPort(
container_port=template.ports['http']
)],
env=env_vars,
volume_mounts=volume_mounts,
resources=client.V1ResourceRequirements(
limits=template.resource_limits,
requests=template.resource_limits
),
security_context=client.V1SecurityContext(
    # Map the template's camelCase keys to the client's snake_case arguments
    run_as_non_root=template.security_context.get('runAsNonRoot', True),
    run_as_user=template.security_context.get('runAsUser'),
    privileged=template.security_context.get('privileged', False),
    read_only_root_filesystem=template.security_context.get('readOnlyRootFilesystem')
),
liveness_probe=client.V1Probe(
http_get=client.V1HTTPGetAction(
path=template.health_check['path'],
port=template.health_check['port']
),
initial_delay_seconds=template.health_check['initial_delay'],
period_seconds=template.health_check['period']
),
readiness_probe=client.V1Probe(
http_get=client.V1HTTPGetAction(
path=template.health_check['path'],
port=template.health_check['port']
),
initial_delay_seconds=10,
period_seconds=5
)
)],
volumes=volumes,
security_context=client.V1PodSecurityContext(
run_as_non_root=template.security_context.get('runAsNonRoot', True),
fs_group=template.security_context.get('fsGroup', 1000)
)
)
)
)
)
# Deploy to Kubernetes
apps_v1 = client.AppsV1Api(self.k8s_client)
apps_v1.create_namespaced_deployment(
namespace=instance.namespace,
body=deployment
)
# Create service
await self._create_service(instance, template)
# Create ingress
await self._create_ingress(instance, template)
logger.info(f"Deployed {template.service_type} service: {instance.deployment_name}")
async def _create_persistent_volumes(self, instance: ServiceInstance, template: ServiceTemplate):
"""Create persistent volume claims for the service"""
if not self.k8s_client:
return
v1 = client.CoreV1Api(self.k8s_client)
for vol_config in template.volumes:
if 'hostPath' in vol_config: # Skip host path volumes
continue
pvc_name = f"{vol_config['name']}-{instance.instance_id}"
pvc = client.V1PersistentVolumeClaim(
metadata=client.V1ObjectMeta(
name=pvc_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
}
),
spec=client.V1PersistentVolumeClaimSpec(
access_modes=['ReadWriteOnce'],
resources=client.V1ResourceRequirements(
requests={'storage': vol_config['size']}
),
storage_class_name='fast-ssd' # Assuming SSD storage class
)
)
try:
v1.create_namespaced_persistent_volume_claim(
namespace=instance.namespace,
body=pvc
)
logger.info(f"Created PVC: {pvc_name}")
except ApiException as e:
if e.status != 409: # Ignore if already exists
raise
async def _create_service(self, instance: ServiceInstance, template: ServiceTemplate):
"""Create Kubernetes service for the instance"""
if not self.k8s_client:
return
v1 = client.CoreV1Api(self.k8s_client)
service = client.V1Service(
metadata=client.V1ObjectMeta(
name=instance.service_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
}
),
spec=client.V1ServiceSpec(
selector={'instance': instance.instance_id},
ports=[client.V1ServicePort(
port=80,
target_port=template.ports['http'],
protocol='TCP'
)],
type='ClusterIP'
)
)
v1.create_namespaced_service(
namespace=instance.namespace,
body=service
)
logger.info(f"Created service: {instance.service_name}")
async def _create_ingress(self, instance: ServiceInstance, template: ServiceTemplate):
"""Create ingress for external access with TLS"""
if not self.k8s_client:
return
networking_v1 = client.NetworkingV1Api(self.k8s_client)
hostname = f"{template.service_type}.{instance.tenant_id}.gt2.com"
ingress = client.V1Ingress(
metadata=client.V1ObjectMeta(
name=instance.ingress_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
},
annotations={
'kubernetes.io/ingress.class': 'nginx',
'cert-manager.io/cluster-issuer': 'letsencrypt-prod',
'nginx.ingress.kubernetes.io/ssl-redirect': 'true',
'nginx.ingress.kubernetes.io/force-ssl-redirect': 'true',
'nginx.ingress.kubernetes.io/auth-url': f'https://auth.{instance.tenant_id}.gt2.com/auth',
'nginx.ingress.kubernetes.io/auth-signin': f'https://auth.{instance.tenant_id}.gt2.com/signin'
}
),
spec=client.V1IngressSpec(
tls=[client.V1IngressTLS(
hosts=[hostname],
secret_name=f"{template.service_type}-tls-{instance.instance_id}"
)],
rules=[client.V1IngressRule(
host=hostname,
http=client.V1HTTPIngressRuleValue(
paths=[client.V1HTTPIngressPath(
path='/',
path_type='Prefix',
backend=client.V1IngressBackend(
service=client.V1IngressServiceBackend(
name=instance.service_name,
port=client.V1ServiceBackendPort(number=80)
)
)
)]
)
)]
)
)
networking_v1.create_namespaced_ingress(
namespace=instance.namespace,
body=ingress
)
logger.info(f"Created ingress: {instance.ingress_name} for {hostname}")
async def _get_available_port(self) -> int:
"""Get next available port for service"""
used_ports = {instance.external_port for instance in self.active_instances.values()}
port = 30000 # Start from NodePort range
while port in used_ports:
port += 1
return port
async def _generate_sso_token(self, instance: ServiceInstance) -> str:
"""Generate SSO token for iframe embedding"""
token_data = {
'tenant_id': instance.tenant_id,
'service_type': instance.service_type,
'instance_id': instance.instance_id,
'expires_at': (datetime.utcnow() + timedelta(hours=24)).isoformat(),
'permissions': ['read', 'write', 'admin']
}
# Encrypt the token data
encrypted_token = encrypt_data(json.dumps(token_data))
return encrypted_token.decode('utf-8')
async def get_service_instance(self, instance_id: str) -> Optional[ServiceInstance]:
"""Get service instance by ID"""
return self.active_instances.get(instance_id)
async def list_tenant_instances(self, tenant_id: str) -> List[ServiceInstance]:
"""List all service instances for a tenant"""
return [
instance for instance in self.active_instances.values()
if instance.tenant_id == tenant_id
]
async def stop_service_instance(self, instance_id: str) -> bool:
"""Stop a running service instance"""
instance = self.active_instances.get(instance_id)
if not instance:
return False
try:
instance.status = 'stopping'
if self.k8s_client:
# Delete Kubernetes resources
await self._cleanup_kubernetes_resources(instance)
instance.status = 'stopped'
logger.info(f"Stopped service instance: {instance_id}")
return True
except Exception as e:
logger.error(f"Failed to stop instance {instance_id}: {e}")
instance.status = 'error'
return False
async def _cleanup_kubernetes_resources(self, instance: ServiceInstance):
"""Clean up all Kubernetes resources for an instance"""
if not self.k8s_client:
return
apps_v1 = client.AppsV1Api(self.k8s_client)
v1 = client.CoreV1Api(self.k8s_client)
networking_v1 = client.NetworkingV1Api(self.k8s_client)
try:
# Delete deployment
apps_v1.delete_namespaced_deployment(
name=instance.deployment_name,
namespace=instance.namespace,
body=client.V1DeleteOptions()
)
# Delete service
v1.delete_namespaced_service(
name=instance.service_name,
namespace=instance.namespace,
body=client.V1DeleteOptions()
)
# Delete ingress
networking_v1.delete_namespaced_ingress(
name=instance.ingress_name,
namespace=instance.namespace,
body=client.V1DeleteOptions()
)
# Delete PVCs (optional - may want to preserve data)
# Note: In production, you might want to keep PVCs for data persistence
logger.info(f"Cleaned up Kubernetes resources for: {instance.instance_id}")
except ApiException as e:
logger.error(f"Error cleaning up resources: {e}")
raise
async def get_service_health(self, instance_id: str) -> Dict[str, Any]:
"""Get health status of a service instance"""
instance = self.active_instances.get(instance_id)
if not instance:
return {'status': 'not_found'}
if not self.k8s_client:
return {
'status': 'healthy',
'instance_status': instance.status,
'endpoint': instance.endpoint_url,
'last_check': datetime.utcnow().isoformat()
}
# Check Kubernetes pod status
v1 = client.CoreV1Api(self.k8s_client)
try:
pods = v1.list_namespaced_pod(
namespace=instance.namespace,
label_selector=f'instance={instance.instance_id}'
)
if not pods.items:
return {
'status': 'no_pods',
'instance_status': instance.status
}
pod = pods.items[0]
pod_status = 'unknown'
if pod.status.phase == 'Running':
# Check container status
if pod.status.container_statuses:
container_status = pod.status.container_statuses[0]
if container_status.ready:
pod_status = 'healthy'
else:
pod_status = 'unhealthy'
else:
pod_status = 'starting'
elif pod.status.phase == 'Pending':
pod_status = 'starting'
elif pod.status.phase == 'Failed':
pod_status = 'failed'
# Update instance heartbeat
instance.last_heartbeat = datetime.utcnow()
return {
'status': pod_status,
'instance_status': instance.status,
'pod_phase': pod.status.phase,
'endpoint': instance.endpoint_url,
'last_check': datetime.utcnow().isoformat(),
'restart_count': pod.status.container_statuses[0].restart_count if pod.status.container_statuses else 0
}
except ApiException as e:
logger.error(f"Failed to get health for {instance_id}: {e}")
return {
'status': 'error',
'error': str(e),
'instance_status': instance.status
}
async def _persist_instance(self, instance: ServiceInstance):
"""Persist instance data to disk"""
instance_file = self.storage_path / f"{instance.instance_id}.json"
with open(instance_file, 'w') as f:
json.dump(instance.to_dict(), f, indent=2)
def _load_persistent_instances(self):
"""Load persistent instances from disk on startup"""
if not self.storage_path.exists():
return
for instance_file in self.storage_path.glob("*.json"):
try:
with open(instance_file, 'r') as f:
data = json.load(f)
# Reconstruct instance object
instance = ServiceInstance(
instance_id=data['instance_id'],
tenant_id=data['tenant_id'],
service_type=data['service_type'],
status=data['status'],
endpoint_url=data['endpoint_url'],
internal_port=data['internal_port'],
external_port=data['external_port'],
namespace=data['namespace'],
deployment_name=data['deployment_name'],
service_name=data['service_name'],
ingress_name=data['ingress_name'],
sso_token=data.get('sso_token'),
created_at=datetime.fromisoformat(data['created_at']),
last_heartbeat=datetime.fromisoformat(data['last_heartbeat']),
resource_usage=data.get('resource_usage', {})
)
self.active_instances[instance.instance_id] = instance
logger.info(f"Loaded persistent instance: {instance.instance_id}")
except Exception as e:
logger.error(f"Failed to load instance from {instance_file}: {e}")
async def cleanup_orphaned_resources(self):
"""Clean up orphaned Kubernetes resources"""
if not self.k8s_client:
return
logger.info("Starting cleanup of orphaned resources...")
# This would implement logic to find and clean up:
# 1. Deployments without corresponding instances
# 2. Services without deployments
# 3. Unused PVCs
# 4. Expired certificates
# Implementation would query Kubernetes for resources with GT labels
# and cross-reference with active instances
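# A minimal sketch of that cross-reference pass, assuming every GT-managed
# deployment carries the 'instance' label set in _deploy_service; illustrative
# only, not the shipped implementation.
apps_v1 = client.AppsV1Api(self.k8s_client)
deployments = apps_v1.list_deployment_for_all_namespaces(label_selector='gt.tenant-id')
for deployment in deployments.items:
    instance_id = (deployment.metadata.labels or {}).get('instance')
    if instance_id and instance_id not in self.active_instances:
        logger.warning(f"Deleting orphaned deployment {deployment.metadata.name}")
        apps_v1.delete_namespaced_deployment(
            name=deployment.metadata.name,
            namespace=deployment.metadata.namespace,
            body=client.V1DeleteOptions()
        )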
logger.info("Cleanup completed")