GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (sketched below)
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
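
A minimal sketch of the URL and path hardening described above (helper names such as `is_safe_external_url`, `sanitize_relative_path`, and the allow-list are illustrative assumptions, not the actual GT AI OS utilities):

```python
# Illustrative sketch only - names and the allow-list are assumptions,
# not the shipped GT AI OS implementation.
import ipaddress
import socket
from pathlib import PurePosixPath
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allow-list


def is_safe_external_url(url: str) -> bool:
    """Reject URLs that fail exact hostname matching or resolve to internal addresses."""
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.hostname:
        return False
    # Exact hostname comparison instead of substring matching
    if parsed.hostname not in ALLOWED_HOSTS:
        return False
    # Resolve the hostname and block private/loopback/link-local/reserved targets (SSRF)
    try:
        infos = socket.getaddrinfo(parsed.hostname, parsed.port or 443)
    except socket.gaierror:
        return False
    for info in infos:
        ip = ipaddress.ip_address(info[4][0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True


def sanitize_relative_path(user_path: str, base_dir: str = "/data/uploads") -> str:
    """Join a user-supplied relative path onto base_dir, rejecting traversal attempts."""
    rel = PurePosixPath(user_path)
    if rel.is_absolute() or ".." in rel.parts:
        raise ValueError("Unsafe path rejected")
    return str(PurePosixPath(base_dir) / rel)
```

The DNS check rejects hostnames that resolve to private, loopback, link-local, or reserved ranges, and hostname matching is exact rather than substring-based.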

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed by HackWeasel, 2025-12-12 17:04:45 -05:00
commit b9dfb86260
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,931 @@
"""
GT 2.0 Resource Cluster - Service Manager
Orchestrates external web services (CTFd, Canvas LMS, Guacamole, JupyterHub)
with strict tenant isolation and security controls.
"""
import asyncio
import json
import logging
import subprocess
import uuid
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict, field
from pathlib import Path
try:
import docker
import kubernetes
from kubernetes import client, config
from kubernetes.client.rest import ApiException
DOCKER_AVAILABLE = True
KUBERNETES_AVAILABLE = True
except ImportError:
# For development containerization mode, these are optional
docker = None
kubernetes = None
client = None
config = None
ApiException = Exception
DOCKER_AVAILABLE = False
KUBERNETES_AVAILABLE = False
from app.core.config import get_settings
from app.core.security import verify_capability_token
from app.utils.encryption import encrypt_data, decrypt_data
logger = logging.getLogger(__name__)
@dataclass
class ServiceInstance:
"""Represents a deployed service instance"""
instance_id: str
tenant_id: str
service_type: str # 'ctfd', 'canvas', 'guacamole', 'jupyter'
status: str # 'starting', 'running', 'stopping', 'stopped', 'error'
endpoint_url: str
internal_port: int
external_port: int
namespace: str
deployment_name: str
service_name: str
ingress_name: str
sso_token: Optional[str] = None
created_at: datetime = field(default_factory=datetime.utcnow)
last_heartbeat: datetime = field(default_factory=datetime.utcnow)
resource_usage: Optional[Dict[str, Any]] = None
def to_dict(self) -> Dict[str, Any]:
data = asdict(self)
data['created_at'] = self.created_at.isoformat()
data['last_heartbeat'] = self.last_heartbeat.isoformat()
return data
@dataclass
class ServiceTemplate:
"""Service deployment template configuration"""
service_type: str
image: str
ports: Dict[str, int]
environment: Dict[str, str]
volumes: List[Dict[str, str]]
resource_limits: Dict[str, str]
security_context: Dict[str, Any]
health_check: Dict[str, Any]
sso_config: Dict[str, Any]
class ServiceManager:
"""Manages external web service instances with Kubernetes orchestration"""
def __init__(self):
# Initialize Docker client if available
if DOCKER_AVAILABLE:
try:
self.docker_client = docker.from_env()
except Exception as e:
logger.warning(f"Could not initialize Docker client: {e}")
self.docker_client = None
else:
self.docker_client = None
self.k8s_client = None
self.active_instances: Dict[str, ServiceInstance] = {}
self.service_templates: Dict[str, ServiceTemplate] = {}
self.base_namespace = "gt-services"
self.storage_path = Path("/tmp/resource-cluster/services")
self.storage_path.mkdir(parents=True, exist_ok=True)
# Initialize Kubernetes client if available
if KUBERNETES_AVAILABLE:
    config_loaded = True
    try:
        config.load_incluster_config()  # If running in cluster
    except Exception:
        try:
            config.load_kube_config()  # If running locally
        except Exception:
            config_loaded = False
            logger.warning("Could not load Kubernetes config - using mock mode")
    # Only build a real API client when a config was actually loaded;
    # otherwise leave k8s_client as None so the mock-mode paths are used
    self.k8s_client = client.ApiClient() if (client and config_loaded) else None
else:
logger.warning("Kubernetes not available - running in development containerization mode")
self._initialize_service_templates()
self._load_persistent_instances()
def _initialize_service_templates(self):
"""Initialize service deployment templates"""
# CTFd Template
self.service_templates['ctfd'] = ServiceTemplate(
service_type='ctfd',
image='ctfd/ctfd:3.6.0',
ports={'http': 8000},
environment={
'SECRET_KEY': '${TENANT_SECRET_KEY}',
'DATABASE_URL': 'sqlite:////data/ctfd.db',
'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants',
'UPLOAD_FOLDER': '/data/uploads',
'LOG_FOLDER': '/data/logs',
},
volumes=[
{'name': 'ctfd-data', 'mountPath': '/data', 'size': '5Gi'},
{'name': 'ctfd-uploads', 'mountPath': '/uploads', 'size': '2Gi'}
],
resource_limits={
'memory': '2Gi',
'cpu': '1000m'
},
security_context={
'runAsNonRoot': True,
'runAsUser': 1000,
'fsGroup': 1000,
'readOnlyRootFilesystem': False
},
health_check={
'path': '/health',
'port': 8000,
'initial_delay': 30,
'period': 10
},
sso_config={
'enabled': True,
'provider': 'oauth2',
'callback_path': '/auth/oauth/callback'
}
)
# Canvas LMS Template
self.service_templates['canvas'] = ServiceTemplate(
service_type='canvas',
image='instructure/canvas-lms:stable',
ports={'http': 3000},
environment={
'CANVAS_LMS_ADMIN_EMAIL': 'admin@${TENANT_DOMAIN}',
'CANVAS_LMS_ADMIN_PASSWORD': '${CANVAS_ADMIN_PASSWORD}',
'CANVAS_LMS_ACCOUNT_NAME': '${TENANT_NAME}',
'CANVAS_LMS_STATS_COLLECTION': 'opt_out',
'POSTGRES_PASSWORD': '${POSTGRES_PASSWORD}',
'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants'
},
volumes=[
{'name': 'canvas-data', 'mountPath': '/app/log', 'size': '10Gi'},
{'name': 'canvas-files', 'mountPath': '/app/public/files', 'size': '20Gi'}
],
resource_limits={
'memory': '4Gi',
'cpu': '2000m'
},
security_context={
'runAsNonRoot': True,
'runAsUser': 1000,
'fsGroup': 1000
},
health_check={
'path': '/health_check',
'port': 3000,
'initial_delay': 60,
'period': 15
},
sso_config={
'enabled': True,
'provider': 'saml',
'metadata_url': '/auth/saml/metadata'
}
)
# Guacamole Template
self.service_templates['guacamole'] = ServiceTemplate(
service_type='guacamole',
image='guacamole/guacamole:1.5.3',
ports={'http': 8080},
environment={
'GUACD_HOSTNAME': 'guacd',
'GUACD_PORT': '4822',
'MYSQL_HOSTNAME': 'mysql',
'MYSQL_PORT': '3306',
'MYSQL_DATABASE': 'guacamole_db',
'MYSQL_USER': 'guacamole_user',
'MYSQL_PASSWORD': '${MYSQL_PASSWORD}',
'GUAC_LOG_LEVEL': 'INFO'
},
volumes=[
{'name': 'guacamole-data', 'mountPath': '/config', 'size': '1Gi'},
{'name': 'guacamole-recordings', 'mountPath': '/recordings', 'size': '10Gi'}
],
resource_limits={
'memory': '1Gi',
'cpu': '500m'
},
security_context={
'runAsNonRoot': True,
'runAsUser': 1001,
'fsGroup': 1001
},
health_check={
'path': '/guacamole',
'port': 8080,
'initial_delay': 45,
'period': 10
},
sso_config={
'enabled': True,
'provider': 'openid',
'extension': 'guacamole-auth-openid'
}
)
# JupyterHub Template
self.service_templates['jupyter'] = ServiceTemplate(
service_type='jupyter',
image='jupyterhub/jupyterhub:4.0',
ports={'http': 8000},
environment={
'JUPYTERHUB_CRYPT_KEY': '${JUPYTERHUB_CRYPT_KEY}',
'CONFIGPROXY_AUTH_TOKEN': '${CONFIGPROXY_AUTH_TOKEN}',
'DOCKER_NETWORK_NAME': 'jupyterhub',
'DOCKER_NOTEBOOK_IMAGE': 'jupyter/datascience-notebook:lab-4.0.7'
},
volumes=[
{'name': 'jupyter-data', 'mountPath': '/srv/jupyterhub', 'size': '5Gi'},
{'name': 'docker-socket', 'mountPath': '/var/run/docker.sock', 'hostPath': '/var/run/docker.sock'}
],
resource_limits={
'memory': '2Gi',
'cpu': '1000m'
},
security_context={
'runAsNonRoot': False, # Needs Docker access
'runAsUser': 0,
'privileged': True
},
health_check={
'path': '/hub/health',
'port': 8000,
'initial_delay': 30,
'period': 15
},
sso_config={
'enabled': True,
'provider': 'oauth',
'authenticator_class': 'oauthenticator.generic.GenericOAuthenticator'
}
)
async def create_service_instance(
self,
tenant_id: str,
service_type: str,
config_overrides: Dict[str, Any] = None
) -> ServiceInstance:
"""Create a new service instance for a tenant"""
if service_type not in self.service_templates:
raise ValueError(f"Unsupported service type: {service_type}")
template = self.service_templates[service_type]
instance_id = f"{service_type}-{tenant_id}-{uuid.uuid4().hex[:8]}"
namespace = f"{self.base_namespace}-{tenant_id}"
# Generate unique ports
external_port = await self._get_available_port()
# Create service instance object
instance = ServiceInstance(
instance_id=instance_id,
tenant_id=tenant_id,
service_type=service_type,
status='starting',
endpoint_url=f"https://{service_type}.{tenant_id}.gt2.com",
internal_port=template.ports['http'],
external_port=external_port,
namespace=namespace,
deployment_name=f"{service_type}-{instance_id}",
service_name=f"{service_type}-service-{instance_id}",
ingress_name=f"{service_type}-ingress-{instance_id}",
resource_usage={'cpu': 0, 'memory': 0, 'storage': 0}
)
try:
# Create Kubernetes namespace if not exists
await self._create_namespace(namespace, tenant_id)
# Deploy the service
await self._deploy_service(instance, template, config_overrides)
# Generate SSO token
instance.sso_token = await self._generate_sso_token(instance)
# Store instance
self.active_instances[instance_id] = instance
await self._persist_instance(instance)
logger.info(f"Created {service_type} instance {instance_id} for tenant {tenant_id}")
return instance
except Exception as e:
logger.error(f"Failed to create service instance: {e}")
instance.status = 'error'
raise
async def _create_namespace(self, namespace: str, tenant_id: str):
"""Create Kubernetes namespace with proper labeling and network policies"""
if not self.k8s_client:
logger.info(f"Mock: Created namespace {namespace}")
return
v1 = client.CoreV1Api(self.k8s_client)
# Create namespace
namespace_manifest = client.V1Namespace(
metadata=client.V1ObjectMeta(
name=namespace,
labels={
'gt.tenant-id': tenant_id,
'gt.cluster': 'resource',
'gt.isolation': 'tenant'
},
annotations={
'gt.created-by': 'service-manager',
'gt.creation-time': datetime.utcnow().isoformat()
}
)
)
try:
v1.create_namespace(namespace_manifest)
logger.info(f"Created namespace: {namespace}")
except ApiException as e:
if e.status == 409: # Already exists
logger.info(f"Namespace {namespace} already exists")
else:
raise
# Apply network policy for tenant isolation
await self._apply_network_policy(namespace, tenant_id)
async def _apply_network_policy(self, namespace: str, tenant_id: str):
"""Apply network policy for tenant isolation"""
if not self.k8s_client:
logger.info(f"Mock: Applied network policy to {namespace}")
return
networking_v1 = client.NetworkingV1Api(self.k8s_client)
# Network policy that only allows:
# 1. Intra-namespace communication
# 2. Communication to system namespaces (DNS, etc.)
# 3. Egress to external services (for updates, etc.)
network_policy = client.V1NetworkPolicy(
metadata=client.V1ObjectMeta(
name=f"tenant-isolation-{tenant_id}",
namespace=namespace,
labels={'gt.tenant-id': tenant_id}
),
spec=client.V1NetworkPolicySpec(
pod_selector=client.V1LabelSelector(), # All pods in namespace
policy_types=['Ingress', 'Egress'],
ingress=[
# Allow ingress from same namespace
client.V1NetworkPolicyIngressRule(
_from=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': namespace}
)
)]
),
# Allow ingress from ingress controller
client.V1NetworkPolicyIngressRule(
_from=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': 'ingress-nginx'}
)
)]
)
],
egress=[
# Allow egress within namespace
client.V1NetworkPolicyEgressRule(
to=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': namespace}
)
)]
),
# Allow DNS
client.V1NetworkPolicyEgressRule(
to=[client.V1NetworkPolicyPeer(
namespace_selector=client.V1LabelSelector(
match_labels={'name': 'kube-system'}
)
)],
ports=[client.V1NetworkPolicyPort(port=53, protocol='UDP')]
),
# Allow external HTTPS (for updates, etc.)
client.V1NetworkPolicyEgressRule(
ports=[
client.V1NetworkPolicyPort(port=443, protocol='TCP'),
client.V1NetworkPolicyPort(port=80, protocol='TCP')
]
)
]
)
)
try:
networking_v1.create_namespaced_network_policy(
namespace=namespace,
body=network_policy
)
logger.info(f"Applied network policy to namespace: {namespace}")
except ApiException as e:
if e.status == 409: # Already exists
logger.info(f"Network policy already exists in {namespace}")
else:
logger.error(f"Failed to create network policy: {e}")
raise
async def _deploy_service(
self,
instance: ServiceInstance,
template: ServiceTemplate,
config_overrides: Dict[str, Any] = None
):
"""Deploy service to Kubernetes cluster"""
if not self.k8s_client:
logger.info(f"Mock: Deployed {template.service_type} service")
instance.status = 'running'
return
# Prepare environment variables with tenant-specific values
environment = template.environment.copy()
if config_overrides:
environment.update(config_overrides.get('environment', {}))
# Substitute tenant-specific values
env_vars = []
for key, value in environment.items():
substituted_value = value.replace('${TENANT_ID}', instance.tenant_id)
substituted_value = substituted_value.replace('${TENANT_DOMAIN}', f"{instance.tenant_id}.gt2.com")
env_vars.append(client.V1EnvVar(name=key, value=substituted_value))
# Create volumes
volumes = []
volume_mounts = []
for vol_config in template.volumes:
vol_name = f"{vol_config['name']}-{instance.instance_id}"
volumes.append(client.V1Volume(
name=vol_name,
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
claim_name=vol_name
)
))
volume_mounts.append(client.V1VolumeMount(
name=vol_name,
mount_path=vol_config['mountPath']
))
# Create PVCs first
await self._create_persistent_volumes(instance, template)
# Create deployment
deployment = client.V1Deployment(
metadata=client.V1ObjectMeta(
name=instance.deployment_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id,
'gt.service-type': template.service_type
}
),
spec=client.V1DeploymentSpec(
replicas=1,
selector=client.V1LabelSelector(
match_labels={'instance': instance.instance_id}
),
template=client.V1PodTemplateSpec(
metadata=client.V1ObjectMeta(
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
}
),
spec=client.V1PodSpec(
containers=[client.V1Container(
name=template.service_type,
image=template.image,
ports=[client.V1ContainerPort(
container_port=template.ports['http']
)],
env=env_vars,
volume_mounts=volume_mounts,
resources=client.V1ResourceRequirements(
limits=template.resource_limits,
requests=template.resource_limits
),
security_context=client.V1SecurityContext(
    # Map the template's camelCase keys to the client's snake_case arguments
    run_as_non_root=template.security_context.get('runAsNonRoot', True),
    run_as_user=template.security_context.get('runAsUser'),
    privileged=template.security_context.get('privileged', False),
    read_only_root_filesystem=template.security_context.get('readOnlyRootFilesystem')
),
liveness_probe=client.V1Probe(
http_get=client.V1HTTPGetAction(
path=template.health_check['path'],
port=template.health_check['port']
),
initial_delay_seconds=template.health_check['initial_delay'],
period_seconds=template.health_check['period']
),
readiness_probe=client.V1Probe(
http_get=client.V1HTTPGetAction(
path=template.health_check['path'],
port=template.health_check['port']
),
initial_delay_seconds=10,
period_seconds=5
)
)],
volumes=volumes,
security_context=client.V1PodSecurityContext(
run_as_non_root=template.security_context.get('runAsNonRoot', True),
fs_group=template.security_context.get('fsGroup', 1000)
)
)
)
)
)
# Deploy to Kubernetes
apps_v1 = client.AppsV1Api(self.k8s_client)
apps_v1.create_namespaced_deployment(
namespace=instance.namespace,
body=deployment
)
# Create service
await self._create_service(instance, template)
# Create ingress
await self._create_ingress(instance, template)
logger.info(f"Deployed {template.service_type} service: {instance.deployment_name}")
async def _create_persistent_volumes(self, instance: ServiceInstance, template: ServiceTemplate):
"""Create persistent volume claims for the service"""
if not self.k8s_client:
return
v1 = client.CoreV1Api(self.k8s_client)
for vol_config in template.volumes:
if 'hostPath' in vol_config: # Skip host path volumes
continue
pvc_name = f"{vol_config['name']}-{instance.instance_id}"
pvc = client.V1PersistentVolumeClaim(
metadata=client.V1ObjectMeta(
name=pvc_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
}
),
spec=client.V1PersistentVolumeClaimSpec(
access_modes=['ReadWriteOnce'],
resources=client.V1ResourceRequirements(
requests={'storage': vol_config['size']}
),
storage_class_name='fast-ssd' # Assuming SSD storage class
)
)
try:
v1.create_namespaced_persistent_volume_claim(
namespace=instance.namespace,
body=pvc
)
logger.info(f"Created PVC: {pvc_name}")
except ApiException as e:
if e.status != 409: # Ignore if already exists
raise
async def _create_service(self, instance: ServiceInstance, template: ServiceTemplate):
"""Create Kubernetes service for the instance"""
if not self.k8s_client:
return
v1 = client.CoreV1Api(self.k8s_client)
service = client.V1Service(
metadata=client.V1ObjectMeta(
name=instance.service_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
}
),
spec=client.V1ServiceSpec(
selector={'instance': instance.instance_id},
ports=[client.V1ServicePort(
port=80,
target_port=template.ports['http'],
protocol='TCP'
)],
type='ClusterIP'
)
)
v1.create_namespaced_service(
namespace=instance.namespace,
body=service
)
logger.info(f"Created service: {instance.service_name}")
async def _create_ingress(self, instance: ServiceInstance, template: ServiceTemplate):
"""Create ingress for external access with TLS"""
if not self.k8s_client:
return
networking_v1 = client.NetworkingV1Api(self.k8s_client)
hostname = f"{template.service_type}.{instance.tenant_id}.gt2.com"
ingress = client.V1Ingress(
metadata=client.V1ObjectMeta(
name=instance.ingress_name,
namespace=instance.namespace,
labels={
'app': template.service_type,
'instance': instance.instance_id,
'gt.tenant-id': instance.tenant_id
},
annotations={
'kubernetes.io/ingress.class': 'nginx',
'cert-manager.io/cluster-issuer': 'letsencrypt-prod',
'nginx.ingress.kubernetes.io/ssl-redirect': 'true',
'nginx.ingress.kubernetes.io/force-ssl-redirect': 'true',
'nginx.ingress.kubernetes.io/auth-url': f'https://auth.{instance.tenant_id}.gt2.com/auth',
'nginx.ingress.kubernetes.io/auth-signin': f'https://auth.{instance.tenant_id}.gt2.com/signin'
}
),
spec=client.V1IngressSpec(
tls=[client.V1IngressTLS(
hosts=[hostname],
secret_name=f"{template.service_type}-tls-{instance.instance_id}"
)],
rules=[client.V1IngressRule(
host=hostname,
http=client.V1HTTPIngressRuleValue(
paths=[client.V1HTTPIngressPath(
path='/',
path_type='Prefix',
backend=client.V1IngressBackend(
service=client.V1IngressServiceBackend(
name=instance.service_name,
port=client.V1ServiceBackendPort(number=80)
)
)
)]
)
)]
)
)
networking_v1.create_namespaced_ingress(
namespace=instance.namespace,
body=ingress
)
logger.info(f"Created ingress: {instance.ingress_name} for {hostname}")
async def _get_available_port(self) -> int:
"""Get next available port for service"""
used_ports = {instance.external_port for instance in self.active_instances.values()}
port = 30000 # Start from NodePort range
while port in used_ports:
port += 1
return port
async def _generate_sso_token(self, instance: ServiceInstance) -> str:
"""Generate SSO token for iframe embedding"""
token_data = {
'tenant_id': instance.tenant_id,
'service_type': instance.service_type,
'instance_id': instance.instance_id,
'expires_at': (datetime.utcnow() + timedelta(hours=24)).isoformat(),
'permissions': ['read', 'write', 'admin']
}
# Encrypt the token data
encrypted_token = encrypt_data(json.dumps(token_data))
return encrypted_token.decode('utf-8')
async def get_service_instance(self, instance_id: str) -> Optional[ServiceInstance]:
"""Get service instance by ID"""
return self.active_instances.get(instance_id)
async def list_tenant_instances(self, tenant_id: str) -> List[ServiceInstance]:
"""List all service instances for a tenant"""
return [
instance for instance in self.active_instances.values()
if instance.tenant_id == tenant_id
]
async def stop_service_instance(self, instance_id: str) -> bool:
"""Stop a running service instance"""
instance = self.active_instances.get(instance_id)
if not instance:
return False
try:
instance.status = 'stopping'
if self.k8s_client:
# Delete Kubernetes resources
await self._cleanup_kubernetes_resources(instance)
instance.status = 'stopped'
logger.info(f"Stopped service instance: {instance_id}")
return True
except Exception as e:
logger.error(f"Failed to stop instance {instance_id}: {e}")
instance.status = 'error'
return False
async def _cleanup_kubernetes_resources(self, instance: ServiceInstance):
"""Clean up all Kubernetes resources for an instance"""
if not self.k8s_client:
return
apps_v1 = client.AppsV1Api(self.k8s_client)
v1 = client.CoreV1Api(self.k8s_client)
networking_v1 = client.NetworkingV1Api(self.k8s_client)
try:
# Delete deployment
apps_v1.delete_namespaced_deployment(
name=instance.deployment_name,
namespace=instance.namespace,
body=client.V1DeleteOptions()
)
# Delete service
v1.delete_namespaced_service(
name=instance.service_name,
namespace=instance.namespace,
body=client.V1DeleteOptions()
)
# Delete ingress
networking_v1.delete_namespaced_ingress(
name=instance.ingress_name,
namespace=instance.namespace,
body=client.V1DeleteOptions()
)
# Delete PVCs (optional - may want to preserve data)
# Note: In production, you might want to keep PVCs for data persistence
logger.info(f"Cleaned up Kubernetes resources for: {instance.instance_id}")
except ApiException as e:
logger.error(f"Error cleaning up resources: {e}")
raise
async def get_service_health(self, instance_id: str) -> Dict[str, Any]:
"""Get health status of a service instance"""
instance = self.active_instances.get(instance_id)
if not instance:
return {'status': 'not_found'}
if not self.k8s_client:
return {
'status': 'healthy',
'instance_status': instance.status,
'endpoint': instance.endpoint_url,
'last_check': datetime.utcnow().isoformat()
}
# Check Kubernetes pod status
v1 = client.CoreV1Api(self.k8s_client)
try:
pods = v1.list_namespaced_pod(
namespace=instance.namespace,
label_selector=f'instance={instance.instance_id}'
)
if not pods.items:
return {
'status': 'no_pods',
'instance_status': instance.status
}
pod = pods.items[0]
pod_status = 'unknown'
if pod.status.phase == 'Running':
# Check container status
if pod.status.container_statuses:
container_status = pod.status.container_statuses[0]
if container_status.ready:
pod_status = 'healthy'
else:
pod_status = 'unhealthy'
else:
pod_status = 'starting'
elif pod.status.phase == 'Pending':
pod_status = 'starting'
elif pod.status.phase == 'Failed':
pod_status = 'failed'
# Update instance heartbeat
instance.last_heartbeat = datetime.utcnow()
return {
'status': pod_status,
'instance_status': instance.status,
'pod_phase': pod.status.phase,
'endpoint': instance.endpoint_url,
'last_check': datetime.utcnow().isoformat(),
'restart_count': pod.status.container_statuses[0].restart_count if pod.status.container_statuses else 0
}
except ApiException as e:
logger.error(f"Failed to get health for {instance_id}: {e}")
return {
'status': 'error',
'error': str(e),
'instance_status': instance.status
}
async def _persist_instance(self, instance: ServiceInstance):
"""Persist instance data to disk"""
instance_file = self.storage_path / f"{instance.instance_id}.json"
with open(instance_file, 'w') as f:
json.dump(instance.to_dict(), f, indent=2)
def _load_persistent_instances(self):
"""Load persistent instances from disk on startup"""
if not self.storage_path.exists():
return
for instance_file in self.storage_path.glob("*.json"):
try:
with open(instance_file, 'r') as f:
data = json.load(f)
# Reconstruct instance object
instance = ServiceInstance(
instance_id=data['instance_id'],
tenant_id=data['tenant_id'],
service_type=data['service_type'],
status=data['status'],
endpoint_url=data['endpoint_url'],
internal_port=data['internal_port'],
external_port=data['external_port'],
namespace=data['namespace'],
deployment_name=data['deployment_name'],
service_name=data['service_name'],
ingress_name=data['ingress_name'],
sso_token=data.get('sso_token'),
created_at=datetime.fromisoformat(data['created_at']),
last_heartbeat=datetime.fromisoformat(data['last_heartbeat']),
resource_usage=data.get('resource_usage', {})
)
self.active_instances[instance.instance_id] = instance
logger.info(f"Loaded persistent instance: {instance.instance_id}")
except Exception as e:
logger.error(f"Failed to load instance from {instance_file}: {e}")
async def cleanup_orphaned_resources(self):
"""Clean up orphaned Kubernetes resources"""
if not self.k8s_client:
return
logger.info("Starting cleanup of orphaned resources...")
# This would implement logic to find and clean up:
# 1. Deployments without corresponding instances
# 2. Services without deployments
# 3. Unused PVCs
# 4. Expired certificates
# Implementation would query Kubernetes for resources with GT labels
# and cross-reference with active instances
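# A minimal sketch of that cross-reference pass, assuming every GT-managed
# deployment carries the 'instance' label set in _deploy_service; illustrative
# only, not the shipped implementation.
apps_v1 = client.AppsV1Api(self.k8s_client)
deployments = apps_v1.list_deployment_for_all_namespaces(label_selector='gt.tenant-id')
for deployment in deployments.items:
    instance_id = (deployment.metadata.labels or {}).get('instance')
    if instance_id and instance_id not in self.active_instances:
        logger.warning(f"Deleting orphaned deployment {deployment.metadata.name}")
        apps_v1.delete_namespaced_deployment(
            name=deployment.metadata.name,
            namespace=deployment.metadata.namespace,
            body=client.V1DeleteOptions()
        )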
logger.info("Cleanup completed")