GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
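Two of the fixes above follow a well-known pattern. A minimal sketch of exact-hostname validation combined with a DNS resolution check (illustrative helper with an assumed allowlist; not the code shipped in this release):

```python
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.gt2.com"}  # assumption: an explicit allowlist

def is_safe_url(url: str) -> bool:
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False
    # Exact hostname comparison instead of substring matching, which a host
    # like "api.gt2.com.attacker.net" would otherwise defeat.
    if parsed.hostname not in ALLOWED_HOSTS:
        return False
    # SSRF guard: resolve the name and reject private/loopback targets.
    for *_, sockaddr in socket.getaddrinfo(parsed.hostname, None):
        ip = ipaddress.ip_address(sockaddr[0])
        if ip.is_private or ip.is_loopback or ip.is_link_local:
            return False
    return True
```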
apps/resource-cluster/app/services/service_manager.py (new file, +931 lines)
@@ -0,0 +1,931 @@
"""
GT 2.0 Resource Cluster - Service Manager

Orchestrates external web services (CTFd, Canvas LMS, Guacamole, JupyterHub)
with perfect tenant isolation and security.
"""

import asyncio
import json
import logging
import subprocess
import uuid
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict, field
from pathlib import Path

try:
    import docker
    import kubernetes
    from kubernetes import client, config
    from kubernetes.client.rest import ApiException
    DOCKER_AVAILABLE = True
    KUBERNETES_AVAILABLE = True
except ImportError:
    # For development containerization mode, these are optional
    docker = None
    kubernetes = None
    client = None
    config = None
    ApiException = Exception
    DOCKER_AVAILABLE = False
    KUBERNETES_AVAILABLE = False

from app.core.config import get_settings
from app.core.security import verify_capability_token
from app.utils.encryption import encrypt_data, decrypt_data

logger = logging.getLogger(__name__)

@dataclass
class ServiceInstance:
    """Represents a deployed service instance"""
    instance_id: str
    tenant_id: str
    service_type: str  # 'ctfd', 'canvas', 'guacamole', 'jupyter'
    status: str  # 'starting', 'running', 'stopping', 'stopped', 'error'
    endpoint_url: str
    internal_port: int
    external_port: int
    namespace: str
    deployment_name: str
    service_name: str
    ingress_name: str
    sso_token: Optional[str] = None
    # default_factory gives each instance its own timestamp; a plain
    # `datetime.utcnow()` default would be evaluated once at class-definition
    # time and shared by every instance.
    created_at: datetime = field(default_factory=datetime.utcnow)
    last_heartbeat: datetime = field(default_factory=datetime.utcnow)
    resource_usage: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        data = asdict(self)
        data['created_at'] = self.created_at.isoformat()
        data['last_heartbeat'] = self.last_heartbeat.isoformat()
        return data


@dataclass
class ServiceTemplate:
    """Service deployment template configuration"""
    service_type: str
    image: str
    ports: Dict[str, int]
    environment: Dict[str, str]
    volumes: List[Dict[str, str]]
    resource_limits: Dict[str, str]
    security_context: Dict[str, Any]
    health_check: Dict[str, Any]
    sso_config: Dict[str, Any]

class ServiceManager:
    """Manages external web service instances with Kubernetes orchestration"""

    def __init__(self):
        # Initialize Docker client if available
        if DOCKER_AVAILABLE:
            try:
                self.docker_client = docker.from_env()
            except Exception as e:
                logger.warning(f"Could not initialize Docker client: {e}")
                self.docker_client = None
        else:
            self.docker_client = None

        self.k8s_client = None
        self.active_instances: Dict[str, ServiceInstance] = {}
        self.service_templates: Dict[str, ServiceTemplate] = {}
        self.base_namespace = "gt-services"
        self.storage_path = Path("/tmp/resource-cluster/services")
        self.storage_path.mkdir(parents=True, exist_ok=True)

        # Initialize Kubernetes client if available
        if KUBERNETES_AVAILABLE:
            try:
                config.load_incluster_config()  # If running in cluster
            except Exception:
                try:
                    config.load_kube_config()  # If running locally
                except Exception:
                    logger.warning("Could not load Kubernetes config - using mock mode")

            self.k8s_client = client.ApiClient() if client else None
        else:
            logger.warning("Kubernetes not available - running in development containerization mode")

        self._initialize_service_templates()
        self._load_persistent_instances()

    def _initialize_service_templates(self):
        """Initialize service deployment templates"""

        # CTFd Template
        self.service_templates['ctfd'] = ServiceTemplate(
            service_type='ctfd',
            image='ctfd/ctfd:3.6.0',
            ports={'http': 8000},
            environment={
                'SECRET_KEY': '${TENANT_SECRET_KEY}',
                'DATABASE_URL': 'sqlite:////data/ctfd.db',
                'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants',
                'UPLOAD_FOLDER': '/data/uploads',
                'LOG_FOLDER': '/data/logs',
            },
            volumes=[
                {'name': 'ctfd-data', 'mountPath': '/data', 'size': '5Gi'},
                {'name': 'ctfd-uploads', 'mountPath': '/uploads', 'size': '2Gi'}
            ],
            resource_limits={
                'memory': '2Gi',
                'cpu': '1000m'
            },
            security_context={
                'runAsNonRoot': True,
                'runAsUser': 1000,
                'fsGroup': 1000,
                'readOnlyRootFilesystem': False
            },
            health_check={
                'path': '/health',
                'port': 8000,
                'initial_delay': 30,
                'period': 10
            },
            sso_config={
                'enabled': True,
                'provider': 'oauth2',
                'callback_path': '/auth/oauth/callback'
            }
        )

        # Canvas LMS Template
        self.service_templates['canvas'] = ServiceTemplate(
            service_type='canvas',
            image='instructure/canvas-lms:stable',
            ports={'http': 3000},
            environment={
                'CANVAS_LMS_ADMIN_EMAIL': 'admin@${TENANT_DOMAIN}',
                'CANVAS_LMS_ADMIN_PASSWORD': '${CANVAS_ADMIN_PASSWORD}',
                'CANVAS_LMS_ACCOUNT_NAME': '${TENANT_NAME}',
                'CANVAS_LMS_STATS_COLLECTION': 'opt_out',
                'POSTGRES_PASSWORD': '${POSTGRES_PASSWORD}',
                'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants'
            },
            volumes=[
                {'name': 'canvas-data', 'mountPath': '/app/log', 'size': '10Gi'},
                {'name': 'canvas-files', 'mountPath': '/app/public/files', 'size': '20Gi'}
            ],
            resource_limits={
                'memory': '4Gi',
                'cpu': '2000m'
            },
            security_context={
                'runAsNonRoot': True,
                'runAsUser': 1000,
                'fsGroup': 1000
            },
            health_check={
                'path': '/health_check',
                'port': 3000,
                'initial_delay': 60,
                'period': 15
            },
            sso_config={
                'enabled': True,
                'provider': 'saml',
                'metadata_url': '/auth/saml/metadata'
            }
        )

        # Guacamole Template
        self.service_templates['guacamole'] = ServiceTemplate(
            service_type='guacamole',
            image='guacamole/guacamole:1.5.3',
            ports={'http': 8080},
            environment={
                'GUACD_HOSTNAME': 'guacd',
                'GUACD_PORT': '4822',
                'MYSQL_HOSTNAME': 'mysql',
                'MYSQL_PORT': '3306',
                'MYSQL_DATABASE': 'guacamole_db',
                'MYSQL_USER': 'guacamole_user',
                'MYSQL_PASSWORD': '${MYSQL_PASSWORD}',
                'GUAC_LOG_LEVEL': 'INFO'
            },
            volumes=[
                {'name': 'guacamole-data', 'mountPath': '/config', 'size': '1Gi'},
                {'name': 'guacamole-recordings', 'mountPath': '/recordings', 'size': '10Gi'}
            ],
            resource_limits={
                'memory': '1Gi',
                'cpu': '500m'
            },
            security_context={
                'runAsNonRoot': True,
                'runAsUser': 1001,
                'fsGroup': 1001
            },
            health_check={
                'path': '/guacamole',
                'port': 8080,
                'initial_delay': 45,
                'period': 10
            },
            sso_config={
                'enabled': True,
                'provider': 'openid',
                'extension': 'guacamole-auth-openid'
            }
        )

        # JupyterHub Template
        self.service_templates['jupyter'] = ServiceTemplate(
            service_type='jupyter',
            image='jupyterhub/jupyterhub:4.0',
            ports={'http': 8000},
            environment={
                'JUPYTERHUB_CRYPT_KEY': '${JUPYTERHUB_CRYPT_KEY}',
                'CONFIGPROXY_AUTH_TOKEN': '${CONFIGPROXY_AUTH_TOKEN}',
                'DOCKER_NETWORK_NAME': 'jupyterhub',
                'DOCKER_NOTEBOOK_IMAGE': 'jupyter/datascience-notebook:lab-4.0.7'
            },
            volumes=[
                {'name': 'jupyter-data', 'mountPath': '/srv/jupyterhub', 'size': '5Gi'},
                {'name': 'docker-socket', 'mountPath': '/var/run/docker.sock', 'hostPath': '/var/run/docker.sock'}
            ],
            resource_limits={
                'memory': '2Gi',
                'cpu': '1000m'
            },
            security_context={
                'runAsNonRoot': False,  # Needs Docker access
                'runAsUser': 0,
                'privileged': True
            },
            health_check={
                'path': '/hub/health',
                'port': 8000,
                'initial_delay': 30,
                'period': 15
            },
            sso_config={
                'enabled': True,
                'provider': 'oauth',
                'authenticator_class': 'oauthenticator.generic.GenericOAuthenticator'
            }
        )

    async def create_service_instance(
        self,
        tenant_id: str,
        service_type: str,
        config_overrides: Optional[Dict[str, Any]] = None
    ) -> ServiceInstance:
        """Create a new service instance for a tenant"""

        if service_type not in self.service_templates:
            raise ValueError(f"Unsupported service type: {service_type}")

        template = self.service_templates[service_type]
        instance_id = f"{service_type}-{tenant_id}-{uuid.uuid4().hex[:8]}"
        namespace = f"{self.base_namespace}-{tenant_id}"

        # Generate unique ports
        external_port = await self._get_available_port()

        # Create service instance object
        instance = ServiceInstance(
            instance_id=instance_id,
            tenant_id=tenant_id,
            service_type=service_type,
            status='starting',
            endpoint_url=f"https://{service_type}.{tenant_id}.gt2.com",
            internal_port=template.ports['http'],
            external_port=external_port,
            namespace=namespace,
            deployment_name=f"{service_type}-{instance_id}",
            service_name=f"{service_type}-service-{instance_id}",
            ingress_name=f"{service_type}-ingress-{instance_id}",
            resource_usage={'cpu': 0, 'memory': 0, 'storage': 0}
        )

        try:
            # Create Kubernetes namespace if it does not exist
            await self._create_namespace(namespace, tenant_id)

            # Deploy the service
            await self._deploy_service(instance, template, config_overrides)

            # Generate SSO token
            instance.sso_token = await self._generate_sso_token(instance)

            # Store instance
            self.active_instances[instance_id] = instance
            await self._persist_instance(instance)

            logger.info(f"Created {service_type} instance {instance_id} for tenant {tenant_id}")
            return instance

        except Exception as e:
            logger.error(f"Failed to create service instance: {e}")
            instance.status = 'error'
            raise

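    # Hypothetical usage (not part of this commit), e.g. from an API route:
    #
    #   manager = ServiceManager()
    #   instance = await manager.create_service_instance(
    #       tenant_id="acme", service_type="ctfd"
    #   )
    #   # instance.endpoint_url == "https://ctfd.acme.gt2.com"
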
    async def _create_namespace(self, namespace: str, tenant_id: str):
        """Create Kubernetes namespace with proper labeling and network policies"""

        if not self.k8s_client:
            logger.info(f"Mock: Created namespace {namespace}")
            return

        v1 = client.CoreV1Api(self.k8s_client)

        # Create namespace
        namespace_manifest = client.V1Namespace(
            metadata=client.V1ObjectMeta(
                name=namespace,
                labels={
                    'gt.tenant-id': tenant_id,
                    'gt.cluster': 'resource',
                    'gt.isolation': 'tenant'
                },
                annotations={
                    'gt.created-by': 'service-manager',
                    'gt.creation-time': datetime.utcnow().isoformat()
                }
            )
        )

        try:
            v1.create_namespace(namespace_manifest)
            logger.info(f"Created namespace: {namespace}")
        except ApiException as e:
            if e.status == 409:  # Already exists
                logger.info(f"Namespace {namespace} already exists")
            else:
                raise

        # Apply network policy for tenant isolation
        await self._apply_network_policy(namespace, tenant_id)

    async def _apply_network_policy(self, namespace: str, tenant_id: str):
        """Apply network policy for tenant isolation"""

        if not self.k8s_client:
            logger.info(f"Mock: Applied network policy to {namespace}")
            return

        networking_v1 = client.NetworkingV1Api(self.k8s_client)

        # Network policy that only allows:
        # 1. Intra-namespace communication
        # 2. Communication to system namespaces (DNS, etc.)
        # 3. Egress to external services (for updates, etc.)
        # Note: the Python client maps the reserved word "from" to the
        # keyword argument `_from`.
        network_policy = client.V1NetworkPolicy(
            metadata=client.V1ObjectMeta(
                name=f"tenant-isolation-{tenant_id}",
                namespace=namespace,
                labels={'gt.tenant-id': tenant_id}
            ),
            spec=client.V1NetworkPolicySpec(
                pod_selector=client.V1LabelSelector(),  # All pods in namespace
                policy_types=['Ingress', 'Egress'],
                ingress=[
                    # Allow ingress from same namespace
                    client.V1NetworkPolicyIngressRule(
                        _from=[client.V1NetworkPolicyPeer(
                            namespace_selector=client.V1LabelSelector(
                                match_labels={'name': namespace}
                            )
                        )]
                    ),
                    # Allow ingress from ingress controller
                    client.V1NetworkPolicyIngressRule(
                        _from=[client.V1NetworkPolicyPeer(
                            namespace_selector=client.V1LabelSelector(
                                match_labels={'name': 'ingress-nginx'}
                            )
                        )]
                    )
                ],
                egress=[
                    # Allow egress within namespace
                    client.V1NetworkPolicyEgressRule(
                        to=[client.V1NetworkPolicyPeer(
                            namespace_selector=client.V1LabelSelector(
                                match_labels={'name': namespace}
                            )
                        )]
                    ),
                    # Allow DNS
                    client.V1NetworkPolicyEgressRule(
                        to=[client.V1NetworkPolicyPeer(
                            namespace_selector=client.V1LabelSelector(
                                match_labels={'name': 'kube-system'}
                            )
                        )],
                        ports=[client.V1NetworkPolicyPort(port=53, protocol='UDP')]
                    ),
                    # Allow external HTTPS (for updates, etc.)
                    client.V1NetworkPolicyEgressRule(
                        ports=[
                            client.V1NetworkPolicyPort(port=443, protocol='TCP'),
                            client.V1NetworkPolicyPort(port=80, protocol='TCP')
                        ]
                    )
                ]
            )
        )

        try:
            networking_v1.create_namespaced_network_policy(
                namespace=namespace,
                body=network_policy
            )
            logger.info(f"Applied network policy to namespace: {namespace}")
        except ApiException as e:
            if e.status == 409:  # Already exists
                logger.info(f"Network policy already exists in {namespace}")
            else:
                logger.error(f"Failed to create network policy: {e}")
                raise

    async def _deploy_service(
        self,
        instance: ServiceInstance,
        template: ServiceTemplate,
        config_overrides: Optional[Dict[str, Any]] = None
    ):
        """Deploy service to Kubernetes cluster"""

        if not self.k8s_client:
            logger.info(f"Mock: Deployed {template.service_type} service")
            instance.status = 'running'
            return

        # Prepare environment variables with tenant-specific values
        environment = template.environment.copy()
        if config_overrides:
            environment.update(config_overrides.get('environment', {}))

        # Substitute tenant-specific values
        env_vars = []
        for key, value in environment.items():
            substituted_value = value.replace('${TENANT_ID}', instance.tenant_id)
            substituted_value = substituted_value.replace('${TENANT_DOMAIN}', f"{instance.tenant_id}.gt2.com")
            env_vars.append(client.V1EnvVar(name=key, value=substituted_value))

        # Create volumes
        volumes = []
        volume_mounts = []
        for vol_config in template.volumes:
            vol_name = f"{vol_config['name']}-{instance.instance_id}"
            volumes.append(client.V1Volume(
                name=vol_name,
                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                    claim_name=vol_name
                )
            ))
            volume_mounts.append(client.V1VolumeMount(
                name=vol_name,
                mount_path=vol_config['mountPath']
            ))

        # Create PVCs first
        await self._create_persistent_volumes(instance, template)

        # Create deployment
        deployment = client.V1Deployment(
            metadata=client.V1ObjectMeta(
                name=instance.deployment_name,
                namespace=instance.namespace,
                labels={
                    'app': template.service_type,
                    'instance': instance.instance_id,
                    'gt.tenant-id': instance.tenant_id,
                    'gt.service-type': template.service_type
                }
            ),
            spec=client.V1DeploymentSpec(
                replicas=1,
                selector=client.V1LabelSelector(
                    match_labels={'instance': instance.instance_id}
                ),
                template=client.V1PodTemplateSpec(
                    metadata=client.V1ObjectMeta(
                        labels={
                            'app': template.service_type,
                            'instance': instance.instance_id,
                            'gt.tenant-id': instance.tenant_id
                        }
                    ),
                    spec=client.V1PodSpec(
                        containers=[client.V1Container(
                            name=template.service_type,
                            image=template.image,
                            ports=[client.V1ContainerPort(
                                container_port=template.ports['http']
                            )],
                            env=env_vars,
                            volume_mounts=volume_mounts,
                            resources=client.V1ResourceRequirements(
                                limits=template.resource_limits,
                                requests=template.resource_limits
                            ),
                            # The client model takes snake_case kwargs, so map
                            # the camelCase keys used in the templates.
                            security_context=client.V1SecurityContext(
                                run_as_non_root=template.security_context.get('runAsNonRoot'),
                                run_as_user=template.security_context.get('runAsUser'),
                                read_only_root_filesystem=template.security_context.get('readOnlyRootFilesystem'),
                                privileged=template.security_context.get('privileged')
                            ),
                            liveness_probe=client.V1Probe(
                                http_get=client.V1HTTPGetAction(
                                    path=template.health_check['path'],
                                    port=template.health_check['port']
                                ),
                                initial_delay_seconds=template.health_check['initial_delay'],
                                period_seconds=template.health_check['period']
                            ),
                            readiness_probe=client.V1Probe(
                                http_get=client.V1HTTPGetAction(
                                    path=template.health_check['path'],
                                    port=template.health_check['port']
                                ),
                                initial_delay_seconds=10,
                                period_seconds=5
                            )
                        )],
                        volumes=volumes,
                        security_context=client.V1PodSecurityContext(
                            run_as_non_root=template.security_context.get('runAsNonRoot', True),
                            fs_group=template.security_context.get('fsGroup', 1000)
                        )
                    )
                )
            )
        )

        # Deploy to Kubernetes
        apps_v1 = client.AppsV1Api(self.k8s_client)
        apps_v1.create_namespaced_deployment(
            namespace=instance.namespace,
            body=deployment
        )

        # Create service
        await self._create_service(instance, template)

        # Create ingress
        await self._create_ingress(instance, template)

        logger.info(f"Deployed {template.service_type} service: {instance.deployment_name}")

    async def _create_persistent_volumes(self, instance: ServiceInstance, template: ServiceTemplate):
        """Create persistent volume claims for the service"""

        if not self.k8s_client:
            return

        v1 = client.CoreV1Api(self.k8s_client)

        for vol_config in template.volumes:
            if 'hostPath' in vol_config:  # Skip host path volumes
                continue

            pvc_name = f"{vol_config['name']}-{instance.instance_id}"

            pvc = client.V1PersistentVolumeClaim(
                metadata=client.V1ObjectMeta(
                    name=pvc_name,
                    namespace=instance.namespace,
                    labels={
                        'app': template.service_type,
                        'instance': instance.instance_id,
                        'gt.tenant-id': instance.tenant_id
                    }
                ),
                spec=client.V1PersistentVolumeClaimSpec(
                    access_modes=['ReadWriteOnce'],
                    resources=client.V1ResourceRequirements(
                        requests={'storage': vol_config['size']}
                    ),
                    storage_class_name='fast-ssd'  # Assuming SSD storage class
                )
            )

            try:
                v1.create_namespaced_persistent_volume_claim(
                    namespace=instance.namespace,
                    body=pvc
                )
                logger.info(f"Created PVC: {pvc_name}")
            except ApiException as e:
                if e.status != 409:  # Ignore if already exists
                    raise

    async def _create_service(self, instance: ServiceInstance, template: ServiceTemplate):
        """Create Kubernetes service for the instance"""

        if not self.k8s_client:
            return

        v1 = client.CoreV1Api(self.k8s_client)

        service = client.V1Service(
            metadata=client.V1ObjectMeta(
                name=instance.service_name,
                namespace=instance.namespace,
                labels={
                    'app': template.service_type,
                    'instance': instance.instance_id,
                    'gt.tenant-id': instance.tenant_id
                }
            ),
            spec=client.V1ServiceSpec(
                selector={'instance': instance.instance_id},
                ports=[client.V1ServicePort(
                    port=80,
                    target_port=template.ports['http'],
                    protocol='TCP'
                )],
                type='ClusterIP'
            )
        )

        v1.create_namespaced_service(
            namespace=instance.namespace,
            body=service
        )

        logger.info(f"Created service: {instance.service_name}")

    async def _create_ingress(self, instance: ServiceInstance, template: ServiceTemplate):
        """Create ingress for external access with TLS"""

        if not self.k8s_client:
            return

        networking_v1 = client.NetworkingV1Api(self.k8s_client)

        hostname = f"{template.service_type}.{instance.tenant_id}.gt2.com"

        ingress = client.V1Ingress(
            metadata=client.V1ObjectMeta(
                name=instance.ingress_name,
                namespace=instance.namespace,
                labels={
                    'app': template.service_type,
                    'instance': instance.instance_id,
                    'gt.tenant-id': instance.tenant_id
                },
                annotations={
                    'kubernetes.io/ingress.class': 'nginx',
                    'cert-manager.io/cluster-issuer': 'letsencrypt-prod',
                    'nginx.ingress.kubernetes.io/ssl-redirect': 'true',
                    'nginx.ingress.kubernetes.io/force-ssl-redirect': 'true',
                    'nginx.ingress.kubernetes.io/auth-url': f'https://auth.{instance.tenant_id}.gt2.com/auth',
                    'nginx.ingress.kubernetes.io/auth-signin': f'https://auth.{instance.tenant_id}.gt2.com/signin'
                }
            ),
            spec=client.V1IngressSpec(
                tls=[client.V1IngressTLS(
                    hosts=[hostname],
                    secret_name=f"{template.service_type}-tls-{instance.instance_id}"
                )],
                rules=[client.V1IngressRule(
                    host=hostname,
                    http=client.V1HTTPIngressRuleValue(
                        paths=[client.V1HTTPIngressPath(
                            path='/',
                            path_type='Prefix',
                            backend=client.V1IngressBackend(
                                service=client.V1IngressServiceBackend(
                                    name=instance.service_name,
                                    port=client.V1ServiceBackendPort(number=80)
                                )
                            )
                        )]
                    )
                )]
            )
        )

        networking_v1.create_namespaced_ingress(
            namespace=instance.namespace,
            body=ingress
        )

        logger.info(f"Created ingress: {instance.ingress_name} for {hostname}")

    async def _get_available_port(self) -> int:
        """Get next available port for service"""
        used_ports = {instance.external_port for instance in self.active_instances.values()}
        port = 30000  # Start from NodePort range
        while port in used_ports:
            port += 1
        return port

    async def _generate_sso_token(self, instance: ServiceInstance) -> str:
        """Generate SSO token for iframe embedding"""
        token_data = {
            'tenant_id': instance.tenant_id,
            'service_type': instance.service_type,
            'instance_id': instance.instance_id,
            'expires_at': (datetime.utcnow() + timedelta(hours=24)).isoformat(),
            'permissions': ['read', 'write', 'admin']
        }

        # Encrypt the token data
        encrypted_token = encrypt_data(json.dumps(token_data))
        return encrypted_token.decode('utf-8')

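    # Sketch of the verifying side (not in this file), assuming decrypt_data
    # inverts encrypt_data:
    #
    #   data = json.loads(decrypt_data(token))
    #   if datetime.fromisoformat(data['expires_at']) < datetime.utcnow():
    #       raise PermissionError("SSO token expired")
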
    async def get_service_instance(self, instance_id: str) -> Optional[ServiceInstance]:
        """Get service instance by ID"""
        return self.active_instances.get(instance_id)

    async def list_tenant_instances(self, tenant_id: str) -> List[ServiceInstance]:
        """List all service instances for a tenant"""
        return [
            instance for instance in self.active_instances.values()
            if instance.tenant_id == tenant_id
        ]

    async def stop_service_instance(self, instance_id: str) -> bool:
        """Stop a running service instance"""
        instance = self.active_instances.get(instance_id)
        if not instance:
            return False

        try:
            instance.status = 'stopping'

            if self.k8s_client:
                # Delete Kubernetes resources
                await self._cleanup_kubernetes_resources(instance)

            instance.status = 'stopped'
            logger.info(f"Stopped service instance: {instance_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to stop instance {instance_id}: {e}")
            instance.status = 'error'
            return False

    async def _cleanup_kubernetes_resources(self, instance: ServiceInstance):
        """Clean up all Kubernetes resources for an instance"""

        if not self.k8s_client:
            return

        apps_v1 = client.AppsV1Api(self.k8s_client)
        v1 = client.CoreV1Api(self.k8s_client)
        networking_v1 = client.NetworkingV1Api(self.k8s_client)

        try:
            # Delete deployment
            apps_v1.delete_namespaced_deployment(
                name=instance.deployment_name,
                namespace=instance.namespace,
                body=client.V1DeleteOptions()
            )

            # Delete service
            v1.delete_namespaced_service(
                name=instance.service_name,
                namespace=instance.namespace,
                body=client.V1DeleteOptions()
            )

            # Delete ingress
            networking_v1.delete_namespaced_ingress(
                name=instance.ingress_name,
                namespace=instance.namespace,
                body=client.V1DeleteOptions()
            )

            # PVCs are deliberately not deleted here: in production you may
            # want to keep them so tenant data persists across restarts.

            logger.info(f"Cleaned up Kubernetes resources for: {instance.instance_id}")

        except ApiException as e:
            logger.error(f"Error cleaning up resources: {e}")
            raise

    async def get_service_health(self, instance_id: str) -> Dict[str, Any]:
        """Get health status of a service instance"""
        instance = self.active_instances.get(instance_id)
        if not instance:
            return {'status': 'not_found'}

        if not self.k8s_client:
            return {
                'status': 'healthy',
                'instance_status': instance.status,
                'endpoint': instance.endpoint_url,
                'last_check': datetime.utcnow().isoformat()
            }

        # Check Kubernetes pod status
        v1 = client.CoreV1Api(self.k8s_client)

        try:
            pods = v1.list_namespaced_pod(
                namespace=instance.namespace,
                label_selector=f'instance={instance.instance_id}'
            )

            if not pods.items:
                return {
                    'status': 'no_pods',
                    'instance_status': instance.status
                }

            pod = pods.items[0]
            pod_status = 'unknown'

            if pod.status.phase == 'Running':
                # Check container status
                if pod.status.container_statuses:
                    container_status = pod.status.container_statuses[0]
                    if container_status.ready:
                        pod_status = 'healthy'
                    else:
                        pod_status = 'unhealthy'
                else:
                    pod_status = 'starting'
            elif pod.status.phase == 'Pending':
                pod_status = 'starting'
            elif pod.status.phase == 'Failed':
                pod_status = 'failed'

            # Update instance heartbeat
            instance.last_heartbeat = datetime.utcnow()

            return {
                'status': pod_status,
                'instance_status': instance.status,
                'pod_phase': pod.status.phase,
                'endpoint': instance.endpoint_url,
                'last_check': datetime.utcnow().isoformat(),
                'restart_count': pod.status.container_statuses[0].restart_count if pod.status.container_statuses else 0
            }

        except ApiException as e:
            logger.error(f"Failed to get health for {instance_id}: {e}")
            return {
                'status': 'error',
                'error': str(e),
                'instance_status': instance.status
            }

    async def _persist_instance(self, instance: ServiceInstance):
        """Persist instance data to disk"""
        instance_file = self.storage_path / f"{instance.instance_id}.json"

        with open(instance_file, 'w') as f:
            json.dump(instance.to_dict(), f, indent=2)

    def _load_persistent_instances(self):
        """Load persistent instances from disk on startup"""
        if not self.storage_path.exists():
            return

        for instance_file in self.storage_path.glob("*.json"):
            try:
                with open(instance_file, 'r') as f:
                    data = json.load(f)

                # Reconstruct instance object
                instance = ServiceInstance(
                    instance_id=data['instance_id'],
                    tenant_id=data['tenant_id'],
                    service_type=data['service_type'],
                    status=data['status'],
                    endpoint_url=data['endpoint_url'],
                    internal_port=data['internal_port'],
                    external_port=data['external_port'],
                    namespace=data['namespace'],
                    deployment_name=data['deployment_name'],
                    service_name=data['service_name'],
                    ingress_name=data['ingress_name'],
                    sso_token=data.get('sso_token'),
                    created_at=datetime.fromisoformat(data['created_at']),
                    last_heartbeat=datetime.fromisoformat(data['last_heartbeat']),
                    resource_usage=data.get('resource_usage', {})
                )

                self.active_instances[instance.instance_id] = instance
                logger.info(f"Loaded persistent instance: {instance.instance_id}")

            except Exception as e:
                logger.error(f"Failed to load instance from {instance_file}: {e}")

    async def cleanup_orphaned_resources(self):
        """Clean up orphaned Kubernetes resources"""
        if not self.k8s_client:
            return

        logger.info("Starting cleanup of orphaned resources...")

        # This would implement logic to find and clean up:
        # 1. Deployments without corresponding instances
        # 2. Services without deployments
        # 3. Unused PVCs
        # 4. Expired certificates
        #
        # The implementation would query Kubernetes for resources with GT
        # labels and cross-reference them with active instances.

        logger.info("Cleanup completed")
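
    # A minimal sketch of the orphan scan described above (hypothetical
    # helper, assuming the GT labels applied at deploy time are
    # authoritative):
    async def _find_orphaned_deployments(self) -> List[str]:
        """List deployments carrying GT labels but no matching active instance."""
        apps_v1 = client.AppsV1Api(self.k8s_client)
        orphaned = []
        deployments = apps_v1.list_deployment_for_all_namespaces(
            label_selector='gt.tenant-id'
        )
        for deployment in deployments.items:
            instance_id = (deployment.metadata.labels or {}).get('instance')
            if instance_id and instance_id not in self.active_instances:
                orphaned.append(deployment.metadata.name)
        return orphaned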