- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2 - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2 - Made more general-purpose (flexible targets, expanded tools) - Added nemotron-mini-agent.csv for fast local inference via Ollama - Added nemotron-agent.csv for advanced reasoning via Ollama - Added wiki page: Projects for NVIDIA NIMs and Nemotron
931 lines
36 KiB
Python
931 lines
36 KiB
Python
"""
|
|
GT 2.0 Resource Cluster - Service Manager
|
|
Orchestrates external web services (CTFd, Canvas LMS, Guacamole, JupyterHub)
|
|
with perfect tenant isolation and security.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import subprocess
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass, asdict
|
|
from pathlib import Path
|
|
try:
|
|
import docker
|
|
import kubernetes
|
|
from kubernetes import client, config
|
|
from kubernetes.client.rest import ApiException
|
|
DOCKER_AVAILABLE = True
|
|
KUBERNETES_AVAILABLE = True
|
|
except ImportError:
|
|
# For development containerization mode, these are optional
|
|
docker = None
|
|
kubernetes = None
|
|
client = None
|
|
config = None
|
|
ApiException = Exception
|
|
DOCKER_AVAILABLE = False
|
|
KUBERNETES_AVAILABLE = False
|
|
|
|
from app.core.config import get_settings
|
|
from app.core.security import verify_capability_token
|
|
from app.utils.encryption import encrypt_data, decrypt_data
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
|
|
class ServiceInstance:
|
|
"""Represents a deployed service instance"""
|
|
instance_id: str
|
|
tenant_id: str
|
|
service_type: str # 'ctfd', 'canvas', 'guacamole', 'jupyter'
|
|
status: str # 'starting', 'running', 'stopping', 'stopped', 'error'
|
|
endpoint_url: str
|
|
internal_port: int
|
|
external_port: int
|
|
namespace: str
|
|
deployment_name: str
|
|
service_name: str
|
|
ingress_name: str
|
|
sso_token: Optional[str] = None
|
|
created_at: datetime = datetime.utcnow()
|
|
last_heartbeat: datetime = datetime.utcnow()
|
|
resource_usage: Dict[str, Any] = None
|
|
|
|
def to_dict(self) -> Dict[str, Any]:
|
|
data = asdict(self)
|
|
data['created_at'] = self.created_at.isoformat()
|
|
data['last_heartbeat'] = self.last_heartbeat.isoformat()
|
|
return data
|
|
|
|
@dataclass
|
|
class ServiceTemplate:
|
|
"""Service deployment template configuration"""
|
|
service_type: str
|
|
image: str
|
|
ports: Dict[str, int]
|
|
environment: Dict[str, str]
|
|
volumes: List[Dict[str, str]]
|
|
resource_limits: Dict[str, str]
|
|
security_context: Dict[str, Any]
|
|
health_check: Dict[str, Any]
|
|
sso_config: Dict[str, Any]
|
|
|
|
class ServiceManager:
|
|
"""Manages external web service instances with Kubernetes orchestration"""
|
|
|
|
def __init__(self):
|
|
# Initialize Docker client if available
|
|
if DOCKER_AVAILABLE:
|
|
try:
|
|
self.docker_client = docker.from_env()
|
|
except Exception as e:
|
|
logger.warning(f"Could not initialize Docker client: {e}")
|
|
self.docker_client = None
|
|
else:
|
|
self.docker_client = None
|
|
|
|
self.k8s_client = None
|
|
self.active_instances: Dict[str, ServiceInstance] = {}
|
|
self.service_templates: Dict[str, ServiceTemplate] = {}
|
|
self.base_namespace = "gt-services"
|
|
self.storage_path = Path("/tmp/resource-cluster/services")
|
|
self.storage_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Initialize Kubernetes client if available
|
|
if KUBERNETES_AVAILABLE:
|
|
try:
|
|
config.load_incluster_config() # If running in cluster
|
|
except:
|
|
try:
|
|
config.load_kube_config() # If running locally
|
|
except:
|
|
logger.warning("Could not load Kubernetes config - using mock mode")
|
|
|
|
self.k8s_client = client.ApiClient() if client else None
|
|
else:
|
|
logger.warning("Kubernetes not available - running in development containerization mode")
|
|
self._initialize_service_templates()
|
|
self._load_persistent_instances()
|
|
|
|
def _initialize_service_templates(self):
|
|
"""Initialize service deployment templates"""
|
|
|
|
# CTFd Template
|
|
self.service_templates['ctfd'] = ServiceTemplate(
|
|
service_type='ctfd',
|
|
image='ctfd/ctfd:3.6.0',
|
|
ports={'http': 8000},
|
|
environment={
|
|
'SECRET_KEY': '${TENANT_SECRET_KEY}',
|
|
'DATABASE_URL': 'sqlite:////data/ctfd.db',
|
|
'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants',
|
|
'UPLOAD_FOLDER': '/data/uploads',
|
|
'LOG_FOLDER': '/data/logs',
|
|
},
|
|
volumes=[
|
|
{'name': 'ctfd-data', 'mountPath': '/data', 'size': '5Gi'},
|
|
{'name': 'ctfd-uploads', 'mountPath': '/uploads', 'size': '2Gi'}
|
|
],
|
|
resource_limits={
|
|
'memory': '2Gi',
|
|
'cpu': '1000m'
|
|
},
|
|
security_context={
|
|
'runAsNonRoot': True,
|
|
'runAsUser': 1000,
|
|
'fsGroup': 1000,
|
|
'readOnlyRootFilesystem': False
|
|
},
|
|
health_check={
|
|
'path': '/health',
|
|
'port': 8000,
|
|
'initial_delay': 30,
|
|
'period': 10
|
|
},
|
|
sso_config={
|
|
'enabled': True,
|
|
'provider': 'oauth2',
|
|
'callback_path': '/auth/oauth/callback'
|
|
}
|
|
)
|
|
|
|
# Canvas LMS Template
|
|
self.service_templates['canvas'] = ServiceTemplate(
|
|
service_type='canvas',
|
|
image='instructure/canvas-lms:stable',
|
|
ports={'http': 3000},
|
|
environment={
|
|
'CANVAS_LMS_ADMIN_EMAIL': 'admin@${TENANT_DOMAIN}',
|
|
'CANVAS_LMS_ADMIN_PASSWORD': '${CANVAS_ADMIN_PASSWORD}',
|
|
'CANVAS_LMS_ACCOUNT_NAME': '${TENANT_NAME}',
|
|
'CANVAS_LMS_STATS_COLLECTION': 'opt_out',
|
|
'POSTGRES_PASSWORD': '${POSTGRES_PASSWORD}',
|
|
'DATABASE_CACHE_URL': 'postgresql://gt2_tenant_user:gt2_tenant_dev_password@tenant-postgres:5432/gt2_tenants'
|
|
},
|
|
volumes=[
|
|
{'name': 'canvas-data', 'mountPath': '/app/log', 'size': '10Gi'},
|
|
{'name': 'canvas-files', 'mountPath': '/app/public/files', 'size': '20Gi'}
|
|
],
|
|
resource_limits={
|
|
'memory': '4Gi',
|
|
'cpu': '2000m'
|
|
},
|
|
security_context={
|
|
'runAsNonRoot': True,
|
|
'runAsUser': 1000,
|
|
'fsGroup': 1000
|
|
},
|
|
health_check={
|
|
'path': '/health_check',
|
|
'port': 3000,
|
|
'initial_delay': 60,
|
|
'period': 15
|
|
},
|
|
sso_config={
|
|
'enabled': True,
|
|
'provider': 'saml',
|
|
'metadata_url': '/auth/saml/metadata'
|
|
}
|
|
)
|
|
|
|
# Guacamole Template
|
|
self.service_templates['guacamole'] = ServiceTemplate(
|
|
service_type='guacamole',
|
|
image='guacamole/guacamole:1.5.3',
|
|
ports={'http': 8080},
|
|
environment={
|
|
'GUACD_HOSTNAME': 'guacd',
|
|
'GUACD_PORT': '4822',
|
|
'MYSQL_HOSTNAME': 'mysql',
|
|
'MYSQL_PORT': '3306',
|
|
'MYSQL_DATABASE': 'guacamole_db',
|
|
'MYSQL_USER': 'guacamole_user',
|
|
'MYSQL_PASSWORD': '${MYSQL_PASSWORD}',
|
|
'GUAC_LOG_LEVEL': 'INFO'
|
|
},
|
|
volumes=[
|
|
{'name': 'guacamole-data', 'mountPath': '/config', 'size': '1Gi'},
|
|
{'name': 'guacamole-recordings', 'mountPath': '/recordings', 'size': '10Gi'}
|
|
],
|
|
resource_limits={
|
|
'memory': '1Gi',
|
|
'cpu': '500m'
|
|
},
|
|
security_context={
|
|
'runAsNonRoot': True,
|
|
'runAsUser': 1001,
|
|
'fsGroup': 1001
|
|
},
|
|
health_check={
|
|
'path': '/guacamole',
|
|
'port': 8080,
|
|
'initial_delay': 45,
|
|
'period': 10
|
|
},
|
|
sso_config={
|
|
'enabled': True,
|
|
'provider': 'openid',
|
|
'extension': 'guacamole-auth-openid'
|
|
}
|
|
)
|
|
|
|
# JupyterHub Template
|
|
self.service_templates['jupyter'] = ServiceTemplate(
|
|
service_type='jupyter',
|
|
image='jupyterhub/jupyterhub:4.0',
|
|
ports={'http': 8000},
|
|
environment={
|
|
'JUPYTERHUB_CRYPT_KEY': '${JUPYTERHUB_CRYPT_KEY}',
|
|
'CONFIGPROXY_AUTH_TOKEN': '${CONFIGPROXY_AUTH_TOKEN}',
|
|
'DOCKER_NETWORK_NAME': 'jupyterhub',
|
|
'DOCKER_NOTEBOOK_IMAGE': 'jupyter/datascience-notebook:lab-4.0.7'
|
|
},
|
|
volumes=[
|
|
{'name': 'jupyter-data', 'mountPath': '/srv/jupyterhub', 'size': '5Gi'},
|
|
{'name': 'docker-socket', 'mountPath': '/var/run/docker.sock', 'hostPath': '/var/run/docker.sock'}
|
|
],
|
|
resource_limits={
|
|
'memory': '2Gi',
|
|
'cpu': '1000m'
|
|
},
|
|
security_context={
|
|
'runAsNonRoot': False, # Needs Docker access
|
|
'runAsUser': 0,
|
|
'privileged': True
|
|
},
|
|
health_check={
|
|
'path': '/hub/health',
|
|
'port': 8000,
|
|
'initial_delay': 30,
|
|
'period': 15
|
|
},
|
|
sso_config={
|
|
'enabled': True,
|
|
'provider': 'oauth',
|
|
'authenticator_class': 'oauthenticator.generic.GenericOAuthenticator'
|
|
}
|
|
)
|
|
|
|
async def create_service_instance(
|
|
self,
|
|
tenant_id: str,
|
|
service_type: str,
|
|
config_overrides: Dict[str, Any] = None
|
|
) -> ServiceInstance:
|
|
"""Create a new service instance for a tenant"""
|
|
|
|
if service_type not in self.service_templates:
|
|
raise ValueError(f"Unsupported service type: {service_type}")
|
|
|
|
template = self.service_templates[service_type]
|
|
instance_id = f"{service_type}-{tenant_id}-{uuid.uuid4().hex[:8]}"
|
|
namespace = f"{self.base_namespace}-{tenant_id}"
|
|
|
|
# Generate unique ports
|
|
external_port = await self._get_available_port()
|
|
|
|
# Create service instance object
|
|
instance = ServiceInstance(
|
|
instance_id=instance_id,
|
|
tenant_id=tenant_id,
|
|
service_type=service_type,
|
|
status='starting',
|
|
endpoint_url=f"https://{service_type}.{tenant_id}.gt2.com",
|
|
internal_port=template.ports['http'],
|
|
external_port=external_port,
|
|
namespace=namespace,
|
|
deployment_name=f"{service_type}-{instance_id}",
|
|
service_name=f"{service_type}-service-{instance_id}",
|
|
ingress_name=f"{service_type}-ingress-{instance_id}",
|
|
resource_usage={'cpu': 0, 'memory': 0, 'storage': 0}
|
|
)
|
|
|
|
try:
|
|
# Create Kubernetes namespace if not exists
|
|
await self._create_namespace(namespace, tenant_id)
|
|
|
|
# Deploy the service
|
|
await self._deploy_service(instance, template, config_overrides)
|
|
|
|
# Generate SSO token
|
|
instance.sso_token = await self._generate_sso_token(instance)
|
|
|
|
# Store instance
|
|
self.active_instances[instance_id] = instance
|
|
await self._persist_instance(instance)
|
|
|
|
logger.info(f"Created {service_type} instance {instance_id} for tenant {tenant_id}")
|
|
return instance
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to create service instance: {e}")
|
|
instance.status = 'error'
|
|
raise
|
|
|
|
async def _create_namespace(self, namespace: str, tenant_id: str):
|
|
"""Create Kubernetes namespace with proper labeling and network policies"""
|
|
|
|
if not self.k8s_client:
|
|
logger.info(f"Mock: Created namespace {namespace}")
|
|
return
|
|
|
|
v1 = client.CoreV1Api(self.k8s_client)
|
|
|
|
# Create namespace
|
|
namespace_manifest = client.V1Namespace(
|
|
metadata=client.V1ObjectMeta(
|
|
name=namespace,
|
|
labels={
|
|
'gt.tenant-id': tenant_id,
|
|
'gt.cluster': 'resource',
|
|
'gt.isolation': 'tenant'
|
|
},
|
|
annotations={
|
|
'gt.created-by': 'service-manager',
|
|
'gt.creation-time': datetime.utcnow().isoformat()
|
|
}
|
|
)
|
|
)
|
|
|
|
try:
|
|
v1.create_namespace(namespace_manifest)
|
|
logger.info(f"Created namespace: {namespace}")
|
|
except ApiException as e:
|
|
if e.status == 409: # Already exists
|
|
logger.info(f"Namespace {namespace} already exists")
|
|
else:
|
|
raise
|
|
|
|
# Apply network policy for tenant isolation
|
|
await self._apply_network_policy(namespace, tenant_id)
|
|
|
|
async def _apply_network_policy(self, namespace: str, tenant_id: str):
|
|
"""Apply network policy for tenant isolation"""
|
|
|
|
if not self.k8s_client:
|
|
logger.info(f"Mock: Applied network policy to {namespace}")
|
|
return
|
|
|
|
networking_v1 = client.NetworkingV1Api(self.k8s_client)
|
|
|
|
# Network policy that only allows:
|
|
# 1. Intra-namespace communication
|
|
# 2. Communication to system namespaces (DNS, etc.)
|
|
# 3. Egress to external services (for updates, etc.)
|
|
network_policy = client.V1NetworkPolicy(
|
|
metadata=client.V1ObjectMeta(
|
|
name=f"tenant-isolation-{tenant_id}",
|
|
namespace=namespace,
|
|
labels={'gt.tenant-id': tenant_id}
|
|
),
|
|
spec=client.V1NetworkPolicySpec(
|
|
pod_selector=client.V1LabelSelector(), # All pods in namespace
|
|
policy_types=['Ingress', 'Egress'],
|
|
ingress=[
|
|
# Allow ingress from same namespace
|
|
client.V1NetworkPolicyIngressRule(
|
|
from_=[client.V1NetworkPolicyPeer(
|
|
namespace_selector=client.V1LabelSelector(
|
|
match_labels={'name': namespace}
|
|
)
|
|
)]
|
|
),
|
|
# Allow ingress from ingress controller
|
|
client.V1NetworkPolicyIngressRule(
|
|
from_=[client.V1NetworkPolicyPeer(
|
|
namespace_selector=client.V1LabelSelector(
|
|
match_labels={'name': 'ingress-nginx'}
|
|
)
|
|
)]
|
|
)
|
|
],
|
|
egress=[
|
|
# Allow egress within namespace
|
|
client.V1NetworkPolicyEgressRule(
|
|
to=[client.V1NetworkPolicyPeer(
|
|
namespace_selector=client.V1LabelSelector(
|
|
match_labels={'name': namespace}
|
|
)
|
|
)]
|
|
),
|
|
# Allow DNS
|
|
client.V1NetworkPolicyEgressRule(
|
|
to=[client.V1NetworkPolicyPeer(
|
|
namespace_selector=client.V1LabelSelector(
|
|
match_labels={'name': 'kube-system'}
|
|
)
|
|
)],
|
|
ports=[client.V1NetworkPolicyPort(port=53, protocol='UDP')]
|
|
),
|
|
# Allow external HTTPS (for updates, etc.)
|
|
client.V1NetworkPolicyEgressRule(
|
|
ports=[
|
|
client.V1NetworkPolicyPort(port=443, protocol='TCP'),
|
|
client.V1NetworkPolicyPort(port=80, protocol='TCP')
|
|
]
|
|
)
|
|
]
|
|
)
|
|
)
|
|
|
|
try:
|
|
networking_v1.create_namespaced_network_policy(
|
|
namespace=namespace,
|
|
body=network_policy
|
|
)
|
|
logger.info(f"Applied network policy to namespace: {namespace}")
|
|
except ApiException as e:
|
|
if e.status == 409: # Already exists
|
|
logger.info(f"Network policy already exists in {namespace}")
|
|
else:
|
|
logger.error(f"Failed to create network policy: {e}")
|
|
raise
|
|
|
|
async def _deploy_service(
|
|
self,
|
|
instance: ServiceInstance,
|
|
template: ServiceTemplate,
|
|
config_overrides: Dict[str, Any] = None
|
|
):
|
|
"""Deploy service to Kubernetes cluster"""
|
|
|
|
if not self.k8s_client:
|
|
logger.info(f"Mock: Deployed {template.service_type} service")
|
|
instance.status = 'running'
|
|
return
|
|
|
|
# Prepare environment variables with tenant-specific values
|
|
environment = template.environment.copy()
|
|
if config_overrides:
|
|
environment.update(config_overrides.get('environment', {}))
|
|
|
|
# Substitute tenant-specific values
|
|
env_vars = []
|
|
for key, value in environment.items():
|
|
substituted_value = value.replace('${TENANT_ID}', instance.tenant_id)
|
|
substituted_value = substituted_value.replace('${TENANT_DOMAIN}', f"{instance.tenant_id}.gt2.com")
|
|
env_vars.append(client.V1EnvVar(name=key, value=substituted_value))
|
|
|
|
# Create volumes
|
|
volumes = []
|
|
volume_mounts = []
|
|
for vol_config in template.volumes:
|
|
vol_name = f"{vol_config['name']}-{instance.instance_id}"
|
|
volumes.append(client.V1Volume(
|
|
name=vol_name,
|
|
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
|
|
claim_name=vol_name
|
|
)
|
|
))
|
|
volume_mounts.append(client.V1VolumeMount(
|
|
name=vol_name,
|
|
mount_path=vol_config['mountPath']
|
|
))
|
|
|
|
# Create PVCs first
|
|
await self._create_persistent_volumes(instance, template)
|
|
|
|
# Create deployment
|
|
deployment = client.V1Deployment(
|
|
metadata=client.V1ObjectMeta(
|
|
name=instance.deployment_name,
|
|
namespace=instance.namespace,
|
|
labels={
|
|
'app': template.service_type,
|
|
'instance': instance.instance_id,
|
|
'gt.tenant-id': instance.tenant_id,
|
|
'gt.service-type': template.service_type
|
|
}
|
|
),
|
|
spec=client.V1DeploymentSpec(
|
|
replicas=1,
|
|
selector=client.V1LabelSelector(
|
|
match_labels={'instance': instance.instance_id}
|
|
),
|
|
template=client.V1PodTemplateSpec(
|
|
metadata=client.V1ObjectMeta(
|
|
labels={
|
|
'app': template.service_type,
|
|
'instance': instance.instance_id,
|
|
'gt.tenant-id': instance.tenant_id
|
|
}
|
|
),
|
|
spec=client.V1PodSpec(
|
|
containers=[client.V1Container(
|
|
name=template.service_type,
|
|
image=template.image,
|
|
ports=[client.V1ContainerPort(
|
|
container_port=template.ports['http']
|
|
)],
|
|
env=env_vars,
|
|
volume_mounts=volume_mounts,
|
|
resources=client.V1ResourceRequirements(
|
|
limits=template.resource_limits,
|
|
requests=template.resource_limits
|
|
),
|
|
security_context=client.V1SecurityContext(**template.security_context),
|
|
liveness_probe=client.V1Probe(
|
|
http_get=client.V1HTTPGetAction(
|
|
path=template.health_check['path'],
|
|
port=template.health_check['port']
|
|
),
|
|
initial_delay_seconds=template.health_check['initial_delay'],
|
|
period_seconds=template.health_check['period']
|
|
),
|
|
readiness_probe=client.V1Probe(
|
|
http_get=client.V1HTTPGetAction(
|
|
path=template.health_check['path'],
|
|
port=template.health_check['port']
|
|
),
|
|
initial_delay_seconds=10,
|
|
period_seconds=5
|
|
)
|
|
)],
|
|
volumes=volumes,
|
|
security_context=client.V1PodSecurityContext(
|
|
run_as_non_root=template.security_context.get('runAsNonRoot', True),
|
|
fs_group=template.security_context.get('fsGroup', 1000)
|
|
)
|
|
)
|
|
)
|
|
)
|
|
)
|
|
|
|
# Deploy to Kubernetes
|
|
apps_v1 = client.AppsV1Api(self.k8s_client)
|
|
apps_v1.create_namespaced_deployment(
|
|
namespace=instance.namespace,
|
|
body=deployment
|
|
)
|
|
|
|
# Create service
|
|
await self._create_service(instance, template)
|
|
|
|
# Create ingress
|
|
await self._create_ingress(instance, template)
|
|
|
|
logger.info(f"Deployed {template.service_type} service: {instance.deployment_name}")
|
|
|
|
async def _create_persistent_volumes(self, instance: ServiceInstance, template: ServiceTemplate):
|
|
"""Create persistent volume claims for the service"""
|
|
|
|
if not self.k8s_client:
|
|
return
|
|
|
|
v1 = client.CoreV1Api(self.k8s_client)
|
|
|
|
for vol_config in template.volumes:
|
|
if 'hostPath' in vol_config: # Skip host path volumes
|
|
continue
|
|
|
|
pvc_name = f"{vol_config['name']}-{instance.instance_id}"
|
|
|
|
pvc = client.V1PersistentVolumeClaim(
|
|
metadata=client.V1ObjectMeta(
|
|
name=pvc_name,
|
|
namespace=instance.namespace,
|
|
labels={
|
|
'app': template.service_type,
|
|
'instance': instance.instance_id,
|
|
'gt.tenant-id': instance.tenant_id
|
|
}
|
|
),
|
|
spec=client.V1PersistentVolumeClaimSpec(
|
|
access_modes=['ReadWriteOnce'],
|
|
resources=client.V1ResourceRequirements(
|
|
requests={'storage': vol_config['size']}
|
|
),
|
|
storage_class_name='fast-ssd' # Assuming SSD storage class
|
|
)
|
|
)
|
|
|
|
try:
|
|
v1.create_namespaced_persistent_volume_claim(
|
|
namespace=instance.namespace,
|
|
body=pvc
|
|
)
|
|
logger.info(f"Created PVC: {pvc_name}")
|
|
except ApiException as e:
|
|
if e.status != 409: # Ignore if already exists
|
|
raise
|
|
|
|
async def _create_service(self, instance: ServiceInstance, template: ServiceTemplate):
|
|
"""Create Kubernetes service for the instance"""
|
|
|
|
if not self.k8s_client:
|
|
return
|
|
|
|
v1 = client.CoreV1Api(self.k8s_client)
|
|
|
|
service = client.V1Service(
|
|
metadata=client.V1ObjectMeta(
|
|
name=instance.service_name,
|
|
namespace=instance.namespace,
|
|
labels={
|
|
'app': template.service_type,
|
|
'instance': instance.instance_id,
|
|
'gt.tenant-id': instance.tenant_id
|
|
}
|
|
),
|
|
spec=client.V1ServiceSpec(
|
|
selector={'instance': instance.instance_id},
|
|
ports=[client.V1ServicePort(
|
|
port=80,
|
|
target_port=template.ports['http'],
|
|
protocol='TCP'
|
|
)],
|
|
type='ClusterIP'
|
|
)
|
|
)
|
|
|
|
v1.create_namespaced_service(
|
|
namespace=instance.namespace,
|
|
body=service
|
|
)
|
|
|
|
logger.info(f"Created service: {instance.service_name}")
|
|
|
|
async def _create_ingress(self, instance: ServiceInstance, template: ServiceTemplate):
|
|
"""Create ingress for external access with TLS"""
|
|
|
|
if not self.k8s_client:
|
|
return
|
|
|
|
networking_v1 = client.NetworkingV1Api(self.k8s_client)
|
|
|
|
hostname = f"{template.service_type}.{instance.tenant_id}.gt2.com"
|
|
|
|
ingress = client.V1Ingress(
|
|
metadata=client.V1ObjectMeta(
|
|
name=instance.ingress_name,
|
|
namespace=instance.namespace,
|
|
labels={
|
|
'app': template.service_type,
|
|
'instance': instance.instance_id,
|
|
'gt.tenant-id': instance.tenant_id
|
|
},
|
|
annotations={
|
|
'kubernetes.io/ingress.class': 'nginx',
|
|
'cert-manager.io/cluster-issuer': 'letsencrypt-prod',
|
|
'nginx.ingress.kubernetes.io/ssl-redirect': 'true',
|
|
'nginx.ingress.kubernetes.io/force-ssl-redirect': 'true',
|
|
'nginx.ingress.kubernetes.io/auth-url': f'https://auth.{instance.tenant_id}.gt2.com/auth',
|
|
'nginx.ingress.kubernetes.io/auth-signin': f'https://auth.{instance.tenant_id}.gt2.com/signin'
|
|
}
|
|
),
|
|
spec=client.V1IngressSpec(
|
|
tls=[client.V1IngressTLS(
|
|
hosts=[hostname],
|
|
secret_name=f"{template.service_type}-tls-{instance.instance_id}"
|
|
)],
|
|
rules=[client.V1IngressRule(
|
|
host=hostname,
|
|
http=client.V1HTTPIngressRuleValue(
|
|
paths=[client.V1HTTPIngressPath(
|
|
path='/',
|
|
path_type='Prefix',
|
|
backend=client.V1IngressBackend(
|
|
service=client.V1IngressServiceBackend(
|
|
name=instance.service_name,
|
|
port=client.V1ServiceBackendPort(number=80)
|
|
)
|
|
)
|
|
)]
|
|
)
|
|
)]
|
|
)
|
|
)
|
|
|
|
networking_v1.create_namespaced_ingress(
|
|
namespace=instance.namespace,
|
|
body=ingress
|
|
)
|
|
|
|
logger.info(f"Created ingress: {instance.ingress_name} for {hostname}")
|
|
|
|
async def _get_available_port(self) -> int:
|
|
"""Get next available port for service"""
|
|
used_ports = {instance.external_port for instance in self.active_instances.values()}
|
|
port = 30000 # Start from NodePort range
|
|
while port in used_ports:
|
|
port += 1
|
|
return port
|
|
|
|
async def _generate_sso_token(self, instance: ServiceInstance) -> str:
|
|
"""Generate SSO token for iframe embedding"""
|
|
token_data = {
|
|
'tenant_id': instance.tenant_id,
|
|
'service_type': instance.service_type,
|
|
'instance_id': instance.instance_id,
|
|
'expires_at': (datetime.utcnow() + timedelta(hours=24)).isoformat(),
|
|
'permissions': ['read', 'write', 'admin']
|
|
}
|
|
|
|
# Encrypt the token data
|
|
encrypted_token = encrypt_data(json.dumps(token_data))
|
|
return encrypted_token.decode('utf-8')
|
|
|
|
async def get_service_instance(self, instance_id: str) -> Optional[ServiceInstance]:
|
|
"""Get service instance by ID"""
|
|
return self.active_instances.get(instance_id)
|
|
|
|
async def list_tenant_instances(self, tenant_id: str) -> List[ServiceInstance]:
|
|
"""List all service instances for a tenant"""
|
|
return [
|
|
instance for instance in self.active_instances.values()
|
|
if instance.tenant_id == tenant_id
|
|
]
|
|
|
|
async def stop_service_instance(self, instance_id: str) -> bool:
|
|
"""Stop a running service instance"""
|
|
instance = self.active_instances.get(instance_id)
|
|
if not instance:
|
|
return False
|
|
|
|
try:
|
|
instance.status = 'stopping'
|
|
|
|
if self.k8s_client:
|
|
# Delete Kubernetes resources
|
|
await self._cleanup_kubernetes_resources(instance)
|
|
|
|
instance.status = 'stopped'
|
|
logger.info(f"Stopped service instance: {instance_id}")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to stop instance {instance_id}: {e}")
|
|
instance.status = 'error'
|
|
return False
|
|
|
|
async def _cleanup_kubernetes_resources(self, instance: ServiceInstance):
|
|
"""Clean up all Kubernetes resources for an instance"""
|
|
|
|
if not self.k8s_client:
|
|
return
|
|
|
|
apps_v1 = client.AppsV1Api(self.k8s_client)
|
|
v1 = client.CoreV1Api(self.k8s_client)
|
|
networking_v1 = client.NetworkingV1Api(self.k8s_client)
|
|
|
|
try:
|
|
# Delete deployment
|
|
apps_v1.delete_namespaced_deployment(
|
|
name=instance.deployment_name,
|
|
namespace=instance.namespace,
|
|
body=client.V1DeleteOptions()
|
|
)
|
|
|
|
# Delete service
|
|
v1.delete_namespaced_service(
|
|
name=instance.service_name,
|
|
namespace=instance.namespace,
|
|
body=client.V1DeleteOptions()
|
|
)
|
|
|
|
# Delete ingress
|
|
networking_v1.delete_namespaced_ingress(
|
|
name=instance.ingress_name,
|
|
namespace=instance.namespace,
|
|
body=client.V1DeleteOptions()
|
|
)
|
|
|
|
# Delete PVCs (optional - may want to preserve data)
|
|
# Note: In production, you might want to keep PVCs for data persistence
|
|
|
|
logger.info(f"Cleaned up Kubernetes resources for: {instance.instance_id}")
|
|
|
|
except ApiException as e:
|
|
logger.error(f"Error cleaning up resources: {e}")
|
|
raise
|
|
|
|
async def get_service_health(self, instance_id: str) -> Dict[str, Any]:
|
|
"""Get health status of a service instance"""
|
|
instance = self.active_instances.get(instance_id)
|
|
if not instance:
|
|
return {'status': 'not_found'}
|
|
|
|
if not self.k8s_client:
|
|
return {
|
|
'status': 'healthy',
|
|
'instance_status': instance.status,
|
|
'endpoint': instance.endpoint_url,
|
|
'last_check': datetime.utcnow().isoformat()
|
|
}
|
|
|
|
# Check Kubernetes pod status
|
|
v1 = client.CoreV1Api(self.k8s_client)
|
|
|
|
try:
|
|
pods = v1.list_namespaced_pod(
|
|
namespace=instance.namespace,
|
|
label_selector=f'instance={instance.instance_id}'
|
|
)
|
|
|
|
if not pods.items:
|
|
return {
|
|
'status': 'no_pods',
|
|
'instance_status': instance.status
|
|
}
|
|
|
|
pod = pods.items[0]
|
|
pod_status = 'unknown'
|
|
|
|
if pod.status.phase == 'Running':
|
|
# Check container status
|
|
if pod.status.container_statuses:
|
|
container_status = pod.status.container_statuses[0]
|
|
if container_status.ready:
|
|
pod_status = 'healthy'
|
|
else:
|
|
pod_status = 'unhealthy'
|
|
else:
|
|
pod_status = 'starting'
|
|
elif pod.status.phase == 'Pending':
|
|
pod_status = 'starting'
|
|
elif pod.status.phase == 'Failed':
|
|
pod_status = 'failed'
|
|
|
|
# Update instance heartbeat
|
|
instance.last_heartbeat = datetime.utcnow()
|
|
|
|
return {
|
|
'status': pod_status,
|
|
'instance_status': instance.status,
|
|
'pod_phase': pod.status.phase,
|
|
'endpoint': instance.endpoint_url,
|
|
'last_check': datetime.utcnow().isoformat(),
|
|
'restart_count': pod.status.container_statuses[0].restart_count if pod.status.container_statuses else 0
|
|
}
|
|
|
|
except ApiException as e:
|
|
logger.error(f"Failed to get health for {instance_id}: {e}")
|
|
return {
|
|
'status': 'error',
|
|
'error': str(e),
|
|
'instance_status': instance.status
|
|
}
|
|
|
|
async def _persist_instance(self, instance: ServiceInstance):
|
|
"""Persist instance data to disk"""
|
|
instance_file = self.storage_path / f"{instance.instance_id}.json"
|
|
|
|
with open(instance_file, 'w') as f:
|
|
json.dump(instance.to_dict(), f, indent=2)
|
|
|
|
def _load_persistent_instances(self):
|
|
"""Load persistent instances from disk on startup"""
|
|
if not self.storage_path.exists():
|
|
return
|
|
|
|
for instance_file in self.storage_path.glob("*.json"):
|
|
try:
|
|
with open(instance_file, 'r') as f:
|
|
data = json.load(f)
|
|
|
|
# Reconstruct instance object
|
|
instance = ServiceInstance(
|
|
instance_id=data['instance_id'],
|
|
tenant_id=data['tenant_id'],
|
|
service_type=data['service_type'],
|
|
status=data['status'],
|
|
endpoint_url=data['endpoint_url'],
|
|
internal_port=data['internal_port'],
|
|
external_port=data['external_port'],
|
|
namespace=data['namespace'],
|
|
deployment_name=data['deployment_name'],
|
|
service_name=data['service_name'],
|
|
ingress_name=data['ingress_name'],
|
|
sso_token=data.get('sso_token'),
|
|
created_at=datetime.fromisoformat(data['created_at']),
|
|
last_heartbeat=datetime.fromisoformat(data['last_heartbeat']),
|
|
resource_usage=data.get('resource_usage', {})
|
|
)
|
|
|
|
self.active_instances[instance.instance_id] = instance
|
|
logger.info(f"Loaded persistent instance: {instance.instance_id}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to load instance from {instance_file}: {e}")
|
|
|
|
async def cleanup_orphaned_resources(self):
|
|
"""Clean up orphaned Kubernetes resources"""
|
|
if not self.k8s_client:
|
|
return
|
|
|
|
logger.info("Starting cleanup of orphaned resources...")
|
|
|
|
# This would implement logic to find and clean up:
|
|
# 1. Deployments without corresponding instances
|
|
# 2. Services without deployments
|
|
# 3. Unused PVCs
|
|
# 4. Expired certificates
|
|
|
|
# Implementation would query Kubernetes for resources with GT labels
|
|
# and cross-reference with active instances
|
|
|
|
logger.info("Cleanup completed") |