Security hardening release addressing CodeQL and Dependabot alerts: - Fix stack trace exposure in error responses - Add SSRF protection with DNS resolution checking - Implement proper URL hostname validation (replaces substring matching) - Add centralized path sanitization to prevent path traversal - Fix ReDoS vulnerability in email validation regex - Improve HTML sanitization in validation utilities - Fix capability wildcard matching in auth utilities - Update glob dependency to address CVE - Add CodeQL suppression comments for verified false positives 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
636 lines
25 KiB
Python
636 lines
25 KiB
Python
"""
|
|
GT 2.0 Tenant Backend - External Service Management
|
|
Business logic for managing external web services with Resource Cluster integration
|
|
"""
|
|
|
|
import asyncio
|
|
import httpx
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from sqlalchemy import select, update, delete, and_
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.models.external_service import ExternalServiceInstance, ServiceAccessLog, ServiceTemplate
|
|
from app.core.config import get_settings
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
# Application settings object, fetched once when this module is imported.
settings = get_settings()
|
|
|
|
|
|
class ExternalServiceManager:
|
|
"""Manages external service instances and Resource Cluster integration"""
|
|
|
|
def __init__(self, db: AsyncSession):
    """Bind the manager to a database session and resolve the cluster URL.

    Args:
        db: Async SQLAlchemy session used for every query and commit
            issued by this manager.

    The Resource Cluster base URL comes from ``settings.resource_cluster_url``;
    when that value is falsy the hard-coded default below is used. No
    capability token is held initially (see ``set_capability_token``).
    """
    configured_url = settings.resource_cluster_url
    if not configured_url:
        configured_url = "http://resource-cluster:8003"

    self.capability_token = None
    self.resource_cluster_base_url = configured_url
    self.db = db
|
|
|
|
def set_capability_token(self, token: str) -> None:
    """Store the token used to authenticate Resource Cluster API calls.

    Args:
        token: Capability token; the Resource Cluster helper methods send it
            as an ``Authorization: Bearer`` header and refuse to run while it
            is unset.
    """
    self.capability_token = token
|
|
|
|
async def create_service_instance(
    self,
    service_type: str,
    service_name: str,
    user_email: str,
    config_overrides: Optional[Dict[str, Any]] = None,
    template_id: Optional[str] = None
) -> ExternalServiceInstance:
    """Provision a service in the Resource Cluster and persist a record of it.

    The effective configuration is the template's ``default_config`` (if a
    template is given) overlaid with ``config_overrides``. The instance is
    created remotely first, then stored locally with ``user_email`` as both
    creator and sole allowed user.

    Args:
        service_type: One of ``'ctfd'``, ``'canvas'`` or ``'guacamole'``.
        service_name: Display name stored on the record.
        user_email: Creator of the instance.
        config_overrides: Optional settings that take precedence over the
            template defaults.
        template_id: Optional template to take defaults and resource
            requirements from.

    Returns:
        The committed and refreshed ``ExternalServiceInstance``.

    Raises:
        ValueError: Unsupported ``service_type`` or unknown ``template_id``
            (also raised by the cluster helper when no token is set).
        RuntimeError: The Resource Cluster call failed.
    """
    supported_services = ('ctfd', 'canvas', 'guacamole')
    if service_type not in supported_services:
        raise ValueError(f"Unsupported service type: {service_type}")

    template = None
    if template_id:
        template = await self.get_service_template(template_id)
        if not template:
            raise ValueError(f"Template {template_id} not found")

    # A template may carry no default configuration at all; treat a missing
    # value as empty instead of failing in dict.update()/.get().
    template_defaults = (template.default_config or {}) if template else {}
    service_config = {**template_defaults, **(config_overrides or {})}

    resource_instance = await self._create_resource_cluster_instance(
        service_type=service_type,
        config_overrides=service_config
    )

    instance = ExternalServiceInstance(
        service_type=service_type,
        service_name=service_name,
        description=f"{service_type.title()} instance for {user_email}",
        resource_instance_id=resource_instance['instance_id'],
        endpoint_url=resource_instance['endpoint_url'],
        status=resource_instance['status'],
        service_config=service_config,
        created_by=user_email,
        allowed_users=[user_email],
        resource_limits=template.resource_requirements if template else {},
        auto_start=template_defaults.get('auto_start', True)
    )

    self.db.add(instance)
    try:
        await self.db.commit()
    except Exception:
        # The remote instance already exists at this point. Roll the session
        # back so it stays usable and record the orphaned id so it can be
        # cleaned up; the original error still propagates to the caller.
        await self.db.rollback()
        logger.exception(
            "Resource Cluster instance %s was created but its database "
            "record could not be saved",
            resource_instance.get('instance_id')
        )
        raise
    await self.db.refresh(instance)

    logger.info(
        f"Created {service_type} service instance {instance.id} "
        f"for user {user_email}"
    )

    return instance
|
|
|
|
async def _create_resource_cluster_instance(
    self,
    service_type: str,
    config_overrides: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Create an instance through the Resource Cluster API, with retries.

    POSTs to ``/api/v1/services/instances`` using the capability token as a
    bearer credential. Up to three attempts are made; 500/502/503/504
    responses, timeouts and request errors are retried after an
    exponentially growing delay (1s, then 2s).

    Args:
        service_type: Service type forwarded in the request body.
        config_overrides: Configuration forwarded in the request body.

    Returns:
        The decoded JSON body of the 200 response.

    Raises:
        ValueError: If no capability token has been set.
        RuntimeError: On a non-retryable response, or once retries are
            exhausted. Transport failures are chained as ``__cause__``.
    """
    if not self.capability_token:
        raise ValueError("Capability token not set")

    max_retries = 3
    base_delay = 1.0
    retryable_statuses = (500, 502, 503, 504)

    # NOTE(review): a POST that timed out may still have been processed by
    # the Resource Cluster, so a retry could create a second instance.
    # Confirm the endpoint is idempotent or de-duplicates requests.
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        delay = base_delay * (2 ** attempt)

        try:
            timeout = httpx.Timeout(60.0, connect=10.0)
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.post(
                    f"{self.resource_cluster_base_url}/api/v1/services/instances",
                    json={
                        "service_type": service_type,
                        "config_overrides": config_overrides
                    },
                    headers={
                        "Authorization": f"Bearer {self.capability_token}",
                        "Content-Type": "application/json"
                    }
                )
        except httpx.TimeoutException as exc:
            # Must precede RequestError: timeouts are a subclass of it.
            if is_last_attempt:
                raise RuntimeError("Failed to create service instance: timeout after retries") from exc
            logger.warning(f"Service creation timeout (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
            await asyncio.sleep(delay)
            continue
        except httpx.RequestError as exc:
            if is_last_attempt:
                raise RuntimeError(f"Failed to create service instance: {exc}") from exc
            logger.warning(f"Service creation request error (attempt {attempt + 1}/{max_retries}): {exc}, retrying in {delay}s")
            await asyncio.sleep(delay)
            continue

        if response.status_code == 200:
            return response.json()

        if response.status_code in retryable_statuses and not is_last_attempt:
            logger.warning(f"Service creation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
            await asyncio.sleep(delay)
            continue

        try:
            error_detail = response.json().get('detail', f'HTTP {response.status_code}')
        except (ValueError, AttributeError):
            # ValueError: body is not valid JSON. AttributeError: body is JSON
            # but not an object. Anything else is a real bug and must surface.
            error_detail = f'HTTP {response.status_code}'
        raise RuntimeError(f"Failed to create service instance: {error_detail}")

    # Defensive: every loop iteration above returns, raises or continues.
    raise RuntimeError("Failed to create service instance: maximum retries exceeded")
|
|
|
|
async def get_service_instance(
    self,
    instance_id: str,
    user_email: str
) -> Optional[ExternalServiceInstance]:
    """Fetch one service instance, but only if the user may access it.

    Args:
        instance_id: Primary key of the instance to load.
        user_email: Requesting user; must satisfy the ``allowed_users`` filter.

    Returns:
        The matching instance, or ``None`` when no row satisfies both the id
        and the access condition.
    """
    id_matches = ExternalServiceInstance.id == instance_id

    # NOTE(review): this renders `json_extract_path_text` as a binary operator
    # followed by `@>`. Confirm the generated SQL is valid for the target
    # database and really tests membership of user_email in allowed_users.
    user_is_allowed = (
        ExternalServiceInstance.allowed_users
        .op('json_extract_path_text')('*')
        .op('@>')([user_email])
    )

    stmt = select(ExternalServiceInstance).where(and_(id_matches, user_is_allowed))
    rows = await self.db.execute(stmt)
    return rows.scalar_one_or_none()
|
|
|
|
async def list_user_services(
    self,
    user_email: str,
    service_type: Optional[str] = None,
    status: Optional[str] = None
) -> List[ExternalServiceInstance]:
    """Return the instances a user may access, newest first.

    Args:
        user_email: User whose accessible instances are listed.
        service_type: When truthy, keep only instances of this type.
        status: When truthy, keep only instances with this status.

    Returns:
        Matching instances ordered by ``created_at`` descending.
    """
    # NOTE(review): same JSON membership expression as get_service_instance;
    # confirm it produces valid SQL on the target database.
    conditions = [
        ExternalServiceInstance.allowed_users
        .op('json_extract_path_text')('*')
        .op('@>')([user_email])
    ]
    if service_type:
        conditions.append(ExternalServiceInstance.service_type == service_type)
    if status:
        conditions.append(ExternalServiceInstance.status == status)

    stmt = (
        select(ExternalServiceInstance)
        .where(*conditions)
        .order_by(ExternalServiceInstance.created_at.desc())
    )
    rows = await self.db.execute(stmt)
    return rows.scalars().all()
|
|
|
|
async def stop_service_instance(
    self,
    instance_id: str,
    user_email: str
) -> bool:
    """Stop an instance the user has access to and record the new status.

    Args:
        instance_id: Local id of the instance to stop.
        user_email: Requesting user; access is checked via
            ``get_service_instance``.

    Returns:
        The result reported by the Resource Cluster stop call. The database
        row is only marked ``'stopped'`` when that result is truthy.

    Raises:
        ValueError: If the instance does not exist or the user lacks access.
    """
    instance = await self.get_service_instance(instance_id, user_email)
    if not instance:
        raise ValueError(f"Service instance {instance_id} not found or access denied")

    stopped = await self._stop_resource_cluster_instance(instance.resource_instance_id)
    if not stopped:
        # Leave the stored status untouched when the cluster did not stop it.
        return stopped

    instance.status = 'stopped'
    instance.updated_at = datetime.utcnow()
    await self.db.commit()

    logger.info(f"Stopped {instance.service_type} instance {instance_id} by user {user_email}")
    return stopped
|
|
|
|
async def _stop_resource_cluster_instance(self, resource_instance_id: str) -> bool:
    """Ask the Resource Cluster to delete an instance, retrying transient errors.

    Sends ``DELETE /api/v1/services/instances/{resource_instance_id}`` with
    the capability token as a bearer credential. Up to three attempts are
    made; 500/502/503/504 responses, timeouts and request errors are retried
    after an exponentially growing delay (1s, then 2s).

    Args:
        resource_instance_id: Identifier of the instance in the Resource Cluster.

    Returns:
        ``True`` on a 200 response, or on 404 (treated as already stopped).
        ``False`` for every other outcome; failures are logged, not raised.

    Raises:
        ValueError: If no capability token has been set.
    """
    if not self.capability_token:
        raise ValueError("Capability token not set")

    max_retries = 3
    base_delay = 1.0
    transient_statuses = (500, 502, 503, 504)

    for attempt in range(max_retries):
        attempt_no = attempt + 1
        may_retry = attempt_no < max_retries
        delay = base_delay * (2 ** attempt)

        try:
            async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, connect=10.0)) as client:
                response = await client.delete(
                    f"{self.resource_cluster_base_url}/api/v1/services/instances/{resource_instance_id}",
                    headers={"Authorization": f"Bearer {self.capability_token}"}
                )

                status_code = response.status_code
                if status_code == 200:
                    return True

                if status_code == 404:
                    # Nothing left to stop, so report success.
                    logger.info(f"Instance {resource_instance_id} not found, assuming already stopped")
                    return True

                if may_retry and status_code in transient_statuses:
                    logger.warning(f"Instance stop failed (attempt {attempt_no}/{max_retries}), retrying in {delay}s")
                    await asyncio.sleep(delay)
                    continue

                logger.error(f"Failed to stop instance {resource_instance_id}: HTTP {status_code}")
                return False

        except httpx.TimeoutException:
            if not may_retry:
                logger.error(f"Failed to stop instance {resource_instance_id}: timeout after retries")
                return False
            logger.warning(f"Instance stop timeout (attempt {attempt_no}/{max_retries}), retrying in {delay}s")
            await asyncio.sleep(delay)
            continue

        except httpx.RequestError as exc:
            if not may_retry:
                logger.error(f"Failed to stop instance {resource_instance_id}: {exc}")
                return False
            logger.warning(f"Instance stop request error (attempt {attempt_no}/{max_retries}): {exc}, retrying in {delay}s")
            await asyncio.sleep(delay)
            continue

    logger.error(f"Failed to stop instance {resource_instance_id}: maximum retries exceeded")
    return False
|
|
|
|
async def get_service_health(
    self,
    instance_id: str,
    user_email: str
) -> Dict[str, Any]:
    """Query the Resource Cluster for an instance's health and cache it.

    The stored ``health_status`` and ``last_health_check`` are always
    refreshed; ``restart_count`` is written only when the reported value
    differs from the stored one.

    Args:
        instance_id: Local id of the instance.
        user_email: Requesting user; access is checked via
            ``get_service_instance``.

    Returns:
        The health mapping returned by the Resource Cluster helper, unchanged.

    Raises:
        ValueError: If the instance does not exist or the user lacks access.
    """
    instance = await self.get_service_instance(instance_id, user_email)
    if not instance:
        raise ValueError(f"Service instance {instance_id} not found or access denied")

    health = await self._get_resource_cluster_health(instance.resource_instance_id)

    instance.health_status = health.get('status', 'unknown')
    instance.last_health_check = datetime.utcnow()

    reported_restarts = health.get('restart_count', 0)
    if reported_restarts != instance.restart_count:
        instance.restart_count = reported_restarts

    await self.db.commit()
    return health
|
|
|
|
async def _get_resource_cluster_health(self, resource_instance_id: str) -> Dict[str, Any]:
    """Fetch an instance's health from the Resource Cluster (single attempt).

    Sends ``GET /api/v1/services/health/{resource_instance_id}`` with the
    capability token as a bearer credential. Transport problems are turned
    into status dictionaries instead of exceptions.

    Args:
        resource_instance_id: Identifier of the instance in the Resource Cluster.

    Returns:
        The decoded JSON body on 200. Otherwise a dict with ``'status'`` set
        to ``'not_found'`` (404), ``'error'`` (any other status),
        ``'timeout'`` or ``'connection_error'``, plus an ``'error'`` message.

    Raises:
        ValueError: If no capability token has been set.
    """
    if not self.capability_token:
        raise ValueError("Capability token not set")

    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=5.0)) as client:
            response = await client.get(
                f"{self.resource_cluster_base_url}/api/v1/services/health/{resource_instance_id}",
                headers={"Authorization": f"Bearer {self.capability_token}"}
            )
    except httpx.TimeoutException:
        logger.warning(f"Health check timeout for instance {resource_instance_id}")
        return {'status': 'timeout', 'error': 'Health check timeout'}
    except httpx.RequestError as exc:
        logger.warning(f"Health check request error for instance {resource_instance_id}: {exc}")
        return {'status': 'connection_error', 'error': f'Connection error: {exc}'}

    status_code = response.status_code
    if status_code == 200:
        return response.json()
    if status_code == 404:
        return {'status': 'not_found', 'error': 'Instance not found'}
    return {'status': 'error', 'error': f'Health check failed: HTTP {status_code}'}
|
|
|
|
async def generate_sso_token(
    self,
    instance_id: str,
    user_email: str
) -> Dict[str, Any]:
    """Obtain an SSO token for an instance and stamp its last access time.

    Args:
        instance_id: Local id of the instance.
        user_email: Requesting user; access is checked via
            ``get_service_instance``.

    Returns:
        The SSO payload returned by the Resource Cluster helper, unchanged.

    Raises:
        ValueError: If the instance does not exist or the user lacks access.
    """
    instance = await self.get_service_instance(instance_id, user_email)
    if not instance:
        raise ValueError(f"Service instance {instance_id} not found or access denied")

    cluster_instance_id = instance.resource_instance_id
    sso_data = await self._generate_resource_cluster_sso_token(cluster_instance_id)

    # Only reached when the token was issued; a failed call raises above.
    instance.last_accessed = datetime.utcnow()
    await self.db.commit()

    return sso_data
|
|
|
|
async def _generate_resource_cluster_sso_token(self, resource_instance_id: str) -> Dict[str, Any]:
    """Request an SSO token from the Resource Cluster API, with retries.

    POSTs to ``/api/v1/services/sso-token/{resource_instance_id}`` using the
    capability token as a bearer credential. Up to three attempts are made;
    500/502/503/504 responses, timeouts and request errors are retried after
    an exponentially growing delay (1s, then 2s).

    Args:
        resource_instance_id: Identifier of the instance in the Resource Cluster.

    Returns:
        The decoded JSON body of the 200 response.

    Raises:
        ValueError: If no capability token has been set.
        RuntimeError: On a non-retryable response, or once retries are
            exhausted. Transport failures are chained as ``__cause__``.
    """
    if not self.capability_token:
        raise ValueError("Capability token not set")

    max_retries = 3
    base_delay = 1.0
    retryable_statuses = (500, 502, 503, 504)

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        delay = base_delay * (2 ** attempt)

        try:
            timeout = httpx.Timeout(10.0, connect=5.0)
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.post(
                    f"{self.resource_cluster_base_url}/api/v1/services/sso-token/{resource_instance_id}",
                    headers={
                        "Authorization": f"Bearer {self.capability_token}"
                    }
                )
        except httpx.TimeoutException as exc:
            # Must precede RequestError: timeouts are a subclass of it.
            if is_last_attempt:
                raise RuntimeError("Failed to generate SSO token: timeout after retries") from exc
            logger.warning(f"SSO token generation timeout (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
            await asyncio.sleep(delay)
            continue
        except httpx.RequestError as exc:
            if is_last_attempt:
                raise RuntimeError(f"Failed to generate SSO token: {exc}") from exc
            logger.warning(f"SSO token generation request error (attempt {attempt + 1}/{max_retries}): {exc}, retrying in {delay}s")
            await asyncio.sleep(delay)
            continue

        if response.status_code == 200:
            return response.json()

        if response.status_code in retryable_statuses and not is_last_attempt:
            logger.warning(f"SSO token generation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
            await asyncio.sleep(delay)
            continue

        try:
            error_detail = response.json().get('detail', f'HTTP {response.status_code}')
        except (ValueError, AttributeError):
            # ValueError: body is not valid JSON. AttributeError: body is JSON
            # but not an object. Anything else is a real bug and must surface.
            error_detail = f'HTTP {response.status_code}'
        raise RuntimeError(f"Failed to generate SSO token: {error_detail}")

    # Defensive: every loop iteration above returns, raises or continues.
    raise RuntimeError("Failed to generate SSO token: maximum retries exceeded")
|
|
|
|
async def log_service_access(
    self,
    service_instance_id: str,
    service_type: str,
    user_email: str,
    access_type: str,
    session_id: str,
    ip_address: Optional[str] = None,
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
    session_duration_seconds: Optional[int] = None,
    actions_performed: Optional[List[str]] = None
) -> ServiceAccessLog:
    """Persist one access-log row for a service instance.

    Args:
        service_instance_id: Instance the event belongs to.
        service_type: Type of that service.
        user_email: User who triggered the event.
        access_type: Kind of event (the analytics code counts ``'login'``).
        session_id: Session the event belongs to.
        ip_address: Optional client address.
        user_agent: Optional client user agent.
        referer: Optional HTTP referer.
        session_duration_seconds: Optional session length.
        actions_performed: Optional list of actions; stored as ``[]`` when
            omitted or empty.

    Returns:
        The committed and refreshed ``ServiceAccessLog`` row.
    """
    record_fields = {
        'service_instance_id': service_instance_id,
        'service_type': service_type,
        'user_email': user_email,
        'session_id': session_id,
        'access_type': access_type,
        'ip_address': ip_address,
        'user_agent': user_agent,
        'referer': referer,
        'session_duration_seconds': session_duration_seconds,
        'actions_performed': actions_performed if actions_performed else [],
    }
    access_log = ServiceAccessLog(**record_fields)

    self.db.add(access_log)
    await self.db.commit()
    await self.db.refresh(access_log)

    return access_log
|
|
|
|
async def get_service_analytics(
    self,
    instance_id: str,
    user_email: str,
    days: int = 30
) -> Dict[str, Any]:
    """Summarise access-log activity for an instance over a recent window.

    Args:
        instance_id: Local id of the instance.
        user_email: Requesting user; access is checked via
            ``get_service_instance``.
        days: Length of the look-back window, counted back from now (UTC).

    Returns:
        A dict with totals (distinct sessions, hours, distinct users, average
        session minutes), a per-day breakdown keyed by ISO date holding login
        counts and distinct users, plus health, estimated uptime and
        timestamps taken from the instance.

    Raises:
        ValueError: If the instance does not exist or the user lacks access.
    """
    instance = await self.get_service_instance(instance_id, user_email)
    if not instance:
        raise ValueError(f"Service instance {instance_id} not found or access denied")

    window_start = datetime.utcnow() - timedelta(days=days)
    in_window = and_(
        ServiceAccessLog.service_instance_id == instance_id,
        ServiceAccessLog.timestamp >= window_start
    )
    stmt = (
        select(ServiceAccessLog)
        .where(in_window)
        .order_by(ServiceAccessLog.timestamp.desc())
    )
    access_logs = (await self.db.execute(stmt)).scalars().all()

    # Totals across the whole window.
    total_sessions = len({entry.session_id for entry in access_logs})
    unique_users = len({entry.user_email for entry in access_logs})
    total_time_seconds = sum(
        entry.session_duration_seconds
        for entry in access_logs
        if entry.session_duration_seconds
    )

    # Per-day breakdown; days appear in the order first seen in the logs.
    logins_per_day = {}
    users_per_day = {}
    for entry in access_logs:
        day = entry.timestamp.date().isoformat()
        logins_per_day.setdefault(day, 0)
        users_per_day.setdefault(day, set()).add(entry.user_email)
        if entry.access_type == 'login':
            logins_per_day[day] += 1

    daily_usage = {
        day: {'sessions': login_count, 'unique_users': len(users_per_day[day])}
        for day, login_count in logins_per_day.items()
    }

    return {
        'instance_id': instance_id,
        'service_type': instance.service_type,
        'service_name': instance.service_name,
        'analytics_period_days': days,
        'total_sessions': total_sessions,
        'total_time_hours': round(total_time_seconds / 3600, 1),
        'unique_users': unique_users,
        # max(..., 1) avoids dividing by zero when there are no sessions.
        'average_session_duration_minutes': round(
            total_time_seconds / max(total_sessions, 1) / 60, 1
        ),
        'daily_usage': daily_usage,
        'health_status': instance.health_status,
        'uptime_percentage': self._calculate_uptime_percentage(access_logs, days),
        'last_accessed': instance.last_accessed.isoformat() if instance.last_accessed else None,
        'created_at': instance.created_at.isoformat()
    }
|
|
|
|
def _calculate_uptime_percentage(self, access_logs: List[ServiceAccessLog], days: int) -> float:
    """Estimate uptime from access-log activity (a heuristic, not a measurement).

    Args:
        access_logs: Log rows for the instance; each needs ``access_type``
            and ``timestamp`` (compared against naive UTC "now").
        days: Currently unused; kept so existing callers keep working.

    Returns:
        ``0.0`` when there are no logs, ``95.0`` when at least one
        ``'login'`` happened within the last day, otherwise ``85.0``.
    """
    if not access_logs:
        return 0.0

    # Computed once instead of once per log entry.
    recent_cutoff = datetime.utcnow() - timedelta(days=1)
    accessed_recently = any(
        log.access_type == 'login' and log.timestamp > recent_cutoff
        for log in access_logs
    )

    # Logs exist here, so the only remaining case is "some historical usage";
    # the former 50.0 fallback could never be reached and has been removed.
    return 95.0 if accessed_recently else 85.0
|
|
|
|
async def create_service_template(
    self,
    template_name: str,
    service_type: str,
    description: str,
    default_config: Dict[str, Any],
    created_by: str,
    **kwargs
) -> ServiceTemplate:
    """Persist a new service template.

    Args:
        template_name: Name of the template.
        service_type: Service type the template applies to.
        description: Human-readable description.
        default_config: Default configuration stored on the template.
        created_by: Creator of the template.
        **kwargs: Extra keyword arguments passed straight through to the
            ``ServiceTemplate`` constructor.

    Returns:
        The committed and refreshed ``ServiceTemplate`` row.
    """
    required_fields = {
        'template_name': template_name,
        'service_type': service_type,
        'description': description,
        'default_config': default_config,
        'created_by': created_by,
    }
    new_template = ServiceTemplate(**required_fields, **kwargs)

    self.db.add(new_template)
    await self.db.commit()
    await self.db.refresh(new_template)

    return new_template
|
|
|
|
async def get_service_template(self, template_id: str) -> Optional[ServiceTemplate]:
    """Look up a service template by primary key.

    Args:
        template_id: Id of the template to load.

    Returns:
        The matching template, or ``None`` when there is none.
    """
    stmt = select(ServiceTemplate).where(ServiceTemplate.id == template_id)
    rows = await self.db.execute(stmt)
    return rows.scalar_one_or_none()
|
|
|
|
async def list_service_templates(
    self,
    service_type: Optional[str] = None,
    category: Optional[str] = None,
    public_only: bool = True
) -> List[ServiceTemplate]:
    """List active service templates, most used first.

    Args:
        service_type: When truthy, keep only templates for this service type.
        category: When truthy, keep only templates in this category.
        public_only: When truthy, keep only templates flagged public.

    Returns:
        Matching templates ordered by ``usage_count`` descending.
    """
    # `== True` is intentional: SQLAlchemy turns it into a SQL comparison,
    # which a plain Python truth test on the column would not do.
    conditions = [ServiceTemplate.is_active == True]  # noqa: E712
    if public_only:
        conditions.append(ServiceTemplate.is_public == True)  # noqa: E712
    if service_type:
        conditions.append(ServiceTemplate.service_type == service_type)
    if category:
        conditions.append(ServiceTemplate.category == category)

    stmt = (
        select(ServiceTemplate)
        .where(*conditions)
        .order_by(ServiceTemplate.usage_count.desc())
    )
    rows = await self.db.execute(stmt)
    return rows.scalars().all()
|
|
|
|
async def share_service_instance(
    self,
    instance_id: str,
    owner_email: str,
    share_with_emails: List[str],
    access_level: str = 'read'
) -> bool:
    """Grant additional users access to an instance owned by ``owner_email``.

    The new addresses are appended to ``allowed_users`` without duplicates
    and without disturbing the existing order. The stored ``access_level``
    becomes ``'team'`` when more than one user is allowed, else ``'private'``.

    Args:
        instance_id: Local id of the instance to share.
        owner_email: Requesting user; must be the instance's creator.
        share_with_emails: Addresses to add to the allowed users.
        access_level: Currently unused by this method; kept so existing
            callers keep working.

    Returns:
        ``True`` once the change has been committed.

    Raises:
        ValueError: If the instance is missing or inaccessible, or the
            requester is not its creator.
    """
    instance = await self.get_service_instance(instance_id, owner_email)
    if not instance:
        raise ValueError(f"Service instance {instance_id} not found or access denied")

    if instance.created_by != owner_email:
        raise ValueError("Only the instance creator can share access")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # stored list is deterministic (a set would reorder it arbitrarily on
    # every share). A missing list is treated as empty.
    existing_users = instance.allowed_users or []
    merged_users = list(dict.fromkeys([*existing_users, *share_with_emails]))

    instance.allowed_users = merged_users
    instance.access_level = 'team' if len(merged_users) > 1 else 'private'
    instance.updated_at = datetime.utcnow()

    await self.db.commit()

    logger.info(
        f"Shared {instance.service_type} instance {instance_id} "
        f"with {len(share_with_emails)} users"
    )

    return True