Files
gt-ai-os-community/apps/tenant-backend/app/services/external_service.py
HackWeasel b9dfb86260 GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00

636 lines
25 KiB
Python

"""
GT 2.0 Tenant Backend - External Service Management
Business logic for managing external web services with Resource Cluster integration
"""
import asyncio
import httpx
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional, Tuple
from sqlalchemy import select, update, delete, and_
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.external_service import ExternalServiceInstance, ServiceAccessLog, ServiceTemplate
from app.core.config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class ExternalServiceManager:
"""Manages external service instances and Resource Cluster integration"""
def __init__(self, db: AsyncSession):
self.db = db
self.resource_cluster_base_url = settings.resource_cluster_url or "http://resource-cluster:8003"
self.capability_token = None
def set_capability_token(self, token: str):
"""Set capability token for Resource Cluster API calls"""
self.capability_token = token
async def create_service_instance(
self,
service_type: str,
service_name: str,
user_email: str,
config_overrides: Optional[Dict[str, Any]] = None,
template_id: Optional[str] = None
) -> ExternalServiceInstance:
"""Create a new external service instance"""
# Validate service type
supported_services = ['ctfd', 'canvas', 'guacamole']
if service_type not in supported_services:
raise ValueError(f"Unsupported service type: {service_type}")
# Load template if provided
template = None
if template_id:
template = await self.get_service_template(template_id)
if not template:
raise ValueError(f"Template {template_id} not found")
# Prepare configuration
service_config = {}
if template:
service_config.update(template.default_config)
if config_overrides:
service_config.update(config_overrides)
# Call Resource Cluster to create instance
resource_instance = await self._create_resource_cluster_instance(
service_type=service_type,
config_overrides=service_config
)
# Create database record
instance = ExternalServiceInstance(
service_type=service_type,
service_name=service_name,
description=f"{service_type.title()} instance for {user_email}",
resource_instance_id=resource_instance['instance_id'],
endpoint_url=resource_instance['endpoint_url'],
status=resource_instance['status'],
service_config=service_config,
created_by=user_email,
allowed_users=[user_email],
resource_limits=template.resource_requirements if template else {},
auto_start=template.default_config.get('auto_start', True) if template else True
)
self.db.add(instance)
await self.db.commit()
await self.db.refresh(instance)
logger.info(
f"Created {service_type} service instance {instance.id} "
f"for user {user_email}"
)
return instance
async def _create_resource_cluster_instance(
self,
service_type: str,
config_overrides: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""Create instance via Resource Cluster API with zero downtime error handling"""
if not self.capability_token:
raise ValueError("Capability token not set")
max_retries = 3
base_delay = 1.0
for attempt in range(max_retries):
try:
timeout = httpx.Timeout(60.0, connect=10.0)
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.post(
f"{self.resource_cluster_base_url}/api/v1/services/instances",
json={
"service_type": service_type,
"config_overrides": config_overrides
},
headers={
"Authorization": f"Bearer {self.capability_token}",
"Content-Type": "application/json"
}
)
if response.status_code == 200:
return response.json()
elif response.status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
# Retry for server errors
delay = base_delay * (2 ** attempt)
logger.warning(f"Service creation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
try:
error_detail = response.json().get('detail', f'HTTP {response.status_code}')
except:
error_detail = f'HTTP {response.status_code}'
raise RuntimeError(f"Failed to create service instance: {error_detail}")
except httpx.TimeoutException:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
logger.warning(f"Service creation timeout (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
raise RuntimeError("Failed to create service instance: timeout after retries")
except httpx.RequestError as e:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
logger.warning(f"Service creation request error (attempt {attempt + 1}/{max_retries}): {e}, retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
raise RuntimeError(f"Failed to create service instance: {e}")
raise RuntimeError("Failed to create service instance: maximum retries exceeded")
async def get_service_instance(
self,
instance_id: str,
user_email: str
) -> Optional[ExternalServiceInstance]:
"""Get service instance with access control"""
query = select(ExternalServiceInstance).where(
and_(
ExternalServiceInstance.id == instance_id,
ExternalServiceInstance.allowed_users.op('json_extract_path_text')('*').op('@>')([user_email])
)
)
result = await self.db.execute(query)
return result.scalar_one_or_none()
async def list_user_services(
self,
user_email: str,
service_type: Optional[str] = None,
status: Optional[str] = None
) -> List[ExternalServiceInstance]:
"""List all services accessible to a user"""
query = select(ExternalServiceInstance).where(
ExternalServiceInstance.allowed_users.op('json_extract_path_text')('*').op('@>')([user_email])
)
if service_type:
query = query.where(ExternalServiceInstance.service_type == service_type)
if status:
query = query.where(ExternalServiceInstance.status == status)
query = query.order_by(ExternalServiceInstance.created_at.desc())
result = await self.db.execute(query)
return result.scalars().all()
async def stop_service_instance(
self,
instance_id: str,
user_email: str
) -> bool:
"""Stop a service instance"""
# Check access
instance = await self.get_service_instance(instance_id, user_email)
if not instance:
raise ValueError(f"Service instance {instance_id} not found or access denied")
# Call Resource Cluster to stop instance
success = await self._stop_resource_cluster_instance(instance.resource_instance_id)
if success:
# Update database status
instance.status = 'stopped'
instance.updated_at = datetime.utcnow()
await self.db.commit()
logger.info(
f"Stopped {instance.service_type} instance {instance_id} "
f"by user {user_email}"
)
return success
async def _stop_resource_cluster_instance(self, resource_instance_id: str) -> bool:
"""Stop instance via Resource Cluster API with zero downtime error handling"""
if not self.capability_token:
raise ValueError("Capability token not set")
max_retries = 3
base_delay = 1.0
for attempt in range(max_retries):
try:
timeout = httpx.Timeout(30.0, connect=10.0)
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.delete(
f"{self.resource_cluster_base_url}/api/v1/services/instances/{resource_instance_id}",
headers={
"Authorization": f"Bearer {self.capability_token}"
}
)
if response.status_code == 200:
return True
elif response.status_code == 404:
# Instance already gone, consider it successfully stopped
logger.info(f"Instance {resource_instance_id} not found, assuming already stopped")
return True
elif response.status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
# Retry for server errors
delay = base_delay * (2 ** attempt)
logger.warning(f"Instance stop failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
logger.error(f"Failed to stop instance {resource_instance_id}: HTTP {response.status_code}")
return False
except httpx.TimeoutException:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
logger.warning(f"Instance stop timeout (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
logger.error(f"Failed to stop instance {resource_instance_id}: timeout after retries")
return False
except httpx.RequestError as e:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
logger.warning(f"Instance stop request error (attempt {attempt + 1}/{max_retries}): {e}, retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
logger.error(f"Failed to stop instance {resource_instance_id}: {e}")
return False
logger.error(f"Failed to stop instance {resource_instance_id}: maximum retries exceeded")
return False
async def get_service_health(
self,
instance_id: str,
user_email: str
) -> Dict[str, Any]:
"""Get service health status"""
# Check access
instance = await self.get_service_instance(instance_id, user_email)
if not instance:
raise ValueError(f"Service instance {instance_id} not found or access denied")
# Get health from Resource Cluster
health = await self._get_resource_cluster_health(instance.resource_instance_id)
# Update instance health status
instance.health_status = health.get('status', 'unknown')
instance.last_health_check = datetime.utcnow()
if health.get('restart_count', 0) != instance.restart_count:
instance.restart_count = health.get('restart_count', 0)
await self.db.commit()
return health
async def _get_resource_cluster_health(self, resource_instance_id: str) -> Dict[str, Any]:
"""Get health status via Resource Cluster API with zero downtime error handling"""
if not self.capability_token:
raise ValueError("Capability token not set")
try:
timeout = httpx.Timeout(10.0, connect=5.0)
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.get(
f"{self.resource_cluster_base_url}/api/v1/services/health/{resource_instance_id}",
headers={
"Authorization": f"Bearer {self.capability_token}"
}
)
if response.status_code == 200:
return response.json()
elif response.status_code == 404:
return {
'status': 'not_found',
'error': 'Instance not found'
}
else:
return {
'status': 'error',
'error': f'Health check failed: HTTP {response.status_code}'
}
except httpx.TimeoutException:
logger.warning(f"Health check timeout for instance {resource_instance_id}")
return {
'status': 'timeout',
'error': 'Health check timeout'
}
except httpx.RequestError as e:
logger.warning(f"Health check request error for instance {resource_instance_id}: {e}")
return {
'status': 'connection_error',
'error': f'Connection error: {e}'
}
async def generate_sso_token(
self,
instance_id: str,
user_email: str
) -> Dict[str, Any]:
"""Generate SSO token for iframe embedding"""
# Check access
instance = await self.get_service_instance(instance_id, user_email)
if not instance:
raise ValueError(f"Service instance {instance_id} not found or access denied")
# Generate SSO token via Resource Cluster
sso_data = await self._generate_resource_cluster_sso_token(instance.resource_instance_id)
# Update last accessed time
instance.last_accessed = datetime.utcnow()
await self.db.commit()
return sso_data
async def _generate_resource_cluster_sso_token(self, resource_instance_id: str) -> Dict[str, Any]:
"""Generate SSO token via Resource Cluster API with zero downtime error handling"""
if not self.capability_token:
raise ValueError("Capability token not set")
max_retries = 3
base_delay = 1.0
for attempt in range(max_retries):
try:
timeout = httpx.Timeout(10.0, connect=5.0)
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.post(
f"{self.resource_cluster_base_url}/api/v1/services/sso-token/{resource_instance_id}",
headers={
"Authorization": f"Bearer {self.capability_token}"
}
)
if response.status_code == 200:
return response.json()
elif response.status_code in [500, 502, 503, 504] and attempt < max_retries - 1:
# Retry for server errors
delay = base_delay * (2 ** attempt)
logger.warning(f"SSO token generation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
try:
error_detail = response.json().get('detail', f'HTTP {response.status_code}')
except:
error_detail = f'HTTP {response.status_code}'
raise RuntimeError(f"Failed to generate SSO token: {error_detail}")
except httpx.TimeoutException:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
logger.warning(f"SSO token generation timeout (attempt {attempt + 1}/{max_retries}), retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
raise RuntimeError("Failed to generate SSO token: timeout after retries")
except httpx.RequestError as e:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
logger.warning(f"SSO token generation request error (attempt {attempt + 1}/{max_retries}): {e}, retrying in {delay}s")
await asyncio.sleep(delay)
continue
else:
raise RuntimeError(f"Failed to generate SSO token: {e}")
raise RuntimeError("Failed to generate SSO token: maximum retries exceeded")
async def log_service_access(
self,
service_instance_id: str,
service_type: str,
user_email: str,
access_type: str,
session_id: str,
ip_address: Optional[str] = None,
user_agent: Optional[str] = None,
referer: Optional[str] = None,
session_duration_seconds: Optional[int] = None,
actions_performed: Optional[List[str]] = None
) -> ServiceAccessLog:
"""Log service access event"""
access_log = ServiceAccessLog(
service_instance_id=service_instance_id,
service_type=service_type,
user_email=user_email,
session_id=session_id,
access_type=access_type,
ip_address=ip_address,
user_agent=user_agent,
referer=referer,
session_duration_seconds=session_duration_seconds,
actions_performed=actions_performed or []
)
self.db.add(access_log)
await self.db.commit()
await self.db.refresh(access_log)
return access_log
async def get_service_analytics(
self,
instance_id: str,
user_email: str,
days: int = 30
) -> Dict[str, Any]:
"""Get service usage analytics"""
# Check access
instance = await self.get_service_instance(instance_id, user_email)
if not instance:
raise ValueError(f"Service instance {instance_id} not found or access denied")
# Query access logs
since_date = datetime.utcnow() - timedelta(days=days)
query = select(ServiceAccessLog).where(
and_(
ServiceAccessLog.service_instance_id == instance_id,
ServiceAccessLog.timestamp >= since_date
)
).order_by(ServiceAccessLog.timestamp.desc())
result = await self.db.execute(query)
access_logs = result.scalars().all()
# Compute analytics
total_sessions = len(set(log.session_id for log in access_logs))
total_time_seconds = sum(
log.session_duration_seconds or 0
for log in access_logs
if log.session_duration_seconds
)
unique_users = len(set(log.user_email for log in access_logs))
# Group by day for trend analysis
daily_usage = {}
for log in access_logs:
day = log.timestamp.date().isoformat()
if day not in daily_usage:
daily_usage[day] = {'sessions': 0, 'users': set()}
if log.access_type == 'login':
daily_usage[day]['sessions'] += 1
daily_usage[day]['users'].add(log.user_email)
# Convert sets to counts
for day_data in daily_usage.values():
day_data['unique_users'] = len(day_data['users'])
del day_data['users']
return {
'instance_id': instance_id,
'service_type': instance.service_type,
'service_name': instance.service_name,
'analytics_period_days': days,
'total_sessions': total_sessions,
'total_time_hours': round(total_time_seconds / 3600, 1),
'unique_users': unique_users,
'average_session_duration_minutes': round(
total_time_seconds / max(total_sessions, 1) / 60, 1
),
'daily_usage': daily_usage,
'health_status': instance.health_status,
'uptime_percentage': self._calculate_uptime_percentage(access_logs, days),
'last_accessed': instance.last_accessed.isoformat() if instance.last_accessed else None,
'created_at': instance.created_at.isoformat()
}
def _calculate_uptime_percentage(self, access_logs: List[ServiceAccessLog], days: int) -> float:
"""Calculate approximate uptime percentage based on access patterns"""
if not access_logs:
return 0.0
# Simple heuristic: if we have recent login events, assume service is up
recent_logins = [
log for log in access_logs
if log.access_type == 'login' and
log.timestamp > datetime.utcnow() - timedelta(days=1)
]
if recent_logins:
return 95.0 # Assume good uptime if recently accessed
elif len(access_logs) > 0:
return 85.0 # Some historical usage
else:
return 50.0 # No usage data
async def create_service_template(
self,
template_name: str,
service_type: str,
description: str,
default_config: Dict[str, Any],
created_by: str,
**kwargs
) -> ServiceTemplate:
"""Create a new service template"""
template = ServiceTemplate(
template_name=template_name,
service_type=service_type,
description=description,
default_config=default_config,
created_by=created_by,
**kwargs
)
self.db.add(template)
await self.db.commit()
await self.db.refresh(template)
return template
async def get_service_template(self, template_id: str) -> Optional[ServiceTemplate]:
"""Get service template by ID"""
query = select(ServiceTemplate).where(ServiceTemplate.id == template_id)
result = await self.db.execute(query)
return result.scalar_one_or_none()
async def list_service_templates(
self,
service_type: Optional[str] = None,
category: Optional[str] = None,
public_only: bool = True
) -> List[ServiceTemplate]:
"""List available service templates"""
query = select(ServiceTemplate).where(ServiceTemplate.is_active == True)
if public_only:
query = query.where(ServiceTemplate.is_public == True)
if service_type:
query = query.where(ServiceTemplate.service_type == service_type)
if category:
query = query.where(ServiceTemplate.category == category)
query = query.order_by(ServiceTemplate.usage_count.desc())
result = await self.db.execute(query)
return result.scalars().all()
async def share_service_instance(
self,
instance_id: str,
owner_email: str,
share_with_emails: List[str],
access_level: str = 'read'
) -> bool:
"""Share service instance with other users"""
# Check owner access
instance = await self.get_service_instance(instance_id, owner_email)
if not instance:
raise ValueError(f"Service instance {instance_id} not found or access denied")
if instance.created_by != owner_email:
raise ValueError("Only the instance creator can share access")
# Update allowed users
current_users = set(instance.allowed_users)
new_users = current_users.union(set(share_with_emails))
instance.allowed_users = list(new_users)
instance.access_level = 'team' if len(new_users) > 1 else 'private'
instance.updated_at = datetime.utcnow()
await self.db.commit()
logger.info(
f"Shared {instance.service_type} instance {instance_id} "
f"with {len(share_with_emails)} users"
)
return True