gt-ai-os-community/apps/resource-cluster/app/services/agent_orchestrator.py
HackWeasel b9dfb86260 GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00

"""
Agent Orchestration System for GT 2.0 Resource Cluster
Provides multi-agent workflow execution with:
- Sequential, parallel, and conditional agent workflows
- Inter-agent communication and memory management
- Capability-based access control
- Agent lifecycle management
- Performance monitoring and metrics
GT 2.0 Architecture Principles:
- Perfect Tenant Isolation: Agent sessions isolated per tenant
- Zero Downtime: Stateless design, resumable workflows
- Self-Contained Security: Capability-based agent permissions
- No Complexity Addition: Simple orchestration patterns
"""
import asyncio
import logging
import time
import uuid
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from enum import Enum
from dataclasses import dataclass, asdict
from app.core.capability_auth import verify_capability_token, CapabilityError
from app.core.config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class AgentStatus(str, Enum):
"""Agent execution status"""
IDLE = "idle"
RUNNING = "running"
WAITING = "waiting"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class WorkflowType(str, Enum):
"""Types of agent workflows"""
SEQUENTIAL = "sequential"
PARALLEL = "parallel"
CONDITIONAL = "conditional"
PIPELINE = "pipeline"
MAP_REDUCE = "map_reduce"
class MessageType(str, Enum):
"""Inter-agent message types"""
DATA = "data"
CONTROL = "control"
ERROR = "error"
HEARTBEAT = "heartbeat"
@dataclass
class AgentDefinition:
"""Definition of an agent"""
agent_id: str
agent_type: str
name: str
description: str
capabilities_required: List[str]
memory_limit_mb: int = 256
timeout_seconds: int = 300
retry_count: int = 3
    environment: Optional[Dict[str, Any]] = None
@dataclass
class AgentMessage:
"""Message between agents"""
message_id: str
from_agent: str
to_agent: str
message_type: MessageType
content: Dict[str, Any]
timestamp: str
expires_at: Optional[str] = None
@dataclass
class AgentState:
"""Current state of an agent"""
agent_id: str
status: AgentStatus
current_task: Optional[str]
memory_usage_mb: int
cpu_usage_percent: float
started_at: str
last_activity: str
error_message: Optional[str] = None
    output_data: Optional[Dict[str, Any]] = None
@dataclass
class WorkflowExecution:
"""Workflow execution instance"""
workflow_id: str
workflow_type: WorkflowType
tenant_id: str
created_by: str
agents: List[AgentDefinition]
workflow_config: Dict[str, Any]
status: AgentStatus
started_at: str
completed_at: Optional[str] = None
    results: Optional[Dict[str, Any]] = None
error_message: Optional[str] = None
class AgentMemoryManager:
"""Manages agent memory and state"""
def __init__(self):
# In-memory storage (PostgreSQL used for persistent storage)
self._agent_memory: Dict[str, Dict[str, Any]] = {}
self._shared_memory: Dict[str, Dict[str, Any]] = {}
self._message_queues: Dict[str, List[AgentMessage]] = {}
async def store_agent_memory(
self,
agent_id: str,
key: str,
value: Any,
ttl_seconds: Optional[int] = None
) -> None:
"""Store data in agent-specific memory"""
if agent_id not in self._agent_memory:
self._agent_memory[agent_id] = {}
self._agent_memory[agent_id][key] = {
"value": value,
"created_at": datetime.utcnow().isoformat(),
"expires_at": (
datetime.utcnow() + timedelta(seconds=ttl_seconds)
).isoformat() if ttl_seconds else None
}
logger.debug(f"Stored memory for agent {agent_id}: {key}")
async def get_agent_memory(
self,
agent_id: str,
key: str
) -> Optional[Any]:
"""Retrieve data from agent-specific memory"""
if agent_id not in self._agent_memory:
return None
memory_item = self._agent_memory[agent_id].get(key)
if not memory_item:
return None
# Check expiration
if memory_item.get("expires_at"):
expires_at = datetime.fromisoformat(memory_item["expires_at"])
if datetime.utcnow() > expires_at:
del self._agent_memory[agent_id][key]
return None
return memory_item["value"]
async def store_shared_memory(
self,
tenant_id: str,
key: str,
value: Any,
ttl_seconds: Optional[int] = None
) -> None:
"""Store data in tenant-shared memory"""
if tenant_id not in self._shared_memory:
self._shared_memory[tenant_id] = {}
self._shared_memory[tenant_id][key] = {
"value": value,
"created_at": datetime.utcnow().isoformat(),
"expires_at": (
datetime.utcnow() + timedelta(seconds=ttl_seconds)
).isoformat() if ttl_seconds else None
}
logger.debug(f"Stored shared memory for tenant {tenant_id}: {key}")
async def get_shared_memory(
self,
tenant_id: str,
key: str
) -> Optional[Any]:
"""Retrieve data from tenant-shared memory"""
if tenant_id not in self._shared_memory:
return None
memory_item = self._shared_memory[tenant_id].get(key)
if not memory_item:
return None
# Check expiration
if memory_item.get("expires_at"):
expires_at = datetime.fromisoformat(memory_item["expires_at"])
if datetime.utcnow() > expires_at:
del self._shared_memory[tenant_id][key]
return None
return memory_item["value"]
async def send_message(
self,
message: AgentMessage
) -> None:
"""Send message to agent queue"""
if message.to_agent not in self._message_queues:
self._message_queues[message.to_agent] = []
self._message_queues[message.to_agent].append(message)
logger.debug(f"Message sent from {message.from_agent} to {message.to_agent}")
    async def receive_messages(
        self,
        agent_id: str,
        message_type: Optional[MessageType] = None
    ) -> List[AgentMessage]:
        """Receive and consume messages for an agent, discarding expired ones"""
        if agent_id not in self._message_queues:
            return []
        now = datetime.utcnow()
        # Expired messages are never delivered; purge them from the queue
        live = [
            msg for msg in self._message_queues[agent_id]
            if not msg.expires_at or datetime.fromisoformat(msg.expires_at) > now
        ]
        if message_type:
            delivered = [msg for msg in live if msg.message_type == message_type]
            # Leave messages of other types queued for later reads
            self._message_queues[agent_id] = [
                msg for msg in live if msg.message_type != message_type
            ]
        else:
            delivered = live
            self._message_queues[agent_id] = []
        return delivered
async def cleanup_agent_memory(self, agent_id: str) -> None:
"""Clean up memory for completed agent"""
if agent_id in self._agent_memory:
del self._agent_memory[agent_id]
if agent_id in self._message_queues:
del self._message_queues[agent_id]
logger.debug(f"Cleaned up memory for agent {agent_id}")
class AgentOrchestrator:
"""
Main agent orchestration system for GT 2.0.
Manages agent lifecycle, workflows, communication, and resource allocation.
All operations are tenant-isolated and capability-protected.
"""
def __init__(self):
self.memory_manager = AgentMemoryManager()
self.active_workflows: Dict[str, WorkflowExecution] = {}
self.agent_states: Dict[str, AgentState] = {}
# Built-in agent types
self.agent_registry: Dict[str, Dict[str, Any]] = {
"data_processor": {
"description": "Processes and transforms data",
"capabilities": ["data.read", "data.transform"],
"memory_limit_mb": 512,
"timeout_seconds": 300
},
"llm_agent": {
"description": "Interacts with LLM services",
"capabilities": ["llm.inference", "llm.chat"],
"memory_limit_mb": 256,
"timeout_seconds": 600
},
"embedding_agent": {
"description": "Generates text embeddings",
"capabilities": ["embeddings.generate"],
"memory_limit_mb": 256,
"timeout_seconds": 180
},
"rag_agent": {
"description": "Performs retrieval-augmented generation",
"capabilities": ["rag.search", "rag.generate"],
"memory_limit_mb": 512,
"timeout_seconds": 450
},
"integration_agent": {
"description": "Connects to external services",
"capabilities": ["integration.call", "integration.webhook"],
"memory_limit_mb": 256,
"timeout_seconds": 300
}
}
logger.info("Agent orchestrator initialized")
async def create_workflow(
self,
workflow_type: WorkflowType,
agents: List[AgentDefinition],
workflow_config: Dict[str, Any],
capability_token: str,
workflow_name: Optional[str] = None
) -> str:
"""
Create a new agent workflow.
Args:
workflow_type: Type of workflow to create
agents: List of agents to include in workflow
workflow_config: Configuration for the workflow
capability_token: JWT token with workflow permissions
            workflow_name: Optional name for the workflow (currently unused)
Returns:
Workflow ID
"""
# Verify capability token
capability = await verify_capability_token(capability_token)
tenant_id = capability.get("tenant_id")
user_id = capability.get("sub")
# Check workflow permissions
await self._verify_workflow_permissions(capability, workflow_type, agents)
# Generate workflow ID
workflow_id = str(uuid.uuid4())
# Create workflow execution
workflow = WorkflowExecution(
workflow_id=workflow_id,
workflow_type=workflow_type,
tenant_id=tenant_id,
created_by=user_id,
agents=agents,
workflow_config=workflow_config,
status=AgentStatus.IDLE,
started_at=datetime.utcnow().isoformat()
)
# Store workflow
self.active_workflows[workflow_id] = workflow
logger.info(
f"Created {workflow_type} workflow {workflow_id} "
f"with {len(agents)} agents for tenant {tenant_id}"
)
return workflow_id
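    # Example call (illustrative; `token` must be a capability token issued by
    # the platform's auth service, and the agent ID is hypothetical):
    #
    #     workflow_id = await orchestrator.create_workflow(
    #         workflow_type=WorkflowType.SEQUENTIAL,
    #         agents=[AgentDefinition(
    #             agent_id="summarize-1",
    #             agent_type="llm_agent",
    #             name="Summarizer",
    #             description="Summarizes input text",
    #             capabilities_required=["llm.inference"],
    #         )],
    #         workflow_config={},
    #         capability_token=token,
    #     )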
async def execute_workflow(
self,
workflow_id: str,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""
Execute an agent workflow.
Args:
workflow_id: ID of workflow to execute
input_data: Input data for the workflow
capability_token: JWT token with execution permissions
Returns:
Workflow execution results
"""
# Verify capability token
capability = await verify_capability_token(capability_token)
tenant_id = capability.get("tenant_id")
# Get workflow
workflow = self.active_workflows.get(workflow_id)
if not workflow:
raise ValueError(f"Workflow {workflow_id} not found")
# Check tenant isolation
if workflow.tenant_id != tenant_id:
raise CapabilityError("Insufficient permissions for workflow")
# Check workflow permissions
await self._verify_execution_permissions(capability, workflow)
try:
# Update workflow status
workflow.status = AgentStatus.RUNNING
# Execute based on workflow type
if workflow.workflow_type == WorkflowType.SEQUENTIAL:
results = await self._execute_sequential_workflow(
workflow, input_data, capability_token
)
elif workflow.workflow_type == WorkflowType.PARALLEL:
results = await self._execute_parallel_workflow(
workflow, input_data, capability_token
)
elif workflow.workflow_type == WorkflowType.CONDITIONAL:
results = await self._execute_conditional_workflow(
workflow, input_data, capability_token
)
elif workflow.workflow_type == WorkflowType.PIPELINE:
results = await self._execute_pipeline_workflow(
workflow, input_data, capability_token
)
elif workflow.workflow_type == WorkflowType.MAP_REDUCE:
results = await self._execute_map_reduce_workflow(
workflow, input_data, capability_token
)
else:
raise ValueError(f"Unsupported workflow type: {workflow.workflow_type}")
# Update workflow completion
workflow.status = AgentStatus.COMPLETED
workflow.completed_at = datetime.utcnow().isoformat()
workflow.results = results
logger.info(f"Completed workflow {workflow_id} successfully")
return results
except Exception as e:
# Update workflow error status
workflow.status = AgentStatus.FAILED
workflow.completed_at = datetime.utcnow().isoformat()
workflow.error_message = str(e)
logger.error(f"Workflow {workflow_id} failed: {e}")
raise
async def get_workflow_status(
self,
workflow_id: str,
capability_token: str
) -> Dict[str, Any]:
"""Get status of a workflow"""
# Verify capability token
capability = await verify_capability_token(capability_token)
tenant_id = capability.get("tenant_id")
# Get workflow
workflow = self.active_workflows.get(workflow_id)
if not workflow:
raise ValueError(f"Workflow {workflow_id} not found")
# Check tenant isolation
if workflow.tenant_id != tenant_id:
raise CapabilityError("Insufficient permissions for workflow")
# Get agent states for this workflow
        agent_states = {
            agent.agent_id: asdict(self.agent_states[agent.agent_id])
            for agent in workflow.agents
            if agent.agent_id in self.agent_states
        }
return {
"workflow": asdict(workflow),
"agent_states": agent_states
}
async def cancel_workflow(
self,
workflow_id: str,
capability_token: str
) -> None:
"""Cancel a running workflow"""
# Verify capability token
capability = await verify_capability_token(capability_token)
tenant_id = capability.get("tenant_id")
# Get workflow
workflow = self.active_workflows.get(workflow_id)
if not workflow:
raise ValueError(f"Workflow {workflow_id} not found")
# Check tenant isolation
if workflow.tenant_id != tenant_id:
raise CapabilityError("Insufficient permissions for workflow")
# Cancel workflow
workflow.status = AgentStatus.CANCELLED
workflow.completed_at = datetime.utcnow().isoformat()
# Cancel all agents in workflow
for agent in workflow.agents:
if agent.agent_id in self.agent_states:
self.agent_states[agent.agent_id].status = AgentStatus.CANCELLED
logger.info(f"Cancelled workflow {workflow_id}")
async def _execute_sequential_workflow(
self,
workflow: WorkflowExecution,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute agents sequentially"""
results = {}
current_data = input_data
for agent in workflow.agents:
agent_result = await self._execute_agent(
agent, current_data, capability_token
)
results[agent.agent_id] = agent_result
# Pass output to next agent
if "output" in agent_result:
current_data = agent_result["output"]
return {
"workflow_type": "sequential",
"final_output": current_data,
"agent_results": results
}
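    # Chaining convention (illustrative): _execute_agent wraps each agent's
    # result as {"status": ..., "processing_time": ..., "output": ...}, so the
    # "output" of agent N becomes the input_data of agent N+1.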
async def _execute_parallel_workflow(
self,
workflow: WorkflowExecution,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute agents in parallel"""
# Create tasks for all agents
tasks = []
for agent in workflow.agents:
task = asyncio.create_task(
self._execute_agent(agent, input_data, capability_token)
)
tasks.append((agent.agent_id, task))
# Wait for all tasks to complete
results = {}
for agent_id, task in tasks:
try:
results[agent_id] = await task
except Exception as e:
results[agent_id] = {"error": str(e)}
return {
"workflow_type": "parallel",
"agent_results": results
}
async def _execute_conditional_workflow(
self,
workflow: WorkflowExecution,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute agents based on conditions"""
results = {}
condition_config = workflow.workflow_config.get("conditions", {})
for agent in workflow.agents:
# Check if agent should execute based on conditions
should_execute = await self._evaluate_condition(
agent.agent_id, condition_config, input_data, results
)
if should_execute:
agent_result = await self._execute_agent(
agent, input_data, capability_token
)
results[agent.agent_id] = agent_result
else:
results[agent.agent_id] = {"status": "skipped"}
return {
"workflow_type": "conditional",
"agent_results": results
}
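    # Example "conditions" block in workflow_config (illustrative schema that
    # matches _evaluate_condition below; the agent IDs are hypothetical):
    #
    #     {
    #         "conditions": {
    #             "validator-1": {"type": "input_contains", "key": "lang", "value": "en"},
    #             "fallback-1": {"type": "previous_failure", "previous_agent": "validator-1"}
    #         }
    #     }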
async def _execute_pipeline_workflow(
self,
workflow: WorkflowExecution,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute agents in pipeline with data transformation"""
results = {}
current_data = input_data
for i, agent in enumerate(workflow.agents):
# Add pipeline metadata
pipeline_data = {
**current_data,
"_pipeline_stage": i,
"_pipeline_total": len(workflow.agents)
}
agent_result = await self._execute_agent(
agent, pipeline_data, capability_token
)
results[agent.agent_id] = agent_result
# Transform data for next stage
if "transformed_output" in agent_result:
current_data = agent_result["transformed_output"]
elif "output" in agent_result:
current_data = agent_result["output"]
return {
"workflow_type": "pipeline",
"final_output": current_data,
"agent_results": results
}
async def _execute_map_reduce_workflow(
self,
workflow: WorkflowExecution,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute map-reduce workflow"""
# Separate map and reduce agents
map_agents = [a for a in workflow.agents if a.agent_type.endswith("_mapper")]
reduce_agents = [a for a in workflow.agents if a.agent_type.endswith("_reducer")]
# Execute map phase
map_tasks = []
input_chunks = input_data.get("chunks", [input_data])
for i, chunk in enumerate(input_chunks):
for agent in map_agents:
task = asyncio.create_task(
self._execute_agent(agent, chunk, capability_token)
)
map_tasks.append((f"{agent.agent_id}_chunk_{i}", task))
# Collect map results
map_results = {}
for task_id, task in map_tasks:
try:
map_results[task_id] = await task
except Exception as e:
map_results[task_id] = {"error": str(e)}
# Execute reduce phase
reduce_results = {}
reduce_input = {"map_results": map_results}
for agent in reduce_agents:
agent_result = await self._execute_agent(
agent, reduce_input, capability_token
)
reduce_results[agent.agent_id] = agent_result
return {
"workflow_type": "map_reduce",
"map_results": map_results,
"reduce_results": reduce_results
}
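    # Input convention (illustrative): mappers and reducers are selected by the
    # "_mapper" / "_reducer" suffix on agent_type, and chunked input looks like
    #
    #     {"chunks": [{"text": "part 1"}, {"text": "part 2"}]}
    #
    # Without a "chunks" key, the whole input_data is treated as a single chunk.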
async def _execute_agent(
self,
agent: AgentDefinition,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute a single agent"""
start_time = time.time()
# Create agent state
agent_state = AgentState(
agent_id=agent.agent_id,
status=AgentStatus.RUNNING,
current_task=f"Executing {agent.agent_type}",
memory_usage_mb=0,
cpu_usage_percent=0.0,
started_at=datetime.utcnow().isoformat(),
last_activity=datetime.utcnow().isoformat()
)
self.agent_states[agent.agent_id] = agent_state
try:
# Simulate agent execution based on type
if agent.agent_type == "data_processor":
result = await self._execute_data_processor(agent, input_data)
elif agent.agent_type == "llm_agent":
result = await self._execute_llm_agent(agent, input_data, capability_token)
elif agent.agent_type == "embedding_agent":
result = await self._execute_embedding_agent(agent, input_data, capability_token)
elif agent.agent_type == "rag_agent":
result = await self._execute_rag_agent(agent, input_data, capability_token)
elif agent.agent_type == "integration_agent":
result = await self._execute_integration_agent(agent, input_data, capability_token)
else:
result = await self._execute_custom_agent(agent, input_data)
# Update agent state
agent_state.status = AgentStatus.COMPLETED
agent_state.output_data = result
agent_state.last_activity = datetime.utcnow().isoformat()
processing_time = time.time() - start_time
logger.info(
f"Agent {agent.agent_id} completed in {processing_time:.2f}s"
)
return {
"status": "completed",
"processing_time": processing_time,
"output": result
}
except Exception as e:
# Update agent error state
agent_state.status = AgentStatus.FAILED
agent_state.error_message = str(e)
agent_state.last_activity = datetime.utcnow().isoformat()
logger.error(f"Agent {agent.agent_id} failed: {e}")
return {
"status": "failed",
"error": str(e),
"processing_time": time.time() - start_time
}
    # Agent execution implementations. The methods below are placeholders that
    # simulate work with short sleeps; real integrations replace them.
async def _execute_data_processor(
self,
agent: AgentDefinition,
input_data: Dict[str, Any]
) -> Dict[str, Any]:
"""Execute data processing agent"""
await asyncio.sleep(0.1) # Simulate processing
return {
"processed_data": input_data,
"processing_info": "Data processed successfully"
}
async def _execute_llm_agent(
self,
agent: AgentDefinition,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute LLM agent"""
await asyncio.sleep(0.2) # Simulate LLM call
return {
"llm_response": f"LLM processed: {input_data.get('prompt', 'No prompt provided')}",
"model_used": "groq/llama-3-8b"
}
async def _execute_embedding_agent(
self,
agent: AgentDefinition,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute embedding agent"""
await asyncio.sleep(0.1) # Simulate embedding generation
texts = input_data.get("texts", [""])
return {
"embeddings": [[0.1] * 1024 for _ in texts], # Mock embeddings
"model_used": "BAAI/bge-m3"
}
async def _execute_rag_agent(
self,
agent: AgentDefinition,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute RAG agent"""
await asyncio.sleep(0.3) # Simulate RAG processing
return {
"rag_response": "RAG generated response",
"retrieved_docs": ["doc1", "doc2"],
"confidence_score": 0.85
}
async def _execute_integration_agent(
self,
agent: AgentDefinition,
input_data: Dict[str, Any],
capability_token: str
) -> Dict[str, Any]:
"""Execute integration agent"""
await asyncio.sleep(0.1) # Simulate external API call
return {
"integration_result": "External API called successfully",
"response_data": input_data
}
async def _execute_custom_agent(
self,
agent: AgentDefinition,
input_data: Dict[str, Any]
) -> Dict[str, Any]:
"""Execute custom agent type"""
await asyncio.sleep(0.1) # Simulate custom processing
return {
"custom_result": f"Custom agent {agent.agent_type} executed",
"input_data": input_data
}
async def _verify_workflow_permissions(
self,
capability: Dict[str, Any],
workflow_type: WorkflowType,
agents: List[AgentDefinition]
) -> None:
"""Verify workflow creation permissions"""
capabilities = capability.get("capabilities", [])
# Check for workflow creation permission
workflow_caps = [
cap for cap in capabilities
if cap.get("resource") == "workflows"
]
if not workflow_caps:
raise CapabilityError("No workflow permissions in capability token")
# Check specific workflow type permission
workflow_cap = workflow_caps[0]
actions = workflow_cap.get("actions", [])
if "create" not in actions:
raise CapabilityError("No workflow creation permission")
# Check agent-specific permissions
for agent in agents:
for required_cap in agent.capabilities_required:
if not any(
cap.get("resource") == required_cap.split(".")[0]
for cap in capabilities
):
raise CapabilityError(
f"Missing capability for agent {agent.agent_id}: {required_cap}"
)
async def _verify_execution_permissions(
self,
capability: Dict[str, Any],
workflow: WorkflowExecution
) -> None:
"""Verify workflow execution permissions"""
capabilities = capability.get("capabilities", [])
# Check for workflow execution permission
workflow_caps = [
cap for cap in capabilities
if cap.get("resource") == "workflows"
]
if not workflow_caps:
raise CapabilityError("No workflow permissions in capability token")
workflow_cap = workflow_caps[0]
actions = workflow_cap.get("actions", [])
if "execute" not in actions:
raise CapabilityError("No workflow execution permission")
async def _evaluate_condition(
self,
agent_id: str,
condition_config: Dict[str, Any],
input_data: Dict[str, Any],
results: Dict[str, Any]
) -> bool:
"""Evaluate condition for conditional workflow"""
agent_condition = condition_config.get(agent_id, {})
if not agent_condition:
return True # No condition means always execute
condition_type = agent_condition.get("type", "always")
if condition_type == "always":
return True
elif condition_type == "never":
return False
elif condition_type == "input_contains":
key = agent_condition.get("key")
value = agent_condition.get("value")
return input_data.get(key) == value
elif condition_type == "previous_success":
previous_agent = agent_condition.get("previous_agent")
return (
previous_agent in results and
results[previous_agent].get("status") == "completed"
)
elif condition_type == "previous_failure":
previous_agent = agent_condition.get("previous_agent")
return (
previous_agent in results and
results[previous_agent].get("status") == "failed"
)
return True # Default to execute if condition not recognized
# Global orchestrator instance
_agent_orchestrator: Optional[AgentOrchestrator] = None
def get_agent_orchestrator() -> AgentOrchestrator:
"""Get the global agent orchestrator instance"""
global _agent_orchestrator
if _agent_orchestrator is None:
_agent_orchestrator = AgentOrchestrator()
return _agent_orchestrator
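# Minimal end-to-end sketch (assumes `token` is a capability token issued by
# the platform's auth service, carrying "workflows" create/execute actions and
# the agents' required capabilities; not runnable without one):
#
#     async def main(token: str) -> None:
#         orchestrator = get_agent_orchestrator()
#         workflow_id = await orchestrator.create_workflow(
#             workflow_type=WorkflowType.PARALLEL,
#             agents=[AgentDefinition(
#                 agent_id="embed-1",
#                 agent_type="embedding_agent",
#                 name="Embedder",
#                 description="Generates embeddings",
#                 capabilities_required=["embeddings.generate"],
#             )],
#             workflow_config={},
#             capability_token=token,
#         )
#         results = await orchestrator.execute_workflow(
#             workflow_id, {"texts": ["hello"]}, token
#         )
#         print(results["agent_results"])
#
#     # asyncio.run(main(token))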