GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (see the first sketch after this list)
- Implement proper URL hostname validation, replacing substring matching (also covered in the first sketch)
- Add centralized path sanitization to prevent path traversal (second sketch)
- Fix ReDoS vulnerability in the email validation regex (third sketch)
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
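
The patched helpers aren't reproduced in these notes, so the sketches below only illustrate the general shape of each fix; every name, signature, and the allowlist are illustrative assumptions, not code from the release. First, hostname validation and DNS resolution checking against SSRF: parse the URL, compare the exact hostname against an allowlist (substring matching would accept a host like `api.example.com.attacker.net`), then resolve it and reject private, loopback, link-local, and reserved addresses.

```python
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # illustrative allowlist, not from the release


def resolves_to_private_address(host: str) -> bool:
    """Resolve host via DNS and flag loopback/private/link-local/reserved IPs."""
    for _family, _type, _proto, _canon, sockaddr in socket.getaddrinfo(host, None):
        # Strip any IPv6 zone id (e.g. "fe80::1%eth0") before parsing.
        ip = ipaddress.ip_address(sockaddr[0].split("%")[0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return True
    return False


def is_safe_outbound_url(url: str) -> bool:
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False
    # Exact hostname comparison instead of substring matching.
    if parsed.hostname not in ALLOWED_HOSTS:
        return False
    return not resolves_to_private_address(parsed.hostname)
```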
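
Second, centralized path sanitization: resolve the untrusted name against a fixed base directory and refuse anything that escapes it. A minimal sketch, assuming Python 3.9+ for `Path.is_relative_to`:

```python
from pathlib import Path


def sanitize_path(base_dir: str, untrusted_name: str) -> Path:
    """Resolve untrusted_name under base_dir and reject traversal attempts."""
    base = Path(base_dir).resolve()
    candidate = (base / untrusted_name).resolve()
    if not candidate.is_relative_to(base):  # Python 3.9+
        raise ValueError(f"path traversal attempt: {untrusted_name!r}")
    return candidate
```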
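
Third, the ReDoS repair: the vulnerable pattern isn't shown in the notes, but the standard fix is a regex with no nested or overlapping quantifiers and explicit length caps, so backtracking stays near-linear in the input length. A minimal sketch:

```python
import re

# Bounded character classes, no nested quantifiers: backtracking stays cheap.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]{1,64}@[A-Za-z0-9.-]{1,255}\.[A-Za-z]{2,63}")


def is_valid_email(value: str) -> bool:
    return EMAIL_RE.fullmatch(value) is not None
```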

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit b9dfb86260 by HackWeasel, 2025-12-12 17:04:45 -05:00
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,385 @@
"""
GT 2.0 Summarization Service
Provides AI-powered summarization capabilities for documents and datasets.
Uses the same pattern as conversation title generation with Llama 3.1 8B.
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime
from app.core.postgresql_client import get_postgresql_client
from app.core.resource_client import ResourceClusterClient
from app.core.config import get_settings
logger = logging.getLogger(__name__)
class SummarizationService:
"""
Service for generating AI summaries of documents and datasets.
Uses the same approach as conversation title generation:
- Llama 3.1 8B instant model
- Low temperature for consistency
- Resource cluster for AI responses
"""
def __init__(self, tenant_domain: str, user_id: str):
self.tenant_domain = tenant_domain
self.user_id = user_id
self.resource_client = ResourceClusterClient()
self.summarization_model = "llama-3.1-8b-instant"
self.settings = get_settings()
async def generate_document_summary(
self,
document_id: str,
document_content: str,
document_name: str
) -> Optional[str]:
"""
Generate AI summary for a document using Llama 3.1 8B.
Args:
document_id: UUID of the document
document_content: Full text content of the document
document_name: Original filename/name of the document
Returns:
Generated summary string or None if failed
"""
try:
# Truncate content to first 3000 chars (like conversation title generation)
content_preview = document_content[:3000]
# Create summarization prompt
prompt = f"""Summarize this document '{document_name}' in 2-3 sentences.
Focus on the main topics, key information, and purpose of the document.
Document content:
{content_preview}
Summary:"""
logger.info(f"Generating summary for document {document_id} ({document_name})")
# Call Resource Cluster with same pattern as conversation titles
summary = await self._call_ai_for_summary(
prompt=prompt,
context_type="document",
max_tokens=150
)
if summary:
# Store summary in database
await self._store_document_summary(document_id, summary)
logger.info(f"Generated summary for document {document_id}: {summary[:100]}...")
return summary
else:
logger.warning(f"Failed to generate summary for document {document_id}")
return None
except Exception as e:
logger.error(f"Error generating document summary for {document_id}: {e}")
return None
async def generate_dataset_summary(self, dataset_id: str) -> Optional[str]:
"""
Generate AI summary for a dataset based on its document summaries.
Args:
dataset_id: UUID of the dataset
Returns:
Generated dataset summary or None if failed
"""
try:
# Get all document summaries in this dataset
document_summaries = await self._get_document_summaries_for_dataset(dataset_id)
if not document_summaries:
logger.info(f"No document summaries found for dataset {dataset_id}")
return None
# Get dataset name for context
dataset_info = await self._get_dataset_info(dataset_id)
dataset_name = dataset_info.get('name', 'Unknown Dataset') if dataset_info else 'Unknown Dataset'
# Combine summaries for LLM context
combined_summaries = "\n".join([
f"- {doc['filename']}: {doc['summary']}"
for doc in document_summaries
if doc['summary'] # Only include docs that have summaries
])
if not combined_summaries.strip():
logger.info(f"No valid document summaries for dataset {dataset_id}")
return None
# Create dataset summarization prompt
prompt = f"""Based on these document summaries, create a comprehensive 3-4 sentence summary describing what the dataset '{dataset_name}' contains and its purpose:
Documents in dataset:
{combined_summaries}
Dataset summary:"""
logger.info(f"Generating summary for dataset {dataset_id} ({dataset_name})")
# Call AI for dataset summary
summary = await self._call_ai_for_summary(
prompt=prompt,
context_type="dataset",
max_tokens=200
)
if summary:
# Store dataset summary in database
await self._store_dataset_summary(dataset_id, summary)
logger.info(f"Generated dataset summary for {dataset_id}: {summary[:100]}...")
return summary
else:
logger.warning(f"Failed to generate summary for dataset {dataset_id}")
return None
except Exception as e:
logger.error(f"Error generating dataset summary for {dataset_id}: {e}")
return None
async def update_dataset_summary_on_change(self, dataset_id: str) -> bool:
"""
Regenerate dataset summary when documents are added/removed.
Args:
dataset_id: UUID of the dataset to update
Returns:
True if summary was updated successfully
"""
try:
summary = await self.generate_dataset_summary(dataset_id)
return summary is not None
except Exception as e:
logger.error(f"Error updating dataset summary for {dataset_id}: {e}")
return False
async def _call_ai_for_summary(
self,
prompt: str,
context_type: str,
max_tokens: int = 150
) -> Optional[str]:
"""
Call Resource Cluster for AI summary generation.
Uses ResourceClusterClient for consistent service discovery.
Args:
prompt: The summarization prompt
context_type: Type of summary (document, dataset)
max_tokens: Maximum tokens to generate
Returns:
Generated summary text or None if failed
"""
try:
# Prepare request payload (same format as conversation service)
request_data = {
"model": self.summarization_model,
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.3, # Lower temperature for consistent summaries
"max_tokens": max_tokens,
"top_p": 1.0
}
logger.info(f"Calling Resource Cluster for {context_type} summary generation")
# Use ResourceClusterClient for consistent service discovery and auth
result = await self.resource_client.call_inference_endpoint(
tenant_id=self.tenant_domain,
user_id=self.user_id,
endpoint="chat/completions",
data=request_data
)
if result and "choices" in result and len(result["choices"]) > 0:
summary = result["choices"][0]["message"]["content"].strip()
logger.info(f"✅ AI {context_type} summary generated successfully: {summary[:50]}...")
return summary
else:
logger.error(f"❌ Invalid AI response format for {context_type} summary: {result}")
return None
except Exception as e:
logger.error(f"❌ Error calling Resource Cluster for {context_type} summary: {e}", exc_info=True)
return None
async def _store_document_summary(self, document_id: str, summary: str) -> None:
"""Store document summary in database"""
try:
client = await get_postgresql_client()
async with client.get_connection() as conn:
schema_name = self.settings.postgres_schema
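                # NOTE: schema_name is interpolated into the SQL string, but it
                # comes from application settings, not user input; all values
                # are passed as bound parameters ($1..$4).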
await conn.execute(f"""
UPDATE {schema_name}.documents
SET summary = $1,
summary_generated_at = $2,
summary_model = $3
WHERE id = $4
""", summary, datetime.now(), self.summarization_model, document_id)
except Exception as e:
logger.error(f"Error storing document summary for {document_id}: {e}")
raise
async def _store_dataset_summary(self, dataset_id: str, summary: str) -> None:
"""Store dataset summary in database"""
try:
client = await get_postgresql_client()
async with client.get_connection() as conn:
schema_name = self.settings.postgres_schema
await conn.execute(f"""
UPDATE {schema_name}.datasets
SET summary = $1,
summary_generated_at = $2,
summary_model = $3
WHERE id = $4
""", summary, datetime.now(), self.summarization_model, dataset_id)
except Exception as e:
logger.error(f"Error storing dataset summary for {dataset_id}: {e}")
raise
async def _get_document_summaries_for_dataset(self, dataset_id: str) -> List[Dict[str, Any]]:
"""Get all document summaries for a dataset"""
try:
client = await get_postgresql_client()
async with client.get_connection() as conn:
schema_name = self.settings.postgres_schema
rows = await conn.fetch(f"""
SELECT id, filename, original_filename, summary, summary_generated_at
FROM {schema_name}.documents
WHERE dataset_id = $1
AND summary IS NOT NULL
AND summary != ''
ORDER BY created_at ASC
""", dataset_id)
return [dict(row) for row in rows]
except Exception as e:
logger.error(f"Error getting document summaries for dataset {dataset_id}: {e}")
return []
async def _get_dataset_info(self, dataset_id: str) -> Optional[Dict[str, Any]]:
"""Get basic dataset information"""
try:
client = await get_postgresql_client()
async with client.get_connection() as conn:
schema_name = self.settings.postgres_schema
row = await conn.fetchrow(f"""
SELECT id, name, description
FROM {schema_name}.datasets
WHERE id = $1
""", dataset_id)
return dict(row) if row else None
except Exception as e:
logger.error(f"Error getting dataset info for {dataset_id}: {e}")
return None
async def get_datasets_with_summaries(self, user_id: str) -> List[Dict[str, Any]]:
"""
Get all user-accessible datasets with their summaries.
Used for context injection in chat.
Args:
user_id: UUID of the user
Returns:
List of datasets with summaries
"""
try:
client = await get_postgresql_client()
async with client.get_connection() as conn:
schema_name = self.settings.postgres_schema
rows = await conn.fetch(f"""
SELECT id, name, description, summary, summary_generated_at,
document_count, total_size_bytes
FROM {schema_name}.datasets
WHERE (created_by = $1::uuid
OR access_group IN ('team', 'organization'))
AND is_active = true
ORDER BY name ASC
""", user_id)
return [dict(row) for row in rows]
except Exception as e:
logger.error(f"Error getting datasets with summaries for user {user_id}: {e}")
return []
async def get_filtered_datasets_with_summaries(
self,
user_id: str,
allowed_dataset_ids: List[str]
) -> List[Dict[str, Any]]:
"""
Get datasets with summaries filtered by allowed dataset IDs.
Used for agent-aware context injection in chat.
Args:
user_id: User UUID string
allowed_dataset_ids: List of dataset IDs the agent/user should see
Returns:
List of dataset dictionaries with summaries, filtered by allowed IDs
"""
if not allowed_dataset_ids:
logger.info(f"No allowed dataset IDs provided for user {user_id} - returning empty list")
return []
try:
client = await get_postgresql_client()
async with client.get_connection() as conn:
schema_name = self.settings.postgres_schema
# Convert dataset IDs to UUID format for query
placeholders = ",".join(f"${i+2}::uuid" for i in range(len(allowed_dataset_ids)))
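                # $1 binds user_id below, so the dataset IDs bind to $2..$(n+1).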
query = f"""
SELECT id, name, description, summary, summary_generated_at,
document_count, total_size_bytes
FROM {schema_name}.datasets
WHERE (created_by = $1::uuid
OR access_group IN ('team', 'organization'))
AND is_active = true
AND id = ANY(ARRAY[{placeholders}])
ORDER BY name ASC
"""
params = [user_id] + allowed_dataset_ids
rows = await conn.fetch(query, *params)
filtered_datasets = [dict(row) for row in rows]
logger.info(f"Filtered datasets for user {user_id}: {len(filtered_datasets)} out of {len(allowed_dataset_ids)} requested")
return filtered_datasets
except Exception as e:
logger.error(f"Error getting filtered datasets with summaries for user {user_id}: {e}")
return []
# Factory function for dependency injection
def get_summarization_service(tenant_domain: str, user_id: str) -> SummarizationService:
"""Factory function to create SummarizationService instance"""
return SummarizationService(tenant_domain, user_id)
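
# Example usage (a sketch; assumes an async caller with a valid tenant and user):
#
#     service = get_summarization_service("tenant.example.com", user_id)
#     summary = await service.generate_document_summary(doc_id, text, "report.pdf")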