GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (see the first sketch after this list)
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal (see the second sketch after this list)
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
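
To make the SSRF and hostname-validation items concrete, here is a minimal sketch of a DNS-resolution check in the spirit this release describes. The function name and policy are illustrative assumptions, not the actual GT AI OS code (which is not shown in this excerpt):

import ipaddress
import socket
from urllib.parse import urlparse

def is_url_safe(url: str) -> bool:
    """Return False for URLs whose host resolves to a non-public address."""
    hostname = urlparse(url).hostname
    if not hostname:
        return False
    try:
        # Check every resolved A/AAAA record, not just the first one
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for info in infos:
        address = info[4][0].split("%")[0]  # strip IPv6 zone ID if present
        if not ipaddress.ip_address(address).is_global:
            return False  # private, loopback, link-local, reserved, etc.
    return True

Resolving the hostname and checking every returned address (rather than substring-matching the URL) is what blocks lookalike hosts and DNS entries that point at internal services.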
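For the path-sanitization item, a centralized helper typically resolves the candidate path and refuses anything escaping a base directory. This is a hypothetical sketch; the real GT AI OS helper's name and location are not part of this excerpt:

from pathlib import Path

def sanitize_path(base_dir: str, user_path: str) -> Path:
    """Resolve user_path under base_dir and reject traversal attempts."""
    base = Path(base_dir).resolve()
    candidate = (base / user_path).resolve()  # collapses ../ sequences
    if not candidate.is_relative_to(base):  # Python 3.9+
        raise ValueError(f"Path traversal attempt blocked: {user_path}")
    return candidate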

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
HackWeasel · 2025-12-12 17:04:45 -05:00 · commit b9dfb86260
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,317 @@
"""
Document Summarization Service for GT 2.0
Generates AI-powered summaries for uploaded documents using the Resource Cluster.
Provides both quick summaries and detailed analysis for RAG visualization.
"""
import logging
import asyncio
import httpx
from typing import Dict, Any, Optional, List
from datetime import datetime
from app.core.database import get_db_session, execute_command, fetch_one
logger = logging.getLogger(__name__)
class DocumentSummarizer:
    """
    Service for generating document summaries using the Resource Cluster LLM.

    Features:
    - Quick document summaries (2-3 sentences)
    - Detailed analysis with key topics and themes
    - Metadata extraction (document type, language, etc.)
    - Integration with document processor workflow
    """

    def __init__(self):
        self.resource_cluster_url = "http://gentwo-resource-backend:8000"
        self.max_content_length = 4000  # Max chars to send for summarization
    async def generate_document_summary(
        self,
        document_id: str,
        content: str,
        filename: str,
        tenant_domain: str,
        user_id: str
    ) -> Dict[str, Any]:
        """
        Generate a comprehensive summary for a document.

        Args:
            document_id: Document ID in the database
            content: Document text content
            filename: Original filename
            tenant_domain: Tenant domain for context
            user_id: User who uploaded the document

        Returns:
            Dictionary with summary data including quick_summary, detailed_analysis,
            topics, metadata, and confidence scores
        """
        try:
            # Truncate overly long content before sending it to the LLM
            truncated_content = content[:self.max_content_length]
            if len(content) > self.max_content_length:
                truncated_content += "... [content truncated]"

            # Generate summary using the Resource Cluster LLM
            summary_data = await self._call_llm_for_summary(
                content=truncated_content,
                filename=filename,
                document_type=self._detect_document_type(filename)
            )

            # Store summary in database
            await self._store_document_summary(
                document_id=document_id,
                summary_data=summary_data,
                tenant_domain=tenant_domain,
                user_id=user_id
            )

            logger.info(f"Generated summary for document {document_id}: {filename}")
            return summary_data

        except Exception as e:
            logger.error(f"Failed to generate summary for document {document_id}: {e}")
            # Return a basic fallback summary so callers always get usable data
            return {
                "quick_summary": f"Document: {filename}",
                "detailed_analysis": "Summary generation failed",
                "topics": [],
                "metadata": {
                    "document_type": self._detect_document_type(filename),
                    # Rough minutes-to-read estimate: ~200 wpm at ~5 chars/word
                    "estimated_read_time": max(1, len(content) // 1000),
                    "character_count": len(content),
                    "language": "unknown"
                },
                "confidence": 0.0,
                "error": str(e)
            }
    async def _call_llm_for_summary(
        self,
        content: str,
        filename: str,
        document_type: str
    ) -> Dict[str, Any]:
        """Call the Resource Cluster LLM to generate a document summary."""
        prompt = f"""Analyze this {document_type} document and provide a comprehensive summary.

Document: {filename}

Content:
{content}

Please provide:
1. A concise 2-3 sentence summary
2. Key topics and themes (list)
3. Document analysis including tone, purpose, and target audience
4. Estimated language and reading level

Format your response as JSON with these keys:
- quick_summary: Brief 2-3 sentence overview
- detailed_analysis: Paragraph with deeper insights
- topics: Array of key topics/themes
- metadata: Object with language, tone, purpose, target_audience
- confidence: Float 0-1 indicating analysis confidence"""
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    f"{self.resource_cluster_url}/api/v1/ai/chat/completions",
                    json={
                        "model": "llama-3.1-8b-instant",
                        "messages": [
                            {
                                "role": "system",
                                "content": "You are a document analysis expert. Provide accurate, concise summaries in valid JSON format."
                            },
                            {
                                "role": "user",
                                "content": prompt
                            }
                        ],
                        "temperature": 0.3,
                        "max_tokens": 1000
                    },
                    headers={
                        "X-Tenant-ID": "default",
                        "Content-Type": "application/json"
                    }
                )

                if response.status_code == 200:
                    llm_response = response.json()
                    content_text = llm_response["choices"][0]["message"]["content"]

                    # Try to parse the structured JSON response
                    try:
                        summary_data = json.loads(content_text)
                        # Validate required fields and add defaults if missing
                        return {
                            "quick_summary": summary_data.get("quick_summary", f"Analysis of {filename}"),
                            "detailed_analysis": summary_data.get("detailed_analysis", "Detailed analysis not available"),
                            "topics": summary_data.get("topics", []),
                            "metadata": {
                                **summary_data.get("metadata", {}),
                                "document_type": document_type,
                                "generated_at": datetime.utcnow().isoformat()
                            },
                            "confidence": min(1.0, max(0.0, summary_data.get("confidence", 0.7)))
                        }
                    except json.JSONDecodeError:
                        # Fallback if the LLM doesn't return valid JSON
                        return {
                            "quick_summary": content_text[:200] + "..." if len(content_text) > 200 else content_text,
                            "detailed_analysis": content_text,
                            "topics": [],
                            "metadata": {
                                "document_type": document_type,
                                "generated_at": datetime.utcnow().isoformat(),
                                "note": "Summary extracted from free-form LLM response"
                            },
                            "confidence": 0.5
                        }
                else:
                    raise Exception(f"Resource Cluster API error: {response.status_code}")
        except Exception as e:
            logger.error(f"LLM summarization failed: {e}")
            raise
    async def _store_document_summary(
        self,
        document_id: str,
        summary_data: Dict[str, Any],
        tenant_domain: str,
        user_id: str
    ):
        """Store the generated summary in the database."""
        # Use the same database session pattern as the document processor
        async with get_db_session(tenant_domain) as session:
            try:
                # Insert or update the document summary (upsert on document_id)
                query = """
                    INSERT INTO document_summaries (
                        document_id, user_id, quick_summary, detailed_analysis,
                        topics, metadata, confidence, created_at, updated_at
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                    ON CONFLICT (document_id)
                    DO UPDATE SET
                        quick_summary = EXCLUDED.quick_summary,
                        detailed_analysis = EXCLUDED.detailed_analysis,
                        topics = EXCLUDED.topics,
                        metadata = EXCLUDED.metadata,
                        confidence = EXCLUDED.confidence,
                        updated_at = EXCLUDED.updated_at
                """
                await execute_command(
                    session,
                    query,
                    document_id,
                    user_id,
                    summary_data["quick_summary"],
                    summary_data["detailed_analysis"],
                    json.dumps(summary_data["topics"]),
                    json.dumps(summary_data["metadata"]),
                    summary_data["confidence"],
                    datetime.utcnow(),
                    datetime.utcnow()
                )
                logger.info(f"Stored summary for document {document_id}")
            except Exception as e:
                logger.error(f"Failed to store document summary: {e}")
                raise
    def _detect_document_type(self, filename: str) -> str:
        """Detect the document type from the filename extension."""
        extension = pathlib.Path(filename).suffix.lower()
        type_mapping = {
            '.pdf': 'PDF document',
            '.docx': 'Word document',
            '.doc': 'Word document',
            '.txt': 'Text file',
            '.md': 'Markdown document',
            '.csv': 'CSV data file',
            '.json': 'JSON data file',
            '.html': 'HTML document',
            '.htm': 'HTML document',
            '.rtf': 'Rich text document'
        }
        return type_mapping.get(extension, 'Unknown document type')
    async def get_document_summary(
        self,
        document_id: str,
        tenant_domain: str
    ) -> Optional[Dict[str, Any]]:
        """Retrieve a stored document summary."""
        async with get_db_session(tenant_domain) as session:
            try:
                query = """
                    SELECT quick_summary, detailed_analysis, topics, metadata,
                           confidence, created_at, updated_at
                    FROM document_summaries
                    WHERE document_id = $1
                """
                result = await fetch_one(session, query, document_id)
                if result:
                    return {
                        "quick_summary": result["quick_summary"],
                        "detailed_analysis": result["detailed_analysis"],
                        "topics": json.loads(result["topics"]) if result["topics"] else [],
                        "metadata": json.loads(result["metadata"]) if result["metadata"] else {},
                        "confidence": result["confidence"],
                        "created_at": result["created_at"].isoformat(),
                        "updated_at": result["updated_at"].isoformat()
                    }
                return None
            except Exception as e:
                logger.error(f"Failed to retrieve document summary: {e}")
                return None
# Global instance shared by the convenience functions below
document_summarizer = DocumentSummarizer()


async def generate_document_summary(
    document_id: str,
    content: str,
    filename: str,
    tenant_domain: str,
    user_id: str
) -> Dict[str, Any]:
    """Convenience function for document summary generation."""
    return await document_summarizer.generate_document_summary(
        document_id, content, filename, tenant_domain, user_id
    )


async def get_document_summary(document_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
    """Convenience function for retrieving a document summary."""
    return await document_summarizer.get_document_summary(document_id, tenant_domain)
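
A minimal usage sketch of the convenience functions above, assuming the module lives at app/services/document_summarizer (an assumption; the file path is not shown in this excerpt) and runs inside the GT 2.0 backend where the database and Resource Cluster are reachable. All IDs and the tenant domain are placeholders:

import asyncio

# Import path is an assumption based on the module's imports and docstring
from app.services.document_summarizer import generate_document_summary, get_document_summary

async def main():
    # Generate and store a summary for a freshly uploaded document
    summary = await generate_document_summary(
        document_id="doc-123",
        content="Quarterly revenue grew 12%, driven by subscription renewals...",
        filename="q3-report.pdf",
        tenant_domain="acme.example.com",
        user_id="user-456",
    )
    print(summary["quick_summary"])

    # Later, fetch the stored summary for RAG visualization
    stored = await get_document_summary("doc-123", "acme.example.com")
    if stored:
        print(stored["topics"])

asyncio.run(main())

Note that generate_document_summary never raises on LLM failure; it returns a fallback dict with confidence 0.0 and an "error" key, so callers should check those fields rather than rely on exceptions.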