gt-ai-os-community/apps/tenant-backend/app/services/dataset_summarizer.py
HackWeasel · b9dfb86260 · GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching; see the sketch below)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00
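
As a hedged illustration of the second and third items above (SSRF protection via DNS resolution checking, and hostname validation instead of substring matching), a minimal Python sketch might look like the following. This is not the repository's actual implementation; the function name and the allow-list contents are assumptions for illustration only.

import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com", "files.example.com"}  # hypothetical allow-list

def is_url_allowed(url: str) -> bool:
    """Accept a URL only if its hostname is allow-listed and resolves to a public address."""
    hostname = urlparse(url).hostname
    if not hostname or hostname not in ALLOWED_HOSTS:
        # Exact hostname comparison rather than substring matching on the raw URL.
        return False
    try:
        # Resolve the hostname and reject private, loopback, or link-local targets (SSRF guard).
        for _family, _type, _proto, _canon, sockaddr in socket.getaddrinfo(hostname, None):
            ip = ipaddress.ip_address(sockaddr[0])
            if ip.is_private or ip.is_loopback or ip.is_link_local:
                return False
    except (socket.gaierror, ValueError):
        return False
    return True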

445 lines · 18 KiB · Python

"""
Dataset Summarization Service for GT 2.0
Generates comprehensive summaries for datasets based on their constituent documents.
Provides analytics, topic clustering, and overview generation for RAG optimization.
"""
import logging
import asyncio
import httpx
import json
from typing import Dict, Any, Optional, List
from datetime import datetime
from collections import Counter
from app.core.database import get_db_session, execute_command, fetch_one, fetch_all
logger = logging.getLogger(__name__)
class DatasetSummarizer:
"""
Service for generating dataset-level summaries and analytics.
Features:
- Aggregate document summaries into dataset overview
- Topic clustering and theme analysis
- Dataset statistics and metrics
- Search optimization recommendations
- RAG performance insights
"""
def __init__(self):
self.resource_cluster_url = "http://gentwo-resource-backend:8000"
async def generate_dataset_summary(
self,
dataset_id: str,
tenant_domain: str,
user_id: str,
force_regenerate: bool = False
) -> Dict[str, Any]:
"""
Generate a comprehensive summary for a dataset.
Args:
dataset_id: Dataset ID to summarize
tenant_domain: Tenant domain for database context
user_id: User requesting the summary
force_regenerate: Force regeneration even if summary exists
Returns:
Dictionary with dataset summary including overview, topics,
statistics, and search optimization insights
"""
try:
# Check if summary already exists and is recent
if not force_regenerate:
existing_summary = await self._get_existing_summary(dataset_id, tenant_domain)
if existing_summary and self._is_summary_fresh(existing_summary):
logger.info(f"Using cached dataset summary for {dataset_id}")
return existing_summary
# Get dataset information and documents
dataset_info = await self._get_dataset_info(dataset_id, tenant_domain)
if not dataset_info:
raise ValueError(f"Dataset {dataset_id} not found")
documents = await self._get_dataset_documents(dataset_id, tenant_domain)
document_summaries = await self._get_document_summaries(dataset_id, tenant_domain)
# Generate statistics
stats = await self._calculate_dataset_statistics(dataset_id, tenant_domain)
# Analyze topics across all documents
topics_analysis = await self._analyze_dataset_topics(document_summaries)
# Generate overall summary using LLM
overview = await self._generate_dataset_overview(
dataset_info, document_summaries, topics_analysis, stats
)
# Create comprehensive summary
summary_data = {
"dataset_id": dataset_id,
"overview": overview,
"statistics": stats,
"topics": topics_analysis,
"recommendations": await self._generate_search_recommendations(stats, topics_analysis),
"metadata": {
"document_count": len(documents),
"has_summaries": len(document_summaries),
"generated_at": datetime.utcnow().isoformat(),
"generated_by": user_id
}
}
# Store summary in database
await self._store_dataset_summary(dataset_id, summary_data, tenant_domain, user_id)
logger.info(f"Generated dataset summary for {dataset_id} with {len(documents)} documents")
return summary_data
except Exception as e:
logger.error(f"Failed to generate dataset summary for {dataset_id}: {e}")
# Return basic fallback summary
return {
"dataset_id": dataset_id,
"overview": "Dataset summary generation failed",
"statistics": {"error": str(e)},
"topics": [],
"recommendations": [],
"metadata": {
"generated_at": datetime.utcnow().isoformat(),
"error": str(e)
}
}
async def _get_dataset_info(self, dataset_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
"""Get basic dataset information"""
async with get_db_session() as session:
query = """
SELECT id, dataset_name, description, chunking_strategy,
chunk_size, chunk_overlap, created_at
FROM datasets
WHERE id = $1
"""
result = await fetch_one(session, query, dataset_id)
return dict(result) if result else None
async def _get_dataset_documents(self, dataset_id: str, tenant_domain: str) -> List[Dict[str, Any]]:
"""Get all documents in the dataset"""
async with get_db_session() as session:
query = """
SELECT id, filename, original_filename, file_type,
file_size_bytes, chunk_count, created_at
FROM documents
WHERE dataset_id = $1 AND processing_status = 'completed'
ORDER BY created_at DESC
"""
results = await fetch_all(session, query, dataset_id)
return [dict(row) for row in results]
async def _get_document_summaries(self, dataset_id: str, tenant_domain: str) -> List[Dict[str, Any]]:
"""Get summaries for all documents in the dataset"""
async with get_db_session() as session:
query = """
SELECT ds.document_id, ds.quick_summary, ds.detailed_analysis,
ds.topics, ds.metadata, ds.confidence,
d.filename, d.original_filename
FROM document_summaries ds
JOIN documents d ON ds.document_id = d.id
WHERE d.dataset_id = $1
ORDER BY ds.created_at DESC
"""
results = await fetch_all(session, query, dataset_id)
summaries = []
for row in results:
summary = dict(row)
# Parse JSON fields
if summary["topics"]:
summary["topics"] = json.loads(summary["topics"])
if summary["metadata"]:
summary["metadata"] = json.loads(summary["metadata"])
summaries.append(summary)
return summaries
async def _calculate_dataset_statistics(self, dataset_id: str, tenant_domain: str) -> Dict[str, Any]:
"""Calculate comprehensive dataset statistics"""
async with get_db_session() as session:
# Basic document statistics
doc_stats_query = """
SELECT
COUNT(*) as total_documents,
SUM(file_size_bytes) as total_size_bytes,
SUM(chunk_count) as total_chunks,
AVG(chunk_count) as avg_chunks_per_doc,
COUNT(DISTINCT file_type) as unique_file_types
FROM documents
WHERE dataset_id = $1 AND processing_status = 'completed'
"""
doc_stats = await fetch_one(session, doc_stats_query, dataset_id)
# Chunk statistics
chunk_stats_query = """
SELECT
COUNT(*) as total_vector_embeddings,
AVG(token_count) as avg_tokens_per_chunk,
MIN(token_count) as min_tokens,
MAX(token_count) as max_tokens
FROM document_chunks
WHERE dataset_id = $1
"""
chunk_stats = await fetch_one(session, chunk_stats_query, dataset_id)
# File type distribution
file_types_query = """
SELECT file_type, COUNT(*) as count
FROM documents
WHERE dataset_id = $1 AND processing_status = 'completed'
GROUP BY file_type
ORDER BY count DESC
"""
file_types_results = await fetch_all(session, file_types_query, dataset_id)
file_types = {row["file_type"]: row["count"] for row in file_types_results}
return {
"documents": {
"total": doc_stats["total_documents"] or 0,
"total_size_mb": round((doc_stats["total_size_bytes"] or 0) / 1024 / 1024, 2),
"avg_chunks_per_document": round(doc_stats["avg_chunks_per_doc"] or 0, 1),
"unique_file_types": doc_stats["unique_file_types"] or 0,
"file_type_distribution": file_types
},
"chunks": {
"total": chunk_stats["total_vector_embeddings"] or 0,
"avg_tokens": round(chunk_stats["avg_tokens_per_chunk"] or 0, 1),
"token_range": {
"min": chunk_stats["min_tokens"] or 0,
"max": chunk_stats["max_tokens"] or 0
}
},
"search_readiness": {
"has_vectors": (chunk_stats["total_vector_embeddings"] or 0) > 0,
"vector_coverage": 1.0 if (doc_stats["total_chunks"] or 0) == (chunk_stats["total_vector_embeddings"] or 0) else 0.0
}
}
async def _analyze_dataset_topics(self, document_summaries: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Analyze topics across all document summaries"""
if not document_summaries:
return {"main_topics": [], "topic_distribution": {}, "confidence": 0.0}
# Collect all topics from document summaries
all_topics = []
for summary in document_summaries:
topics = summary.get("topics", [])
if isinstance(topics, list):
all_topics.extend(topics)
# Count topic frequencies
topic_counts = Counter(all_topics)
# Get top topics
main_topics = [topic for topic, count in topic_counts.most_common(10)]
# Calculate topic distribution
total_topics = len(all_topics)
topic_distribution = {}
if total_topics > 0:
for topic, count in topic_counts.items():
topic_distribution[topic] = round(count / total_topics, 3)
# Calculate confidence based on number of summaries available
confidence = min(1.0, len(document_summaries) / 5.0) # Full confidence with 5+ documents
return {
"main_topics": main_topics,
"topic_distribution": topic_distribution,
"confidence": confidence,
"total_unique_topics": len(topic_counts)
}
async def _generate_dataset_overview(
self,
dataset_info: Dict[str, Any],
document_summaries: List[Dict[str, Any]],
topics_analysis: Dict[str, Any],
stats: Dict[str, Any]
) -> str:
"""Generate LLM-powered overview of the dataset"""
# Create context for LLM
context = f"""Dataset: {dataset_info['dataset_name']}
Description: {dataset_info.get('description', 'No description provided')}
Statistics:
- {stats['documents']['total']} documents ({stats['documents']['total_size_mb']} MB)
- {stats['chunks']['total']} text chunks for search
- Average {stats['documents']['avg_chunks_per_document']} chunks per document
Main Topics: {', '.join(topics_analysis['main_topics'][:5])}
Document Summaries:
"""
# Add sample document summaries
for i, summary in enumerate(document_summaries[:3]): # First 3 documents
context += f"\n{i+1}. {summary['filename']}: {summary['quick_summary']}"
prompt = f"""Analyze this dataset and provide a comprehensive 2-3 paragraph overview.
{context}
Focus on:
1. What type of content this dataset contains
2. The main themes and topics covered
3. How useful this would be for AI-powered search and retrieval
4. Any notable patterns or characteristics
Provide a professional, informative summary suitable for users exploring their datasets."""
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
f"{self.resource_cluster_url}/api/v1/ai/chat/completions",
json={
"model": "llama-3.1-8b-instant",
"messages": [
{
"role": "system",
"content": "You are a data analysis expert. Provide clear, insightful dataset summaries."
},
{
"role": "user",
"content": prompt
}
],
"temperature": 0.3,
"max_tokens": 500
},
headers={
"X-Tenant-ID": "default",
"Content-Type": "application/json"
}
)
if response.status_code == 200:
llm_response = response.json()
return llm_response["choices"][0]["message"]["content"]
else:
raise Exception(f"LLM API error: {response.status_code}")
except Exception as e:
logger.warning(f"LLM overview generation failed: {e}")
# Fallback to template-based overview
return f"This dataset contains {stats['documents']['total']} documents covering topics such as {', '.join(topics_analysis['main_topics'][:3])}. The dataset includes {stats['chunks']['total']} searchable text chunks optimized for AI-powered retrieval and question answering."
async def _generate_search_recommendations(
self,
stats: Dict[str, Any],
topics_analysis: Dict[str, Any]
) -> List[str]:
"""Generate recommendations for optimizing search performance"""
recommendations = []
# Vector coverage recommendations
if not stats["search_readiness"]["has_vectors"]:
recommendations.append("Generate vector embeddings for all documents to enable semantic search")
elif stats["search_readiness"]["vector_coverage"] < 1.0:
recommendations.append("Complete vector embedding generation for optimal search performance")
# Chunk size recommendations
avg_tokens = stats["chunks"]["avg_tokens"]
if avg_tokens < 100:
recommendations.append("Consider increasing chunk size for better context in search results")
elif avg_tokens > 600:
recommendations.append("Consider reducing chunk size for more precise search matches")
# Topic diversity recommendations
if topics_analysis["total_unique_topics"] < 3:
recommendations.append("Dataset may benefit from more diverse content for comprehensive coverage")
elif topics_analysis["total_unique_topics"] > 50:
recommendations.append("Consider organizing content into focused sub-datasets for better search precision")
# Document count recommendations
doc_count = stats["documents"]["total"]
if doc_count < 5:
recommendations.append("Add more documents to improve search quality and coverage")
elif doc_count > 100:
recommendations.append("Consider implementing advanced filtering and categorization for better navigation")
return recommendations[:5] # Limit to top 5 recommendations
async def _store_dataset_summary(
self,
dataset_id: str,
summary_data: Dict[str, Any],
tenant_domain: str,
user_id: str
):
"""Store or update dataset summary in database"""
async with get_db_session() as session:
query = """
UPDATE datasets
SET
summary = $1,
summary_generated_at = $2,
updated_at = NOW()
WHERE id = $3
"""
await execute_command(
session,
query,
json.dumps(summary_data),
datetime.utcnow(),
dataset_id
)
async def _get_existing_summary(self, dataset_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
"""Get existing dataset summary if available"""
async with get_db_session() as session:
query = """
SELECT summary, summary_generated_at
FROM datasets
WHERE id = $1 AND summary IS NOT NULL
"""
result = await fetch_one(session, query, dataset_id)
if result and result["summary"]:
return json.loads(result["summary"])
return None
def _is_summary_fresh(self, summary: Dict[str, Any], max_age_hours: int = 24) -> bool:
"""Check if summary is recent enough to avoid regeneration"""
try:
generated_at = datetime.fromisoformat(summary["metadata"]["generated_at"])
age_hours = (datetime.utcnow() - generated_at).total_seconds() / 3600
return age_hours < max_age_hours
except (KeyError, ValueError):
return False
# Global instance
dataset_summarizer = DatasetSummarizer()
async def generate_dataset_summary(
dataset_id: str,
tenant_domain: str,
user_id: str,
force_regenerate: bool = False
) -> Dict[str, Any]:
"""Convenience function for dataset summary generation"""
return await dataset_summarizer.generate_dataset_summary(
dataset_id, tenant_domain, user_id, force_regenerate
)
async def get_dataset_summary(dataset_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
"""Convenience function for retrieving dataset summary"""
return await dataset_summarizer._get_existing_summary(dataset_id, tenant_domain)
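

# Example usage (illustrative sketch only, not part of the service API). The
# dataset ID, tenant domain, and user ID below are hypothetical placeholders,
# and a reachable tenant database plus resource cluster are assumed.
if __name__ == "__main__":
    async def _demo() -> None:
        summary = await generate_dataset_summary(
            dataset_id="00000000-0000-0000-0000-000000000000",  # hypothetical dataset ID
            tenant_domain="example-tenant.local",               # hypothetical tenant domain
            user_id="demo-user",                                # hypothetical user ID
        )
        print(summary["overview"])

    asyncio.run(_demo())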