- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2 - Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2 - Made more general-purpose (flexible targets, expanded tools) - Added nemotron-mini-agent.csv for fast local inference via Ollama - Added nemotron-agent.csv for advanced reasoning via Ollama - Added wiki page: Projects for NVIDIA NIMs and Nemotron
317 lines
12 KiB
Python
317 lines
12 KiB
Python
"""
|
|
Document Summarization Service for GT 2.0
|
|
|
|
Generates AI-powered summaries for uploaded documents using the Resource Cluster.
|
|
Provides both quick summaries and detailed analysis for RAG visualization.
|
|
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import httpx
|
|
from typing import Dict, Any, Optional, List
|
|
from datetime import datetime
|
|
|
|
from app.core.database import get_db_session, execute_command, fetch_one
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DocumentSummarizer:
|
|
"""
|
|
Service for generating document summaries using Resource Cluster LLM.
|
|
|
|
Features:
|
|
- Quick document summaries (2-3 sentences)
|
|
- Detailed analysis with key topics and themes
|
|
- Metadata extraction (document type, language, etc.)
|
|
- Integration with document processor workflow
|
|
"""
|
|
|
|
def __init__(self):
|
|
self.resource_cluster_url = "http://gentwo-resource-backend:8000"
|
|
self.max_content_length = 4000 # Max chars to send for summarization
|
|
|
|
async def generate_document_summary(
|
|
self,
|
|
document_id: str,
|
|
content: str,
|
|
filename: str,
|
|
tenant_domain: str,
|
|
user_id: str
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Generate a comprehensive summary for a document.
|
|
|
|
Args:
|
|
document_id: Document ID in the database
|
|
content: Document text content
|
|
filename: Original filename
|
|
tenant_domain: Tenant domain for context
|
|
user_id: User who uploaded the document
|
|
|
|
Returns:
|
|
Dictionary with summary data including quick_summary, detailed_analysis,
|
|
topics, metadata, and confidence scores
|
|
"""
|
|
try:
|
|
# Truncate content if too long
|
|
truncated_content = content[:self.max_content_length]
|
|
if len(content) > self.max_content_length:
|
|
truncated_content += "... [content truncated]"
|
|
|
|
# Generate summary using Resource Cluster LLM
|
|
summary_data = await self._call_llm_for_summary(
|
|
content=truncated_content,
|
|
filename=filename,
|
|
document_type=self._detect_document_type(filename)
|
|
)
|
|
|
|
# Store summary in database
|
|
await self._store_document_summary(
|
|
document_id=document_id,
|
|
summary_data=summary_data,
|
|
tenant_domain=tenant_domain,
|
|
user_id=user_id
|
|
)
|
|
|
|
logger.info(f"Generated summary for document {document_id}: {filename}")
|
|
return summary_data
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to generate summary for document {document_id}: {e}")
|
|
# Return basic fallback summary
|
|
return {
|
|
"quick_summary": f"Document: {filename}",
|
|
"detailed_analysis": "Summary generation failed",
|
|
"topics": [],
|
|
"metadata": {
|
|
"document_type": self._detect_document_type(filename),
|
|
"estimated_read_time": len(content) // 200, # ~200 words per minute
|
|
"character_count": len(content),
|
|
"language": "unknown"
|
|
},
|
|
"confidence": 0.0,
|
|
"error": str(e)
|
|
}
|
|
|
|
async def _call_llm_for_summary(
|
|
self,
|
|
content: str,
|
|
filename: str,
|
|
document_type: str
|
|
) -> Dict[str, Any]:
|
|
"""Call Resource Cluster LLM to generate document summary"""
|
|
|
|
prompt = f"""Analyze this {document_type} document and provide a comprehensive summary.
|
|
|
|
Document: {filename}
|
|
Content:
|
|
{content}
|
|
|
|
Please provide:
|
|
1. A concise 2-3 sentence summary
|
|
2. Key topics and themes (list)
|
|
3. Document analysis including tone, purpose, and target audience
|
|
4. Estimated language and reading level
|
|
|
|
Format your response as JSON with these keys:
|
|
- quick_summary: Brief 2-3 sentence overview
|
|
- detailed_analysis: Paragraph with deeper insights
|
|
- topics: Array of key topics/themes
|
|
- metadata: Object with language, tone, purpose, target_audience
|
|
- confidence: Float 0-1 indicating analysis confidence"""
|
|
|
|
try:
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
response = await client.post(
|
|
f"{self.resource_cluster_url}/api/v1/ai/chat/completions",
|
|
json={
|
|
"model": "llama-3.1-8b-instant",
|
|
"messages": [
|
|
{
|
|
"role": "system",
|
|
"content": "You are a document analysis expert. Provide accurate, concise summaries in valid JSON format."
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": prompt
|
|
}
|
|
],
|
|
"temperature": 0.3,
|
|
"max_tokens": 1000
|
|
},
|
|
headers={
|
|
"X-Tenant-ID": "default",
|
|
"Content-Type": "application/json"
|
|
}
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
llm_response = response.json()
|
|
content_text = llm_response["choices"][0]["message"]["content"]
|
|
|
|
# Try to parse JSON response
|
|
try:
|
|
import json
|
|
summary_data = json.loads(content_text)
|
|
|
|
# Validate required fields and add defaults if missing
|
|
return {
|
|
"quick_summary": summary_data.get("quick_summary", f"Analysis of {filename}"),
|
|
"detailed_analysis": summary_data.get("detailed_analysis", "Detailed analysis not available"),
|
|
"topics": summary_data.get("topics", []),
|
|
"metadata": {
|
|
**summary_data.get("metadata", {}),
|
|
"document_type": document_type,
|
|
"generated_at": datetime.utcnow().isoformat()
|
|
},
|
|
"confidence": min(1.0, max(0.0, summary_data.get("confidence", 0.7)))
|
|
}
|
|
|
|
except json.JSONDecodeError:
|
|
# Fallback if LLM doesn't return valid JSON
|
|
return {
|
|
"quick_summary": content_text[:200] + "..." if len(content_text) > 200 else content_text,
|
|
"detailed_analysis": content_text,
|
|
"topics": [],
|
|
"metadata": {
|
|
"document_type": document_type,
|
|
"generated_at": datetime.utcnow().isoformat(),
|
|
"note": "Summary extracted from free-form LLM response"
|
|
},
|
|
"confidence": 0.5
|
|
}
|
|
else:
|
|
raise Exception(f"Resource Cluster API error: {response.status_code}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"LLM summarization failed: {e}")
|
|
raise
|
|
|
|
async def _store_document_summary(
|
|
self,
|
|
document_id: str,
|
|
summary_data: Dict[str, Any],
|
|
tenant_domain: str,
|
|
user_id: str
|
|
):
|
|
"""Store generated summary in database"""
|
|
|
|
# Use the same database session pattern as document processor
|
|
async with get_db_session(tenant_domain) as session:
|
|
try:
|
|
# Insert or update document summary
|
|
query = """
|
|
INSERT INTO document_summaries (
|
|
document_id, user_id, quick_summary, detailed_analysis,
|
|
topics, metadata, confidence, created_at, updated_at
|
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
|
ON CONFLICT (document_id)
|
|
DO UPDATE SET
|
|
quick_summary = EXCLUDED.quick_summary,
|
|
detailed_analysis = EXCLUDED.detailed_analysis,
|
|
topics = EXCLUDED.topics,
|
|
metadata = EXCLUDED.metadata,
|
|
confidence = EXCLUDED.confidence,
|
|
updated_at = EXCLUDED.updated_at
|
|
"""
|
|
|
|
import json
|
|
await execute_command(
|
|
session,
|
|
query,
|
|
document_id,
|
|
user_id,
|
|
summary_data["quick_summary"],
|
|
summary_data["detailed_analysis"],
|
|
json.dumps(summary_data["topics"]),
|
|
json.dumps(summary_data["metadata"]),
|
|
summary_data["confidence"],
|
|
datetime.utcnow(),
|
|
datetime.utcnow()
|
|
)
|
|
|
|
logger.info(f"Stored summary for document {document_id}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to store document summary: {e}")
|
|
raise
|
|
|
|
def _detect_document_type(self, filename: str) -> str:
|
|
"""Detect document type from filename extension"""
|
|
import pathlib
|
|
|
|
extension = pathlib.Path(filename).suffix.lower()
|
|
|
|
type_mapping = {
|
|
'.pdf': 'PDF document',
|
|
'.docx': 'Word document',
|
|
'.doc': 'Word document',
|
|
'.txt': 'Text file',
|
|
'.md': 'Markdown document',
|
|
'.csv': 'CSV data file',
|
|
'.json': 'JSON data file',
|
|
'.html': 'HTML document',
|
|
'.htm': 'HTML document',
|
|
'.rtf': 'Rich text document'
|
|
}
|
|
|
|
return type_mapping.get(extension, 'Unknown document type')
|
|
|
|
async def get_document_summary(
|
|
self,
|
|
document_id: str,
|
|
tenant_domain: str
|
|
) -> Optional[Dict[str, Any]]:
|
|
"""Retrieve stored document summary"""
|
|
|
|
async with get_db_session(tenant_domain) as session:
|
|
try:
|
|
query = """
|
|
SELECT quick_summary, detailed_analysis, topics, metadata,
|
|
confidence, created_at, updated_at
|
|
FROM document_summaries
|
|
WHERE document_id = $1
|
|
"""
|
|
|
|
result = await fetch_one(session, query, document_id)
|
|
|
|
if result:
|
|
import json
|
|
return {
|
|
"quick_summary": result["quick_summary"],
|
|
"detailed_analysis": result["detailed_analysis"],
|
|
"topics": json.loads(result["topics"]) if result["topics"] else [],
|
|
"metadata": json.loads(result["metadata"]) if result["metadata"] else {},
|
|
"confidence": result["confidence"],
|
|
"created_at": result["created_at"].isoformat(),
|
|
"updated_at": result["updated_at"].isoformat()
|
|
}
|
|
|
|
return None
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to retrieve document summary: {e}")
|
|
return None
|
|
|
|
|
|
# Global instance
|
|
document_summarizer = DocumentSummarizer()
|
|
|
|
|
|
async def generate_document_summary(
|
|
document_id: str,
|
|
content: str,
|
|
filename: str,
|
|
tenant_domain: str,
|
|
user_id: str
|
|
) -> Dict[str, Any]:
|
|
"""Convenience function for document summary generation"""
|
|
return await document_summarizer.generate_document_summary(
|
|
document_id, content, filename, tenant_domain, user_id
|
|
)
|
|
|
|
|
|
async def get_document_summary(document_id: str, tenant_domain: str) -> Optional[Dict[str, Any]]:
|
|
"""Convenience function for retrieving document summary"""
|
|
return await document_summarizer.get_document_summary(document_id, tenant_domain) |