GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking (illustrated in the first sketch after this list)
- Implement proper URL hostname validation (replaces substring matching; also covered by the first sketch)
- Add centralized path sanitization to prevent path traversal (illustrated in the second sketch after this list)
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives
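
The sketches below illustrate the style of check the SSRF, hostname-validation, and path-sanitization items refer to. Both are minimal illustrations with assumed helper names and an assumed allowlist; they are not the code shipped in this release.

```python
# Illustrative only: exact-hostname allowlisting plus a DNS resolution check,
# as opposed to substring matching on the raw URL string.
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allowlist


def is_url_allowed(url: str) -> bool:
    host = urlparse(url).hostname
    if host is None or host not in ALLOWED_HOSTS:
        # Exact match rejects lookalikes such as "api.example.com.evil.net",
        # which substring matching would have accepted.
        return False
    try:
        # Resolve the hostname and reject private, loopback, and link-local
        # targets to block SSRF via DNS tricks.
        infos = socket.getaddrinfo(host, None)
    except socket.gaierror:
        return False
    for info in infos:
        ip = ipaddress.ip_address(info[4][0].split("%")[0])
        if ip.is_private or ip.is_loopback or ip.is_link_local:
            return False
    return True
```

Path sanitization can follow the same allowlist-by-construction idea: resolve the user-supplied path and require it to stay under a known base directory.

```python
# Illustrative only: centralized path sanitization against traversal.
from pathlib import Path


def sanitize_path(base_dir: Path, user_supplied: str) -> Path:
    # Resolving collapses ".." segments and symlinks; the containment check
    # then rejects anything that escapes base_dir.
    base = base_dir.resolve()
    candidate = (base / user_supplied).resolve()
    if not candidate.is_relative_to(base):
        raise ValueError(f"Path escapes allowed directory: {user_supplied!r}")
    return candidate
```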

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: HackWeasel
Date:   2025-12-12 17:04:45 -05:00
Commit: b9dfb86260
746 changed files with 232071 additions and 0 deletions


@@ -0,0 +1,834 @@
"""
Document Processing Service for GT 2.0
Handles file upload, text extraction, chunking, and embedding generation
for RAG pipeline. Supports multiple file formats with intelligent chunking.
"""
import asyncio
import logging
import hashlib
import mimetypes
import re
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import uuid
# Document processing libraries
import pypdf as PyPDF2 # pypdf is the maintained successor to PyPDF2
import docx
import pandas as pd
import json
import csv
from io import StringIO
# Database and core services
from app.core.postgresql_client import get_postgresql_client
# Resource cluster client for embeddings
import httpx
from app.services.embedding_client import get_embedding_client
# Document summarization
from app.services.summarization_service import SummarizationService
logger = logging.getLogger(__name__)
class DocumentProcessor:
"""
Comprehensive document processing service for RAG pipeline.
Features:
- Multi-format support (PDF, DOCX, TXT, MD, CSV, JSON)
- Intelligent chunking with overlap
- Async embedding generation with batch processing
- Progress tracking
- Error handling and recovery
"""
def __init__(self, db=None, tenant_domain=None):
self.db = db
self.tenant_domain = tenant_domain or "test" # Default fallback
# Use configurable embedding client instead of hardcoded URL
self.embedding_client = get_embedding_client()
self.chunk_size = 512 # Default chunk size in tokens
self.chunk_overlap = 128 # Default overlap
self.max_file_size = 100 * 1024 * 1024 # 100MB limit
# Embedding batch processing configuration
self.EMBEDDING_BATCH_SIZE = 15 # Process embeddings in batches of 15 (ARM64 optimized)
self.MAX_CONCURRENT_BATCHES = 3 # Process up to 3 batches concurrently
self.MAX_RETRIES = 3 # Maximum retries per batch
self.INITIAL_RETRY_DELAY = 1.0 # Initial delay in seconds
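        # With these defaults a failed batch is retried after 1s, 2s, and 4s (INITIAL_RETRY_DELAY * 2**attempt)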
# Supported file types
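        # The extension -> MIME mapping below drives both upload validation and extractor selection in _extract_text()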
self.supported_types = {
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.txt': 'text/plain',
'.md': 'text/markdown',
'.csv': 'text/csv',
'.json': 'application/json'
}
async def process_file(
self,
file_path: Path,
dataset_id: str,
user_id: str,
original_filename: str,
document_id: Optional[str] = None
) -> Dict[str, Any]:
"""
        Process an uploaded file through the complete RAG pipeline.
Args:
file_path: Path to uploaded file
dataset_id: Dataset UUID to attach to
user_id: User who uploaded the file
original_filename: Original filename
document_id: Optional existing document ID to update instead of creating new
Returns:
Dict: Document record with processing status
"""
logger.info(f"Processing file {original_filename} for dataset {dataset_id}")
# Process file directly (no session management needed with PostgreSQL client)
return await self._process_file_internal(file_path, dataset_id, user_id, original_filename, document_id)
async def _process_file_internal(
self,
file_path: Path,
dataset_id: str,
user_id: str,
original_filename: str,
document_id: Optional[str] = None
) -> Dict[str, Any]:
"""Internal file processing method"""
try:
# 1. Validate file
await self._validate_file(file_path)
# 2. Create or use existing document record
if document_id:
# Use existing document
document = {"id": document_id}
logger.info(f"Using existing document {document_id} for processing")
else:
# Create new document record
document = await self._create_document_record(
file_path, dataset_id, user_id, original_filename
)
# 3. Get or extract text content
await self._update_processing_status(document["id"], "processing", processing_stage="Getting text content")
# Check if content already exists (e.g., from upload-time extraction)
existing_content, storage_type = await self._get_existing_document_content(document["id"])
if existing_content and storage_type in ["pdf_extracted", "text"]:
# Use existing extracted content
text_content = existing_content
logger.info(f"Using existing extracted content ({len(text_content)} chars, type: {storage_type})")
else:
# Extract text from file
await self._update_processing_status(document["id"], "processing", processing_stage="Extracting text")
# Determine file type for extraction
if document_id:
# For existing documents, determine file type from file extension
file_ext = file_path.suffix.lower()
file_type = self.supported_types.get(file_ext, 'text/plain')
else:
file_type = document["file_type"]
text_content = await self._extract_text(file_path, file_type)
# 4. Update document with extracted text
await self._update_document_content(document["id"], text_content)
# 5. Generate document summary
await self._update_processing_status(document["id"], "processing", processing_stage="Generating summary")
await self._generate_document_summary(document["id"], text_content, original_filename, user_id)
# 6. Chunk the document
await self._update_processing_status(document["id"], "processing", processing_stage="Creating chunks")
chunks = await self._chunk_text(text_content, document["id"])
# Set expected chunk count for progress tracking
await self._update_processing_status(
document["id"], "processing",
processing_stage="Preparing for embedding generation",
total_chunks_expected=len(chunks)
)
# 7. Generate embeddings
await self._update_processing_status(document["id"], "processing", processing_stage="Starting embedding generation")
await self._generate_embeddings_for_chunks(chunks, dataset_id, user_id)
# 8. Update final status
await self._update_processing_status(
document["id"], "completed",
processing_stage="Completed",
chunks_processed=len(chunks),
total_chunks_expected=len(chunks)
)
await self._update_chunk_count(document["id"], len(chunks))
# 9. Update dataset summary (after document is fully processed)
await self._update_dataset_summary_after_document_change(dataset_id, user_id)
logger.info(f"Successfully processed {original_filename} with {len(chunks)} chunks")
return document
except Exception as e:
logger.error(f"Error processing file {original_filename}: {e}")
if 'document' in locals():
await self._update_processing_status(
document["id"], "failed",
error_message=str(e),
processing_stage="Failed"
)
raise
async def _validate_file(self, file_path: Path):
"""Validate file size and type"""
if not file_path.exists():
raise ValueError("File does not exist")
file_size = file_path.stat().st_size
if file_size > self.max_file_size:
raise ValueError(f"File too large: {file_size} bytes (max: {self.max_file_size})")
file_ext = file_path.suffix.lower()
if file_ext not in self.supported_types:
raise ValueError(f"Unsupported file type: {file_ext}")
async def _create_document_record(
self,
file_path: Path,
dataset_id: str,
user_id: str,
original_filename: str
) -> Dict[str, Any]:
"""Create document record in database"""
# Calculate file hash
with open(file_path, 'rb') as f:
file_hash = hashlib.sha256(f.read()).hexdigest()
file_ext = file_path.suffix.lower()
file_size = file_path.stat().st_size
document_id = str(uuid.uuid4())
# Insert document record using raw SQL
# Note: tenant_id is nullable UUID, so we set it to NULL for individual documents
pg_client = await get_postgresql_client()
await pg_client.execute_command(
"""INSERT INTO documents (
id, user_id, dataset_id, filename, original_filename,
file_type, file_size_bytes, file_hash, processing_status
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
document_id, str(user_id), dataset_id, str(file_path.name),
original_filename, self.supported_types[file_ext], file_size, file_hash, "pending"
)
return {
"id": document_id,
"user_id": user_id,
"dataset_id": dataset_id,
"filename": str(file_path.name),
"original_filename": original_filename,
"file_type": self.supported_types[file_ext],
"file_size_bytes": file_size,
"file_hash": file_hash,
"processing_status": "pending",
"chunk_count": 0
}
async def _extract_text(self, file_path: Path, file_type: str) -> str:
"""Extract text content from various file formats"""
try:
if file_type == 'application/pdf':
return await self._extract_pdf_text(file_path)
elif 'wordprocessingml' in file_type:
return await self._extract_docx_text(file_path)
elif file_type == 'text/csv':
return await self._extract_csv_text(file_path)
elif file_type == 'application/json':
return await self._extract_json_text(file_path)
else: # text/plain, text/markdown
return await self._extract_plain_text(file_path)
except Exception as e:
logger.error(f"Text extraction failed for {file_path}: {e}")
raise ValueError(f"Could not extract text from file: {e}")
async def _extract_pdf_text(self, file_path: Path) -> str:
"""Extract text from PDF file"""
text_parts = []
with open(file_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num, page in enumerate(pdf_reader.pages):
try:
page_text = page.extract_text()
if page_text.strip():
text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
except Exception as e:
logger.warning(f"Could not extract text from page {page_num + 1}: {e}")
if not text_parts:
raise ValueError("No text could be extracted from PDF")
return "\n\n".join(text_parts)
async def _extract_docx_text(self, file_path: Path) -> str:
"""Extract text from DOCX file"""
doc = docx.Document(file_path)
text_parts = []
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text_parts.append(paragraph.text)
if not text_parts:
raise ValueError("No text could be extracted from DOCX")
return "\n\n".join(text_parts)
async def _extract_csv_text(self, file_path: Path) -> str:
"""Extract and format text from CSV file"""
try:
df = pd.read_csv(file_path)
# Create readable format
text_parts = [f"CSV Data with {len(df)} rows and {len(df.columns)} columns"]
text_parts.append(f"Columns: {', '.join(df.columns.tolist())}")
text_parts.append("")
# Sample first few rows in readable format
for idx, row in df.head(20).iterrows():
row_text = []
for col in df.columns:
if pd.notna(row[col]):
row_text.append(f"{col}: {row[col]}")
text_parts.append(f"Row {idx + 1}: " + " | ".join(row_text))
return "\n".join(text_parts)
except Exception as e:
logger.error(f"CSV parsing error: {e}")
# Fallback to reading as plain text
return await self._extract_plain_text(file_path)
async def _extract_json_text(self, file_path: Path) -> str:
"""Extract and format text from JSON file"""
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
# Convert JSON to readable text format
def json_to_text(obj, prefix=""):
text_parts = []
if isinstance(obj, dict):
for key, value in obj.items():
if isinstance(value, (dict, list)):
text_parts.append(f"{prefix}{key}:")
text_parts.extend(json_to_text(value, prefix + " "))
else:
text_parts.append(f"{prefix}{key}: {value}")
elif isinstance(obj, list):
for i, item in enumerate(obj):
if isinstance(item, (dict, list)):
text_parts.append(f"{prefix}Item {i + 1}:")
text_parts.extend(json_to_text(item, prefix + " "))
else:
text_parts.append(f"{prefix}Item {i + 1}: {item}")
else:
text_parts.append(f"{prefix}{obj}")
return text_parts
return "\n".join(json_to_text(data))
async def _extract_plain_text(self, file_path: Path) -> str:
"""Extract text from plain text files"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
except UnicodeDecodeError:
# Try with latin-1 encoding
with open(file_path, 'r', encoding='latin-1') as f:
return f.read()
async def extract_text_from_path(self, file_path: Path, content_type: str) -> str:
"""Public wrapper for text extraction from file path"""
return await self._extract_text(file_path, content_type)
async def chunk_text_simple(self, text: str) -> List[str]:
"""Public wrapper for simple text chunking without document_id"""
chunks = []
chunk_size = self.chunk_size * 4 # ~2048 chars
overlap = self.chunk_overlap * 4 # ~512 chars
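        # Sliding character window: the step is (chunk_size - overlap) chars, so consecutive chunks share the overlap region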
for i in range(0, len(text), chunk_size - overlap):
chunk = text[i:i + chunk_size]
if chunk.strip():
chunks.append(chunk)
return chunks
async def _chunk_text(self, text: str, document_id: str) -> List[Dict[str, Any]]:
"""
Split text into overlapping chunks optimized for embeddings.
Returns:
List of chunk dictionaries with content and metadata
"""
# Simple sentence-aware chunking
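        # Greedy packing: sentences accumulate until the whitespace-token budget (chunk_size)
        # would be exceeded; the chunk is then flushed and the next chunk is seeded with the
        # tail of the previous one to provide overlap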
sentences = re.split(r'[.!?]+', text)
sentences = [s.strip() for s in sentences if s.strip()]
chunks = []
current_chunk = ""
current_tokens = 0
chunk_index = 0
for sentence in sentences:
sentence_tokens = len(sentence.split())
# If adding this sentence would exceed chunk size, save current chunk
if current_tokens + sentence_tokens > self.chunk_size and current_chunk:
# Create chunk with overlap from previous chunk
chunk_content = current_chunk.strip()
if chunk_content:
chunks.append({
"document_id": document_id,
"chunk_index": chunk_index,
"content": chunk_content,
"token_count": current_tokens,
"content_hash": hashlib.md5(chunk_content.encode()).hexdigest()
})
chunk_index += 1
# Start new chunk with overlap
if self.chunk_overlap > 0 and chunks:
# Take last few sentences for overlap
overlap_sentences = current_chunk.split('.')[-2:] # Rough overlap
current_chunk = '. '.join(s.strip() for s in overlap_sentences if s.strip())
current_tokens = len(current_chunk.split())
else:
current_chunk = ""
current_tokens = 0
# Add sentence to current chunk
if current_chunk:
current_chunk += ". " + sentence
else:
current_chunk = sentence
current_tokens += sentence_tokens
# Add final chunk
if current_chunk.strip():
chunk_content = current_chunk.strip()
chunks.append({
"document_id": document_id,
"chunk_index": chunk_index,
"content": chunk_content,
"token_count": current_tokens,
"content_hash": hashlib.md5(chunk_content.encode()).hexdigest()
})
logger.info(f"Created {len(chunks)} chunks from document {document_id}")
return chunks
async def _generate_embeddings_for_chunks(
self,
chunks: List[Dict[str, Any]],
dataset_id: str,
user_id: str
):
"""
Generate embeddings for all chunks using concurrent batch processing.
Processes chunks in batches with controlled concurrency to optimize performance
while preventing system overload. Includes retry logic and progressive storage.
"""
if not chunks:
return
total_chunks = len(chunks)
document_id = chunks[0]["document_id"]
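        # Ceiling division: e.g. 40 chunks with a batch size of 15 yields 3 batches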
total_batches = (total_chunks + self.EMBEDDING_BATCH_SIZE - 1) // self.EMBEDDING_BATCH_SIZE
logger.info(f"Starting concurrent embedding generation for {total_chunks} chunks")
logger.info(f"Batch size: {self.EMBEDDING_BATCH_SIZE}, Total batches: {total_batches}, Max concurrent: {self.MAX_CONCURRENT_BATCHES}")
# Create semaphore to limit concurrent batches
semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_BATCHES)
# Create batch data with metadata
batch_tasks = []
for batch_start in range(0, total_chunks, self.EMBEDDING_BATCH_SIZE):
batch_end = min(batch_start + self.EMBEDDING_BATCH_SIZE, total_chunks)
batch_chunks = chunks[batch_start:batch_end]
batch_num = (batch_start // self.EMBEDDING_BATCH_SIZE) + 1
batch_metadata = {
"chunks": batch_chunks,
"batch_num": batch_num,
"start_index": batch_start,
"end_index": batch_end,
"dataset_id": dataset_id,
"user_id": user_id,
"document_id": document_id
}
            # Create the coroutine for this batch; asyncio.gather schedules them concurrently below
task = self._process_batch_with_semaphore(semaphore, batch_metadata, total_batches, total_chunks)
batch_tasks.append(task)
# Process all batches concurrently
logger.info(f"Starting concurrent processing of {len(batch_tasks)} batches")
start_time = asyncio.get_event_loop().time()
results = await asyncio.gather(*batch_tasks, return_exceptions=True)
end_time = asyncio.get_event_loop().time()
processing_time = end_time - start_time
# Analyze results
successful_batches = []
failed_batches = []
for i, result in enumerate(results):
batch_num = i + 1
if isinstance(result, Exception):
failed_batches.append({
"batch_num": batch_num,
"error": str(result)
})
logger.error(f"Batch {batch_num} failed: {result}")
else:
successful_batches.append(result)
successful_chunks = sum(len(batch["chunks"]) for batch in successful_batches)
logger.info(f"Concurrent processing completed in {processing_time:.2f} seconds")
logger.info(f"Successfully processed {successful_chunks}/{total_chunks} chunks in {len(successful_batches)}/{total_batches} batches")
# Report final results
if failed_batches:
failed_chunk_count = total_chunks - successful_chunks
error_details = "; ".join([f"Batch {b['batch_num']}: {b['error']}" for b in failed_batches[:3]])
if len(failed_batches) > 3:
error_details += f" (and {len(failed_batches) - 3} more failures)"
raise ValueError(f"Failed to generate embeddings for {failed_chunk_count}/{total_chunks} chunks. Errors: {error_details}")
logger.info(f"Successfully stored all {total_chunks} chunks with embeddings")
async def _process_batch_with_semaphore(
self,
semaphore: asyncio.Semaphore,
batch_metadata: Dict[str, Any],
total_batches: int,
total_chunks: int
) -> Dict[str, Any]:
"""
Process a single batch with semaphore-controlled concurrency.
Args:
semaphore: Concurrency control semaphore
batch_metadata: Batch information including chunks and metadata
total_batches: Total number of batches
total_chunks: Total number of chunks
Returns:
Dict with batch processing results
"""
async with semaphore:
batch_chunks = batch_metadata["chunks"]
batch_num = batch_metadata["batch_num"]
dataset_id = batch_metadata["dataset_id"]
user_id = batch_metadata["user_id"]
document_id = batch_metadata["document_id"]
logger.info(f"Starting batch {batch_num}/{total_batches} ({len(batch_chunks)} chunks)")
try:
# Generate embeddings for this batch (pass user_id for billing)
embeddings = await self._generate_embedding_batch(batch_chunks, user_id=user_id)
# Store embeddings for this batch immediately
await self._store_chunk_embeddings(batch_chunks, embeddings, dataset_id, user_id)
# Update progress in database
progress_stage = f"Completed batch {batch_num}/{total_batches}"
# Calculate current progress (approximate since batches complete out of order)
await self._update_processing_status(
document_id, "processing",
processing_stage=progress_stage,
chunks_processed=batch_num * self.EMBEDDING_BATCH_SIZE, # Approximate
total_chunks_expected=total_chunks
)
logger.info(f"Successfully completed batch {batch_num}/{total_batches}")
return {
"batch_num": batch_num,
"chunks": batch_chunks,
"success": True
}
except Exception as e:
logger.error(f"Failed to process batch {batch_num}/{total_batches}: {e}")
raise ValueError(f"Batch {batch_num} failed: {str(e)}")
async def _generate_embedding_batch(
self,
batch_chunks: List[Dict[str, Any]],
        user_id: Optional[str] = None
) -> List[List[float]]:
"""
Generate embeddings for a single batch of chunks with retry logic.
Args:
batch_chunks: List of chunk dictionaries
user_id: User ID for usage tracking
Returns:
List of embedding vectors
Raises:
ValueError: If embedding generation fails after all retries
"""
texts = [chunk["content"] for chunk in batch_chunks]
for attempt in range(self.MAX_RETRIES + 1):
try:
# Use the configurable embedding client with tenant/user context for billing
embeddings = await self.embedding_client.generate_embeddings(
texts,
tenant_id=self.tenant_domain,
user_id=str(user_id) if user_id else None
)
if len(embeddings) != len(texts):
raise ValueError(f"Embedding count mismatch: expected {len(texts)}, got {len(embeddings)}")
return embeddings
except Exception as e:
if attempt < self.MAX_RETRIES:
delay = self.INITIAL_RETRY_DELAY * (2 ** attempt) # Exponential backoff
logger.warning(f"Embedding generation attempt {attempt + 1}/{self.MAX_RETRIES + 1} failed: {e}. Retrying in {delay}s...")
await asyncio.sleep(delay)
else:
logger.error(f"All {self.MAX_RETRIES + 1} embedding generation attempts failed. Final error: {e}")
logger.error(f"Failed request details: URL=http://gentwo-vllm-embeddings:8000/v1/embeddings, texts_count={len(texts)}")
raise ValueError(f"Embedding generation failed after {self.MAX_RETRIES + 1} attempts: {str(e)}")
async def _store_chunk_embeddings(
self,
batch_chunks: List[Dict[str, Any]],
embeddings: List[List[float]],
dataset_id: str,
user_id: str
):
"""Store chunk embeddings in database with proper error handling."""
pg_client = await get_postgresql_client()
for chunk_data, embedding in zip(batch_chunks, embeddings):
chunk_id = str(uuid.uuid4())
# Convert embedding list to PostgreSQL array format
embedding_array = f"[{','.join(map(str, embedding))}]" if embedding else None
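            # pgvector accepts this '[v1,v2,...]' text literal; the ::vector cast in the INSERT converts it server-side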
await pg_client.execute_command(
"""INSERT INTO document_chunks (
id, document_id, user_id, dataset_id, chunk_index,
content, content_hash, token_count, embedding
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::vector)""",
chunk_id, chunk_data["document_id"], str(user_id),
dataset_id, chunk_data["chunk_index"], chunk_data["content"],
chunk_data["content_hash"], chunk_data["token_count"], embedding_array
)
async def _update_processing_status(
self,
document_id: str,
status: str,
error_message: Optional[str] = None,
processing_stage: Optional[str] = None,
chunks_processed: Optional[int] = None,
total_chunks_expected: Optional[int] = None
):
"""Update document processing status with progress tracking via metadata JSONB"""
# Calculate progress percentage if we have the data
processing_progress = None
if chunks_processed is not None and total_chunks_expected is not None and total_chunks_expected > 0:
processing_progress = min(100, int((chunks_processed / total_chunks_expected) * 100))
# Build progress metadata object
progress_data = {}
if processing_stage is not None:
progress_data['processing_stage'] = processing_stage
if chunks_processed is not None:
progress_data['chunks_processed'] = chunks_processed
if total_chunks_expected is not None:
progress_data['total_chunks_expected'] = total_chunks_expected
if processing_progress is not None:
progress_data['processing_progress'] = processing_progress
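        # The JSONB || merge below only overwrites the keys present in progress_data, preserving other metadata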
pg_client = await get_postgresql_client()
if error_message:
await pg_client.execute_command(
"""UPDATE documents SET
processing_status = $1,
error_message = $2,
metadata = COALESCE(metadata, '{}'::jsonb) || $3::jsonb,
updated_at = NOW()
WHERE id = $4""",
status, error_message, json.dumps(progress_data), document_id
)
else:
await pg_client.execute_command(
"""UPDATE documents SET
processing_status = $1,
metadata = COALESCE(metadata, '{}'::jsonb) || $2::jsonb,
updated_at = NOW()
WHERE id = $3""",
status, json.dumps(progress_data), document_id
)
    async def _get_existing_document_content(self, document_id: str) -> Tuple[Optional[str], Optional[str]]:
"""Get existing document content and storage type"""
pg_client = await get_postgresql_client()
result = await pg_client.fetch_one(
"SELECT content_text, metadata FROM documents WHERE id = $1",
document_id
)
if result and result["content_text"]:
# Handle metadata - might be JSON string or dict
metadata_raw = result["metadata"] or "{}"
if isinstance(metadata_raw, str):
try:
metadata = json.loads(metadata_raw)
except json.JSONDecodeError:
metadata = {}
else:
metadata = metadata_raw or {}
storage_type = metadata.get("storage_type", "unknown")
return result["content_text"], storage_type
return None, None
async def _update_document_content(self, document_id: str, content: str):
"""Update document with extracted text content"""
pg_client = await get_postgresql_client()
await pg_client.execute_command(
"UPDATE documents SET content_text = $1, updated_at = NOW() WHERE id = $2",
content, document_id
)
async def _update_chunk_count(self, document_id: str, chunk_count: int):
"""Update document with final chunk count"""
pg_client = await get_postgresql_client()
await pg_client.execute_command(
"UPDATE documents SET chunk_count = $1, updated_at = NOW() WHERE id = $2",
chunk_count, document_id
)
async def _generate_document_summary(
self,
document_id: str,
content: str,
filename: str,
user_id: str
):
"""Generate and store AI summary for the document"""
try:
# Use tenant_domain from instance context
tenant_domain = self.tenant_domain
# Create summarization service instance
summarization_service = SummarizationService(tenant_domain, user_id)
# Generate summary using our new service
summary = await summarization_service.generate_document_summary(
document_id=document_id,
document_content=content,
document_name=filename
)
if summary:
logger.info(f"Generated summary for document {document_id}: {summary[:100]}...")
else:
logger.warning(f"Failed to generate summary for document {document_id}")
except Exception as e:
logger.error(f"Error generating document summary for {document_id}: {e}")
# Don't fail the entire document processing if summarization fails
async def _update_dataset_summary_after_document_change(
self,
dataset_id: str,
user_id: str
):
"""Update dataset summary after a document is added or removed"""
try:
# Create summarization service instance
summarization_service = SummarizationService(self.tenant_domain, user_id)
# Update dataset summary asynchronously (don't block document processing)
asyncio.create_task(
summarization_service.update_dataset_summary_on_change(dataset_id)
)
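            # Fire-and-forget: the task is deliberately not awaited so document processing is never blocked on the summary refresh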
logger.info(f"Triggered dataset summary update for dataset {dataset_id}")
except Exception as e:
logger.error(f"Error triggering dataset summary update for {dataset_id}: {e}")
# Don't fail document processing if dataset summary update fails
async def get_processing_status(self, document_id: str) -> Dict[str, Any]:
"""Get current processing status of a document with progress information from metadata"""
pg_client = await get_postgresql_client()
result = await pg_client.fetch_one(
"""SELECT processing_status, error_message, chunk_count, metadata
FROM documents WHERE id = $1""",
document_id
)
if not result:
raise ValueError("Document not found")
        # Extract progress data from metadata JSONB (may arrive as a dict or a JSON string)
        metadata = result["metadata"] or {}
        if isinstance(metadata, str):
            metadata = json.loads(metadata)
return {
"status": result["processing_status"],
"error_message": result["error_message"],
"chunk_count": result["chunk_count"],
"chunks_processed": metadata.get("chunks_processed"),
"total_chunks_expected": metadata.get("total_chunks_expected"),
"processing_progress": metadata.get("processing_progress"),
"processing_stage": metadata.get("processing_stage")
}
# Factory function for document processor
async def get_document_processor(tenant_domain=None):
"""Get document processor instance (will create its own DB session when needed)"""
return DocumentProcessor(tenant_domain=tenant_domain)
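# Example usage (illustrative sketch; assumes an asyncio entry point and an
# already-uploaded file plus existing dataset/user IDs):
#
#     processor = await get_document_processor(tenant_domain="acme.example.com")
#     document = await processor.process_file(
#         file_path=Path("/tmp/uploads/report.pdf"),
#         dataset_id="<dataset-uuid>",
#         user_id="<user-uuid>",
#         original_filename="report.pdf",
#     )
#     status = await processor.get_processing_status(document["id"])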