"""
|
|
Document Processing Service for GT 2.0
|
|
|
|
Handles file upload, text extraction, chunking, and embedding generation
|
|
for RAG pipeline. Supports multiple file formats with intelligent chunking.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import hashlib
|
|
import mimetypes
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
import uuid
|
|
|
|
# Document processing libraries
|
|
import pypdf as PyPDF2 # pypdf is the maintained successor to PyPDF2
|
|
import docx
|
|
import pandas as pd
|
|
import json
|
|
import csv
|
|
from io import StringIO
|
|
|
|
# Database and core services
|
|
from app.core.postgresql_client import get_postgresql_client
|
|
|
|
# Resource cluster client for embeddings
|
|
import httpx
|
|
from app.services.embedding_client import get_embedding_client
|
|
|
|
# Document summarization
|
|
from app.services.summarization_service import SummarizationService
|
|
|
|
logger = logging.getLogger(__name__)


class DocumentProcessor:
    """
    Comprehensive document processing service for RAG pipeline.

    Features:
    - Multi-format support (PDF, DOCX, TXT, MD, CSV, JSON)
    - Intelligent chunking with overlap
    - Async embedding generation with batch processing
    - Progress tracking
    - Error handling and recovery
    """

    def __init__(self, db=None, tenant_domain=None):
        self.db = db
        self.tenant_domain = tenant_domain or "test"  # Default fallback
        # Use configurable embedding client instead of hardcoded URL
        self.embedding_client = get_embedding_client()
        self.chunk_size = 512  # Default chunk size in tokens
        self.chunk_overlap = 128  # Default overlap
        self.max_file_size = 100 * 1024 * 1024  # 100MB limit

        # Embedding batch processing configuration
        self.EMBEDDING_BATCH_SIZE = 15  # Process embeddings in batches of 15 (ARM64 optimized)
        self.MAX_CONCURRENT_BATCHES = 3  # Process up to 3 batches concurrently
        self.MAX_RETRIES = 3  # Maximum retries per batch
        self.INITIAL_RETRY_DELAY = 1.0  # Initial delay in seconds

        # Supported file types
        self.supported_types = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.txt': 'text/plain',
            '.md': 'text/markdown',
            '.csv': 'text/csv',
            '.json': 'application/json'
        }

    async def process_file(
        self,
        file_path: Path,
        dataset_id: str,
        user_id: str,
        original_filename: str,
        document_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Process an uploaded file through the complete RAG pipeline.

        Args:
            file_path: Path to uploaded file
            dataset_id: Dataset UUID to attach to
            user_id: User who uploaded the file
            original_filename: Original filename
            document_id: Optional existing document ID to update instead of creating new

        Returns:
            Dict: Document record with processing status
        """
        logger.info(f"Processing file {original_filename} for dataset {dataset_id}")

        # Process file directly (no session management needed with PostgreSQL client)
        return await self._process_file_internal(file_path, dataset_id, user_id, original_filename, document_id)

    async def _process_file_internal(
        self,
        file_path: Path,
        dataset_id: str,
        user_id: str,
        original_filename: str,
        document_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Internal file processing method"""
        try:
            # 1. Validate file
            await self._validate_file(file_path)

            # 2. Create or use existing document record
            if document_id:
                # Use existing document
                document = {"id": document_id}
                logger.info(f"Using existing document {document_id} for processing")
            else:
                # Create new document record
                document = await self._create_document_record(
                    file_path, dataset_id, user_id, original_filename
                )

            # 3. Get or extract text content
            await self._update_processing_status(document["id"], "processing", processing_stage="Getting text content")

            # Check if content already exists (e.g., from upload-time extraction)
            existing_content, storage_type = await self._get_existing_document_content(document["id"])

            if existing_content and storage_type in ["pdf_extracted", "text"]:
                # Use existing extracted content
                text_content = existing_content
                logger.info(f"Using existing extracted content ({len(text_content)} chars, type: {storage_type})")
            else:
                # Extract text from file
                await self._update_processing_status(document["id"], "processing", processing_stage="Extracting text")

                # Determine file type for extraction
                if document_id:
                    # For existing documents, determine file type from file extension
                    file_ext = file_path.suffix.lower()
                    file_type = self.supported_types.get(file_ext, 'text/plain')
                else:
                    file_type = document["file_type"]

                text_content = await self._extract_text(file_path, file_type)

            # 4. Update document with extracted text
            await self._update_document_content(document["id"], text_content)

            # 5. Generate document summary
            await self._update_processing_status(document["id"], "processing", processing_stage="Generating summary")
            await self._generate_document_summary(document["id"], text_content, original_filename, user_id)

            # 6. Chunk the document
            await self._update_processing_status(document["id"], "processing", processing_stage="Creating chunks")
            chunks = await self._chunk_text(text_content, document["id"])

            # Set expected chunk count for progress tracking
            await self._update_processing_status(
                document["id"], "processing",
                processing_stage="Preparing for embedding generation",
                total_chunks_expected=len(chunks)
            )

            # 7. Generate embeddings
            await self._update_processing_status(document["id"], "processing", processing_stage="Starting embedding generation")
            await self._generate_embeddings_for_chunks(chunks, dataset_id, user_id)

            # 8. Update final status
            await self._update_processing_status(
                document["id"], "completed",
                processing_stage="Completed",
                chunks_processed=len(chunks),
                total_chunks_expected=len(chunks)
            )
            await self._update_chunk_count(document["id"], len(chunks))

            # 9. Update dataset summary (after document is fully processed)
            await self._update_dataset_summary_after_document_change(dataset_id, user_id)

            logger.info(f"Successfully processed {original_filename} with {len(chunks)} chunks")
            return document

        except Exception as e:
            logger.error(f"Error processing file {original_filename}: {e}")
            if 'document' in locals():
                await self._update_processing_status(
                    document["id"], "failed",
                    error_message=str(e),
                    processing_stage="Failed"
                )
            raise

    async def _validate_file(self, file_path: Path):
        """Validate file size and type"""
        if not file_path.exists():
            raise ValueError("File does not exist")

        file_size = file_path.stat().st_size
        if file_size > self.max_file_size:
            raise ValueError(f"File too large: {file_size} bytes (max: {self.max_file_size})")

        file_ext = file_path.suffix.lower()
        if file_ext not in self.supported_types:
            raise ValueError(f"Unsupported file type: {file_ext}")

    async def _create_document_record(
        self,
        file_path: Path,
        dataset_id: str,
        user_id: str,
        original_filename: str
    ) -> Dict[str, Any]:
        """Create document record in database"""

        # Calculate file hash
        with open(file_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()

        file_ext = file_path.suffix.lower()
        file_size = file_path.stat().st_size
        document_id = str(uuid.uuid4())

        # Insert document record using raw SQL
        # Note: tenant_id is nullable UUID, so we set it to NULL for individual documents
        pg_client = await get_postgresql_client()
        await pg_client.execute_command(
            """INSERT INTO documents (
                id, user_id, dataset_id, filename, original_filename,
                file_type, file_size_bytes, file_hash, processing_status
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
            document_id, str(user_id), dataset_id, str(file_path.name),
            original_filename, self.supported_types[file_ext], file_size, file_hash, "pending"
        )

        return {
            "id": document_id,
            "user_id": user_id,
            "dataset_id": dataset_id,
            "filename": str(file_path.name),
            "original_filename": original_filename,
            "file_type": self.supported_types[file_ext],
            "file_size_bytes": file_size,
            "file_hash": file_hash,
            "processing_status": "pending",
            "chunk_count": 0
        }

    async def _extract_text(self, file_path: Path, file_type: str) -> str:
        """Extract text content from various file formats"""

        try:
            if file_type == 'application/pdf':
                return await self._extract_pdf_text(file_path)
            elif 'wordprocessingml' in file_type:
                return await self._extract_docx_text(file_path)
            elif file_type == 'text/csv':
                return await self._extract_csv_text(file_path)
            elif file_type == 'application/json':
                return await self._extract_json_text(file_path)
            else:  # text/plain, text/markdown
                return await self._extract_plain_text(file_path)

        except Exception as e:
            logger.error(f"Text extraction failed for {file_path}: {e}")
            raise ValueError(f"Could not extract text from file: {e}")

    async def _extract_pdf_text(self, file_path: Path) -> str:
        """Extract text from PDF file"""
        text_parts = []

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
                except Exception as e:
                    logger.warning(f"Could not extract text from page {page_num + 1}: {e}")

        if not text_parts:
            raise ValueError("No text could be extracted from PDF")

        return "\n\n".join(text_parts)

    async def _extract_docx_text(self, file_path: Path) -> str:
        """Extract text from DOCX file"""
        doc = docx.Document(file_path)
        text_parts = []

        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text_parts.append(paragraph.text)

        if not text_parts:
            raise ValueError("No text could be extracted from DOCX")

        return "\n\n".join(text_parts)

    async def _extract_csv_text(self, file_path: Path) -> str:
        """Extract and format text from CSV file"""
        try:
            df = pd.read_csv(file_path)

            # Create readable format
            text_parts = [f"CSV Data with {len(df)} rows and {len(df.columns)} columns"]
            text_parts.append(f"Columns: {', '.join(df.columns.tolist())}")
            text_parts.append("")

            # Sample first few rows in readable format
            for idx, row in df.head(20).iterrows():
                row_text = []
                for col in df.columns:
                    if pd.notna(row[col]):
                        row_text.append(f"{col}: {row[col]}")
                text_parts.append(f"Row {idx + 1}: " + " | ".join(row_text))

            return "\n".join(text_parts)

        except Exception as e:
            logger.error(f"CSV parsing error: {e}")
            # Fallback to reading as plain text
            return await self._extract_plain_text(file_path)

    async def _extract_json_text(self, file_path: Path) -> str:
        """Extract and format text from JSON file"""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Convert JSON to readable text format
        def json_to_text(obj, prefix=""):
            text_parts = []

            if isinstance(obj, dict):
                for key, value in obj.items():
                    if isinstance(value, (dict, list)):
                        text_parts.append(f"{prefix}{key}:")
                        text_parts.extend(json_to_text(value, prefix + "  "))
                    else:
                        text_parts.append(f"{prefix}{key}: {value}")
            elif isinstance(obj, list):
                for i, item in enumerate(obj):
                    if isinstance(item, (dict, list)):
                        text_parts.append(f"{prefix}Item {i + 1}:")
                        text_parts.extend(json_to_text(item, prefix + "  "))
                    else:
                        text_parts.append(f"{prefix}Item {i + 1}: {item}")
            else:
                text_parts.append(f"{prefix}{obj}")

            return text_parts

        return "\n".join(json_to_text(data))

    async def _extract_plain_text(self, file_path: Path) -> str:
        """Extract text from plain text files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Try with latin-1 encoding
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    async def extract_text_from_path(self, file_path: Path, content_type: str) -> str:
        """Public wrapper for text extraction from file path"""
        return await self._extract_text(file_path, content_type)

    async def chunk_text_simple(self, text: str) -> List[str]:
        """Public wrapper for simple text chunking without document_id"""
        chunks = []
        chunk_size = self.chunk_size * 4  # ~2048 chars
        overlap = self.chunk_overlap * 4  # ~512 chars

        for i in range(0, len(text), chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if chunk.strip():
                chunks.append(chunk)

        return chunks

    async def _chunk_text(self, text: str, document_id: str) -> List[Dict[str, Any]]:
        """
        Split text into overlapping chunks optimized for embeddings.

        Returns:
            List of chunk dictionaries with content and metadata
        """
        # Simple sentence-aware chunking
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        chunks = []
        current_chunk = ""
        current_tokens = 0
        chunk_index = 0

        for sentence in sentences:
            sentence_tokens = len(sentence.split())

            # If adding this sentence would exceed chunk size, save current chunk
            if current_tokens + sentence_tokens > self.chunk_size and current_chunk:
                # Create chunk with overlap from previous chunk
                chunk_content = current_chunk.strip()
                if chunk_content:
                    chunks.append({
                        "document_id": document_id,
                        "chunk_index": chunk_index,
                        "content": chunk_content,
                        "token_count": current_tokens,
                        "content_hash": hashlib.md5(chunk_content.encode()).hexdigest()
                    })
                    chunk_index += 1

                # Start new chunk with overlap
                if self.chunk_overlap > 0 and chunks:
                    # Take last few sentences for overlap
                    overlap_sentences = current_chunk.split('.')[-2:]  # Rough overlap
                    current_chunk = '. '.join(s.strip() for s in overlap_sentences if s.strip())
                    current_tokens = len(current_chunk.split())
                else:
                    current_chunk = ""
                    current_tokens = 0

            # Add sentence to current chunk
            if current_chunk:
                current_chunk += ". " + sentence
            else:
                current_chunk = sentence
            current_tokens += sentence_tokens

        # Add final chunk
        if current_chunk.strip():
            chunk_content = current_chunk.strip()
            chunks.append({
                "document_id": document_id,
                "chunk_index": chunk_index,
                "content": chunk_content,
                "token_count": current_tokens,
                "content_hash": hashlib.md5(chunk_content.encode()).hexdigest()
            })

        logger.info(f"Created {len(chunks)} chunks from document {document_id}")
        return chunks

    async def _generate_embeddings_for_chunks(
        self,
        chunks: List[Dict[str, Any]],
        dataset_id: str,
        user_id: str
    ):
        """
        Generate embeddings for all chunks using concurrent batch processing.

        Processes chunks in batches with controlled concurrency to optimize performance
        while preventing system overload. Includes retry logic and progressive storage.
        """

        if not chunks:
            return

        total_chunks = len(chunks)
        document_id = chunks[0]["document_id"]
        total_batches = (total_chunks + self.EMBEDDING_BATCH_SIZE - 1) // self.EMBEDDING_BATCH_SIZE

        logger.info(f"Starting concurrent embedding generation for {total_chunks} chunks")
        logger.info(f"Batch size: {self.EMBEDDING_BATCH_SIZE}, Total batches: {total_batches}, Max concurrent: {self.MAX_CONCURRENT_BATCHES}")

        # Create semaphore to limit concurrent batches
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_BATCHES)

        # Create batch data with metadata
        batch_tasks = []
        for batch_start in range(0, total_chunks, self.EMBEDDING_BATCH_SIZE):
            batch_end = min(batch_start + self.EMBEDDING_BATCH_SIZE, total_chunks)
            batch_chunks = chunks[batch_start:batch_end]
            batch_num = (batch_start // self.EMBEDDING_BATCH_SIZE) + 1

            batch_metadata = {
                "chunks": batch_chunks,
                "batch_num": batch_num,
                "start_index": batch_start,
                "end_index": batch_end,
                "dataset_id": dataset_id,
                "user_id": user_id,
                "document_id": document_id
            }

            # Create task for this batch
            task = self._process_batch_with_semaphore(semaphore, batch_metadata, total_batches, total_chunks)
            batch_tasks.append(task)

        # Process all batches concurrently
        logger.info(f"Starting concurrent processing of {len(batch_tasks)} batches")
        start_time = asyncio.get_event_loop().time()

        results = await asyncio.gather(*batch_tasks, return_exceptions=True)

        end_time = asyncio.get_event_loop().time()
        processing_time = end_time - start_time

        # Analyze results
        successful_batches = []
        failed_batches = []

        for i, result in enumerate(results):
            batch_num = i + 1
            if isinstance(result, Exception):
                failed_batches.append({
                    "batch_num": batch_num,
                    "error": str(result)
                })
                logger.error(f"Batch {batch_num} failed: {result}")
            else:
                successful_batches.append(result)

        successful_chunks = sum(len(batch["chunks"]) for batch in successful_batches)

        logger.info(f"Concurrent processing completed in {processing_time:.2f} seconds")
        logger.info(f"Successfully processed {successful_chunks}/{total_chunks} chunks in {len(successful_batches)}/{total_batches} batches")

        # Report final results
        if failed_batches:
            failed_chunk_count = total_chunks - successful_chunks
            error_details = "; ".join([f"Batch {b['batch_num']}: {b['error']}" for b in failed_batches[:3]])
            if len(failed_batches) > 3:
                error_details += f" (and {len(failed_batches) - 3} more failures)"

            raise ValueError(f"Failed to generate embeddings for {failed_chunk_count}/{total_chunks} chunks. Errors: {error_details}")

        logger.info(f"Successfully stored all {total_chunks} chunks with embeddings")

    async def _process_batch_with_semaphore(
        self,
        semaphore: asyncio.Semaphore,
        batch_metadata: Dict[str, Any],
        total_batches: int,
        total_chunks: int
    ) -> Dict[str, Any]:
        """
        Process a single batch with semaphore-controlled concurrency.

        Args:
            semaphore: Concurrency control semaphore
            batch_metadata: Batch information including chunks and metadata
            total_batches: Total number of batches
            total_chunks: Total number of chunks

        Returns:
            Dict with batch processing results
        """
        async with semaphore:
            batch_chunks = batch_metadata["chunks"]
            batch_num = batch_metadata["batch_num"]
            dataset_id = batch_metadata["dataset_id"]
            user_id = batch_metadata["user_id"]
            document_id = batch_metadata["document_id"]

            logger.info(f"Starting batch {batch_num}/{total_batches} ({len(batch_chunks)} chunks)")

            try:
                # Generate embeddings for this batch (pass user_id for billing)
                embeddings = await self._generate_embedding_batch(batch_chunks, user_id=user_id)

                # Store embeddings for this batch immediately
                await self._store_chunk_embeddings(batch_chunks, embeddings, dataset_id, user_id)

                # Update progress in database
                progress_stage = f"Completed batch {batch_num}/{total_batches}"

                # Calculate current progress (approximate since batches complete out of order)
                await self._update_processing_status(
                    document_id, "processing",
                    processing_stage=progress_stage,
                    chunks_processed=batch_num * self.EMBEDDING_BATCH_SIZE,  # Approximate
                    total_chunks_expected=total_chunks
                )

                logger.info(f"Successfully completed batch {batch_num}/{total_batches}")

                return {
                    "batch_num": batch_num,
                    "chunks": batch_chunks,
                    "success": True
                }

            except Exception as e:
                logger.error(f"Failed to process batch {batch_num}/{total_batches}: {e}")
                raise ValueError(f"Batch {batch_num} failed: {str(e)}")

    async def _generate_embedding_batch(
        self,
        batch_chunks: List[Dict[str, Any]],
        user_id: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generate embeddings for a single batch of chunks with retry logic.

        Args:
            batch_chunks: List of chunk dictionaries
            user_id: User ID for usage tracking

        Returns:
            List of embedding vectors

        Raises:
            ValueError: If embedding generation fails after all retries
        """
        texts = [chunk["content"] for chunk in batch_chunks]

        for attempt in range(self.MAX_RETRIES + 1):
            try:
                # Use the configurable embedding client with tenant/user context for billing
                embeddings = await self.embedding_client.generate_embeddings(
                    texts,
                    tenant_id=self.tenant_domain,
                    user_id=str(user_id) if user_id else None
                )

                if len(embeddings) != len(texts):
                    raise ValueError(f"Embedding count mismatch: expected {len(texts)}, got {len(embeddings)}")

                return embeddings

            except Exception as e:
                if attempt < self.MAX_RETRIES:
                    delay = self.INITIAL_RETRY_DELAY * (2 ** attempt)  # Exponential backoff
                    logger.warning(f"Embedding generation attempt {attempt + 1}/{self.MAX_RETRIES + 1} failed: {e}. Retrying in {delay}s...")
                    await asyncio.sleep(delay)
                else:
                    logger.error(f"All {self.MAX_RETRIES + 1} embedding generation attempts failed. Final error: {e}")
                    logger.error(f"Failed request details: texts_count={len(texts)}")
                    raise ValueError(f"Embedding generation failed after {self.MAX_RETRIES + 1} attempts: {str(e)}")

    async def _store_chunk_embeddings(
        self,
        batch_chunks: List[Dict[str, Any]],
        embeddings: List[List[float]],
        dataset_id: str,
        user_id: str
    ):
        """Store chunk embeddings in database with proper error handling."""

        pg_client = await get_postgresql_client()
        for chunk_data, embedding in zip(batch_chunks, embeddings):
            chunk_id = str(uuid.uuid4())

            # Convert embedding list to PostgreSQL array format
            embedding_array = f"[{','.join(map(str, embedding))}]" if embedding else None

            await pg_client.execute_command(
                """INSERT INTO document_chunks (
                    id, document_id, user_id, dataset_id, chunk_index,
                    content, content_hash, token_count, embedding
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::vector)""",
                chunk_id, chunk_data["document_id"], str(user_id),
                dataset_id, chunk_data["chunk_index"], chunk_data["content"],
                chunk_data["content_hash"], chunk_data["token_count"], embedding_array
            )

    async def _update_processing_status(
        self,
        document_id: str,
        status: str,
        error_message: Optional[str] = None,
        processing_stage: Optional[str] = None,
        chunks_processed: Optional[int] = None,
        total_chunks_expected: Optional[int] = None
    ):
        """Update document processing status with progress tracking via metadata JSONB"""

        # Calculate progress percentage if we have the data
        processing_progress = None
        if chunks_processed is not None and total_chunks_expected is not None and total_chunks_expected > 0:
            processing_progress = min(100, int((chunks_processed / total_chunks_expected) * 100))

        # Build progress metadata object
        progress_data = {}
        if processing_stage is not None:
            progress_data['processing_stage'] = processing_stage
        if chunks_processed is not None:
            progress_data['chunks_processed'] = chunks_processed
        if total_chunks_expected is not None:
            progress_data['total_chunks_expected'] = total_chunks_expected
        if processing_progress is not None:
            progress_data['processing_progress'] = processing_progress

        pg_client = await get_postgresql_client()
        if error_message:
            await pg_client.execute_command(
                """UPDATE documents SET
                    processing_status = $1,
                    error_message = $2,
                    metadata = COALESCE(metadata, '{}'::jsonb) || $3::jsonb,
                    updated_at = NOW()
                WHERE id = $4""",
                status, error_message, json.dumps(progress_data), document_id
            )
        else:
            await pg_client.execute_command(
                """UPDATE documents SET
                    processing_status = $1,
                    metadata = COALESCE(metadata, '{}'::jsonb) || $2::jsonb,
                    updated_at = NOW()
                WHERE id = $3""",
                status, json.dumps(progress_data), document_id
            )

    async def _get_existing_document_content(self, document_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Get existing document content and storage type"""
        pg_client = await get_postgresql_client()
        result = await pg_client.fetch_one(
            "SELECT content_text, metadata FROM documents WHERE id = $1",
            document_id
        )
        if result and result["content_text"]:
            # Handle metadata - might be JSON string or dict
            metadata_raw = result["metadata"] or "{}"
            if isinstance(metadata_raw, str):
                try:
                    metadata = json.loads(metadata_raw)
                except json.JSONDecodeError:
                    metadata = {}
            else:
                metadata = metadata_raw or {}
            storage_type = metadata.get("storage_type", "unknown")
            return result["content_text"], storage_type
        return None, None

    async def _update_document_content(self, document_id: str, content: str):
        """Update document with extracted text content"""
        pg_client = await get_postgresql_client()
        await pg_client.execute_command(
            "UPDATE documents SET content_text = $1, updated_at = NOW() WHERE id = $2",
            content, document_id
        )

    async def _update_chunk_count(self, document_id: str, chunk_count: int):
        """Update document with final chunk count"""
        pg_client = await get_postgresql_client()
        await pg_client.execute_command(
            "UPDATE documents SET chunk_count = $1, updated_at = NOW() WHERE id = $2",
            chunk_count, document_id
        )

    async def _generate_document_summary(
        self,
        document_id: str,
        content: str,
        filename: str,
        user_id: str
    ):
        """Generate and store AI summary for the document"""
        try:
            # Use tenant_domain from instance context
            tenant_domain = self.tenant_domain

            # Create summarization service instance
            summarization_service = SummarizationService(tenant_domain, user_id)

            # Generate summary using our new service
            summary = await summarization_service.generate_document_summary(
                document_id=document_id,
                document_content=content,
                document_name=filename
            )

            if summary:
                logger.info(f"Generated summary for document {document_id}: {summary[:100]}...")
            else:
                logger.warning(f"Failed to generate summary for document {document_id}")

        except Exception as e:
            logger.error(f"Error generating document summary for {document_id}: {e}")
            # Don't fail the entire document processing if summarization fails

    async def _update_dataset_summary_after_document_change(
        self,
        dataset_id: str,
        user_id: str
    ):
        """Update dataset summary after a document is added or removed"""
        try:
            # Create summarization service instance
            summarization_service = SummarizationService(self.tenant_domain, user_id)

            # Update dataset summary asynchronously (don't block document processing)
            asyncio.create_task(
                summarization_service.update_dataset_summary_on_change(dataset_id)
            )

            logger.info(f"Triggered dataset summary update for dataset {dataset_id}")

        except Exception as e:
            logger.error(f"Error triggering dataset summary update for {dataset_id}: {e}")
            # Don't fail document processing if dataset summary update fails

    async def get_processing_status(self, document_id: str) -> Dict[str, Any]:
        """Get current processing status of a document with progress information from metadata"""
        pg_client = await get_postgresql_client()
        result = await pg_client.fetch_one(
            """SELECT processing_status, error_message, chunk_count, metadata
            FROM documents WHERE id = $1""",
            document_id
        )

        if not result:
            raise ValueError("Document not found")

        # Extract progress data from metadata JSONB (may arrive as a JSON string or a dict)
        metadata = result["metadata"] or {}
        if isinstance(metadata, str):
            try:
                metadata = json.loads(metadata)
            except json.JSONDecodeError:
                metadata = {}

        return {
            "status": result["processing_status"],
            "error_message": result["error_message"],
            "chunk_count": result["chunk_count"],
            "chunks_processed": metadata.get("chunks_processed"),
            "total_chunks_expected": metadata.get("total_chunks_expected"),
            "processing_progress": metadata.get("processing_progress"),
            "processing_stage": metadata.get("processing_stage")
        }


# Factory function for document processor
async def get_document_processor(tenant_domain=None):
    """Get document processor instance (will create its own DB session when needed)"""
    return DocumentProcessor(tenant_domain=tenant_domain)
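

# Usage sketch (illustrative only, not part of the service API): one way a caller
# might drive the processor end to end. It assumes the application's async context
# is available (PostgreSQL and the embedding service reachable); the tenant domain,
# file path, dataset UUID, and user ID below are hypothetical placeholders, not
# values defined by this module.
if __name__ == "__main__":
    async def _example_run():
        processor = await get_document_processor(tenant_domain="test")
        document = await processor.process_file(
            file_path=Path("/tmp/example.pdf"),  # hypothetical local file
            dataset_id="00000000-0000-0000-0000-000000000000",  # hypothetical dataset UUID
            user_id="example-user",  # hypothetical user ID
            original_filename="example.pdf",
        )
        status = await processor.get_processing_status(document["id"])
        print(status["status"], status.get("processing_stage"))

    asyncio.run(_example_run())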