"""
|
|
Enhanced Document Processing Pipeline with Dual-Engine Support
|
|
|
|
Implements the DocumentProcessingPipeline from CLAUDE.md with both native
|
|
and Unstructured.io engine support, capability-based selection, and
|
|
stateless processing.
|
|
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import gc
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
import hashlib
|
|
import json
|
|
|
|
from app.core.backends.document_processor import (
|
|
DocumentProcessorBackend,
|
|
ChunkingStrategy
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)


@dataclass
class ProcessingResult:
    """Result of document processing"""
    chunks: List[Dict[str, str]]
    embeddings: Optional[List[List[float]]]  # Optional embeddings
    metadata: Dict[str, Any]
    engine_used: str
    processing_time_ms: float
    token_count: int


@dataclass
class ProcessingOptions:
    """Options for document processing"""
    engine_preference: str = "auto"  # "native", "unstructured", "auto"
    chunking_strategy: str = "hybrid"  # "fixed", "semantic", "hierarchical", "hybrid"
    chunk_size: int = 512  # tokens for BGE-M3
    chunk_overlap: int = 128  # overlap tokens
    generate_embeddings: bool = True
    extract_metadata: bool = True
    language_detection: bool = True
    ocr_enabled: bool = False  # For scanned PDFs


class UnstructuredAPIEngine:
    """
    Mock Unstructured.io API engine for advanced document parsing.
    In production, this would call the actual Unstructured API.
    """

    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None):
        self.api_key = api_key
        self.api_url = api_url or "https://api.unstructured.io"
        self.supported_features = [
            "table_extraction",
            "image_extraction",
            "ocr",
            "language_detection",
            "metadata_extraction",
            "hierarchical_parsing"
        ]

    async def process(
        self,
        content: bytes,
        file_type: str,
        options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Process document using Unstructured API.

        This is a mock implementation. In production:
        1. Send content to Unstructured API
        2. Handle rate limiting and retries
        3. Parse structured response
        """
        # Mock processing delay
        await asyncio.sleep(0.5)

        # Mock response structure
        return {
            "elements": [
                {
                    "type": "Title",
                    "text": "Document Title",
                    "metadata": {"page_number": 1}
                },
                {
                    "type": "NarrativeText",
                    "text": "This is the main content of the document...",
                    "metadata": {"page_number": 1}
                }
            ],
            "metadata": {
                "languages": ["en"],
                "page_count": 1,
                "has_tables": False,
                "has_images": False
            }
        }
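
    async def _process_via_api(
        self,
        content: bytes,
        file_type: str,
        options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Sketch of what the real API call could look like. Illustrative only
        and not wired into process(); it assumes the hosted Unstructured
        partition endpoint (/general/v0/general, multipart upload, the
        'unstructured-api-key' header, and the 'hi_res' strategy for OCR)
        plus an aiohttp dependency -- verify all of these against the
        current Unstructured docs before relying on them.
        """
        import aiohttp  # assumed dependency, not imported at module level

        form = aiohttp.FormData()
        # The filename extension drives server-side format detection
        form.add_field("files", content, filename=f"document{file_type}")
        if options.get("ocr"):
            # hi_res strategy enables OCR-backed parsing on the hosted API
            form.add_field("strategy", "hi_res")

        headers = {"unstructured-api-key": self.api_key or ""}
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{self.api_url}/general/v0/general",
                data=form,
                headers=headers,
            ) as resp:
                resp.raise_for_status()
                # The hosted API returns a flat list of elements; wrap it
                # to match the mock response shape used in process() above
                elements = await resp.json()
                return {"elements": elements, "metadata": {}}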


class NativeChunkingEngine:
    """
    Native chunking engine using the existing DocumentProcessorBackend.
    Fast, lightweight, and suitable for most text documents.
    """

    def __init__(self):
        self.processor = DocumentProcessorBackend()

    async def process(
        self,
        content: bytes,
        file_type: str,
        options: ProcessingOptions
    ) -> List[Dict[str, Any]]:
        """Process document using native chunking"""

        strategy = ChunkingStrategy(
            strategy_type=options.chunking_strategy,
            chunk_size=options.chunk_size,
            chunk_overlap=options.chunk_overlap,
            preserve_paragraphs=True,
            preserve_sentences=True
        )

        chunks = await self.processor.process_document(
            content=content,
            document_type=file_type,
            strategy=strategy,
            metadata={
                "processing_timestamp": datetime.now(timezone.utc).isoformat(),
                "engine": "native"
            }
        )

        return chunks


class DocumentProcessingPipeline:
    """
    Dual-engine document processing pipeline with capability-based selection.

    Features:
    - Native engine for fast, simple processing
    - Unstructured API for advanced features
    - Capability-based engine selection
    - Stateless processing with memory cleanup
    - Optional embedding generation
    """

    def __init__(self, resource_cluster_url: Optional[str] = None):
        self.resource_cluster_url = resource_cluster_url or "http://localhost:8004"
        self.native_engine = NativeChunkingEngine()
        self.unstructured_engine = None  # Lazy initialization
        self.embedding_cache = {}  # Cache for frequently used embeddings

        logger.info("Document Processing Pipeline initialized")

    def select_engine(
        self,
        filename: str,
        token_data: Dict[str, Any],
        options: ProcessingOptions
    ) -> str:
        """
        Select processing engine based on file type and capabilities.

        Args:
            filename: Name of the file being processed
            token_data: Capability token data
            options: Processing options

        Returns:
            Engine name: "native" or "unstructured"
        """
        # Check if user has premium parsing capability
        has_premium = any(
            cap.get("resource") == "premium_parsing"
            for cap in token_data.get("capabilities", [])
        )

        # Force native if no premium capability
        if not has_premium and options.engine_preference == "unstructured":
            logger.info("Premium parsing requested but not available, using native engine")
            return "native"

        # Auto selection logic
        if options.engine_preference == "auto":
            # Use Unstructured for complex formats if available
            complex_formats = [".pdf", ".docx", ".pptx", ".xlsx"]
            needs_ocr = options.ocr_enabled
            needs_tables = filename.lower().endswith((".xlsx", ".csv"))

            if has_premium and (
                any(filename.lower().endswith(fmt) for fmt in complex_formats) or
                needs_ocr or needs_tables
            ):
                return "unstructured"
            else:
                return "native"

        # Respect explicit preference if capability allows
        if options.engine_preference == "unstructured" and has_premium:
            return "unstructured"

        return "native"

    async def process_document(
        self,
        file: bytes,
        filename: str,
        token_data: Dict[str, Any],
        options: Optional[ProcessingOptions] = None
    ) -> ProcessingResult:
        """
        Process document with selected engine.

        Args:
            file: Document content as bytes
            filename: Name of the file
            token_data: Capability token data
            options: Processing options

        Returns:
            ProcessingResult with chunks, embeddings, and metadata
        """
        start_time = datetime.now(timezone.utc)

        try:
            # Use default options if not provided
            if options is None:
                options = ProcessingOptions()

            # Determine file type
            file_type = self._get_file_extension(filename)

            # Select engine based on capabilities
            engine = self.select_engine(filename, token_data, options)

            # Process with selected engine; select_engine() has already
            # verified the premium_parsing capability for "unstructured"
            if engine == "unstructured":
                result = await self._process_with_unstructured(file, filename, token_data, options)
            else:
                result = await self._process_with_native(file, filename, token_data, options)

            # Generate embeddings if requested
            embeddings = None
            if options.generate_embeddings:
                embeddings = await self._generate_embeddings(result.chunks, token_data)

            # Calculate processing time
            processing_time = (datetime.now(timezone.utc) - start_time).total_seconds() * 1000

            # Approximate token count (whitespace-delimited words)
            token_count = sum(len(chunk["text"].split()) for chunk in result.chunks)

            return ProcessingResult(
                chunks=result.chunks,
                embeddings=embeddings,
                metadata={
                    "filename": filename,
                    "file_type": file_type,
                    "processing_timestamp": start_time.isoformat(),
                    "chunk_count": len(result.chunks),
                    "engine_used": engine,
                    "options": {
                        "chunking_strategy": options.chunking_strategy,
                        "chunk_size": options.chunk_size,
                        "chunk_overlap": options.chunk_overlap
                    }
                },
                engine_used=engine,
                processing_time_ms=processing_time,
                token_count=token_count
            )

        except Exception as e:
            logger.error(f"Error processing document: {e}")
            raise
        finally:
            # Ensure memory cleanup
            del file
            gc.collect()

    async def _process_with_native(
        self,
        file: bytes,
        filename: str,
        token_data: Dict[str, Any],
        options: ProcessingOptions
    ) -> ProcessingResult:
        """Process document with native engine"""

        file_type = self._get_file_extension(filename)
        chunks = await self.native_engine.process(file, file_type, options)

        # Timing and token counts are filled in by process_document()
        return ProcessingResult(
            chunks=chunks,
            embeddings=None,
            metadata={"engine": "native"},
            engine_used="native",
            processing_time_ms=0,
            token_count=0
        )

    async def _process_with_unstructured(
        self,
        file: bytes,
        filename: str,
        token_data: Dict[str, Any],
        options: ProcessingOptions
    ) -> ProcessingResult:
        """Process document with Unstructured API"""

        # Initialize Unstructured engine if needed
        if self.unstructured_engine is None:
            # Get API key from token constraints or environment
            api_key = token_data.get("constraints", {}).get("unstructured_api_key")
            self.unstructured_engine = UnstructuredAPIEngine(api_key=api_key)

        file_type = self._get_file_extension(filename)

        # Process with Unstructured
        unstructured_result = await self.unstructured_engine.process(
            content=file,
            file_type=file_type,
            options={
                "ocr": options.ocr_enabled,
                "extract_tables": True,
                "extract_images": False,  # Don't extract images for security
                "languages": ["en", "es", "fr", "de", "zh"]
            }
        )

        # Convert Unstructured elements to chunks
        chunks = []
        for element in unstructured_result.get("elements", []):
            chunk_text = element.get("text", "")
            if chunk_text.strip():
                chunks.append({
                    "text": chunk_text,
                    "metadata": {
                        "element_type": element.get("type"),
                        "page_number": element.get("metadata", {}).get("page_number"),
                        "engine": "unstructured"
                    }
                })

        # Apply chunking strategy if chunks are too large
        final_chunks = await self._apply_chunking_to_elements(chunks, options)

        # Timing and token counts are filled in by process_document()
        return ProcessingResult(
            chunks=final_chunks,
            embeddings=None,
            metadata={
                "engine": "unstructured",
                "detected_languages": unstructured_result.get("metadata", {}).get("languages", []),
                "page_count": unstructured_result.get("metadata", {}).get("page_count", 0),
                "has_tables": unstructured_result.get("metadata", {}).get("has_tables", False),
                "has_images": unstructured_result.get("metadata", {}).get("has_images", False)
            },
            engine_used="unstructured",
            processing_time_ms=0,
            token_count=0
        )

    async def _apply_chunking_to_elements(
        self,
        elements: List[Dict[str, Any]],
        options: ProcessingOptions
    ) -> List[Dict[str, Any]]:
        """Apply chunking strategy to Unstructured elements if needed"""

        final_chunks = []

        for element in elements:
            text = element["text"]

            # Estimate token count (rough approximation)
            estimated_tokens = len(text.split()) * 1.3

            # If element is small enough, keep as is
            if estimated_tokens <= options.chunk_size:
                final_chunks.append(element)
            else:
                # Split large elements using native chunking
                sub_chunks = await self._chunk_text(
                    text,
                    options.chunk_size,
                    options.chunk_overlap
                )

                for idx, sub_chunk in enumerate(sub_chunks):
                    chunk_metadata = element["metadata"].copy()
                    chunk_metadata["sub_chunk_index"] = idx
                    chunk_metadata["parent_element_type"] = element["metadata"].get("element_type")

                    final_chunks.append({
                        "text": sub_chunk,
                        "metadata": chunk_metadata
                    })

        return final_chunks

    async def _chunk_text(
        self,
        text: str,
        chunk_size: int,
        chunk_overlap: int
    ) -> List[str]:
        """Simple text chunking for large elements"""

        words = text.split()
        chunks = []

        # Simple word-based chunking; guard against a non-positive stride
        # (chunk_overlap >= chunk_size), which would make range() raise
        step = max(1, chunk_size - chunk_overlap)
        for i in range(0, len(words), step):
            chunk_words = words[i:i + chunk_size]
            chunks.append(" ".join(chunk_words))

        return chunks
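
    # Worked example of the sliding window above (hypothetical values, not
    # taken from the pipeline defaults): with chunk_size=4 and
    # chunk_overlap=1 the stride is 3, so "a b c d e f" yields
    # ["a b c d", "d e f"] -- each chunk re-includes the last word of the
    # previous one.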

    async def _generate_embeddings(
        self,
        chunks: List[Dict[str, Any]],
        token_data: Dict[str, Any]
    ) -> List[List[float]]:
        """
        Generate embeddings for chunks.

        This is a mock implementation. In production, this would:
        1. Call the embedding service (BGE-M3 or similar)
        2. Handle batching for efficiency
        3. Apply caching for common chunks
        """
        embeddings = []

        for chunk in chunks:
            # Check cache first
            chunk_hash = hashlib.sha256(chunk["text"].encode()).hexdigest()

            if chunk_hash in self.embedding_cache:
                embeddings.append(self.embedding_cache[chunk_hash])
            else:
                # Mock embedding generation
                # In production: call embedding API
                embedding = [0.1] * 1024  # Mock 1024-dim embedding (BGE-M3 dense size)
                embeddings.append(embedding)

                # Cache for reuse (with size limit)
                if len(self.embedding_cache) < 1000:
                    self.embedding_cache[chunk_hash] = embedding

        return embeddings
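
    async def _generate_embeddings_via_service(
        self,
        chunks: List[Dict[str, Any]]
    ) -> List[List[float]]:
        """
        Sketch of a batched call to a real embedding service. Illustrative
        only and not wired into process_document(); it assumes a TEI-style
        HTTP endpoint on self.resource_cluster_url that accepts
        {"inputs": [...]} and returns a list of vectors, plus an aiohttp
        dependency -- adjust both to the actual service in use.
        """
        import aiohttp  # assumed dependency, not imported at module level

        texts = [chunk["text"] for chunk in chunks]
        embeddings: List[List[float]] = []

        async with aiohttp.ClientSession() as session:
            # Batch requests to bound payload size and service memory use
            batch_size = 32
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]
                async with session.post(
                    f"{self.resource_cluster_url}/embed",
                    json={"inputs": batch},
                ) as resp:
                    resp.raise_for_status()
                    embeddings.extend(await resp.json())

        return embeddings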

    def _get_file_extension(self, filename: str) -> str:
        """Extract file extension from filename"""

        parts = filename.lower().split(".")
        if len(parts) > 1:
            return f".{parts[-1]}"
        return ".txt"  # Default to text

    async def validate_document(
        self,
        file_size: int,
        filename: str,
        token_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Validate document before processing.

        Args:
            file_size: Size of file in bytes
            filename: Name of the file
            token_data: Capability token data

        Returns:
            Validation result with warnings and errors
        """
        # Get size limits from token
        max_size = token_data.get("constraints", {}).get("max_file_size", 50 * 1024 * 1024)

        validation = {
            "valid": True,
            "warnings": [],
            "errors": [],
            "recommendations": []
        }

        # Check file size
        if file_size > max_size:
            validation["valid"] = False
            validation["errors"].append(f"File exceeds maximum size of {max_size / 1024 / 1024:.1f} MiB")
        elif file_size > 10 * 1024 * 1024:
            validation["warnings"].append("Large file may take longer to process")
            validation["recommendations"].append("Consider using streaming processing for better performance")

        # Check file type
        file_type = self._get_file_extension(filename)
        supported_types = [".pdf", ".docx", ".txt", ".md", ".html", ".csv", ".xlsx", ".pptx"]

        if file_type not in supported_types:
            validation["valid"] = False
            validation["errors"].append(f"Unsupported file type: {file_type}")
            validation["recommendations"].append(f"Supported types: {', '.join(supported_types)}")

        # Check for special processing needs
        if file_type in [".xlsx", ".csv"]:
            validation["recommendations"].append("Table extraction will be applied automatically")

        if file_type == ".pdf":
            validation["recommendations"].append("Enable OCR if document contains scanned images")

        return validation

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""

        return {
            "engines_available": ["native", "unstructured"],
            "native_engine_status": "ready",
            "unstructured_engine_status": "ready" if self.unstructured_engine else "not_initialized",
            "embedding_cache_size": len(self.embedding_cache),
            "supported_formats": [".pdf", ".docx", ".txt", ".md", ".html", ".csv", ".xlsx", ".pptx"],
            "default_chunk_size": 512,
            "default_chunk_overlap": 128,
            "stateless": True
        }
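

# Minimal usage sketch (hypothetical token and document, not part of the
# pipeline itself): exercises engine selection and the native processing
# path end to end, with embeddings disabled to avoid the mock generator.
if __name__ == "__main__":
    async def _demo():
        pipeline = DocumentProcessingPipeline()
        token = {"capabilities": [], "constraints": {}}
        result = await pipeline.process_document(
            file=b"Hello world. This is a tiny test document.",
            filename="demo.txt",
            token_data=token,
            options=ProcessingOptions(generate_embeddings=False)
        )
        print(result.engine_used, result.token_count, len(result.chunks))

    asyncio.run(_demo())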