GT AI OS Community v2.0.33 - Add NVIDIA NIM and Nemotron agents
- Updated python_coding_microproject.csv to use NVIDIA NIM Kimi K2
- Updated kali_linux_shell_simulator.csv to use NVIDIA NIM Kimi K2
- Made more general-purpose (flexible targets, expanded tools)
- Added nemotron-mini-agent.csv for fast local inference via Ollama
- Added nemotron-agent.csv for advanced reasoning via Ollama
- Added wiki page: Projects for NVIDIA NIMs and Nemotron
@@ -0,0 +1,536 @@
"""
Enhanced Document Processing Pipeline with Dual-Engine Support

Implements the DocumentProcessingPipeline from CLAUDE.md with both native
and Unstructured.io engine support, capability-based selection, and
stateless processing.
"""

import logging
import asyncio
import gc
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
import hashlib
import json

from app.core.backends.document_processor import (
    DocumentProcessorBackend,
    ChunkingStrategy
)

logger = logging.getLogger(__name__)


@dataclass
class ProcessingResult:
    """Result of document processing"""
    chunks: List[Dict[str, str]]
    embeddings: Optional[List[List[float]]]  # Optional embeddings
    metadata: Dict[str, Any]
    engine_used: str
    processing_time_ms: float
    token_count: int


@dataclass
class ProcessingOptions:
    """Options for document processing"""
    engine_preference: str = "auto"  # "native", "unstructured", "auto"
    chunking_strategy: str = "hybrid"  # "fixed", "semantic", "hierarchical", "hybrid"
    chunk_size: int = 512  # tokens for BGE-M3
    chunk_overlap: int = 128  # overlap tokens
    generate_embeddings: bool = True
    extract_metadata: bool = True
    language_detection: bool = True
    ocr_enabled: bool = False  # For scanned PDFs
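

# Illustrative example (not used by the pipeline): a typical configuration for
# scanned PDFs, where OCR is enabled and semantic chunking is preferred. The
# specific values are assumptions chosen for illustration only.
_EXAMPLE_SCANNED_PDF_OPTIONS = ProcessingOptions(
    engine_preference="auto",
    chunking_strategy="semantic",
    chunk_size=512,        # matches the BGE-M3-oriented default
    chunk_overlap=128,
    ocr_enabled=True,      # scanned pages need OCR before chunking
)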


class UnstructuredAPIEngine:
    """
    Mock Unstructured.io API engine for advanced document parsing.
    In production, this would call the actual Unstructured API.
    """

    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None):
        self.api_key = api_key
        self.api_url = api_url or "https://api.unstructured.io"
        self.supported_features = [
            "table_extraction",
            "image_extraction",
            "ocr",
            "language_detection",
            "metadata_extraction",
            "hierarchical_parsing"
        ]

    async def process(
        self,
        content: bytes,
        file_type: str,
        options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Process document using Unstructured API.

        This is a mock implementation. In production:
        1. Send content to Unstructured API
        2. Handle rate limiting and retries
        3. Parse structured response
        """
        # Mock processing delay
        await asyncio.sleep(0.5)

        # Mock response structure
        return {
            "elements": [
                {
                    "type": "Title",
                    "text": "Document Title",
                    "metadata": {"page_number": 1}
                },
                {
                    "type": "NarrativeText",
                    "text": "This is the main content of the document...",
                    "metadata": {"page_number": 1}
                }
            ],
            "metadata": {
                "languages": ["en"],
                "page_count": 1,
                "has_tables": False,
                "has_images": False
            }
        }
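

# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the pipeline): roughly how the mock engine
# above could be replaced with a real HTTP call. The endpoint path, header
# name, and form fields are assumptions based on Unstructured's hosted API
# documentation and should be verified; httpx is assumed to be available.
async def _example_unstructured_api_call(
    content: bytes,
    filename: str,
    api_key: str,
    api_url: str = "https://api.unstructured.io"
) -> List[Dict[str, Any]]:
    import httpx  # assumed dependency, used only by this sketch

    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(
            f"{api_url}/general/v0/general",            # assumed endpoint path
            headers={"unstructured-api-key": api_key},  # assumed header name
            files={"files": (filename, content)},
            data={"strategy": "hi_res"},                # assumed parsing strategy
        )
        response.raise_for_status()
        # The hosted API returns a JSON list of elements with "type", "text",
        # and "metadata" keys, which mirrors the mock structure above.
        return response.json()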


class NativeChunkingEngine:
    """
    Native chunking engine using the existing DocumentProcessorBackend.
    Fast, lightweight, and suitable for most text documents.
    """

    def __init__(self):
        self.processor = DocumentProcessorBackend()

    async def process(
        self,
        content: bytes,
        file_type: str,
        options: ProcessingOptions
    ) -> List[Dict[str, Any]]:
        """Process document using native chunking"""

        strategy = ChunkingStrategy(
            strategy_type=options.chunking_strategy,
            chunk_size=options.chunk_size,
            chunk_overlap=options.chunk_overlap,
            preserve_paragraphs=True,
            preserve_sentences=True
        )

        chunks = await self.processor.process_document(
            content=content,
            document_type=file_type,
            strategy=strategy,
            metadata={
                "processing_timestamp": datetime.utcnow().isoformat(),
                "engine": "native"
            }
        )

        return chunks


class DocumentProcessingPipeline:
    """
    Dual-engine document processing pipeline with capability-based selection.

    Features:
    - Native engine for fast, simple processing
    - Unstructured API for advanced features
    - Capability-based engine selection
    - Stateless processing with memory cleanup
    - Optional embedding generation
    """

    def __init__(self, resource_cluster_url: Optional[str] = None):
        self.resource_cluster_url = resource_cluster_url or "http://localhost:8004"
        self.native_engine = NativeChunkingEngine()
        self.unstructured_engine = None  # Lazy initialization
        self.embedding_cache = {}  # Cache for frequently used embeddings

        logger.info("Document Processing Pipeline initialized")

    def select_engine(
        self,
        filename: str,
        token_data: Dict[str, Any],
        options: ProcessingOptions
    ) -> str:
        """
        Select processing engine based on file type and capabilities.

        Args:
            filename: Name of the file being processed
            token_data: Capability token data
            options: Processing options

        Returns:
            Engine name: "native" or "unstructured"
        """
        # Check if user has premium parsing capability
        has_premium = any(
            cap.get("resource") == "premium_parsing"
            for cap in token_data.get("capabilities", [])
        )

        # Force native if no premium capability
        if not has_premium and options.engine_preference == "unstructured":
            logger.info("Premium parsing requested but not available, using native engine")
            return "native"

        # Auto selection logic
        if options.engine_preference == "auto":
            # Use Unstructured for complex formats if available
            complex_formats = [".pdf", ".docx", ".pptx", ".xlsx"]
            needs_ocr = options.ocr_enabled
            needs_tables = filename.lower().endswith((".xlsx", ".csv"))

            if has_premium and (
                any(filename.lower().endswith(fmt) for fmt in complex_formats) or
                needs_ocr or needs_tables
            ):
                return "unstructured"
            else:
                return "native"

        # Respect explicit preference if capability allows
        if options.engine_preference == "unstructured" and has_premium:
            return "unstructured"

        return "native"

    async def process_document(
        self,
        file: bytes,
        filename: str,
        token_data: Dict[str, Any],
        options: Optional[ProcessingOptions] = None
    ) -> ProcessingResult:
        """
        Process document with selected engine.

        Args:
            file: Document content as bytes
            filename: Name of the file
            token_data: Capability token data
            options: Processing options

        Returns:
            ProcessingResult with chunks, embeddings, and metadata
        """
        start_time = datetime.utcnow()

        try:
            # Use default options if not provided
            if options is None:
                options = ProcessingOptions()

            # Determine file type
            file_type = self._get_file_extension(filename)

            # Select engine based on capabilities
            engine = self.select_engine(filename, token_data, options)

            # Process with selected engine (select_engine has already enforced
            # the premium_parsing capability for the unstructured path)
            if engine == "unstructured":
                result = await self._process_with_unstructured(file, filename, token_data, options)
            else:
                result = await self._process_with_native(file, filename, token_data, options)

            # Generate embeddings if requested
            embeddings = None
            if options.generate_embeddings:
                embeddings = await self._generate_embeddings(result.chunks, token_data)

            # Calculate processing time
            processing_time = (datetime.utcnow() - start_time).total_seconds() * 1000

            # Calculate token count
            token_count = sum(len(chunk["text"].split()) for chunk in result.chunks)

            return ProcessingResult(
                chunks=result.chunks,
                embeddings=embeddings,
                metadata={
                    "filename": filename,
                    "file_type": file_type,
                    "processing_timestamp": start_time.isoformat(),
                    "chunk_count": len(result.chunks),
                    "engine_used": engine,
                    "options": {
                        "chunking_strategy": options.chunking_strategy,
                        "chunk_size": options.chunk_size,
                        "chunk_overlap": options.chunk_overlap
                    }
                },
                engine_used=engine,
                processing_time_ms=processing_time,
                token_count=token_count
            )

        except Exception as e:
            logger.error(f"Error processing document: {e}")
            raise
        finally:
            # Ensure memory cleanup
            del file
            gc.collect()

    async def _process_with_native(
        self,
        file: bytes,
        filename: str,
        token_data: Dict[str, Any],
        options: ProcessingOptions
    ) -> ProcessingResult:
        """Process document with native engine"""

        file_type = self._get_file_extension(filename)
        chunks = await self.native_engine.process(file, file_type, options)

        return ProcessingResult(
            chunks=chunks,
            embeddings=None,
            metadata={"engine": "native"},
            engine_used="native",
            processing_time_ms=0,
            token_count=0
        )

    async def _process_with_unstructured(
        self,
        file: bytes,
        filename: str,
        token_data: Dict[str, Any],
        options: ProcessingOptions
    ) -> ProcessingResult:
        """Process document with Unstructured API"""

        # Initialize Unstructured engine if needed
        if self.unstructured_engine is None:
            # Get API key from token constraints or environment
            api_key = token_data.get("constraints", {}).get("unstructured_api_key")
            self.unstructured_engine = UnstructuredAPIEngine(api_key=api_key)

        file_type = self._get_file_extension(filename)

        # Process with Unstructured
        unstructured_result = await self.unstructured_engine.process(
            content=file,
            file_type=file_type,
            options={
                "ocr": options.ocr_enabled,
                "extract_tables": True,
                "extract_images": False,  # Don't extract images for security
                "languages": ["en", "es", "fr", "de", "zh"]
            }
        )

        # Convert Unstructured elements to chunks
        chunks = []
        for element in unstructured_result.get("elements", []):
            chunk_text = element.get("text", "")
            if chunk_text.strip():
                chunks.append({
                    "text": chunk_text,
                    "metadata": {
                        "element_type": element.get("type"),
                        "page_number": element.get("metadata", {}).get("page_number"),
                        "engine": "unstructured"
                    }
                })

        # Apply chunking strategy if chunks are too large
        final_chunks = await self._apply_chunking_to_elements(chunks, options)

        return ProcessingResult(
            chunks=final_chunks,
            embeddings=None,
            metadata={
                "engine": "unstructured",
                "detected_languages": unstructured_result.get("metadata", {}).get("languages", []),
                "page_count": unstructured_result.get("metadata", {}).get("page_count", 0),
                "has_tables": unstructured_result.get("metadata", {}).get("has_tables", False),
                "has_images": unstructured_result.get("metadata", {}).get("has_images", False)
            },
            engine_used="unstructured",
            processing_time_ms=0,
            token_count=0
        )

    async def _apply_chunking_to_elements(
        self,
        elements: List[Dict[str, Any]],
        options: ProcessingOptions
    ) -> List[Dict[str, Any]]:
        """Apply chunking strategy to Unstructured elements if needed"""

        final_chunks = []

        for element in elements:
            text = element["text"]

            # Estimate token count (rough approximation)
            estimated_tokens = len(text.split()) * 1.3

            # If element is small enough, keep as is
            if estimated_tokens <= options.chunk_size:
                final_chunks.append(element)
            else:
                # Split large elements using native chunking
                sub_chunks = await self._chunk_text(
                    text,
                    options.chunk_size,
                    options.chunk_overlap
                )

                for idx, sub_chunk in enumerate(sub_chunks):
                    chunk_metadata = element["metadata"].copy()
                    chunk_metadata["sub_chunk_index"] = idx
                    chunk_metadata["parent_element_type"] = element["metadata"].get("element_type")

                    final_chunks.append({
                        "text": sub_chunk,
                        "metadata": chunk_metadata
                    })

        return final_chunks

    async def _chunk_text(
        self,
        text: str,
        chunk_size: int,
        chunk_overlap: int
    ) -> List[str]:
        """Simple text chunking for large elements"""

        words = text.split()
        chunks = []

        # Simple word-based chunking
        for i in range(0, len(words), chunk_size - chunk_overlap):
            chunk_words = words[i:i + chunk_size]
            chunks.append(" ".join(chunk_words))

        return chunks
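
    # Example of the stride arithmetic above: with chunk_size=512 and
    # chunk_overlap=128 the window advances 384 words per step, so a
    # 1,000-word element yields chunks covering words 0-511, 384-895,
    # and 768-999.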

    async def _generate_embeddings(
        self,
        chunks: List[Dict[str, Any]],
        token_data: Dict[str, Any]
    ) -> List[List[float]]:
        """
        Generate embeddings for chunks.

        This is a mock implementation. In production, this would:
        1. Call the embedding service (BGE-M3 or similar)
        2. Handle batching for efficiency
        3. Apply caching for common chunks
        """
        embeddings = []

        for chunk in chunks:
            # Check cache first
            chunk_hash = hashlib.sha256(chunk["text"].encode()).hexdigest()

            if chunk_hash in self.embedding_cache:
                embeddings.append(self.embedding_cache[chunk_hash])
            else:
                # Mock embedding generation
                # In production: call embedding API
                embedding = [0.1] * 1024  # Mock 1024-dim embedding (BGE-M3 dense size)
                embeddings.append(embedding)

                # Cache for reuse (with size limit)
                if len(self.embedding_cache) < 1000:
                    self.embedding_cache[chunk_hash] = embedding

        return embeddings
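
    # ----------------------------------------------------------------------
    # Illustrative sketch (not wired into process_document): roughly how the
    # mock above could call a real embedding service in one batched request.
    # The endpoint URL, payload shape, and "BAAI/bge-m3" model name are
    # assumptions for illustration; httpx is assumed to be available.
    async def _generate_embeddings_via_service(
        self,
        chunks: List[Dict[str, Any]],
        embedding_url: str = "http://localhost:8004/v1/embeddings"  # assumed endpoint
    ) -> List[List[float]]:
        import httpx  # assumed dependency, used only by this sketch

        texts = [chunk["text"] for chunk in chunks]
        async with httpx.AsyncClient(timeout=60.0) as client:
            # One batched request instead of one request per chunk
            response = await client.post(
                embedding_url,
                json={"model": "BAAI/bge-m3", "input": texts},  # assumed payload shape
            )
            response.raise_for_status()
            payload = response.json()

        # Assumes an OpenAI-style response: {"data": [{"embedding": [...]}, ...]}
        return [item["embedding"] for item in payload.get("data", [])]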

    def _get_file_extension(self, filename: str) -> str:
        """Extract file extension from filename"""

        parts = filename.lower().split(".")
        if len(parts) > 1:
            return f".{parts[-1]}"
        return ".txt"  # Default to text

    async def validate_document(
        self,
        file_size: int,
        filename: str,
        token_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Validate document before processing.

        Args:
            file_size: Size of file in bytes
            filename: Name of the file
            token_data: Capability token data

        Returns:
            Validation result with warnings and errors
        """
        # Get size limits from token
        max_size = token_data.get("constraints", {}).get("max_file_size", 50 * 1024 * 1024)

        validation = {
            "valid": True,
            "warnings": [],
            "errors": [],
            "recommendations": []
        }

        # Check file size
        if file_size > max_size:
            validation["valid"] = False
            validation["errors"].append(f"File exceeds maximum size of {max_size / 1024 / 1024:.1f} MiB")
        elif file_size > 10 * 1024 * 1024:
            validation["warnings"].append("Large file may take longer to process")
            validation["recommendations"].append("Consider using streaming processing for better performance")

        # Check file type
        file_type = self._get_file_extension(filename)
        supported_types = [".pdf", ".docx", ".txt", ".md", ".html", ".csv", ".xlsx", ".pptx"]

        if file_type not in supported_types:
            validation["valid"] = False
            validation["errors"].append(f"Unsupported file type: {file_type}")
            validation["recommendations"].append(f"Supported types: {', '.join(supported_types)}")

        # Check for special processing needs
        if file_type in [".xlsx", ".csv"]:
            validation["recommendations"].append("Table extraction will be applied automatically")

        if file_type == ".pdf":
            validation["recommendations"].append("Enable OCR if document contains scanned images")

        return validation
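
    # Example of the checks above: a 60 MiB upload against the default 50 MiB
    # limit returns valid=False with a size error, while a 20 MiB PDF stays
    # valid but collects a large-file warning plus streaming and OCR
    # recommendations.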

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""

        return {
            "engines_available": ["native", "unstructured"],
            "native_engine_status": "ready",
            "unstructured_engine_status": "ready" if self.unstructured_engine else "not_initialized",
            "embedding_cache_size": len(self.embedding_cache),
            "supported_formats": [".pdf", ".docx", ".txt", ".md", ".html", ".csv", ".xlsx", ".pptx"],
            "default_chunk_size": 512,
            "default_chunk_overlap": 128,
            "stateless": True
        }
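

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the pipeline itself). The shape of
# token_data below (a "capabilities" list of {"resource": ...} entries and a
# "constraints" dict) is inferred from select_engine() and validate_document();
# treat it as an assumption rather than a contract.
if __name__ == "__main__":
    async def _demo() -> None:
        pipeline = DocumentProcessingPipeline()

        token_data = {
            "capabilities": [{"resource": "premium_parsing"}],
            "constraints": {"max_file_size": 50 * 1024 * 1024},
        }
        options = ProcessingOptions(engine_preference="auto")

        document = b"Hello world. " * 200  # stand-in for real document bytes
        filename = "example.txt"

        validation = await pipeline.validate_document(len(document), filename, token_data)
        if not validation["valid"]:
            print("Validation failed:", validation["errors"])
            return

        result = await pipeline.process_document(document, filename, token_data, options)
        print(f"engine={result.engine_used} chunks={len(result.chunks)} "
              f"tokens={result.token_count} time_ms={result.processing_time_ms:.1f}")

    asyncio.run(_demo())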