"""
|
|
Document Processing Service for GT 2.0
|
|
|
|
Handles file upload, text extraction, chunking, and embedding generation
|
|
for RAG pipeline. Supports multiple file formats with intelligent chunking.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import hashlib
|
|
import mimetypes
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
import uuid
|
|
|
|
# Document processing libraries
|
|
import pypdf as PyPDF2 # pypdf is the maintained successor to PyPDF2
|
|
import docx
|
|
import pandas as pd
|
|
import json
|
|
import csv
|
|
from io import StringIO
|
|
|
|
# Database and core services
|
|
from app.core.postgresql_client import get_postgresql_client
|
|
|
|
# Resource cluster client for embeddings
|
|
import httpx
|
|
from app.services.embedding_client import get_embedding_client
|
|
|
|
# Document summarization
|
|
from app.services.summarization_service import SummarizationService
|
|
|
|
logger = logging.getLogger(__name__)


class DocumentProcessor:
    """
    Comprehensive document processing service for RAG pipeline.

    Features:
    - Multi-format support (PDF, DOCX, TXT, MD, CSV, JSON)
    - Intelligent chunking with overlap
    - Async embedding generation with batch processing
    - Progress tracking
    - Error handling and recovery
    """

    def __init__(self, db=None, tenant_domain=None):
        self.db = db
        self.tenant_domain = tenant_domain or "test"  # Default fallback
        # Use configurable embedding client instead of hardcoded URL
        self.embedding_client = get_embedding_client()
        self.chunk_size = 512  # Default chunk size in tokens
        self.chunk_overlap = 128  # Default overlap
        self.max_file_size = 100 * 1024 * 1024  # 100MB limit

        # Embedding batch processing configuration
        self.EMBEDDING_BATCH_SIZE = 15  # Process embeddings in batches of 15 (ARM64 optimized)
        self.MAX_CONCURRENT_BATCHES = 3  # Process up to 3 batches concurrently
        self.MAX_RETRIES = 3  # Maximum retries per batch
        self.INITIAL_RETRY_DELAY = 1.0  # Initial delay in seconds

        # Supported file types
        self.supported_types = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.txt': 'text/plain',
            '.md': 'text/markdown',
            '.csv': 'text/csv',
            '.json': 'application/json'
        }

    async def process_file(
        self,
        file_path: Path,
        dataset_id: str,
        user_id: str,
        original_filename: str,
        document_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Process an uploaded file through the complete RAG pipeline.

        Args:
            file_path: Path to uploaded file
            dataset_id: Dataset UUID to attach to
            user_id: User who uploaded the file
            original_filename: Original filename
            document_id: Optional existing document ID to update instead of creating new

        Returns:
            Dict: Document record with processing status
        """
        logger.info(f"Processing file {original_filename} for dataset {dataset_id}")

        # Process file directly (no session management needed with PostgreSQL client)
        return await self._process_file_internal(file_path, dataset_id, user_id, original_filename, document_id)

    async def _process_file_internal(
        self,
        file_path: Path,
        dataset_id: str,
        user_id: str,
        original_filename: str,
        document_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Internal file processing method"""
        try:
            # 1. Validate file
            await self._validate_file(file_path)

            # 2. Create or use existing document record
            if document_id:
                # Use existing document
                document = {"id": document_id}
                logger.info(f"Using existing document {document_id} for processing")
            else:
                # Create new document record
                document = await self._create_document_record(
                    file_path, dataset_id, user_id, original_filename
                )

            # 3. Get or extract text content
            await self._update_processing_status(document["id"], "processing", processing_stage="Getting text content")

            # Check if content already exists (e.g., from upload-time extraction)
            existing_content, storage_type = await self._get_existing_document_content(document["id"])

            if existing_content and storage_type in ["pdf_extracted", "text"]:
                # Use existing extracted content
                text_content = existing_content
                logger.info(f"Using existing extracted content ({len(text_content)} chars, type: {storage_type})")
            else:
                # Extract text from file
                await self._update_processing_status(document["id"], "processing", processing_stage="Extracting text")

                # Determine file type for extraction
                if document_id:
                    # For existing documents, determine file type from file extension
                    file_ext = file_path.suffix.lower()
                    file_type = self.supported_types.get(file_ext, 'text/plain')
                else:
                    file_type = document["file_type"]

                text_content = await self._extract_text(file_path, file_type)

            # 4. Update document with extracted text
            await self._update_document_content(document["id"], text_content)

            # 5. Generate document summary
            await self._update_processing_status(document["id"], "processing", processing_stage="Generating summary")
            await self._generate_document_summary(document["id"], text_content, original_filename, user_id)

            # 6. Chunk the document
            await self._update_processing_status(document["id"], "processing", processing_stage="Creating chunks")
            chunks = await self._chunk_text(text_content, document["id"])

            # Set expected chunk count for progress tracking
            await self._update_processing_status(
                document["id"], "processing",
                processing_stage="Preparing for embedding generation",
                total_chunks_expected=len(chunks)
            )

            # 7. Generate embeddings
            await self._update_processing_status(document["id"], "processing", processing_stage="Starting embedding generation")
            await self._generate_embeddings_for_chunks(chunks, dataset_id, user_id)

            # 8. Update final status
            await self._update_processing_status(
                document["id"], "completed",
                processing_stage="Completed",
                chunks_processed=len(chunks),
                total_chunks_expected=len(chunks)
            )
            await self._update_chunk_count(document["id"], len(chunks))

            # 9. Update dataset summary (after document is fully processed)
            await self._update_dataset_summary_after_document_change(dataset_id, user_id)

            logger.info(f"Successfully processed {original_filename} with {len(chunks)} chunks")
            return document

        except Exception as e:
            logger.error(f"Error processing file {original_filename}: {e}")
            if 'document' in locals():
                await self._update_processing_status(
                    document["id"], "failed",
                    error_message=str(e),
                    processing_stage="Failed"
                )
            raise

    async def _validate_file(self, file_path: Path):
        """Validate file size and type"""
        if not file_path.exists():
            raise ValueError("File does not exist")

        file_size = file_path.stat().st_size
        if file_size > self.max_file_size:
            raise ValueError(f"File too large: {file_size} bytes (max: {self.max_file_size})")

        file_ext = file_path.suffix.lower()
        if file_ext not in self.supported_types:
            raise ValueError(f"Unsupported file type: {file_ext}")

    async def _create_document_record(
        self,
        file_path: Path,
        dataset_id: str,
        user_id: str,
        original_filename: str
    ) -> Dict[str, Any]:
        """Create document record in database"""

        # Calculate file hash
        with open(file_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()

        file_ext = file_path.suffix.lower()
        file_size = file_path.stat().st_size
        document_id = str(uuid.uuid4())

        # Insert document record using raw SQL
        # Note: tenant_id is nullable UUID, so we set it to NULL for individual documents
        pg_client = await get_postgresql_client()
        await pg_client.execute_command(
            """INSERT INTO documents (
                id, user_id, dataset_id, filename, original_filename,
                file_type, file_size_bytes, file_hash, processing_status
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
            document_id, str(user_id), dataset_id, str(file_path.name),
            original_filename, self.supported_types[file_ext], file_size, file_hash, "pending"
        )

        return {
            "id": document_id,
            "user_id": user_id,
            "dataset_id": dataset_id,
            "filename": str(file_path.name),
            "original_filename": original_filename,
            "file_type": self.supported_types[file_ext],
            "file_size_bytes": file_size,
            "file_hash": file_hash,
            "processing_status": "pending",
            "chunk_count": 0
        }

    async def _extract_text(self, file_path: Path, file_type: str) -> str:
        """Extract text content from various file formats"""

        try:
            if file_type == 'application/pdf':
                return await self._extract_pdf_text(file_path)
            elif 'wordprocessingml' in file_type:
                return await self._extract_docx_text(file_path)
            elif file_type == 'text/csv':
                return await self._extract_csv_text(file_path)
            elif file_type == 'application/json':
                return await self._extract_json_text(file_path)
            else:  # text/plain, text/markdown
                return await self._extract_plain_text(file_path)

        except Exception as e:
            logger.error(f"Text extraction failed for {file_path}: {e}")
            raise ValueError(f"Could not extract text from file: {e}")

    async def _extract_pdf_text(self, file_path: Path) -> str:
        """Extract text from PDF file"""
        text_parts = []

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}")
                except Exception as e:
                    logger.warning(f"Could not extract text from page {page_num + 1}: {e}")

        if not text_parts:
            raise ValueError("No text could be extracted from PDF")

        return "\n\n".join(text_parts)

    async def _extract_docx_text(self, file_path: Path) -> str:
        """Extract text from DOCX file"""
        doc = docx.Document(file_path)
        text_parts = []

        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text_parts.append(paragraph.text)

        if not text_parts:
            raise ValueError("No text could be extracted from DOCX")

        return "\n\n".join(text_parts)

    async def _extract_csv_text(self, file_path: Path) -> str:
        """Extract and format text from CSV file"""
        try:
            df = pd.read_csv(file_path)

            # Create readable format
            text_parts = [f"CSV Data with {len(df)} rows and {len(df.columns)} columns"]
            text_parts.append(f"Columns: {', '.join(df.columns.tolist())}")
            text_parts.append("")

            # Sample first few rows in readable format
            for idx, row in df.head(20).iterrows():
                row_text = []
                for col in df.columns:
                    if pd.notna(row[col]):
                        row_text.append(f"{col}: {row[col]}")
                text_parts.append(f"Row {idx + 1}: " + " | ".join(row_text))

            return "\n".join(text_parts)

        except Exception as e:
            logger.error(f"CSV parsing error: {e}")
            # Fallback to reading as plain text
            return await self._extract_plain_text(file_path)

    async def _extract_json_text(self, file_path: Path) -> str:
        """Extract and format text from JSON file"""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Convert JSON to readable text format
        def json_to_text(obj, prefix=""):
            text_parts = []

            if isinstance(obj, dict):
                for key, value in obj.items():
                    if isinstance(value, (dict, list)):
                        text_parts.append(f"{prefix}{key}:")
                        text_parts.extend(json_to_text(value, prefix + "  "))
                    else:
                        text_parts.append(f"{prefix}{key}: {value}")
            elif isinstance(obj, list):
                for i, item in enumerate(obj):
                    if isinstance(item, (dict, list)):
                        text_parts.append(f"{prefix}Item {i + 1}:")
                        text_parts.extend(json_to_text(item, prefix + "  "))
                    else:
                        text_parts.append(f"{prefix}Item {i + 1}: {item}")
            else:
                text_parts.append(f"{prefix}{obj}")

            return text_parts

        return "\n".join(json_to_text(data))

    async def _extract_plain_text(self, file_path: Path) -> str:
        """Extract text from plain text files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Try with latin-1 encoding
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    async def extract_text_from_path(self, file_path: Path, content_type: str) -> str:
        """Public wrapper for text extraction from file path"""
        return await self._extract_text(file_path, content_type)

    async def chunk_text_simple(self, text: str) -> List[str]:
        """Public wrapper for simple text chunking without document_id"""
        chunks = []
        chunk_size = self.chunk_size * 4  # ~2048 chars
        overlap = self.chunk_overlap * 4  # ~512 chars

        for i in range(0, len(text), chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if chunk.strip():
                chunks.append(chunk)

        return chunks

    async def _chunk_text(self, text: str, document_id: str) -> List[Dict[str, Any]]:
        """
        Split text into overlapping chunks optimized for embeddings.

        Returns:
            List of chunk dictionaries with content and metadata
        """
        # Simple sentence-aware chunking
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        chunks = []
        current_chunk = ""
        current_tokens = 0
        chunk_index = 0

        for sentence in sentences:
            sentence_tokens = len(sentence.split())

            # If adding this sentence would exceed chunk size, save current chunk
            if current_tokens + sentence_tokens > self.chunk_size and current_chunk:
                # Create chunk with overlap from previous chunk
                chunk_content = current_chunk.strip()
                if chunk_content:
                    chunks.append({
                        "document_id": document_id,
                        "chunk_index": chunk_index,
                        "content": chunk_content,
                        "token_count": current_tokens,
                        "content_hash": hashlib.md5(chunk_content.encode()).hexdigest()
                    })
                    chunk_index += 1

                # Start new chunk with overlap
                if self.chunk_overlap > 0 and chunks:
                    # Take last few sentences for overlap
                    overlap_sentences = current_chunk.split('.')[-2:]  # Rough overlap
                    current_chunk = '. '.join(s.strip() for s in overlap_sentences if s.strip())
                    current_tokens = len(current_chunk.split())
                else:
                    current_chunk = ""
                    current_tokens = 0

            # Add sentence to current chunk
            if current_chunk:
                current_chunk += ". " + sentence
            else:
                current_chunk = sentence
            current_tokens += sentence_tokens

        # Add final chunk
        if current_chunk.strip():
            chunk_content = current_chunk.strip()
            chunks.append({
                "document_id": document_id,
                "chunk_index": chunk_index,
                "content": chunk_content,
                "token_count": current_tokens,
                "content_hash": hashlib.md5(chunk_content.encode()).hexdigest()
            })

        logger.info(f"Created {len(chunks)} chunks from document {document_id}")
        return chunks

    async def _generate_embeddings_for_chunks(
        self,
        chunks: List[Dict[str, Any]],
        dataset_id: str,
        user_id: str
    ):
        """
        Generate embeddings for all chunks using concurrent batch processing.

        Processes chunks in batches with controlled concurrency to optimize performance
        while preventing system overload. Includes retry logic and progressive storage.
        """

        if not chunks:
            return

        total_chunks = len(chunks)
        document_id = chunks[0]["document_id"]
        total_batches = (total_chunks + self.EMBEDDING_BATCH_SIZE - 1) // self.EMBEDDING_BATCH_SIZE

        logger.info(f"Starting concurrent embedding generation for {total_chunks} chunks")
        logger.info(f"Batch size: {self.EMBEDDING_BATCH_SIZE}, Total batches: {total_batches}, Max concurrent: {self.MAX_CONCURRENT_BATCHES}")

        # Create semaphore to limit concurrent batches
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_BATCHES)

        # Create batch data with metadata
        batch_tasks = []
        for batch_start in range(0, total_chunks, self.EMBEDDING_BATCH_SIZE):
            batch_end = min(batch_start + self.EMBEDDING_BATCH_SIZE, total_chunks)
            batch_chunks = chunks[batch_start:batch_end]
            batch_num = (batch_start // self.EMBEDDING_BATCH_SIZE) + 1

            batch_metadata = {
                "chunks": batch_chunks,
                "batch_num": batch_num,
                "start_index": batch_start,
                "end_index": batch_end,
                "dataset_id": dataset_id,
                "user_id": user_id,
                "document_id": document_id
            }

            # Create task for this batch
            task = self._process_batch_with_semaphore(semaphore, batch_metadata, total_batches, total_chunks)
            batch_tasks.append(task)

        # Process all batches concurrently
        logger.info(f"Starting concurrent processing of {len(batch_tasks)} batches")
        start_time = asyncio.get_event_loop().time()

        results = await asyncio.gather(*batch_tasks, return_exceptions=True)

        end_time = asyncio.get_event_loop().time()
        processing_time = end_time - start_time

        # Analyze results
        successful_batches = []
        failed_batches = []

        for i, result in enumerate(results):
            batch_num = i + 1
            if isinstance(result, Exception):
                failed_batches.append({
                    "batch_num": batch_num,
                    "error": str(result)
                })
                logger.error(f"Batch {batch_num} failed: {result}")
            else:
                successful_batches.append(result)

        successful_chunks = sum(len(batch["chunks"]) for batch in successful_batches)

        logger.info(f"Concurrent processing completed in {processing_time:.2f} seconds")
        logger.info(f"Successfully processed {successful_chunks}/{total_chunks} chunks in {len(successful_batches)}/{total_batches} batches")

        # Report final results
        if failed_batches:
            failed_chunk_count = total_chunks - successful_chunks
            error_details = "; ".join([f"Batch {b['batch_num']}: {b['error']}" for b in failed_batches[:3]])
            if len(failed_batches) > 3:
                error_details += f" (and {len(failed_batches) - 3} more failures)"

            raise ValueError(f"Failed to generate embeddings for {failed_chunk_count}/{total_chunks} chunks. Errors: {error_details}")

        logger.info(f"Successfully stored all {total_chunks} chunks with embeddings")

    async def _process_batch_with_semaphore(
        self,
        semaphore: asyncio.Semaphore,
        batch_metadata: Dict[str, Any],
        total_batches: int,
        total_chunks: int
    ) -> Dict[str, Any]:
        """
        Process a single batch with semaphore-controlled concurrency.

        Args:
            semaphore: Concurrency control semaphore
            batch_metadata: Batch information including chunks and metadata
            total_batches: Total number of batches
            total_chunks: Total number of chunks

        Returns:
            Dict with batch processing results
        """
        async with semaphore:
            batch_chunks = batch_metadata["chunks"]
            batch_num = batch_metadata["batch_num"]
            dataset_id = batch_metadata["dataset_id"]
            user_id = batch_metadata["user_id"]
            document_id = batch_metadata["document_id"]

            logger.info(f"Starting batch {batch_num}/{total_batches} ({len(batch_chunks)} chunks)")

            try:
                # Generate embeddings for this batch (pass user_id for billing)
                embeddings = await self._generate_embedding_batch(batch_chunks, user_id=user_id)

                # Store embeddings for this batch immediately
                await self._store_chunk_embeddings(batch_chunks, embeddings, dataset_id, user_id)

                # Update progress in database
                progress_stage = f"Completed batch {batch_num}/{total_batches}"

                # Calculate current progress (approximate since batches complete out of order)
                await self._update_processing_status(
                    document_id, "processing",
                    processing_stage=progress_stage,
                    chunks_processed=batch_num * self.EMBEDDING_BATCH_SIZE,  # Approximate
                    total_chunks_expected=total_chunks
                )

                logger.info(f"Successfully completed batch {batch_num}/{total_batches}")

                return {
                    "batch_num": batch_num,
                    "chunks": batch_chunks,
                    "success": True
                }

            except Exception as e:
                logger.error(f"Failed to process batch {batch_num}/{total_batches}: {e}")
                raise ValueError(f"Batch {batch_num} failed: {str(e)}")

    async def _generate_embedding_batch(
        self,
        batch_chunks: List[Dict[str, Any]],
        user_id: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generate embeddings for a single batch of chunks with retry logic.

        Args:
            batch_chunks: List of chunk dictionaries
            user_id: User ID for usage tracking

        Returns:
            List of embedding vectors

        Raises:
            ValueError: If embedding generation fails after all retries
        """
        texts = [chunk["content"] for chunk in batch_chunks]

        for attempt in range(self.MAX_RETRIES + 1):
            try:
                # Use the configurable embedding client with tenant/user context for billing
                embeddings = await self.embedding_client.generate_embeddings(
                    texts,
                    tenant_id=self.tenant_domain,
                    user_id=str(user_id) if user_id else None
                )

                if len(embeddings) != len(texts):
                    raise ValueError(f"Embedding count mismatch: expected {len(texts)}, got {len(embeddings)}")

                return embeddings

            except Exception as e:
                if attempt < self.MAX_RETRIES:
                    delay = self.INITIAL_RETRY_DELAY * (2 ** attempt)  # Exponential backoff
                    logger.warning(f"Embedding generation attempt {attempt + 1}/{self.MAX_RETRIES + 1} failed: {e}. Retrying in {delay}s...")
                    await asyncio.sleep(delay)
                else:
                    logger.error(f"All {self.MAX_RETRIES + 1} embedding generation attempts failed. Final error: {e}")
                    logger.error(f"Failed request details: texts_count={len(texts)}")
                    raise ValueError(f"Embedding generation failed after {self.MAX_RETRIES + 1} attempts: {str(e)}")

    async def _store_chunk_embeddings(
        self,
        batch_chunks: List[Dict[str, Any]],
        embeddings: List[List[float]],
        dataset_id: str,
        user_id: str
    ):
        """Store chunk embeddings in database with proper error handling."""

        pg_client = await get_postgresql_client()
        for chunk_data, embedding in zip(batch_chunks, embeddings):
            chunk_id = str(uuid.uuid4())

            # Convert embedding list to PostgreSQL array format
            embedding_array = f"[{','.join(map(str, embedding))}]" if embedding else None

            await pg_client.execute_command(
                """INSERT INTO document_chunks (
                    id, document_id, user_id, dataset_id, chunk_index,
                    content, content_hash, token_count, embedding
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::vector)""",
                chunk_id, chunk_data["document_id"], str(user_id),
                dataset_id, chunk_data["chunk_index"], chunk_data["content"],
                chunk_data["content_hash"], chunk_data["token_count"], embedding_array
            )

    async def _update_processing_status(
        self,
        document_id: str,
        status: str,
        error_message: Optional[str] = None,
        processing_stage: Optional[str] = None,
        chunks_processed: Optional[int] = None,
        total_chunks_expected: Optional[int] = None
    ):
        """Update document processing status with progress tracking via metadata JSONB"""

        # Calculate progress percentage if we have the data
        processing_progress = None
        if chunks_processed is not None and total_chunks_expected is not None and total_chunks_expected > 0:
            processing_progress = min(100, int((chunks_processed / total_chunks_expected) * 100))

        # Build progress metadata object
        progress_data = {}
        if processing_stage is not None:
            progress_data['processing_stage'] = processing_stage
        if chunks_processed is not None:
            progress_data['chunks_processed'] = chunks_processed
        if total_chunks_expected is not None:
            progress_data['total_chunks_expected'] = total_chunks_expected
        if processing_progress is not None:
            progress_data['processing_progress'] = processing_progress

        pg_client = await get_postgresql_client()
        if error_message:
            await pg_client.execute_command(
                """UPDATE documents SET
                    processing_status = $1,
                    error_message = $2,
                    metadata = COALESCE(metadata, '{}'::jsonb) || $3::jsonb,
                    updated_at = NOW()
                WHERE id = $4""",
                status, error_message, json.dumps(progress_data), document_id
            )
        else:
            await pg_client.execute_command(
                """UPDATE documents SET
                    processing_status = $1,
                    metadata = COALESCE(metadata, '{}'::jsonb) || $2::jsonb,
                    updated_at = NOW()
                WHERE id = $3""",
                status, json.dumps(progress_data), document_id
            )

    async def _get_existing_document_content(self, document_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Get existing document content and storage type"""
        pg_client = await get_postgresql_client()
        result = await pg_client.fetch_one(
            "SELECT content_text, metadata FROM documents WHERE id = $1",
            document_id
        )
        if result and result["content_text"]:
            # Handle metadata - might be JSON string or dict
            metadata_raw = result["metadata"] or "{}"
            if isinstance(metadata_raw, str):
                try:
                    metadata = json.loads(metadata_raw)
                except json.JSONDecodeError:
                    metadata = {}
            else:
                metadata = metadata_raw or {}
            storage_type = metadata.get("storage_type", "unknown")
            return result["content_text"], storage_type
        return None, None

    async def _update_document_content(self, document_id: str, content: str):
        """Update document with extracted text content"""
        pg_client = await get_postgresql_client()
        await pg_client.execute_command(
            "UPDATE documents SET content_text = $1, updated_at = NOW() WHERE id = $2",
            content, document_id
        )

    async def _update_chunk_count(self, document_id: str, chunk_count: int):
        """Update document with final chunk count"""
        pg_client = await get_postgresql_client()
        await pg_client.execute_command(
            "UPDATE documents SET chunk_count = $1, updated_at = NOW() WHERE id = $2",
            chunk_count, document_id
        )

    async def _generate_document_summary(
        self,
        document_id: str,
        content: str,
        filename: str,
        user_id: str
    ):
        """Generate and store AI summary for the document"""
        try:
            # Use tenant_domain from instance context
            tenant_domain = self.tenant_domain

            # Create summarization service instance
            summarization_service = SummarizationService(tenant_domain, user_id)

            # Generate summary using our new service
            summary = await summarization_service.generate_document_summary(
                document_id=document_id,
                document_content=content,
                document_name=filename
            )

            if summary:
                logger.info(f"Generated summary for document {document_id}: {summary[:100]}...")
            else:
                logger.warning(f"Failed to generate summary for document {document_id}")

        except Exception as e:
            logger.error(f"Error generating document summary for {document_id}: {e}")
            # Don't fail the entire document processing if summarization fails

    async def _update_dataset_summary_after_document_change(
        self,
        dataset_id: str,
        user_id: str
    ):
        """Update dataset summary after a document is added or removed"""
        try:
            # Create summarization service instance
            summarization_service = SummarizationService(self.tenant_domain, user_id)

            # Update dataset summary asynchronously (don't block document processing)
            asyncio.create_task(
                summarization_service.update_dataset_summary_on_change(dataset_id)
            )

            logger.info(f"Triggered dataset summary update for dataset {dataset_id}")

        except Exception as e:
            logger.error(f"Error triggering dataset summary update for {dataset_id}: {e}")
            # Don't fail document processing if dataset summary update fails

    async def get_processing_status(self, document_id: str) -> Dict[str, Any]:
        """Get current processing status of a document with progress information from metadata"""
        pg_client = await get_postgresql_client()
        result = await pg_client.fetch_one(
            """SELECT processing_status, error_message, chunk_count, metadata
            FROM documents WHERE id = $1""",
            document_id
        )

        if not result:
            raise ValueError("Document not found")

        # Extract progress data from metadata JSONB (may arrive as a JSON string or a dict)
        metadata = result["metadata"] or {}
        if isinstance(metadata, str):
            try:
                metadata = json.loads(metadata)
            except json.JSONDecodeError:
                metadata = {}

        return {
            "status": result["processing_status"],
            "error_message": result["error_message"],
            "chunk_count": result["chunk_count"],
            "chunks_processed": metadata.get("chunks_processed"),
            "total_chunks_expected": metadata.get("total_chunks_expected"),
            "processing_progress": metadata.get("processing_progress"),
            "processing_stage": metadata.get("processing_stage")
        }


# Factory function for document processor
async def get_document_processor(tenant_domain=None):
    """Get document processor instance (will create its own DB session when needed)"""
    return DocumentProcessor(tenant_domain=tenant_domain)
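

# Usage sketch (illustrative only, not part of the service API): one way a caller
# might drive the processor end to end. It assumes the application's async context
# is available (PostgreSQL and the embedding service reachable); the tenant domain,
# file path, dataset UUID, and user ID below are hypothetical placeholders, not
# values defined by this module.
if __name__ == "__main__":
    async def _example_run():
        processor = await get_document_processor(tenant_domain="test")
        document = await processor.process_file(
            file_path=Path("/tmp/example.pdf"),  # hypothetical local file
            dataset_id="00000000-0000-0000-0000-000000000000",  # hypothetical dataset UUID
            user_id="example-user",  # hypothetical user ID
            original_filename="example.pdf",
        )
        status = await processor.get_processing_status(document["id"])
        print(status["status"], status.get("processing_stage"))

    asyncio.run(_example_run())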