GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
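As a rough illustration of the hostname-validation item above, the sketch below contrasts substring matching with parsing the URL and comparing its exact hostname against an allowlist. The ALLOWED_HOSTS set and helper names are hypothetical and not the project's actual code; the real fix also involves DNS resolution checks, which are omitted here.

    # Hypothetical sketch: exact-hostname allowlisting vs. substring matching.
    from urllib.parse import urlparse

    ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allowlist

    def is_allowed_url_unsafe(url: str) -> bool:
        # Substring matching: "https://api.example.com.attacker.net/" would pass.
        return "api.example.com" in url

    def is_allowed_url(url: str) -> bool:
        # Parse the URL and compare the exact hostname against the allowlist.
        hostname = urlparse(url).hostname
        return hostname is not None and hostname.lower() in ALLOWED_HOSTS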
apps/tenant-backend/app/models/document.py (new file, 435 lines added)
@@ -0,0 +1,435 @@
"""
Document and RAG Models for GT 2.0 Tenant Backend - Service-Based Architecture

Pydantic models for document entities using the PostgreSQL + PGVector backend.
Stores document metadata, RAG datasets, and processing status.
Perfect tenant isolation - each tenant has separate document data.
All vectors stored encrypted in tenant-specific ChromaDB.
"""

from datetime import datetime
from typing import List, Optional, Dict, Any
from enum import Enum
import uuid

from pydantic import Field, ConfigDict
from app.models.base import BaseServiceModel, BaseCreateModel, BaseUpdateModel, BaseResponseModel

# SQLAlchemy imports for database models
from sqlalchemy import Column, String, Integer, BigInteger, Text, DateTime, Boolean, JSON, ForeignKey
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.core.database import Base

# PGVector import for embeddings
try:
    from pgvector.sqlalchemy import Vector
except ImportError:
    # Fallback if pgvector not available
    from sqlalchemy import Text as Vector


class DocumentStatus(str, Enum):
    """Document processing status enumeration"""
    UPLOADING = "uploading"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    ARCHIVED = "archived"


class DocumentType(str, Enum):
    """Document type enumeration"""
    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    MD = "md"
    HTML = "html"
    JSON = "json"
    CSV = "csv"
    OTHER = "other"
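
# Illustrative sketch (not part of this module): deriving a DocumentType from a
# filename extension. The helper name `guess_doc_type` and its extension table are
# hypothetical examples, not the project's actual classification logic.
def guess_doc_type(filename: str) -> DocumentType:
    """Map a filename extension to a DocumentType, defaulting to OTHER."""
    extension_map = {
        ".pdf": DocumentType.PDF,
        ".docx": DocumentType.DOCX,
        ".txt": DocumentType.TXT,
        ".md": DocumentType.MD,
        ".html": DocumentType.HTML,
        ".json": DocumentType.JSON,
        ".csv": DocumentType.CSV,
    }
    suffix = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    return extension_map.get(suffix, DocumentType.OTHER)
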
class Document(BaseServiceModel):
    """
    Document model for GT 2.0 service-based architecture.

    Represents a document with metadata, processing status,
    and RAG integration for knowledge retrieval.
    """

    # Core document properties
    filename: str = Field(..., min_length=1, max_length=255, description="Original filename")
    original_name: str = Field(..., min_length=1, max_length=255, description="User-provided name")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., max_length=100, description="MIME type of the file")
    doc_type: DocumentType = Field(..., description="Document type classification")

    # Storage and processing
    file_path: str = Field(..., description="Storage path for the file")
    content_hash: Optional[str] = Field(None, max_length=64, description="SHA-256 hash of content")
    status: DocumentStatus = Field(default=DocumentStatus.UPLOADING, description="Processing status")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the document owner")
    dataset_id: Optional[str] = Field(None, description="Associated dataset ID")

    # RAG and processing metadata
    content_preview: Optional[str] = Field(None, max_length=500, description="Content preview")
    extracted_text: Optional[str] = Field(None, description="Extracted text content")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")

    # Processing statistics
    chunk_count: int = Field(default=0, description="Number of chunks created")
    vector_count: int = Field(default=0, description="Number of vectors stored")
    processing_time_ms: Optional[float] = Field(None, description="Processing time in milliseconds")

    # Errors and logs
    error_message: Optional[str] = Field(None, description="Error message if processing failed")
    processing_log: List[str] = Field(default_factory=list, description="Processing log entries")

    # Timestamps
    processed_at: Optional[datetime] = Field(None, description="When processing completed")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "documents"

    def mark_processing(self) -> None:
        """Mark document as processing"""
        self.status = DocumentStatus.PROCESSING
        self.update_timestamp()

    def mark_completed(self, chunk_count: int, vector_count: int, processing_time_ms: float) -> None:
        """Mark document processing as completed"""
        self.status = DocumentStatus.COMPLETED
        self.chunk_count = chunk_count
        self.vector_count = vector_count
        self.processing_time_ms = processing_time_ms
        self.processed_at = datetime.utcnow()
        self.update_timestamp()

    def mark_failed(self, error_message: str) -> None:
        """Mark document processing as failed"""
        self.status = DocumentStatus.FAILED
        self.error_message = error_message
        self.update_timestamp()

    def add_log_entry(self, message: str) -> None:
        """Add a processing log entry"""
        timestamp = datetime.utcnow().isoformat()
        self.processing_log.append(f"[{timestamp}] {message}")
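
# Illustrative usage sketch (not part of this module): the Document status
# lifecycle driven by the helpers above. Field values are made up,
# `update_timestamp()` is assumed to come from BaseServiceModel, and any other
# required inherited fields are assumed to have defaults.
def _example_document_lifecycle() -> Document:
    doc = Document(
        filename="report.pdf",
        original_name="Quarterly Report.pdf",
        file_size=128_000,
        mime_type="application/pdf",
        doc_type=DocumentType.PDF,
        file_path="/tenant-data/docs/report.pdf",
        owner_id="user-123",
    )
    doc.add_log_entry("upload received")
    doc.mark_processing()
    # ... extraction, chunking, and embedding happen here ...
    doc.mark_completed(chunk_count=42, vector_count=42, processing_time_ms=1830.5)
    return doc
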
class RAGDataset(BaseServiceModel):
    """
    RAG Dataset model for organizing documents into collections.

    Groups related documents together for focused retrieval and
    provides dataset-level configuration and statistics.
    """

    # Core dataset properties
    name: str = Field(..., min_length=1, max_length=255, description="Dataset name")
    description: Optional[str] = Field(None, max_length=1000, description="Dataset description")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the dataset owner")

    # Configuration
    chunk_size: int = Field(default=1000, ge=100, le=5000, description="Default chunk size")
    chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Default chunk overlap")
    embedding_model: str = Field(default="all-MiniLM-L6-v2", description="Embedding model to use")

    # Statistics
    document_count: int = Field(default=0, description="Number of documents")
    total_chunks: int = Field(default=0, description="Total chunks across all documents")
    total_vectors: int = Field(default=0, description="Total vectors stored")
    total_size_bytes: int = Field(default=0, description="Total size of all documents")

    # Status
    is_public: bool = Field(default=False, description="Whether dataset is publicly accessible")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "rag_datasets"

    def update_statistics(self, doc_count: int, chunk_count: int, vector_count: int, size_bytes: int) -> None:
        """Update dataset statistics"""
        self.document_count = doc_count
        self.total_chunks = chunk_count
        self.total_vectors = vector_count
        self.total_size_bytes = size_bytes
        self.update_timestamp()
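
# Illustrative usage sketch (not part of this module): refreshing RAGDataset
# statistics from the documents it contains. The helper name is hypothetical;
# only documents that finished processing are counted here.
def _example_refresh_dataset_stats(dataset: RAGDataset, documents: List[Document]) -> None:
    completed = [d for d in documents if d.status == DocumentStatus.COMPLETED]
    dataset.update_statistics(
        doc_count=len(completed),
        chunk_count=sum(d.chunk_count for d in completed),
        vector_count=sum(d.vector_count for d in completed),
        size_bytes=sum(d.file_size for d in completed),
    )
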
class DatasetDocument(BaseServiceModel):
    """
    Dataset-Document relationship model for GT 2.0 service-based architecture.

    Junction table model that links documents to RAG datasets,
    tracking the relationship and statistics.
    """

    # Core relationship properties
    dataset_id: str = Field(..., description="RAG dataset ID")
    document_id: str = Field(..., description="Document ID")
    user_id: str = Field(..., description="User who added document to dataset")

    # Statistics
    chunk_count: int = Field(default=0, description="Number of chunks for this document")
    vector_count: int = Field(default=0, description="Number of vectors stored for this document")

    # Status
    processing_status: str = Field(default="pending", max_length=50, description="Processing status")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "dataset_documents"


class DocumentChunk(BaseServiceModel):
    """
    Document chunk model for processed document pieces.

    Represents individual chunks of processed documents with
    embeddings and metadata for RAG retrieval.
    """

    # Core chunk properties
    document_id: str = Field(..., description="Parent document ID")
    chunk_index: int = Field(..., ge=0, description="Chunk index within document")
    chunk_text: str = Field(..., min_length=1, description="Chunk text content")

    # Chunk metadata
    chunk_size: int = Field(..., ge=1, description="Character count of chunk")
    token_count: Optional[int] = Field(None, description="Token count for chunk")
    chunk_metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk-specific metadata")

    # Embedding information
    embedding_id: Optional[str] = Field(None, description="Vector store embedding ID")
    embedding_model: Optional[str] = Field(None, max_length=100, description="Model used for embedding")

    # Position and context
    start_char: Optional[int] = Field(None, description="Starting character position in document")
    end_char: Optional[int] = Field(None, description="Ending character position in document")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "document_chunks"
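
# Illustrative sketch (not part of this module): splitting extracted text into
# DocumentChunk records using a dataset's chunk_size / chunk_overlap settings.
# The helper name `_example_chunk_text` is hypothetical, `document.id` is assumed
# to come from BaseServiceModel, and the real pipeline may split on sentence or
# token boundaries rather than fixed character windows.
def _example_chunk_text(document: Document, dataset: RAGDataset) -> List[DocumentChunk]:
    text = document.extracted_text or ""
    step = max(dataset.chunk_size - dataset.chunk_overlap, 1)
    chunks: List[DocumentChunk] = []
    for index, start in enumerate(range(0, len(text), step)):
        piece = text[start:start + dataset.chunk_size]
        if not piece:
            break
        chunks.append(
            DocumentChunk(
                document_id=document.id,  # assumed BaseServiceModel identifier
                chunk_index=index,
                chunk_text=piece,
                chunk_size=len(piece),
                start_char=start,
                end_char=start + len(piece),
            )
        )
    return chunks
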
class DocumentCreate(BaseCreateModel):
    """Model for creating new documents"""
    filename: str = Field(..., min_length=1, max_length=255)
    original_name: str = Field(..., min_length=1, max_length=255)
    file_size: int = Field(..., ge=0)
    mime_type: str = Field(..., max_length=100)
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str] = Field(None, max_length=64)
    owner_id: str
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    metadata: Dict[str, Any] = Field(default_factory=dict)


class DocumentUpdate(BaseUpdateModel):
    """Model for updating documents"""
    original_name: Optional[str] = Field(None, min_length=1, max_length=255)
    status: Optional[DocumentStatus] = None
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    extracted_text: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    chunk_count: Optional[int] = Field(None, ge=0)
    vector_count: Optional[int] = Field(None, ge=0)
    processing_time_ms: Optional[float] = None
    error_message: Optional[str] = None
    processed_at: Optional[datetime] = None


class DocumentResponse(BaseResponseModel):
    """Model for document API responses"""
    id: str
    filename: str
    original_name: str
    file_size: int
    mime_type: str
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str]
    status: DocumentStatus
    owner_id: str
    dataset_id: Optional[str]
    content_preview: Optional[str]
    metadata: Dict[str, Any]
    chunk_count: int
    vector_count: int
    processing_time_ms: Optional[float]
    error_message: Optional[str]
    processing_log: List[str]
    processed_at: Optional[datetime]
    created_at: datetime
    updated_at: datetime


class RAGDatasetCreate(BaseCreateModel):
    """Model for creating new RAG datasets"""
    name: str = Field(..., min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    owner_id: str
    chunk_size: int = Field(default=1000, ge=100, le=5000)
    chunk_overlap: int = Field(default=200, ge=0, le=1000)
    embedding_model: str = Field(default="all-MiniLM-L6-v2")
    is_public: bool = Field(default=False)


class RAGDatasetUpdate(BaseUpdateModel):
    """Model for updating RAG datasets"""
    name: Optional[str] = Field(None, min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    chunk_size: Optional[int] = Field(None, ge=100, le=5000)
    chunk_overlap: Optional[int] = Field(None, ge=0, le=1000)
    embedding_model: Optional[str] = None
    is_public: Optional[bool] = None


class RAGDatasetResponse(BaseResponseModel):
    """Model for RAG dataset API responses"""
    id: str
    name: str
    description: Optional[str]
    owner_id: str
    chunk_size: int
    chunk_overlap: int
    embedding_model: str
    document_count: int
    total_chunks: int
    total_vectors: int
    total_size_bytes: int
    is_public: bool
    created_at: datetime
    updated_at: datetime
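
# Illustrative sketch (not part of this module): turning an API create payload
# into a Document entity. `model_dump()` is Pydantic v2's serialization call;
# any extra fields required by BaseServiceModel are assumed to have defaults.
def _example_document_from_create(payload: DocumentCreate) -> Document:
    return Document(**payload.model_dump())
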
# SQLAlchemy Database Models for PostgreSQL + PGVector

class Document(Base):
    """SQLAlchemy model for documents table"""
    __tablename__ = "documents"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_type = Column(String(100), nullable=False)
    file_size_bytes = Column(BigInteger, nullable=False)
    file_hash = Column(String(64), nullable=True)

    content_text = Column(Text, nullable=True)
    chunk_count = Column(Integer, default=0)
    processing_status = Column(String(50), default="pending")
    error_message = Column(Text, nullable=True)

    doc_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    chunks = relationship("DocumentChunk", back_populates="document", cascade="all, delete-orphan")
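
# Illustrative sketch (not part of this module): a tenant-scoped lookup against
# the SQLAlchemy Document model above. Assumes a synchronous SQLAlchemy Session;
# the actual backend may use an async session instead.
def _example_list_tenant_documents(session, tenant_id):
    return (
        session.query(Document)
        .filter(Document.tenant_id == tenant_id)
        .order_by(Document.created_at.desc())
        .all()
    )
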
class DocumentChunk(Base):
    """SQLAlchemy model for document_chunks table"""
    __tablename__ = "document_chunks"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id = Column(UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    chunk_index = Column(Integer, nullable=False)
    content = Column(Text, nullable=False)
    content_hash = Column(String(32), nullable=True)
    token_count = Column(Integer, nullable=True)

    # PGVector embedding column (1024 dimensions for BGE-M3)
    embedding = Column(Vector(1024), nullable=True)

    chunk_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    document = relationship("Document", back_populates="chunks")
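
# Illustrative sketch (not part of this module): cosine-distance retrieval over
# the pgvector `embedding` column, scoped to a tenant and dataset. Assumes a
# synchronous SQLAlchemy Session and that pgvector is installed (the Text-based
# Vector fallback above would not support this operator).
def _example_similar_chunks(session, tenant_id, dataset_id, query_embedding, limit: int = 5):
    return (
        session.query(DocumentChunk)
        .filter(
            DocumentChunk.tenant_id == tenant_id,
            DocumentChunk.dataset_id == dataset_id,
            DocumentChunk.embedding.isnot(None),
        )
        .order_by(DocumentChunk.embedding.cosine_distance(query_embedding))
        .limit(limit)
        .all()
    )
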
class Dataset(Base):
    """SQLAlchemy model for datasets table"""
    __tablename__ = "datasets"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)  # created_by in schema

    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)

    chunk_size = Column(Integer, default=512)
    chunk_overlap = Column(Integer, default=128)
    embedding_model = Column(String(100), default='BAAI/bge-m3')
    search_method = Column(String(20), default='hybrid')
    specialized_language = Column(Boolean, default=False)

    is_active = Column(Boolean, default=True)
    visibility = Column(String(20), default='individual')
    access_group = Column(String(50), default='individual')

    dataset_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())
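
# Illustrative sketch (not part of this module): creating the pgvector extension
# and the tables defined above on a fresh database. Assumes a synchronous engine
# and sufficient privileges; a real deployment likely relies on migrations instead.
def _example_create_schema(engine) -> None:
    from sqlalchemy import text

    with engine.begin() as connection:
        connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
    Base.metadata.create_all(bind=engine)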