gt-ai-os-community/apps/tenant-backend/app/models/document.py
HackWeasel b9dfb86260 GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00
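
As a hedged illustration of the hostname-validation change listed above (the helper and allowlist below are hypothetical, not taken from the release): parsing the URL and comparing its hostname against an allowlist avoids the bypasses that naive substring checks permit, e.g. https://trusted.example.com.attacker.net contains the trusted string but is not the trusted host.

from urllib.parse import urlsplit

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allowlist

def is_allowed_url(url: str) -> bool:
    """Accept a URL only if its parsed hostname exactly matches an allowed host."""
    hostname = urlsplit(url).hostname
    return hostname is not None and hostname.lower() in ALLOWED_HOSTS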

"""
Document and RAG Models for GT 2.0 Tenant Backend - Service-Based Architecture
Pydantic models for document entities using the PostgreSQL + PGVector backend.
Stores document metadata, RAG datasets, and processing status.
Perfect tenant isolation - each tenant has separate document data.
All vectors stored encrypted in tenant-specific ChromaDB.
"""
from datetime import datetime
from typing import List, Optional, Dict, Any
from enum import Enum
import uuid
from pydantic import Field, ConfigDict
from app.models.base import BaseServiceModel, BaseCreateModel, BaseUpdateModel, BaseResponseModel
# SQLAlchemy imports for database models
from sqlalchemy import Column, String, Integer, BigInteger, Text, DateTime, Boolean, JSON, ForeignKey
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.core.database import Base
# PGVector import for embeddings
try:
    from pgvector.sqlalchemy import Vector
except ImportError:
    # Fallback if pgvector is not available
    from sqlalchemy import Text as Vector

class DocumentStatus(str, Enum):
    """Document processing status enumeration"""
    UPLOADING = "uploading"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    ARCHIVED = "archived"

class DocumentType(str, Enum):
    """Document type enumeration"""
    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    MD = "md"
    HTML = "html"
    JSON = "json"
    CSV = "csv"
    OTHER = "other"

class Document(BaseServiceModel):
    """
    Document model for GT 2.0 service-based architecture.
    Represents a document with metadata, processing status,
    and RAG integration for knowledge retrieval.
    """
    # Core document properties
    filename: str = Field(..., min_length=1, max_length=255, description="Original filename")
    original_name: str = Field(..., min_length=1, max_length=255, description="User-provided name")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., max_length=100, description="MIME type of the file")
    doc_type: DocumentType = Field(..., description="Document type classification")

    # Storage and processing
    file_path: str = Field(..., description="Storage path for the file")
    content_hash: Optional[str] = Field(None, max_length=64, description="SHA-256 hash of content")
    status: DocumentStatus = Field(default=DocumentStatus.UPLOADING, description="Processing status")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the document owner")
    dataset_id: Optional[str] = Field(None, description="Associated dataset ID")

    # RAG and processing metadata
    content_preview: Optional[str] = Field(None, max_length=500, description="Content preview")
    extracted_text: Optional[str] = Field(None, description="Extracted text content")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")

    # Processing statistics
    chunk_count: int = Field(default=0, description="Number of chunks created")
    vector_count: int = Field(default=0, description="Number of vectors stored")
    processing_time_ms: Optional[float] = Field(None, description="Processing time in milliseconds")

    # Errors and logs
    error_message: Optional[str] = Field(None, description="Error message if processing failed")
    processing_log: List[str] = Field(default_factory=list, description="Processing log entries")

    # Timestamps
    processed_at: Optional[datetime] = Field(None, description="When processing completed")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "documents"

    def mark_processing(self) -> None:
        """Mark document as processing"""
        self.status = DocumentStatus.PROCESSING
        self.update_timestamp()

    def mark_completed(self, chunk_count: int, vector_count: int, processing_time_ms: float) -> None:
        """Mark document processing as completed"""
        self.status = DocumentStatus.COMPLETED
        self.chunk_count = chunk_count
        self.vector_count = vector_count
        self.processing_time_ms = processing_time_ms
        self.processed_at = datetime.utcnow()
        self.update_timestamp()

    def mark_failed(self, error_message: str) -> None:
        """Mark document processing as failed"""
        self.status = DocumentStatus.FAILED
        self.error_message = error_message
        self.update_timestamp()

    def add_log_entry(self, message: str) -> None:
        """Add a processing log entry"""
        timestamp = datetime.utcnow().isoformat()
        self.processing_log.append(f"[{timestamp}] {message}")
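

# --- Illustrative usage sketch (not part of the original module) ------------
# The helper below is hypothetical; it only shows the intended status
# lifecycle of the Pydantic Document model defined above. `doc` is assumed to
# be an already-loaded Document instance, and the counts are placeholder values.
def _example_document_lifecycle(doc: "Document") -> None:
    doc.mark_processing()
    doc.add_log_entry("Text extraction started")
    try:
        # ... extraction, chunking, and embedding would happen here ...
        doc.mark_completed(chunk_count=12, vector_count=12, processing_time_ms=950.0)
    except Exception as exc:
        doc.mark_failed(str(exc))
        doc.add_log_entry(f"Processing failed: {exc}")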

class RAGDataset(BaseServiceModel):
    """
    RAG Dataset model for organizing documents into collections.
    Groups related documents together for focused retrieval and
    provides dataset-level configuration and statistics.
    """
    # Core dataset properties
    name: str = Field(..., min_length=1, max_length=255, description="Dataset name")
    description: Optional[str] = Field(None, max_length=1000, description="Dataset description")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the dataset owner")

    # Configuration
    chunk_size: int = Field(default=1000, ge=100, le=5000, description="Default chunk size")
    chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Default chunk overlap")
    embedding_model: str = Field(default="all-MiniLM-L6-v2", description="Embedding model to use")

    # Statistics
    document_count: int = Field(default=0, description="Number of documents")
    total_chunks: int = Field(default=0, description="Total chunks across all documents")
    total_vectors: int = Field(default=0, description="Total vectors stored")
    total_size_bytes: int = Field(default=0, description="Total size of all documents")

    # Status
    is_public: bool = Field(default=False, description="Whether dataset is publicly accessible")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "rag_datasets"

    def update_statistics(self, doc_count: int, chunk_count: int, vector_count: int, size_bytes: int) -> None:
        """Update dataset statistics"""
        self.document_count = doc_count
        self.total_chunks = chunk_count
        self.total_vectors = vector_count
        self.total_size_bytes = size_bytes
        self.update_timestamp()

class DatasetDocument(BaseServiceModel):
    """
    Dataset-Document relationship model for GT 2.0 service-based architecture.
    Junction table model that links documents to RAG datasets,
    tracking the relationship and statistics.
    """
    # Core relationship properties
    dataset_id: str = Field(..., description="RAG dataset ID")
    document_id: str = Field(..., description="Document ID")
    user_id: str = Field(..., description="User who added document to dataset")

    # Statistics
    chunk_count: int = Field(default=0, description="Number of chunks for this document")
    vector_count: int = Field(default=0, description="Number of vectors stored for this document")

    # Status
    processing_status: str = Field(default="pending", max_length=50, description="Processing status")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "dataset_documents"

class DocumentChunk(BaseServiceModel):
    """
    Document chunk model for processed document pieces.
    Represents individual chunks of processed documents with
    embeddings and metadata for RAG retrieval.
    """
    # Core chunk properties
    document_id: str = Field(..., description="Parent document ID")
    chunk_index: int = Field(..., ge=0, description="Chunk index within document")
    chunk_text: str = Field(..., min_length=1, description="Chunk text content")

    # Chunk metadata
    chunk_size: int = Field(..., ge=1, description="Character count of chunk")
    token_count: Optional[int] = Field(None, description="Token count for chunk")
    chunk_metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk-specific metadata")

    # Embedding information
    embedding_id: Optional[str] = Field(None, description="Vector store embedding ID")
    embedding_model: Optional[str] = Field(None, max_length=100, description="Model used for embedding")

    # Position and context
    start_char: Optional[int] = Field(None, description="Starting character position in document")
    end_char: Optional[int] = Field(None, description="Ending character position in document")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "document_chunks"

class DocumentCreate(BaseCreateModel):
    """Model for creating new documents"""
    filename: str = Field(..., min_length=1, max_length=255)
    original_name: str = Field(..., min_length=1, max_length=255)
    file_size: int = Field(..., ge=0)
    mime_type: str = Field(..., max_length=100)
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str] = Field(None, max_length=64)
    owner_id: str
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    metadata: Dict[str, Any] = Field(default_factory=dict)

class DocumentUpdate(BaseUpdateModel):
    """Model for updating documents"""
    original_name: Optional[str] = Field(None, min_length=1, max_length=255)
    status: Optional[DocumentStatus] = None
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    extracted_text: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    chunk_count: Optional[int] = Field(None, ge=0)
    vector_count: Optional[int] = Field(None, ge=0)
    processing_time_ms: Optional[float] = None
    error_message: Optional[str] = None
    processed_at: Optional[datetime] = None

class DocumentResponse(BaseResponseModel):
    """Model for document API responses"""
    id: str
    filename: str
    original_name: str
    file_size: int
    mime_type: str
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str]
    status: DocumentStatus
    owner_id: str
    dataset_id: Optional[str]
    content_preview: Optional[str]
    metadata: Dict[str, Any]
    chunk_count: int
    vector_count: int
    processing_time_ms: Optional[float]
    error_message: Optional[str]
    processing_log: List[str]
    processed_at: Optional[datetime]
    created_at: datetime
    updated_at: datetime

class RAGDatasetCreate(BaseCreateModel):
    """Model for creating new RAG datasets"""
    name: str = Field(..., min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    owner_id: str
    chunk_size: int = Field(default=1000, ge=100, le=5000)
    chunk_overlap: int = Field(default=200, ge=0, le=1000)
    embedding_model: str = Field(default="all-MiniLM-L6-v2")
    is_public: bool = Field(default=False)

class RAGDatasetUpdate(BaseUpdateModel):
    """Model for updating RAG datasets"""
    name: Optional[str] = Field(None, min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    chunk_size: Optional[int] = Field(None, ge=100, le=5000)
    chunk_overlap: Optional[int] = Field(None, ge=0, le=1000)
    embedding_model: Optional[str] = None
    is_public: Optional[bool] = None

class RAGDatasetResponse(BaseResponseModel):
    """Model for RAG dataset API responses"""
    id: str
    name: str
    description: Optional[str]
    owner_id: str
    chunk_size: int
    chunk_overlap: int
    embedding_model: str
    document_count: int
    total_chunks: int
    total_vectors: int
    total_size_bytes: int
    is_public: bool
    created_at: datetime
    updated_at: datetime


# SQLAlchemy Database Models for PostgreSQL + PGVector
# Note: Document and DocumentChunk below reuse the names of the Pydantic
# models defined earlier, so these SQLAlchemy classes are the bindings that
# win at module scope when this module is imported.
class Document(Base):
    """SQLAlchemy model for documents table"""
    __tablename__ = "documents"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_type = Column(String(100), nullable=False)
    file_size_bytes = Column(BigInteger, nullable=False)
    file_hash = Column(String(64), nullable=True)
    content_text = Column(Text, nullable=True)
    chunk_count = Column(Integer, default=0)
    processing_status = Column(String(50), default="pending")
    error_message = Column(Text, nullable=True)
    doc_metadata = Column(JSONB, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    chunks = relationship("DocumentChunk", back_populates="document", cascade="all, delete-orphan")

class DocumentChunk(Base):
    """SQLAlchemy model for document_chunks table"""
    __tablename__ = "document_chunks"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id = Column(UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)
    chunk_index = Column(Integer, nullable=False)
    content = Column(Text, nullable=False)
    content_hash = Column(String(32), nullable=True)
    token_count = Column(Integer, nullable=True)

    # PGVector embedding column (1024 dimensions for BGE-M3)
    embedding = Column(Vector(1024), nullable=True)
    chunk_metadata = Column(JSONB, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    document = relationship("Document", back_populates="chunks")

class Dataset(Base):
    """SQLAlchemy model for datasets table"""
    __tablename__ = "datasets"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)  # created_by in schema
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    chunk_size = Column(Integer, default=512)
    chunk_overlap = Column(Integer, default=128)
    embedding_model = Column(String(100), default='BAAI/bge-m3')
    search_method = Column(String(20), default='hybrid')
    specialized_language = Column(Boolean, default=False)
    is_active = Column(Boolean, default=True)
    visibility = Column(String(20), default='individual')
    access_group = Column(String(50), default='individual')
    dataset_metadata = Column(JSONB, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())
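

# --- Illustrative usage sketch (not part of the original module) ------------
# A minimal, hypothetical example of a tenant-scoped similarity search over
# DocumentChunk.embedding. It assumes pgvector is installed (so the column is
# a real Vector rather than the Text fallback above) and that the caller
# supplies a synchronous SQLAlchemy Session; `cosine_distance` is the
# comparator exposed by pgvector's SQLAlchemy integration.
from sqlalchemy.orm import Session


def _example_similar_chunks(session: Session, tenant_id: uuid.UUID,
                            query_embedding: List[float], limit: int = 5) -> List["DocumentChunk"]:
    """Return the chunks closest to query_embedding within one tenant."""
    return (
        session.query(DocumentChunk)
        .filter(DocumentChunk.tenant_id == tenant_id)
        .order_by(DocumentChunk.embedding.cosine_distance(query_embedding))
        .limit(limit)
        .all()
    )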