GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
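As a rough illustration of the hostname-validation item above, the sketch below contrasts substring matching with parsing the URL and comparing its exact hostname against an allowlist. The ALLOWED_HOSTS set and helper names are hypothetical and not the project's actual code; the real fix also involves DNS resolution checks, which are omitted here.

    # Hypothetical sketch: exact-hostname allowlisting vs. substring matching.
    from urllib.parse import urlparse

    ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allowlist

    def is_allowed_url_unsafe(url: str) -> bool:
        # Substring matching: "https://api.example.com.attacker.net/" would pass.
        return "api.example.com" in url

    def is_allowed_url(url: str) -> bool:
        # Parse the URL and compare the exact hostname against the allowlist.
        hostname = urlparse(url).hostname
        return hostname is not None and hostname.lower() in ALLOWED_HOSTS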
apps/tenant-backend/app/models/document.py (new file, 435 lines added)
@@ -0,0 +1,435 @@
"""
Document and RAG Models for GT 2.0 Tenant Backend - Service-Based Architecture

Pydantic models for document entities using the PostgreSQL + PGVector backend.
Stores document metadata, RAG datasets, and processing status.
Perfect tenant isolation - each tenant has separate document data.
All vectors stored encrypted in tenant-specific ChromaDB.
"""

from datetime import datetime
from typing import List, Optional, Dict, Any
from enum import Enum
import uuid

from pydantic import Field, ConfigDict
from app.models.base import BaseServiceModel, BaseCreateModel, BaseUpdateModel, BaseResponseModel

# SQLAlchemy imports for database models
from sqlalchemy import Column, String, Integer, BigInteger, Text, DateTime, Boolean, JSON, ForeignKey
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.core.database import Base

# PGVector import for embeddings
try:
    from pgvector.sqlalchemy import Vector
except ImportError:
    # Fallback if pgvector not available
    from sqlalchemy import Text as Vector


class DocumentStatus(str, Enum):
    """Document processing status enumeration"""
    UPLOADING = "uploading"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    ARCHIVED = "archived"


class DocumentType(str, Enum):
    """Document type enumeration"""
    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    MD = "md"
    HTML = "html"
    JSON = "json"
    CSV = "csv"
    OTHER = "other"
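
# Illustrative sketch (not part of this module): deriving a DocumentType from a
# filename extension. The helper name `guess_doc_type` and its extension table are
# hypothetical examples, not the project's actual classification logic.
def guess_doc_type(filename: str) -> DocumentType:
    """Map a filename extension to a DocumentType, defaulting to OTHER."""
    extension_map = {
        ".pdf": DocumentType.PDF,
        ".docx": DocumentType.DOCX,
        ".txt": DocumentType.TXT,
        ".md": DocumentType.MD,
        ".html": DocumentType.HTML,
        ".json": DocumentType.JSON,
        ".csv": DocumentType.CSV,
    }
    suffix = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    return extension_map.get(suffix, DocumentType.OTHER)
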
class Document(BaseServiceModel):
    """
    Document model for GT 2.0 service-based architecture.

    Represents a document with metadata, processing status,
    and RAG integration for knowledge retrieval.
    """

    # Core document properties
    filename: str = Field(..., min_length=1, max_length=255, description="Original filename")
    original_name: str = Field(..., min_length=1, max_length=255, description="User-provided name")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., max_length=100, description="MIME type of the file")
    doc_type: DocumentType = Field(..., description="Document type classification")

    # Storage and processing
    file_path: str = Field(..., description="Storage path for the file")
    content_hash: Optional[str] = Field(None, max_length=64, description="SHA-256 hash of content")
    status: DocumentStatus = Field(default=DocumentStatus.UPLOADING, description="Processing status")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the document owner")
    dataset_id: Optional[str] = Field(None, description="Associated dataset ID")

    # RAG and processing metadata
    content_preview: Optional[str] = Field(None, max_length=500, description="Content preview")
    extracted_text: Optional[str] = Field(None, description="Extracted text content")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")

    # Processing statistics
    chunk_count: int = Field(default=0, description="Number of chunks created")
    vector_count: int = Field(default=0, description="Number of vectors stored")
    processing_time_ms: Optional[float] = Field(None, description="Processing time in milliseconds")

    # Errors and logs
    error_message: Optional[str] = Field(None, description="Error message if processing failed")
    processing_log: List[str] = Field(default_factory=list, description="Processing log entries")

    # Timestamps
    processed_at: Optional[datetime] = Field(None, description="When processing completed")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "documents"

    def mark_processing(self) -> None:
        """Mark document as processing"""
        self.status = DocumentStatus.PROCESSING
        self.update_timestamp()

    def mark_completed(self, chunk_count: int, vector_count: int, processing_time_ms: float) -> None:
        """Mark document processing as completed"""
        self.status = DocumentStatus.COMPLETED
        self.chunk_count = chunk_count
        self.vector_count = vector_count
        self.processing_time_ms = processing_time_ms
        self.processed_at = datetime.utcnow()
        self.update_timestamp()

    def mark_failed(self, error_message: str) -> None:
        """Mark document processing as failed"""
        self.status = DocumentStatus.FAILED
        self.error_message = error_message
        self.update_timestamp()

    def add_log_entry(self, message: str) -> None:
        """Add a processing log entry"""
        timestamp = datetime.utcnow().isoformat()
        self.processing_log.append(f"[{timestamp}] {message}")
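
# Illustrative usage sketch (not part of this module): the Document status
# lifecycle driven by the helpers above. Field values are made up,
# `update_timestamp()` is assumed to come from BaseServiceModel, and any other
# required inherited fields are assumed to have defaults.
def _example_document_lifecycle() -> Document:
    doc = Document(
        filename="report.pdf",
        original_name="Quarterly Report.pdf",
        file_size=128_000,
        mime_type="application/pdf",
        doc_type=DocumentType.PDF,
        file_path="/tenant-data/docs/report.pdf",
        owner_id="user-123",
    )
    doc.add_log_entry("upload received")
    doc.mark_processing()
    # ... extraction, chunking, and embedding happen here ...
    doc.mark_completed(chunk_count=42, vector_count=42, processing_time_ms=1830.5)
    return doc
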
class RAGDataset(BaseServiceModel):
    """
    RAG Dataset model for organizing documents into collections.

    Groups related documents together for focused retrieval and
    provides dataset-level configuration and statistics.
    """

    # Core dataset properties
    name: str = Field(..., min_length=1, max_length=255, description="Dataset name")
    description: Optional[str] = Field(None, max_length=1000, description="Dataset description")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the dataset owner")

    # Configuration
    chunk_size: int = Field(default=1000, ge=100, le=5000, description="Default chunk size")
    chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Default chunk overlap")
    embedding_model: str = Field(default="all-MiniLM-L6-v2", description="Embedding model to use")

    # Statistics
    document_count: int = Field(default=0, description="Number of documents")
    total_chunks: int = Field(default=0, description="Total chunks across all documents")
    total_vectors: int = Field(default=0, description="Total vectors stored")
    total_size_bytes: int = Field(default=0, description="Total size of all documents")

    # Status
    is_public: bool = Field(default=False, description="Whether dataset is publicly accessible")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "rag_datasets"

    def update_statistics(self, doc_count: int, chunk_count: int, vector_count: int, size_bytes: int) -> None:
        """Update dataset statistics"""
        self.document_count = doc_count
        self.total_chunks = chunk_count
        self.total_vectors = vector_count
        self.total_size_bytes = size_bytes
        self.update_timestamp()
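
# Illustrative usage sketch (not part of this module): refreshing RAGDataset
# statistics from the documents it contains. The helper name is hypothetical;
# only documents that finished processing are counted here.
def _example_refresh_dataset_stats(dataset: RAGDataset, documents: List[Document]) -> None:
    completed = [d for d in documents if d.status == DocumentStatus.COMPLETED]
    dataset.update_statistics(
        doc_count=len(completed),
        chunk_count=sum(d.chunk_count for d in completed),
        vector_count=sum(d.vector_count for d in completed),
        size_bytes=sum(d.file_size for d in completed),
    )
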
class DatasetDocument(BaseServiceModel):
    """
    Dataset-Document relationship model for GT 2.0 service-based architecture.

    Junction table model that links documents to RAG datasets,
    tracking the relationship and statistics.
    """

    # Core relationship properties
    dataset_id: str = Field(..., description="RAG dataset ID")
    document_id: str = Field(..., description="Document ID")
    user_id: str = Field(..., description="User who added document to dataset")

    # Statistics
    chunk_count: int = Field(default=0, description="Number of chunks for this document")
    vector_count: int = Field(default=0, description="Number of vectors stored for this document")

    # Status
    processing_status: str = Field(default="pending", max_length=50, description="Processing status")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "dataset_documents"


class DocumentChunk(BaseServiceModel):
    """
    Document chunk model for processed document pieces.

    Represents individual chunks of processed documents with
    embeddings and metadata for RAG retrieval.
    """

    # Core chunk properties
    document_id: str = Field(..., description="Parent document ID")
    chunk_index: int = Field(..., ge=0, description="Chunk index within document")
    chunk_text: str = Field(..., min_length=1, description="Chunk text content")

    # Chunk metadata
    chunk_size: int = Field(..., ge=1, description="Character count of chunk")
    token_count: Optional[int] = Field(None, description="Token count for chunk")
    chunk_metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk-specific metadata")

    # Embedding information
    embedding_id: Optional[str] = Field(None, description="Vector store embedding ID")
    embedding_model: Optional[str] = Field(None, max_length=100, description="Model used for embedding")

    # Position and context
    start_char: Optional[int] = Field(None, description="Starting character position in document")
    end_char: Optional[int] = Field(None, description="Ending character position in document")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "document_chunks"
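
# Illustrative sketch (not part of this module): splitting extracted text into
# DocumentChunk records using a dataset's chunk_size / chunk_overlap settings.
# The helper name `_example_chunk_text` is hypothetical, `document.id` is assumed
# to come from BaseServiceModel, and the real pipeline may split on sentence or
# token boundaries rather than fixed character windows.
def _example_chunk_text(document: Document, dataset: RAGDataset) -> List[DocumentChunk]:
    text = document.extracted_text or ""
    step = max(dataset.chunk_size - dataset.chunk_overlap, 1)
    chunks: List[DocumentChunk] = []
    for index, start in enumerate(range(0, len(text), step)):
        piece = text[start:start + dataset.chunk_size]
        if not piece:
            break
        chunks.append(
            DocumentChunk(
                document_id=document.id,  # assumed BaseServiceModel identifier
                chunk_index=index,
                chunk_text=piece,
                chunk_size=len(piece),
                start_char=start,
                end_char=start + len(piece),
            )
        )
    return chunks
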
class DocumentCreate(BaseCreateModel):
    """Model for creating new documents"""
    filename: str = Field(..., min_length=1, max_length=255)
    original_name: str = Field(..., min_length=1, max_length=255)
    file_size: int = Field(..., ge=0)
    mime_type: str = Field(..., max_length=100)
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str] = Field(None, max_length=64)
    owner_id: str
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    metadata: Dict[str, Any] = Field(default_factory=dict)


class DocumentUpdate(BaseUpdateModel):
    """Model for updating documents"""
    original_name: Optional[str] = Field(None, min_length=1, max_length=255)
    status: Optional[DocumentStatus] = None
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    extracted_text: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    chunk_count: Optional[int] = Field(None, ge=0)
    vector_count: Optional[int] = Field(None, ge=0)
    processing_time_ms: Optional[float] = None
    error_message: Optional[str] = None
    processed_at: Optional[datetime] = None


class DocumentResponse(BaseResponseModel):
    """Model for document API responses"""
    id: str
    filename: str
    original_name: str
    file_size: int
    mime_type: str
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str]
    status: DocumentStatus
    owner_id: str
    dataset_id: Optional[str]
    content_preview: Optional[str]
    metadata: Dict[str, Any]
    chunk_count: int
    vector_count: int
    processing_time_ms: Optional[float]
    error_message: Optional[str]
    processing_log: List[str]
    processed_at: Optional[datetime]
    created_at: datetime
    updated_at: datetime


class RAGDatasetCreate(BaseCreateModel):
    """Model for creating new RAG datasets"""
    name: str = Field(..., min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    owner_id: str
    chunk_size: int = Field(default=1000, ge=100, le=5000)
    chunk_overlap: int = Field(default=200, ge=0, le=1000)
    embedding_model: str = Field(default="all-MiniLM-L6-v2")
    is_public: bool = Field(default=False)


class RAGDatasetUpdate(BaseUpdateModel):
    """Model for updating RAG datasets"""
    name: Optional[str] = Field(None, min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    chunk_size: Optional[int] = Field(None, ge=100, le=5000)
    chunk_overlap: Optional[int] = Field(None, ge=0, le=1000)
    embedding_model: Optional[str] = None
    is_public: Optional[bool] = None


class RAGDatasetResponse(BaseResponseModel):
    """Model for RAG dataset API responses"""
    id: str
    name: str
    description: Optional[str]
    owner_id: str
    chunk_size: int
    chunk_overlap: int
    embedding_model: str
    document_count: int
    total_chunks: int
    total_vectors: int
    total_size_bytes: int
    is_public: bool
    created_at: datetime
    updated_at: datetime
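
# Illustrative sketch (not part of this module): turning an API create payload
# into a Document entity. `model_dump()` is Pydantic v2's serialization call;
# any extra fields required by BaseServiceModel are assumed to have defaults.
def _example_document_from_create(payload: DocumentCreate) -> Document:
    return Document(**payload.model_dump())
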
# SQLAlchemy Database Models for PostgreSQL + PGVector

class Document(Base):
    """SQLAlchemy model for documents table"""
    __tablename__ = "documents"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_type = Column(String(100), nullable=False)
    file_size_bytes = Column(BigInteger, nullable=False)
    file_hash = Column(String(64), nullable=True)

    content_text = Column(Text, nullable=True)
    chunk_count = Column(Integer, default=0)
    processing_status = Column(String(50), default="pending")
    error_message = Column(Text, nullable=True)

    doc_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    chunks = relationship("DocumentChunk", back_populates="document", cascade="all, delete-orphan")
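
# Illustrative sketch (not part of this module): a tenant-scoped lookup against
# the SQLAlchemy Document model above. Assumes a synchronous SQLAlchemy Session;
# the actual backend may use an async session instead.
def _example_list_tenant_documents(session, tenant_id):
    return (
        session.query(Document)
        .filter(Document.tenant_id == tenant_id)
        .order_by(Document.created_at.desc())
        .all()
    )
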
class DocumentChunk(Base):
    """SQLAlchemy model for document_chunks table"""
    __tablename__ = "document_chunks"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id = Column(UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    chunk_index = Column(Integer, nullable=False)
    content = Column(Text, nullable=False)
    content_hash = Column(String(32), nullable=True)
    token_count = Column(Integer, nullable=True)

    # PGVector embedding column (1024 dimensions for BGE-M3)
    embedding = Column(Vector(1024), nullable=True)

    chunk_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    document = relationship("Document", back_populates="chunks")
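
# Illustrative sketch (not part of this module): cosine-distance retrieval over
# the pgvector `embedding` column, scoped to a tenant and dataset. Assumes a
# synchronous SQLAlchemy Session and that pgvector is installed (the Text-based
# Vector fallback above would not support this operator).
def _example_similar_chunks(session, tenant_id, dataset_id, query_embedding, limit: int = 5):
    return (
        session.query(DocumentChunk)
        .filter(
            DocumentChunk.tenant_id == tenant_id,
            DocumentChunk.dataset_id == dataset_id,
            DocumentChunk.embedding.isnot(None),
        )
        .order_by(DocumentChunk.embedding.cosine_distance(query_embedding))
        .limit(limit)
        .all()
    )
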
class Dataset(Base):
    """SQLAlchemy model for datasets table"""
    __tablename__ = "datasets"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)  # created_by in schema

    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)

    chunk_size = Column(Integer, default=512)
    chunk_overlap = Column(Integer, default=128)
    embedding_model = Column(String(100), default='BAAI/bge-m3')
    search_method = Column(String(20), default='hybrid')
    specialized_language = Column(Boolean, default=False)

    is_active = Column(Boolean, default=True)
    visibility = Column(String(20), default='individual')
    access_group = Column(String(50), default='individual')

    dataset_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())
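
# Illustrative sketch (not part of this module): creating the pgvector extension
# and the tables defined above on a fresh database. Assumes a synchronous engine
# and sufficient privileges; a real deployment likely relies on migrations instead.
def _example_create_schema(engine) -> None:
    from sqlalchemy import text

    with engine.begin() as connection:
        connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
    Base.metadata.create_all(bind=engine)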