"""
|
|
Document and RAG Models for GT 2.0 Tenant Backend - Service-Based Architecture
|
|
|
|
Pydantic models for document entities using the PostgreSQL + PGVector backend.
|
|
Stores document metadata, RAG datasets, and processing status.
|
|
Perfect tenant isolation - each tenant has separate document data.
|
|
All vectors stored encrypted in tenant-specific ChromaDB.
|
|
"""
|
|
|
|
from datetime import datetime
from typing import List, Optional, Dict, Any
from enum import Enum
import uuid

from pydantic import Field, ConfigDict
from app.models.base import BaseServiceModel, BaseCreateModel, BaseUpdateModel, BaseResponseModel

# SQLAlchemy imports for database models
from sqlalchemy import Column, String, Integer, BigInteger, Text, DateTime, Boolean, JSON, ForeignKey
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.core.database import Base

# PGVector import for embeddings
try:
    from pgvector.sqlalchemy import Vector
except ImportError:
    # Fallback if pgvector is not available: the embedding column below degrades
    # to a plain TEXT column and vector similarity operators are unavailable.
    from sqlalchemy import Text as Vector


class DocumentStatus(str, Enum):
    """Document processing status enumeration"""
    UPLOADING = "uploading"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    ARCHIVED = "archived"


class DocumentType(str, Enum):
    """Document type enumeration"""
    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    MD = "md"
    HTML = "html"
    JSON = "json"
    CSV = "csv"
    OTHER = "other"


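# A minimal sketch of how these enums might be used when classifying uploads.
# `doc_type_from_extension` is an illustrative helper, not part of this module's API;
# it relies only on the enum values above matching lowercase file extensions.
#
#     def doc_type_from_extension(extension: str) -> DocumentType:
#         """Map a file extension (e.g. ".pdf") to a DocumentType, defaulting to OTHER."""
#         try:
#             return DocumentType(extension.lower().lstrip("."))
#         except ValueError:
#             return DocumentType.OTHER

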
class Document(BaseServiceModel):
    """
    Document model for GT 2.0 service-based architecture.

    Represents a document with metadata, processing status,
    and RAG integration for knowledge retrieval.
    """

    # Core document properties
    filename: str = Field(..., min_length=1, max_length=255, description="Original filename")
    original_name: str = Field(..., min_length=1, max_length=255, description="User-provided name")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., max_length=100, description="MIME type of the file")
    doc_type: DocumentType = Field(..., description="Document type classification")

    # Storage and processing
    file_path: str = Field(..., description="Storage path for the file")
    content_hash: Optional[str] = Field(None, max_length=64, description="SHA-256 hash of content")
    status: DocumentStatus = Field(default=DocumentStatus.UPLOADING, description="Processing status")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the document owner")
    dataset_id: Optional[str] = Field(None, description="Associated dataset ID")

    # RAG and processing metadata
    content_preview: Optional[str] = Field(None, max_length=500, description="Content preview")
    extracted_text: Optional[str] = Field(None, description="Extracted text content")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata")

    # Processing statistics
    chunk_count: int = Field(default=0, description="Number of chunks created")
    vector_count: int = Field(default=0, description="Number of vectors stored")
    processing_time_ms: Optional[float] = Field(None, description="Processing time in milliseconds")

    # Errors and logs
    error_message: Optional[str] = Field(None, description="Error message if processing failed")
    processing_log: List[str] = Field(default_factory=list, description="Processing log entries")

    # Timestamps
    processed_at: Optional[datetime] = Field(None, description="When processing completed")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "documents"

    def mark_processing(self) -> None:
        """Mark document as processing"""
        self.status = DocumentStatus.PROCESSING
        self.update_timestamp()

    def mark_completed(self, chunk_count: int, vector_count: int, processing_time_ms: float) -> None:
        """Mark document processing as completed"""
        self.status = DocumentStatus.COMPLETED
        self.chunk_count = chunk_count
        self.vector_count = vector_count
        self.processing_time_ms = processing_time_ms
        self.processed_at = datetime.utcnow()
        self.update_timestamp()

    def mark_failed(self, error_message: str) -> None:
        """Mark document processing as failed"""
        self.status = DocumentStatus.FAILED
        self.error_message = error_message
        self.update_timestamp()

    def add_log_entry(self, message: str) -> None:
        """Add a processing log entry"""
        timestamp = datetime.utcnow().isoformat()
        self.processing_log.append(f"[{timestamp}] {message}")


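# A minimal usage sketch of the processing lifecycle (illustrative only; it assumes
# BaseServiceModel supplies `id`, `created_at`, `updated_at`, and `update_timestamp()`,
# which are defined outside this file):
#
#     doc = Document(
#         filename="report_2024.pdf", original_name="Annual Report", file_size=204800,
#         mime_type="application/pdf", doc_type=DocumentType.PDF,
#         file_path="/data/tenant_a/report_2024.pdf", owner_id="user-123",
#     )
#     doc.mark_processing()
#     doc.add_log_entry("Extracted text from 12 pages")
#     doc.mark_completed(chunk_count=12, vector_count=12, processing_time_ms=850.0)

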
class RAGDataset(BaseServiceModel):
    """
    RAG Dataset model for organizing documents into collections.

    Groups related documents together for focused retrieval and
    provides dataset-level configuration and statistics.
    """

    # Core dataset properties
    name: str = Field(..., min_length=1, max_length=255, description="Dataset name")
    description: Optional[str] = Field(None, max_length=1000, description="Dataset description")

    # Owner and access
    owner_id: str = Field(..., description="User ID of the dataset owner")

    # Configuration
    chunk_size: int = Field(default=1000, ge=100, le=5000, description="Default chunk size")
    chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Default chunk overlap")
    embedding_model: str = Field(default="all-MiniLM-L6-v2", description="Embedding model to use")

    # Statistics
    document_count: int = Field(default=0, description="Number of documents")
    total_chunks: int = Field(default=0, description="Total chunks across all documents")
    total_vectors: int = Field(default=0, description="Total vectors stored")
    total_size_bytes: int = Field(default=0, description="Total size of all documents")

    # Status
    is_public: bool = Field(default=False, description="Whether dataset is publicly accessible")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "rag_datasets"

    def update_statistics(self, doc_count: int, chunk_count: int, vector_count: int, size_bytes: int) -> None:
        """Update dataset statistics"""
        self.document_count = doc_count
        self.total_chunks = chunk_count
        self.total_vectors = vector_count
        self.total_size_bytes = size_bytes
        self.update_timestamp()


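# Illustrative chunking arithmetic for the defaults above (a sketch, assuming a simple
# character-based sliding-window splitter, which this module does not itself implement):
# the window advances by chunk_size - chunk_overlap = 1000 - 200 = 800 characters per
# chunk, so a 10,000-character document yields about ceil((10000 - 200) / 800) = 13 chunks.

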
class DatasetDocument(BaseServiceModel):
    """
    Dataset-Document relationship model for GT 2.0 service-based architecture.

    Junction table model that links documents to RAG datasets,
    tracking the relationship and statistics.
    """

    # Core relationship properties
    dataset_id: str = Field(..., description="RAG dataset ID")
    document_id: str = Field(..., description="Document ID")
    user_id: str = Field(..., description="User who added document to dataset")

    # Statistics
    chunk_count: int = Field(default=0, description="Number of chunks for this document")
    vector_count: int = Field(default=0, description="Number of vectors stored for this document")

    # Status
    processing_status: str = Field(default="pending", max_length=50, description="Processing status")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "dataset_documents"


class DocumentChunk(BaseServiceModel):
    """
    Document chunk model for processed document pieces.

    Represents individual chunks of processed documents with
    embeddings and metadata for RAG retrieval.
    """

    # Core chunk properties
    document_id: str = Field(..., description="Parent document ID")
    chunk_index: int = Field(..., ge=0, description="Chunk index within document")
    chunk_text: str = Field(..., min_length=1, description="Chunk text content")

    # Chunk metadata
    chunk_size: int = Field(..., ge=1, description="Character count of chunk")
    token_count: Optional[int] = Field(None, description="Token count for chunk")
    chunk_metadata: Dict[str, Any] = Field(default_factory=dict, description="Chunk-specific metadata")

    # Embedding information
    embedding_id: Optional[str] = Field(None, description="Vector store embedding ID")
    embedding_model: Optional[str] = Field(None, max_length=100, description="Model used for embedding")

    # Position and context
    start_char: Optional[int] = Field(None, description="Starting character position in document")
    end_char: Optional[int] = Field(None, description="Ending character position in document")

    # Model configuration
    model_config = ConfigDict(
        protected_namespaces=(),
        json_encoders={
            datetime: lambda v: v.isoformat() if v else None
        }
    )

    @classmethod
    def get_table_name(cls) -> str:
        """Get the database table name"""
        return "document_chunks"


class DocumentCreate(BaseCreateModel):
    """Model for creating new documents"""
    filename: str = Field(..., min_length=1, max_length=255)
    original_name: str = Field(..., min_length=1, max_length=255)
    file_size: int = Field(..., ge=0)
    mime_type: str = Field(..., max_length=100)
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str] = Field(None, max_length=64)
    owner_id: str
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    metadata: Dict[str, Any] = Field(default_factory=dict)


class DocumentUpdate(BaseUpdateModel):
    """Model for updating documents"""
    original_name: Optional[str] = Field(None, min_length=1, max_length=255)
    status: Optional[DocumentStatus] = None
    dataset_id: Optional[str] = None
    content_preview: Optional[str] = Field(None, max_length=500)
    extracted_text: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    chunk_count: Optional[int] = Field(None, ge=0)
    vector_count: Optional[int] = Field(None, ge=0)
    processing_time_ms: Optional[float] = None
    error_message: Optional[str] = None
    processed_at: Optional[datetime] = None


class DocumentResponse(BaseResponseModel):
    """Model for document API responses"""
    id: str
    filename: str
    original_name: str
    file_size: int
    mime_type: str
    doc_type: DocumentType
    file_path: str
    content_hash: Optional[str]
    status: DocumentStatus
    owner_id: str
    dataset_id: Optional[str]
    content_preview: Optional[str]
    metadata: Dict[str, Any]
    chunk_count: int
    vector_count: int
    processing_time_ms: Optional[float]
    error_message: Optional[str]
    processing_log: List[str]
    processed_at: Optional[datetime]
    created_at: datetime
    updated_at: datetime


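# Sketch of the create -> domain model -> response flow (illustrative only; it assumes
# BaseServiceModel and BaseResponseModel supply the `id`, `created_at`, and `updated_at`
# fields used here, which are defined outside this file):
#
#     payload = DocumentCreate(
#         filename="notes.md", original_name="Notes", file_size=512,
#         mime_type="text/markdown", doc_type=DocumentType.MD,
#         file_path="/data/tenant_a/notes.md", owner_id="user-123",
#     )
#     doc = Document(**payload.model_dump())
#     response = DocumentResponse.model_validate(doc.model_dump())

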
class RAGDatasetCreate(BaseCreateModel):
    """Model for creating new RAG datasets"""
    name: str = Field(..., min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    owner_id: str
    chunk_size: int = Field(default=1000, ge=100, le=5000)
    chunk_overlap: int = Field(default=200, ge=0, le=1000)
    embedding_model: str = Field(default="all-MiniLM-L6-v2")
    is_public: bool = Field(default=False)


class RAGDatasetUpdate(BaseUpdateModel):
    """Model for updating RAG datasets"""
    name: Optional[str] = Field(None, min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    chunk_size: Optional[int] = Field(None, ge=100, le=5000)
    chunk_overlap: Optional[int] = Field(None, ge=0, le=1000)
    embedding_model: Optional[str] = None
    is_public: Optional[bool] = None


class RAGDatasetResponse(BaseResponseModel):
    """Model for RAG dataset API responses"""
    id: str
    name: str
    description: Optional[str]
    owner_id: str
    chunk_size: int
    chunk_overlap: int
    embedding_model: str
    document_count: int
    total_chunks: int
    total_vectors: int
    total_size_bytes: int
    is_public: bool
    created_at: datetime
    updated_at: datetime


# SQLAlchemy Database Models for PostgreSQL + PGVector.
# These ORM classes carry a `DB` suffix so they do not shadow the Pydantic
# `Document` and `DocumentChunk` models defined earlier in this module.

class DocumentDB(Base):
    """SQLAlchemy model for the documents table"""
    __tablename__ = "documents"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_type = Column(String(100), nullable=False)
    file_size_bytes = Column(BigInteger, nullable=False)
    file_hash = Column(String(64), nullable=True)

    content_text = Column(Text, nullable=True)
    chunk_count = Column(Integer, default=0)
    processing_status = Column(String(50), default="pending")
    error_message = Column(Text, nullable=True)

    doc_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    chunks = relationship("DocumentChunkDB", back_populates="document", cascade="all, delete-orphan")


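# Illustrative tenant-scoped query (a sketch only; session and engine setup live
# elsewhere, e.g. in app.core.database, and `tenant_id` / `dataset_id` are caller inputs):
#
#     from sqlalchemy import select
#
#     stmt = (
#         select(DocumentDB)
#         .where(DocumentDB.tenant_id == tenant_id)
#         .where(DocumentDB.dataset_id == dataset_id)
#         .order_by(DocumentDB.created_at.desc())
#     )
#     documents = session.execute(stmt).scalars().all()

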
class DocumentChunkDB(Base):
    """SQLAlchemy model for the document_chunks table"""
    __tablename__ = "document_chunks"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    document_id = Column(UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    dataset_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    chunk_index = Column(Integer, nullable=False)
    content = Column(Text, nullable=False)
    content_hash = Column(String(32), nullable=True)
    token_count = Column(Integer, nullable=True)

    # PGVector embedding column (1024 dimensions for BGE-M3). With the Text fallback
    # import above, this degrades to a plain text column without vector operators.
    embedding = Column(Vector(1024), nullable=True)

    chunk_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    document = relationship("DocumentDB", back_populates="chunks")


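# Sketch of a pgvector similarity search over chunk embeddings (illustrative only;
# it requires the real pgvector import above, a 1024-dimension `query_embedding`,
# and an open `session`, none of which are provided by this module):
#
#     from sqlalchemy import select
#
#     stmt = (
#         select(DocumentChunkDB)
#         .where(DocumentChunkDB.tenant_id == tenant_id)
#         .order_by(DocumentChunkDB.embedding.cosine_distance(query_embedding))
#         .limit(5)
#     )
#     nearest_chunks = session.execute(stmt).scalars().all()

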
class Dataset(Base):
    """SQLAlchemy model for the datasets table"""
    __tablename__ = "datasets"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    user_id = Column(UUID(as_uuid=True), nullable=False, index=True)  # created_by in schema

    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)

    chunk_size = Column(Integer, default=512)
    chunk_overlap = Column(Integer, default=128)
    embedding_model = Column(String(100), default='BAAI/bge-m3')
    search_method = Column(String(20), default='hybrid')
    specialized_language = Column(Boolean, default=False)

    is_active = Column(Boolean, default=True)
    visibility = Column(String(20), default='individual')
    access_group = Column(String(50), default='individual')

    dataset_metadata = Column(JSONB, nullable=True)

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())