GT AI OS Community Edition v2.0.33

Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
HackWeasel
2025-12-12 17:04:45 -05:00
commit b9dfb86260
746 changed files with 232071 additions and 0 deletions

View File

@@ -0,0 +1,160 @@
"""
Document Pydantic schemas for GT 2.0 Tenant Backend
Defines request/response schemas for document and RAG operations.
"""
from datetime import datetime
from typing import Optional, List, Dict, Any

from pydantic import BaseModel, Field, ValidationInfo, field_validator, validator
class DocumentResponse(BaseModel):
    """Document response schema.

    Serialized view of a stored document: identifiers, file metadata,
    processing/RAG state, organization fields, and audit timestamps.
    Configured with ``from_attributes = True`` so instances can be built
    directly from ORM model objects.
    """
    # Identifiers
    id: int
    uuid: str
    # File metadata
    filename: str
    original_filename: str
    file_type: str
    file_extension: str
    file_size_bytes: int
    # Processing / RAG state (content_* fields stay None until processing fills them)
    processing_status: str
    chunk_count: int
    content_summary: Optional[str] = None
    detected_language: Optional[str] = None
    content_type: Optional[str] = None
    keywords: List[str] = Field(default_factory=list)  # default_factory avoids a shared mutable default
    # Ownership and organization
    uploaded_by: str
    tags: List[str] = Field(default_factory=list)
    category: Optional[str] = None
    # Usage counters and visibility flags
    access_count: int = 0
    is_active: bool = True
    is_searchable: bool = True
    # Timestamps; processed_at / last_accessed_at remain None until the event occurs
    created_at: datetime
    updated_at: datetime
    processed_at: Optional[datetime] = None
    last_accessed_at: Optional[datetime] = None
    class Config:
        # Allow population from object attributes (e.g. ORM rows).
        from_attributes = True
class RAGDatasetCreate(BaseModel):
    """Schema for creating a RAG dataset.

    Validates the chunking configuration at request time. In particular,
    ``chunk_overlap`` must be strictly smaller than ``chunk_size`` so that
    every chunk contributes new content.
    """
    dataset_name: str = Field(..., min_length=1, max_length=255)
    description: Optional[str] = Field(None, max_length=1000)
    # Strategy restricted to the supported set via regex pattern.
    chunking_strategy: str = Field(default="hybrid", pattern="^(fixed|semantic|hierarchical|hybrid)$")
    chunk_size: int = Field(default=512, ge=128, le=2048)
    chunk_overlap: int = Field(default=128, ge=0, le=512)
    embedding_model: str = Field(default="BAAI/bge-m3")

    # Pydantic v2 style validator (the file already uses v2 features such as
    # Field(pattern=...) and from_attributes); replaces deprecated v1 @validator.
    @field_validator('chunk_overlap')
    @classmethod
    def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
        """Reject an overlap as large as the chunk itself.

        ``info.data`` contains only fields that already validated, so the
        check is skipped when ``chunk_size`` itself failed validation.
        """
        if 'chunk_size' in info.data and v >= info.data['chunk_size']:
            raise ValueError('chunk_overlap must be less than chunk_size')
        return v
class RAGDatasetResponse(BaseModel):
    """RAG dataset response schema.

    Serialized view of a dataset: its chunking/embedding configuration plus
    aggregate counters (documents, chunks, vectors, total size). Built from
    ORM rows via ``from_attributes = True``.
    """
    id: str
    user_id: str
    dataset_name: str
    description: Optional[str] = None
    # Configuration the dataset was created with (see RAGDatasetCreate)
    chunking_strategy: str
    embedding_model: str
    chunk_size: int
    chunk_overlap: int
    # Aggregate counters; 0 for a freshly created, empty dataset
    document_count: int = 0
    chunk_count: int = 0
    vector_count: int = 0
    total_size_bytes: int = 0
    status: str
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow population from object attributes (e.g. ORM rows).
        from_attributes = True
class DocumentChunkResponse(BaseModel):
    """Document chunk response schema.

    One chunk of a processed document, linking its position within the
    source document (``chunk_index``) to its stored vector (``embedding_id``).
    """
    id: str
    chunk_index: int  # position of the chunk within its source document
    chunk_metadata: Dict[str, Any] = Field(default_factory=dict)  # free-form per-chunk metadata
    embedding_id: str  # identifier of the chunk's vector in the vector store
    created_at: datetime
    class Config:
        # Allow population from object attributes (e.g. ORM rows).
        from_attributes = True
class SearchRequest(BaseModel):
    """Document search request schema.

    NOTE(review): ``dataset_ids=None`` presumably means "search all of the
    caller's datasets" — confirm against the search endpoint implementation.
    """
    query: str = Field(..., min_length=1, max_length=1000)
    dataset_ids: Optional[List[str]] = None  # restrict the search to these datasets when given
    top_k: int = Field(default=5, ge=1, le=20)  # maximum number of results to return
    similarity_threshold: float = Field(default=0.7, ge=0.0, le=1.0)  # minimum similarity score
class SearchResult(BaseModel):
    """Document search result schema.

    A single matching chunk with its similarity score. Provenance fields
    (document/dataset ids and names, chunk index) are optional, so results
    can be returned even when some source metadata is unavailable.
    """
    document_id: Optional[int] = None
    dataset_id: Optional[str] = None
    dataset_name: Optional[str] = None
    text: str  # the matched chunk text
    similarity: float  # similarity score for this match
    metadata: Dict[str, Any] = Field(default_factory=dict)
    filename: Optional[str] = None
    chunk_index: Optional[int] = None
class SearchResponse(BaseModel):
    """Document search response schema.

    Echoes the query back alongside the matched chunks and an optional
    server-side timing measurement.
    """
    query: str
    results: List[SearchResult]
    total_results: int
    search_time_ms: Optional[float] = None  # server-side search duration, when measured
class DocumentContextResponse(BaseModel):
    """Document context response schema.

    Query-scoped context extracted from a single document: the relevant
    chunks plus a combined context string (``context_text``).
    """
    document_id: int
    document_name: str
    query: str
    relevant_chunks: List[SearchResult]
    context_text: str  # chunks combined into a single context string for the query
class RAGStatistics(BaseModel):
    """RAG usage statistics schema.

    Per-user aggregate counts over documents, datasets, and chunks.
    ``processed``/``pending``/``failed`` break documents down by
    processing status.
    """
    user_id: str
    document_count: int
    dataset_count: int
    total_size_bytes: int
    total_size_mb: float  # same quantity as total_size_bytes, in megabytes
    total_chunks: int
    # Document counts broken down by processing status
    processed_documents: int
    pending_documents: int
    failed_documents: int
class ProcessDocumentRequest(BaseModel):
    """Document processing request schema.

    NOTE(review): the field is ``Optional`` yet has a non-None default, so an
    explicit ``null`` in the request body is accepted and bypasses the
    "hybrid" default — confirm that is intended.
    """
    chunking_strategy: Optional[str] = Field(default="hybrid", pattern="^(fixed|semantic|hierarchical|hybrid)$")
class ProcessDocumentResponse(BaseModel):
    """Document processing response schema.

    Outcome of a processing run: resulting chunk count, the ids of the
    vectors written to the vector store, and an optional timing measurement.
    """
    status: str
    document_id: int
    chunk_count: int
    vector_store_ids: List[str]  # ids of the vectors created in the vector store
    processing_time_ms: Optional[float] = None  # server-side processing duration, when measured
class UploadDocumentResponse(BaseModel):
    """Document upload response schema.

    Wraps the created document record; ``processing_initiated`` indicates
    whether background processing was kicked off as part of the upload.
    """
    document: DocumentResponse
    processing_initiated: bool = False
    message: str = "Document uploaded successfully"