GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
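The hostname-validation and SSRF items above describe a common pattern: parse the URL and compare the hostname exactly (rather than substring-matching the raw string), then resolve the hostname and reject private or loopback addresses before fetching. The helpers below are an illustrative sketch of that pattern only; the function names and the allowlist are assumptions, not the code shipped in this release.

```python
# Illustrative sketch of the hostname-validation / SSRF-check pattern named in
# the commit message; names and policy here are assumptions, not this release's code.
import ipaddress
import socket
from urllib.parse import urlparse

ALLOWED_HOSTS = {"api.example.com"}  # hypothetical allowlist


def is_allowed_url(url: str) -> bool:
    # Exact hostname comparison; a substring check like `"api.example.com" in url`
    # would also accept "https://api.example.com.evil.test/".
    hostname = urlparse(url).hostname
    return hostname is not None and hostname in ALLOWED_HOSTS


def resolves_to_public_ip(hostname: str) -> bool:
    # Resolve the name and reject private/loopback/link-local/reserved ranges,
    # which is the core of a DNS-resolution SSRF check.
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for _family, _type, _proto, _canon, sockaddr in infos:
        ip = ipaddress.ip_address(sockaddr[0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True
```

Note that a complete defense also pins the resolved address for the actual request (otherwise a second lookup can be rebound to an internal address); the sketch only shows the validation shape.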
563 apps/tenant-backend/app/services/conversation_file_service.py (new file)
@@ -0,0 +1,563 @@
"""
Conversation File Service for GT 2.0

Handles conversation-scoped file attachments as a simpler alternative to dataset-based uploads.
Preserves all existing dataset infrastructure while providing direct conversation file storage.
"""

import os
import uuid
import logging
import asyncio
from pathlib import Path
from typing import Dict, Any, List, Optional
from datetime import datetime

from fastapi import UploadFile, HTTPException
from app.core.config import get_settings
from app.core.postgresql_client import get_postgresql_client
from app.core.path_security import sanitize_tenant_domain
from app.services.embedding_client import BGE_M3_EmbeddingClient
from app.services.document_processor import DocumentProcessor

logger = logging.getLogger(__name__)

class ConversationFileService:
    """Service for managing conversation-scoped file attachments"""

    def __init__(self, tenant_domain: str, user_id: str):
        self.tenant_domain = tenant_domain
        self.user_id = user_id
        self.settings = get_settings()
        self.schema_name = f"tenant_{tenant_domain.replace('.', '_').replace('-', '_')}"

        # File storage configuration
        # Sanitize tenant_domain to prevent path traversal
        safe_tenant = sanitize_tenant_domain(tenant_domain)
        # codeql[py/path-injection] safe_tenant validated by sanitize_tenant_domain()
        self.storage_root = Path(self.settings.file_storage_path) / safe_tenant / "conversations"
        self.storage_root.mkdir(parents=True, exist_ok=True)

        logger.info(f"ConversationFileService initialized for {tenant_domain}/{user_id}")

    def _get_conversation_storage_path(self, conversation_id: str) -> Path:
        """Get storage directory for conversation files"""
        conv_path = self.storage_root / conversation_id
        conv_path.mkdir(parents=True, exist_ok=True)
        return conv_path

    def _generate_safe_filename(self, original_filename: str, file_id: str) -> str:
        """Generate safe filename for storage"""
        # Sanitize filename and prepend file ID
        safe_name = "".join(c for c in original_filename if c.isalnum() or c in ".-_")
        return f"{file_id}-{safe_name}"

    async def upload_files(
        self,
        conversation_id: str,
        files: List[UploadFile],
        user_id: str
    ) -> List[Dict[str, Any]]:
        """Upload files directly to conversation"""
        try:
            # Validate conversation access
            await self._validate_conversation_access(conversation_id, user_id)

            uploaded_files = []

            for file in files:
                if not file.filename:
                    raise HTTPException(status_code=400, detail="File must have a filename")

                # Generate file metadata
                file_id = str(uuid.uuid4())
                safe_filename = self._generate_safe_filename(file.filename, file_id)
                conversation_path = self._get_conversation_storage_path(conversation_id)
                file_path = conversation_path / safe_filename

                # Store file to disk
                content = await file.read()
                with open(file_path, "wb") as f:
                    f.write(content)

                # Create database record
                file_record = await self._create_file_record(
                    file_id=file_id,
                    conversation_id=conversation_id,
                    original_filename=file.filename,
                    safe_filename=safe_filename,
                    content_type=file.content_type or "application/octet-stream",
                    file_size=len(content),
                    file_path=str(file_path.relative_to(Path(self.settings.file_storage_path))),
                    uploaded_by=user_id
                )

                uploaded_files.append(file_record)

                # Queue for background processing
                asyncio.create_task(self._process_file_embeddings(file_id))

                logger.info(f"Uploaded conversation file: {file.filename} -> {file_id}")

            return uploaded_files

        except HTTPException:
            raise  # Preserve 400/403 responses instead of wrapping them as 500s
        except Exception as e:
            logger.error(f"Failed to upload conversation files: {e}")
            raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")

    async def _get_user_uuid(self, user_email: str) -> str:
        """Resolve user email to UUID"""
        client = await get_postgresql_client()
        query = f"SELECT id FROM {self.schema_name}.users WHERE email = $1 LIMIT 1"
        result = await client.fetch_one(query, user_email)
        if not result:
            raise ValueError(f"User not found: {user_email}")
        return str(result['id'])

    async def _create_file_record(
        self,
        file_id: str,
        conversation_id: str,
        original_filename: str,
        safe_filename: str,
        content_type: str,
        file_size: int,
        file_path: str,
        uploaded_by: str
    ) -> Dict[str, Any]:
        """Create conversation_files database record"""

        client = await get_postgresql_client()

        # Resolve user email to UUID if needed
        user_uuid = uploaded_by
        if '@' in uploaded_by:  # Check if it's an email
            user_uuid = await self._get_user_uuid(uploaded_by)

        query = f"""
            INSERT INTO {self.schema_name}.conversation_files (
                id, conversation_id, filename, original_filename, content_type,
                file_size_bytes, file_path, uploaded_by, processing_status
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'pending')
            RETURNING id, filename, original_filename, content_type, file_size_bytes,
                      processing_status, uploaded_at
        """

        result = await client.fetch_one(
            query,
            file_id, conversation_id, safe_filename, original_filename,
            content_type, file_size, file_path, user_uuid
        )

        # Convert UUID fields to strings for JSON serialization
        result_dict = dict(result)
        if 'id' in result_dict and result_dict['id']:
            result_dict['id'] = str(result_dict['id'])

        return result_dict

    async def _process_file_embeddings(self, file_id: str):
        """Background task to process file content and generate embeddings"""
        try:
            # Update status to processing
            await self._update_processing_status(file_id, "processing")

            # Get file record
            file_record = await self._get_file_record(file_id)
            if not file_record:
                logger.error(f"File record not found: {file_id}")
                return

            # Read file content
            file_path = Path(self.settings.file_storage_path) / file_record['file_path']
            if not file_path.exists():
                logger.error(f"File not found on disk: {file_path}")
                await self._update_processing_status(file_id, "failed")
                return

            # Extract text content using DocumentProcessor public methods
            processor = DocumentProcessor()

            text_content = await processor.extract_text_from_path(
                file_path,
                file_record['content_type']
            )

            if not text_content:
                logger.warning(f"No text content extracted from {file_record['original_filename']}")
                await self._update_processing_status(file_id, "completed")
                return

            # Chunk content for RAG
            chunks = await processor.chunk_text_simple(text_content)

            # Generate embeddings for full document (single embedding for semantic search)
            embedding_client = BGE_M3_EmbeddingClient()
            embeddings = await embedding_client.generate_embeddings([text_content])

            if not embeddings:
                logger.error(f"Failed to generate embeddings for {file_id}")
                await self._update_processing_status(file_id, "failed")
                return

            # Update record with processed content (chunks as JSONB, embedding as vector)
            await self._update_file_processing_results(
                file_id, chunks, embeddings[0], "completed"
            )

            logger.info(f"Successfully processed file: {file_record['original_filename']}")

        except Exception as e:
            logger.error(f"Failed to process file {file_id}: {e}")
            await self._update_processing_status(file_id, "failed")

    async def _update_processing_status(self, file_id: str, status: str):
        """Update file processing status"""
        client = await get_postgresql_client()

        query = f"""
            UPDATE {self.schema_name}.conversation_files
            SET processing_status = $1,
                processed_at = CASE WHEN $1 IN ('completed', 'failed') THEN NOW() ELSE processed_at END
            WHERE id = $2
        """

        await client.execute_query(query, status, file_id)

    async def _update_file_processing_results(
        self,
        file_id: str,
        chunks: List[str],
        embedding: List[float],
        status: str
    ):
        """Update file with processing results"""
        import json
        client = await get_postgresql_client()

        # Sanitize chunks: strip null bytes, which PostgreSQL
        # cannot store in JSONB strings
        sanitized_chunks = [
            chunk.replace('\u0000', '')
            for chunk in chunks
        ]

        # Convert chunks list to JSONB-compatible format
        chunks_json = json.dumps(sanitized_chunks)

        # Convert embedding to PostgreSQL vector format
        embedding_str = f"[{','.join(map(str, embedding))}]"

        query = f"""
            UPDATE {self.schema_name}.conversation_files
            SET processed_chunks = $1::jsonb,
                embeddings = $2::vector,
                processing_status = $3,
                processed_at = NOW()
            WHERE id = $4
        """

        await client.execute_query(query, chunks_json, embedding_str, status, file_id)

    async def _get_file_record(self, file_id: str) -> Optional[Dict[str, Any]]:
        """Get file record by ID"""
        client = await get_postgresql_client()

        query = f"""
            SELECT id, conversation_id, filename, original_filename, content_type,
                   file_size_bytes, file_path, processing_status, uploaded_at
            FROM {self.schema_name}.conversation_files
            WHERE id = $1
        """

        result = await client.fetch_one(query, file_id)
        return dict(result) if result else None

    async def list_files(self, conversation_id: str) -> List[Dict[str, Any]]:
        """List files attached to conversation"""
        try:
            client = await get_postgresql_client()

            query = f"""
                SELECT id, filename, original_filename, content_type, file_size_bytes,
                       processing_status, uploaded_at, processed_at
                FROM {self.schema_name}.conversation_files
                WHERE conversation_id = $1
                ORDER BY uploaded_at DESC
            """

            rows = await client.execute_query(query, conversation_id)
            return [dict(row) for row in rows]

        except Exception as e:
            logger.error(f"Failed to list conversation files: {e}")
            return []

    async def delete_file(self, conversation_id: str, file_id: str, user_id: str, allow_post_message_deletion: bool = False) -> bool:
        """Delete specific file from conversation

        Args:
            conversation_id: The conversation ID
            file_id: The file ID to delete
            user_id: The user requesting deletion
            allow_post_message_deletion: If False, prevents deletion after messages exist (default: False)
        """
        try:
            logger.info(f"DELETE FILE CALLED: file_id={file_id}, conversation_id={conversation_id}, user_id={user_id}")

            # Validate access
            await self._validate_conversation_access(conversation_id, user_id)
            logger.info("DELETE FILE: Access validated")

            # Check if conversation has messages (unless explicitly allowed to delete post-message)
            if not allow_post_message_deletion:
                client = await get_postgresql_client()
                message_check_query = f"""
                    SELECT COUNT(*) as count
                    FROM {self.schema_name}.messages
                    WHERE conversation_id = $1
                """
                message_count_result = await client.fetch_one(message_check_query, conversation_id)
                message_count = message_count_result['count'] if message_count_result else 0

                if message_count > 0:
                    raise HTTPException(
                        status_code=400,
                        detail="Cannot delete files after conversation has started. Files are part of the conversation context."
                    )

            # Get file record for cleanup
            file_record = await self._get_file_record(file_id)
            logger.info(f"DELETE FILE: file_record={file_record}")
            if not file_record or str(file_record['conversation_id']) != conversation_id:
                logger.warning(f"DELETE FILE FAILED: file not found or conversation mismatch. file_record={file_record}, expected_conv_id={conversation_id}")
                return False

            # Delete from database
            client = await get_postgresql_client()
            query = f"""
                DELETE FROM {self.schema_name}.conversation_files
                WHERE id = $1 AND conversation_id = $2
            """

            rows_deleted = await client.execute_command(query, file_id, conversation_id)

            if rows_deleted > 0:
                # Delete file from disk
                file_path = Path(self.settings.file_storage_path) / file_record['file_path']
                if file_path.exists():
                    file_path.unlink()

                logger.info(f"Deleted conversation file: {file_id}")
                return True

            return False

        except HTTPException:
            raise  # Re-raise HTTPException to preserve status code and message
        except Exception as e:
            logger.error(f"Failed to delete conversation file: {e}")
            return False

    async def search_conversation_files(
        self,
        conversation_id: str,
        query: str,
        max_results: int = 5
    ) -> List[Dict[str, Any]]:
        """Search files within a conversation using vector similarity"""
        try:
            # Generate query embedding
            embedding_client = BGE_M3_EmbeddingClient()
            embeddings = await embedding_client.generate_embeddings([query])

            if not embeddings:
                return []

            query_embedding = embeddings[0]

            # Convert embedding to PostgreSQL vector format
            embedding_str = '[' + ','.join(map(str, query_embedding)) + ']'

            # Vector search against conversation files
            client = await get_postgresql_client()

            search_query = f"""
                SELECT id, filename, original_filename, processed_chunks,
                       1 - (embeddings <=> $1::vector) as similarity_score
                FROM {self.schema_name}.conversation_files
                WHERE conversation_id = $2
                  AND processing_status = 'completed'
                  AND embeddings IS NOT NULL
                  AND 1 - (embeddings <=> $1::vector) > 0.1
                ORDER BY embeddings <=> $1::vector
                LIMIT $3
            """

            rows = await client.execute_query(
                search_query, embedding_str, conversation_id, max_results
            )

            results = []

            for row in rows:
                processed_chunks = row.get('processed_chunks', [])

                if not processed_chunks:
                    continue

                # Handle case where processed_chunks might be returned as JSON string
                if isinstance(processed_chunks, str):
                    import json
                    processed_chunks = json.loads(processed_chunks)

                for idx, chunk_text in enumerate(processed_chunks):
                    results.append({
                        'id': f"{row['id']}_chunk_{idx}",
                        'document_id': row['id'],
                        'document_name': row['original_filename'],
                        'original_filename': row['original_filename'],
                        'chunk_index': idx,
                        'content': chunk_text,
                        'similarity_score': row['similarity_score'],
                        'source': 'conversation_file',
                        'source_type': 'conversation_file'
                    })

                if len(results) >= max_results:
                    results = results[:max_results]
                    break

            logger.info(f"Found {len(results)} chunks from {len(rows)} matching conversation files")
            return results

        except Exception as e:
            logger.error(f"Failed to search conversation files: {e}")
            return []

    async def get_all_chunks_for_conversation(
        self,
        conversation_id: str,
        max_chunks_per_file: int = 50,
        max_total_chunks: int = 100
    ) -> List[Dict[str, Any]]:
        """
        Retrieve ALL chunks from files attached to conversation.
        Non-query-dependent - returns everything up to limits.

        Args:
            conversation_id: UUID of conversation
            max_chunks_per_file: Limit per file (enforces diversity)
            max_total_chunks: Total chunk limit across all files

        Returns:
            List of chunks with metadata, grouped by file
        """
        try:
            client = await get_postgresql_client()

            query = f"""
                SELECT id, filename, original_filename, processed_chunks,
                       file_size_bytes, uploaded_at
                FROM {self.schema_name}.conversation_files
                WHERE conversation_id = $1
                  AND processing_status = 'completed'
                  AND processed_chunks IS NOT NULL
                ORDER BY uploaded_at ASC
            """

            rows = await client.execute_query(query, conversation_id)

            results = []
            total_chunks = 0

            for row in rows:
                if total_chunks >= max_total_chunks:
                    break

                processed_chunks = row.get('processed_chunks', [])

                # Handle JSON string if needed
                if isinstance(processed_chunks, str):
                    import json
                    processed_chunks = json.loads(processed_chunks)

                # Limit chunks per file (diversity enforcement)
                chunks_from_this_file = 0

                for idx, chunk_text in enumerate(processed_chunks):
                    if chunks_from_this_file >= max_chunks_per_file:
                        break
                    if total_chunks >= max_total_chunks:
                        break

                    results.append({
                        'id': f"{row['id']}_chunk_{idx}",
                        'document_id': row['id'],
                        'document_name': row['original_filename'],
                        'original_filename': row['original_filename'],
                        'chunk_index': idx,
                        'total_chunks': len(processed_chunks),
                        'content': chunk_text,
                        'file_size_bytes': row['file_size_bytes'],
                        'source': 'conversation_file',
                        'source_type': 'conversation_file'
                    })

                    chunks_from_this_file += 1
                    total_chunks += 1

            logger.info(f"Retrieved {len(results)} total chunks from {len(rows)} conversation files")
            return results

        except Exception as e:
            logger.error(f"Failed to get all chunks for conversation: {e}")
            return []

    async def _validate_conversation_access(self, conversation_id: str, user_id: str):
        """Validate user has access to conversation"""
        client = await get_postgresql_client()

        query = f"""
            SELECT id FROM {self.schema_name}.conversations
            WHERE id = $1 AND user_id = (
                SELECT id FROM {self.schema_name}.users WHERE email = $2 LIMIT 1
            )
        """

        result = await client.fetch_one(query, conversation_id, user_id)
        if not result:
            raise HTTPException(
                status_code=403,
                detail="Access denied: conversation not found or not owned by this user"
            )

    async def get_file_content(self, file_id: str, user_id: str) -> Optional[bytes]:
        """Get file content for download"""
        try:
            file_record = await self._get_file_record(file_id)
            if not file_record:
                return None

            # Validate access to conversation
            await self._validate_conversation_access(file_record['conversation_id'], user_id)

            # Read file content
            file_path = Path(self.settings.file_storage_path) / file_record['file_path']
            if file_path.exists():
                with open(file_path, "rb") as f:
                    return f.read()

            return None

        except Exception as e:
            logger.error(f"Failed to get file content: {e}")
            return None


# Factory function for service instances
def get_conversation_file_service(tenant_domain: str, user_id: str) -> ConversationFileService:
    """Get conversation file service instance"""
    return ConversationFileService(tenant_domain, user_id)
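For context, callers construct the service per request through the factory above, scoped to a tenant and user. A minimal, hypothetical sketch of how a FastAPI route might wire it up follows; the route path, the `get_current_user` dependency, and the `current_user` attribute names are assumptions for illustration, not part of this commit.

```python
# Hypothetical wiring sketch -- route path and auth dependency are assumptions,
# not part of this commit. get_current_user is an assumed auth dependency.
from typing import List

from fastapi import APIRouter, Depends, UploadFile

router = APIRouter()


@router.post("/conversations/{conversation_id}/files")
async def upload_conversation_files(
    conversation_id: str,
    files: List[UploadFile],
    current_user=Depends(get_current_user),  # assumed to expose tenant_domain/email
):
    # One service instance per request, scoped to the caller's tenant and user
    service = get_conversation_file_service(
        tenant_domain=current_user.tenant_domain,
        user_id=current_user.email,
    )
    return await service.upload_files(conversation_id, files, current_user.email)
```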