Files
gt-ai-os-community/apps/tenant-backend/app/services/dataset_sharing.py
HackWeasel b9dfb86260 GT AI OS Community Edition v2.0.33
Security hardening release addressing CodeQL and Dependabot alerts:

- Fix stack trace exposure in error responses
- Add SSRF protection with DNS resolution checking
- Implement proper URL hostname validation (replaces substring matching)
- Add centralized path sanitization to prevent path traversal
- Fix ReDoS vulnerability in email validation regex
- Improve HTML sanitization in validation utilities
- Fix capability wildcard matching in auth utilities
- Update glob dependency to address CVE
- Add CodeQL suppression comments for verified false positives

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 17:04:45 -05:00

585 lines
21 KiB
Python

"""
Dataset Sharing Service for GT 2.0
Implements hierarchical dataset sharing with perfect tenant isolation.
Enables secure data collaboration while maintaining ownership and access control.
"""
import json
import logging
import os
import stat
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from uuid import uuid4

from app.core.security import verify_capability_token
from app.models.access_group import AccessGroup, Resource
from app.services.access_controller import AccessController
logger = logging.getLogger(__name__)
class SharingPermission(Enum):
    """Permission levels that can be granted on a shared dataset.

    The string values are what gets written to, and read back from, the
    persisted share configuration.
    """

    # Can view and search the dataset.
    READ = "read"
    # Can add documents to the dataset.
    WRITE = "write"
    # Can modify the dataset's sharing settings.
    ADMIN = "admin"
@dataclass
class DatasetShare:
    """Sharing configuration for a single dataset.

    Instances round-trip through ``to_dict`` / ``from_dict`` so they can be
    persisted as JSON.
    """

    # Unique identifier of this share record (random UUID4 string by default).
    id: str = field(default_factory=lambda: str(uuid4()))
    # Identifier of the dataset being shared.
    dataset_id: str = ""
    # Identifier of the user who owns the dataset.
    owner_id: str = ""
    # Sharing level applied to the dataset.
    access_group: AccessGroup = AccessGroup.INDIVIDUAL
    # User identifiers the dataset is shared with.
    team_members: List[str] = field(default_factory=list)
    # Per-member permission level, keyed by user identifier.
    team_permissions: Dict[str, SharingPermission] = field(default_factory=dict)
    # Creation timestamp of the share (naive UTC via ``datetime.utcnow``).
    shared_at: datetime = field(default_factory=datetime.utcnow)
    # Optional moment after which the share is no longer valid.
    expires_at: Optional[datetime] = None
    # Whether the share is currently in force.
    is_active: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the share into a JSON-compatible dictionary.

        Enum members are stored by value and datetimes as ISO-8601 strings.
        """
        permission_values = {
            member: level.value for member, level in self.team_permissions.items()
        }

        expiry = None
        if self.expires_at:
            expiry = self.expires_at.isoformat()

        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "owner_id": self.owner_id,
            "access_group": self.access_group.value,
            "team_members": self.team_members,
            "team_permissions": permission_values,
            "shared_at": self.shared_at.isoformat(),
            "expires_at": expiry,
            "is_active": self.is_active,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DatasetShare":
        """Rebuild a share from a dictionary produced by ``to_dict``.

        ``dataset_id``, ``owner_id``, ``access_group`` and ``shared_at`` are
        required keys; the remaining keys fall back to defaults when absent.
        """
        # Fields are read in declaration order so that malformed input fails
        # on the same key it always has.
        share_id = data.get("id", str(uuid4()))
        dataset_id = data["dataset_id"]
        owner_id = data["owner_id"]
        access_group = AccessGroup(data["access_group"])
        members = data.get("team_members", [])

        permissions = {}
        for member, raw_level in data.get("team_permissions", {}).items():
            permissions[member] = SharingPermission(raw_level)

        shared_at = datetime.fromisoformat(data["shared_at"])

        if data.get("expires_at"):
            expires_at = datetime.fromisoformat(data["expires_at"])
        else:
            expires_at = None

        return cls(
            id=share_id,
            dataset_id=dataset_id,
            owner_id=owner_id,
            access_group=access_group,
            team_members=members,
            team_permissions=permissions,
            shared_at=shared_at,
            expires_at=expires_at,
            is_active=data.get("is_active", True),
        )
@dataclass
class DatasetInfo:
    """Descriptive metadata about a dataset, as returned by listing calls."""

    # Dataset identifier.
    id: str
    # Display name of the dataset.
    name: str
    # Free-text description.
    description: str
    # Identifier of the owning user.
    owner_id: str
    # Number of documents in the dataset.
    document_count: int
    # Dataset size in bytes.
    size_bytes: int
    # Creation timestamp.
    created_at: datetime
    # Last-modification timestamp.
    updated_at: datetime
    # Optional labels; every instance gets its own fresh list.
    tags: List[str] = field(default_factory=list)
class DatasetSharingService:
    """
    Service for hierarchical dataset sharing with capability-based access control.

    Features:
    - Individual, Team, and Organization level sharing
    - Granular permission management (read, write, admin)
    - Time-based expiration of shares
    - Tenant isolation through per-tenant file-based storage
    - Event emission for sharing activities

    Storage layout: one JSON file per dataset at
    ``/data/<tenant_domain>/dataset_sharing/shares/<dataset_id>.json``.
    Because both ``tenant_domain`` and ``dataset_id`` become path components,
    they are validated before being used to build a path.
    """

    def __init__(self, tenant_domain: str, access_controller: AccessController):
        """
        Create the service and make sure its storage directories exist.

        Args:
            tenant_domain: Tenant identifier; used as a directory name and
                compared against the ``tenant_id`` of capability tokens
            access_controller: Controller used to update resource access and,
                when it exposes an ``event_bus``, to emit sharing events

        Raises:
            ValueError: If ``tenant_domain`` is not a safe path component
        """
        # Validate first: the value is about to become a directory name.
        self.tenant_domain = self._validate_path_component(tenant_domain, "tenant_domain")
        self.access_controller = access_controller
        self.base_path = Path("/data") / self.tenant_domain / "dataset_sharing"
        self.shares_path = self.base_path / "shares"
        self.datasets_path = self.base_path / "datasets"
        # Ensure directories exist with proper permissions
        self._ensure_directories()
        logger.info("DatasetSharingService initialized for %s", tenant_domain)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _validate_path_component(value: str, label: str) -> str:
        """
        Return ``value`` if it is usable as a single file-system path component.

        Rejects empty values, ``.``/``..``, and anything containing a path
        separator or NUL byte, so the value cannot escape its parent directory.

        Raises:
            ValueError: If the value is not a safe path component
        """
        if (
            not isinstance(value, str)
            or not value
            or value in (".", "..")
            or "/" in value
            or "\\" in value
            or "\x00" in value
        ):
            raise ValueError(f"Invalid {label}: {value!r}")
        return value

    @staticmethod
    def _is_expired(expires_at: Optional[datetime]) -> bool:
        """
        Tell whether an expiration time has passed.

        Naive values are compared against naive UTC "now" (the representation
        used for ``shared_at``); timezone-aware values are compared against an
        aware "now" so the comparison cannot raise ``TypeError``.
        """
        if expires_at is None:
            return False
        if expires_at.tzinfo is not None and expires_at.utcoffset() is not None:
            return datetime.now(timezone.utc) > expires_at
        return datetime.utcnow() > expires_at

    def _ensure_directories(self):
        """Ensure sharing directories exist with proper permissions"""
        for path in (self.shares_path, self.datasets_path):
            path.mkdir(parents=True, exist_ok=True)
            # Set permissions to 700 (owner only)
            os.chmod(path, stat.S_IRWXU)

    def _verify_token(self, capability_token: str) -> Dict[str, Any]:
        """
        Validate a capability token for this tenant.

        The token is rejected when ``verify_capability_token`` returns a falsy
        result or when its ``tenant_id`` differs from this service's tenant.

        NOTE(review): the token is not cross-checked against the ``owner_id`` /
        ``user_id`` arguments of the public methods; confirm whether callers
        are expected to do that.

        Returns:
            The decoded token data

        Raises:
            PermissionError: If the token is invalid or belongs to another tenant
        """
        token_data = verify_capability_token(capability_token)
        if not token_data or token_data.get("tenant_id") != self.tenant_domain:
            raise PermissionError("Invalid capability token")
        return token_data

    async def _emit_event(self, event_type: str, actor_id: str, payload: Dict[str, Any]) -> None:
        """Emit an event through the access controller's event bus, if it has one."""
        if hasattr(self.access_controller, "event_bus"):
            await self.access_controller.event_bus.emit_event(event_type, actor_id, payload)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def share_dataset(
        self,
        dataset_id: str,
        owner_id: str,
        access_group: AccessGroup,
        team_members: Optional[List[str]] = None,
        team_permissions: Optional[Dict[str, SharingPermission]] = None,
        expires_at: Optional[datetime] = None,
        capability_token: str = ""
    ) -> DatasetShare:
        """
        Share a dataset with specified access group.

        NOTE: ownership is checked against ``_load_dataset_resource``, which is
        currently a placeholder that always reports ``"mock_owner"`` as owner.

        Args:
            dataset_id: Dataset to share
            owner_id: Owner of the dataset
            access_group: Level of sharing (Individual, Team, Organization)
            team_members: List of team members (if Team access)
            team_permissions: Permissions for each team member; members without
                an entry default to READ for team shares
            expires_at: Optional expiration time
            capability_token: JWT capability token

        Returns:
            DatasetShare configuration

        Raises:
            PermissionError: If the token is invalid or ``owner_id`` does not
                own the dataset
            ValueError: If ``dataset_id`` is not a safe file name, or team
                sharing is requested without team members
        """
        # Verify capability token
        self._verify_token(capability_token)

        # Reject ids that cannot be used as a file name before doing any work.
        self._validate_path_component(dataset_id, "dataset_id")

        # Verify ownership
        dataset_resource = await self._load_dataset_resource(dataset_id)
        if not dataset_resource or dataset_resource.owner_id != owner_id:
            raise PermissionError("Only dataset owner can modify sharing")

        # Work on copies so the caller's list/dict are never mutated or aliased.
        members = list(team_members) if team_members else []
        permissions = dict(team_permissions) if team_permissions else {}

        if access_group == AccessGroup.TEAM:
            # Validate team members for team sharing
            if not members:
                raise ValueError("Team members required for team sharing")
            # Invalid members are only logged (best-effort), not rejected.
            for member in members:
                if not await self._is_valid_tenant_user(member):
                    logger.warning("Invalid team member: %s", member)
            # Set default permissions for team members
            for member in members:
                permissions.setdefault(member, SharingPermission.READ)

        # Create sharing configuration
        share = DatasetShare(
            dataset_id=dataset_id,
            owner_id=owner_id,
            access_group=access_group,
            team_members=members,
            team_permissions=permissions,
            expires_at=expires_at
        )

        # Store sharing configuration
        await self._store_share(share)

        # Update dataset resource access group
        await self.access_controller.update_resource_access(
            owner_id, dataset_id, access_group, team_members
        )

        # Emit sharing event
        await self._emit_event(
            "dataset.shared",
            owner_id,
            {
                "dataset_id": dataset_id,
                "access_group": access_group.value,
                "team_members": list(members),
                "expires_at": expires_at.isoformat() if expires_at else None
            }
        )

        logger.info("Dataset %s shared as %s by %s", dataset_id, access_group.value, owner_id)
        return share

    async def get_dataset_sharing(
        self,
        dataset_id: str,
        user_id: str,
        capability_token: str
    ) -> Optional[DatasetShare]:
        """
        Get sharing configuration for a dataset.

        Args:
            dataset_id: Dataset ID
            user_id: Requesting user
            capability_token: JWT capability token

        Returns:
            DatasetShare if user has access, None otherwise

        Raises:
            PermissionError: If the token is invalid
            ValueError: If ``dataset_id`` is not a safe file name
        """
        # Verify capability token
        self._verify_token(capability_token)

        # Load sharing configuration
        share = await self._load_share(dataset_id)
        if not share:
            return None

        # Owner can always see
        if share.owner_id == user_id:
            return share
        # Team member can see
        if share.access_group == AccessGroup.TEAM and user_id in share.team_members:
            return share
        # All tenant users can see organization shares
        if share.access_group == AccessGroup.ORGANIZATION:
            if await self._is_valid_tenant_user(user_id):
                return share
        return None

    async def check_dataset_access(
        self,
        dataset_id: str,
        user_id: str,
        permission: SharingPermission = SharingPermission.READ
    ) -> Tuple[bool, Optional[str]]:
        """
        Check if user has specified permission on dataset.

        Args:
            dataset_id: Dataset to check
            user_id: User requesting access
            permission: Required permission level

        Returns:
            Tuple of (allowed, reason)

        Raises:
            ValueError: If ``dataset_id`` is not a safe file name
        """
        share = await self._load_share(dataset_id)
        return await self._evaluate_access(share, user_id, permission)

    async def _evaluate_access(
        self,
        share: Optional[DatasetShare],
        user_id: str,
        permission: SharingPermission
    ) -> Tuple[bool, Optional[str]]:
        """Decide access for an already-loaded share; returns (allowed, reason)."""
        if not share or not share.is_active:
            return False, "Dataset not shared or sharing inactive"

        # Check expiration
        if self._is_expired(share.expires_at):
            return False, "Dataset sharing has expired"

        # Owner has all permissions
        if share.owner_id == user_id:
            return True, "Owner access"

        if share.access_group == AccessGroup.INDIVIDUAL:
            return False, "Private dataset"

        if share.access_group == AccessGroup.TEAM:
            if user_id not in share.team_members:
                return False, "Not a team member"
            # Members without an explicit entry are treated as READ.
            user_permission = share.team_permissions.get(user_id, SharingPermission.READ)
            if self._has_permission(user_permission, permission):
                return True, f"Team member with {user_permission.value} permission"
            return False, f"Insufficient permission: has {user_permission.value}, needs {permission.value}"

        if share.access_group == AccessGroup.ORGANIZATION:
            # Organization sharing is read-only
            if permission != SharingPermission.READ:
                return False, "Organization access is read-only"
            if await self._is_valid_tenant_user(user_id):
                return True, "Organization-wide read access"
            # A READ request can only fail here because of the user check.
            return False, "Not a valid tenant user"

        return False, "Unknown access configuration"

    async def list_accessible_datasets(
        self,
        user_id: str,
        capability_token: str,
        include_owned: bool = True,
        include_shared: bool = True
    ) -> List[DatasetInfo]:
        """
        List datasets accessible to user.

        Only datasets that have an active, unexpired share record are
        considered.

        Args:
            user_id: User requesting list
            capability_token: JWT capability token
            include_owned: Include user's own datasets
            include_shared: Include datasets shared with user

        Returns:
            List of accessible datasets

        Raises:
            PermissionError: If the token is invalid
        """
        # Verify capability token
        self._verify_token(capability_token)

        accessible_datasets = []
        for share in await self._list_all_shares():
            # Skip inactive or expired shares
            if not share.is_active or self._is_expired(share.expires_at):
                continue

            if share.owner_id == user_id:
                # Owned datasets are governed solely by include_owned; they
                # must not be picked up again through the "shared" path.
                has_access = include_owned
            elif include_shared:
                # Evaluate on the share already in memory instead of
                # re-reading its file from disk.
                has_access, _ = await self._evaluate_access(
                    share, user_id, SharingPermission.READ
                )
            else:
                has_access = False

            if has_access:
                dataset_info = await self._load_dataset_info(share.dataset_id)
                if dataset_info:
                    accessible_datasets.append(dataset_info)

        return accessible_datasets

    async def revoke_dataset_sharing(
        self,
        dataset_id: str,
        owner_id: str,
        capability_token: str
    ) -> bool:
        """
        Revoke dataset sharing (make it private).

        Args:
            dataset_id: Dataset to make private
            owner_id: Owner of the dataset
            capability_token: JWT capability token

        Returns:
            True if revoked successfully

        Raises:
            PermissionError: If the token is invalid, no share exists, or
                ``owner_id`` is not the share's owner
            ValueError: If ``dataset_id`` is not a safe file name
        """
        # Verify capability token
        self._verify_token(capability_token)

        # Verify ownership
        share = await self._load_share(dataset_id)
        if not share or share.owner_id != owner_id:
            raise PermissionError("Only dataset owner can revoke sharing")

        # Update sharing to individual (private)
        share.access_group = AccessGroup.INDIVIDUAL
        share.team_members = []
        share.team_permissions = {}
        share.is_active = False

        # Store updated share
        await self._store_share(share)

        # Update resource access
        await self.access_controller.update_resource_access(
            owner_id, dataset_id, AccessGroup.INDIVIDUAL, []
        )

        # Emit revocation event
        await self._emit_event(
            "dataset.sharing_revoked",
            owner_id,
            {"dataset_id": dataset_id}
        )

        logger.info("Dataset %s sharing revoked by %s", dataset_id, owner_id)
        return True

    async def update_team_permissions(
        self,
        dataset_id: str,
        owner_id: str,
        user_id: str,
        permission: SharingPermission,
        capability_token: str
    ) -> bool:
        """
        Update team member permissions for a dataset.

        Args:
            dataset_id: Dataset ID
            owner_id: Owner of the dataset
            user_id: Team member to update
            permission: New permission level
            capability_token: JWT capability token

        Returns:
            True if updated successfully

        Raises:
            PermissionError: If the token is invalid, no share exists, or
                ``owner_id`` is not the share's owner
            ValueError: If the dataset is not team-shared, ``user_id`` is not a
                team member, or ``dataset_id`` is not a safe file name
        """
        # Verify capability token
        self._verify_token(capability_token)

        # Load and verify sharing
        share = await self._load_share(dataset_id)
        if not share or share.owner_id != owner_id:
            raise PermissionError("Only dataset owner can update permissions")
        if share.access_group != AccessGroup.TEAM:
            raise ValueError("Can only update permissions for team-shared datasets")
        if user_id not in share.team_members:
            raise ValueError("User is not a team member")

        # Update permission
        share.team_permissions[user_id] = permission

        # Store updated share
        await self._store_share(share)

        logger.info(
            "Updated %s permission to %s for dataset %s",
            user_id, permission.value, dataset_id
        )
        return True

    async def get_sharing_statistics(
        self,
        user_id: str,
        capability_token: str
    ) -> Dict[str, Any]:
        """
        Get sharing statistics for user.

        Args:
            user_id: User to get stats for
            capability_token: JWT capability token

        Returns:
            Statistics dictionary with keys ``owned_datasets``,
            ``shared_with_me``, ``sharing_breakdown`` (keyed by AccessGroup
            member), ``total_team_members`` and ``expired_shares``

        Raises:
            PermissionError: If the token is invalid
        """
        # Verify capability token
        self._verify_token(capability_token)

        breakdown = {
            AccessGroup.INDIVIDUAL: 0,
            AccessGroup.TEAM: 0,
            AccessGroup.ORGANIZATION: 0
        }
        stats = {
            "owned_datasets": 0,
            "shared_with_me": 0,
            "sharing_breakdown": breakdown,
            "total_team_members": 0,
            "expired_shares": 0
        }

        for share in await self._list_all_shares():
            if share.owner_id == user_id:
                # Count owned datasets
                stats["owned_datasets"] += 1
                # .get() tolerates access groups beyond the three seeded above.
                breakdown[share.access_group] = breakdown.get(share.access_group, 0) + 1
                stats["total_team_members"] += len(share.team_members)
                # Count expired shares
                if self._is_expired(share.expires_at):
                    stats["expired_shares"] += 1
                continue

            # Count datasets shared with user. Team membership only matters for
            # TEAM shares, matching get_dataset_sharing / check_dataset_access.
            shared_via_team = (
                share.access_group == AccessGroup.TEAM
                and user_id in share.team_members
            )
            shared_via_org = share.access_group == AccessGroup.ORGANIZATION
            if (shared_via_team or shared_via_org) and share.is_active:
                if not self._is_expired(share.expires_at):
                    stats["shared_with_me"] += 1

        return stats

    def _has_permission(self, user_permission: SharingPermission, required: SharingPermission) -> bool:
        """Check if user permission satisfies required permission"""
        # Higher rank implies every lower permission.
        permission_hierarchy = {
            SharingPermission.READ: 1,
            SharingPermission.WRITE: 2,
            SharingPermission.ADMIN: 3
        }
        return permission_hierarchy[user_permission] >= permission_hierarchy[required]

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def _share_file(self, dataset_id: str) -> Path:
        """
        Return the path of the share file for ``dataset_id``.

        Raises:
            ValueError: If ``dataset_id`` is not a safe path component
        """
        safe_id = self._validate_path_component(dataset_id, "dataset_id")
        return self.shares_path / f"{safe_id}.json"

    @staticmethod
    def _read_share(share_file: Path) -> DatasetShare:
        """Parse one share file; propagates any I/O or decoding error."""
        with open(share_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        return DatasetShare.from_dict(data)

    async def _store_share(self, share: DatasetShare):
        """
        Store sharing configuration to file system.

        The JSON is written to a temporary file created with mode 600 and then
        moved over the destination with ``os.replace``, so the content is never
        exposed with looser permissions and readers never see a partial file.

        Raises:
            ValueError: If ``share.dataset_id`` is not a safe path component
        """
        share_file = self._share_file(share.dataset_id)
        owner_rw = stat.S_IRUSR | stat.S_IWUSR  # 600
        # Suffix is ".tmp" so the "*.json" glob in _list_all_shares skips it.
        tmp_file = share_file.with_name(f".{share_file.name}.{uuid4().hex}.tmp")

        fd = os.open(tmp_file, os.O_WRONLY | os.O_CREAT | os.O_EXCL, owner_rw)
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(share.to_dict(), f, indent=2)
            os.replace(tmp_file, share_file)
        except BaseException:
            # Do not leave a stray temp file behind on failure.
            try:
                os.unlink(tmp_file)
            except FileNotFoundError:
                pass
            raise

        # Set secure permissions explicitly, independent of the process umask.
        os.chmod(share_file, owner_rw)

    async def _load_share(self, dataset_id: str) -> Optional[DatasetShare]:
        """
        Load sharing configuration from file system.

        Returns:
            The share, or None when no share file exists or it cannot be
            read/parsed (the error is logged)

        Raises:
            ValueError: If ``dataset_id`` is not a safe path component
        """
        share_file = self._share_file(dataset_id)
        if not share_file.exists():
            return None
        try:
            return self._read_share(share_file)
        except Exception as e:
            # An unreadable share is treated as "not shared" (fails closed).
            logger.error("Error loading share for dataset %s: %s", dataset_id, e)
            return None

    async def _list_all_shares(self) -> List[DatasetShare]:
        """List all sharing configurations, skipping (and logging) unreadable files"""
        shares = []
        if not self.shares_path.exists():
            return shares
        for share_file in self.shares_path.glob("*.json"):
            try:
                shares.append(self._read_share(share_file))
            except Exception as e:
                logger.error("Error loading share file %s: %s", share_file, e)
        return shares

    # ------------------------------------------------------------------
    # Placeholders for external integrations
    # ------------------------------------------------------------------

    async def _load_dataset_resource(self, dataset_id: str) -> Optional[Resource]:
        """Load dataset resource (implementation would query storage)"""
        # Placeholder - would integrate with actual resource storage.
        # Always reports "mock_owner" as the owner.
        return Resource(
            id=dataset_id,
            name=f"Dataset {dataset_id}",
            resource_type="dataset",
            owner_id="mock_owner",
            tenant_domain=self.tenant_domain,
            access_group=AccessGroup.INDIVIDUAL
        )

    async def _load_dataset_info(self, dataset_id: str) -> Optional[DatasetInfo]:
        """Load dataset information (implementation would query storage)"""
        # Placeholder - would integrate with actual dataset storage.
        # Returns fixed mock metadata for any id.
        return DatasetInfo(
            id=dataset_id,
            name=f"Dataset {dataset_id}",
            description="Mock dataset for testing",
            owner_id="mock_owner",
            document_count=10,
            size_bytes=1024000,
            created_at=datetime.utcnow(),
            updated_at=datetime.utcnow(),
            tags=["test", "mock"]
        )

    async def _is_valid_tenant_user(self, user_id: str) -> bool:
        """Check if user is valid in tenant (implementation would query user store)"""
        # Placeholder - would integrate with actual user management.
        # Accepts any id containing "@" that ends in .com, .org or .edu.
        return "@" in user_id and user_id.endswith((".com", ".org", ".edu"))