Spaces:
Running
Running
File size: 7,039 Bytes
8755993 |
|
"""
Path obfuscation module for privacy-preserving codebase indexing.
Implements HMAC-based path component hashing to mask sensitive file paths
while preserving directory structure for retrieval. Inspired by Cursor's
privacy features.
"""
import hashlib
import hmac
import json
import logging
import secrets
from pathlib import Path
from typing import Dict, Optional
logger = logging.getLogger(__name__)


class PathObfuscator:
    """
    Obfuscates file paths using HMAC-based hashing.

    Each path component (directory/file name) is hashed separately with
    HMAC-SHA256 and truncated, preserving the directory structure while
    masking actual names. The final component keeps a (hashed) extension so
    that files sharing an extension share the same obfuscated suffix.

    Example (illustrative digests):
        src/payments/invoice_processor.py -> 3f9a1c2e/7b04d1aa/c81e5f02.4d

    Both directions of the mapping, together with the secret key, are
    persisted as JSON in ``mapping_file`` every time a new path is obfuscated.
    """

    # Hex digits kept from each component's HMAC digest.
    _HASH_LENGTH = 8
    # Hex digits kept from the extension's HMAC digest.
    _EXT_HASH_LENGTH = 2
    # Collision handling lengthens the last component in these steps, up to
    # the full SHA-256 hex digest.
    _HASH_STEP = 4
    _MAX_HASH_LENGTH = 64

    def __init__(self, secret_key: Optional[str] = None, mapping_file: Optional[str] = None):
        """
        Initialize path obfuscator.

        Args:
            secret_key: Secret key for HMAC. When omitted, the key stored in
                ``mapping_file`` is reused if there is one; otherwise a new
                random key is generated.
            mapping_file: File to store path mappings for deobfuscation
        """
        self.mapping_file = mapping_file or "chroma_db/.path_mapping.json"
        self.obfuscated_to_original: Dict[str, str] = {}
        self.original_to_obfuscated: Dict[str, str] = {}

        # Left empty when no key was supplied so that _load_mappings() can
        # adopt the persisted key; hashes then stay stable across restarts.
        self.secret_key = secret_key or ""
        self._load_mappings()
        if not self.secret_key:
            self.secret_key = self._generate_key()

    def _generate_key(self) -> str:
        """Generate a random secret key (64 hex characters)."""
        return secrets.token_hex(32)

    def _hash_component(self, component: str, length: int = 8) -> str:
        """
        Hash a single path component using HMAC-SHA256.

        Args:
            component: Path component (directory or file name)
            length: Number of leading hex digits of the digest to keep

        Returns:
            Hashed component (shortened for readability)
        """
        digest = hmac.new(
            self.secret_key.encode(),
            component.encode(),
            hashlib.sha256,
        ).hexdigest()
        return digest[:length]

    def _hash_parts(self, parts: tuple, last_length: int) -> str:
        """Hash every component of ``parts`` and join the results with '/'.

        Only the final component is treated as a possible file name, and
        ``last_length`` controls how many hex digits of its hash are kept.
        """
        last_index = len(parts) - 1
        hashed = []
        for index, component in enumerate(parts):
            if index != last_index:
                # Directories always use the default length so the same
                # directory hashes identically in every path.
                hashed.append(self._hash_component(component))
                continue

            stem, dot, ext = component.rpartition('.')
            if dot and stem:
                # File with extension: hash name and extension separately so
                # the (hashed) extension still identifies the file type.
                hashed_name = self._hash_component(stem, last_length)
                hashed_ext = self._hash_component(ext, self._EXT_HASH_LENGTH)
                hashed.append(f"{hashed_name}.{hashed_ext}")
            else:
                # No extension, or a dotfile such as ".gitignore": hash the
                # whole name rather than an empty stem.
                hashed.append(self._hash_component(component, last_length))
        return '/'.join(hashed)

    def obfuscate_path(self, original_path: str) -> str:
        """
        Obfuscate a file path.

        Args:
            original_path: Original file path (e.g., "src/payments/invoice.py")

        Returns:
            Obfuscated path: '/'-joined hex hashes, one per component, with
            the last component shaped like "<hash>.<hash>" when the file has
            an extension.
        """
        # Already known: return the stored value so results stay stable.
        if original_path in self.original_to_obfuscated:
            return self.original_to_obfuscated[original_path]

        parts = Path(original_path).parts
        length = self._HASH_LENGTH
        obfuscated_path = self._hash_parts(parts, length)

        # Truncated hashes can collide. If this obfuscated path already
        # belongs to a different original, lengthen the last component until
        # it is unique, so the reverse mapping is never silently overwritten.
        while (
            parts
            and length < self._MAX_HASH_LENGTH
            and self.obfuscated_to_original.get(obfuscated_path, original_path) != original_path
        ):
            logger.warning(
                "Hash collision on obfuscated path %s; lengthening final component",
                obfuscated_path,
            )
            length += self._HASH_STEP
            obfuscated_path = self._hash_parts(parts, length)

        self.original_to_obfuscated[original_path] = obfuscated_path
        self.obfuscated_to_original[obfuscated_path] = original_path
        # Persisted on every new path (rewrites the whole mapping file).
        self._save_mappings()
        logger.debug("Obfuscated: %s -> %s", original_path, obfuscated_path)
        return obfuscated_path

    def deobfuscate_path(self, obfuscated_path: str) -> Optional[str]:
        """
        Deobfuscate a file path.

        Args:
            obfuscated_path: Obfuscated path

        Returns:
            Original path or None if not found
        """
        return self.obfuscated_to_original.get(obfuscated_path)

    def _load_mappings(self):
        """Load path mappings (and, if needed, the secret key) from disk.

        Failures are logged and leave the in-memory state unchanged.
        """
        try:
            with open(Path(self.mapping_file), 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.info("No existing path mappings found at %s", self.mapping_file)
            return
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            logger.error("Failed to load path mappings: %s", e)
            return

        if not isinstance(data, dict):
            logger.error("Failed to load path mappings: %s", "top-level JSON value is not an object")
            return
        reverse = data.get('obfuscated_to_original', {})
        forward = data.get('original_to_obfuscated', {})
        if not isinstance(reverse, dict) or not isinstance(forward, dict):
            logger.error("Failed to load path mappings: %s", "mapping entries are not objects")
            return

        self.obfuscated_to_original = reverse
        self.original_to_obfuscated = forward

        stored_key = data.get('secret_key')
        if isinstance(stored_key, str) and stored_key:
            if not self.secret_key:
                # No explicit key: reuse the persisted one so new paths hash
                # consistently with the ones already stored.
                self.secret_key = stored_key
            elif stored_key != self.secret_key:
                logger.warning(
                    "Secret key differs from the one stored in %s; "
                    "new paths will not hash consistently with existing mappings",
                    self.mapping_file,
                )
        logger.info("Loaded %d path mappings", len(self.original_to_obfuscated))

    def _save_mappings(self):
        """Save path mappings and the secret key to disk (best effort).

        Failures are logged, not raised.
        """
        mapping_path = Path(self.mapping_file)
        data = {
            'obfuscated_to_original': self.obfuscated_to_original,
            'original_to_obfuscated': self.original_to_obfuscated,
            'secret_key': self.secret_key,  # Reloaded by _load_mappings()
        }
        try:
            mapping_path.parent.mkdir(parents=True, exist_ok=True)
            # Write to a sibling temp file and rename it into place so an
            # interrupted write cannot leave a truncated mapping file.
            tmp_path = mapping_path.with_name(mapping_path.name + ".tmp")
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            tmp_path.replace(mapping_path)
        except (OSError, TypeError, ValueError) as e:
            logger.error("Failed to save path mappings: %s", e)
        else:
            logger.debug("Saved %d path mappings", len(self.original_to_obfuscated))

    def clear_mappings(self):
        """Clear all in-memory path mappings and delete the mapping file."""
        self.obfuscated_to_original.clear()
        self.original_to_obfuscated.clear()
        try:
            Path(self.mapping_file).unlink()
        except FileNotFoundError:
            # Nothing persisted yet (or already removed): nothing to delete.
            pass
        logger.info("Cleared all path mappings")

    def get_stats(self) -> Dict[str, int]:
        """Get statistics about path mappings.

        Returns:
            Dict with 'total_paths' (number of mapped paths) and
            'unique_directories' (number of distinct parent directories).
        """
        parent_dirs = {str(Path(p).parent) for p in self.original_to_obfuscated}
        return {
            'total_paths': len(self.original_to_obfuscated),
            'unique_directories': len(parent_dirs),
        }
# Module-level singleton: created lazily by get_obfuscator() on first use and
# set back to None by reset_obfuscator().
_obfuscator: Optional[PathObfuscator] = None
def get_obfuscator(
    secret_key: Optional[str] = None,
    mapping_file: Optional[str] = None
) -> PathObfuscator:
    """
    Get the global path obfuscator instance, creating it on first call.

    The arguments are only used when the instance is created. On later calls
    the existing instance is returned unchanged; if the arguments differ from
    its configuration a warning is logged. Call reset_obfuscator() first to
    build an instance with new settings.

    Args:
        secret_key: Secret key for HMAC (auto-generated if not provided)
        mapping_file: File to store path mappings

    Returns:
        PathObfuscator instance
    """
    global _obfuscator
    if _obfuscator is None:
        _obfuscator = PathObfuscator(secret_key, mapping_file)
    elif (
        (secret_key and secret_key != _obfuscator.secret_key)
        or (mapping_file and mapping_file != _obfuscator.mapping_file)
    ):
        # Deliberately does not log the key itself.
        logger.warning(
            "get_obfuscator() called with settings that differ from the existing "
            "global instance; arguments ignored (call reset_obfuscator() first)"
        )
    return _obfuscator
def reset_obfuscator():
    """Reset the global obfuscator (useful for testing).

    Only drops the module-level reference, so the next get_obfuscator() call
    constructs a fresh PathObfuscator. The mapping file on disk is not touched.
    """
    global _obfuscator
    _obfuscator = None
|