| | """ |
| | Deterministic ID generation for code chunks. |
| | |
| | This module provides deterministic hashing for chunk IDs, ensuring that |
| | identical code chunks receive the same ID across runs. This is crucial for: |
| | 1. Version tracking and change detection |
| | 2. Cache consistency |
| | 3. Reproducible datasets |
| | 4. Efficient deduplication |
| | |
| | ID GENERATION STRATEGY: |
| | Hash = SHA256(file_path + chunk_type + name + parent + |
| | start_line + end_line + code + byte_spans) |
| | |
| | Result: prefix_hash (e.g., "primary_5c442008") |
| | |
| | KEY PROPERTIES: |
| | 1. Deterministic: Same input β same ID |
| | 2. Content-aware: Code changes β ID changes |
| | 3. Position-aware: Line/byte changes β ID changes |
| | 4. Hierarchical: Parent relationships affect ID |
| | |
| | USE CASE: |
| | Ensures that during RAG operations, identical code chunks are |
| | recognized as the same entity, improving retrieval accuracy. |
| | |
| | EXAMPLE: |
| | deterministic_chunk_id( |
| | file_path="src/module.py", |
| | chunk_type="class", |
| | name="MyClass", |
| | parent="module", |
| | start_line=10, |
| | end_line=50, |
| | code="class MyClass: ...", |
| | start_byte=100, |
| | end_byte=500 |
| | ) |
| | β "primary_a1b2c3d4" |
| | """ |
| |
|
| | import hashlib |
| | from typing import Optional |
| |
|
| | def deterministic_chunk_id( |
| | *, |
| | file_path: str, |
| | chunk_type: str, |
| | name: Optional[str], |
| | parent: Optional[str], |
| | start_line: Optional[int], |
| | end_line: Optional[int], |
| | code: str, |
| | prefix: str = "primary", |
| | start_byte: Optional[int] = None, |
| | end_byte: Optional[int] = None, |
| | ) -> str: |
| | """ |
| | Generate deterministic chunk ID that includes code content. |
| | |
| | Args: |
| | file_path: Path to source file |
| | chunk_type: Type of chunk (function, class, method, etc.) |
| | name: Name of the symbol |
| | parent: Parent symbol name |
| | start_line: Starting line number |
| | end_line: Ending line number |
| | code: Actual code content |
| | prefix: ID prefix (primary/secondary) |
| | start_byte: Starting byte offset |
| | end_byte: Ending byte offset |
| | |
| | Returns: |
| | Deterministic chunk ID |
| | """ |
| | |
| | payload = f""" |
| | {file_path} |
| | {chunk_type} |
| | {name} |
| | {parent} |
| | {start_line} |
| | {end_line} |
| | {start_byte} |
| | {end_byte} |
| | {code} |
| | """.strip() |
| | |
| | |
| | hash_digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8] |
| | return f"{prefix}_{hash_digest}" |
| |
|