| | """ |
| | Local Codebase Pipeline Runner - Processes local codebases for dataset creation. |
| | |
| | This is the main entry point for processing LOCAL CODEBASES (not Git repos). |
| | It orchestrates the entire chunking pipeline for local files, handling both |
| | code files and documentation with intelligent fallback strategies. |
| | |
| | ARCHITECTURE POSITION: |
| | - Local Pipeline Orchestrator: Coordinates local file processing |
| | - Fallback Handler: Intelligent fallback from code to documentation |
| | - Dataset Exporter: Creates final JSONL datasets with statistics |
| | |
| | KEY FEATURES: |
| | 1. Unified processing of Python files and documentation |
| | 2. Intelligent fallback (failed code chunking β documentation chunking) |
| | 3. Hierarchical chunking for Python files |
| | 4. Documentation-aware chunking for markdown/text files |
| | 5. Dataset statistics and metadata generation |
| | |
| | DATA FLOW: |
| | Local files β Type detection β Python chunking (or fallback) β |
| | Documentation chunking β JSONL export β Statistics |
| | |
| | USE CASES: |
| | - Processing locally saved code examples |
| | - Creating datasets from example repositories |
| | - Testing chunking strategies on local files |
| | |
| | USAGE: |
| | python run_python_pipeline.py --name crewai_examples --include crewai |
| | python run_python_pipeline.py --name test_dataset --exclude large_repos |
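
Programmatic equivalent (the CLI flags map directly onto run() below):
    run("crewai_examples", include=["crewai"], exclude=None)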
| | """ |
| |
|
| | from pathlib import Path |
| | import json |
| | import argparse |
| |
|
| | from src.task_3_data_engineering.chunking.hierarchical_chunker import HierarchicalChunker |
| | from src.task_3_data_engineering.export.jsonl_exporter import export_chunks_jsonl |
| | from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats |
| | from src.task_3_data_engineering.export.dataset_metadata import write_dataset_metadata |
| | from src.task_3_data_engineering.chunking.doc_chunker import chunk_document , wrap_doc_chunks |
| |
|
| |
|
INPUT_DIR = Path("data/raw/codebases")           # root of local codebases to process
BASE_OUTPUT_DIR = Path("data/processed/chunks")  # each dataset gets a subdirectory here

DOC_EXTS = {".md", ".txt", ".rst"}  # extensions routed to the documentation chunker


def run(dataset_name: str, include: list[str] | None, exclude: list[str] | None):
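    """Chunk every file under INPUT_DIR and export a JSONL dataset.

    Args:
        dataset_name: Name of the output subdirectory under BASE_OUTPUT_DIR.
        include: Optional whitelist of top-level directory names to process.
        exclude: Optional blacklist of top-level directory names to skip.
    """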
    output_dir = BASE_OUTPUT_DIR / dataset_name
    output_dir.mkdir(parents=True, exist_ok=True)

    chunker = HierarchicalChunker()
    all_chunks = []

    # Recursively collect every file; per-file filtering happens below.
    files = [p for p in INPUT_DIR.rglob("*") if p.is_file()]

    for file_path in files:
        # Filter on the top-level directory name under INPUT_DIR.
        rel = file_path.relative_to(INPUT_DIR).parts
        if include and rel[0] not in include:
            continue
        if exclude and rel[0] in exclude:
            continue

        print(f"Processing: {file_path}")

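        # Python source: try hierarchical chunking first; on failure or an
        # empty result, fall through to the documentation chunker below.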
        if file_path.suffix == ".py":
            try:
                code_chunks = chunker.chunk_file(file_path)
                if code_chunks:
                    all_chunks.extend(code_chunks)
                    continue
            except Exception:
                pass

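        # Documentation files, plus .py files whose code chunking failed above.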
        if file_path.suffix.lower() in DOC_EXTS or file_path.suffix == ".py":
            try:
                raw_text = file_path.read_text(encoding="utf-8", errors="ignore")
            except Exception:
                continue

            if not raw_text.strip():
                continue

            doc_chunks = chunk_document(
                raw_text=raw_text,
                source_name=str(file_path),
                source_url=None,
            )

            all_chunks.extend(wrap_doc_chunks(doc_chunks))

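    # All files processed: export the dataset and its companion reports.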
    export_chunks_jsonl(all_chunks, output_dir / "chunks.jsonl", print_stats=True)

    stats = compute_dataset_stats(all_chunks)

    # Record how many chunks are primary vs. secondary in the hierarchy.
    primary = [c for c in all_chunks if c.hierarchy.is_primary]
    stats["hierarchy"] = {
        "primary_chunks": len(primary),
        "secondary_chunks": len(all_chunks) - len(primary),
    }

    with (output_dir / "dataset_stats.json").open("w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)

    write_dataset_metadata(
        chunks=all_chunks,
        output_path=output_dir / "dataset_metadata.json",
        dataset_name=dataset_name,
        dataset_version="v1",
    )

| | print("\nβ
Dataset built successfully") |
| | print(f" - Files: {len({c.file_path for c in all_chunks})}") |
| | print(f" - Chunks: {len(all_chunks)}") |
| | print(f" - Output: {output_dir}") |
| |
|
| |
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build a chunk dataset from local codebases.")
    parser.add_argument("--name", required=True, help="Name of the output dataset directory")
    parser.add_argument("--include", nargs="+", help="Only process these top-level directories")
    parser.add_argument("--exclude", nargs="+", help="Skip these top-level directories")
    args = parser.parse_args()

    run(args.name, args.include, args.exclude)