#!/usr/bin/env bash
#
# Generate a timestamped tokenization config and run the tokenization step
# for the sample dataset inside the project's Poetry environment.
#
# Requires: poetry on PATH, run from the repository root.

# Strict mode in the body, not the shebang: options on the shebang line are
# lost when the script is invoked as `bash script.sh`.
set -euo pipefail

# Unique per-run timestamp so repeated runs never clobber an earlier config.
TS=$(date '+%Y%m%d%H%M%S')
readonly TS
readonly CONFIG_FILE="configs/tokenization_config_${TS}.json"

# The redirect below fails if the target directory is missing.
mkdir -p configs

# Quoted 'EOF' delimiter: the JSON body is literal — no shell expansion intended.
cat <<'EOF' > "$CONFIG_FILE"
{
    "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
    "gene_id_column": "index",
    "bio_annotations": true,
    "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
    "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
    "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
    "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
    "max_shard_samples": 500,
    "max_seq_len": 2048,
    "pad_length": 2048,
    "add_cls": false,
    "bins": 0,
    "continuous_rank": true,
    "add_disease_annotation": false,
    "include_zero_genes": false,
    "load_dir": "data/processed",
    "save_dir": "data/tokenized"
}
EOF

# NOTE(review): the original used `poetry shell`, which spawns an *interactive*
# subshell — the script blocks there, and after that shell exits the python
# call runs OUTSIDE the virtualenv. `poetry run` executes the command inside
# the project environment non-interactively, which is the scripted equivalent.
poetry run python teddy/data_processing/tokenization/tokenization.py \
    --data_path data/processed/sample_data.h5ad \
    --metadata_path data/processed/sample_data_metadata.json \
    --config_path "$CONFIG_FILE"