#!/usr/bin/env bash
#
# Generate a timestamped tokenization config and run the tokenization step
# for the sample dataset inside the project's Poetry environment.
#
# Requires: poetry on PATH, run from the repository root.

# Strict mode in the body, not the shebang: options on the shebang line are
# lost when the script is invoked as `bash script.sh`.
set -euo pipefail

# Unique per-run timestamp so repeated runs never clobber an earlier config.
TS=$(date '+%Y%m%d%H%M%S')
readonly TS
readonly CONFIG_FILE="configs/tokenization_config_${TS}.json"

# The redirect below fails if the target directory is missing.
mkdir -p configs

# Quoted 'EOF' delimiter: the JSON body is literal — no shell expansion intended.
cat <<'EOF' > "$CONFIG_FILE"
{
    "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
    "gene_id_column": "index",
    "bio_annotations": true,
    "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
    "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
    "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
    "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
    "max_shard_samples": 500,
    "max_seq_len": 2048,
    "pad_length": 2048,
    "add_cls": false,
    "bins": 0,
    "continuous_rank": true,
    "add_disease_annotation": false,
    "include_zero_genes": false,
    "load_dir": "data/processed",
    "save_dir": "data/tokenized"
}
EOF

# NOTE(review): the original used `poetry shell`, which spawns an *interactive*
# subshell — the script blocks there, and after that shell exits the python
# call runs OUTSIDE the virtualenv. `poetry run` executes the command inside
# the project environment non-interactively, which is the scripted equivalent.
poetry run python teddy/data_processing/tokenization/tokenization.py \
    --data_path data/processed/sample_data.h5ad \
    --metadata_path data/processed/sample_data_metadata.json \
    --config_path "$CONFIG_FILE"