#!/bin/bash
#
# Launches ./peptide/rectify_train.py on the v3 rectified dataset, resuming
# from the PepReDi_v2 checkpoint.
#
# All paths are relative, so run this from the directory that contains
# ./peptide. The script's exit status is the exit status of the python process.

# Thread count for OpenMP-backed libraries.
export OMP_NUM_THREADS=64

# NCCL / UCX communication settings. These are specific to the cluster's
# fabric (HCA names, bonded interface); see the NCCL environment-variable
# documentation before changing any of them.
# NOTE(review): NCCL_ALGO=RING restricts algorithm selection, so the NVLS
# algorithms enabled by NCCL_NVLS_ENABLE=1 may never be chosen -- confirm the
# combination is intended.
export NCCL_NVLS_ENABLE=1
export NCCL_IB_ADAPTIVE_ROUTING=1
export NCCL_IB_SL=1
export NCCL_IB_QPS_PER_CONNECTION=2
export NCCL_IB_SPLIT_DATA_ON_QPS=0
export NCCL_IB_HCA=mlx5_15,mlx5_10,mlx5_14,mlx5_13,mlx5_8,mlx5_7,mlx5_9,mlx5_4
export NCCL_SOCKET_IFNAME=bond0
export NCCL_ALGO=RING
export UCX_TLS=rc

# Single source of truth for the dataset and checkpoint locations, so the
# train/validation paths cannot drift apart (the train path previously read
# "ectified_datasets", missing the leading "r").
dataset_root=./peptide/rectified_datasets/v3
ckpt_dir=./peptide/ckpt

# Arguments are kept in an array so each option/value pair stays one word
# and no fragile backslash continuations are needed.
train_args=(
  --train_dataset_path "${dataset_root}/train"
  --val_dataset_path "${dataset_root}/validation"
  --version 3
  --model_dim 512
  --n_heads 8
  --n_layers 6
  --vocab_size 24
  --seq_len 100
  --epochs 50
  --learning_rate 1e-4
  --weight_decay 2e-5
  --label_smoothing 0.0
  --checkpoint_dir "${ckpt_dir}"
  --tc_batches 20
  --tc_k_samples 50
  # NOTE(review): resumes from a v2 checkpoint while --version is 3;
  # left as in the original -- confirm this is the intended warm start.
  --resume_from_checkpoint "${ckpt_dir}/PepReDi_v2.pt"
)

# Kept as the last command so the training exit status becomes the script's.
python ./peptide/rectify_train.py "${train_args[@]}"