{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 1.15625,
      "learning_rate": 1.0530589220438898e-05,
      "loss": 0.3756,
      "step": 5
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.703125,
      "learning_rate": 2.369382574598752e-05,
      "loss": 0.3539,
      "step": 10
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6171875,
      "learning_rate": 3.685706227153615e-05,
      "loss": 0.3358,
      "step": 15
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.61328125,
      "learning_rate": 5.0020298797084764e-05,
      "loss": 0.3247,
      "step": 20
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6796875,
      "learning_rate": 6.31835353226334e-05,
      "loss": 0.3264,
      "step": 25
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.66796875,
      "learning_rate": 7.634677184818202e-05,
      "loss": 0.3223,
      "step": 30
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.72265625,
      "learning_rate": 8.951000837373064e-05,
      "loss": 0.3088,
      "step": 35
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.65234375,
      "learning_rate": 9.210381305193982e-05,
      "loss": 0.3045,
      "step": 40
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.6171875,
      "learning_rate": 9.19461645374081e-05,
      "loss": 0.3089,
      "step": 45
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.671875,
      "learning_rate": 9.166783579068711e-05,
      "loss": 0.305,
      "step": 50
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.625,
      "learning_rate": 9.12698044516507e-05,
      "loss": 0.3025,
      "step": 55
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.65234375,
      "learning_rate": 9.075346861989402e-05,
      "loss": 0.301,
      "step": 60
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.62109375,
      "learning_rate": 9.012064194385754e-05,
      "loss": 0.2987,
      "step": 65
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.6171875,
      "learning_rate": 8.937354725032093e-05,
      "loss": 0.2934,
      "step": 70
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.6015625,
      "learning_rate": 8.851480873664277e-05,
      "loss": 0.2906,
      "step": 75
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.55859375,
      "learning_rate": 8.754744275317171e-05,
      "loss": 0.2889,
      "step": 80
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.57421875,
      "learning_rate": 8.647484720820588e-05,
      "loss": 0.2857,
      "step": 85
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.578125,
      "learning_rate": 8.530078963271592e-05,
      "loss": 0.2853,
      "step": 90
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.63671875,
      "learning_rate": 8.402939394675527e-05,
      "loss": 0.2861,
      "step": 95
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.56640625,
      "learning_rate": 8.266512597404037e-05,
      "loss": 0.281,
      "step": 100
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3017810583114624,
      "eval_runtime": 2.9203,
      "eval_samples_per_second": 18.491,
      "eval_steps_per_second": 18.491,
      "step": 100
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.66015625,
      "learning_rate": 8.121277775558209e-05,
      "loss": 0.2349,
      "step": 105
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.58203125,
      "learning_rate": 7.967745071746685e-05,
      "loss": 0.2345,
      "step": 110
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.6171875,
      "learning_rate": 7.806453775191155e-05,
      "loss": 0.2296,
      "step": 115
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.5546875,
      "learning_rate": 7.637970427453278e-05,
      "loss": 0.2277,
      "step": 120
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.6171875,
      "learning_rate": 7.4628868324368e-05,
      "loss": 0.2305,
      "step": 125
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.625,
      "learning_rate": 7.281817977654755e-05,
      "loss": 0.2355,
      "step": 130
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.56640625,
      "learning_rate": 7.095399874063389e-05,
      "loss": 0.2291,
      "step": 135
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.5703125,
      "learning_rate": 6.90428732205049e-05,
      "loss": 0.2308,
      "step": 140
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.5703125,
      "learning_rate": 6.709151611425138e-05,
      "loss": 0.2344,
      "step": 145
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.55859375,
      "learning_rate": 6.510678163487767e-05,
      "loss": 0.2316,
      "step": 150
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.546875,
      "learning_rate": 6.309564123462851e-05,
      "loss": 0.2317,
      "step": 155
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.60546875,
      "learning_rate": 6.106515911750884e-05,
      "loss": 0.2291,
      "step": 160
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.5703125,
      "learning_rate": 5.9022467426009464e-05,
      "loss": 0.2263,
      "step": 165
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.5859375,
      "learning_rate": 5.6974741189196505e-05,
      "loss": 0.2306,
      "step": 170
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.50390625,
      "learning_rate": 5.4929173120160137e-05,
      "loss": 0.2259,
      "step": 175
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.5625,
      "learning_rate": 5.289294835134747e-05,
      "loss": 0.2308,
      "step": 180
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.5546875,
      "learning_rate": 5.087321919652288e-05,
      "loss": 0.2265,
      "step": 185
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.51171875,
      "learning_rate": 4.887708002800489e-05,
      "loss": 0.2264,
      "step": 190
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.55078125,
      "learning_rate": 4.6911542357424886e-05,
      "loss": 0.2237,
      "step": 195
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.50390625,
      "learning_rate": 4.498351020753657e-05,
      "loss": 0.2241,
      "step": 200
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.28645917773246765,
      "eval_runtime": 2.8041,
      "eval_samples_per_second": 19.258,
      "eval_steps_per_second": 19.258,
      "step": 200
    }
  ],
  "logging_steps": 5,
  "max_steps": 300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.51548994781184e+17,
  "train_batch_size": 140,
  "trial_name": null,
  "trial_params": null
}