{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 280,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 0.6171875,
      "learning_rate": 3.346229462857143e-05,
      "loss": 1.3505,
      "step": 5
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 0.388671875,
      "learning_rate": 7.52901629142857e-05,
      "loss": 1.2791,
      "step": 10
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00011711803120000001,
      "loss": 1.2566,
      "step": 15
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00015894589948571428,
      "loss": 1.2063,
      "step": 20
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00020077376777142858,
      "loss": 1.1916,
      "step": 25
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00024260163605714289,
      "loss": 1.1682,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00028442950434285713,
      "loss": 1.1543,
      "step": 35
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.296875,
      "learning_rate": 0.00029277547472836675,
      "loss": 1.1483,
      "step": 40
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00029269584843411034,
      "loss": 1.1248,
      "step": 45
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.26953125,
      "learning_rate": 0.000292555018304119,
      "loss": 1.1387,
      "step": 50
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00029235306291112414,
      "loss": 1.1138,
      "step": 55
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00029209009493120456,
      "loss": 1.1046,
      "step": 60
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 0.28125,
      "learning_rate": 0.0002917662610809221,
      "loss": 1.0948,
      "step": 65
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00029138174203546406,
      "loss": 1.0962,
      "step": 70
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.265625,
      "learning_rate": 0.0002909367523278405,
      "loss": 1.0982,
      "step": 75
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0002904315402291901,
      "loss": 1.0886,
      "step": 80
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 0.296875,
      "learning_rate": 0.0002898663876102642,
      "loss": 1.0848,
      "step": 85
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00028924160978416303,
      "loss": 1.0718,
      "step": 90
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00028855755533041517,
      "loss": 1.0656,
      "step": 95
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00028781460590049563,
      "loss": 1.0677,
      "step": 100
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0002870131760048931,
      "loss": 1.0704,
      "step": 105
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00028615371278184357,
      "loss": 1.0547,
      "step": 110
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0002852366957478608,
      "loss": 1.0609,
      "step": 115
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0002842626365302016,
      "loss": 1.0536,
      "step": 120
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.265625,
      "learning_rate": 0.000283232078581416,
      "loss": 1.0426,
      "step": 125
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00028214559687614115,
      "loss": 1.0412,
      "step": 130
    },
    {
      "epoch": 0.9642857142857143,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0002810037975903082,
      "loss": 1.0504,
      "step": 135
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0002798073177629413,
      "loss": 1.0451,
      "step": 140
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.010427474975586,
      "eval_runtime": 0.9944,
      "eval_samples_per_second": 25.14,
      "eval_steps_per_second": 25.14,
      "step": 140
    },
    {
      "epoch": 1.0357142857142858,
      "grad_norm": 0.357421875,
      "learning_rate": 0.00027855682494073726,
      "loss": 0.8536,
      "step": 145
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.294921875,
      "learning_rate": 0.0002772530168056241,
      "loss": 0.8519,
      "step": 150
    },
    {
      "epoch": 1.1071428571428572,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0002758966207855065,
      "loss": 0.8498,
      "step": 155
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00027448839364841533,
      "loss": 0.851,
      "step": 160
    },
    {
      "epoch": 1.1785714285714286,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00027302912108028696,
      "loss": 0.8525,
      "step": 165
    },
    {
      "epoch": 1.2142857142857142,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00027151961724660947,
      "loss": 0.8354,
      "step": 170
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0002699607243381787,
      "loss": 0.8455,
      "step": 175
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0002683533121012184,
      "loss": 0.8607,
      "step": 180
    },
    {
      "epoch": 1.3214285714285714,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0002666982773521266,
      "loss": 0.8555,
      "step": 185
    },
    {
      "epoch": 1.3571428571428572,
      "grad_norm": 0.283203125,
      "learning_rate": 0.000264996543477119,
      "loss": 0.8599,
      "step": 190
    },
    {
      "epoch": 1.3928571428571428,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0002632490599170478,
      "loss": 0.8665,
      "step": 195
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00026145680163768475,
      "loss": 0.858,
      "step": 200
    },
    {
      "epoch": 1.4642857142857144,
      "grad_norm": 0.265625,
      "learning_rate": 0.0002596207685857627,
      "loss": 0.8568,
      "step": 205
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00025774198513107944,
      "loss": 0.8664,
      "step": 210
    },
    {
      "epoch": 1.5357142857142856,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00025582149949497613,
      "loss": 0.8518,
      "step": 215
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00025386038316550736,
      "loss": 0.8535,
      "step": 220
    },
    {
      "epoch": 1.6071428571428572,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0002518597302996307,
      "loss": 0.8543,
      "step": 225
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0002498206571127487,
      "loss": 0.8629,
      "step": 230
    },
    {
      "epoch": 1.6785714285714286,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00024774430125594336,
      "loss": 0.8589,
      "step": 235
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0002456318211812522,
      "loss": 0.862,
      "step": 240
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00024348439549533736,
      "loss": 0.8568,
      "step": 245
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00024130322230191153,
      "loss": 0.8635,
      "step": 250
    },
    {
      "epoch": 1.8214285714285714,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00023908951853328497,
      "loss": 0.8514,
      "step": 255
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00023684451927140764,
      "loss": 0.86,
      "step": 260
    },
    {
      "epoch": 1.8928571428571428,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00023456947705878574,
      "loss": 0.8597,
      "step": 265
    },
    {
      "epoch": 1.9285714285714286,
      "grad_norm": 0.265625,
      "learning_rate": 0.0002322656611996561,
      "loss": 0.8569,
      "step": 270
    },
    {
      "epoch": 1.9642857142857144,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00022993435705180877,
      "loss": 0.8624,
      "step": 275
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00022757686530945314,
      "loss": 0.8536,
      "step": 280
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.9837232232093811,
      "eval_runtime": 0.8983,
      "eval_samples_per_second": 27.831,
      "eval_steps_per_second": 27.831,
      "step": 280
    }
  ],
  "logging_steps": 5,
  "max_steps": 700,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.55938468888576e+17,
  "train_batch_size": 100,
  "trial_name": null,
  "trial_params": null
}