gr15_open_pot_CKA / trainer_state.json
binhng's picture
Upload folder using huggingface_hub
54cd42c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 18.537590113285273,
"eval_steps": 500,
"global_step": 18000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010298661174047374,
"grad_norm": 28.08726692199707,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.0981,
"step": 10
},
{
"epoch": 0.02059732234809475,
"grad_norm": 16.46196746826172,
"learning_rate": 2.1111111111111114e-06,
"loss": 1.9619,
"step": 20
},
{
"epoch": 0.030895983522142123,
"grad_norm": 13.453218460083008,
"learning_rate": 3.2222222222222222e-06,
"loss": 1.5883,
"step": 30
},
{
"epoch": 0.0411946446961895,
"grad_norm": 3.0111494064331055,
"learning_rate": 4.333333333333334e-06,
"loss": 0.8443,
"step": 40
},
{
"epoch": 0.05149330587023687,
"grad_norm": 1.8473039865493774,
"learning_rate": 5.444444444444445e-06,
"loss": 0.4851,
"step": 50
},
{
"epoch": 0.061791967044284246,
"grad_norm": 1.983799695968628,
"learning_rate": 6.555555555555556e-06,
"loss": 0.4895,
"step": 60
},
{
"epoch": 0.07209062821833162,
"grad_norm": 1.359467625617981,
"learning_rate": 7.666666666666667e-06,
"loss": 0.3476,
"step": 70
},
{
"epoch": 0.082389289392379,
"grad_norm": 1.6559157371520996,
"learning_rate": 8.777777777777778e-06,
"loss": 0.3161,
"step": 80
},
{
"epoch": 0.09268795056642637,
"grad_norm": 1.4577065706253052,
"learning_rate": 9.888888888888889e-06,
"loss": 0.2894,
"step": 90
},
{
"epoch": 0.10298661174047374,
"grad_norm": 1.9685674905776978,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.2669,
"step": 100
},
{
"epoch": 0.11328527291452112,
"grad_norm": 1.0735788345336914,
"learning_rate": 1.2111111111111112e-05,
"loss": 0.2384,
"step": 110
},
{
"epoch": 0.12358393408856849,
"grad_norm": 2.065934419631958,
"learning_rate": 1.3222222222222221e-05,
"loss": 0.2315,
"step": 120
},
{
"epoch": 0.13388259526261587,
"grad_norm": 1.3160645961761475,
"learning_rate": 1.4333333333333334e-05,
"loss": 0.2212,
"step": 130
},
{
"epoch": 0.14418125643666324,
"grad_norm": 1.132812738418579,
"learning_rate": 1.5444444444444446e-05,
"loss": 0.2107,
"step": 140
},
{
"epoch": 0.15447991761071062,
"grad_norm": 0.8556684851646423,
"learning_rate": 1.655555555555556e-05,
"loss": 0.1983,
"step": 150
},
{
"epoch": 0.164778578784758,
"grad_norm": 1.1401009559631348,
"learning_rate": 1.7666666666666668e-05,
"loss": 0.1821,
"step": 160
},
{
"epoch": 0.17507723995880536,
"grad_norm": 0.9898369312286377,
"learning_rate": 1.8777777777777777e-05,
"loss": 0.1725,
"step": 170
},
{
"epoch": 0.18537590113285274,
"grad_norm": 1.2845979928970337,
"learning_rate": 1.988888888888889e-05,
"loss": 0.1798,
"step": 180
},
{
"epoch": 0.1956745623069001,
"grad_norm": 0.7349956631660461,
"learning_rate": 2.1e-05,
"loss": 0.1553,
"step": 190
},
{
"epoch": 0.2059732234809475,
"grad_norm": 1.0893903970718384,
"learning_rate": 2.211111111111111e-05,
"loss": 0.161,
"step": 200
},
{
"epoch": 0.21627188465499486,
"grad_norm": 1.4773167371749878,
"learning_rate": 2.3222222222222224e-05,
"loss": 0.1687,
"step": 210
},
{
"epoch": 0.22657054582904224,
"grad_norm": 0.7343375086784363,
"learning_rate": 2.4333333333333336e-05,
"loss": 0.1541,
"step": 220
},
{
"epoch": 0.2368692070030896,
"grad_norm": 1.459641456604004,
"learning_rate": 2.5444444444444442e-05,
"loss": 0.1546,
"step": 230
},
{
"epoch": 0.24716786817713698,
"grad_norm": 1.007576823234558,
"learning_rate": 2.6555555555555555e-05,
"loss": 0.1397,
"step": 240
},
{
"epoch": 0.25746652935118436,
"grad_norm": 0.7707590460777283,
"learning_rate": 2.7666666666666667e-05,
"loss": 0.1395,
"step": 250
},
{
"epoch": 0.26776519052523173,
"grad_norm": 0.8418192863464355,
"learning_rate": 2.877777777777778e-05,
"loss": 0.1367,
"step": 260
},
{
"epoch": 0.2780638516992791,
"grad_norm": 1.433361291885376,
"learning_rate": 2.988888888888889e-05,
"loss": 0.1443,
"step": 270
},
{
"epoch": 0.2883625128733265,
"grad_norm": 1.6851385831832886,
"learning_rate": 3.1e-05,
"loss": 0.1412,
"step": 280
},
{
"epoch": 0.29866117404737386,
"grad_norm": 1.0967495441436768,
"learning_rate": 3.2111111111111114e-05,
"loss": 0.1465,
"step": 290
},
{
"epoch": 0.30895983522142123,
"grad_norm": 0.9680765867233276,
"learning_rate": 3.322222222222222e-05,
"loss": 0.1409,
"step": 300
},
{
"epoch": 0.3192584963954686,
"grad_norm": 0.8024266362190247,
"learning_rate": 3.433333333333333e-05,
"loss": 0.151,
"step": 310
},
{
"epoch": 0.329557157569516,
"grad_norm": 1.2099324464797974,
"learning_rate": 3.5444444444444445e-05,
"loss": 0.1276,
"step": 320
},
{
"epoch": 0.33985581874356335,
"grad_norm": 1.553401231765747,
"learning_rate": 3.655555555555556e-05,
"loss": 0.1407,
"step": 330
},
{
"epoch": 0.35015447991761073,
"grad_norm": 0.9965718388557434,
"learning_rate": 3.766666666666667e-05,
"loss": 0.1193,
"step": 340
},
{
"epoch": 0.3604531410916581,
"grad_norm": 1.0881636142730713,
"learning_rate": 3.877777777777778e-05,
"loss": 0.1161,
"step": 350
},
{
"epoch": 0.3707518022657055,
"grad_norm": 0.7971917986869812,
"learning_rate": 3.9888888888888895e-05,
"loss": 0.1153,
"step": 360
},
{
"epoch": 0.38105046343975285,
"grad_norm": 0.6419103741645813,
"learning_rate": 4.1e-05,
"loss": 0.1268,
"step": 370
},
{
"epoch": 0.3913491246138002,
"grad_norm": 0.8467381596565247,
"learning_rate": 4.211111111111111e-05,
"loss": 0.1089,
"step": 380
},
{
"epoch": 0.4016477857878476,
"grad_norm": 0.7437835335731506,
"learning_rate": 4.3222222222222226e-05,
"loss": 0.1196,
"step": 390
},
{
"epoch": 0.411946446961895,
"grad_norm": 1.1879000663757324,
"learning_rate": 4.433333333333334e-05,
"loss": 0.1104,
"step": 400
},
{
"epoch": 0.42224510813594235,
"grad_norm": 1.103964924812317,
"learning_rate": 4.5444444444444444e-05,
"loss": 0.1154,
"step": 410
},
{
"epoch": 0.4325437693099897,
"grad_norm": 1.20859956741333,
"learning_rate": 4.6555555555555556e-05,
"loss": 0.1151,
"step": 420
},
{
"epoch": 0.4428424304840371,
"grad_norm": 1.3592861890792847,
"learning_rate": 4.766666666666667e-05,
"loss": 0.1221,
"step": 430
},
{
"epoch": 0.45314109165808447,
"grad_norm": 0.7694193720817566,
"learning_rate": 4.8777777777777775e-05,
"loss": 0.1081,
"step": 440
},
{
"epoch": 0.46343975283213185,
"grad_norm": 0.8526501655578613,
"learning_rate": 4.9888888888888894e-05,
"loss": 0.1071,
"step": 450
},
{
"epoch": 0.4737384140061792,
"grad_norm": 0.8666425943374634,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.1125,
"step": 460
},
{
"epoch": 0.4840370751802266,
"grad_norm": 1.0404722690582275,
"learning_rate": 5.211111111111111e-05,
"loss": 0.1235,
"step": 470
},
{
"epoch": 0.49433573635427397,
"grad_norm": 0.8314346671104431,
"learning_rate": 5.322222222222223e-05,
"loss": 0.1156,
"step": 480
},
{
"epoch": 0.5046343975283213,
"grad_norm": 0.8053165674209595,
"learning_rate": 5.433333333333334e-05,
"loss": 0.0963,
"step": 490
},
{
"epoch": 0.5149330587023687,
"grad_norm": 0.9703218340873718,
"learning_rate": 5.544444444444444e-05,
"loss": 0.1081,
"step": 500
},
{
"epoch": 0.525231719876416,
"grad_norm": 1.0357967615127563,
"learning_rate": 5.655555555555556e-05,
"loss": 0.1053,
"step": 510
},
{
"epoch": 0.5355303810504635,
"grad_norm": 0.6202366948127747,
"learning_rate": 5.766666666666667e-05,
"loss": 0.1161,
"step": 520
},
{
"epoch": 0.5458290422245108,
"grad_norm": 0.9413891434669495,
"learning_rate": 5.8777777777777774e-05,
"loss": 0.1109,
"step": 530
},
{
"epoch": 0.5561277033985582,
"grad_norm": 0.9725326299667358,
"learning_rate": 5.988888888888889e-05,
"loss": 0.1087,
"step": 540
},
{
"epoch": 0.5664263645726055,
"grad_norm": 1.1372697353363037,
"learning_rate": 6.1e-05,
"loss": 0.0934,
"step": 550
},
{
"epoch": 0.576725025746653,
"grad_norm": 0.9730582237243652,
"learning_rate": 6.21111111111111e-05,
"loss": 0.089,
"step": 560
},
{
"epoch": 0.5870236869207003,
"grad_norm": 1.031986117362976,
"learning_rate": 6.322222222222223e-05,
"loss": 0.0921,
"step": 570
},
{
"epoch": 0.5973223480947477,
"grad_norm": 0.9803087115287781,
"learning_rate": 6.433333333333333e-05,
"loss": 0.109,
"step": 580
},
{
"epoch": 0.607621009268795,
"grad_norm": 1.2565224170684814,
"learning_rate": 6.544444444444446e-05,
"loss": 0.1075,
"step": 590
},
{
"epoch": 0.6179196704428425,
"grad_norm": 0.6035177707672119,
"learning_rate": 6.655555555555555e-05,
"loss": 0.1069,
"step": 600
},
{
"epoch": 0.6282183316168898,
"grad_norm": 0.6485044360160828,
"learning_rate": 6.766666666666667e-05,
"loss": 0.1041,
"step": 610
},
{
"epoch": 0.6385169927909372,
"grad_norm": 0.9063082337379456,
"learning_rate": 6.877777777777778e-05,
"loss": 0.087,
"step": 620
},
{
"epoch": 0.6488156539649845,
"grad_norm": 0.7508301734924316,
"learning_rate": 6.988888888888889e-05,
"loss": 0.0993,
"step": 630
},
{
"epoch": 0.659114315139032,
"grad_norm": 0.7371131777763367,
"learning_rate": 7.1e-05,
"loss": 0.0965,
"step": 640
},
{
"epoch": 0.6694129763130793,
"grad_norm": 0.9033893942832947,
"learning_rate": 7.211111111111112e-05,
"loss": 0.0927,
"step": 650
},
{
"epoch": 0.6797116374871267,
"grad_norm": 1.0828319787979126,
"learning_rate": 7.322222222222223e-05,
"loss": 0.1039,
"step": 660
},
{
"epoch": 0.690010298661174,
"grad_norm": 0.7973754405975342,
"learning_rate": 7.433333333333333e-05,
"loss": 0.0942,
"step": 670
},
{
"epoch": 0.7003089598352215,
"grad_norm": 0.9999275803565979,
"learning_rate": 7.544444444444445e-05,
"loss": 0.0938,
"step": 680
},
{
"epoch": 0.7106076210092688,
"grad_norm": 0.7432506680488586,
"learning_rate": 7.655555555555555e-05,
"loss": 0.0822,
"step": 690
},
{
"epoch": 0.7209062821833162,
"grad_norm": 0.7960357069969177,
"learning_rate": 7.766666666666667e-05,
"loss": 0.0885,
"step": 700
},
{
"epoch": 0.7312049433573635,
"grad_norm": 0.6295223236083984,
"learning_rate": 7.877777777777778e-05,
"loss": 0.0984,
"step": 710
},
{
"epoch": 0.741503604531411,
"grad_norm": 0.6425987482070923,
"learning_rate": 7.988888888888889e-05,
"loss": 0.0851,
"step": 720
},
{
"epoch": 0.7518022657054583,
"grad_norm": 0.7241719961166382,
"learning_rate": 8.1e-05,
"loss": 0.0818,
"step": 730
},
{
"epoch": 0.7621009268795057,
"grad_norm": 0.6875414252281189,
"learning_rate": 8.211111111111112e-05,
"loss": 0.0776,
"step": 740
},
{
"epoch": 0.772399588053553,
"grad_norm": 0.7593461275100708,
"learning_rate": 8.322222222222223e-05,
"loss": 0.0862,
"step": 750
},
{
"epoch": 0.7826982492276005,
"grad_norm": 1.1254090070724487,
"learning_rate": 8.433333333333334e-05,
"loss": 0.0831,
"step": 760
},
{
"epoch": 0.7929969104016478,
"grad_norm": 0.6563543677330017,
"learning_rate": 8.544444444444445e-05,
"loss": 0.0756,
"step": 770
},
{
"epoch": 0.8032955715756952,
"grad_norm": 0.500499963760376,
"learning_rate": 8.655555555555555e-05,
"loss": 0.09,
"step": 780
},
{
"epoch": 0.8135942327497425,
"grad_norm": 0.6962169408798218,
"learning_rate": 8.766666666666668e-05,
"loss": 0.0913,
"step": 790
},
{
"epoch": 0.82389289392379,
"grad_norm": 0.8879425525665283,
"learning_rate": 8.877777777777778e-05,
"loss": 0.094,
"step": 800
},
{
"epoch": 0.8341915550978373,
"grad_norm": 0.7109111547470093,
"learning_rate": 8.988888888888889e-05,
"loss": 0.0899,
"step": 810
},
{
"epoch": 0.8444902162718847,
"grad_norm": 0.6895614266395569,
"learning_rate": 9.1e-05,
"loss": 0.0899,
"step": 820
},
{
"epoch": 0.854788877445932,
"grad_norm": 0.5885145664215088,
"learning_rate": 9.211111111111112e-05,
"loss": 0.0894,
"step": 830
},
{
"epoch": 0.8650875386199794,
"grad_norm": 0.6228615641593933,
"learning_rate": 9.322222222222223e-05,
"loss": 0.0826,
"step": 840
},
{
"epoch": 0.8753861997940268,
"grad_norm": 0.6920461654663086,
"learning_rate": 9.433333333333334e-05,
"loss": 0.0926,
"step": 850
},
{
"epoch": 0.8856848609680742,
"grad_norm": 0.8142651319503784,
"learning_rate": 9.544444444444445e-05,
"loss": 0.0769,
"step": 860
},
{
"epoch": 0.8959835221421215,
"grad_norm": 0.8525772094726562,
"learning_rate": 9.655555555555555e-05,
"loss": 0.0775,
"step": 870
},
{
"epoch": 0.9062821833161689,
"grad_norm": 0.6274034976959229,
"learning_rate": 9.766666666666668e-05,
"loss": 0.0793,
"step": 880
},
{
"epoch": 0.9165808444902163,
"grad_norm": 0.7031662464141846,
"learning_rate": 9.877777777777778e-05,
"loss": 0.081,
"step": 890
},
{
"epoch": 0.9268795056642637,
"grad_norm": 0.542312741279602,
"learning_rate": 9.98888888888889e-05,
"loss": 0.0878,
"step": 900
},
{
"epoch": 0.937178166838311,
"grad_norm": 0.5504183173179626,
"learning_rate": 9.999993165095463e-05,
"loss": 0.0711,
"step": 910
},
{
"epoch": 0.9474768280123584,
"grad_norm": 0.6083622574806213,
"learning_rate": 9.999969538288952e-05,
"loss": 0.0774,
"step": 920
},
{
"epoch": 0.9577754891864058,
"grad_norm": 0.7640944123268127,
"learning_rate": 9.999929035278659e-05,
"loss": 0.0711,
"step": 930
},
{
"epoch": 0.9680741503604532,
"grad_norm": 0.34581655263900757,
"learning_rate": 9.999871656201292e-05,
"loss": 0.0716,
"step": 940
},
{
"epoch": 0.9783728115345005,
"grad_norm": 0.6435947418212891,
"learning_rate": 9.999797401250521e-05,
"loss": 0.0833,
"step": 950
},
{
"epoch": 0.9886714727085479,
"grad_norm": 0.6153683662414551,
"learning_rate": 9.999706270676973e-05,
"loss": 0.0683,
"step": 960
},
{
"epoch": 0.9989701338825953,
"grad_norm": 0.5145250558853149,
"learning_rate": 9.999598264788241e-05,
"loss": 0.0679,
"step": 970
},
{
"epoch": 1.0092687950566426,
"grad_norm": 0.5474639534950256,
"learning_rate": 9.999473383948872e-05,
"loss": 0.0652,
"step": 980
},
{
"epoch": 1.01956745623069,
"grad_norm": 0.4673866331577301,
"learning_rate": 9.99933162858037e-05,
"loss": 0.0806,
"step": 990
},
{
"epoch": 1.0298661174047374,
"grad_norm": 0.500733494758606,
"learning_rate": 9.999172999161198e-05,
"loss": 0.0746,
"step": 1000
},
{
"epoch": 1.0401647785787849,
"grad_norm": 0.6277178525924683,
"learning_rate": 9.998997496226772e-05,
"loss": 0.0691,
"step": 1010
},
{
"epoch": 1.050463439752832,
"grad_norm": 0.34232184290885925,
"learning_rate": 9.998805120369458e-05,
"loss": 0.069,
"step": 1020
},
{
"epoch": 1.0607621009268795,
"grad_norm": 0.6583809852600098,
"learning_rate": 9.998595872238577e-05,
"loss": 0.0646,
"step": 1030
},
{
"epoch": 1.071060762100927,
"grad_norm": 0.5400450825691223,
"learning_rate": 9.998369752540395e-05,
"loss": 0.0709,
"step": 1040
},
{
"epoch": 1.0813594232749741,
"grad_norm": 0.716460645198822,
"learning_rate": 9.998126762038126e-05,
"loss": 0.0659,
"step": 1050
},
{
"epoch": 1.0916580844490216,
"grad_norm": 0.7969040274620056,
"learning_rate": 9.997866901551926e-05,
"loss": 0.0834,
"step": 1060
},
{
"epoch": 1.101956745623069,
"grad_norm": 0.6805360317230225,
"learning_rate": 9.997590171958892e-05,
"loss": 0.0661,
"step": 1070
},
{
"epoch": 1.1122554067971164,
"grad_norm": 0.6645709872245789,
"learning_rate": 9.997296574193058e-05,
"loss": 0.0719,
"step": 1080
},
{
"epoch": 1.1225540679711639,
"grad_norm": 0.9983972311019897,
"learning_rate": 9.996986109245395e-05,
"loss": 0.063,
"step": 1090
},
{
"epoch": 1.132852729145211,
"grad_norm": 0.47811999917030334,
"learning_rate": 9.996658778163802e-05,
"loss": 0.0812,
"step": 1100
},
{
"epoch": 1.1431513903192585,
"grad_norm": 0.9598459601402283,
"learning_rate": 9.996314582053106e-05,
"loss": 0.0797,
"step": 1110
},
{
"epoch": 1.153450051493306,
"grad_norm": 0.8147891759872437,
"learning_rate": 9.995953522075061e-05,
"loss": 0.076,
"step": 1120
},
{
"epoch": 1.1637487126673531,
"grad_norm": 0.36551281809806824,
"learning_rate": 9.995575599448336e-05,
"loss": 0.0689,
"step": 1130
},
{
"epoch": 1.1740473738414006,
"grad_norm": 0.41024380922317505,
"learning_rate": 9.995180815448523e-05,
"loss": 0.091,
"step": 1140
},
{
"epoch": 1.184346035015448,
"grad_norm": 0.5559478998184204,
"learning_rate": 9.994769171408118e-05,
"loss": 0.0783,
"step": 1150
},
{
"epoch": 1.1946446961894954,
"grad_norm": 0.39498281478881836,
"learning_rate": 9.994340668716527e-05,
"loss": 0.0655,
"step": 1160
},
{
"epoch": 1.2049433573635429,
"grad_norm": 0.7332147359848022,
"learning_rate": 9.993895308820058e-05,
"loss": 0.0739,
"step": 1170
},
{
"epoch": 1.21524201853759,
"grad_norm": 0.5935864448547363,
"learning_rate": 9.99343309322192e-05,
"loss": 0.0651,
"step": 1180
},
{
"epoch": 1.2255406797116375,
"grad_norm": 0.5222606658935547,
"learning_rate": 9.99295402348221e-05,
"loss": 0.0676,
"step": 1190
},
{
"epoch": 1.235839340885685,
"grad_norm": 0.5474528670310974,
"learning_rate": 9.992458101217912e-05,
"loss": 0.0775,
"step": 1200
},
{
"epoch": 1.2461380020597321,
"grad_norm": 0.7393515110015869,
"learning_rate": 9.991945328102897e-05,
"loss": 0.0679,
"step": 1210
},
{
"epoch": 1.2564366632337796,
"grad_norm": 0.48135286569595337,
"learning_rate": 9.991415705867903e-05,
"loss": 0.0627,
"step": 1220
},
{
"epoch": 1.266735324407827,
"grad_norm": 0.40880492329597473,
"learning_rate": 9.990869236300546e-05,
"loss": 0.0621,
"step": 1230
},
{
"epoch": 1.2770339855818744,
"grad_norm": 0.4522377550601959,
"learning_rate": 9.990305921245306e-05,
"loss": 0.0629,
"step": 1240
},
{
"epoch": 1.2873326467559219,
"grad_norm": 0.5431732535362244,
"learning_rate": 9.989725762603515e-05,
"loss": 0.0711,
"step": 1250
},
{
"epoch": 1.297631307929969,
"grad_norm": 0.4390816390514374,
"learning_rate": 9.989128762333362e-05,
"loss": 0.058,
"step": 1260
},
{
"epoch": 1.3079299691040165,
"grad_norm": 0.5823209881782532,
"learning_rate": 9.988514922449879e-05,
"loss": 0.0742,
"step": 1270
},
{
"epoch": 1.318228630278064,
"grad_norm": 0.6167677044868469,
"learning_rate": 9.987884245024934e-05,
"loss": 0.0698,
"step": 1280
},
{
"epoch": 1.3285272914521111,
"grad_norm": 0.470501184463501,
"learning_rate": 9.98723673218723e-05,
"loss": 0.0669,
"step": 1290
},
{
"epoch": 1.3388259526261586,
"grad_norm": 0.3435496985912323,
"learning_rate": 9.986572386122291e-05,
"loss": 0.0655,
"step": 1300
},
{
"epoch": 1.349124613800206,
"grad_norm": 0.5990545749664307,
"learning_rate": 9.98589120907246e-05,
"loss": 0.0653,
"step": 1310
},
{
"epoch": 1.3594232749742534,
"grad_norm": 0.7209518551826477,
"learning_rate": 9.985193203336886e-05,
"loss": 0.0654,
"step": 1320
},
{
"epoch": 1.3697219361483008,
"grad_norm": 0.6588581800460815,
"learning_rate": 9.984478371271521e-05,
"loss": 0.066,
"step": 1330
},
{
"epoch": 1.380020597322348,
"grad_norm": 0.5437431931495667,
"learning_rate": 9.98374671528911e-05,
"loss": 0.0685,
"step": 1340
},
{
"epoch": 1.3903192584963955,
"grad_norm": 0.4081268012523651,
"learning_rate": 9.982998237859184e-05,
"loss": 0.0649,
"step": 1350
},
{
"epoch": 1.400617919670443,
"grad_norm": 0.5363196134567261,
"learning_rate": 9.98223294150805e-05,
"loss": 0.0614,
"step": 1360
},
{
"epoch": 1.4109165808444901,
"grad_norm": 0.5327999591827393,
"learning_rate": 9.981450828818783e-05,
"loss": 0.058,
"step": 1370
},
{
"epoch": 1.4212152420185376,
"grad_norm": 0.39524152874946594,
"learning_rate": 9.980651902431216e-05,
"loss": 0.0606,
"step": 1380
},
{
"epoch": 1.431513903192585,
"grad_norm": 0.5942156910896301,
"learning_rate": 9.979836165041936e-05,
"loss": 0.0589,
"step": 1390
},
{
"epoch": 1.4418125643666324,
"grad_norm": 0.6506125330924988,
"learning_rate": 9.97900361940427e-05,
"loss": 0.0618,
"step": 1400
},
{
"epoch": 1.4521112255406798,
"grad_norm": 0.43637052178382874,
"learning_rate": 9.978154268328276e-05,
"loss": 0.0728,
"step": 1410
},
{
"epoch": 1.462409886714727,
"grad_norm": 0.5816675424575806,
"learning_rate": 9.977288114680737e-05,
"loss": 0.0738,
"step": 1420
},
{
"epoch": 1.4727085478887745,
"grad_norm": 0.3983500301837921,
"learning_rate": 9.976405161385147e-05,
"loss": 0.0674,
"step": 1430
},
{
"epoch": 1.483007209062822,
"grad_norm": 0.41254571080207825,
"learning_rate": 9.975505411421704e-05,
"loss": 0.066,
"step": 1440
},
{
"epoch": 1.4933058702368691,
"grad_norm": 0.4647277593612671,
"learning_rate": 9.974588867827301e-05,
"loss": 0.0646,
"step": 1450
},
{
"epoch": 1.5036045314109165,
"grad_norm": 0.4378807544708252,
"learning_rate": 9.97365553369551e-05,
"loss": 0.0589,
"step": 1460
},
{
"epoch": 1.513903192584964,
"grad_norm": 0.6178969144821167,
"learning_rate": 9.972705412176577e-05,
"loss": 0.0621,
"step": 1470
},
{
"epoch": 1.5242018537590112,
"grad_norm": 0.5825141072273254,
"learning_rate": 9.971738506477414e-05,
"loss": 0.0644,
"step": 1480
},
{
"epoch": 1.5345005149330588,
"grad_norm": 0.5849868655204773,
"learning_rate": 9.970754819861577e-05,
"loss": 0.0669,
"step": 1490
},
{
"epoch": 1.544799176107106,
"grad_norm": 0.5067623853683472,
"learning_rate": 9.969754355649268e-05,
"loss": 0.071,
"step": 1500
},
{
"epoch": 1.5550978372811535,
"grad_norm": 0.5842755436897278,
"learning_rate": 9.968737117217313e-05,
"loss": 0.0713,
"step": 1510
},
{
"epoch": 1.565396498455201,
"grad_norm": 0.3868110179901123,
"learning_rate": 9.967703107999158e-05,
"loss": 0.0635,
"step": 1520
},
{
"epoch": 1.575695159629248,
"grad_norm": 0.4535583257675171,
"learning_rate": 9.966652331484853e-05,
"loss": 0.0587,
"step": 1530
},
{
"epoch": 1.5859938208032955,
"grad_norm": 0.38644909858703613,
"learning_rate": 9.965584791221048e-05,
"loss": 0.0708,
"step": 1540
},
{
"epoch": 1.596292481977343,
"grad_norm": 0.460753858089447,
"learning_rate": 9.964500490810966e-05,
"loss": 0.0645,
"step": 1550
},
{
"epoch": 1.6065911431513902,
"grad_norm": 0.5585173964500427,
"learning_rate": 9.963399433914405e-05,
"loss": 0.0587,
"step": 1560
},
{
"epoch": 1.6168898043254378,
"grad_norm": 0.6196934580802917,
"learning_rate": 9.962281624247722e-05,
"loss": 0.0663,
"step": 1570
},
{
"epoch": 1.627188465499485,
"grad_norm": 0.440153568983078,
"learning_rate": 9.961147065583813e-05,
"loss": 0.0568,
"step": 1580
},
{
"epoch": 1.6374871266735325,
"grad_norm": 0.49740493297576904,
"learning_rate": 9.959995761752112e-05,
"loss": 0.0616,
"step": 1590
},
{
"epoch": 1.64778578784758,
"grad_norm": 0.7940653562545776,
"learning_rate": 9.958827716638572e-05,
"loss": 0.0656,
"step": 1600
},
{
"epoch": 1.658084449021627,
"grad_norm": 0.39363256096839905,
"learning_rate": 9.957642934185648e-05,
"loss": 0.059,
"step": 1610
},
{
"epoch": 1.6683831101956745,
"grad_norm": 0.5798192620277405,
"learning_rate": 9.95644141839229e-05,
"loss": 0.057,
"step": 1620
},
{
"epoch": 1.678681771369722,
"grad_norm": 0.43519875407218933,
"learning_rate": 9.955223173313931e-05,
"loss": 0.0547,
"step": 1630
},
{
"epoch": 1.6889804325437692,
"grad_norm": 0.5713900327682495,
"learning_rate": 9.953988203062463e-05,
"loss": 0.0655,
"step": 1640
},
{
"epoch": 1.6992790937178168,
"grad_norm": 0.8694477677345276,
"learning_rate": 9.952736511806236e-05,
"loss": 0.0793,
"step": 1650
},
{
"epoch": 1.709577754891864,
"grad_norm": 0.344855397939682,
"learning_rate": 9.951468103770032e-05,
"loss": 0.0654,
"step": 1660
},
{
"epoch": 1.7198764160659115,
"grad_norm": 0.747203528881073,
"learning_rate": 9.950182983235063e-05,
"loss": 0.0694,
"step": 1670
},
{
"epoch": 1.730175077239959,
"grad_norm": 0.44555550813674927,
"learning_rate": 9.948881154538945e-05,
"loss": 0.0729,
"step": 1680
},
{
"epoch": 1.740473738414006,
"grad_norm": 0.4354792535305023,
"learning_rate": 9.94756262207569e-05,
"loss": 0.0739,
"step": 1690
},
{
"epoch": 1.7507723995880535,
"grad_norm": 0.4117138683795929,
"learning_rate": 9.946227390295689e-05,
"loss": 0.0648,
"step": 1700
},
{
"epoch": 1.761071060762101,
"grad_norm": 0.5352147221565247,
"learning_rate": 9.9448754637057e-05,
"loss": 0.0614,
"step": 1710
},
{
"epoch": 1.7713697219361482,
"grad_norm": 0.3937685787677765,
"learning_rate": 9.943506846868826e-05,
"loss": 0.0668,
"step": 1720
},
{
"epoch": 1.7816683831101958,
"grad_norm": 0.510313868522644,
"learning_rate": 9.942121544404509e-05,
"loss": 0.0564,
"step": 1730
},
{
"epoch": 1.791967044284243,
"grad_norm": 0.43196746706962585,
"learning_rate": 9.940719560988505e-05,
"loss": 0.0515,
"step": 1740
},
{
"epoch": 1.8022657054582905,
"grad_norm": 0.4649578928947449,
"learning_rate": 9.939300901352876e-05,
"loss": 0.0681,
"step": 1750
},
{
"epoch": 1.8125643666323379,
"grad_norm": 0.6281247735023499,
"learning_rate": 9.937865570285967e-05,
"loss": 0.0721,
"step": 1760
},
{
"epoch": 1.822863027806385,
"grad_norm": 0.6799906492233276,
"learning_rate": 9.936413572632397e-05,
"loss": 0.0565,
"step": 1770
},
{
"epoch": 1.8331616889804325,
"grad_norm": 0.4169757068157196,
"learning_rate": 9.934944913293038e-05,
"loss": 0.0626,
"step": 1780
},
{
"epoch": 1.84346035015448,
"grad_norm": 0.42282024025917053,
"learning_rate": 9.933459597224997e-05,
"loss": 0.0654,
"step": 1790
},
{
"epoch": 1.8537590113285272,
"grad_norm": 0.34127193689346313,
"learning_rate": 9.931957629441607e-05,
"loss": 0.0572,
"step": 1800
},
{
"epoch": 1.8640576725025748,
"grad_norm": 0.3683079183101654,
"learning_rate": 9.930439015012396e-05,
"loss": 0.0621,
"step": 1810
},
{
"epoch": 1.874356333676622,
"grad_norm": 0.5137266516685486,
"learning_rate": 9.92890375906309e-05,
"loss": 0.0554,
"step": 1820
},
{
"epoch": 1.8846549948506695,
"grad_norm": 0.4121856391429901,
"learning_rate": 9.927351866775578e-05,
"loss": 0.0631,
"step": 1830
},
{
"epoch": 1.8949536560247169,
"grad_norm": 0.5225406289100647,
"learning_rate": 9.925783343387903e-05,
"loss": 0.0557,
"step": 1840
},
{
"epoch": 1.905252317198764,
"grad_norm": 0.3983275294303894,
"learning_rate": 9.924198194194237e-05,
"loss": 0.0631,
"step": 1850
},
{
"epoch": 1.9155509783728115,
"grad_norm": 0.49256351590156555,
"learning_rate": 9.922596424544876e-05,
"loss": 0.0661,
"step": 1860
},
{
"epoch": 1.925849639546859,
"grad_norm": 0.5363610982894897,
"learning_rate": 9.92097803984621e-05,
"loss": 0.0706,
"step": 1870
},
{
"epoch": 1.9361483007209062,
"grad_norm": 0.4455360472202301,
"learning_rate": 9.919343045560712e-05,
"loss": 0.0698,
"step": 1880
},
{
"epoch": 1.9464469618949538,
"grad_norm": 0.5394087433815002,
"learning_rate": 9.917691447206913e-05,
"loss": 0.0616,
"step": 1890
},
{
"epoch": 1.956745623069001,
"grad_norm": 0.3595924377441406,
"learning_rate": 9.91602325035939e-05,
"loss": 0.067,
"step": 1900
},
{
"epoch": 1.9670442842430484,
"grad_norm": 0.2918682396411896,
"learning_rate": 9.914338460648743e-05,
"loss": 0.0732,
"step": 1910
},
{
"epoch": 1.9773429454170959,
"grad_norm": 0.41418296098709106,
"learning_rate": 9.912637083761578e-05,
"loss": 0.0635,
"step": 1920
},
{
"epoch": 1.987641606591143,
"grad_norm": 0.5165850520133972,
"learning_rate": 9.910919125440485e-05,
"loss": 0.069,
"step": 1930
},
{
"epoch": 1.9979402677651905,
"grad_norm": 0.3793902099132538,
"learning_rate": 9.909184591484027e-05,
"loss": 0.0717,
"step": 1940
},
{
"epoch": 2.008238928939238,
"grad_norm": 0.6616620421409607,
"learning_rate": 9.907433487746702e-05,
"loss": 0.0586,
"step": 1950
},
{
"epoch": 2.018537590113285,
"grad_norm": 0.5687305331230164,
"learning_rate": 9.905665820138949e-05,
"loss": 0.0569,
"step": 1960
},
{
"epoch": 2.028836251287333,
"grad_norm": 0.49890944361686707,
"learning_rate": 9.903881594627105e-05,
"loss": 0.0668,
"step": 1970
},
{
"epoch": 2.03913491246138,
"grad_norm": 0.5814046859741211,
"learning_rate": 9.902080817233398e-05,
"loss": 0.0644,
"step": 1980
},
{
"epoch": 2.049433573635427,
"grad_norm": 0.32920873165130615,
"learning_rate": 9.900263494035921e-05,
"loss": 0.0611,
"step": 1990
},
{
"epoch": 2.059732234809475,
"grad_norm": 0.5075499415397644,
"learning_rate": 9.898429631168619e-05,
"loss": 0.0586,
"step": 2000
},
{
"epoch": 2.070030895983522,
"grad_norm": 0.4823492169380188,
"learning_rate": 9.896579234821253e-05,
"loss": 0.0468,
"step": 2010
},
{
"epoch": 2.0803295571575697,
"grad_norm": 0.5481283068656921,
"learning_rate": 9.894712311239398e-05,
"loss": 0.0611,
"step": 2020
},
{
"epoch": 2.090628218331617,
"grad_norm": 0.4776170551776886,
"learning_rate": 9.892828866724406e-05,
"loss": 0.0657,
"step": 2030
},
{
"epoch": 2.100926879505664,
"grad_norm": 0.5601367354393005,
"learning_rate": 9.8909289076334e-05,
"loss": 0.0665,
"step": 2040
},
{
"epoch": 2.111225540679712,
"grad_norm": 0.3499130308628082,
"learning_rate": 9.88901244037923e-05,
"loss": 0.0563,
"step": 2050
},
{
"epoch": 2.121524201853759,
"grad_norm": 0.4545436501502991,
"learning_rate": 9.88707947143048e-05,
"loss": 0.0557,
"step": 2060
},
{
"epoch": 2.131822863027806,
"grad_norm": 0.46852630376815796,
"learning_rate": 9.885130007311423e-05,
"loss": 0.0522,
"step": 2070
},
{
"epoch": 2.142121524201854,
"grad_norm": 0.308856338262558,
"learning_rate": 9.883164054602012e-05,
"loss": 0.058,
"step": 2080
},
{
"epoch": 2.152420185375901,
"grad_norm": 0.7965716123580933,
"learning_rate": 9.881181619937848e-05,
"loss": 0.0535,
"step": 2090
},
{
"epoch": 2.1627188465499483,
"grad_norm": 0.3949962556362152,
"learning_rate": 9.879182710010169e-05,
"loss": 0.0536,
"step": 2100
},
{
"epoch": 2.173017507723996,
"grad_norm": 0.40669289231300354,
"learning_rate": 9.877167331565816e-05,
"loss": 0.0598,
"step": 2110
},
{
"epoch": 2.183316168898043,
"grad_norm": 0.6267198324203491,
"learning_rate": 9.875135491407217e-05,
"loss": 0.0647,
"step": 2120
},
{
"epoch": 2.193614830072091,
"grad_norm": 0.3919011950492859,
"learning_rate": 9.873087196392368e-05,
"loss": 0.063,
"step": 2130
},
{
"epoch": 2.203913491246138,
"grad_norm": 0.3769017457962036,
"learning_rate": 9.871022453434798e-05,
"loss": 0.0558,
"step": 2140
},
{
"epoch": 2.214212152420185,
"grad_norm": 0.382344126701355,
"learning_rate": 9.868941269503551e-05,
"loss": 0.0615,
"step": 2150
},
{
"epoch": 2.224510813594233,
"grad_norm": 0.7266145348548889,
"learning_rate": 9.86684365162317e-05,
"loss": 0.0611,
"step": 2160
},
{
"epoch": 2.23480947476828,
"grad_norm": 0.5791377425193787,
"learning_rate": 9.864729606873663e-05,
"loss": 0.0575,
"step": 2170
},
{
"epoch": 2.2451081359423277,
"grad_norm": 0.40031886100769043,
"learning_rate": 9.862599142390482e-05,
"loss": 0.0559,
"step": 2180
},
{
"epoch": 2.255406797116375,
"grad_norm": 0.34372609853744507,
"learning_rate": 9.860452265364502e-05,
"loss": 0.0623,
"step": 2190
},
{
"epoch": 2.265705458290422,
"grad_norm": 0.5310713052749634,
"learning_rate": 9.858288983041996e-05,
"loss": 0.0628,
"step": 2200
},
{
"epoch": 2.27600411946447,
"grad_norm": 0.4002261459827423,
"learning_rate": 9.856109302724603e-05,
"loss": 0.0528,
"step": 2210
},
{
"epoch": 2.286302780638517,
"grad_norm": 0.3995415270328522,
"learning_rate": 9.853913231769318e-05,
"loss": 0.0603,
"step": 2220
},
{
"epoch": 2.296601441812564,
"grad_norm": 0.5082608461380005,
"learning_rate": 9.851700777588453e-05,
"loss": 0.0555,
"step": 2230
},
{
"epoch": 2.306900102986612,
"grad_norm": 0.3878387212753296,
"learning_rate": 9.849471947649617e-05,
"loss": 0.054,
"step": 2240
},
{
"epoch": 2.317198764160659,
"grad_norm": 0.44272416830062866,
"learning_rate": 9.847226749475695e-05,
"loss": 0.067,
"step": 2250
},
{
"epoch": 2.3274974253347063,
"grad_norm": 0.38929831981658936,
"learning_rate": 9.844965190644817e-05,
"loss": 0.0518,
"step": 2260
},
{
"epoch": 2.337796086508754,
"grad_norm": 0.3083374798297882,
"learning_rate": 9.842687278790337e-05,
"loss": 0.0484,
"step": 2270
},
{
"epoch": 2.348094747682801,
"grad_norm": 0.41075581312179565,
"learning_rate": 9.8403930216008e-05,
"loss": 0.0635,
"step": 2280
},
{
"epoch": 2.358393408856849,
"grad_norm": 0.2911306917667389,
"learning_rate": 9.838082426819926e-05,
"loss": 0.0599,
"step": 2290
},
{
"epoch": 2.368692070030896,
"grad_norm": 0.524851381778717,
"learning_rate": 9.835755502246575e-05,
"loss": 0.0542,
"step": 2300
},
{
"epoch": 2.378990731204943,
"grad_norm": 0.45933887362480164,
"learning_rate": 9.833412255734724e-05,
"loss": 0.0671,
"step": 2310
},
{
"epoch": 2.389289392378991,
"grad_norm": 0.38324400782585144,
"learning_rate": 9.831052695193445e-05,
"loss": 0.0596,
"step": 2320
},
{
"epoch": 2.399588053553038,
"grad_norm": 0.7916087508201599,
"learning_rate": 9.828676828586871e-05,
"loss": 0.0722,
"step": 2330
},
{
"epoch": 2.4098867147270857,
"grad_norm": 0.4739670157432556,
"learning_rate": 9.826284663934171e-05,
"loss": 0.0596,
"step": 2340
},
{
"epoch": 2.420185375901133,
"grad_norm": 0.37064895033836365,
"learning_rate": 9.823876209309527e-05,
"loss": 0.062,
"step": 2350
},
{
"epoch": 2.43048403707518,
"grad_norm": 0.6001970171928406,
"learning_rate": 9.821451472842102e-05,
"loss": 0.0623,
"step": 2360
},
{
"epoch": 2.4407826982492278,
"grad_norm": 0.40998250246047974,
"learning_rate": 9.819010462716016e-05,
"loss": 0.0586,
"step": 2370
},
{
"epoch": 2.451081359423275,
"grad_norm": 0.4756927490234375,
"learning_rate": 9.816553187170317e-05,
"loss": 0.0522,
"step": 2380
},
{
"epoch": 2.461380020597322,
"grad_norm": 0.47659242153167725,
"learning_rate": 9.814079654498949e-05,
"loss": 0.0573,
"step": 2390
},
{
"epoch": 2.47167868177137,
"grad_norm": 0.4043289124965668,
"learning_rate": 9.811589873050735e-05,
"loss": 0.0654,
"step": 2400
},
{
"epoch": 2.481977342945417,
"grad_norm": 0.7355890870094299,
"learning_rate": 9.809083851229335e-05,
"loss": 0.0523,
"step": 2410
},
{
"epoch": 2.4922760041194643,
"grad_norm": 0.4957990348339081,
"learning_rate": 9.806561597493228e-05,
"loss": 0.0566,
"step": 2420
},
{
"epoch": 2.502574665293512,
"grad_norm": 0.3758098781108856,
"learning_rate": 9.80402312035568e-05,
"loss": 0.0509,
"step": 2430
},
{
"epoch": 2.512873326467559,
"grad_norm": 0.4361479878425598,
"learning_rate": 9.801468428384716e-05,
"loss": 0.0566,
"step": 2440
},
{
"epoch": 2.5231719876416063,
"grad_norm": 0.4788246750831604,
"learning_rate": 9.798897530203087e-05,
"loss": 0.0577,
"step": 2450
},
{
"epoch": 2.533470648815654,
"grad_norm": 0.3828676640987396,
"learning_rate": 9.796310434488248e-05,
"loss": 0.0552,
"step": 2460
},
{
"epoch": 2.543769309989701,
"grad_norm": 0.34888461232185364,
"learning_rate": 9.79370714997232e-05,
"loss": 0.0562,
"step": 2470
},
{
"epoch": 2.554067971163749,
"grad_norm": 0.5660400986671448,
"learning_rate": 9.791087685442071e-05,
"loss": 0.0593,
"step": 2480
},
{
"epoch": 2.564366632337796,
"grad_norm": 0.3883237838745117,
"learning_rate": 9.788452049738879e-05,
"loss": 0.0567,
"step": 2490
},
{
"epoch": 2.5746652935118437,
"grad_norm": 0.34366926550865173,
"learning_rate": 9.785800251758701e-05,
"loss": 0.055,
"step": 2500
},
{
"epoch": 2.584963954685891,
"grad_norm": 0.2992055416107178,
"learning_rate": 9.783132300452049e-05,
"loss": 0.053,
"step": 2510
},
{
"epoch": 2.595262615859938,
"grad_norm": 0.3543379306793213,
"learning_rate": 9.780448204823958e-05,
"loss": 0.0587,
"step": 2520
},
{
"epoch": 2.6055612770339858,
"grad_norm": 0.32997754216194153,
"learning_rate": 9.777747973933948e-05,
"loss": 0.0483,
"step": 2530
},
{
"epoch": 2.615859938208033,
"grad_norm": 0.4290192425251007,
"learning_rate": 9.775031616896008e-05,
"loss": 0.0565,
"step": 2540
},
{
"epoch": 2.62615859938208,
"grad_norm": 0.39540722966194153,
"learning_rate": 9.772299142878549e-05,
"loss": 0.0567,
"step": 2550
},
{
"epoch": 2.636457260556128,
"grad_norm": 0.46537721157073975,
"learning_rate": 9.769550561104388e-05,
"loss": 0.0511,
"step": 2560
},
{
"epoch": 2.646755921730175,
"grad_norm": 0.4019800126552582,
"learning_rate": 9.766785880850707e-05,
"loss": 0.0576,
"step": 2570
},
{
"epoch": 2.6570545829042223,
"grad_norm": 0.3543599545955658,
"learning_rate": 9.764005111449021e-05,
"loss": 0.0561,
"step": 2580
},
{
"epoch": 2.66735324407827,
"grad_norm": 0.459049791097641,
"learning_rate": 9.761208262285155e-05,
"loss": 0.0626,
"step": 2590
},
{
"epoch": 2.677651905252317,
"grad_norm": 0.4867796003818512,
"learning_rate": 9.758395342799206e-05,
"loss": 0.0504,
"step": 2600
},
{
"epoch": 2.6879505664263643,
"grad_norm": 0.42788106203079224,
"learning_rate": 9.755566362485512e-05,
"loss": 0.0578,
"step": 2610
},
{
"epoch": 2.698249227600412,
"grad_norm": 0.3226776719093323,
"learning_rate": 9.752721330892624e-05,
"loss": 0.0552,
"step": 2620
},
{
"epoch": 2.708547888774459,
"grad_norm": 0.4271225333213806,
"learning_rate": 9.749860257623263e-05,
"loss": 0.0549,
"step": 2630
},
{
"epoch": 2.718846549948507,
"grad_norm": 0.39057081937789917,
"learning_rate": 9.7469831523343e-05,
"loss": 0.0558,
"step": 2640
},
{
"epoch": 2.729145211122554,
"grad_norm": 0.4585021436214447,
"learning_rate": 9.744090024736719e-05,
"loss": 0.0481,
"step": 2650
},
{
"epoch": 2.7394438722966017,
"grad_norm": 0.4004554748535156,
"learning_rate": 9.741180884595578e-05,
"loss": 0.0671,
"step": 2660
},
{
"epoch": 2.749742533470649,
"grad_norm": 0.3565993010997772,
"learning_rate": 9.738255741729987e-05,
"loss": 0.0623,
"step": 2670
},
{
"epoch": 2.760041194644696,
"grad_norm": 0.30855366587638855,
"learning_rate": 9.735314606013068e-05,
"loss": 0.0588,
"step": 2680
},
{
"epoch": 2.7703398558187438,
"grad_norm": 0.4170495271682739,
"learning_rate": 9.732357487371924e-05,
"loss": 0.056,
"step": 2690
},
{
"epoch": 2.780638516992791,
"grad_norm": 0.5667279362678528,
"learning_rate": 9.729384395787602e-05,
"loss": 0.0612,
"step": 2700
},
{
"epoch": 2.790937178166838,
"grad_norm": 0.27353501319885254,
"learning_rate": 9.726395341295062e-05,
"loss": 0.0493,
"step": 2710
},
{
"epoch": 2.801235839340886,
"grad_norm": 0.5288174152374268,
"learning_rate": 9.723390333983144e-05,
"loss": 0.0629,
"step": 2720
},
{
"epoch": 2.811534500514933,
"grad_norm": 0.4831124544143677,
"learning_rate": 9.720369383994535e-05,
"loss": 0.0549,
"step": 2730
},
{
"epoch": 2.8218331616889802,
"grad_norm": 0.3807002902030945,
"learning_rate": 9.717332501525729e-05,
"loss": 0.0561,
"step": 2740
},
{
"epoch": 2.832131822863028,
"grad_norm": 0.6944444179534912,
"learning_rate": 9.714279696826998e-05,
"loss": 0.0564,
"step": 2750
},
{
"epoch": 2.842430484037075,
"grad_norm": 0.3146667778491974,
"learning_rate": 9.711210980202354e-05,
"loss": 0.0544,
"step": 2760
},
{
"epoch": 2.8527291452111223,
"grad_norm": 0.4342884421348572,
"learning_rate": 9.708126362009522e-05,
"loss": 0.0541,
"step": 2770
},
{
"epoch": 2.86302780638517,
"grad_norm": 0.4473581612110138,
"learning_rate": 9.70502585265989e-05,
"loss": 0.0567,
"step": 2780
},
{
"epoch": 2.873326467559217,
"grad_norm": 0.34954315423965454,
"learning_rate": 9.70190946261849e-05,
"loss": 0.0508,
"step": 2790
},
{
"epoch": 2.883625128733265,
"grad_norm": 0.37677961587905884,
"learning_rate": 9.698777202403953e-05,
"loss": 0.0555,
"step": 2800
},
{
"epoch": 2.893923789907312,
"grad_norm": 0.3924347460269928,
"learning_rate": 9.695629082588473e-05,
"loss": 0.0607,
"step": 2810
},
{
"epoch": 2.9042224510813597,
"grad_norm": 0.34362998604774475,
"learning_rate": 9.69246511379778e-05,
"loss": 0.0479,
"step": 2820
},
{
"epoch": 2.914521112255407,
"grad_norm": 0.48478758335113525,
"learning_rate": 9.689285306711094e-05,
"loss": 0.0564,
"step": 2830
},
{
"epoch": 2.924819773429454,
"grad_norm": 0.39429691433906555,
"learning_rate": 9.686089672061094e-05,
"loss": 0.0552,
"step": 2840
},
{
"epoch": 2.9351184346035017,
"grad_norm": 0.27760738134384155,
"learning_rate": 9.682878220633885e-05,
"loss": 0.0507,
"step": 2850
},
{
"epoch": 2.945417095777549,
"grad_norm": 0.3564143180847168,
"learning_rate": 9.679650963268951e-05,
"loss": 0.0529,
"step": 2860
},
{
"epoch": 2.955715756951596,
"grad_norm": 0.3425343930721283,
"learning_rate": 9.676407910859131e-05,
"loss": 0.05,
"step": 2870
},
{
"epoch": 2.966014418125644,
"grad_norm": 0.3504887819290161,
"learning_rate": 9.673149074350573e-05,
"loss": 0.0529,
"step": 2880
},
{
"epoch": 2.976313079299691,
"grad_norm": 0.432216078042984,
"learning_rate": 9.669874464742705e-05,
"loss": 0.0582,
"step": 2890
},
{
"epoch": 2.9866117404737382,
"grad_norm": 0.4117823541164398,
"learning_rate": 9.666584093088189e-05,
"loss": 0.0516,
"step": 2900
},
{
"epoch": 2.996910401647786,
"grad_norm": 0.4118179380893707,
"learning_rate": 9.663277970492886e-05,
"loss": 0.0664,
"step": 2910
},
{
"epoch": 3.007209062821833,
"grad_norm": 0.31822094321250916,
"learning_rate": 9.659956108115827e-05,
"loss": 0.0607,
"step": 2920
},
{
"epoch": 3.0175077239958807,
"grad_norm": 0.34220412373542786,
"learning_rate": 9.656618517169164e-05,
"loss": 0.0523,
"step": 2930
},
{
"epoch": 3.027806385169928,
"grad_norm": 0.33871203660964966,
"learning_rate": 9.65326520891814e-05,
"loss": 0.0486,
"step": 2940
},
{
"epoch": 3.038105046343975,
"grad_norm": 0.4035494327545166,
"learning_rate": 9.649896194681045e-05,
"loss": 0.0497,
"step": 2950
},
{
"epoch": 3.048403707518023,
"grad_norm": 0.36851248145103455,
"learning_rate": 9.646511485829186e-05,
"loss": 0.062,
"step": 2960
},
{
"epoch": 3.05870236869207,
"grad_norm": 0.3193969428539276,
"learning_rate": 9.643111093786835e-05,
"loss": 0.0514,
"step": 2970
},
{
"epoch": 3.0690010298661172,
"grad_norm": 0.331909716129303,
"learning_rate": 9.639695030031204e-05,
"loss": 0.0488,
"step": 2980
},
{
"epoch": 3.079299691040165,
"grad_norm": 0.35757410526275635,
"learning_rate": 9.636263306092406e-05,
"loss": 0.0576,
"step": 2990
},
{
"epoch": 3.089598352214212,
"grad_norm": 0.4217674434185028,
"learning_rate": 9.6328159335534e-05,
"loss": 0.0554,
"step": 3000
},
{
"epoch": 3.0998970133882597,
"grad_norm": 0.3531946539878845,
"learning_rate": 9.629352924049975e-05,
"loss": 0.059,
"step": 3010
},
{
"epoch": 3.110195674562307,
"grad_norm": 0.39479324221611023,
"learning_rate": 9.625874289270688e-05,
"loss": 0.0621,
"step": 3020
},
{
"epoch": 3.120494335736354,
"grad_norm": 0.29987436532974243,
"learning_rate": 9.622380040956842e-05,
"loss": 0.0511,
"step": 3030
},
{
"epoch": 3.130792996910402,
"grad_norm": 0.5292258262634277,
"learning_rate": 9.61887019090244e-05,
"loss": 0.0564,
"step": 3040
},
{
"epoch": 3.141091658084449,
"grad_norm": 0.33128613233566284,
"learning_rate": 9.615344750954141e-05,
"loss": 0.0548,
"step": 3050
},
{
"epoch": 3.151390319258496,
"grad_norm": 0.43356847763061523,
"learning_rate": 9.611803733011229e-05,
"loss": 0.0557,
"step": 3060
},
{
"epoch": 3.161688980432544,
"grad_norm": 0.4408741295337677,
"learning_rate": 9.60824714902556e-05,
"loss": 0.0582,
"step": 3070
},
{
"epoch": 3.171987641606591,
"grad_norm": 0.307669460773468,
"learning_rate": 9.604675011001538e-05,
"loss": 0.0442,
"step": 3080
},
{
"epoch": 3.1822863027806383,
"grad_norm": 0.49202683568000793,
"learning_rate": 9.601087330996061e-05,
"loss": 0.0599,
"step": 3090
},
{
"epoch": 3.192584963954686,
"grad_norm": 0.3430628180503845,
"learning_rate": 9.597484121118487e-05,
"loss": 0.0501,
"step": 3100
},
{
"epoch": 3.202883625128733,
"grad_norm": 0.45715686678886414,
"learning_rate": 9.593865393530592e-05,
"loss": 0.0533,
"step": 3110
},
{
"epoch": 3.213182286302781,
"grad_norm": 0.29405537247657776,
"learning_rate": 9.590231160446526e-05,
"loss": 0.0579,
"step": 3120
},
{
"epoch": 3.223480947476828,
"grad_norm": 0.4138418436050415,
"learning_rate": 9.586581434132775e-05,
"loss": 0.0553,
"step": 3130
},
{
"epoch": 3.233779608650875,
"grad_norm": 0.2747637927532196,
"learning_rate": 9.582916226908118e-05,
"loss": 0.0534,
"step": 3140
},
{
"epoch": 3.244078269824923,
"grad_norm": 0.3608400821685791,
"learning_rate": 9.57923555114359e-05,
"loss": 0.0512,
"step": 3150
},
{
"epoch": 3.25437693099897,
"grad_norm": 0.4042729437351227,
"learning_rate": 9.575539419262434e-05,
"loss": 0.0445,
"step": 3160
},
{
"epoch": 3.2646755921730177,
"grad_norm": 0.35471370816230774,
"learning_rate": 9.571827843740057e-05,
"loss": 0.0542,
"step": 3170
},
{
"epoch": 3.274974253347065,
"grad_norm": 0.2936842441558838,
"learning_rate": 9.568100837104e-05,
"loss": 0.0505,
"step": 3180
},
{
"epoch": 3.285272914521112,
"grad_norm": 0.2880595028400421,
"learning_rate": 9.56435841193388e-05,
"loss": 0.0458,
"step": 3190
},
{
"epoch": 3.29557157569516,
"grad_norm": 0.33003637194633484,
"learning_rate": 9.560600580861365e-05,
"loss": 0.0576,
"step": 3200
},
{
"epoch": 3.305870236869207,
"grad_norm": 0.4025996923446655,
"learning_rate": 9.556827356570116e-05,
"loss": 0.0598,
"step": 3210
},
{
"epoch": 3.316168898043254,
"grad_norm": 0.5448514819145203,
"learning_rate": 9.553038751795746e-05,
"loss": 0.0503,
"step": 3220
},
{
"epoch": 3.326467559217302,
"grad_norm": 0.39959079027175903,
"learning_rate": 9.549234779325792e-05,
"loss": 0.0581,
"step": 3230
},
{
"epoch": 3.336766220391349,
"grad_norm": 0.31689804792404175,
"learning_rate": 9.545415451999653e-05,
"loss": 0.054,
"step": 3240
},
{
"epoch": 3.3470648815653963,
"grad_norm": 0.5861422419548035,
"learning_rate": 9.541580782708557e-05,
"loss": 0.0498,
"step": 3250
},
{
"epoch": 3.357363542739444,
"grad_norm": 0.36639899015426636,
"learning_rate": 9.537730784395514e-05,
"loss": 0.0625,
"step": 3260
},
{
"epoch": 3.367662203913491,
"grad_norm": 0.3032686710357666,
"learning_rate": 9.533865470055275e-05,
"loss": 0.0543,
"step": 3270
},
{
"epoch": 3.377960865087539,
"grad_norm": 0.4109341502189636,
"learning_rate": 9.529984852734285e-05,
"loss": 0.0582,
"step": 3280
},
{
"epoch": 3.388259526261586,
"grad_norm": 0.38670700788497925,
"learning_rate": 9.526088945530645e-05,
"loss": 0.0547,
"step": 3290
},
{
"epoch": 3.398558187435633,
"grad_norm": 0.30283281207084656,
"learning_rate": 9.522177761594057e-05,
"loss": 0.0434,
"step": 3300
},
{
"epoch": 3.408856848609681,
"grad_norm": 0.3940243721008301,
"learning_rate": 9.518251314125788e-05,
"loss": 0.0548,
"step": 3310
},
{
"epoch": 3.419155509783728,
"grad_norm": 0.6107800006866455,
"learning_rate": 9.514309616378626e-05,
"loss": 0.0453,
"step": 3320
},
{
"epoch": 3.4294541709577757,
"grad_norm": 0.3535449802875519,
"learning_rate": 9.510352681656832e-05,
"loss": 0.0509,
"step": 3330
},
{
"epoch": 3.439752832131823,
"grad_norm": 0.4279785454273224,
"learning_rate": 9.50638052331609e-05,
"loss": 0.0511,
"step": 3340
},
{
"epoch": 3.45005149330587,
"grad_norm": 0.5184943675994873,
"learning_rate": 9.502393154763478e-05,
"loss": 0.0553,
"step": 3350
},
{
"epoch": 3.460350154479918,
"grad_norm": 0.6247850656509399,
"learning_rate": 9.498390589457404e-05,
"loss": 0.0485,
"step": 3360
},
{
"epoch": 3.470648815653965,
"grad_norm": 0.4810273349285126,
"learning_rate": 9.494372840907572e-05,
"loss": 0.0646,
"step": 3370
},
{
"epoch": 3.480947476828012,
"grad_norm": 0.31024450063705444,
"learning_rate": 9.490339922674934e-05,
"loss": 0.0506,
"step": 3380
},
{
"epoch": 3.49124613800206,
"grad_norm": 0.3408045172691345,
"learning_rate": 9.486291848371643e-05,
"loss": 0.0598,
"step": 3390
},
{
"epoch": 3.501544799176107,
"grad_norm": 0.3190326988697052,
"learning_rate": 9.482228631661005e-05,
"loss": 0.0569,
"step": 3400
},
{
"epoch": 3.5118434603501543,
"grad_norm": 0.3894359767436981,
"learning_rate": 9.478150286257443e-05,
"loss": 0.048,
"step": 3410
},
{
"epoch": 3.522142121524202,
"grad_norm": 0.33339062333106995,
"learning_rate": 9.474056825926434e-05,
"loss": 0.0533,
"step": 3420
},
{
"epoch": 3.532440782698249,
"grad_norm": 0.4688987731933594,
"learning_rate": 9.46994826448448e-05,
"loss": 0.0495,
"step": 3430
},
{
"epoch": 3.5427394438722963,
"grad_norm": 0.24669192731380463,
"learning_rate": 9.465824615799046e-05,
"loss": 0.0487,
"step": 3440
},
{
"epoch": 3.553038105046344,
"grad_norm": 0.43672746419906616,
"learning_rate": 9.461685893788526e-05,
"loss": 0.0529,
"step": 3450
},
{
"epoch": 3.563336766220391,
"grad_norm": 0.3806833028793335,
"learning_rate": 9.457532112422187e-05,
"loss": 0.0644,
"step": 3460
},
{
"epoch": 3.573635427394439,
"grad_norm": 0.43160000443458557,
"learning_rate": 9.453363285720129e-05,
"loss": 0.046,
"step": 3470
},
{
"epoch": 3.583934088568486,
"grad_norm": 0.3873897194862366,
"learning_rate": 9.44917942775323e-05,
"loss": 0.0561,
"step": 3480
},
{
"epoch": 3.5942327497425337,
"grad_norm": 0.420244425535202,
"learning_rate": 9.444980552643103e-05,
"loss": 0.0544,
"step": 3490
},
{
"epoch": 3.604531410916581,
"grad_norm": 0.2572662830352783,
"learning_rate": 9.44076667456205e-05,
"loss": 0.0609,
"step": 3500
},
{
"epoch": 3.614830072090628,
"grad_norm": 0.5829557776451111,
"learning_rate": 9.43653780773301e-05,
"loss": 0.0683,
"step": 3510
},
{
"epoch": 3.6251287332646758,
"grad_norm": 0.5830304622650146,
"learning_rate": 9.432293966429514e-05,
"loss": 0.067,
"step": 3520
},
{
"epoch": 3.635427394438723,
"grad_norm": 0.38021519780158997,
"learning_rate": 9.428035164975636e-05,
"loss": 0.0498,
"step": 3530
},
{
"epoch": 3.64572605561277,
"grad_norm": 0.4201594591140747,
"learning_rate": 9.423761417745942e-05,
"loss": 0.0569,
"step": 3540
},
{
"epoch": 3.656024716786818,
"grad_norm": 0.5576770305633545,
"learning_rate": 9.419472739165449e-05,
"loss": 0.0667,
"step": 3550
},
{
"epoch": 3.666323377960865,
"grad_norm": 0.34150251746177673,
"learning_rate": 9.415169143709565e-05,
"loss": 0.0539,
"step": 3560
},
{
"epoch": 3.6766220391349123,
"grad_norm": 0.5191327333450317,
"learning_rate": 9.410850645904049e-05,
"loss": 0.0609,
"step": 3570
},
{
"epoch": 3.68692070030896,
"grad_norm": 0.3418954610824585,
"learning_rate": 9.40651726032496e-05,
"loss": 0.0485,
"step": 3580
},
{
"epoch": 3.697219361483007,
"grad_norm": 0.44254234433174133,
"learning_rate": 9.402169001598611e-05,
"loss": 0.0552,
"step": 3590
},
{
"epoch": 3.7075180226570543,
"grad_norm": 0.549349308013916,
"learning_rate": 9.397805884401504e-05,
"loss": 0.0601,
"step": 3600
},
{
"epoch": 3.717816683831102,
"grad_norm": 0.4500453472137451,
"learning_rate": 9.393427923460308e-05,
"loss": 0.0496,
"step": 3610
},
{
"epoch": 3.728115345005149,
"grad_norm": 0.5540750622749329,
"learning_rate": 9.389035133551778e-05,
"loss": 0.0563,
"step": 3620
},
{
"epoch": 3.738414006179197,
"grad_norm": 0.28786641359329224,
"learning_rate": 9.38462752950273e-05,
"loss": 0.0532,
"step": 3630
},
{
"epoch": 3.748712667353244,
"grad_norm": 0.3725302219390869,
"learning_rate": 9.380205126189983e-05,
"loss": 0.0558,
"step": 3640
},
{
"epoch": 3.7590113285272917,
"grad_norm": 0.47449609637260437,
"learning_rate": 9.375767938540299e-05,
"loss": 0.0559,
"step": 3650
},
{
"epoch": 3.769309989701339,
"grad_norm": 0.5294702649116516,
"learning_rate": 9.371315981530349e-05,
"loss": 0.0534,
"step": 3660
},
{
"epoch": 3.779608650875386,
"grad_norm": 0.29216107726097107,
"learning_rate": 9.366849270186649e-05,
"loss": 0.0519,
"step": 3670
},
{
"epoch": 3.7899073120494338,
"grad_norm": 0.28166675567626953,
"learning_rate": 9.362367819585518e-05,
"loss": 0.0532,
"step": 3680
},
{
"epoch": 3.800205973223481,
"grad_norm": 0.5699660778045654,
"learning_rate": 9.357871644853024e-05,
"loss": 0.0533,
"step": 3690
},
{
"epoch": 3.810504634397528,
"grad_norm": 0.44877076148986816,
"learning_rate": 9.353360761164931e-05,
"loss": 0.0569,
"step": 3700
},
{
"epoch": 3.820803295571576,
"grad_norm": 0.4341685175895691,
"learning_rate": 9.348835183746649e-05,
"loss": 0.0579,
"step": 3710
},
{
"epoch": 3.831101956745623,
"grad_norm": 0.37804657220840454,
"learning_rate": 9.344294927873188e-05,
"loss": 0.0535,
"step": 3720
},
{
"epoch": 3.8414006179196702,
"grad_norm": 0.47172001004219055,
"learning_rate": 9.339740008869092e-05,
"loss": 0.049,
"step": 3730
},
{
"epoch": 3.851699279093718,
"grad_norm": 0.29430967569351196,
"learning_rate": 9.335170442108408e-05,
"loss": 0.0547,
"step": 3740
},
{
"epoch": 3.861997940267765,
"grad_norm": 0.40547069907188416,
"learning_rate": 9.330586243014617e-05,
"loss": 0.0486,
"step": 3750
},
{
"epoch": 3.8722966014418123,
"grad_norm": 0.3896206319332123,
"learning_rate": 9.325987427060586e-05,
"loss": 0.0585,
"step": 3760
},
{
"epoch": 3.88259526261586,
"grad_norm": 0.29565155506134033,
"learning_rate": 9.321374009768525e-05,
"loss": 0.0508,
"step": 3770
},
{
"epoch": 3.892893923789907,
"grad_norm": 0.5239169597625732,
"learning_rate": 9.316746006709919e-05,
"loss": 0.0608,
"step": 3780
},
{
"epoch": 3.903192584963955,
"grad_norm": 0.2817414402961731,
"learning_rate": 9.31210343350549e-05,
"loss": 0.0465,
"step": 3790
},
{
"epoch": 3.913491246138002,
"grad_norm": 0.4744998514652252,
"learning_rate": 9.307446305825135e-05,
"loss": 0.0616,
"step": 3800
},
{
"epoch": 3.9237899073120497,
"grad_norm": 0.4715334475040436,
"learning_rate": 9.302774639387877e-05,
"loss": 0.0557,
"step": 3810
},
{
"epoch": 3.934088568486097,
"grad_norm": 0.5753309726715088,
"learning_rate": 9.298088449961813e-05,
"loss": 0.0592,
"step": 3820
},
{
"epoch": 3.944387229660144,
"grad_norm": 0.318158358335495,
"learning_rate": 9.293387753364052e-05,
"loss": 0.0604,
"step": 3830
},
{
"epoch": 3.9546858908341918,
"grad_norm": 0.4752749800682068,
"learning_rate": 9.288672565460679e-05,
"loss": 0.049,
"step": 3840
},
{
"epoch": 3.964984552008239,
"grad_norm": 0.284682035446167,
"learning_rate": 9.283942902166681e-05,
"loss": 0.0491,
"step": 3850
},
{
"epoch": 3.975283213182286,
"grad_norm": 0.4126709997653961,
"learning_rate": 9.27919877944591e-05,
"loss": 0.0508,
"step": 3860
},
{
"epoch": 3.985581874356334,
"grad_norm": 0.34126409888267517,
"learning_rate": 9.27444021331102e-05,
"loss": 0.0545,
"step": 3870
},
{
"epoch": 3.995880535530381,
"grad_norm": 0.5670478343963623,
"learning_rate": 9.269667219823412e-05,
"loss": 0.0483,
"step": 3880
},
{
"epoch": 4.006179196704428,
"grad_norm": 0.3084736466407776,
"learning_rate": 9.264879815093191e-05,
"loss": 0.0499,
"step": 3890
},
{
"epoch": 4.016477857878476,
"grad_norm": 0.4823373258113861,
"learning_rate": 9.260078015279096e-05,
"loss": 0.0558,
"step": 3900
},
{
"epoch": 4.0267765190525235,
"grad_norm": 0.2889825105667114,
"learning_rate": 9.255261836588458e-05,
"loss": 0.0561,
"step": 3910
},
{
"epoch": 4.03707518022657,
"grad_norm": 0.28834298253059387,
"learning_rate": 9.250431295277137e-05,
"loss": 0.0498,
"step": 3920
},
{
"epoch": 4.047373841400618,
"grad_norm": 0.40643489360809326,
"learning_rate": 9.245586407649473e-05,
"loss": 0.0479,
"step": 3930
},
{
"epoch": 4.057672502574666,
"grad_norm": 0.3214862644672394,
"learning_rate": 9.240727190058227e-05,
"loss": 0.0498,
"step": 3940
},
{
"epoch": 4.067971163748712,
"grad_norm": 0.40402647852897644,
"learning_rate": 9.235853658904529e-05,
"loss": 0.0522,
"step": 3950
},
{
"epoch": 4.07826982492276,
"grad_norm": 0.3338010311126709,
"learning_rate": 9.230965830637821e-05,
"loss": 0.0506,
"step": 3960
},
{
"epoch": 4.088568486096808,
"grad_norm": 0.42742258310317993,
"learning_rate": 9.226063721755799e-05,
"loss": 0.053,
"step": 3970
},
{
"epoch": 4.098867147270854,
"grad_norm": 0.3947793245315552,
"learning_rate": 9.221147348804362e-05,
"loss": 0.0541,
"step": 3980
},
{
"epoch": 4.109165808444902,
"grad_norm": 0.4395465552806854,
"learning_rate": 9.216216728377554e-05,
"loss": 0.0509,
"step": 3990
},
{
"epoch": 4.11946446961895,
"grad_norm": 0.28796476125717163,
"learning_rate": 9.211271877117507e-05,
"loss": 0.0501,
"step": 4000
},
{
"epoch": 4.1297631307929965,
"grad_norm": 0.31560418009757996,
"learning_rate": 9.206312811714386e-05,
"loss": 0.0502,
"step": 4010
},
{
"epoch": 4.140061791967044,
"grad_norm": 0.45714765787124634,
"learning_rate": 9.201339548906332e-05,
"loss": 0.0579,
"step": 4020
},
{
"epoch": 4.150360453141092,
"grad_norm": 0.3373541831970215,
"learning_rate": 9.196352105479409e-05,
"loss": 0.0504,
"step": 4030
},
{
"epoch": 4.1606591143151395,
"grad_norm": 0.5105737447738647,
"learning_rate": 9.19135049826754e-05,
"loss": 0.0619,
"step": 4040
},
{
"epoch": 4.170957775489186,
"grad_norm": 0.3023523688316345,
"learning_rate": 9.186334744152458e-05,
"loss": 0.0499,
"step": 4050
},
{
"epoch": 4.181256436663234,
"grad_norm": 0.3311084508895874,
"learning_rate": 9.18130486006364e-05,
"loss": 0.0484,
"step": 4060
},
{
"epoch": 4.1915550978372815,
"grad_norm": 0.3167574405670166,
"learning_rate": 9.176260862978263e-05,
"loss": 0.0605,
"step": 4070
},
{
"epoch": 4.201853759011328,
"grad_norm": 0.3764163553714752,
"learning_rate": 9.171202769921134e-05,
"loss": 0.0521,
"step": 4080
},
{
"epoch": 4.212152420185376,
"grad_norm": 0.325210303068161,
"learning_rate": 9.16613059796464e-05,
"loss": 0.0471,
"step": 4090
},
{
"epoch": 4.222451081359424,
"grad_norm": 0.3970625102519989,
"learning_rate": 9.161044364228683e-05,
"loss": 0.0545,
"step": 4100
},
{
"epoch": 4.23274974253347,
"grad_norm": 0.306384414434433,
"learning_rate": 9.155944085880637e-05,
"loss": 0.0539,
"step": 4110
},
{
"epoch": 4.243048403707518,
"grad_norm": 0.4230334162712097,
"learning_rate": 9.150829780135269e-05,
"loss": 0.0456,
"step": 4120
},
{
"epoch": 4.253347064881566,
"grad_norm": 0.29097849130630493,
"learning_rate": 9.145701464254698e-05,
"loss": 0.0511,
"step": 4130
},
{
"epoch": 4.263645726055612,
"grad_norm": 0.390979140996933,
"learning_rate": 9.140559155548333e-05,
"loss": 0.0461,
"step": 4140
},
{
"epoch": 4.27394438722966,
"grad_norm": 0.2566828429698944,
"learning_rate": 9.135402871372808e-05,
"loss": 0.0508,
"step": 4150
},
{
"epoch": 4.284243048403708,
"grad_norm": 0.4710136651992798,
"learning_rate": 9.130232629131932e-05,
"loss": 0.0503,
"step": 4160
},
{
"epoch": 4.2945417095777545,
"grad_norm": 0.4374995827674866,
"learning_rate": 9.125048446276618e-05,
"loss": 0.0599,
"step": 4170
},
{
"epoch": 4.304840370751802,
"grad_norm": 0.43765076994895935,
"learning_rate": 9.119850340304843e-05,
"loss": 0.0531,
"step": 4180
},
{
"epoch": 4.31513903192585,
"grad_norm": 0.45118576288223267,
"learning_rate": 9.114638328761571e-05,
"loss": 0.0527,
"step": 4190
},
{
"epoch": 4.325437693099897,
"grad_norm": 0.3243924379348755,
"learning_rate": 9.109412429238704e-05,
"loss": 0.0431,
"step": 4200
},
{
"epoch": 4.335736354273944,
"grad_norm": 0.33518919348716736,
"learning_rate": 9.104172659375017e-05,
"loss": 0.0491,
"step": 4210
},
{
"epoch": 4.346035015447992,
"grad_norm": 0.6875081062316895,
"learning_rate": 9.098919036856102e-05,
"loss": 0.0488,
"step": 4220
},
{
"epoch": 4.3563336766220395,
"grad_norm": 0.5093826055526733,
"learning_rate": 9.093651579414311e-05,
"loss": 0.0487,
"step": 4230
},
{
"epoch": 4.366632337796086,
"grad_norm": 0.37270835041999817,
"learning_rate": 9.088370304828685e-05,
"loss": 0.0559,
"step": 4240
},
{
"epoch": 4.376930998970134,
"grad_norm": 0.4596996307373047,
"learning_rate": 9.083075230924913e-05,
"loss": 0.0578,
"step": 4250
},
{
"epoch": 4.387229660144182,
"grad_norm": 0.3775595426559448,
"learning_rate": 9.077766375575246e-05,
"loss": 0.0562,
"step": 4260
},
{
"epoch": 4.397528321318228,
"grad_norm": 0.3252449333667755,
"learning_rate": 9.072443756698459e-05,
"loss": 0.0558,
"step": 4270
},
{
"epoch": 4.407826982492276,
"grad_norm": 0.42610299587249756,
"learning_rate": 9.067107392259783e-05,
"loss": 0.0455,
"step": 4280
},
{
"epoch": 4.418125643666324,
"grad_norm": 0.36227330565452576,
"learning_rate": 9.061757300270845e-05,
"loss": 0.0498,
"step": 4290
},
{
"epoch": 4.42842430484037,
"grad_norm": 0.4343869686126709,
"learning_rate": 9.056393498789602e-05,
"loss": 0.0504,
"step": 4300
},
{
"epoch": 4.438722966014418,
"grad_norm": 0.4492502808570862,
"learning_rate": 9.051016005920282e-05,
"loss": 0.0526,
"step": 4310
},
{
"epoch": 4.449021627188466,
"grad_norm": 0.2649560868740082,
"learning_rate": 9.045624839813334e-05,
"loss": 0.0488,
"step": 4320
},
{
"epoch": 4.4593202883625125,
"grad_norm": 0.2290182262659073,
"learning_rate": 9.040220018665347e-05,
"loss": 0.0427,
"step": 4330
},
{
"epoch": 4.46961894953656,
"grad_norm": 0.37687376141548157,
"learning_rate": 9.034801560719011e-05,
"loss": 0.0437,
"step": 4340
},
{
"epoch": 4.479917610710608,
"grad_norm": 0.21943651139736176,
"learning_rate": 9.029369484263033e-05,
"loss": 0.047,
"step": 4350
},
{
"epoch": 4.490216271884655,
"grad_norm": 0.32304951548576355,
"learning_rate": 9.02392380763209e-05,
"loss": 0.0461,
"step": 4360
},
{
"epoch": 4.500514933058702,
"grad_norm": 0.21305856108665466,
"learning_rate": 9.018464549206769e-05,
"loss": 0.0461,
"step": 4370
},
{
"epoch": 4.51081359423275,
"grad_norm": 0.6847507953643799,
"learning_rate": 9.012991727413487e-05,
"loss": 0.0475,
"step": 4380
},
{
"epoch": 4.521112255406797,
"grad_norm": 0.3444644808769226,
"learning_rate": 9.007505360724453e-05,
"loss": 0.0423,
"step": 4390
},
{
"epoch": 4.531410916580844,
"grad_norm": 0.3524458110332489,
"learning_rate": 9.002005467657586e-05,
"loss": 0.058,
"step": 4400
},
{
"epoch": 4.541709577754892,
"grad_norm": 0.4131333529949188,
"learning_rate": 8.996492066776464e-05,
"loss": 0.0462,
"step": 4410
},
{
"epoch": 4.55200823892894,
"grad_norm": 0.35865673422813416,
"learning_rate": 8.990965176690252e-05,
"loss": 0.0493,
"step": 4420
},
{
"epoch": 4.562306900102986,
"grad_norm": 0.3511912524700165,
"learning_rate": 8.985424816053651e-05,
"loss": 0.0561,
"step": 4430
},
{
"epoch": 4.572605561277034,
"grad_norm": 0.2704029083251953,
"learning_rate": 8.979871003566826e-05,
"loss": 0.0526,
"step": 4440
},
{
"epoch": 4.582904222451082,
"grad_norm": 0.3202318847179413,
"learning_rate": 8.974303757975345e-05,
"loss": 0.0532,
"step": 4450
},
{
"epoch": 4.593202883625128,
"grad_norm": 0.31483763456344604,
"learning_rate": 8.968723098070117e-05,
"loss": 0.051,
"step": 4460
},
{
"epoch": 4.603501544799176,
"grad_norm": 0.3457460403442383,
"learning_rate": 8.963129042687329e-05,
"loss": 0.0507,
"step": 4470
},
{
"epoch": 4.613800205973224,
"grad_norm": 0.31409910321235657,
"learning_rate": 8.957521610708375e-05,
"loss": 0.0503,
"step": 4480
},
{
"epoch": 4.6240988671472705,
"grad_norm": 0.2827114164829254,
"learning_rate": 8.951900821059809e-05,
"loss": 0.0494,
"step": 4490
},
{
"epoch": 4.634397528321318,
"grad_norm": 0.31604471802711487,
"learning_rate": 8.946266692713261e-05,
"loss": 0.0483,
"step": 4500
},
{
"epoch": 4.644696189495366,
"grad_norm": 0.3118681311607361,
"learning_rate": 8.940619244685388e-05,
"loss": 0.0553,
"step": 4510
},
{
"epoch": 4.6549948506694125,
"grad_norm": 0.2974856197834015,
"learning_rate": 8.934958496037802e-05,
"loss": 0.051,
"step": 4520
},
{
"epoch": 4.66529351184346,
"grad_norm": 0.3584068715572357,
"learning_rate": 8.92928446587701e-05,
"loss": 0.0459,
"step": 4530
},
{
"epoch": 4.675592173017508,
"grad_norm": 0.36687174439430237,
"learning_rate": 8.923597173354345e-05,
"loss": 0.0483,
"step": 4540
},
{
"epoch": 4.6858908341915555,
"grad_norm": 0.35569944977760315,
"learning_rate": 8.917896637665908e-05,
"loss": 0.05,
"step": 4550
},
{
"epoch": 4.696189495365602,
"grad_norm": 0.38467368483543396,
"learning_rate": 8.912182878052495e-05,
"loss": 0.0421,
"step": 4560
},
{
"epoch": 4.70648815653965,
"grad_norm": 0.36783739924430847,
"learning_rate": 8.906455913799538e-05,
"loss": 0.0509,
"step": 4570
},
{
"epoch": 4.716786817713698,
"grad_norm": 0.2462991178035736,
"learning_rate": 8.900715764237037e-05,
"loss": 0.0469,
"step": 4580
},
{
"epoch": 4.727085478887744,
"grad_norm": 0.3449934720993042,
"learning_rate": 8.894962448739499e-05,
"loss": 0.0467,
"step": 4590
},
{
"epoch": 4.737384140061792,
"grad_norm": 0.38251376152038574,
"learning_rate": 8.889195986725865e-05,
"loss": 0.049,
"step": 4600
},
{
"epoch": 4.74768280123584,
"grad_norm": 0.30399325489997864,
"learning_rate": 8.883416397659452e-05,
"loss": 0.0532,
"step": 4610
},
{
"epoch": 4.757981462409886,
"grad_norm": 0.4609906077384949,
"learning_rate": 8.877623701047885e-05,
"loss": 0.0511,
"step": 4620
},
{
"epoch": 4.768280123583934,
"grad_norm": 0.40049266815185547,
"learning_rate": 8.871817916443025e-05,
"loss": 0.0567,
"step": 4630
},
{
"epoch": 4.778578784757982,
"grad_norm": 0.5834691524505615,
"learning_rate": 8.865999063440916e-05,
"loss": 0.0491,
"step": 4640
},
{
"epoch": 4.7888774459320285,
"grad_norm": 0.4367988705635071,
"learning_rate": 8.860167161681707e-05,
"loss": 0.0573,
"step": 4650
},
{
"epoch": 4.799176107106076,
"grad_norm": 0.33364230394363403,
"learning_rate": 8.854322230849588e-05,
"loss": 0.0604,
"step": 4660
},
{
"epoch": 4.809474768280124,
"grad_norm": 0.42235320806503296,
"learning_rate": 8.848464290672729e-05,
"loss": 0.0518,
"step": 4670
},
{
"epoch": 4.819773429454171,
"grad_norm": 0.32555538415908813,
"learning_rate": 8.84259336092321e-05,
"loss": 0.0457,
"step": 4680
},
{
"epoch": 4.830072090628218,
"grad_norm": 0.34331732988357544,
"learning_rate": 8.836709461416952e-05,
"loss": 0.0558,
"step": 4690
},
{
"epoch": 4.840370751802266,
"grad_norm": 0.6019324064254761,
"learning_rate": 8.830812612013655e-05,
"loss": 0.0573,
"step": 4700
},
{
"epoch": 4.850669412976313,
"grad_norm": 0.2844030261039734,
"learning_rate": 8.824902832616723e-05,
"loss": 0.0571,
"step": 4710
},
{
"epoch": 4.86096807415036,
"grad_norm": 0.47788453102111816,
"learning_rate": 8.818980143173213e-05,
"loss": 0.0565,
"step": 4720
},
{
"epoch": 4.871266735324408,
"grad_norm": 0.24314385652542114,
"learning_rate": 8.81304456367374e-05,
"loss": 0.046,
"step": 4730
},
{
"epoch": 4.8815653964984556,
"grad_norm": 0.3316558301448822,
"learning_rate": 8.807096114152442e-05,
"loss": 0.0519,
"step": 4740
},
{
"epoch": 4.891864057672502,
"grad_norm": 0.4027853012084961,
"learning_rate": 8.801134814686891e-05,
"loss": 0.0495,
"step": 4750
},
{
"epoch": 4.90216271884655,
"grad_norm": 0.3290289342403412,
"learning_rate": 8.795160685398027e-05,
"loss": 0.0449,
"step": 4760
},
{
"epoch": 4.912461380020598,
"grad_norm": 0.3217390775680542,
"learning_rate": 8.789173746450101e-05,
"loss": 0.0578,
"step": 4770
},
{
"epoch": 4.922760041194644,
"grad_norm": 0.43397730588912964,
"learning_rate": 8.783174018050594e-05,
"loss": 0.0483,
"step": 4780
},
{
"epoch": 4.933058702368692,
"grad_norm": 0.38298988342285156,
"learning_rate": 8.777161520450158e-05,
"loss": 0.0479,
"step": 4790
},
{
"epoch": 4.94335736354274,
"grad_norm": 0.36208289861679077,
"learning_rate": 8.771136273942544e-05,
"loss": 0.0525,
"step": 4800
},
{
"epoch": 4.9536560247167865,
"grad_norm": 0.3291323482990265,
"learning_rate": 8.765098298864533e-05,
"loss": 0.0469,
"step": 4810
},
{
"epoch": 4.963954685890834,
"grad_norm": 0.23334382474422455,
"learning_rate": 8.759047615595869e-05,
"loss": 0.0478,
"step": 4820
},
{
"epoch": 4.974253347064882,
"grad_norm": 0.3632581830024719,
"learning_rate": 8.752984244559188e-05,
"loss": 0.0558,
"step": 4830
},
{
"epoch": 4.9845520082389285,
"grad_norm": 0.3983827531337738,
"learning_rate": 8.746908206219955e-05,
"loss": 0.0584,
"step": 4840
},
{
"epoch": 4.994850669412976,
"grad_norm": 0.5021440982818604,
"learning_rate": 8.740819521086383e-05,
"loss": 0.0522,
"step": 4850
},
{
"epoch": 5.005149330587024,
"grad_norm": 0.4782863259315491,
"learning_rate": 8.734718209709377e-05,
"loss": 0.0503,
"step": 4860
},
{
"epoch": 5.0154479917610715,
"grad_norm": 0.3124346435070038,
"learning_rate": 8.728604292682459e-05,
"loss": 0.0523,
"step": 4870
},
{
"epoch": 5.025746652935118,
"grad_norm": 0.46991485357284546,
"learning_rate": 8.722477790641694e-05,
"loss": 0.0507,
"step": 4880
},
{
"epoch": 5.036045314109166,
"grad_norm": 0.381569504737854,
"learning_rate": 8.71633872426563e-05,
"loss": 0.0473,
"step": 4890
},
{
"epoch": 5.0463439752832135,
"grad_norm": 0.4210774004459381,
"learning_rate": 8.710187114275219e-05,
"loss": 0.0521,
"step": 4900
},
{
"epoch": 5.05664263645726,
"grad_norm": 0.3999352753162384,
"learning_rate": 8.70402298143375e-05,
"loss": 0.0548,
"step": 4910
},
{
"epoch": 5.066941297631308,
"grad_norm": 0.32023027539253235,
"learning_rate": 8.697846346546787e-05,
"loss": 0.0508,
"step": 4920
},
{
"epoch": 5.077239958805356,
"grad_norm": 0.38814589381217957,
"learning_rate": 8.691657230462083e-05,
"loss": 0.0484,
"step": 4930
},
{
"epoch": 5.087538619979402,
"grad_norm": 0.3033084571361542,
"learning_rate": 8.685455654069523e-05,
"loss": 0.0432,
"step": 4940
},
{
"epoch": 5.09783728115345,
"grad_norm": 0.39010483026504517,
"learning_rate": 8.679241638301049e-05,
"loss": 0.0506,
"step": 4950
},
{
"epoch": 5.108135942327498,
"grad_norm": 0.28835776448249817,
"learning_rate": 8.673015204130586e-05,
"loss": 0.0543,
"step": 4960
},
{
"epoch": 5.1184346035015444,
"grad_norm": 0.5217164754867554,
"learning_rate": 8.66677637257398e-05,
"loss": 0.0501,
"step": 4970
},
{
"epoch": 5.128733264675592,
"grad_norm": 0.4083517789840698,
"learning_rate": 8.660525164688913e-05,
"loss": 0.0572,
"step": 4980
},
{
"epoch": 5.13903192584964,
"grad_norm": 0.5034805536270142,
"learning_rate": 8.654261601574849e-05,
"loss": 0.0541,
"step": 4990
},
{
"epoch": 5.1493305870236865,
"grad_norm": 0.3255571126937866,
"learning_rate": 8.647985704372948e-05,
"loss": 0.0539,
"step": 5000
},
{
"epoch": 5.159629248197734,
"grad_norm": 0.589500367641449,
"learning_rate": 8.641697494266006e-05,
"loss": 0.0497,
"step": 5010
},
{
"epoch": 5.169927909371782,
"grad_norm": 0.3600839674472809,
"learning_rate": 8.635396992478371e-05,
"loss": 0.0564,
"step": 5020
},
{
"epoch": 5.1802265705458295,
"grad_norm": 0.3535096049308777,
"learning_rate": 8.629084220275887e-05,
"loss": 0.0528,
"step": 5030
},
{
"epoch": 5.190525231719876,
"grad_norm": 0.3266212046146393,
"learning_rate": 8.622759198965809e-05,
"loss": 0.0476,
"step": 5040
},
{
"epoch": 5.200823892893924,
"grad_norm": 0.4038067162036896,
"learning_rate": 8.616421949896734e-05,
"loss": 0.0517,
"step": 5050
},
{
"epoch": 5.2111225540679715,
"grad_norm": 0.3460542857646942,
"learning_rate": 8.610072494458535e-05,
"loss": 0.0474,
"step": 5060
},
{
"epoch": 5.221421215242018,
"grad_norm": 0.41362518072128296,
"learning_rate": 8.603710854082286e-05,
"loss": 0.0515,
"step": 5070
},
{
"epoch": 5.231719876416066,
"grad_norm": 0.2805697023868561,
"learning_rate": 8.597337050240184e-05,
"loss": 0.0519,
"step": 5080
},
{
"epoch": 5.242018537590114,
"grad_norm": 0.4825451374053955,
"learning_rate": 8.590951104445482e-05,
"loss": 0.0504,
"step": 5090
},
{
"epoch": 5.25231719876416,
"grad_norm": 0.3441821038722992,
"learning_rate": 8.584553038252414e-05,
"loss": 0.0581,
"step": 5100
},
{
"epoch": 5.262615859938208,
"grad_norm": 0.39510828256607056,
"learning_rate": 8.578142873256129e-05,
"loss": 0.0532,
"step": 5110
},
{
"epoch": 5.272914521112256,
"grad_norm": 0.3733309805393219,
"learning_rate": 8.571720631092609e-05,
"loss": 0.057,
"step": 5120
},
{
"epoch": 5.283213182286302,
"grad_norm": 0.3860830068588257,
"learning_rate": 8.565286333438594e-05,
"loss": 0.049,
"step": 5130
},
{
"epoch": 5.29351184346035,
"grad_norm": 0.3507029414176941,
"learning_rate": 8.558840002011528e-05,
"loss": 0.0542,
"step": 5140
},
{
"epoch": 5.303810504634398,
"grad_norm": 0.30535757541656494,
"learning_rate": 8.552381658569457e-05,
"loss": 0.0584,
"step": 5150
},
{
"epoch": 5.3141091658084445,
"grad_norm": 0.3580070734024048,
"learning_rate": 8.545911324910982e-05,
"loss": 0.0509,
"step": 5160
},
{
"epoch": 5.324407826982492,
"grad_norm": 0.21992090344429016,
"learning_rate": 8.539429022875169e-05,
"loss": 0.0412,
"step": 5170
},
{
"epoch": 5.33470648815654,
"grad_norm": 0.6406000852584839,
"learning_rate": 8.532934774341483e-05,
"loss": 0.0518,
"step": 5180
},
{
"epoch": 5.3450051493305875,
"grad_norm": 0.43300265073776245,
"learning_rate": 8.526428601229706e-05,
"loss": 0.0539,
"step": 5190
},
{
"epoch": 5.355303810504634,
"grad_norm": 0.5168215036392212,
"learning_rate": 8.519910525499874e-05,
"loss": 0.0552,
"step": 5200
},
{
"epoch": 5.365602471678682,
"grad_norm": 0.2501913905143738,
"learning_rate": 8.513380569152196e-05,
"loss": 0.0506,
"step": 5210
},
{
"epoch": 5.3759011328527295,
"grad_norm": 0.2757486402988434,
"learning_rate": 8.506838754226982e-05,
"loss": 0.0565,
"step": 5220
},
{
"epoch": 5.386199794026776,
"grad_norm": 0.47264114022254944,
"learning_rate": 8.500285102804568e-05,
"loss": 0.0519,
"step": 5230
},
{
"epoch": 5.396498455200824,
"grad_norm": 0.30214348435401917,
"learning_rate": 8.493719637005237e-05,
"loss": 0.0424,
"step": 5240
},
{
"epoch": 5.406797116374872,
"grad_norm": 0.4345119893550873,
"learning_rate": 8.487142378989152e-05,
"loss": 0.0412,
"step": 5250
},
{
"epoch": 5.417095777548918,
"grad_norm": 0.33627235889434814,
"learning_rate": 8.480553350956282e-05,
"loss": 0.0481,
"step": 5260
},
{
"epoch": 5.427394438722966,
"grad_norm": 0.3047385811805725,
"learning_rate": 8.473952575146312e-05,
"loss": 0.0481,
"step": 5270
},
{
"epoch": 5.437693099897014,
"grad_norm": 0.4447433352470398,
"learning_rate": 8.46734007383859e-05,
"loss": 0.046,
"step": 5280
},
{
"epoch": 5.44799176107106,
"grad_norm": 0.4087453782558441,
"learning_rate": 8.460715869352035e-05,
"loss": 0.0487,
"step": 5290
},
{
"epoch": 5.458290422245108,
"grad_norm": 0.3321467339992523,
"learning_rate": 8.454079984045065e-05,
"loss": 0.0413,
"step": 5300
},
{
"epoch": 5.468589083419156,
"grad_norm": 0.356514036655426,
"learning_rate": 8.447432440315533e-05,
"loss": 0.049,
"step": 5310
},
{
"epoch": 5.4788877445932025,
"grad_norm": 0.37567445635795593,
"learning_rate": 8.44077326060063e-05,
"loss": 0.0461,
"step": 5320
},
{
"epoch": 5.48918640576725,
"grad_norm": 0.3040042519569397,
"learning_rate": 8.434102467376832e-05,
"loss": 0.0401,
"step": 5330
},
{
"epoch": 5.499485066941298,
"grad_norm": 0.39934873580932617,
"learning_rate": 8.427420083159807e-05,
"loss": 0.0493,
"step": 5340
},
{
"epoch": 5.509783728115345,
"grad_norm": 0.4000271260738373,
"learning_rate": 8.420726130504351e-05,
"loss": 0.0541,
"step": 5350
},
{
"epoch": 5.520082389289392,
"grad_norm": 0.2750590443611145,
"learning_rate": 8.414020632004299e-05,
"loss": 0.0481,
"step": 5360
},
{
"epoch": 5.53038105046344,
"grad_norm": 0.4174776077270508,
"learning_rate": 8.407303610292462e-05,
"loss": 0.0501,
"step": 5370
},
{
"epoch": 5.5406797116374875,
"grad_norm": 0.2651192247867584,
"learning_rate": 8.400575088040548e-05,
"loss": 0.0491,
"step": 5380
},
{
"epoch": 5.550978372811534,
"grad_norm": 0.49490901827812195,
"learning_rate": 8.393835087959072e-05,
"loss": 0.0488,
"step": 5390
},
{
"epoch": 5.561277033985582,
"grad_norm": 0.6012644171714783,
"learning_rate": 8.387083632797299e-05,
"loss": 0.05,
"step": 5400
},
{
"epoch": 5.57157569515963,
"grad_norm": 0.4538785219192505,
"learning_rate": 8.380320745343153e-05,
"loss": 0.0479,
"step": 5410
},
{
"epoch": 5.581874356333676,
"grad_norm": 0.358992338180542,
"learning_rate": 8.373546448423147e-05,
"loss": 0.05,
"step": 5420
},
{
"epoch": 5.592173017507724,
"grad_norm": 0.3814113736152649,
"learning_rate": 8.366760764902304e-05,
"loss": 0.0415,
"step": 5430
},
{
"epoch": 5.602471678681772,
"grad_norm": 0.6442550420761108,
"learning_rate": 8.359963717684077e-05,
"loss": 0.0495,
"step": 5440
},
{
"epoch": 5.612770339855818,
"grad_norm": 0.34561294317245483,
"learning_rate": 8.353155329710279e-05,
"loss": 0.0507,
"step": 5450
},
{
"epoch": 5.623069001029866,
"grad_norm": 0.333892822265625,
"learning_rate": 8.346335623960998e-05,
"loss": 0.0406,
"step": 5460
},
{
"epoch": 5.633367662203914,
"grad_norm": 0.21642594039440155,
"learning_rate": 8.339504623454521e-05,
"loss": 0.05,
"step": 5470
},
{
"epoch": 5.6436663233779605,
"grad_norm": 0.21974137425422668,
"learning_rate": 8.332662351247262e-05,
"loss": 0.0497,
"step": 5480
},
{
"epoch": 5.653964984552008,
"grad_norm": 0.35917189717292786,
"learning_rate": 8.325808830433679e-05,
"loss": 0.041,
"step": 5490
},
{
"epoch": 5.664263645726056,
"grad_norm": 0.2640712857246399,
"learning_rate": 8.318944084146192e-05,
"loss": 0.047,
"step": 5500
},
{
"epoch": 5.674562306900103,
"grad_norm": 6.280691623687744,
"learning_rate": 8.312068135555115e-05,
"loss": 0.0481,
"step": 5510
},
{
"epoch": 5.68486096807415,
"grad_norm": 0.269490122795105,
"learning_rate": 8.305181007868572e-05,
"loss": 0.0416,
"step": 5520
},
{
"epoch": 5.695159629248198,
"grad_norm": 0.408123254776001,
"learning_rate": 8.298282724332419e-05,
"loss": 0.049,
"step": 5530
},
{
"epoch": 5.705458290422245,
"grad_norm": 0.2983226478099823,
"learning_rate": 8.291373308230165e-05,
"loss": 0.0497,
"step": 5540
},
{
"epoch": 5.715756951596292,
"grad_norm": 0.35842761397361755,
"learning_rate": 8.284452782882894e-05,
"loss": 0.0477,
"step": 5550
},
{
"epoch": 5.72605561277034,
"grad_norm": 0.2742210328578949,
"learning_rate": 8.277521171649189e-05,
"loss": 0.052,
"step": 5560
},
{
"epoch": 5.736354273944388,
"grad_norm": 0.2822439968585968,
"learning_rate": 8.27057849792505e-05,
"loss": 0.0491,
"step": 5570
},
{
"epoch": 5.746652935118434,
"grad_norm": 0.3104664385318756,
"learning_rate": 8.263624785143812e-05,
"loss": 0.0493,
"step": 5580
},
{
"epoch": 5.756951596292482,
"grad_norm": 0.32532253861427307,
"learning_rate": 8.256660056776076e-05,
"loss": 0.0581,
"step": 5590
},
{
"epoch": 5.76725025746653,
"grad_norm": 0.3366002142429352,
"learning_rate": 8.249684336329617e-05,
"loss": 0.043,
"step": 5600
},
{
"epoch": 5.777548918640576,
"grad_norm": 0.25842759013175964,
"learning_rate": 8.242697647349317e-05,
"loss": 0.0485,
"step": 5610
},
{
"epoch": 5.787847579814624,
"grad_norm": 0.302432656288147,
"learning_rate": 8.235700013417076e-05,
"loss": 0.0521,
"step": 5620
},
{
"epoch": 5.798146240988672,
"grad_norm": 0.3358532190322876,
"learning_rate": 8.228691458151738e-05,
"loss": 0.0441,
"step": 5630
},
{
"epoch": 5.8084449021627185,
"grad_norm": 0.4343230724334717,
"learning_rate": 8.221672005209008e-05,
"loss": 0.0521,
"step": 5640
},
{
"epoch": 5.818743563336766,
"grad_norm": 0.30650976300239563,
"learning_rate": 8.214641678281374e-05,
"loss": 0.0538,
"step": 5650
},
{
"epoch": 5.829042224510814,
"grad_norm": 0.3401453197002411,
"learning_rate": 8.207600501098026e-05,
"loss": 0.0428,
"step": 5660
},
{
"epoch": 5.8393408856848605,
"grad_norm": 0.45636221766471863,
"learning_rate": 8.200548497424778e-05,
"loss": 0.0582,
"step": 5670
},
{
"epoch": 5.849639546858908,
"grad_norm": 0.2774709165096283,
"learning_rate": 8.193485691063985e-05,
"loss": 0.048,
"step": 5680
},
{
"epoch": 5.859938208032956,
"grad_norm": 0.29194507002830505,
"learning_rate": 8.186412105854463e-05,
"loss": 0.0534,
"step": 5690
},
{
"epoch": 5.8702368692070035,
"grad_norm": 0.36549675464630127,
"learning_rate": 8.17932776567141e-05,
"loss": 0.0571,
"step": 5700
},
{
"epoch": 5.88053553038105,
"grad_norm": 0.302418977022171,
"learning_rate": 8.172232694426329e-05,
"loss": 0.0423,
"step": 5710
},
{
"epoch": 5.890834191555098,
"grad_norm": 0.27770909667015076,
"learning_rate": 8.165126916066936e-05,
"loss": 0.0487,
"step": 5720
},
{
"epoch": 5.901132852729146,
"grad_norm": 0.3784064054489136,
"learning_rate": 8.158010454577093e-05,
"loss": 0.0504,
"step": 5730
},
{
"epoch": 5.911431513903192,
"grad_norm": 0.29943570494651794,
"learning_rate": 8.150883333976713e-05,
"loss": 0.0458,
"step": 5740
},
{
"epoch": 5.92173017507724,
"grad_norm": 0.26842376589775085,
"learning_rate": 8.143745578321695e-05,
"loss": 0.0523,
"step": 5750
},
{
"epoch": 5.932028836251288,
"grad_norm": 0.19866850972175598,
"learning_rate": 8.136597211703827e-05,
"loss": 0.0429,
"step": 5760
},
{
"epoch": 5.942327497425334,
"grad_norm": 0.30413612723350525,
"learning_rate": 8.129438258250712e-05,
"loss": 0.0441,
"step": 5770
},
{
"epoch": 5.952626158599382,
"grad_norm": 0.2791491746902466,
"learning_rate": 8.122268742125695e-05,
"loss": 0.047,
"step": 5780
},
{
"epoch": 5.96292481977343,
"grad_norm": 0.34201282262802124,
"learning_rate": 8.115088687527761e-05,
"loss": 0.0501,
"step": 5790
},
{
"epoch": 5.9732234809474765,
"grad_norm": 0.39383724331855774,
"learning_rate": 8.107898118691473e-05,
"loss": 0.0497,
"step": 5800
},
{
"epoch": 5.983522142121524,
"grad_norm": 0.3670088052749634,
"learning_rate": 8.100697059886879e-05,
"loss": 0.0428,
"step": 5810
},
{
"epoch": 5.993820803295572,
"grad_norm": 0.3595752716064453,
"learning_rate": 8.093485535419434e-05,
"loss": 0.0467,
"step": 5820
},
{
"epoch": 6.0041194644696185,
"grad_norm": 0.403352290391922,
"learning_rate": 8.086263569629919e-05,
"loss": 0.0441,
"step": 5830
},
{
"epoch": 6.014418125643666,
"grad_norm": 0.18506278097629547,
"learning_rate": 8.079031186894354e-05,
"loss": 0.0508,
"step": 5840
},
{
"epoch": 6.024716786817714,
"grad_norm": 0.5713401436805725,
"learning_rate": 8.071788411623922e-05,
"loss": 0.0491,
"step": 5850
},
{
"epoch": 6.0350154479917615,
"grad_norm": 0.20415346324443817,
"learning_rate": 8.064535268264883e-05,
"loss": 0.0502,
"step": 5860
},
{
"epoch": 6.045314109165808,
"grad_norm": 0.28075137734413147,
"learning_rate": 8.057271781298489e-05,
"loss": 0.0512,
"step": 5870
},
{
"epoch": 6.055612770339856,
"grad_norm": 0.3114660680294037,
"learning_rate": 8.049997975240909e-05,
"loss": 0.0508,
"step": 5880
},
{
"epoch": 6.0659114315139036,
"grad_norm": 0.3134065866470337,
"learning_rate": 8.042713874643136e-05,
"loss": 0.0531,
"step": 5890
},
{
"epoch": 6.07621009268795,
"grad_norm": 0.24600578844547272,
"learning_rate": 8.035419504090915e-05,
"loss": 0.0478,
"step": 5900
},
{
"epoch": 6.086508753861998,
"grad_norm": 0.34766799211502075,
"learning_rate": 8.028114888204653e-05,
"loss": 0.0486,
"step": 5910
},
{
"epoch": 6.096807415036046,
"grad_norm": 0.3067956268787384,
"learning_rate": 8.020800051639337e-05,
"loss": 0.0452,
"step": 5920
},
{
"epoch": 6.107106076210092,
"grad_norm": 0.3019874691963196,
"learning_rate": 8.013475019084453e-05,
"loss": 0.0458,
"step": 5930
},
{
"epoch": 6.11740473738414,
"grad_norm": 0.3271634578704834,
"learning_rate": 8.006139815263898e-05,
"loss": 0.0561,
"step": 5940
},
{
"epoch": 6.127703398558188,
"grad_norm": 0.2930561304092407,
"learning_rate": 7.998794464935904e-05,
"loss": 0.0407,
"step": 5950
},
{
"epoch": 6.1380020597322344,
"grad_norm": 0.37962770462036133,
"learning_rate": 7.991438992892946e-05,
"loss": 0.048,
"step": 5960
},
{
"epoch": 6.148300720906282,
"grad_norm": 0.36476749181747437,
"learning_rate": 7.984073423961664e-05,
"loss": 0.0439,
"step": 5970
},
{
"epoch": 6.15859938208033,
"grad_norm": 0.31208914518356323,
"learning_rate": 7.97669778300278e-05,
"loss": 0.0431,
"step": 5980
},
{
"epoch": 6.1688980432543765,
"grad_norm": 0.758002758026123,
"learning_rate": 7.969312094911007e-05,
"loss": 0.0481,
"step": 5990
},
{
"epoch": 6.179196704428424,
"grad_norm": 1.8981136083602905,
"learning_rate": 7.961916384614975e-05,
"loss": 0.0621,
"step": 6000
},
{
"epoch": 6.189495365602472,
"grad_norm": 0.277136892080307,
"learning_rate": 7.954510677077138e-05,
"loss": 0.0586,
"step": 6010
},
{
"epoch": 6.1997940267765195,
"grad_norm": 0.27095285058021545,
"learning_rate": 7.947094997293695e-05,
"loss": 0.0484,
"step": 6020
},
{
"epoch": 6.210092687950566,
"grad_norm": 0.2608092427253723,
"learning_rate": 7.9396693702945e-05,
"loss": 0.0457,
"step": 6030
},
{
"epoch": 6.220391349124614,
"grad_norm": 0.5210095643997192,
"learning_rate": 7.932233821142987e-05,
"loss": 0.0473,
"step": 6040
},
{
"epoch": 6.2306900102986615,
"grad_norm": 0.254302978515625,
"learning_rate": 7.924788374936078e-05,
"loss": 0.045,
"step": 6050
},
{
"epoch": 6.240988671472708,
"grad_norm": 0.343322217464447,
"learning_rate": 7.917333056804097e-05,
"loss": 0.054,
"step": 6060
},
{
"epoch": 6.251287332646756,
"grad_norm": 0.4098043143749237,
"learning_rate": 7.909867891910694e-05,
"loss": 0.0435,
"step": 6070
},
{
"epoch": 6.261585993820804,
"grad_norm": 0.34776240587234497,
"learning_rate": 7.902392905452749e-05,
"loss": 0.0538,
"step": 6080
},
{
"epoch": 6.27188465499485,
"grad_norm": 0.5250643491744995,
"learning_rate": 7.894908122660296e-05,
"loss": 0.0431,
"step": 6090
},
{
"epoch": 6.282183316168898,
"grad_norm": 0.37657663226127625,
"learning_rate": 7.887413568796433e-05,
"loss": 0.0532,
"step": 6100
},
{
"epoch": 6.292481977342946,
"grad_norm": 0.28036069869995117,
"learning_rate": 7.879909269157236e-05,
"loss": 0.0382,
"step": 6110
},
{
"epoch": 6.302780638516992,
"grad_norm": 0.4012965261936188,
"learning_rate": 7.87239524907168e-05,
"loss": 0.0472,
"step": 6120
},
{
"epoch": 6.31307929969104,
"grad_norm": 0.4002419412136078,
"learning_rate": 7.864871533901544e-05,
"loss": 0.051,
"step": 6130
},
{
"epoch": 6.323377960865088,
"grad_norm": 0.3897566795349121,
"learning_rate": 7.857338149041332e-05,
"loss": 0.0487,
"step": 6140
},
{
"epoch": 6.3336766220391345,
"grad_norm": 0.4365810751914978,
"learning_rate": 7.849795119918191e-05,
"loss": 0.0486,
"step": 6150
},
{
"epoch": 6.343975283213182,
"grad_norm": 0.38556814193725586,
"learning_rate": 7.842242471991809e-05,
"loss": 0.0509,
"step": 6160
},
{
"epoch": 6.35427394438723,
"grad_norm": 0.3570299744606018,
"learning_rate": 7.834680230754353e-05,
"loss": 0.0485,
"step": 6170
},
{
"epoch": 6.364572605561277,
"grad_norm": 0.25796523690223694,
"learning_rate": 7.82710842173036e-05,
"loss": 0.0474,
"step": 6180
},
{
"epoch": 6.374871266735324,
"grad_norm": 0.4013979732990265,
"learning_rate": 7.819527070476665e-05,
"loss": 0.0453,
"step": 6190
},
{
"epoch": 6.385169927909372,
"grad_norm": 0.2755083739757538,
"learning_rate": 7.811936202582306e-05,
"loss": 0.0407,
"step": 6200
},
{
"epoch": 6.3954685890834195,
"grad_norm": 0.8050864338874817,
"learning_rate": 7.80433584366845e-05,
"loss": 0.0468,
"step": 6210
},
{
"epoch": 6.405767250257466,
"grad_norm": 0.5987268686294556,
"learning_rate": 7.796726019388295e-05,
"loss": 0.0445,
"step": 6220
},
{
"epoch": 6.416065911431514,
"grad_norm": 0.31688612699508667,
"learning_rate": 7.789106755426985e-05,
"loss": 0.0414,
"step": 6230
},
{
"epoch": 6.426364572605562,
"grad_norm": 0.2687252163887024,
"learning_rate": 7.781478077501525e-05,
"loss": 0.0381,
"step": 6240
},
{
"epoch": 6.436663233779608,
"grad_norm": 0.31859585642814636,
"learning_rate": 7.773840011360698e-05,
"loss": 0.0486,
"step": 6250
},
{
"epoch": 6.446961894953656,
"grad_norm": 0.39176130294799805,
"learning_rate": 7.766192582784974e-05,
"loss": 0.0492,
"step": 6260
},
{
"epoch": 6.457260556127704,
"grad_norm": 0.4192884862422943,
"learning_rate": 7.758535817586424e-05,
"loss": 0.0524,
"step": 6270
},
{
"epoch": 6.46755921730175,
"grad_norm": 0.41165101528167725,
"learning_rate": 7.750869741608628e-05,
"loss": 0.0459,
"step": 6280
},
{
"epoch": 6.477857878475798,
"grad_norm": 0.37704214453697205,
"learning_rate": 7.7431943807266e-05,
"loss": 0.0555,
"step": 6290
},
{
"epoch": 6.488156539649846,
"grad_norm": 0.4949089586734772,
"learning_rate": 7.735509760846682e-05,
"loss": 0.0493,
"step": 6300
},
{
"epoch": 6.4984552008238925,
"grad_norm": 0.27363213896751404,
"learning_rate": 7.727815907906481e-05,
"loss": 0.0498,
"step": 6310
},
{
"epoch": 6.50875386199794,
"grad_norm": 0.32286787033081055,
"learning_rate": 7.720112847874759e-05,
"loss": 0.0445,
"step": 6320
},
{
"epoch": 6.519052523171988,
"grad_norm": 0.2211546152830124,
"learning_rate": 7.712400606751356e-05,
"loss": 0.0475,
"step": 6330
},
{
"epoch": 6.5293511843460355,
"grad_norm": 0.2400301843881607,
"learning_rate": 7.7046792105671e-05,
"loss": 0.0459,
"step": 6340
},
{
"epoch": 6.539649845520082,
"grad_norm": 0.3111647069454193,
"learning_rate": 7.696948685383725e-05,
"loss": 0.0492,
"step": 6350
},
{
"epoch": 6.54994850669413,
"grad_norm": 0.3468630313873291,
"learning_rate": 7.68920905729377e-05,
"loss": 0.0422,
"step": 6360
},
{
"epoch": 6.5602471678681775,
"grad_norm": 0.4992178678512573,
"learning_rate": 7.6814603524205e-05,
"loss": 0.0489,
"step": 6370
},
{
"epoch": 6.570545829042224,
"grad_norm": 0.33954063057899475,
"learning_rate": 7.673702596917824e-05,
"loss": 0.0483,
"step": 6380
},
{
"epoch": 6.580844490216272,
"grad_norm": 0.3721350133419037,
"learning_rate": 7.665935816970193e-05,
"loss": 0.0415,
"step": 6390
},
{
"epoch": 6.59114315139032,
"grad_norm": 0.30230167508125305,
"learning_rate": 7.658160038792518e-05,
"loss": 0.0431,
"step": 6400
},
{
"epoch": 6.601441812564366,
"grad_norm": 0.2966795861721039,
"learning_rate": 7.650375288630083e-05,
"loss": 0.0431,
"step": 6410
},
{
"epoch": 6.611740473738414,
"grad_norm": 0.28090888261795044,
"learning_rate": 7.642581592758453e-05,
"loss": 0.0413,
"step": 6420
},
{
"epoch": 6.622039134912462,
"grad_norm": 0.3371041715145111,
"learning_rate": 7.634778977483389e-05,
"loss": 0.0469,
"step": 6430
},
{
"epoch": 6.632337796086508,
"grad_norm": 0.28260523080825806,
"learning_rate": 7.626967469140754e-05,
"loss": 0.0437,
"step": 6440
},
{
"epoch": 6.642636457260556,
"grad_norm": 0.2734527289867401,
"learning_rate": 7.619147094096434e-05,
"loss": 0.043,
"step": 6450
},
{
"epoch": 6.652935118434604,
"grad_norm": 0.3294004797935486,
"learning_rate": 7.611317878746238e-05,
"loss": 0.0414,
"step": 6460
},
{
"epoch": 6.663233779608651,
"grad_norm": 0.45815443992614746,
"learning_rate": 7.60347984951581e-05,
"loss": 0.0496,
"step": 6470
},
{
"epoch": 6.673532440782698,
"grad_norm": 0.24537749588489532,
"learning_rate": 7.59563303286055e-05,
"loss": 0.0425,
"step": 6480
},
{
"epoch": 6.683831101956746,
"grad_norm": 0.32262513041496277,
"learning_rate": 7.587777455265515e-05,
"loss": 0.042,
"step": 6490
},
{
"epoch": 6.6941297631307926,
"grad_norm": 0.19561485946178436,
"learning_rate": 7.579913143245328e-05,
"loss": 0.0424,
"step": 6500
},
{
"epoch": 6.70442842430484,
"grad_norm": 0.29754048585891724,
"learning_rate": 7.572040123344103e-05,
"loss": 0.0466,
"step": 6510
},
{
"epoch": 6.714727085478888,
"grad_norm": 0.33084553480148315,
"learning_rate": 7.564158422135337e-05,
"loss": 0.0496,
"step": 6520
},
{
"epoch": 6.7250257466529355,
"grad_norm": 0.40858951210975647,
"learning_rate": 7.55626806622183e-05,
"loss": 0.0481,
"step": 6530
},
{
"epoch": 6.735324407826982,
"grad_norm": 0.9231746792793274,
"learning_rate": 7.548369082235595e-05,
"loss": 0.0512,
"step": 6540
},
{
"epoch": 6.74562306900103,
"grad_norm": 0.4263251721858978,
"learning_rate": 7.54046149683777e-05,
"loss": 0.0429,
"step": 6550
},
{
"epoch": 6.755921730175078,
"grad_norm": 0.2868654131889343,
"learning_rate": 7.532545336718521e-05,
"loss": 0.048,
"step": 6560
},
{
"epoch": 6.766220391349124,
"grad_norm": 0.250887930393219,
"learning_rate": 7.524620628596954e-05,
"loss": 0.0477,
"step": 6570
},
{
"epoch": 6.776519052523172,
"grad_norm": 0.3410227298736572,
"learning_rate": 7.516687399221037e-05,
"loss": 0.0474,
"step": 6580
},
{
"epoch": 6.78681771369722,
"grad_norm": 0.42289555072784424,
"learning_rate": 7.508745675367483e-05,
"loss": 0.0445,
"step": 6590
},
{
"epoch": 6.797116374871266,
"grad_norm": 0.3723140358924866,
"learning_rate": 7.500795483841692e-05,
"loss": 0.0473,
"step": 6600
},
{
"epoch": 6.807415036045314,
"grad_norm": 0.5165073275566101,
"learning_rate": 7.492836851477636e-05,
"loss": 0.0502,
"step": 6610
},
{
"epoch": 6.817713697219362,
"grad_norm": 0.3081056773662567,
"learning_rate": 7.484869805137778e-05,
"loss": 0.0478,
"step": 6620
},
{
"epoch": 6.8280123583934085,
"grad_norm": 0.39798182249069214,
"learning_rate": 7.476894371712982e-05,
"loss": 0.0516,
"step": 6630
},
{
"epoch": 6.838311019567456,
"grad_norm": 0.3031449615955353,
"learning_rate": 7.468910578122418e-05,
"loss": 0.0458,
"step": 6640
},
{
"epoch": 6.848609680741504,
"grad_norm": 0.40421777963638306,
"learning_rate": 7.460918451313481e-05,
"loss": 0.0464,
"step": 6650
},
{
"epoch": 6.858908341915551,
"grad_norm": 0.3347015976905823,
"learning_rate": 7.452918018261684e-05,
"loss": 0.0427,
"step": 6660
},
{
"epoch": 6.869207003089598,
"grad_norm": 0.46592167019844055,
"learning_rate": 7.444909305970578e-05,
"loss": 0.0395,
"step": 6670
},
{
"epoch": 6.879505664263646,
"grad_norm": 0.31017211079597473,
"learning_rate": 7.436892341471663e-05,
"loss": 0.052,
"step": 6680
},
{
"epoch": 6.889804325437693,
"grad_norm": 0.575901210308075,
"learning_rate": 7.428867151824287e-05,
"loss": 0.0489,
"step": 6690
},
{
"epoch": 6.90010298661174,
"grad_norm": 0.372746080160141,
"learning_rate": 7.420833764115561e-05,
"loss": 0.0428,
"step": 6700
},
{
"epoch": 6.910401647785788,
"grad_norm": 0.37451857328414917,
"learning_rate": 7.41279220546027e-05,
"loss": 0.0432,
"step": 6710
},
{
"epoch": 6.920700308959836,
"grad_norm": 0.3189006447792053,
"learning_rate": 7.404742503000776e-05,
"loss": 0.0519,
"step": 6720
},
{
"epoch": 6.930998970133882,
"grad_norm": 0.22485186159610748,
"learning_rate": 7.396684683906928e-05,
"loss": 0.0507,
"step": 6730
},
{
"epoch": 6.94129763130793,
"grad_norm": 0.3649514615535736,
"learning_rate": 7.38861877537597e-05,
"loss": 0.0485,
"step": 6740
},
{
"epoch": 6.951596292481978,
"grad_norm": 0.37899455428123474,
"learning_rate": 7.380544804632453e-05,
"loss": 0.0454,
"step": 6750
},
{
"epoch": 6.961894953656024,
"grad_norm": 0.4623110294342041,
"learning_rate": 7.372462798928137e-05,
"loss": 0.0446,
"step": 6760
},
{
"epoch": 6.972193614830072,
"grad_norm": 0.41896483302116394,
"learning_rate": 7.364372785541902e-05,
"loss": 0.0432,
"step": 6770
},
{
"epoch": 6.98249227600412,
"grad_norm": 0.28001904487609863,
"learning_rate": 7.356274791779661e-05,
"loss": 0.0447,
"step": 6780
},
{
"epoch": 6.9927909371781665,
"grad_norm": 0.35105225443840027,
"learning_rate": 7.348168844974254e-05,
"loss": 0.0445,
"step": 6790
},
{
"epoch": 7.003089598352214,
"grad_norm": 0.41556599736213684,
"learning_rate": 7.340054972485371e-05,
"loss": 0.0512,
"step": 6800
},
{
"epoch": 7.013388259526262,
"grad_norm": 0.4035722017288208,
"learning_rate": 7.331933201699457e-05,
"loss": 0.0423,
"step": 6810
},
{
"epoch": 7.0236869207003085,
"grad_norm": 0.4090428352355957,
"learning_rate": 7.323803560029605e-05,
"loss": 0.0514,
"step": 6820
},
{
"epoch": 7.033985581874356,
"grad_norm": 0.3787795901298523,
"learning_rate": 7.315666074915481e-05,
"loss": 0.0402,
"step": 6830
},
{
"epoch": 7.044284243048404,
"grad_norm": 0.32284408807754517,
"learning_rate": 7.307520773823227e-05,
"loss": 0.0466,
"step": 6840
},
{
"epoch": 7.0545829042224515,
"grad_norm": 0.35008612275123596,
"learning_rate": 7.299367684245362e-05,
"loss": 0.0451,
"step": 6850
},
{
"epoch": 7.064881565396498,
"grad_norm": 0.38151565194129944,
"learning_rate": 7.29120683370069e-05,
"loss": 0.0364,
"step": 6860
},
{
"epoch": 7.075180226570546,
"grad_norm": 0.21700677275657654,
"learning_rate": 7.283038249734217e-05,
"loss": 0.0504,
"step": 6870
},
{
"epoch": 7.085478887744594,
"grad_norm": 0.3018152415752411,
"learning_rate": 7.27486195991705e-05,
"loss": 0.0519,
"step": 6880
},
{
"epoch": 7.09577754891864,
"grad_norm": 0.2052696943283081,
"learning_rate": 7.266677991846301e-05,
"loss": 0.042,
"step": 6890
},
{
"epoch": 7.106076210092688,
"grad_norm": 0.39970454573631287,
"learning_rate": 7.258486373144999e-05,
"loss": 0.0409,
"step": 6900
},
{
"epoch": 7.116374871266736,
"grad_norm": 0.22980281710624695,
"learning_rate": 7.250287131462004e-05,
"loss": 0.0445,
"step": 6910
},
{
"epoch": 7.126673532440782,
"grad_norm": 0.3403468430042267,
"learning_rate": 7.242080294471895e-05,
"loss": 0.0565,
"step": 6920
},
{
"epoch": 7.13697219361483,
"grad_norm": 0.25713488459587097,
"learning_rate": 7.233865889874891e-05,
"loss": 0.0456,
"step": 6930
},
{
"epoch": 7.147270854788878,
"grad_norm": 0.3376232981681824,
"learning_rate": 7.225643945396757e-05,
"loss": 0.0378,
"step": 6940
},
{
"epoch": 7.1575695159629245,
"grad_norm": 0.255604088306427,
"learning_rate": 7.217414488788702e-05,
"loss": 0.041,
"step": 6950
},
{
"epoch": 7.167868177136972,
"grad_norm": 0.2713391184806824,
"learning_rate": 7.209177547827294e-05,
"loss": 0.0527,
"step": 6960
},
{
"epoch": 7.17816683831102,
"grad_norm": 0.2645740509033203,
"learning_rate": 7.20093315031436e-05,
"loss": 0.0432,
"step": 6970
},
{
"epoch": 7.1884654994850665,
"grad_norm": 0.3499581515789032,
"learning_rate": 7.192681324076896e-05,
"loss": 0.0516,
"step": 6980
},
{
"epoch": 7.198764160659114,
"grad_norm": 0.24416272342205048,
"learning_rate": 7.184422096966971e-05,
"loss": 0.0435,
"step": 6990
},
{
"epoch": 7.209062821833162,
"grad_norm": 0.3371264338493347,
"learning_rate": 7.176155496861638e-05,
"loss": 0.0463,
"step": 7000
},
{
"epoch": 7.2193614830072095,
"grad_norm": 0.3851630687713623,
"learning_rate": 7.167881551662831e-05,
"loss": 0.0407,
"step": 7010
},
{
"epoch": 7.229660144181256,
"grad_norm": 0.2070106714963913,
"learning_rate": 7.159600289297276e-05,
"loss": 0.0386,
"step": 7020
},
{
"epoch": 7.239958805355304,
"grad_norm": 0.3137363791465759,
"learning_rate": 7.151311737716397e-05,
"loss": 0.0411,
"step": 7030
},
{
"epoch": 7.2502574665293515,
"grad_norm": 0.3703240752220154,
"learning_rate": 7.143015924896226e-05,
"loss": 0.0426,
"step": 7040
},
{
"epoch": 7.260556127703398,
"grad_norm": 0.3365670144557953,
"learning_rate": 7.134712878837294e-05,
"loss": 0.0506,
"step": 7050
},
{
"epoch": 7.270854788877446,
"grad_norm": 0.2538038194179535,
"learning_rate": 7.126402627564555e-05,
"loss": 0.0466,
"step": 7060
},
{
"epoch": 7.281153450051494,
"grad_norm": 0.43290919065475464,
"learning_rate": 7.118085199127276e-05,
"loss": 0.0463,
"step": 7070
},
{
"epoch": 7.29145211122554,
"grad_norm": 0.2167598456144333,
"learning_rate": 7.109760621598952e-05,
"loss": 0.0421,
"step": 7080
},
{
"epoch": 7.301750772399588,
"grad_norm": 0.24321898818016052,
"learning_rate": 7.101428923077209e-05,
"loss": 0.0382,
"step": 7090
},
{
"epoch": 7.312049433573636,
"grad_norm": 0.31298938393592834,
"learning_rate": 7.093090131683704e-05,
"loss": 0.0401,
"step": 7100
},
{
"epoch": 7.3223480947476824,
"grad_norm": 0.38020390272140503,
"learning_rate": 7.08474427556404e-05,
"loss": 0.0454,
"step": 7110
},
{
"epoch": 7.33264675592173,
"grad_norm": 0.37544867396354675,
"learning_rate": 7.076391382887661e-05,
"loss": 0.0408,
"step": 7120
},
{
"epoch": 7.342945417095778,
"grad_norm": 0.2992228865623474,
"learning_rate": 7.068031481847762e-05,
"loss": 0.0454,
"step": 7130
},
{
"epoch": 7.3532440782698245,
"grad_norm": 0.48509418964385986,
"learning_rate": 7.059664600661196e-05,
"loss": 0.044,
"step": 7140
},
{
"epoch": 7.363542739443872,
"grad_norm": 0.4964796304702759,
"learning_rate": 7.051290767568371e-05,
"loss": 0.0526,
"step": 7150
},
{
"epoch": 7.37384140061792,
"grad_norm": 0.22935813665390015,
"learning_rate": 7.042910010833163e-05,
"loss": 0.0416,
"step": 7160
},
{
"epoch": 7.3841400617919675,
"grad_norm": 0.2570447325706482,
"learning_rate": 7.034522358742816e-05,
"loss": 0.0488,
"step": 7170
},
{
"epoch": 7.394438722966014,
"grad_norm": 0.23174193501472473,
"learning_rate": 7.026127839607847e-05,
"loss": 0.0423,
"step": 7180
},
{
"epoch": 7.404737384140062,
"grad_norm": 0.33260369300842285,
"learning_rate": 7.017726481761951e-05,
"loss": 0.0464,
"step": 7190
},
{
"epoch": 7.4150360453141095,
"grad_norm": 0.4475546181201935,
"learning_rate": 7.009318313561908e-05,
"loss": 0.0475,
"step": 7200
},
{
"epoch": 7.425334706488156,
"grad_norm": 0.2761160731315613,
"learning_rate": 7.000903363387482e-05,
"loss": 0.0448,
"step": 7210
},
{
"epoch": 7.435633367662204,
"grad_norm": 0.39867162704467773,
"learning_rate": 6.99248165964133e-05,
"loss": 0.0455,
"step": 7220
},
{
"epoch": 7.445932028836252,
"grad_norm": 0.3500315546989441,
"learning_rate": 6.9840532307489e-05,
"loss": 0.0452,
"step": 7230
},
{
"epoch": 7.456230690010298,
"grad_norm": 0.30247119069099426,
"learning_rate": 6.975618105158346e-05,
"loss": 0.0458,
"step": 7240
},
{
"epoch": 7.466529351184346,
"grad_norm": 0.357147753238678,
"learning_rate": 6.967176311340418e-05,
"loss": 0.0401,
"step": 7250
},
{
"epoch": 7.476828012358394,
"grad_norm": 0.36390820145606995,
"learning_rate": 6.958727877788378e-05,
"loss": 0.0432,
"step": 7260
},
{
"epoch": 7.48712667353244,
"grad_norm": 0.3110693395137787,
"learning_rate": 6.950272833017896e-05,
"loss": 0.0413,
"step": 7270
},
{
"epoch": 7.497425334706488,
"grad_norm": 0.26132798194885254,
"learning_rate": 6.941811205566957e-05,
"loss": 0.0448,
"step": 7280
},
{
"epoch": 7.507723995880536,
"grad_norm": 0.2721041142940521,
"learning_rate": 6.933343023995767e-05,
"loss": 0.0358,
"step": 7290
},
{
"epoch": 7.518022657054583,
"grad_norm": 0.26367267966270447,
"learning_rate": 6.924868316886649e-05,
"loss": 0.0515,
"step": 7300
},
{
"epoch": 7.52832131822863,
"grad_norm": 0.4417518377304077,
"learning_rate": 6.916387112843957e-05,
"loss": 0.054,
"step": 7310
},
{
"epoch": 7.538619979402678,
"grad_norm": 0.3166719079017639,
"learning_rate": 6.907899440493968e-05,
"loss": 0.0485,
"step": 7320
},
{
"epoch": 7.548918640576725,
"grad_norm": 0.330705463886261,
"learning_rate": 6.899405328484794e-05,
"loss": 0.0444,
"step": 7330
},
{
"epoch": 7.559217301750772,
"grad_norm": 0.22663088142871857,
"learning_rate": 6.890904805486286e-05,
"loss": 0.0424,
"step": 7340
},
{
"epoch": 7.56951596292482,
"grad_norm": 0.3720453083515167,
"learning_rate": 6.88239790018993e-05,
"loss": 0.043,
"step": 7350
},
{
"epoch": 7.5798146240988675,
"grad_norm": 0.2161106914281845,
"learning_rate": 6.873884641308752e-05,
"loss": 0.042,
"step": 7360
},
{
"epoch": 7.590113285272914,
"grad_norm": 0.3371187448501587,
"learning_rate": 6.865365057577227e-05,
"loss": 0.0463,
"step": 7370
},
{
"epoch": 7.600411946446962,
"grad_norm": 0.3055129945278168,
"learning_rate": 6.856839177751176e-05,
"loss": 0.0474,
"step": 7380
},
{
"epoch": 7.61071060762101,
"grad_norm": 0.3375736474990845,
"learning_rate": 6.84830703060767e-05,
"loss": 0.0439,
"step": 7390
},
{
"epoch": 7.621009268795056,
"grad_norm": 0.3460111916065216,
"learning_rate": 6.839768644944937e-05,
"loss": 0.0464,
"step": 7400
},
{
"epoch": 7.631307929969104,
"grad_norm": 0.3610309660434723,
"learning_rate": 6.83122404958226e-05,
"loss": 0.0441,
"step": 7410
},
{
"epoch": 7.641606591143152,
"grad_norm": 0.32009249925613403,
"learning_rate": 6.82267327335988e-05,
"loss": 0.0405,
"step": 7420
},
{
"epoch": 7.651905252317198,
"grad_norm": 0.532019853591919,
"learning_rate": 6.814116345138902e-05,
"loss": 0.0401,
"step": 7430
},
{
"epoch": 7.662203913491246,
"grad_norm": 0.25246256589889526,
"learning_rate": 6.805553293801196e-05,
"loss": 0.0476,
"step": 7440
},
{
"epoch": 7.672502574665294,
"grad_norm": 0.2576782703399658,
"learning_rate": 6.796984148249295e-05,
"loss": 0.0456,
"step": 7450
},
{
"epoch": 7.6828012358393405,
"grad_norm": 0.4437432885169983,
"learning_rate": 6.788408937406307e-05,
"loss": 0.0434,
"step": 7460
},
{
"epoch": 7.693099897013388,
"grad_norm": 0.3884623050689697,
"learning_rate": 6.77982769021581e-05,
"loss": 0.0433,
"step": 7470
},
{
"epoch": 7.703398558187436,
"grad_norm": 0.30564385652542114,
"learning_rate": 6.771240435641754e-05,
"loss": 0.0419,
"step": 7480
},
{
"epoch": 7.7136972193614834,
"grad_norm": 0.29946035146713257,
"learning_rate": 6.762647202668366e-05,
"loss": 0.0481,
"step": 7490
},
{
"epoch": 7.72399588053553,
"grad_norm": 0.270355761051178,
"learning_rate": 6.754048020300054e-05,
"loss": 0.0432,
"step": 7500
},
{
"epoch": 7.734294541709578,
"grad_norm": 0.3664805293083191,
"learning_rate": 6.745442917561309e-05,
"loss": 0.0379,
"step": 7510
},
{
"epoch": 7.7445932028836255,
"grad_norm": 0.788110077381134,
"learning_rate": 6.736831923496596e-05,
"loss": 0.0521,
"step": 7520
},
{
"epoch": 7.754891864057672,
"grad_norm": 0.46117472648620605,
"learning_rate": 6.728215067170273e-05,
"loss": 0.0487,
"step": 7530
},
{
"epoch": 7.76519052523172,
"grad_norm": 0.18957702815532684,
"learning_rate": 6.719592377666483e-05,
"loss": 0.0479,
"step": 7540
},
{
"epoch": 7.775489186405768,
"grad_norm": 0.4086840748786926,
"learning_rate": 6.710963884089054e-05,
"loss": 0.0426,
"step": 7550
},
{
"epoch": 7.785787847579814,
"grad_norm": 0.21845366060733795,
"learning_rate": 6.70232961556141e-05,
"loss": 0.0402,
"step": 7560
},
{
"epoch": 7.796086508753862,
"grad_norm": 0.18775074183940887,
"learning_rate": 6.693689601226458e-05,
"loss": 0.04,
"step": 7570
},
{
"epoch": 7.80638516992791,
"grad_norm": 0.30147698521614075,
"learning_rate": 6.685043870246507e-05,
"loss": 0.0434,
"step": 7580
},
{
"epoch": 7.816683831101956,
"grad_norm": 0.366470068693161,
"learning_rate": 6.676392451803161e-05,
"loss": 0.0463,
"step": 7590
},
{
"epoch": 7.826982492276004,
"grad_norm": 0.3885975778102875,
"learning_rate": 6.667735375097214e-05,
"loss": 0.0453,
"step": 7600
},
{
"epoch": 7.837281153450052,
"grad_norm": 0.29683852195739746,
"learning_rate": 6.659072669348564e-05,
"loss": 0.0419,
"step": 7610
},
{
"epoch": 7.8475798146240985,
"grad_norm": 0.29188981652259827,
"learning_rate": 6.650404363796108e-05,
"loss": 0.0371,
"step": 7620
},
{
"epoch": 7.857878475798146,
"grad_norm": 0.40961870551109314,
"learning_rate": 6.641730487697639e-05,
"loss": 0.0435,
"step": 7630
},
{
"epoch": 7.868177136972194,
"grad_norm": 0.33139774203300476,
"learning_rate": 6.633051070329759e-05,
"loss": 0.0413,
"step": 7640
},
{
"epoch": 7.8784757981462405,
"grad_norm": 0.28173500299453735,
"learning_rate": 6.624366140987768e-05,
"loss": 0.0452,
"step": 7650
},
{
"epoch": 7.888774459320288,
"grad_norm": 0.2889021039009094,
"learning_rate": 6.615675728985572e-05,
"loss": 0.0423,
"step": 7660
},
{
"epoch": 7.899073120494336,
"grad_norm": 0.6384182572364807,
"learning_rate": 6.606979863655583e-05,
"loss": 0.0379,
"step": 7670
},
{
"epoch": 7.9093717816683835,
"grad_norm": 0.4132192134857178,
"learning_rate": 6.598278574348619e-05,
"loss": 0.0391,
"step": 7680
},
{
"epoch": 7.91967044284243,
"grad_norm": 0.3432478606700897,
"learning_rate": 6.589571890433803e-05,
"loss": 0.0473,
"step": 7690
},
{
"epoch": 7.929969104016478,
"grad_norm": 0.3030139207839966,
"learning_rate": 6.580859841298471e-05,
"loss": 0.0374,
"step": 7700
},
{
"epoch": 7.940267765190526,
"grad_norm": 0.27307939529418945,
"learning_rate": 6.572142456348065e-05,
"loss": 0.0402,
"step": 7710
},
{
"epoch": 7.950566426364572,
"grad_norm": 0.2667880952358246,
"learning_rate": 6.563419765006038e-05,
"loss": 0.0463,
"step": 7720
},
{
"epoch": 7.96086508753862,
"grad_norm": 0.37028032541275024,
"learning_rate": 6.55469179671375e-05,
"loss": 0.038,
"step": 7730
},
{
"epoch": 7.971163748712668,
"grad_norm": 0.3381376266479492,
"learning_rate": 6.545958580930377e-05,
"loss": 0.0455,
"step": 7740
},
{
"epoch": 7.981462409886714,
"grad_norm": 0.28161460161209106,
"learning_rate": 6.537220147132805e-05,
"loss": 0.0396,
"step": 7750
},
{
"epoch": 7.991761071060762,
"grad_norm": 0.26298263669013977,
"learning_rate": 6.528476524815528e-05,
"loss": 0.0424,
"step": 7760
},
{
"epoch": 8.002059732234809,
"grad_norm": 0.2671511769294739,
"learning_rate": 6.519727743490561e-05,
"loss": 0.0384,
"step": 7770
},
{
"epoch": 8.012358393408856,
"grad_norm": 0.3101862967014313,
"learning_rate": 6.510973832687323e-05,
"loss": 0.0465,
"step": 7780
},
{
"epoch": 8.022657054582904,
"grad_norm": 0.3037969768047333,
"learning_rate": 6.502214821952555e-05,
"loss": 0.0473,
"step": 7790
},
{
"epoch": 8.032955715756952,
"grad_norm": 0.45323264598846436,
"learning_rate": 6.493450740850203e-05,
"loss": 0.0432,
"step": 7800
},
{
"epoch": 8.043254376931,
"grad_norm": 0.41797924041748047,
"learning_rate": 6.484681618961331e-05,
"loss": 0.048,
"step": 7810
},
{
"epoch": 8.053553038105047,
"grad_norm": 0.4865727424621582,
"learning_rate": 6.47590748588402e-05,
"loss": 0.0512,
"step": 7820
},
{
"epoch": 8.063851699279093,
"grad_norm": 0.3105076849460602,
"learning_rate": 6.46712837123326e-05,
"loss": 0.0448,
"step": 7830
},
{
"epoch": 8.07415036045314,
"grad_norm": 0.25625815987586975,
"learning_rate": 6.458344304640858e-05,
"loss": 0.0416,
"step": 7840
},
{
"epoch": 8.084449021627188,
"grad_norm": 0.31119033694267273,
"learning_rate": 6.449555315755333e-05,
"loss": 0.041,
"step": 7850
},
{
"epoch": 8.094747682801236,
"grad_norm": 0.39366838335990906,
"learning_rate": 6.440761434241821e-05,
"loss": 0.0404,
"step": 7860
},
{
"epoch": 8.105046343975284,
"grad_norm": 0.31691083312034607,
"learning_rate": 6.431962689781969e-05,
"loss": 0.0392,
"step": 7870
},
{
"epoch": 8.115345005149331,
"grad_norm": 0.23836584389209747,
"learning_rate": 6.423159112073838e-05,
"loss": 0.0455,
"step": 7880
},
{
"epoch": 8.125643666323377,
"grad_norm": 0.2766348719596863,
"learning_rate": 6.414350730831805e-05,
"loss": 0.0405,
"step": 7890
},
{
"epoch": 8.135942327497425,
"grad_norm": 0.3610820174217224,
"learning_rate": 6.405537575786456e-05,
"loss": 0.0459,
"step": 7900
},
{
"epoch": 8.146240988671472,
"grad_norm": 0.4069831669330597,
"learning_rate": 6.396719676684494e-05,
"loss": 0.0449,
"step": 7910
},
{
"epoch": 8.15653964984552,
"grad_norm": 0.38294172286987305,
"learning_rate": 6.387897063288635e-05,
"loss": 0.0495,
"step": 7920
},
{
"epoch": 8.166838311019568,
"grad_norm": 0.3302978575229645,
"learning_rate": 6.3790697653775e-05,
"loss": 0.0453,
"step": 7930
},
{
"epoch": 8.177136972193615,
"grad_norm": 0.26982101798057556,
"learning_rate": 6.37023781274553e-05,
"loss": 0.0463,
"step": 7940
},
{
"epoch": 8.187435633367663,
"grad_norm": 0.23370954394340515,
"learning_rate": 6.361401235202872e-05,
"loss": 0.0465,
"step": 7950
},
{
"epoch": 8.197734294541709,
"grad_norm": 0.3092534840106964,
"learning_rate": 6.352560062575284e-05,
"loss": 0.055,
"step": 7960
},
{
"epoch": 8.208032955715757,
"grad_norm": 0.36051103472709656,
"learning_rate": 6.343714324704034e-05,
"loss": 0.0551,
"step": 7970
},
{
"epoch": 8.218331616889804,
"grad_norm": 0.33508798480033875,
"learning_rate": 6.3348640514458e-05,
"loss": 0.0462,
"step": 7980
},
{
"epoch": 8.228630278063852,
"grad_norm": 0.9673136472702026,
"learning_rate": 6.326009272672564e-05,
"loss": 0.0442,
"step": 7990
},
{
"epoch": 8.2389289392379,
"grad_norm": 1.469125509262085,
"learning_rate": 6.317150018271522e-05,
"loss": 0.0465,
"step": 8000
},
{
"epoch": 8.249227600411947,
"grad_norm": 0.3022879660129547,
"learning_rate": 6.308286318144971e-05,
"loss": 0.052,
"step": 8010
},
{
"epoch": 8.259526261585993,
"grad_norm": 0.240738183259964,
"learning_rate": 6.299418202210214e-05,
"loss": 0.044,
"step": 8020
},
{
"epoch": 8.26982492276004,
"grad_norm": 0.3125,
"learning_rate": 6.290545700399462e-05,
"loss": 0.0413,
"step": 8030
},
{
"epoch": 8.280123583934088,
"grad_norm": 0.3256394565105438,
"learning_rate": 6.281668842659725e-05,
"loss": 0.0381,
"step": 8040
},
{
"epoch": 8.290422245108136,
"grad_norm": 0.3764393925666809,
"learning_rate": 6.27278765895272e-05,
"loss": 0.0412,
"step": 8050
},
{
"epoch": 8.300720906282184,
"grad_norm": 0.28021517395973206,
"learning_rate": 6.263902179254762e-05,
"loss": 0.0392,
"step": 8060
},
{
"epoch": 8.311019567456231,
"grad_norm": 0.3545322120189667,
"learning_rate": 6.255012433556665e-05,
"loss": 0.039,
"step": 8070
},
{
"epoch": 8.321318228630279,
"grad_norm": 0.33872804045677185,
"learning_rate": 6.246118451863646e-05,
"loss": 0.0417,
"step": 8080
},
{
"epoch": 8.331616889804325,
"grad_norm": 0.9136466383934021,
"learning_rate": 6.237220264195216e-05,
"loss": 0.0429,
"step": 8090
},
{
"epoch": 8.341915550978372,
"grad_norm": 0.31747815012931824,
"learning_rate": 6.228317900585083e-05,
"loss": 0.0425,
"step": 8100
},
{
"epoch": 8.35221421215242,
"grad_norm": 0.3648073375225067,
"learning_rate": 6.219411391081055e-05,
"loss": 0.0384,
"step": 8110
},
{
"epoch": 8.362512873326468,
"grad_norm": 0.26562437415122986,
"learning_rate": 6.210500765744925e-05,
"loss": 0.036,
"step": 8120
},
{
"epoch": 8.372811534500515,
"grad_norm": 0.2761411666870117,
"learning_rate": 6.201586054652379e-05,
"loss": 0.0466,
"step": 8130
},
{
"epoch": 8.383110195674563,
"grad_norm": 0.46033117175102234,
"learning_rate": 6.192667287892905e-05,
"loss": 0.0432,
"step": 8140
},
{
"epoch": 8.393408856848609,
"grad_norm": 0.3292730450630188,
"learning_rate": 6.183744495569666e-05,
"loss": 0.0426,
"step": 8150
},
{
"epoch": 8.403707518022657,
"grad_norm": 0.2943620979785919,
"learning_rate": 6.174817707799417e-05,
"loss": 0.0483,
"step": 8160
},
{
"epoch": 8.414006179196704,
"grad_norm": 0.3903990685939789,
"learning_rate": 6.165886954712401e-05,
"loss": 0.043,
"step": 8170
},
{
"epoch": 8.424304840370752,
"grad_norm": 0.41772767901420593,
"learning_rate": 6.156952266452247e-05,
"loss": 0.0407,
"step": 8180
},
{
"epoch": 8.4346035015448,
"grad_norm": 0.5899285078048706,
"learning_rate": 6.148013673175857e-05,
"loss": 0.0434,
"step": 8190
},
{
"epoch": 8.444902162718847,
"grad_norm": 0.22386884689331055,
"learning_rate": 6.13907120505332e-05,
"loss": 0.042,
"step": 8200
},
{
"epoch": 8.455200823892893,
"grad_norm": 0.3034772276878357,
"learning_rate": 6.130124892267806e-05,
"loss": 0.0365,
"step": 8210
},
{
"epoch": 8.46549948506694,
"grad_norm": 0.37777379155158997,
"learning_rate": 6.121174765015455e-05,
"loss": 0.0419,
"step": 8220
},
{
"epoch": 8.475798146240988,
"grad_norm": 0.30282172560691833,
"learning_rate": 6.112220853505288e-05,
"loss": 0.0418,
"step": 8230
},
{
"epoch": 8.486096807415036,
"grad_norm": 0.5801701545715332,
"learning_rate": 6.103263187959095e-05,
"loss": 0.049,
"step": 8240
},
{
"epoch": 8.496395468589084,
"grad_norm": 0.32179057598114014,
"learning_rate": 6.094301798611338e-05,
"loss": 0.0396,
"step": 8250
},
{
"epoch": 8.506694129763131,
"grad_norm": 0.2766133248806,
"learning_rate": 6.085336715709049e-05,
"loss": 0.0484,
"step": 8260
},
{
"epoch": 8.516992790937179,
"grad_norm": 0.2891679108142853,
"learning_rate": 6.076367969511725e-05,
"loss": 0.0483,
"step": 8270
},
{
"epoch": 8.527291452111225,
"grad_norm": 0.35707661509513855,
"learning_rate": 6.067395590291226e-05,
"loss": 0.0468,
"step": 8280
},
{
"epoch": 8.537590113285273,
"grad_norm": 0.29469162225723267,
"learning_rate": 6.0584196083316794e-05,
"loss": 0.0441,
"step": 8290
},
{
"epoch": 8.54788877445932,
"grad_norm": 0.29220518469810486,
"learning_rate": 6.0494400539293675e-05,
"loss": 0.0389,
"step": 8300
},
{
"epoch": 8.558187435633368,
"grad_norm": 0.3941989243030548,
"learning_rate": 6.040456957392635e-05,
"loss": 0.0389,
"step": 8310
},
{
"epoch": 8.568486096807415,
"grad_norm": 0.2707824409008026,
"learning_rate": 6.03147034904178e-05,
"loss": 0.0471,
"step": 8320
},
{
"epoch": 8.578784757981463,
"grad_norm": 0.35828855633735657,
"learning_rate": 6.0224802592089513e-05,
"loss": 0.0453,
"step": 8330
},
{
"epoch": 8.589083419155509,
"grad_norm": 0.2687852382659912,
"learning_rate": 6.013486718238055e-05,
"loss": 0.041,
"step": 8340
},
{
"epoch": 8.599382080329557,
"grad_norm": 0.25436437129974365,
"learning_rate": 6.004489756484641e-05,
"loss": 0.0411,
"step": 8350
},
{
"epoch": 8.609680741503604,
"grad_norm": 0.22475087642669678,
"learning_rate": 5.995489404315806e-05,
"loss": 0.0409,
"step": 8360
},
{
"epoch": 8.619979402677652,
"grad_norm": 0.32723718881607056,
"learning_rate": 5.98648569211009e-05,
"loss": 0.0477,
"step": 8370
},
{
"epoch": 8.6302780638517,
"grad_norm": 0.2676869034767151,
"learning_rate": 5.977478650257374e-05,
"loss": 0.0363,
"step": 8380
},
{
"epoch": 8.640576725025747,
"grad_norm": 0.6640805006027222,
"learning_rate": 5.9684683091587804e-05,
"loss": 0.0396,
"step": 8390
},
{
"epoch": 8.650875386199793,
"grad_norm": 0.29109275341033936,
"learning_rate": 5.959454699226562e-05,
"loss": 0.0452,
"step": 8400
},
{
"epoch": 8.66117404737384,
"grad_norm": 0.39319050312042236,
"learning_rate": 5.95043785088401e-05,
"loss": 0.0359,
"step": 8410
},
{
"epoch": 8.671472708547888,
"grad_norm": 0.2134009450674057,
"learning_rate": 5.941417794565343e-05,
"loss": 0.0387,
"step": 8420
},
{
"epoch": 8.681771369721936,
"grad_norm": 0.21827584505081177,
"learning_rate": 5.9323945607156076e-05,
"loss": 0.0382,
"step": 8430
},
{
"epoch": 8.692070030895984,
"grad_norm": 0.41963616013526917,
"learning_rate": 5.9233681797905785e-05,
"loss": 0.0404,
"step": 8440
},
{
"epoch": 8.702368692070031,
"grad_norm": 0.21744829416275024,
"learning_rate": 5.914338682256647e-05,
"loss": 0.0437,
"step": 8450
},
{
"epoch": 8.712667353244079,
"grad_norm": 0.27720943093299866,
"learning_rate": 5.905306098590728e-05,
"loss": 0.0403,
"step": 8460
},
{
"epoch": 8.722966014418125,
"grad_norm": 0.30195143818855286,
"learning_rate": 5.896270459280153e-05,
"loss": 0.0374,
"step": 8470
},
{
"epoch": 8.733264675592173,
"grad_norm": 0.32989758253097534,
"learning_rate": 5.8872317948225644e-05,
"loss": 0.0368,
"step": 8480
},
{
"epoch": 8.74356333676622,
"grad_norm": 0.22078627347946167,
"learning_rate": 5.8781901357258165e-05,
"loss": 0.0467,
"step": 8490
},
{
"epoch": 8.753861997940268,
"grad_norm": 0.5876451134681702,
"learning_rate": 5.869145512507872e-05,
"loss": 0.0407,
"step": 8500
},
{
"epoch": 8.764160659114316,
"grad_norm": 0.44796323776245117,
"learning_rate": 5.860097955696698e-05,
"loss": 0.0382,
"step": 8510
},
{
"epoch": 8.774459320288363,
"grad_norm": 0.35779476165771484,
"learning_rate": 5.851047495830163e-05,
"loss": 0.0438,
"step": 8520
},
{
"epoch": 8.784757981462409,
"grad_norm": 0.28585049510002136,
"learning_rate": 5.841994163455934e-05,
"loss": 0.0376,
"step": 8530
},
{
"epoch": 8.795056642636457,
"grad_norm": 0.26791223883628845,
"learning_rate": 5.832937989131374e-05,
"loss": 0.0387,
"step": 8540
},
{
"epoch": 8.805355303810504,
"grad_norm": 0.5671482086181641,
"learning_rate": 5.823879003423438e-05,
"loss": 0.0366,
"step": 8550
},
{
"epoch": 8.815653964984552,
"grad_norm": 0.1565544456243515,
"learning_rate": 5.8148172369085686e-05,
"loss": 0.0369,
"step": 8560
},
{
"epoch": 8.8259526261586,
"grad_norm": 0.46639129519462585,
"learning_rate": 5.8057527201725984e-05,
"loss": 0.0398,
"step": 8570
},
{
"epoch": 8.836251287332647,
"grad_norm": 0.8469918370246887,
"learning_rate": 5.796685483810637e-05,
"loss": 0.047,
"step": 8580
},
{
"epoch": 8.846549948506695,
"grad_norm": 0.1878482550382614,
"learning_rate": 5.7876155584269785e-05,
"loss": 0.0386,
"step": 8590
},
{
"epoch": 8.85684860968074,
"grad_norm": 0.26714402437210083,
"learning_rate": 5.7785429746349905e-05,
"loss": 0.049,
"step": 8600
},
{
"epoch": 8.867147270854788,
"grad_norm": 0.35005736351013184,
"learning_rate": 5.7694677630570146e-05,
"loss": 0.0435,
"step": 8610
},
{
"epoch": 8.877445932028836,
"grad_norm": 0.48994550108909607,
"learning_rate": 5.760389954324261e-05,
"loss": 0.049,
"step": 8620
},
{
"epoch": 8.887744593202884,
"grad_norm": 0.24901621043682098,
"learning_rate": 5.7513095790767066e-05,
"loss": 0.0445,
"step": 8630
},
{
"epoch": 8.898043254376931,
"grad_norm": 0.32309484481811523,
"learning_rate": 5.742226667962991e-05,
"loss": 0.0471,
"step": 8640
},
{
"epoch": 8.908341915550979,
"grad_norm": 0.30904820561408997,
"learning_rate": 5.733141251640315e-05,
"loss": 0.0377,
"step": 8650
},
{
"epoch": 8.918640576725025,
"grad_norm": 0.30617690086364746,
"learning_rate": 5.724053360774327e-05,
"loss": 0.0378,
"step": 8660
},
{
"epoch": 8.928939237899073,
"grad_norm": 0.19513899087905884,
"learning_rate": 5.7149630260390384e-05,
"loss": 0.0315,
"step": 8670
},
{
"epoch": 8.93923789907312,
"grad_norm": 0.5502423048019409,
"learning_rate": 5.705870278116703e-05,
"loss": 0.0422,
"step": 8680
},
{
"epoch": 8.949536560247168,
"grad_norm": 0.3435225486755371,
"learning_rate": 5.6967751476977215e-05,
"loss": 0.0406,
"step": 8690
},
{
"epoch": 8.959835221421216,
"grad_norm": 0.28045403957366943,
"learning_rate": 5.687677665480533e-05,
"loss": 0.0473,
"step": 8700
},
{
"epoch": 8.970133882595263,
"grad_norm": 0.2749752700328827,
"learning_rate": 5.6785778621715225e-05,
"loss": 0.0378,
"step": 8710
},
{
"epoch": 8.98043254376931,
"grad_norm": 0.39981475472450256,
"learning_rate": 5.669475768484901e-05,
"loss": 0.0406,
"step": 8720
},
{
"epoch": 8.990731204943357,
"grad_norm": 0.28953787684440613,
"learning_rate": 5.660371415142611e-05,
"loss": 0.0379,
"step": 8730
},
{
"epoch": 9.001029866117404,
"grad_norm": 0.17452044785022736,
"learning_rate": 5.65126483287423e-05,
"loss": 0.0412,
"step": 8740
},
{
"epoch": 9.011328527291452,
"grad_norm": 0.3600793182849884,
"learning_rate": 5.642156052416849e-05,
"loss": 0.041,
"step": 8750
},
{
"epoch": 9.0216271884655,
"grad_norm": 0.2760295569896698,
"learning_rate": 5.633045104514982e-05,
"loss": 0.0435,
"step": 8760
},
{
"epoch": 9.031925849639547,
"grad_norm": 0.3825409710407257,
"learning_rate": 5.6239320199204616e-05,
"loss": 0.0408,
"step": 8770
},
{
"epoch": 9.042224510813595,
"grad_norm": 0.374891072511673,
"learning_rate": 5.614816829392328e-05,
"loss": 0.0383,
"step": 8780
},
{
"epoch": 9.052523171987641,
"grad_norm": 0.27747559547424316,
"learning_rate": 5.60569956369673e-05,
"loss": 0.0464,
"step": 8790
},
{
"epoch": 9.062821833161689,
"grad_norm": 0.28678062558174133,
"learning_rate": 5.596580253606824e-05,
"loss": 0.0487,
"step": 8800
},
{
"epoch": 9.073120494335736,
"grad_norm": 0.4970363676548004,
"learning_rate": 5.587458929902664e-05,
"loss": 0.051,
"step": 8810
},
{
"epoch": 9.083419155509784,
"grad_norm": 0.30037108063697815,
"learning_rate": 5.5783356233711005e-05,
"loss": 0.0383,
"step": 8820
},
{
"epoch": 9.093717816683832,
"grad_norm": 0.2640860676765442,
"learning_rate": 5.569210364805677e-05,
"loss": 0.0462,
"step": 8830
},
{
"epoch": 9.10401647785788,
"grad_norm": 0.30006083846092224,
"learning_rate": 5.5600831850065274e-05,
"loss": 0.0362,
"step": 8840
},
{
"epoch": 9.114315139031925,
"grad_norm": 0.3721349537372589,
"learning_rate": 5.550954114780269e-05,
"loss": 0.0399,
"step": 8850
},
{
"epoch": 9.124613800205973,
"grad_norm": 0.336732417345047,
"learning_rate": 5.541823184939896e-05,
"loss": 0.0421,
"step": 8860
},
{
"epoch": 9.13491246138002,
"grad_norm": 0.26279309391975403,
"learning_rate": 5.532690426304685e-05,
"loss": 0.0433,
"step": 8870
},
{
"epoch": 9.145211122554068,
"grad_norm": 0.2945043742656708,
"learning_rate": 5.5235558697000836e-05,
"loss": 0.0439,
"step": 8880
},
{
"epoch": 9.155509783728116,
"grad_norm": 0.47877517342567444,
"learning_rate": 5.514419545957606e-05,
"loss": 0.0431,
"step": 8890
},
{
"epoch": 9.165808444902163,
"grad_norm": 0.3854601979255676,
"learning_rate": 5.5052814859147315e-05,
"loss": 0.0365,
"step": 8900
},
{
"epoch": 9.176107106076211,
"grad_norm": 0.3006962835788727,
"learning_rate": 5.496141720414804e-05,
"loss": 0.0427,
"step": 8910
},
{
"epoch": 9.186405767250257,
"grad_norm": 0.5065596699714661,
"learning_rate": 5.487000280306917e-05,
"loss": 0.0395,
"step": 8920
},
{
"epoch": 9.196704428424304,
"grad_norm": 0.4032178521156311,
"learning_rate": 5.4778571964458214e-05,
"loss": 0.0341,
"step": 8930
},
{
"epoch": 9.207003089598352,
"grad_norm": 0.357695609331131,
"learning_rate": 5.468712499691816e-05,
"loss": 0.0427,
"step": 8940
},
{
"epoch": 9.2173017507724,
"grad_norm": 0.6212796568870544,
"learning_rate": 5.45956622091064e-05,
"loss": 0.0444,
"step": 8950
},
{
"epoch": 9.227600411946447,
"grad_norm": 0.29458391666412354,
"learning_rate": 5.4504183909733734e-05,
"loss": 0.0402,
"step": 8960
},
{
"epoch": 9.237899073120495,
"grad_norm": 0.309467613697052,
"learning_rate": 5.441269040756334e-05,
"loss": 0.0412,
"step": 8970
},
{
"epoch": 9.248197734294541,
"grad_norm": 0.17707674205303192,
"learning_rate": 5.43211820114097e-05,
"loss": 0.0423,
"step": 8980
},
{
"epoch": 9.258496395468589,
"grad_norm": 0.4098307490348816,
"learning_rate": 5.422965903013757e-05,
"loss": 0.0421,
"step": 8990
},
{
"epoch": 9.268795056642636,
"grad_norm": 0.31290164589881897,
"learning_rate": 5.41381217726609e-05,
"loss": 0.0402,
"step": 9000
},
{
"epoch": 9.279093717816684,
"grad_norm": 0.20957662165164948,
"learning_rate": 5.404657054794189e-05,
"loss": 0.0426,
"step": 9010
},
{
"epoch": 9.289392378990732,
"grad_norm": 0.2308698147535324,
"learning_rate": 5.3955005664989834e-05,
"loss": 0.0389,
"step": 9020
},
{
"epoch": 9.29969104016478,
"grad_norm": 0.2409774512052536,
"learning_rate": 5.3863427432860125e-05,
"loss": 0.0352,
"step": 9030
},
{
"epoch": 9.309989701338825,
"grad_norm": 0.24483443796634674,
"learning_rate": 5.3771836160653254e-05,
"loss": 0.0406,
"step": 9040
},
{
"epoch": 9.320288362512873,
"grad_norm": 0.2869531810283661,
"learning_rate": 5.368023215751369e-05,
"loss": 0.0379,
"step": 9050
},
{
"epoch": 9.33058702368692,
"grad_norm": 0.27807915210723877,
"learning_rate": 5.3588615732628854e-05,
"loss": 0.0451,
"step": 9060
},
{
"epoch": 9.340885684860968,
"grad_norm": 0.33199331164360046,
"learning_rate": 5.3496987195228156e-05,
"loss": 0.034,
"step": 9070
},
{
"epoch": 9.351184346035016,
"grad_norm": 0.2562348246574402,
"learning_rate": 5.340534685458185e-05,
"loss": 0.0413,
"step": 9080
},
{
"epoch": 9.361483007209063,
"grad_norm": 0.3097791075706482,
"learning_rate": 5.3313695020000024e-05,
"loss": 0.039,
"step": 9090
},
{
"epoch": 9.371781668383111,
"grad_norm": 0.3079645037651062,
"learning_rate": 5.322203200083154e-05,
"loss": 0.0349,
"step": 9100
},
{
"epoch": 9.382080329557157,
"grad_norm": 0.4117037057876587,
"learning_rate": 5.3130358106463104e-05,
"loss": 0.0407,
"step": 9110
},
{
"epoch": 9.392378990731205,
"grad_norm": 0.4133201539516449,
"learning_rate": 5.303867364631804e-05,
"loss": 0.045,
"step": 9120
},
{
"epoch": 9.402677651905252,
"grad_norm": 0.2096584141254425,
"learning_rate": 5.294697892985534e-05,
"loss": 0.0335,
"step": 9130
},
{
"epoch": 9.4129763130793,
"grad_norm": 0.28559908270835876,
"learning_rate": 5.285527426656865e-05,
"loss": 0.0398,
"step": 9140
},
{
"epoch": 9.423274974253347,
"grad_norm": 0.3598606288433075,
"learning_rate": 5.2763559965985184e-05,
"loss": 0.0419,
"step": 9150
},
{
"epoch": 9.433573635427395,
"grad_norm": 0.35209372639656067,
"learning_rate": 5.2671836337664634e-05,
"loss": 0.0405,
"step": 9160
},
{
"epoch": 9.443872296601441,
"grad_norm": 0.23415158689022064,
"learning_rate": 5.2580103691198255e-05,
"loss": 0.0366,
"step": 9170
},
{
"epoch": 9.454170957775489,
"grad_norm": 0.2906668484210968,
"learning_rate": 5.24883623362077e-05,
"loss": 0.0493,
"step": 9180
},
{
"epoch": 9.464469618949536,
"grad_norm": 0.21137650310993195,
"learning_rate": 5.2396612582343986e-05,
"loss": 0.0423,
"step": 9190
},
{
"epoch": 9.474768280123584,
"grad_norm": 0.23499812185764313,
"learning_rate": 5.230485473928651e-05,
"loss": 0.0416,
"step": 9200
},
{
"epoch": 9.485066941297632,
"grad_norm": 0.372158020734787,
"learning_rate": 5.221308911674201e-05,
"loss": 0.0407,
"step": 9210
},
{
"epoch": 9.49536560247168,
"grad_norm": 0.2552221119403839,
"learning_rate": 5.2121316024443415e-05,
"loss": 0.0408,
"step": 9220
},
{
"epoch": 9.505664263645727,
"grad_norm": 0.27116450667381287,
"learning_rate": 5.202953577214889e-05,
"loss": 0.0375,
"step": 9230
},
{
"epoch": 9.515962924819773,
"grad_norm": 1.0216639041900635,
"learning_rate": 5.1937748669640776e-05,
"loss": 0.0412,
"step": 9240
},
{
"epoch": 9.52626158599382,
"grad_norm": 0.39132076501846313,
"learning_rate": 5.1845955026724535e-05,
"loss": 0.0408,
"step": 9250
},
{
"epoch": 9.536560247167868,
"grad_norm": 0.3046022653579712,
"learning_rate": 5.175415515322768e-05,
"loss": 0.0349,
"step": 9260
},
{
"epoch": 9.546858908341916,
"grad_norm": 0.5317039489746094,
"learning_rate": 5.1662349358998796e-05,
"loss": 0.0377,
"step": 9270
},
{
"epoch": 9.557157569515963,
"grad_norm": 0.308902382850647,
"learning_rate": 5.157053795390642e-05,
"loss": 0.0416,
"step": 9280
},
{
"epoch": 9.567456230690011,
"grad_norm": 0.1709175854921341,
"learning_rate": 5.147872124783805e-05,
"loss": 0.0367,
"step": 9290
},
{
"epoch": 9.577754891864057,
"grad_norm": 0.35447025299072266,
"learning_rate": 5.138689955069902e-05,
"loss": 0.0339,
"step": 9300
},
{
"epoch": 9.588053553038105,
"grad_norm": 0.20557384192943573,
"learning_rate": 5.12950731724116e-05,
"loss": 0.0435,
"step": 9310
},
{
"epoch": 9.598352214212152,
"grad_norm": 0.27278539538383484,
"learning_rate": 5.12032424229138e-05,
"loss": 0.0399,
"step": 9320
},
{
"epoch": 9.6086508753862,
"grad_norm": 0.3033859133720398,
"learning_rate": 5.111140761215839e-05,
"loss": 0.0376,
"step": 9330
},
{
"epoch": 9.618949536560248,
"grad_norm": 0.3543021082878113,
"learning_rate": 5.101956905011185e-05,
"loss": 0.0427,
"step": 9340
},
{
"epoch": 9.629248197734295,
"grad_norm": 0.2944181561470032,
"learning_rate": 5.0927727046753336e-05,
"loss": 0.0371,
"step": 9350
},
{
"epoch": 9.639546858908343,
"grad_norm": 0.3597414493560791,
"learning_rate": 5.08358819120736e-05,
"loss": 0.0373,
"step": 9360
},
{
"epoch": 9.649845520082389,
"grad_norm": 0.33194977045059204,
"learning_rate": 5.074403395607399e-05,
"loss": 0.0424,
"step": 9370
},
{
"epoch": 9.660144181256436,
"grad_norm": 0.21433711051940918,
"learning_rate": 5.0652183488765335e-05,
"loss": 0.0407,
"step": 9380
},
{
"epoch": 9.670442842430484,
"grad_norm": 0.3961849808692932,
"learning_rate": 5.056033082016699e-05,
"loss": 0.0419,
"step": 9390
},
{
"epoch": 9.680741503604532,
"grad_norm": 0.9774559140205383,
"learning_rate": 5.046847626030569e-05,
"loss": 0.041,
"step": 9400
},
{
"epoch": 9.69104016477858,
"grad_norm": 0.36883220076560974,
"learning_rate": 5.037662011921459e-05,
"loss": 0.0377,
"step": 9410
},
{
"epoch": 9.701338825952627,
"grad_norm": 0.37542909383773804,
"learning_rate": 5.028476270693217e-05,
"loss": 0.0408,
"step": 9420
},
{
"epoch": 9.711637487126673,
"grad_norm": 0.45353376865386963,
"learning_rate": 5.0192904333501214e-05,
"loss": 0.0419,
"step": 9430
},
{
"epoch": 9.72193614830072,
"grad_norm": 0.27116161584854126,
"learning_rate": 5.010104530896771e-05,
"loss": 0.0447,
"step": 9440
},
{
"epoch": 9.732234809474768,
"grad_norm": 0.26916906237602234,
"learning_rate": 5.000918594337989e-05,
"loss": 0.0461,
"step": 9450
},
{
"epoch": 9.742533470648816,
"grad_norm": 0.3069358766078949,
"learning_rate": 4.991732654678709e-05,
"loss": 0.0458,
"step": 9460
},
{
"epoch": 9.752832131822863,
"grad_norm": 0.42274564504623413,
"learning_rate": 4.9825467429238834e-05,
"loss": 0.0401,
"step": 9470
},
{
"epoch": 9.763130792996911,
"grad_norm": 0.17982327938079834,
"learning_rate": 4.973360890078358e-05,
"loss": 0.0427,
"step": 9480
},
{
"epoch": 9.773429454170957,
"grad_norm": 0.23251447081565857,
"learning_rate": 4.96417512714679e-05,
"loss": 0.0326,
"step": 9490
},
{
"epoch": 9.783728115345005,
"grad_norm": 0.2869229018688202,
"learning_rate": 4.954989485133533e-05,
"loss": 0.0507,
"step": 9500
},
{
"epoch": 9.794026776519052,
"grad_norm": 1.0959696769714355,
"learning_rate": 4.9458039950425224e-05,
"loss": 0.0518,
"step": 9510
},
{
"epoch": 9.8043254376931,
"grad_norm": 0.3641543686389923,
"learning_rate": 4.9366186878771926e-05,
"loss": 0.0434,
"step": 9520
},
{
"epoch": 9.814624098867148,
"grad_norm": 0.5896167159080505,
"learning_rate": 4.927433594640354e-05,
"loss": 0.0409,
"step": 9530
},
{
"epoch": 9.824922760041195,
"grad_norm": 0.24302540719509125,
"learning_rate": 4.918248746334096e-05,
"loss": 0.0451,
"step": 9540
},
{
"epoch": 9.835221421215241,
"grad_norm": 0.2889201045036316,
"learning_rate": 4.909064173959681e-05,
"loss": 0.0384,
"step": 9550
},
{
"epoch": 9.845520082389289,
"grad_norm": 0.37873101234436035,
"learning_rate": 4.8998799085174455e-05,
"loss": 0.0404,
"step": 9560
},
{
"epoch": 9.855818743563336,
"grad_norm": 0.4369457960128784,
"learning_rate": 4.89069598100668e-05,
"loss": 0.0431,
"step": 9570
},
{
"epoch": 9.866117404737384,
"grad_norm": 0.37580832839012146,
"learning_rate": 4.881512422425541e-05,
"loss": 0.044,
"step": 9580
},
{
"epoch": 9.876416065911432,
"grad_norm": 0.46920913457870483,
"learning_rate": 4.872329263770942e-05,
"loss": 0.0469,
"step": 9590
},
{
"epoch": 9.88671472708548,
"grad_norm": 0.24571798741817474,
"learning_rate": 4.8631465360384385e-05,
"loss": 0.0398,
"step": 9600
},
{
"epoch": 9.897013388259527,
"grad_norm": 0.3728749454021454,
"learning_rate": 4.85396427022214e-05,
"loss": 0.0352,
"step": 9610
},
{
"epoch": 9.907312049433573,
"grad_norm": 0.301878958940506,
"learning_rate": 4.844782497314591e-05,
"loss": 0.0432,
"step": 9620
},
{
"epoch": 9.91761071060762,
"grad_norm": 0.26632949709892273,
"learning_rate": 4.835601248306675e-05,
"loss": 0.0439,
"step": 9630
},
{
"epoch": 9.927909371781668,
"grad_norm": 0.31497064232826233,
"learning_rate": 4.826420554187506e-05,
"loss": 0.0399,
"step": 9640
},
{
"epoch": 9.938208032955716,
"grad_norm": 0.26114657521247864,
"learning_rate": 4.817240445944327e-05,
"loss": 0.0408,
"step": 9650
},
{
"epoch": 9.948506694129764,
"grad_norm": 0.2729547619819641,
"learning_rate": 4.8080609545624004e-05,
"loss": 0.0392,
"step": 9660
},
{
"epoch": 9.958805355303811,
"grad_norm": 0.22712601721286774,
"learning_rate": 4.798882111024912e-05,
"loss": 0.0363,
"step": 9670
},
{
"epoch": 9.969104016477857,
"grad_norm": 0.47241315245628357,
"learning_rate": 4.7897039463128524e-05,
"loss": 0.0369,
"step": 9680
},
{
"epoch": 9.979402677651905,
"grad_norm": 0.3929249048233032,
"learning_rate": 4.780526491404929e-05,
"loss": 0.0436,
"step": 9690
},
{
"epoch": 9.989701338825952,
"grad_norm": 0.32324254512786865,
"learning_rate": 4.771349777277452e-05,
"loss": 0.0418,
"step": 9700
},
{
"epoch": 10.0,
"grad_norm": 0.4991161525249481,
"learning_rate": 4.762173834904225e-05,
"loss": 0.0352,
"step": 9710
},
{
"epoch": 10.010298661174048,
"grad_norm": 0.2615014612674713,
"learning_rate": 4.752998695256455e-05,
"loss": 0.0412,
"step": 9720
},
{
"epoch": 10.020597322348095,
"grad_norm": 0.29027608036994934,
"learning_rate": 4.743824389302635e-05,
"loss": 0.035,
"step": 9730
},
{
"epoch": 10.030895983522143,
"grad_norm": 0.3496328294277191,
"learning_rate": 4.734650948008445e-05,
"loss": 0.038,
"step": 9740
},
{
"epoch": 10.041194644696189,
"grad_norm": 0.25003111362457275,
"learning_rate": 4.7254784023366444e-05,
"loss": 0.0408,
"step": 9750
},
{
"epoch": 10.051493305870236,
"grad_norm": 0.28183093667030334,
"learning_rate": 4.716306783246977e-05,
"loss": 0.0415,
"step": 9760
},
{
"epoch": 10.061791967044284,
"grad_norm": 0.3574424386024475,
"learning_rate": 4.707136121696048e-05,
"loss": 0.0394,
"step": 9770
},
{
"epoch": 10.072090628218332,
"grad_norm": 0.2761897146701813,
"learning_rate": 4.69796644863724e-05,
"loss": 0.034,
"step": 9780
},
{
"epoch": 10.08238928939238,
"grad_norm": 0.2602722644805908,
"learning_rate": 4.688797795020597e-05,
"loss": 0.0354,
"step": 9790
},
{
"epoch": 10.092687950566427,
"grad_norm": 0.2515560984611511,
"learning_rate": 4.6796301917927166e-05,
"loss": 0.0402,
"step": 9800
},
{
"epoch": 10.102986611740473,
"grad_norm": 0.24942000210285187,
"learning_rate": 4.670463669896659e-05,
"loss": 0.0406,
"step": 9810
},
{
"epoch": 10.11328527291452,
"grad_norm": 0.29609471559524536,
"learning_rate": 4.66129826027183e-05,
"loss": 0.0397,
"step": 9820
},
{
"epoch": 10.123583934088568,
"grad_norm": 0.3640936613082886,
"learning_rate": 4.652133993853883e-05,
"loss": 0.0456,
"step": 9830
},
{
"epoch": 10.133882595262616,
"grad_norm": 0.2724517285823822,
"learning_rate": 4.64297090157461e-05,
"loss": 0.0371,
"step": 9840
},
{
"epoch": 10.144181256436664,
"grad_norm": 0.33307430148124695,
"learning_rate": 4.633809014361843e-05,
"loss": 0.0438,
"step": 9850
},
{
"epoch": 10.154479917610711,
"grad_norm": 0.45976462960243225,
"learning_rate": 4.624648363139344e-05,
"loss": 0.0479,
"step": 9860
},
{
"epoch": 10.164778578784759,
"grad_norm": 0.24571570754051208,
"learning_rate": 4.615488978826709e-05,
"loss": 0.0375,
"step": 9870
},
{
"epoch": 10.175077239958805,
"grad_norm": 0.4202505052089691,
"learning_rate": 4.6063308923392485e-05,
"loss": 0.0446,
"step": 9880
},
{
"epoch": 10.185375901132852,
"grad_norm": 0.30180397629737854,
"learning_rate": 4.5971741345879e-05,
"loss": 0.0372,
"step": 9890
},
{
"epoch": 10.1956745623069,
"grad_norm": 0.39542245864868164,
"learning_rate": 4.588018736479115e-05,
"loss": 0.0407,
"step": 9900
},
{
"epoch": 10.205973223480948,
"grad_norm": 0.5576333403587341,
"learning_rate": 4.5788647289147516e-05,
"loss": 0.0372,
"step": 9910
},
{
"epoch": 10.216271884654995,
"grad_norm": 0.2639693319797516,
"learning_rate": 4.56971214279198e-05,
"loss": 0.0463,
"step": 9920
},
{
"epoch": 10.226570545829043,
"grad_norm": 0.26938265562057495,
"learning_rate": 4.56056100900317e-05,
"loss": 0.0367,
"step": 9930
},
{
"epoch": 10.236869207003089,
"grad_norm": 0.27783456444740295,
"learning_rate": 4.5514113584357873e-05,
"loss": 0.0369,
"step": 9940
},
{
"epoch": 10.247167868177137,
"grad_norm": 0.27680081129074097,
"learning_rate": 4.542263221972295e-05,
"loss": 0.0393,
"step": 9950
},
{
"epoch": 10.257466529351184,
"grad_norm": 0.2161240130662918,
"learning_rate": 4.5331166304900464e-05,
"loss": 0.042,
"step": 9960
},
{
"epoch": 10.267765190525232,
"grad_norm": 0.27455902099609375,
"learning_rate": 4.5239716148611724e-05,
"loss": 0.0434,
"step": 9970
},
{
"epoch": 10.27806385169928,
"grad_norm": 0.3013168275356293,
"learning_rate": 4.514828205952495e-05,
"loss": 0.0395,
"step": 9980
},
{
"epoch": 10.288362512873327,
"grad_norm": 0.2296813279390335,
"learning_rate": 4.505686434625409e-05,
"loss": 0.0368,
"step": 9990
},
{
"epoch": 10.298661174047373,
"grad_norm": 0.19806218147277832,
"learning_rate": 4.496546331735778e-05,
"loss": 0.0391,
"step": 10000
},
{
"epoch": 10.30895983522142,
"grad_norm": 0.24850870668888092,
"learning_rate": 4.4874079281338416e-05,
"loss": 0.0407,
"step": 10010
},
{
"epoch": 10.319258496395468,
"grad_norm": 0.16531158983707428,
"learning_rate": 4.478271254664097e-05,
"loss": 0.0359,
"step": 10020
},
{
"epoch": 10.329557157569516,
"grad_norm": 0.5394207835197449,
"learning_rate": 4.469136342165207e-05,
"loss": 0.0375,
"step": 10030
},
{
"epoch": 10.339855818743564,
"grad_norm": 0.4204263687133789,
"learning_rate": 4.460003221469886e-05,
"loss": 0.042,
"step": 10040
},
{
"epoch": 10.350154479917611,
"grad_norm": 2.313096284866333,
"learning_rate": 4.450871923404806e-05,
"loss": 0.0465,
"step": 10050
},
{
"epoch": 10.360453141091659,
"grad_norm": 0.6360970735549927,
"learning_rate": 4.441742478790481e-05,
"loss": 0.0421,
"step": 10060
},
{
"epoch": 10.370751802265705,
"grad_norm": 0.23286186158657074,
"learning_rate": 4.432614918441175e-05,
"loss": 0.0352,
"step": 10070
},
{
"epoch": 10.381050463439752,
"grad_norm": 0.3724748194217682,
"learning_rate": 4.4234892731647866e-05,
"loss": 0.0434,
"step": 10080
},
{
"epoch": 10.3913491246138,
"grad_norm": 0.212792307138443,
"learning_rate": 4.414365573762755e-05,
"loss": 0.0357,
"step": 10090
},
{
"epoch": 10.401647785787848,
"grad_norm": 0.22442536056041718,
"learning_rate": 4.4052438510299515e-05,
"loss": 0.0398,
"step": 10100
},
{
"epoch": 10.411946446961895,
"grad_norm": 0.3250674307346344,
"learning_rate": 4.3961241357545706e-05,
"loss": 0.0377,
"step": 10110
},
{
"epoch": 10.422245108135943,
"grad_norm": 0.2997426986694336,
"learning_rate": 4.387006458718037e-05,
"loss": 0.0385,
"step": 10120
},
{
"epoch": 10.432543769309989,
"grad_norm": 0.26953554153442383,
"learning_rate": 4.377890850694893e-05,
"loss": 0.0352,
"step": 10130
},
{
"epoch": 10.442842430484037,
"grad_norm": 0.3824928402900696,
"learning_rate": 4.368777342452697e-05,
"loss": 0.038,
"step": 10140
},
{
"epoch": 10.453141091658084,
"grad_norm": 0.33039042353630066,
"learning_rate": 4.35966596475192e-05,
"loss": 0.0354,
"step": 10150
},
{
"epoch": 10.463439752832132,
"grad_norm": 0.665787935256958,
"learning_rate": 4.3505567483458456e-05,
"loss": 0.0393,
"step": 10160
},
{
"epoch": 10.47373841400618,
"grad_norm": 0.25892671942710876,
"learning_rate": 4.341449723980457e-05,
"loss": 0.0403,
"step": 10170
},
{
"epoch": 10.484037075180227,
"grad_norm": 0.8381480574607849,
"learning_rate": 4.3323449223943416e-05,
"loss": 0.0403,
"step": 10180
},
{
"epoch": 10.494335736354273,
"grad_norm": 0.2520352303981781,
"learning_rate": 4.323242374318586e-05,
"loss": 0.0376,
"step": 10190
},
{
"epoch": 10.50463439752832,
"grad_norm": 0.30395472049713135,
"learning_rate": 4.314142110476666e-05,
"loss": 0.039,
"step": 10200
},
{
"epoch": 10.514933058702368,
"grad_norm": 0.2134946584701538,
"learning_rate": 4.305044161584352e-05,
"loss": 0.0356,
"step": 10210
},
{
"epoch": 10.525231719876416,
"grad_norm": 0.30410531163215637,
"learning_rate": 4.295948558349598e-05,
"loss": 0.0399,
"step": 10220
},
{
"epoch": 10.535530381050464,
"grad_norm": 0.3639879524707794,
"learning_rate": 4.2868553314724425e-05,
"loss": 0.0377,
"step": 10230
},
{
"epoch": 10.545829042224511,
"grad_norm": 0.7833529114723206,
"learning_rate": 4.2777645116449004e-05,
"loss": 0.042,
"step": 10240
},
{
"epoch": 10.556127703398559,
"grad_norm": 0.3496880829334259,
"learning_rate": 4.268676129550869e-05,
"loss": 0.043,
"step": 10250
},
{
"epoch": 10.566426364572605,
"grad_norm": 0.24933426082134247,
"learning_rate": 4.2595902158660074e-05,
"loss": 0.0392,
"step": 10260
},
{
"epoch": 10.576725025746653,
"grad_norm": 0.35013383626937866,
"learning_rate": 4.250506801257653e-05,
"loss": 0.0403,
"step": 10270
},
{
"epoch": 10.5870236869207,
"grad_norm": 0.5155181884765625,
"learning_rate": 4.241425916384699e-05,
"loss": 0.0383,
"step": 10280
},
{
"epoch": 10.597322348094748,
"grad_norm": 0.5019784569740295,
"learning_rate": 4.2323475918975075e-05,
"loss": 0.0412,
"step": 10290
},
{
"epoch": 10.607621009268795,
"grad_norm": 0.38487544655799866,
"learning_rate": 4.223271858437799e-05,
"loss": 0.0377,
"step": 10300
},
{
"epoch": 10.617919670442843,
"grad_norm": 0.2794114947319031,
"learning_rate": 4.21419874663854e-05,
"loss": 0.0398,
"step": 10310
},
{
"epoch": 10.628218331616889,
"grad_norm": 0.1784840226173401,
"learning_rate": 4.205128287123858e-05,
"loss": 0.0375,
"step": 10320
},
{
"epoch": 10.638516992790937,
"grad_norm": 0.19784130156040192,
"learning_rate": 4.196060510508922e-05,
"loss": 0.0329,
"step": 10330
},
{
"epoch": 10.648815653964984,
"grad_norm": 0.25078096985816956,
"learning_rate": 4.186995447399849e-05,
"loss": 0.0305,
"step": 10340
},
{
"epoch": 10.659114315139032,
"grad_norm": 0.2800082862377167,
"learning_rate": 4.177933128393594e-05,
"loss": 0.0386,
"step": 10350
},
{
"epoch": 10.66941297631308,
"grad_norm": 0.2689889073371887,
"learning_rate": 4.1688735840778546e-05,
"loss": 0.0355,
"step": 10360
},
{
"epoch": 10.679711637487127,
"grad_norm": 0.26448753476142883,
"learning_rate": 4.159816845030957e-05,
"loss": 0.0357,
"step": 10370
},
{
"epoch": 10.690010298661175,
"grad_norm": 0.2718246579170227,
"learning_rate": 4.1507629418217634e-05,
"loss": 0.0339,
"step": 10380
},
{
"epoch": 10.70030895983522,
"grad_norm": 0.2607558071613312,
"learning_rate": 4.141711905009566e-05,
"loss": 0.0397,
"step": 10390
},
{
"epoch": 10.710607621009268,
"grad_norm": 0.324266254901886,
"learning_rate": 4.132663765143975e-05,
"loss": 0.0355,
"step": 10400
},
{
"epoch": 10.720906282183316,
"grad_norm": 0.31110501289367676,
"learning_rate": 4.1236185527648294e-05,
"loss": 0.0389,
"step": 10410
},
{
"epoch": 10.731204943357364,
"grad_norm": 0.3010208010673523,
"learning_rate": 4.114576298402084e-05,
"loss": 0.0384,
"step": 10420
},
{
"epoch": 10.741503604531411,
"grad_norm": 0.42494192719459534,
"learning_rate": 4.1055370325757106e-05,
"loss": 0.0407,
"step": 10430
},
{
"epoch": 10.751802265705459,
"grad_norm": 0.26597830653190613,
"learning_rate": 4.096500785795591e-05,
"loss": 0.0351,
"step": 10440
},
{
"epoch": 10.762100926879505,
"grad_norm": 0.3270758092403412,
"learning_rate": 4.087467588561424e-05,
"loss": 0.0351,
"step": 10450
},
{
"epoch": 10.772399588053553,
"grad_norm": 0.35372480750083923,
"learning_rate": 4.0784374713626076e-05,
"loss": 0.0431,
"step": 10460
},
{
"epoch": 10.7826982492276,
"grad_norm": 0.3251330256462097,
"learning_rate": 4.069410464678148e-05,
"loss": 0.0352,
"step": 10470
},
{
"epoch": 10.792996910401648,
"grad_norm": 0.26621249318122864,
"learning_rate": 4.0603865989765504e-05,
"loss": 0.0432,
"step": 10480
},
{
"epoch": 10.803295571575696,
"grad_norm": 0.3128867745399475,
"learning_rate": 4.05136590471572e-05,
"loss": 0.0412,
"step": 10490
},
{
"epoch": 10.813594232749743,
"grad_norm": 0.20734545588493347,
"learning_rate": 4.042348412342861e-05,
"loss": 0.0352,
"step": 10500
},
{
"epoch": 10.82389289392379,
"grad_norm": 0.3195039629936218,
"learning_rate": 4.0333341522943614e-05,
"loss": 0.0374,
"step": 10510
},
{
"epoch": 10.834191555097837,
"grad_norm": 0.27724260091781616,
"learning_rate": 4.024323154995708e-05,
"loss": 0.0405,
"step": 10520
},
{
"epoch": 10.844490216271884,
"grad_norm": 0.2909531593322754,
"learning_rate": 4.015315450861371e-05,
"loss": 0.0364,
"step": 10530
},
{
"epoch": 10.854788877445932,
"grad_norm": 0.28578925132751465,
"learning_rate": 4.006311070294702e-05,
"loss": 0.0354,
"step": 10540
},
{
"epoch": 10.86508753861998,
"grad_norm": 0.2503175437450409,
"learning_rate": 3.997310043687842e-05,
"loss": 0.0348,
"step": 10550
},
{
"epoch": 10.875386199794027,
"grad_norm": 0.36039701104164124,
"learning_rate": 3.988312401421609e-05,
"loss": 0.0414,
"step": 10560
},
{
"epoch": 10.885684860968075,
"grad_norm": 0.45128464698791504,
"learning_rate": 3.979318173865393e-05,
"loss": 0.04,
"step": 10570
},
{
"epoch": 10.89598352214212,
"grad_norm": 0.35974377393722534,
"learning_rate": 3.970327391377064e-05,
"loss": 0.0392,
"step": 10580
},
{
"epoch": 10.906282183316168,
"grad_norm": 0.22907008230686188,
"learning_rate": 3.9613400843028666e-05,
"loss": 0.0342,
"step": 10590
},
{
"epoch": 10.916580844490216,
"grad_norm": 0.3276582956314087,
"learning_rate": 3.9523562829773036e-05,
"loss": 0.043,
"step": 10600
},
{
"epoch": 10.926879505664264,
"grad_norm": 0.27974191308021545,
"learning_rate": 3.943376017723057e-05,
"loss": 0.0357,
"step": 10610
},
{
"epoch": 10.937178166838311,
"grad_norm": 0.3858673572540283,
"learning_rate": 3.934399318850868e-05,
"loss": 0.0369,
"step": 10620
},
{
"epoch": 10.947476828012359,
"grad_norm": 0.29965823888778687,
"learning_rate": 3.925426216659438e-05,
"loss": 0.0369,
"step": 10630
},
{
"epoch": 10.957775489186405,
"grad_norm": 0.3583829998970032,
"learning_rate": 3.916456741435336e-05,
"loss": 0.0425,
"step": 10640
},
{
"epoch": 10.968074150360453,
"grad_norm": 0.27793335914611816,
"learning_rate": 3.9074909234528826e-05,
"loss": 0.0399,
"step": 10650
},
{
"epoch": 10.9783728115345,
"grad_norm": 0.24120087921619415,
"learning_rate": 3.898528792974056e-05,
"loss": 0.0403,
"step": 10660
},
{
"epoch": 10.988671472708548,
"grad_norm": 0.22013327479362488,
"learning_rate": 3.8895703802483916e-05,
"loss": 0.034,
"step": 10670
},
{
"epoch": 10.998970133882596,
"grad_norm": 0.2588166296482086,
"learning_rate": 3.880615715512868e-05,
"loss": 0.0316,
"step": 10680
},
{
"epoch": 11.009268795056643,
"grad_norm": 0.2514420449733734,
"learning_rate": 3.871664828991822e-05,
"loss": 0.0383,
"step": 10690
},
{
"epoch": 11.019567456230691,
"grad_norm": 0.3404804468154907,
"learning_rate": 3.862717750896837e-05,
"loss": 0.0352,
"step": 10700
},
{
"epoch": 11.029866117404737,
"grad_norm": 0.9497872591018677,
"learning_rate": 3.853774511426634e-05,
"loss": 0.0366,
"step": 10710
},
{
"epoch": 11.040164778578784,
"grad_norm": 0.28247174620628357,
"learning_rate": 3.844835140766988e-05,
"loss": 0.0473,
"step": 10720
},
{
"epoch": 11.050463439752832,
"grad_norm": 0.28879600763320923,
"learning_rate": 3.83589966909061e-05,
"loss": 0.0344,
"step": 10730
},
{
"epoch": 11.06076210092688,
"grad_norm": 0.23894581198692322,
"learning_rate": 3.82696812655705e-05,
"loss": 0.0349,
"step": 10740
},
{
"epoch": 11.071060762100927,
"grad_norm": 0.26289770007133484,
"learning_rate": 3.818040543312598e-05,
"loss": 0.0384,
"step": 10750
},
{
"epoch": 11.081359423274975,
"grad_norm": 0.33045023679733276,
"learning_rate": 3.809116949490184e-05,
"loss": 0.0331,
"step": 10760
},
{
"epoch": 11.091658084449021,
"grad_norm": 0.46705836057662964,
"learning_rate": 3.8001973752092655e-05,
"loss": 0.0386,
"step": 10770
},
{
"epoch": 11.101956745623069,
"grad_norm": 0.5863741040229797,
"learning_rate": 3.791281850575737e-05,
"loss": 0.0415,
"step": 10780
},
{
"epoch": 11.112255406797116,
"grad_norm": 0.24471549689769745,
"learning_rate": 3.782370405681828e-05,
"loss": 0.0372,
"step": 10790
},
{
"epoch": 11.122554067971164,
"grad_norm": 0.3259426951408386,
"learning_rate": 3.773463070605987e-05,
"loss": 0.043,
"step": 10800
},
{
"epoch": 11.132852729145212,
"grad_norm": 0.2583596408367157,
"learning_rate": 3.764559875412803e-05,
"loss": 0.0354,
"step": 10810
},
{
"epoch": 11.14315139031926,
"grad_norm": 0.46032634377479553,
"learning_rate": 3.7556608501528846e-05,
"loss": 0.0393,
"step": 10820
},
{
"epoch": 11.153450051493305,
"grad_norm": 0.38069912791252136,
"learning_rate": 3.7467660248627654e-05,
"loss": 0.0398,
"step": 10830
},
{
"epoch": 11.163748712667353,
"grad_norm": 0.28435567021369934,
"learning_rate": 3.737875429564807e-05,
"loss": 0.0388,
"step": 10840
},
{
"epoch": 11.1740473738414,
"grad_norm": 0.34043052792549133,
"learning_rate": 3.7289890942670946e-05,
"loss": 0.0296,
"step": 10850
},
{
"epoch": 11.184346035015448,
"grad_norm": 0.3213551938533783,
"learning_rate": 3.720107048963327e-05,
"loss": 0.0296,
"step": 10860
},
{
"epoch": 11.194644696189496,
"grad_norm": 0.45642250776290894,
"learning_rate": 3.711229323632732e-05,
"loss": 0.0347,
"step": 10870
},
{
"epoch": 11.204943357363543,
"grad_norm": 0.29973405599594116,
"learning_rate": 3.70235594823995e-05,
"loss": 0.036,
"step": 10880
},
{
"epoch": 11.215242018537591,
"grad_norm": 0.2634925842285156,
"learning_rate": 3.693486952734941e-05,
"loss": 0.0337,
"step": 10890
},
{
"epoch": 11.225540679711637,
"grad_norm": 0.25237777829170227,
"learning_rate": 3.684622367052887e-05,
"loss": 0.0347,
"step": 10900
},
{
"epoch": 11.235839340885684,
"grad_norm": 0.20709861814975739,
"learning_rate": 3.675762221114077e-05,
"loss": 0.0305,
"step": 10910
},
{
"epoch": 11.246138002059732,
"grad_norm": 0.14299030601978302,
"learning_rate": 3.66690654482382e-05,
"loss": 0.0334,
"step": 10920
},
{
"epoch": 11.25643666323378,
"grad_norm": 0.2454812377691269,
"learning_rate": 3.658055368072339e-05,
"loss": 0.0375,
"step": 10930
},
{
"epoch": 11.266735324407827,
"grad_norm": 0.2894679307937622,
"learning_rate": 3.6492087207346666e-05,
"loss": 0.0416,
"step": 10940
},
{
"epoch": 11.277033985581875,
"grad_norm": 0.2871219217777252,
"learning_rate": 3.640366632670549e-05,
"loss": 0.034,
"step": 10950
},
{
"epoch": 11.287332646755921,
"grad_norm": 0.30559393763542175,
"learning_rate": 3.631529133724348e-05,
"loss": 0.0369,
"step": 10960
},
{
"epoch": 11.297631307929969,
"grad_norm": 0.35164326429367065,
"learning_rate": 3.622696253724927e-05,
"loss": 0.035,
"step": 10970
},
{
"epoch": 11.307929969104016,
"grad_norm": 0.27396318316459656,
"learning_rate": 3.613868022485566e-05,
"loss": 0.0389,
"step": 10980
},
{
"epoch": 11.318228630278064,
"grad_norm": 0.27721869945526123,
"learning_rate": 3.605044469803854e-05,
"loss": 0.0365,
"step": 10990
},
{
"epoch": 11.328527291452112,
"grad_norm": 0.2726707458496094,
"learning_rate": 3.5962256254615853e-05,
"loss": 0.0382,
"step": 11000
},
{
"epoch": 11.33882595262616,
"grad_norm": 0.3522757589817047,
"learning_rate": 3.587411519224665e-05,
"loss": 0.0432,
"step": 11010
},
{
"epoch": 11.349124613800207,
"grad_norm": 0.2744219899177551,
"learning_rate": 3.5786021808430054e-05,
"loss": 0.0328,
"step": 11020
},
{
"epoch": 11.359423274974253,
"grad_norm": 0.36627647280693054,
"learning_rate": 3.569797640050423e-05,
"loss": 0.0407,
"step": 11030
},
{
"epoch": 11.3697219361483,
"grad_norm": 0.20793434977531433,
"learning_rate": 3.560997926564545e-05,
"loss": 0.0284,
"step": 11040
},
{
"epoch": 11.380020597322348,
"grad_norm": 0.23446743190288544,
"learning_rate": 3.552203070086707e-05,
"loss": 0.0355,
"step": 11050
},
{
"epoch": 11.390319258496396,
"grad_norm": 0.48527511954307556,
"learning_rate": 3.543413100301843e-05,
"loss": 0.0378,
"step": 11060
},
{
"epoch": 11.400617919670443,
"grad_norm": 0.39768174290657043,
"learning_rate": 3.534628046878403e-05,
"loss": 0.0329,
"step": 11070
},
{
"epoch": 11.410916580844491,
"grad_norm": 0.19781740009784698,
"learning_rate": 3.525847939468233e-05,
"loss": 0.0371,
"step": 11080
},
{
"epoch": 11.421215242018537,
"grad_norm": 0.2503238022327423,
"learning_rate": 3.517072807706492e-05,
"loss": 0.0363,
"step": 11090
},
{
"epoch": 11.431513903192585,
"grad_norm": 0.3444472849369049,
"learning_rate": 3.508302681211546e-05,
"loss": 0.0343,
"step": 11100
},
{
"epoch": 11.441812564366632,
"grad_norm": 0.3007254898548126,
"learning_rate": 3.499537589584859e-05,
"loss": 0.0441,
"step": 11110
},
{
"epoch": 11.45211122554068,
"grad_norm": 0.38914212584495544,
"learning_rate": 3.490777562410907e-05,
"loss": 0.0331,
"step": 11120
},
{
"epoch": 11.462409886714727,
"grad_norm": 0.3051401674747467,
"learning_rate": 3.482022629257074e-05,
"loss": 0.0328,
"step": 11130
},
{
"epoch": 11.472708547888775,
"grad_norm": 0.306740403175354,
"learning_rate": 3.473272819673542e-05,
"loss": 0.039,
"step": 11140
},
{
"epoch": 11.483007209062821,
"grad_norm": 0.42291760444641113,
"learning_rate": 3.4645281631932074e-05,
"loss": 0.0526,
"step": 11150
},
{
"epoch": 11.493305870236869,
"grad_norm": 0.2984221577644348,
"learning_rate": 3.455788689331574e-05,
"loss": 0.0345,
"step": 11160
},
{
"epoch": 11.503604531410916,
"grad_norm": 0.19411993026733398,
"learning_rate": 3.447054427586644e-05,
"loss": 0.0384,
"step": 11170
},
{
"epoch": 11.513903192584964,
"grad_norm": 0.3595150113105774,
"learning_rate": 3.438325407438837e-05,
"loss": 0.0358,
"step": 11180
},
{
"epoch": 11.524201853759012,
"grad_norm": 0.289594829082489,
"learning_rate": 3.4296016583508775e-05,
"loss": 0.0314,
"step": 11190
},
{
"epoch": 11.53450051493306,
"grad_norm": 0.3801267743110657,
"learning_rate": 3.420883209767697e-05,
"loss": 0.0453,
"step": 11200
},
{
"epoch": 11.544799176107105,
"grad_norm": 0.45930567383766174,
"learning_rate": 3.4121700911163366e-05,
"loss": 0.0418,
"step": 11210
},
{
"epoch": 11.555097837281153,
"grad_norm": 0.2295006662607193,
"learning_rate": 3.403462331805852e-05,
"loss": 0.0378,
"step": 11220
},
{
"epoch": 11.5653964984552,
"grad_norm": 0.38683414459228516,
"learning_rate": 3.394759961227202e-05,
"loss": 0.038,
"step": 11230
},
{
"epoch": 11.575695159629248,
"grad_norm": 0.32741764187812805,
"learning_rate": 3.386063008753164e-05,
"loss": 0.0403,
"step": 11240
},
{
"epoch": 11.585993820803296,
"grad_norm": 0.3826991319656372,
"learning_rate": 3.377371503738227e-05,
"loss": 0.0408,
"step": 11250
},
{
"epoch": 11.596292481977343,
"grad_norm": 0.5855404138565063,
"learning_rate": 3.368685475518488e-05,
"loss": 0.0343,
"step": 11260
},
{
"epoch": 11.606591143151391,
"grad_norm": 0.30145469307899475,
"learning_rate": 3.360004953411566e-05,
"loss": 0.0292,
"step": 11270
},
{
"epoch": 11.616889804325437,
"grad_norm": 1.2090197801589966,
"learning_rate": 3.3513299667164864e-05,
"loss": 0.0298,
"step": 11280
},
{
"epoch": 11.627188465499485,
"grad_norm": 0.7051903009414673,
"learning_rate": 3.3426605447136004e-05,
"loss": 0.0366,
"step": 11290
},
{
"epoch": 11.637487126673532,
"grad_norm": 0.3094668984413147,
"learning_rate": 3.3339967166644726e-05,
"loss": 0.0378,
"step": 11300
},
{
"epoch": 11.64778578784758,
"grad_norm": 0.3277672231197357,
"learning_rate": 3.325338511811784e-05,
"loss": 0.0407,
"step": 11310
},
{
"epoch": 11.658084449021628,
"grad_norm": 0.27167952060699463,
"learning_rate": 3.316685959379241e-05,
"loss": 0.0377,
"step": 11320
},
{
"epoch": 11.668383110195675,
"grad_norm": 0.5050401091575623,
"learning_rate": 3.308039088571469e-05,
"loss": 0.039,
"step": 11330
},
{
"epoch": 11.678681771369721,
"grad_norm": 0.23651434481143951,
"learning_rate": 3.2993979285739143e-05,
"loss": 0.0339,
"step": 11340
},
{
"epoch": 11.688980432543769,
"grad_norm": 0.3040764331817627,
"learning_rate": 3.2907625085527503e-05,
"loss": 0.0351,
"step": 11350
},
{
"epoch": 11.699279093717816,
"grad_norm": 0.23311540484428406,
"learning_rate": 3.28213285765478e-05,
"loss": 0.0347,
"step": 11360
},
{
"epoch": 11.709577754891864,
"grad_norm": 0.21837526559829712,
"learning_rate": 3.273509005007327e-05,
"loss": 0.0397,
"step": 11370
},
{
"epoch": 11.719876416065912,
"grad_norm": 0.24095067381858826,
"learning_rate": 3.264890979718147e-05,
"loss": 0.0335,
"step": 11380
},
{
"epoch": 11.73017507723996,
"grad_norm": 0.4714142680168152,
"learning_rate": 3.256278810875332e-05,
"loss": 0.0355,
"step": 11390
},
{
"epoch": 11.740473738414007,
"grad_norm": 0.3001396059989929,
"learning_rate": 3.247672527547197e-05,
"loss": 0.0311,
"step": 11400
},
{
"epoch": 11.750772399588053,
"grad_norm": 0.2514890730381012,
"learning_rate": 3.239072158782198e-05,
"loss": 0.0374,
"step": 11410
},
{
"epoch": 11.7610710607621,
"grad_norm": 0.22603774070739746,
"learning_rate": 3.230477733608831e-05,
"loss": 0.0368,
"step": 11420
},
{
"epoch": 11.771369721936148,
"grad_norm": 0.22810235619544983,
"learning_rate": 3.221889281035522e-05,
"loss": 0.0331,
"step": 11430
},
{
"epoch": 11.781668383110196,
"grad_norm": 0.18763025104999542,
"learning_rate": 3.2133068300505455e-05,
"loss": 0.0328,
"step": 11440
},
{
"epoch": 11.791967044284243,
"grad_norm": 0.32261693477630615,
"learning_rate": 3.204730409621917e-05,
"loss": 0.0408,
"step": 11450
},
{
"epoch": 11.802265705458291,
"grad_norm": 0.27985504269599915,
"learning_rate": 3.196160048697293e-05,
"loss": 0.0415,
"step": 11460
},
{
"epoch": 11.812564366632337,
"grad_norm": 0.28317996859550476,
"learning_rate": 3.187595776203886e-05,
"loss": 0.0413,
"step": 11470
},
{
"epoch": 11.822863027806385,
"grad_norm": 0.2768697440624237,
"learning_rate": 3.1790376210483494e-05,
"loss": 0.0433,
"step": 11480
},
{
"epoch": 11.833161688980432,
"grad_norm": 0.27718645334243774,
"learning_rate": 3.170485612116697e-05,
"loss": 0.028,
"step": 11490
},
{
"epoch": 11.84346035015448,
"grad_norm": 0.27956560254096985,
"learning_rate": 3.161939778274191e-05,
"loss": 0.0318,
"step": 11500
},
{
"epoch": 11.853759011328528,
"grad_norm": 0.25807636976242065,
"learning_rate": 3.1534001483652556e-05,
"loss": 0.0439,
"step": 11510
},
{
"epoch": 11.864057672502575,
"grad_norm": 0.6703087687492371,
"learning_rate": 3.14486675121337e-05,
"loss": 0.0298,
"step": 11520
},
{
"epoch": 11.874356333676623,
"grad_norm": 0.46335524320602417,
"learning_rate": 3.136339615620985e-05,
"loss": 0.0481,
"step": 11530
},
{
"epoch": 11.884654994850669,
"grad_norm": 0.250967800617218,
"learning_rate": 3.127818770369406e-05,
"loss": 0.0337,
"step": 11540
},
{
"epoch": 11.894953656024716,
"grad_norm": 0.2240300476551056,
"learning_rate": 3.119304244218715e-05,
"loss": 0.0327,
"step": 11550
},
{
"epoch": 11.905252317198764,
"grad_norm": 0.2884691655635834,
"learning_rate": 3.110796065907665e-05,
"loss": 0.0363,
"step": 11560
},
{
"epoch": 11.915550978372812,
"grad_norm": 0.28418871760368347,
"learning_rate": 3.102294264153577e-05,
"loss": 0.0325,
"step": 11570
},
{
"epoch": 11.92584963954686,
"grad_norm": 0.2494005262851715,
"learning_rate": 3.093798867652257e-05,
"loss": 0.0358,
"step": 11580
},
{
"epoch": 11.936148300720907,
"grad_norm": 0.43249595165252686,
"learning_rate": 3.0853099050778854e-05,
"loss": 0.0361,
"step": 11590
},
{
"epoch": 11.946446961894953,
"grad_norm": 0.32216548919677734,
"learning_rate": 3.0768274050829306e-05,
"loss": 0.0359,
"step": 11600
},
{
"epoch": 11.956745623069,
"grad_norm": 0.3839482069015503,
"learning_rate": 3.0683513962980456e-05,
"loss": 0.0338,
"step": 11610
},
{
"epoch": 11.967044284243048,
"grad_norm": 0.25899192690849304,
"learning_rate": 3.059881907331979e-05,
"loss": 0.0326,
"step": 11620
},
{
"epoch": 11.977342945417096,
"grad_norm": 0.2512173652648926,
"learning_rate": 3.0514189667714632e-05,
"loss": 0.0352,
"step": 11630
},
{
"epoch": 11.987641606591144,
"grad_norm": 0.43213722109794617,
"learning_rate": 3.042962603181138e-05,
"loss": 0.0395,
"step": 11640
},
{
"epoch": 11.997940267765191,
"grad_norm": 0.25386422872543335,
"learning_rate": 3.034512845103441e-05,
"loss": 0.0314,
"step": 11650
},
{
"epoch": 12.008238928939237,
"grad_norm": 0.35718950629234314,
"learning_rate": 3.0260697210585108e-05,
"loss": 0.0371,
"step": 11660
},
{
"epoch": 12.018537590113285,
"grad_norm": 0.29993295669555664,
"learning_rate": 3.017633259544101e-05,
"loss": 0.035,
"step": 11670
},
{
"epoch": 12.028836251287332,
"grad_norm": 0.3331249952316284,
"learning_rate": 3.0092034890354694e-05,
"loss": 0.0406,
"step": 11680
},
{
"epoch": 12.03913491246138,
"grad_norm": 0.22086752951145172,
"learning_rate": 3.0007804379852977e-05,
"loss": 0.0252,
"step": 11690
},
{
"epoch": 12.049433573635428,
"grad_norm": 0.22861167788505554,
"learning_rate": 2.9923641348235843e-05,
"loss": 0.0426,
"step": 11700
},
{
"epoch": 12.059732234809475,
"grad_norm": 0.26923444867134094,
"learning_rate": 2.9839546079575497e-05,
"loss": 0.0454,
"step": 11710
},
{
"epoch": 12.070030895983523,
"grad_norm": 0.23918205499649048,
"learning_rate": 2.9755518857715448e-05,
"loss": 0.0402,
"step": 11720
},
{
"epoch": 12.080329557157569,
"grad_norm": 0.23139654099941254,
"learning_rate": 2.967155996626956e-05,
"loss": 0.0303,
"step": 11730
},
{
"epoch": 12.090628218331616,
"grad_norm": 0.38359567523002625,
"learning_rate": 2.9587669688620988e-05,
"loss": 0.0398,
"step": 11740
},
{
"epoch": 12.100926879505664,
"grad_norm": 0.23274274170398712,
"learning_rate": 2.950384830792136e-05,
"loss": 0.0283,
"step": 11750
},
{
"epoch": 12.111225540679712,
"grad_norm": 0.29843324422836304,
"learning_rate": 2.942009610708976e-05,
"loss": 0.0339,
"step": 11760
},
{
"epoch": 12.12152420185376,
"grad_norm": 0.2866639494895935,
"learning_rate": 2.9336413368811723e-05,
"loss": 0.0325,
"step": 11770
},
{
"epoch": 12.131822863027807,
"grad_norm": 0.3042534589767456,
"learning_rate": 2.9252800375538368e-05,
"loss": 0.0355,
"step": 11780
},
{
"epoch": 12.142121524201853,
"grad_norm": 0.2678833305835724,
"learning_rate": 2.9169257409485418e-05,
"loss": 0.0329,
"step": 11790
},
{
"epoch": 12.1524201853759,
"grad_norm": 0.19894133508205414,
"learning_rate": 2.9085784752632157e-05,
"loss": 0.0383,
"step": 11800
},
{
"epoch": 12.162718846549948,
"grad_norm": 0.19369176030158997,
"learning_rate": 2.9002382686720676e-05,
"loss": 0.0303,
"step": 11810
},
{
"epoch": 12.173017507723996,
"grad_norm": 0.23142315447330475,
"learning_rate": 2.8919051493254724e-05,
"loss": 0.0404,
"step": 11820
},
{
"epoch": 12.183316168898044,
"grad_norm": 0.2168169468641281,
"learning_rate": 2.883579145349884e-05,
"loss": 0.0352,
"step": 11830
},
{
"epoch": 12.193614830072091,
"grad_norm": 0.27123361825942993,
"learning_rate": 2.8752602848477432e-05,
"loss": 0.0358,
"step": 11840
},
{
"epoch": 12.203913491246137,
"grad_norm": 1.34294593334198,
"learning_rate": 2.8669485958973775e-05,
"loss": 0.0336,
"step": 11850
},
{
"epoch": 12.214212152420185,
"grad_norm": 0.35292431712150574,
"learning_rate": 2.858644106552909e-05,
"loss": 0.0356,
"step": 11860
},
{
"epoch": 12.224510813594232,
"grad_norm": 0.5437068939208984,
"learning_rate": 2.850346844844157e-05,
"loss": 0.04,
"step": 11870
},
{
"epoch": 12.23480947476828,
"grad_norm": 0.7077152729034424,
"learning_rate": 2.8420568387765557e-05,
"loss": 0.0381,
"step": 11880
},
{
"epoch": 12.245108135942328,
"grad_norm": 1.2102924585342407,
"learning_rate": 2.8337741163310317e-05,
"loss": 0.0316,
"step": 11890
},
{
"epoch": 12.255406797116375,
"grad_norm": 0.22898398339748383,
"learning_rate": 2.825498705463947e-05,
"loss": 0.0355,
"step": 11900
},
{
"epoch": 12.265705458290423,
"grad_norm": 0.16343450546264648,
"learning_rate": 2.8172306341069672e-05,
"loss": 0.0333,
"step": 11910
},
{
"epoch": 12.276004119464469,
"grad_norm": 0.2778915762901306,
"learning_rate": 2.8089699301670002e-05,
"loss": 0.034,
"step": 11920
},
{
"epoch": 12.286302780638517,
"grad_norm": 0.2954021096229553,
"learning_rate": 2.800716621526078e-05,
"loss": 0.03,
"step": 11930
},
{
"epoch": 12.296601441812564,
"grad_norm": 0.18878135085105896,
"learning_rate": 2.7924707360412746e-05,
"loss": 0.0322,
"step": 11940
},
{
"epoch": 12.306900102986612,
"grad_norm": 0.25053462386131287,
"learning_rate": 2.7842323015446082e-05,
"loss": 0.0376,
"step": 11950
},
{
"epoch": 12.31719876416066,
"grad_norm": 0.21085461974143982,
"learning_rate": 2.7760013458429475e-05,
"loss": 0.0333,
"step": 11960
},
{
"epoch": 12.327497425334707,
"grad_norm": 0.27033373713493347,
"learning_rate": 2.767777896717919e-05,
"loss": 0.0387,
"step": 11970
},
{
"epoch": 12.337796086508753,
"grad_norm": 0.2603791356086731,
"learning_rate": 2.7595619819258116e-05,
"loss": 0.0336,
"step": 11980
},
{
"epoch": 12.3480947476828,
"grad_norm": 0.2735675573348999,
"learning_rate": 2.7513536291974895e-05,
"loss": 0.0367,
"step": 11990
},
{
"epoch": 12.358393408856848,
"grad_norm": 0.2710510790348053,
"learning_rate": 2.743152866238281e-05,
"loss": 0.0359,
"step": 12000
},
{
"epoch": 12.368692070030896,
"grad_norm": 0.3120410144329071,
"learning_rate": 2.7349597207279088e-05,
"loss": 0.0353,
"step": 12010
},
{
"epoch": 12.378990731204944,
"grad_norm": 1.238741159439087,
"learning_rate": 2.7267742203203795e-05,
"loss": 0.0328,
"step": 12020
},
{
"epoch": 12.389289392378991,
"grad_norm": 0.24720178544521332,
"learning_rate": 2.718596392643895e-05,
"loss": 0.035,
"step": 12030
},
{
"epoch": 12.399588053553039,
"grad_norm": 0.5230728387832642,
"learning_rate": 2.7104262653007616e-05,
"loss": 0.0385,
"step": 12040
},
{
"epoch": 12.409886714727085,
"grad_norm": 0.30197054147720337,
"learning_rate": 2.7022638658672933e-05,
"loss": 0.0378,
"step": 12050
},
{
"epoch": 12.420185375901132,
"grad_norm": 0.35036417841911316,
"learning_rate": 2.6941092218937214e-05,
"loss": 0.0316,
"step": 12060
},
{
"epoch": 12.43048403707518,
"grad_norm": 0.1900859922170639,
"learning_rate": 2.6859623609040984e-05,
"loss": 0.0416,
"step": 12070
},
{
"epoch": 12.440782698249228,
"grad_norm": 0.3137092888355255,
"learning_rate": 2.6778233103962158e-05,
"loss": 0.0347,
"step": 12080
},
{
"epoch": 12.451081359423275,
"grad_norm": 0.2586371600627899,
"learning_rate": 2.6696920978414862e-05,
"loss": 0.0313,
"step": 12090
},
{
"epoch": 12.461380020597323,
"grad_norm": 0.22871264815330505,
"learning_rate": 2.6615687506848864e-05,
"loss": 0.0384,
"step": 12100
},
{
"epoch": 12.471678681771369,
"grad_norm": 0.500694751739502,
"learning_rate": 2.6534532963448274e-05,
"loss": 0.0365,
"step": 12110
},
{
"epoch": 12.481977342945417,
"grad_norm": 0.23115640878677368,
"learning_rate": 2.645345762213094e-05,
"loss": 0.0359,
"step": 12120
},
{
"epoch": 12.492276004119464,
"grad_norm": 0.27199363708496094,
"learning_rate": 2.6372461756547306e-05,
"loss": 0.0367,
"step": 12130
},
{
"epoch": 12.502574665293512,
"grad_norm": 0.4970080256462097,
"learning_rate": 2.6291545640079583e-05,
"loss": 0.038,
"step": 12140
},
{
"epoch": 12.51287332646756,
"grad_norm": 0.31872427463531494,
"learning_rate": 2.6210709545840816e-05,
"loss": 0.0349,
"step": 12150
},
{
"epoch": 12.523171987641607,
"grad_norm": 0.543602705001831,
"learning_rate": 2.612995374667394e-05,
"loss": 0.0456,
"step": 12160
},
{
"epoch": 12.533470648815655,
"grad_norm": 0.24425791203975677,
"learning_rate": 2.6049278515150888e-05,
"loss": 0.0343,
"step": 12170
},
{
"epoch": 12.5437693099897,
"grad_norm": 0.32970938086509705,
"learning_rate": 2.5968684123571625e-05,
"loss": 0.0358,
"step": 12180
},
{
"epoch": 12.554067971163748,
"grad_norm": 0.24140028655529022,
"learning_rate": 2.5888170843963332e-05,
"loss": 0.0415,
"step": 12190
},
{
"epoch": 12.564366632337796,
"grad_norm": 0.1907021552324295,
"learning_rate": 2.5807738948079307e-05,
"loss": 0.0332,
"step": 12200
},
{
"epoch": 12.574665293511844,
"grad_norm": 0.2994469404220581,
"learning_rate": 2.572738870739827e-05,
"loss": 0.0332,
"step": 12210
},
{
"epoch": 12.584963954685891,
"grad_norm": 0.3281172811985016,
"learning_rate": 2.5647120393123246e-05,
"loss": 0.0355,
"step": 12220
},
{
"epoch": 12.595262615859939,
"grad_norm": 0.222566619515419,
"learning_rate": 2.5566934276180792e-05,
"loss": 0.0299,
"step": 12230
},
{
"epoch": 12.605561277033985,
"grad_norm": 0.38741955161094666,
"learning_rate": 2.5486830627219993e-05,
"loss": 0.0369,
"step": 12240
},
{
"epoch": 12.615859938208033,
"grad_norm": 0.24740222096443176,
"learning_rate": 2.540680971661161e-05,
"loss": 0.034,
"step": 12250
},
{
"epoch": 12.62615859938208,
"grad_norm": 0.2917155623435974,
"learning_rate": 2.5326871814447116e-05,
"loss": 0.0325,
"step": 12260
},
{
"epoch": 12.636457260556128,
"grad_norm": 0.3306695818901062,
"learning_rate": 2.5247017190537802e-05,
"loss": 0.0314,
"step": 12270
},
{
"epoch": 12.646755921730175,
"grad_norm": 0.3189143240451813,
"learning_rate": 2.5167246114413956e-05,
"loss": 0.0406,
"step": 12280
},
{
"epoch": 12.657054582904223,
"grad_norm": 0.27937018871307373,
"learning_rate": 2.5087558855323718e-05,
"loss": 0.037,
"step": 12290
},
{
"epoch": 12.667353244078269,
"grad_norm": 0.23929426074028015,
"learning_rate": 2.5007955682232498e-05,
"loss": 0.0366,
"step": 12300
},
{
"epoch": 12.677651905252317,
"grad_norm": 0.38764917850494385,
"learning_rate": 2.4928436863821725e-05,
"loss": 0.0357,
"step": 12310
},
{
"epoch": 12.687950566426364,
"grad_norm": 0.22392131388187408,
"learning_rate": 2.4849002668488245e-05,
"loss": 0.031,
"step": 12320
},
{
"epoch": 12.698249227600412,
"grad_norm": 0.35927116870880127,
"learning_rate": 2.4769653364343222e-05,
"loss": 0.0355,
"step": 12330
},
{
"epoch": 12.70854788877446,
"grad_norm": 0.3391915261745453,
"learning_rate": 2.4690389219211273e-05,
"loss": 0.0346,
"step": 12340
},
{
"epoch": 12.718846549948507,
"grad_norm": 0.21950756013393402,
"learning_rate": 2.4611210500629618e-05,
"loss": 0.0339,
"step": 12350
},
{
"epoch": 12.729145211122553,
"grad_norm": 0.22874067723751068,
"learning_rate": 2.453211747584711e-05,
"loss": 0.0347,
"step": 12360
},
{
"epoch": 12.7394438722966,
"grad_norm": 0.5297624468803406,
"learning_rate": 2.4453110411823382e-05,
"loss": 0.0308,
"step": 12370
},
{
"epoch": 12.749742533470648,
"grad_norm": 0.31514862179756165,
"learning_rate": 2.4374189575227902e-05,
"loss": 0.032,
"step": 12380
},
{
"epoch": 12.760041194644696,
"grad_norm": 0.26266971230506897,
"learning_rate": 2.429535523243917e-05,
"loss": 0.0357,
"step": 12390
},
{
"epoch": 12.770339855818744,
"grad_norm": 0.18397288024425507,
"learning_rate": 2.4216607649543628e-05,
"loss": 0.0307,
"step": 12400
},
{
"epoch": 12.780638516992791,
"grad_norm": 0.26537027955055237,
"learning_rate": 2.4137947092334994e-05,
"loss": 0.0363,
"step": 12410
},
{
"epoch": 12.790937178166839,
"grad_norm": 0.28661102056503296,
"learning_rate": 2.4059373826313185e-05,
"loss": 0.0306,
"step": 12420
},
{
"epoch": 12.801235839340885,
"grad_norm": 0.26964297890663147,
"learning_rate": 2.3980888116683515e-05,
"loss": 0.0324,
"step": 12430
},
{
"epoch": 12.811534500514933,
"grad_norm": 0.2776640057563782,
"learning_rate": 2.3902490228355756e-05,
"loss": 0.0329,
"step": 12440
},
{
"epoch": 12.82183316168898,
"grad_norm": 0.4814803898334503,
"learning_rate": 2.3824180425943277e-05,
"loss": 0.0303,
"step": 12450
},
{
"epoch": 12.832131822863028,
"grad_norm": 0.22867955267429352,
"learning_rate": 2.374595897376211e-05,
"loss": 0.0288,
"step": 12460
},
{
"epoch": 12.842430484037076,
"grad_norm": 0.21567359566688538,
"learning_rate": 2.366782613583009e-05,
"loss": 0.0325,
"step": 12470
},
{
"epoch": 12.852729145211123,
"grad_norm": 0.290703684091568,
"learning_rate": 2.3589782175866015e-05,
"loss": 0.0298,
"step": 12480
},
{
"epoch": 12.863027806385169,
"grad_norm": 0.3255325257778168,
"learning_rate": 2.3511827357288575e-05,
"loss": 0.0363,
"step": 12490
},
{
"epoch": 12.873326467559217,
"grad_norm": 0.44946736097335815,
"learning_rate": 2.343396194321572e-05,
"loss": 0.0332,
"step": 12500
},
{
"epoch": 12.883625128733264,
"grad_norm": 0.25294211506843567,
"learning_rate": 2.33561861964635e-05,
"loss": 0.0348,
"step": 12510
},
{
"epoch": 12.893923789907312,
"grad_norm": 0.18743322789669037,
"learning_rate": 2.3278500379545436e-05,
"loss": 0.0336,
"step": 12520
},
{
"epoch": 12.90422245108136,
"grad_norm": 0.16629280149936676,
"learning_rate": 2.3200904754671453e-05,
"loss": 0.0381,
"step": 12530
},
{
"epoch": 12.914521112255407,
"grad_norm": 0.1841958910226822,
"learning_rate": 2.312339958374705e-05,
"loss": 0.0273,
"step": 12540
},
{
"epoch": 12.924819773429455,
"grad_norm": 0.3820919096469879,
"learning_rate": 2.3045985128372442e-05,
"loss": 0.0354,
"step": 12550
},
{
"epoch": 12.9351184346035,
"grad_norm": 0.22891731560230255,
"learning_rate": 2.2968661649841643e-05,
"loss": 0.0393,
"step": 12560
},
{
"epoch": 12.945417095777549,
"grad_norm": 0.21805356442928314,
"learning_rate": 2.2891429409141594e-05,
"loss": 0.0312,
"step": 12570
},
{
"epoch": 12.955715756951596,
"grad_norm": 0.29530712962150574,
"learning_rate": 2.281428866695128e-05,
"loss": 0.034,
"step": 12580
},
{
"epoch": 12.966014418125644,
"grad_norm": 0.3417767286300659,
"learning_rate": 2.2737239683640908e-05,
"loss": 0.0291,
"step": 12590
},
{
"epoch": 12.976313079299691,
"grad_norm": 0.36338862776756287,
"learning_rate": 2.266028271927087e-05,
"loss": 0.0288,
"step": 12600
},
{
"epoch": 12.98661174047374,
"grad_norm": 0.18803521990776062,
"learning_rate": 2.258341803359108e-05,
"loss": 0.035,
"step": 12610
},
{
"epoch": 12.996910401647785,
"grad_norm": 0.2204011231660843,
"learning_rate": 2.2506645886039918e-05,
"loss": 0.0331,
"step": 12620
},
{
"epoch": 13.007209062821833,
"grad_norm": 0.23867210745811462,
"learning_rate": 2.242996653574345e-05,
"loss": 0.0327,
"step": 12630
},
{
"epoch": 13.01750772399588,
"grad_norm": 0.22372329235076904,
"learning_rate": 2.2353380241514515e-05,
"loss": 0.0313,
"step": 12640
},
{
"epoch": 13.027806385169928,
"grad_norm": 0.2398245483636856,
"learning_rate": 2.2276887261851875e-05,
"loss": 0.0405,
"step": 12650
},
{
"epoch": 13.038105046343976,
"grad_norm": 0.20746667683124542,
"learning_rate": 2.2200487854939322e-05,
"loss": 0.0332,
"step": 12660
},
{
"epoch": 13.048403707518023,
"grad_norm": 0.23980452120304108,
"learning_rate": 2.21241822786448e-05,
"loss": 0.0331,
"step": 12670
},
{
"epoch": 13.058702368692071,
"grad_norm": 0.2431352734565735,
"learning_rate": 2.204797079051962e-05,
"loss": 0.0337,
"step": 12680
},
{
"epoch": 13.069001029866117,
"grad_norm": 0.21622303128242493,
"learning_rate": 2.1971853647797415e-05,
"loss": 0.0369,
"step": 12690
},
{
"epoch": 13.079299691040164,
"grad_norm": 0.17636331915855408,
"learning_rate": 2.1895831107393484e-05,
"loss": 0.0385,
"step": 12700
},
{
"epoch": 13.089598352214212,
"grad_norm": 0.3212912976741791,
"learning_rate": 2.181990342590371e-05,
"loss": 0.0388,
"step": 12710
},
{
"epoch": 13.09989701338826,
"grad_norm": 0.4048994183540344,
"learning_rate": 2.1744070859603897e-05,
"loss": 0.0314,
"step": 12720
},
{
"epoch": 13.110195674562307,
"grad_norm": 0.2608017921447754,
"learning_rate": 2.1668333664448776e-05,
"loss": 0.0348,
"step": 12730
},
{
"epoch": 13.120494335736355,
"grad_norm": 0.22120167315006256,
"learning_rate": 2.1592692096071153e-05,
"loss": 0.0282,
"step": 12740
},
{
"epoch": 13.130792996910401,
"grad_norm": 0.22117048501968384,
"learning_rate": 2.1517146409781103e-05,
"loss": 0.0346,
"step": 12750
},
{
"epoch": 13.141091658084449,
"grad_norm": 0.2921169102191925,
"learning_rate": 2.1441696860565048e-05,
"loss": 0.0342,
"step": 12760
},
{
"epoch": 13.151390319258496,
"grad_norm": 0.22612257301807404,
"learning_rate": 2.1366343703084936e-05,
"loss": 0.0312,
"step": 12770
},
{
"epoch": 13.161688980432544,
"grad_norm": 0.27955397963523865,
"learning_rate": 2.1291087191677343e-05,
"loss": 0.0332,
"step": 12780
},
{
"epoch": 13.171987641606592,
"grad_norm": 0.2641075849533081,
"learning_rate": 2.121592758035273e-05,
"loss": 0.0368,
"step": 12790
},
{
"epoch": 13.18228630278064,
"grad_norm": 0.26150405406951904,
"learning_rate": 2.114086512279434e-05,
"loss": 0.0355,
"step": 12800
},
{
"epoch": 13.192584963954685,
"grad_norm": 0.2792717218399048,
"learning_rate": 2.1065900072357635e-05,
"loss": 0.029,
"step": 12810
},
{
"epoch": 13.202883625128733,
"grad_norm": 0.21909286081790924,
"learning_rate": 2.0991032682069246e-05,
"loss": 0.0379,
"step": 12820
},
{
"epoch": 13.21318228630278,
"grad_norm": 0.2866324782371521,
"learning_rate": 2.0916263204626162e-05,
"loss": 0.0282,
"step": 12830
},
{
"epoch": 13.223480947476828,
"grad_norm": 0.28694427013397217,
"learning_rate": 2.0841591892394925e-05,
"loss": 0.0399,
"step": 12840
},
{
"epoch": 13.233779608650876,
"grad_norm": 0.31920716166496277,
"learning_rate": 2.0767018997410713e-05,
"loss": 0.0365,
"step": 12850
},
{
"epoch": 13.244078269824923,
"grad_norm": 0.35022082924842834,
"learning_rate": 2.0692544771376543e-05,
"loss": 0.0264,
"step": 12860
},
{
"epoch": 13.254376930998971,
"grad_norm": 0.25149139761924744,
"learning_rate": 2.0618169465662364e-05,
"loss": 0.0302,
"step": 12870
},
{
"epoch": 13.264675592173017,
"grad_norm": 0.2645907402038574,
"learning_rate": 2.0543893331304333e-05,
"loss": 0.0328,
"step": 12880
},
{
"epoch": 13.274974253347064,
"grad_norm": 0.17596539855003357,
"learning_rate": 2.0469716619003725e-05,
"loss": 0.0328,
"step": 12890
},
{
"epoch": 13.285272914521112,
"grad_norm": 0.2291368991136551,
"learning_rate": 2.039563957912642e-05,
"loss": 0.0318,
"step": 12900
},
{
"epoch": 13.29557157569516,
"grad_norm": 0.21256229281425476,
"learning_rate": 2.0321662461701696e-05,
"loss": 0.0334,
"step": 12910
},
{
"epoch": 13.305870236869207,
"grad_norm": 0.30739450454711914,
"learning_rate": 2.024778551642172e-05,
"loss": 0.0321,
"step": 12920
},
{
"epoch": 13.316168898043255,
"grad_norm": 0.2791813015937805,
"learning_rate": 2.017400899264047e-05,
"loss": 0.0302,
"step": 12930
},
{
"epoch": 13.326467559217301,
"grad_norm": 0.3258625864982605,
"learning_rate": 2.0100333139372985e-05,
"loss": 0.0361,
"step": 12940
},
{
"epoch": 13.336766220391349,
"grad_norm": 0.2523643672466278,
"learning_rate": 2.0026758205294533e-05,
"loss": 0.0322,
"step": 12950
},
{
"epoch": 13.347064881565396,
"grad_norm": 0.2704935073852539,
"learning_rate": 1.9953284438739733e-05,
"loss": 0.0321,
"step": 12960
},
{
"epoch": 13.357363542739444,
"grad_norm": 0.45123302936553955,
"learning_rate": 1.9879912087701753e-05,
"loss": 0.0331,
"step": 12970
},
{
"epoch": 13.367662203913492,
"grad_norm": 1.1362191438674927,
"learning_rate": 1.9806641399831433e-05,
"loss": 0.0352,
"step": 12980
},
{
"epoch": 13.37796086508754,
"grad_norm": 0.3239549398422241,
"learning_rate": 1.9733472622436544e-05,
"loss": 0.0317,
"step": 12990
},
{
"epoch": 13.388259526261585,
"grad_norm": 0.20692795515060425,
"learning_rate": 1.9660406002480765e-05,
"loss": 0.0328,
"step": 13000
},
{
"epoch": 13.398558187435633,
"grad_norm": 0.24428331851959229,
"learning_rate": 1.9587441786583076e-05,
"loss": 0.0344,
"step": 13010
},
{
"epoch": 13.40885684860968,
"grad_norm": 0.17566567659378052,
"learning_rate": 1.951458022101676e-05,
"loss": 0.0346,
"step": 13020
},
{
"epoch": 13.419155509783728,
"grad_norm": 0.2601017951965332,
"learning_rate": 1.944182155170864e-05,
"loss": 0.0413,
"step": 13030
},
{
"epoch": 13.429454170957776,
"grad_norm": 0.22690336406230927,
"learning_rate": 1.9369166024238232e-05,
"loss": 0.039,
"step": 13040
},
{
"epoch": 13.439752832131823,
"grad_norm": 0.34189629554748535,
"learning_rate": 1.9296613883836945e-05,
"loss": 0.0297,
"step": 13050
},
{
"epoch": 13.450051493305871,
"grad_norm": 0.39015287160873413,
"learning_rate": 1.9224165375387193e-05,
"loss": 0.0352,
"step": 13060
},
{
"epoch": 13.460350154479917,
"grad_norm": 0.16422075033187866,
"learning_rate": 1.9151820743421617e-05,
"loss": 0.0298,
"step": 13070
},
{
"epoch": 13.470648815653965,
"grad_norm": 0.20099236071109772,
"learning_rate": 1.9079580232122303e-05,
"loss": 0.0271,
"step": 13080
},
{
"epoch": 13.480947476828012,
"grad_norm": 0.37444478273391724,
"learning_rate": 1.9007444085319786e-05,
"loss": 0.0382,
"step": 13090
},
{
"epoch": 13.49124613800206,
"grad_norm": 0.24139359593391418,
"learning_rate": 1.8935412546492486e-05,
"loss": 0.0334,
"step": 13100
},
{
"epoch": 13.501544799176108,
"grad_norm": 0.3007052540779114,
"learning_rate": 1.88634858587656e-05,
"loss": 0.0341,
"step": 13110
},
{
"epoch": 13.511843460350155,
"grad_norm": 0.30898720026016235,
"learning_rate": 1.8791664264910537e-05,
"loss": 0.0324,
"step": 13120
},
{
"epoch": 13.522142121524201,
"grad_norm": 0.3256855905056,
"learning_rate": 1.8719948007343936e-05,
"loss": 0.0376,
"step": 13130
},
{
"epoch": 13.532440782698249,
"grad_norm": 0.2092374563217163,
"learning_rate": 1.8648337328126906e-05,
"loss": 0.0298,
"step": 13140
},
{
"epoch": 13.542739443872296,
"grad_norm": 0.34433215856552124,
"learning_rate": 1.85768324689642e-05,
"loss": 0.0371,
"step": 13150
},
{
"epoch": 13.553038105046344,
"grad_norm": 0.47145530581474304,
"learning_rate": 1.850543367120341e-05,
"loss": 0.0389,
"step": 13160
},
{
"epoch": 13.563336766220392,
"grad_norm": 1.9276230335235596,
"learning_rate": 1.8434141175834125e-05,
"loss": 0.0356,
"step": 13170
},
{
"epoch": 13.57363542739444,
"grad_norm": 0.1196000725030899,
"learning_rate": 1.8362955223487143e-05,
"loss": 0.0292,
"step": 13180
},
{
"epoch": 13.583934088568487,
"grad_norm": 0.21239057183265686,
"learning_rate": 1.8291876054433693e-05,
"loss": 0.0314,
"step": 13190
},
{
"epoch": 13.594232749742533,
"grad_norm": 0.27161744236946106,
"learning_rate": 1.8220903908584492e-05,
"loss": 0.0323,
"step": 13200
},
{
"epoch": 13.60453141091658,
"grad_norm": 0.23213060200214386,
"learning_rate": 1.8150039025489113e-05,
"loss": 0.0335,
"step": 13210
},
{
"epoch": 13.614830072090628,
"grad_norm": 0.26432856917381287,
"learning_rate": 1.8079281644335055e-05,
"loss": 0.0348,
"step": 13220
},
{
"epoch": 13.625128733264676,
"grad_norm": 0.24627777934074402,
"learning_rate": 1.8008632003946957e-05,
"loss": 0.0308,
"step": 13230
},
{
"epoch": 13.635427394438723,
"grad_norm": 0.3506312966346741,
"learning_rate": 1.7938090342785817e-05,
"loss": 0.0379,
"step": 13240
},
{
"epoch": 13.645726055612771,
"grad_norm": 0.20565661787986755,
"learning_rate": 1.7867656898948187e-05,
"loss": 0.0338,
"step": 13250
},
{
"epoch": 13.656024716786817,
"grad_norm": 0.2677291929721832,
"learning_rate": 1.7797331910165336e-05,
"loss": 0.0325,
"step": 13260
},
{
"epoch": 13.666323377960865,
"grad_norm": 0.30942559242248535,
"learning_rate": 1.7727115613802465e-05,
"loss": 0.0365,
"step": 13270
},
{
"epoch": 13.676622039134912,
"grad_norm": 0.23922519385814667,
"learning_rate": 1.765700824685797e-05,
"loss": 0.0366,
"step": 13280
},
{
"epoch": 13.68692070030896,
"grad_norm": 0.18366648256778717,
"learning_rate": 1.758701004596247e-05,
"loss": 0.0305,
"step": 13290
},
{
"epoch": 13.697219361483008,
"grad_norm": 0.2875716984272003,
"learning_rate": 1.751712124737826e-05,
"loss": 0.0363,
"step": 13300
},
{
"epoch": 13.707518022657055,
"grad_norm": 0.3050890564918518,
"learning_rate": 1.744734208699822e-05,
"loss": 0.037,
"step": 13310
},
{
"epoch": 13.717816683831103,
"grad_norm": 0.24879583716392517,
"learning_rate": 1.7377672800345302e-05,
"loss": 0.0285,
"step": 13320
},
{
"epoch": 13.728115345005149,
"grad_norm": 0.22065865993499756,
"learning_rate": 1.7308113622571544e-05,
"loss": 0.0299,
"step": 13330
},
{
"epoch": 13.738414006179196,
"grad_norm": 0.1869887113571167,
"learning_rate": 1.7238664788457342e-05,
"loss": 0.0344,
"step": 13340
},
{
"epoch": 13.748712667353244,
"grad_norm": 0.21137484908103943,
"learning_rate": 1.7169326532410663e-05,
"loss": 0.0332,
"step": 13350
},
{
"epoch": 13.759011328527292,
"grad_norm": 0.3234722912311554,
"learning_rate": 1.7100099088466242e-05,
"loss": 0.0345,
"step": 13360
},
{
"epoch": 13.76930998970134,
"grad_norm": 0.2264581024646759,
"learning_rate": 1.7030982690284792e-05,
"loss": 0.0291,
"step": 13370
},
{
"epoch": 13.779608650875387,
"grad_norm": 0.29631558060646057,
"learning_rate": 1.69619775711522e-05,
"loss": 0.0361,
"step": 13380
},
{
"epoch": 13.789907312049433,
"grad_norm": 0.292219340801239,
"learning_rate": 1.689308396397882e-05,
"loss": 0.0256,
"step": 13390
},
{
"epoch": 13.80020597322348,
"grad_norm": 0.17191918194293976,
"learning_rate": 1.6824302101298526e-05,
"loss": 0.0349,
"step": 13400
},
{
"epoch": 13.810504634397528,
"grad_norm": 0.22219271957874298,
"learning_rate": 1.6755632215268118e-05,
"loss": 0.0316,
"step": 13410
},
{
"epoch": 13.820803295571576,
"grad_norm": 0.18818335235118866,
"learning_rate": 1.6687074537666398e-05,
"loss": 0.0325,
"step": 13420
},
{
"epoch": 13.831101956745623,
"grad_norm": 0.2848359942436218,
"learning_rate": 1.6618629299893434e-05,
"loss": 0.0327,
"step": 13430
},
{
"epoch": 13.841400617919671,
"grad_norm": 0.26240599155426025,
"learning_rate": 1.6550296732969795e-05,
"loss": 0.0321,
"step": 13440
},
{
"epoch": 13.851699279093717,
"grad_norm": 0.166743665933609,
"learning_rate": 1.648207706753575e-05,
"loss": 0.0361,
"step": 13450
},
{
"epoch": 13.861997940267765,
"grad_norm": 0.2783146798610687,
"learning_rate": 1.6413970533850498e-05,
"loss": 0.0395,
"step": 13460
},
{
"epoch": 13.872296601441812,
"grad_norm": 0.2442004680633545,
"learning_rate": 1.6345977361791366e-05,
"loss": 0.0385,
"step": 13470
},
{
"epoch": 13.88259526261586,
"grad_norm": 0.16581279039382935,
"learning_rate": 1.6278097780853136e-05,
"loss": 0.0356,
"step": 13480
},
{
"epoch": 13.892893923789908,
"grad_norm": 0.37210017442703247,
"learning_rate": 1.6210332020147055e-05,
"loss": 0.0363,
"step": 13490
},
{
"epoch": 13.903192584963955,
"grad_norm": 0.18403227627277374,
"learning_rate": 1.6142680308400338e-05,
"loss": 0.0389,
"step": 13500
},
{
"epoch": 13.913491246138001,
"grad_norm": 0.283448189496994,
"learning_rate": 1.6075142873955164e-05,
"loss": 0.0318,
"step": 13510
},
{
"epoch": 13.923789907312049,
"grad_norm": 0.24017812311649323,
"learning_rate": 1.6007719944768025e-05,
"loss": 0.035,
"step": 13520
},
{
"epoch": 13.934088568486096,
"grad_norm": 0.14648008346557617,
"learning_rate": 1.594041174840894e-05,
"loss": 0.0276,
"step": 13530
},
{
"epoch": 13.944387229660144,
"grad_norm": 0.31949880719184875,
"learning_rate": 1.587321851206061e-05,
"loss": 0.0312,
"step": 13540
},
{
"epoch": 13.954685890834192,
"grad_norm": 0.27566295862197876,
"learning_rate": 1.5806140462517828e-05,
"loss": 0.0308,
"step": 13550
},
{
"epoch": 13.96498455200824,
"grad_norm": 0.221617192029953,
"learning_rate": 1.573917782618651e-05,
"loss": 0.033,
"step": 13560
},
{
"epoch": 13.975283213182287,
"grad_norm": 0.15257342159748077,
"learning_rate": 1.567233082908306e-05,
"loss": 0.0272,
"step": 13570
},
{
"epoch": 13.985581874356333,
"grad_norm": 0.31881460547447205,
"learning_rate": 1.5605599696833544e-05,
"loss": 0.036,
"step": 13580
},
{
"epoch": 13.99588053553038,
"grad_norm": 0.21161913871765137,
"learning_rate": 1.5538984654673016e-05,
"loss": 0.0272,
"step": 13590
},
{
"epoch": 14.006179196704428,
"grad_norm": 0.22538325190544128,
"learning_rate": 1.5472485927444597e-05,
"loss": 0.023,
"step": 13600
},
{
"epoch": 14.016477857878476,
"grad_norm": 0.2999170422554016,
"learning_rate": 1.5406103739598903e-05,
"loss": 0.032,
"step": 13610
},
{
"epoch": 14.026776519052524,
"grad_norm": 0.26565343141555786,
"learning_rate": 1.5339838315193156e-05,
"loss": 0.031,
"step": 13620
},
{
"epoch": 14.037075180226571,
"grad_norm": 0.3137536942958832,
"learning_rate": 1.5273689877890485e-05,
"loss": 0.0302,
"step": 13630
},
{
"epoch": 14.047373841400617,
"grad_norm": 0.1854087859392166,
"learning_rate": 1.5207658650959138e-05,
"loss": 0.0345,
"step": 13640
},
{
"epoch": 14.057672502574665,
"grad_norm": 0.2928926348686218,
"learning_rate": 1.5141744857271778e-05,
"loss": 0.0334,
"step": 13650
},
{
"epoch": 14.067971163748712,
"grad_norm": 0.42930635809898376,
"learning_rate": 1.5075948719304672e-05,
"loss": 0.0272,
"step": 13660
},
{
"epoch": 14.07826982492276,
"grad_norm": 0.20846472680568695,
"learning_rate": 1.5010270459136966e-05,
"loss": 0.0331,
"step": 13670
},
{
"epoch": 14.088568486096808,
"grad_norm": 0.2335253208875656,
"learning_rate": 1.4944710298449999e-05,
"loss": 0.0312,
"step": 13680
},
{
"epoch": 14.098867147270855,
"grad_norm": 0.18406903743743896,
"learning_rate": 1.4879268458526379e-05,
"loss": 0.033,
"step": 13690
},
{
"epoch": 14.109165808444903,
"grad_norm": 0.26444944739341736,
"learning_rate": 1.481394516024947e-05,
"loss": 0.0282,
"step": 13700
},
{
"epoch": 14.119464469618949,
"grad_norm": 0.19681231677532196,
"learning_rate": 1.4748740624102459e-05,
"loss": 0.0354,
"step": 13710
},
{
"epoch": 14.129763130792997,
"grad_norm": 0.22566291689872742,
"learning_rate": 1.468365507016769e-05,
"loss": 0.0327,
"step": 13720
},
{
"epoch": 14.140061791967044,
"grad_norm": 0.24647872149944305,
"learning_rate": 1.4618688718125929e-05,
"loss": 0.0301,
"step": 13730
},
{
"epoch": 14.150360453141092,
"grad_norm": 0.2727005183696747,
"learning_rate": 1.455384178725555e-05,
"loss": 0.0261,
"step": 13740
},
{
"epoch": 14.16065911431514,
"grad_norm": 0.2636515200138092,
"learning_rate": 1.4489114496431938e-05,
"loss": 0.0362,
"step": 13750
},
{
"epoch": 14.170957775489187,
"grad_norm": 0.24423463642597198,
"learning_rate": 1.4424507064126597e-05,
"loss": 0.0308,
"step": 13760
},
{
"epoch": 14.181256436663233,
"grad_norm": 0.2822682559490204,
"learning_rate": 1.4360019708406487e-05,
"loss": 0.038,
"step": 13770
},
{
"epoch": 14.19155509783728,
"grad_norm": 0.19930243492126465,
"learning_rate": 1.4295652646933277e-05,
"loss": 0.0291,
"step": 13780
},
{
"epoch": 14.201853759011328,
"grad_norm": 0.1978948414325714,
"learning_rate": 1.4231406096962669e-05,
"loss": 0.0302,
"step": 13790
},
{
"epoch": 14.212152420185376,
"grad_norm": 0.17142613232135773,
"learning_rate": 1.4167280275343492e-05,
"loss": 0.0257,
"step": 13800
},
{
"epoch": 14.222451081359424,
"grad_norm": 0.2695595622062683,
"learning_rate": 1.4103275398517197e-05,
"loss": 0.0349,
"step": 13810
},
{
"epoch": 14.232749742533471,
"grad_norm": 0.23960620164871216,
"learning_rate": 1.4039391682516972e-05,
"loss": 0.0307,
"step": 13820
},
{
"epoch": 14.243048403707519,
"grad_norm": 0.279876172542572,
"learning_rate": 1.3975629342967001e-05,
"loss": 0.0334,
"step": 13830
},
{
"epoch": 14.253347064881565,
"grad_norm": 0.260696142911911,
"learning_rate": 1.3911988595081893e-05,
"loss": 0.0316,
"step": 13840
},
{
"epoch": 14.263645726055612,
"grad_norm": 0.24109739065170288,
"learning_rate": 1.3848469653665786e-05,
"loss": 0.0306,
"step": 13850
},
{
"epoch": 14.27394438722966,
"grad_norm": 0.3289351165294647,
"learning_rate": 1.378507273311171e-05,
"loss": 0.0362,
"step": 13860
},
{
"epoch": 14.284243048403708,
"grad_norm": 0.33488863706588745,
"learning_rate": 1.3721798047400813e-05,
"loss": 0.0408,
"step": 13870
},
{
"epoch": 14.294541709577755,
"grad_norm": 3.9080820083618164,
"learning_rate": 1.3658645810101755e-05,
"loss": 0.0278,
"step": 13880
},
{
"epoch": 14.304840370751803,
"grad_norm": 0.2996270954608917,
"learning_rate": 1.3595616234369762e-05,
"loss": 0.0277,
"step": 13890
},
{
"epoch": 14.315139031925849,
"grad_norm": 0.2796926498413086,
"learning_rate": 1.3532709532946186e-05,
"loss": 0.0328,
"step": 13900
},
{
"epoch": 14.325437693099897,
"grad_norm": 0.24468347430229187,
"learning_rate": 1.3469925918157567e-05,
"loss": 0.0327,
"step": 13910
},
{
"epoch": 14.335736354273944,
"grad_norm": 0.23212593793869019,
"learning_rate": 1.3407265601914976e-05,
"loss": 0.0317,
"step": 13920
},
{
"epoch": 14.346035015447992,
"grad_norm": 0.23879218101501465,
"learning_rate": 1.3344728795713413e-05,
"loss": 0.0365,
"step": 13930
},
{
"epoch": 14.35633367662204,
"grad_norm": 0.2575908303260803,
"learning_rate": 1.3282315710630882e-05,
"loss": 0.0385,
"step": 13940
},
{
"epoch": 14.366632337796087,
"grad_norm": 0.3186909556388855,
"learning_rate": 1.3220026557327898e-05,
"loss": 0.0403,
"step": 13950
},
{
"epoch": 14.376930998970133,
"grad_norm": 0.2613557279109955,
"learning_rate": 1.3157861546046613e-05,
"loss": 0.0328,
"step": 13960
},
{
"epoch": 14.38722966014418,
"grad_norm": 0.3558288514614105,
"learning_rate": 1.3095820886610188e-05,
"loss": 0.0293,
"step": 13970
},
{
"epoch": 14.397528321318228,
"grad_norm": 0.2622450292110443,
"learning_rate": 1.3033904788422047e-05,
"loss": 0.0261,
"step": 13980
},
{
"epoch": 14.407826982492276,
"grad_norm": 0.23433591425418854,
"learning_rate": 1.2972113460465246e-05,
"loss": 0.0286,
"step": 13990
},
{
"epoch": 14.418125643666324,
"grad_norm": 0.2427792251110077,
"learning_rate": 1.2910447111301604e-05,
"loss": 0.0316,
"step": 14000
},
{
"epoch": 14.428424304840371,
"grad_norm": 0.3044346570968628,
"learning_rate": 1.284890594907121e-05,
"loss": 0.0273,
"step": 14010
},
{
"epoch": 14.438722966014419,
"grad_norm": 0.16404663026332855,
"learning_rate": 1.2787490181491568e-05,
"loss": 0.0257,
"step": 14020
},
{
"epoch": 14.449021627188465,
"grad_norm": 0.26250144839286804,
"learning_rate": 1.2726200015856892e-05,
"loss": 0.0328,
"step": 14030
},
{
"epoch": 14.459320288362512,
"grad_norm": 0.7278460264205933,
"learning_rate": 1.2665035659037561e-05,
"loss": 0.0297,
"step": 14040
},
{
"epoch": 14.46961894953656,
"grad_norm": 0.34996357560157776,
"learning_rate": 1.2603997317479238e-05,
"loss": 0.0324,
"step": 14050
},
{
"epoch": 14.479917610710608,
"grad_norm": 0.44799286127090454,
"learning_rate": 1.2543085197202287e-05,
"loss": 0.036,
"step": 14060
},
{
"epoch": 14.490216271884655,
"grad_norm": 0.24697241187095642,
"learning_rate": 1.2482299503801016e-05,
"loss": 0.0315,
"step": 14070
},
{
"epoch": 14.500514933058703,
"grad_norm": 0.3266669511795044,
"learning_rate": 1.2421640442443055e-05,
"loss": 0.0351,
"step": 14080
},
{
"epoch": 14.510813594232749,
"grad_norm": 0.42595696449279785,
"learning_rate": 1.2361108217868544e-05,
"loss": 0.029,
"step": 14090
},
{
"epoch": 14.521112255406797,
"grad_norm": 0.28600630164146423,
"learning_rate": 1.23007030343896e-05,
"loss": 0.0288,
"step": 14100
},
{
"epoch": 14.531410916580844,
"grad_norm": 0.32830336689949036,
"learning_rate": 1.2240425095889495e-05,
"loss": 0.0323,
"step": 14110
},
{
"epoch": 14.541709577754892,
"grad_norm": 0.23947954177856445,
"learning_rate": 1.2180274605821989e-05,
"loss": 0.0301,
"step": 14120
},
{
"epoch": 14.55200823892894,
"grad_norm": 0.14854808151721954,
"learning_rate": 1.2120251767210755e-05,
"loss": 0.0305,
"step": 14130
},
{
"epoch": 14.562306900102987,
"grad_norm": 0.4753403961658478,
"learning_rate": 1.2060356782648503e-05,
"loss": 0.0333,
"step": 14140
},
{
"epoch": 14.572605561277033,
"grad_norm": 0.15201760828495026,
"learning_rate": 1.2000589854296507e-05,
"loss": 0.0348,
"step": 14150
},
{
"epoch": 14.58290422245108,
"grad_norm": 0.36805441975593567,
"learning_rate": 1.1940951183883742e-05,
"loss": 0.0315,
"step": 14160
},
{
"epoch": 14.593202883625128,
"grad_norm": 0.22207669913768768,
"learning_rate": 1.1881440972706315e-05,
"loss": 0.0299,
"step": 14170
},
{
"epoch": 14.603501544799176,
"grad_norm": 0.27251651883125305,
"learning_rate": 1.1822059421626724e-05,
"loss": 0.0364,
"step": 14180
},
{
"epoch": 14.613800205973224,
"grad_norm": 0.2771929204463959,
"learning_rate": 1.1762806731073261e-05,
"loss": 0.0272,
"step": 14190
},
{
"epoch": 14.624098867147271,
"grad_norm": 0.2667066156864166,
"learning_rate": 1.1703683101039197e-05,
"loss": 0.0271,
"step": 14200
},
{
"epoch": 14.634397528321319,
"grad_norm": 0.2355891466140747,
"learning_rate": 1.1644688731082242e-05,
"loss": 0.0299,
"step": 14210
},
{
"epoch": 14.644696189495365,
"grad_norm": 0.39315053820610046,
"learning_rate": 1.1585823820323843e-05,
"loss": 0.0334,
"step": 14220
},
{
"epoch": 14.654994850669413,
"grad_norm": 0.298880010843277,
"learning_rate": 1.1527088567448407e-05,
"loss": 0.0309,
"step": 14230
},
{
"epoch": 14.66529351184346,
"grad_norm": 0.21369227766990662,
"learning_rate": 1.1468483170702805e-05,
"loss": 0.0271,
"step": 14240
},
{
"epoch": 14.675592173017508,
"grad_norm": 0.21962594985961914,
"learning_rate": 1.141000782789554e-05,
"loss": 0.0296,
"step": 14250
},
{
"epoch": 14.685890834191555,
"grad_norm": 0.3962979316711426,
"learning_rate": 1.135166273639619e-05,
"loss": 0.0361,
"step": 14260
},
{
"epoch": 14.696189495365603,
"grad_norm": 0.2696010172367096,
"learning_rate": 1.1293448093134656e-05,
"loss": 0.0317,
"step": 14270
},
{
"epoch": 14.706488156539649,
"grad_norm": 0.16473254561424255,
"learning_rate": 1.1235364094600632e-05,
"loss": 0.0259,
"step": 14280
},
{
"epoch": 14.716786817713697,
"grad_norm": 0.18638800084590912,
"learning_rate": 1.1177410936842719e-05,
"loss": 0.0236,
"step": 14290
},
{
"epoch": 14.727085478887744,
"grad_norm": 0.35101962089538574,
"learning_rate": 1.1119588815468012e-05,
"loss": 0.0266,
"step": 14300
},
{
"epoch": 14.737384140061792,
"grad_norm": 0.2792340815067291,
"learning_rate": 1.1061897925641296e-05,
"loss": 0.0318,
"step": 14310
},
{
"epoch": 14.74768280123584,
"grad_norm": 0.19751253724098206,
"learning_rate": 1.100433846208434e-05,
"loss": 0.0294,
"step": 14320
},
{
"epoch": 14.757981462409887,
"grad_norm": 0.2783863842487335,
"learning_rate": 1.094691061907544e-05,
"loss": 0.0359,
"step": 14330
},
{
"epoch": 14.768280123583935,
"grad_norm": 0.2864331305027008,
"learning_rate": 1.088961459044852e-05,
"loss": 0.0289,
"step": 14340
},
{
"epoch": 14.77857878475798,
"grad_norm": 0.19958889484405518,
"learning_rate": 1.0832450569592684e-05,
"loss": 0.0296,
"step": 14350
},
{
"epoch": 14.788877445932028,
"grad_norm": 0.2572004199028015,
"learning_rate": 1.0775418749451427e-05,
"loss": 0.0299,
"step": 14360
},
{
"epoch": 14.799176107106076,
"grad_norm": 0.24685412645339966,
"learning_rate": 1.0718519322522053e-05,
"loss": 0.0346,
"step": 14370
},
{
"epoch": 14.809474768280124,
"grad_norm": 0.2643430829048157,
"learning_rate": 1.0661752480854975e-05,
"loss": 0.0253,
"step": 14380
},
{
"epoch": 14.819773429454171,
"grad_norm": 0.2792705297470093,
"learning_rate": 1.0605118416053162e-05,
"loss": 0.0295,
"step": 14390
},
{
"epoch": 14.830072090628219,
"grad_norm": 0.4018799662590027,
"learning_rate": 1.0548617319271342e-05,
"loss": 0.034,
"step": 14400
},
{
"epoch": 14.840370751802265,
"grad_norm": 0.20562392473220825,
"learning_rate": 1.049224938121548e-05,
"loss": 0.0386,
"step": 14410
},
{
"epoch": 14.850669412976313,
"grad_norm": 0.2107439637184143,
"learning_rate": 1.043601479214214e-05,
"loss": 0.038,
"step": 14420
},
{
"epoch": 14.86096807415036,
"grad_norm": 0.2785644829273224,
"learning_rate": 1.0379913741857699e-05,
"loss": 0.0308,
"step": 14430
},
{
"epoch": 14.871266735324408,
"grad_norm": 0.23650747537612915,
"learning_rate": 1.03239464197179e-05,
"loss": 0.0312,
"step": 14440
},
{
"epoch": 14.881565396498456,
"grad_norm": 0.2766387462615967,
"learning_rate": 1.0268113014627073e-05,
"loss": 0.0265,
"step": 14450
},
{
"epoch": 14.891864057672503,
"grad_norm": 0.2568782567977905,
"learning_rate": 1.021241371503755e-05,
"loss": 0.037,
"step": 14460
},
{
"epoch": 14.90216271884655,
"grad_norm": 0.18696804344654083,
"learning_rate": 1.0156848708949006e-05,
"loss": 0.0266,
"step": 14470
},
{
"epoch": 14.912461380020597,
"grad_norm": 0.23785705864429474,
"learning_rate": 1.0101418183907896e-05,
"loss": 0.0304,
"step": 14480
},
{
"epoch": 14.922760041194644,
"grad_norm": 0.2720486521720886,
"learning_rate": 1.004612232700669e-05,
"loss": 0.0359,
"step": 14490
},
{
"epoch": 14.933058702368692,
"grad_norm": 0.21330799162387848,
"learning_rate": 9.990961324883358e-06,
"loss": 0.0288,
"step": 14500
},
{
"epoch": 14.94335736354274,
"grad_norm": 0.24091622233390808,
"learning_rate": 9.935935363720728e-06,
"loss": 0.0275,
"step": 14510
},
{
"epoch": 14.953656024716787,
"grad_norm": 0.34269654750823975,
"learning_rate": 9.88104462924575e-06,
"loss": 0.0323,
"step": 14520
},
{
"epoch": 14.963954685890835,
"grad_norm": 0.23459886014461517,
"learning_rate": 9.826289306729052e-06,
"loss": 0.0293,
"step": 14530
},
{
"epoch": 14.97425334706488,
"grad_norm": 0.27133437991142273,
"learning_rate": 9.7716695809841e-06,
"loss": 0.0329,
"step": 14540
},
{
"epoch": 14.984552008238929,
"grad_norm": 0.24615567922592163,
"learning_rate": 9.717185636366783e-06,
"loss": 0.0317,
"step": 14550
},
{
"epoch": 14.994850669412976,
"grad_norm": 0.26164570450782776,
"learning_rate": 9.662837656774632e-06,
"loss": 0.031,
"step": 14560
},
{
"epoch": 15.005149330587024,
"grad_norm": 0.18910399079322815,
"learning_rate": 9.608625825646288e-06,
"loss": 0.0349,
"step": 14570
},
{
"epoch": 15.015447991761071,
"grad_norm": 0.3117832541465759,
"learning_rate": 9.554550325960853e-06,
"loss": 0.032,
"step": 14580
},
{
"epoch": 15.02574665293512,
"grad_norm": 0.22034838795661926,
"learning_rate": 9.500611340237258e-06,
"loss": 0.0301,
"step": 14590
},
{
"epoch": 15.036045314109165,
"grad_norm": 0.2756035029888153,
"learning_rate": 9.446809050533678e-06,
"loss": 0.0272,
"step": 14600
},
{
"epoch": 15.046343975283213,
"grad_norm": 0.3038906157016754,
"learning_rate": 9.393143638446889e-06,
"loss": 0.0327,
"step": 14610
},
{
"epoch": 15.05664263645726,
"grad_norm": 0.22907866537570953,
"learning_rate": 9.33961528511172e-06,
"loss": 0.0307,
"step": 14620
},
{
"epoch": 15.066941297631308,
"grad_norm": 0.4842381775379181,
"learning_rate": 9.286224171200297e-06,
"loss": 0.0284,
"step": 14630
},
{
"epoch": 15.077239958805356,
"grad_norm": 0.8235160112380981,
"learning_rate": 9.232970476921626e-06,
"loss": 0.0336,
"step": 14640
},
{
"epoch": 15.087538619979403,
"grad_norm": 0.4762952923774719,
"learning_rate": 9.17985438202082e-06,
"loss": 0.0315,
"step": 14650
},
{
"epoch": 15.097837281153451,
"grad_norm": 0.20582009851932526,
"learning_rate": 9.12687606577859e-06,
"loss": 0.0283,
"step": 14660
},
{
"epoch": 15.108135942327497,
"grad_norm": 0.20658078789710999,
"learning_rate": 9.074035707010575e-06,
"loss": 0.0277,
"step": 14670
},
{
"epoch": 15.118434603501544,
"grad_norm": 0.2650274336338043,
"learning_rate": 9.02133348406684e-06,
"loss": 0.031,
"step": 14680
},
{
"epoch": 15.128733264675592,
"grad_norm": 0.26044949889183044,
"learning_rate": 8.968769574831115e-06,
"loss": 0.0287,
"step": 14690
},
{
"epoch": 15.13903192584964,
"grad_norm": 0.25187498331069946,
"learning_rate": 8.916344156720335e-06,
"loss": 0.0301,
"step": 14700
},
{
"epoch": 15.149330587023687,
"grad_norm": 0.4505482017993927,
"learning_rate": 8.864057406684023e-06,
"loss": 0.0264,
"step": 14710
},
{
"epoch": 15.159629248197735,
"grad_norm": 0.2146962434053421,
"learning_rate": 8.81190950120357e-06,
"loss": 0.0386,
"step": 14720
},
{
"epoch": 15.169927909371781,
"grad_norm": 0.17643073201179504,
"learning_rate": 8.759900616291834e-06,
"loss": 0.0271,
"step": 14730
},
{
"epoch": 15.180226570545829,
"grad_norm": 0.3004768192768097,
"learning_rate": 8.708030927492345e-06,
"loss": 0.034,
"step": 14740
},
{
"epoch": 15.190525231719876,
"grad_norm": 0.33159592747688293,
"learning_rate": 8.656300609878898e-06,
"loss": 0.033,
"step": 14750
},
{
"epoch": 15.200823892893924,
"grad_norm": 0.2567281126976013,
"learning_rate": 8.604709838054813e-06,
"loss": 0.0325,
"step": 14760
},
{
"epoch": 15.211122554067972,
"grad_norm": 0.20799218118190765,
"learning_rate": 8.55325878615244e-06,
"loss": 0.0317,
"step": 14770
},
{
"epoch": 15.22142121524202,
"grad_norm": 0.2914055585861206,
"learning_rate": 8.501947627832507e-06,
"loss": 0.0308,
"step": 14780
},
{
"epoch": 15.231719876416065,
"grad_norm": 0.24458810687065125,
"learning_rate": 8.450776536283594e-06,
"loss": 0.0359,
"step": 14790
},
{
"epoch": 15.242018537590113,
"grad_norm": 0.30409494042396545,
"learning_rate": 8.399745684221499e-06,
"loss": 0.0357,
"step": 14800
},
{
"epoch": 15.25231719876416,
"grad_norm": 0.2720089852809906,
"learning_rate": 8.348855243888681e-06,
"loss": 0.0344,
"step": 14810
},
{
"epoch": 15.262615859938208,
"grad_norm": 0.25461846590042114,
"learning_rate": 8.2981053870537e-06,
"loss": 0.0325,
"step": 14820
},
{
"epoch": 15.272914521112256,
"grad_norm": 0.2355855405330658,
"learning_rate": 8.247496285010548e-06,
"loss": 0.0276,
"step": 14830
},
{
"epoch": 15.283213182286303,
"grad_norm": 0.1807708442211151,
"learning_rate": 8.197028108578197e-06,
"loss": 0.03,
"step": 14840
},
{
"epoch": 15.293511843460351,
"grad_norm": 0.21903660893440247,
"learning_rate": 8.146701028099917e-06,
"loss": 0.0254,
"step": 14850
},
{
"epoch": 15.303810504634397,
"grad_norm": 0.5081159472465515,
"learning_rate": 8.096515213442762e-06,
"loss": 0.0276,
"step": 14860
},
{
"epoch": 15.314109165808445,
"grad_norm": 0.22669517993927002,
"learning_rate": 8.046470833996973e-06,
"loss": 0.0272,
"step": 14870
},
{
"epoch": 15.324407826982492,
"grad_norm": 0.2578093409538269,
"learning_rate": 7.996568058675402e-06,
"loss": 0.0304,
"step": 14880
},
{
"epoch": 15.33470648815654,
"grad_norm": 0.20256255567073822,
"learning_rate": 7.946807055912959e-06,
"loss": 0.0292,
"step": 14890
},
{
"epoch": 15.345005149330587,
"grad_norm": 0.2500031888484955,
"learning_rate": 7.897187993666022e-06,
"loss": 0.0315,
"step": 14900
},
{
"epoch": 15.355303810504635,
"grad_norm": 0.2907675802707672,
"learning_rate": 7.84771103941192e-06,
"loss": 0.0341,
"step": 14910
},
{
"epoch": 15.365602471678681,
"grad_norm": 0.1547321081161499,
"learning_rate": 7.79837636014827e-06,
"loss": 0.0249,
"step": 14920
},
{
"epoch": 15.375901132852729,
"grad_norm": 0.2814120054244995,
"learning_rate": 7.749184122392539e-06,
"loss": 0.0365,
"step": 14930
},
{
"epoch": 15.386199794026776,
"grad_norm": 0.37319841980934143,
"learning_rate": 7.700134492181344e-06,
"loss": 0.0274,
"step": 14940
},
{
"epoch": 15.396498455200824,
"grad_norm": 0.24200180172920227,
"learning_rate": 7.651227635070041e-06,
"loss": 0.0306,
"step": 14950
},
{
"epoch": 15.406797116374872,
"grad_norm": 0.6322610378265381,
"learning_rate": 7.602463716132041e-06,
"loss": 0.0279,
"step": 14960
},
{
"epoch": 15.41709577754892,
"grad_norm": 0.43964508175849915,
"learning_rate": 7.553842899958308e-06,
"loss": 0.032,
"step": 14970
},
{
"epoch": 15.427394438722967,
"grad_norm": 0.3598411977291107,
"learning_rate": 7.505365350656812e-06,
"loss": 0.0275,
"step": 14980
},
{
"epoch": 15.437693099897013,
"grad_norm": 0.19508050382137299,
"learning_rate": 7.457031231851941e-06,
"loss": 0.034,
"step": 14990
},
{
"epoch": 15.44799176107106,
"grad_norm": 0.29256248474121094,
"learning_rate": 7.4088407066839784e-06,
"loss": 0.0387,
"step": 15000
},
{
"epoch": 15.458290422245108,
"grad_norm": 0.2301289290189743,
"learning_rate": 7.36079393780853e-06,
"loss": 0.0311,
"step": 15010
},
{
"epoch": 15.468589083419156,
"grad_norm": 0.29095834493637085,
"learning_rate": 7.312891087396034e-06,
"loss": 0.0259,
"step": 15020
},
{
"epoch": 15.478887744593203,
"grad_norm": 0.2932276129722595,
"learning_rate": 7.2651323171310795e-06,
"loss": 0.0293,
"step": 15030
},
{
"epoch": 15.489186405767251,
"grad_norm": 0.24277035892009735,
"learning_rate": 7.217517788212025e-06,
"loss": 0.0334,
"step": 15040
},
{
"epoch": 15.499485066941297,
"grad_norm": 0.23208442330360413,
"learning_rate": 7.170047661350349e-06,
"loss": 0.0296,
"step": 15050
},
{
"epoch": 15.509783728115345,
"grad_norm": 0.1625526398420334,
"learning_rate": 7.122722096770123e-06,
"loss": 0.0283,
"step": 15060
},
{
"epoch": 15.520082389289392,
"grad_norm": 0.29437604546546936,
"learning_rate": 7.075541254207502e-06,
"loss": 0.0284,
"step": 15070
},
{
"epoch": 15.53038105046344,
"grad_norm": 0.3337920308113098,
"learning_rate": 7.028505292910154e-06,
"loss": 0.0235,
"step": 15080
},
{
"epoch": 15.540679711637488,
"grad_norm": 0.16761137545108795,
"learning_rate": 6.981614371636747e-06,
"loss": 0.0261,
"step": 15090
},
{
"epoch": 15.550978372811535,
"grad_norm": 0.18191471695899963,
"learning_rate": 6.934868648656373e-06,
"loss": 0.0273,
"step": 15100
},
{
"epoch": 15.561277033985581,
"grad_norm": 0.2083984911441803,
"learning_rate": 6.8882682817481006e-06,
"loss": 0.0339,
"step": 15110
},
{
"epoch": 15.571575695159629,
"grad_norm": 0.33254730701446533,
"learning_rate": 6.841813428200306e-06,
"loss": 0.0335,
"step": 15120
},
{
"epoch": 15.581874356333676,
"grad_norm": 0.22721487283706665,
"learning_rate": 6.795504244810285e-06,
"loss": 0.0284,
"step": 15130
},
{
"epoch": 15.592173017507724,
"grad_norm": 0.3968798816204071,
"learning_rate": 6.749340887883626e-06,
"loss": 0.0326,
"step": 15140
},
{
"epoch": 15.602471678681772,
"grad_norm": 0.1721322387456894,
"learning_rate": 6.7033235132337225e-06,
"loss": 0.0267,
"step": 15150
},
{
"epoch": 15.61277033985582,
"grad_norm": 0.3585062026977539,
"learning_rate": 6.6574522761812366e-06,
"loss": 0.0297,
"step": 15160
},
{
"epoch": 15.623069001029865,
"grad_norm": 0.45918750762939453,
"learning_rate": 6.611727331553586e-06,
"loss": 0.0275,
"step": 15170
},
{
"epoch": 15.633367662203913,
"grad_norm": 0.3067721724510193,
"learning_rate": 6.566148833684399e-06,
"loss": 0.0287,
"step": 15180
},
{
"epoch": 15.64366632337796,
"grad_norm": 0.2751639187335968,
"learning_rate": 6.520716936413018e-06,
"loss": 0.0295,
"step": 15190
},
{
"epoch": 15.653964984552008,
"grad_norm": 0.21889840066432953,
"learning_rate": 6.475431793083974e-06,
"loss": 0.0321,
"step": 15200
},
{
"epoch": 15.664263645726056,
"grad_norm": 0.3290077745914459,
"learning_rate": 6.4302935565464514e-06,
"loss": 0.031,
"step": 15210
},
{
"epoch": 15.674562306900103,
"grad_norm": 0.5243391394615173,
"learning_rate": 6.385302379153818e-06,
"loss": 0.0248,
"step": 15220
},
{
"epoch": 15.684860968074151,
"grad_norm": 1.0162177085876465,
"learning_rate": 6.3404584127630115e-06,
"loss": 0.0243,
"step": 15230
},
{
"epoch": 15.695159629248197,
"grad_norm": 0.33608901500701904,
"learning_rate": 6.295761808734174e-06,
"loss": 0.0307,
"step": 15240
},
{
"epoch": 15.705458290422245,
"grad_norm": 0.2736285626888275,
"learning_rate": 6.251212717930017e-06,
"loss": 0.0341,
"step": 15250
},
{
"epoch": 15.715756951596292,
"grad_norm": 0.3048650920391083,
"learning_rate": 6.206811290715353e-06,
"loss": 0.035,
"step": 15260
},
{
"epoch": 15.72605561277034,
"grad_norm": 0.2898007929325104,
"learning_rate": 6.16255767695661e-06,
"loss": 0.0304,
"step": 15270
},
{
"epoch": 15.736354273944388,
"grad_norm": 0.2866269052028656,
"learning_rate": 6.118452026021299e-06,
"loss": 0.0344,
"step": 15280
},
{
"epoch": 15.746652935118435,
"grad_norm": 0.29790258407592773,
"learning_rate": 6.07449448677751e-06,
"loss": 0.0333,
"step": 15290
},
{
"epoch": 15.756951596292481,
"grad_norm": 0.33838725090026855,
"learning_rate": 6.030685207593423e-06,
"loss": 0.0345,
"step": 15300
},
{
"epoch": 15.767250257466529,
"grad_norm": 0.28657403588294983,
"learning_rate": 5.9870243363368275e-06,
"loss": 0.0321,
"step": 15310
},
{
"epoch": 15.777548918640576,
"grad_norm": 0.34499257802963257,
"learning_rate": 5.943512020374537e-06,
"loss": 0.0367,
"step": 15320
},
{
"epoch": 15.787847579814624,
"grad_norm": 0.2314077764749527,
"learning_rate": 5.90014840657202e-06,
"loss": 0.0351,
"step": 15330
},
{
"epoch": 15.798146240988672,
"grad_norm": 0.40013644099235535,
"learning_rate": 5.856933641292789e-06,
"loss": 0.0305,
"step": 15340
},
{
"epoch": 15.80844490216272,
"grad_norm": 0.6308583617210388,
"learning_rate": 5.813867870397977e-06,
"loss": 0.0331,
"step": 15350
},
{
"epoch": 15.818743563336767,
"grad_norm": 0.3136028051376343,
"learning_rate": 5.770951239245803e-06,
"loss": 0.0313,
"step": 15360
},
{
"epoch": 15.829042224510813,
"grad_norm": 0.18756185472011566,
"learning_rate": 5.72818389269113e-06,
"loss": 0.0261,
"step": 15370
},
{
"epoch": 15.83934088568486,
"grad_norm": 0.22854579985141754,
"learning_rate": 5.685565975084911e-06,
"loss": 0.0307,
"step": 15380
},
{
"epoch": 15.849639546858908,
"grad_norm": 0.18659406900405884,
"learning_rate": 5.643097630273769e-06,
"loss": 0.0293,
"step": 15390
},
{
"epoch": 15.859938208032956,
"grad_norm": 0.2682023048400879,
"learning_rate": 5.600779001599455e-06,
"loss": 0.0339,
"step": 15400
},
{
"epoch": 15.870236869207003,
"grad_norm": 0.29009154438972473,
"learning_rate": 5.558610231898393e-06,
"loss": 0.037,
"step": 15410
},
{
"epoch": 15.880535530381051,
"grad_norm": 0.32601863145828247,
"learning_rate": 5.516591463501231e-06,
"loss": 0.0322,
"step": 15420
},
{
"epoch": 15.890834191555097,
"grad_norm": 0.25241759419441223,
"learning_rate": 5.474722838232254e-06,
"loss": 0.0335,
"step": 15430
},
{
"epoch": 15.901132852729145,
"grad_norm": 0.34431523084640503,
"learning_rate": 5.433004497409039e-06,
"loss": 0.027,
"step": 15440
},
{
"epoch": 15.911431513903192,
"grad_norm": 0.24490360915660858,
"learning_rate": 5.391436581841886e-06,
"loss": 0.0287,
"step": 15450
},
{
"epoch": 15.92173017507724,
"grad_norm": 0.25288495421409607,
"learning_rate": 5.350019231833364e-06,
"loss": 0.0301,
"step": 15460
},
{
"epoch": 15.932028836251288,
"grad_norm": 0.23814049363136292,
"learning_rate": 5.3087525871778565e-06,
"loss": 0.0291,
"step": 15470
},
{
"epoch": 15.942327497425335,
"grad_norm": 0.2367774397134781,
"learning_rate": 5.2676367871610675e-06,
"loss": 0.0325,
"step": 15480
},
{
"epoch": 15.952626158599383,
"grad_norm": 0.20925898849964142,
"learning_rate": 5.226671970559577e-06,
"loss": 0.0307,
"step": 15490
},
{
"epoch": 15.962924819773429,
"grad_norm": 0.36154627799987793,
"learning_rate": 5.185858275640332e-06,
"loss": 0.0328,
"step": 15500
},
{
"epoch": 15.973223480947476,
"grad_norm": 0.25385522842407227,
"learning_rate": 5.145195840160239e-06,
"loss": 0.0299,
"step": 15510
},
{
"epoch": 15.983522142121524,
"grad_norm": 0.25496914982795715,
"learning_rate": 5.1046848013656165e-06,
"loss": 0.0292,
"step": 15520
},
{
"epoch": 15.993820803295572,
"grad_norm": 0.2563509941101074,
"learning_rate": 5.064325295991829e-06,
"loss": 0.0284,
"step": 15530
},
{
"epoch": 16.004119464469618,
"grad_norm": 0.2616461217403412,
"learning_rate": 5.024117460262751e-06,
"loss": 0.0439,
"step": 15540
},
{
"epoch": 16.014418125643665,
"grad_norm": 0.3009835481643677,
"learning_rate": 4.984061429890324e-06,
"loss": 0.0304,
"step": 15550
},
{
"epoch": 16.024716786817713,
"grad_norm": 0.29534780979156494,
"learning_rate": 4.94415734007413e-06,
"loss": 0.0319,
"step": 15560
},
{
"epoch": 16.03501544799176,
"grad_norm": 0.21110209822654724,
"learning_rate": 4.9044053255008935e-06,
"loss": 0.0309,
"step": 15570
},
{
"epoch": 16.04531410916581,
"grad_norm": 0.257237046957016,
"learning_rate": 4.864805520344051e-06,
"loss": 0.0274,
"step": 15580
},
{
"epoch": 16.055612770339856,
"grad_norm": 0.3104022741317749,
"learning_rate": 4.8253580582632906e-06,
"loss": 0.0294,
"step": 15590
},
{
"epoch": 16.065911431513904,
"grad_norm": 0.1543678343296051,
"learning_rate": 4.786063072404112e-06,
"loss": 0.0247,
"step": 15600
},
{
"epoch": 16.07621009268795,
"grad_norm": 0.18241259455680847,
"learning_rate": 4.7469206953973495e-06,
"loss": 0.0245,
"step": 15610
},
{
"epoch": 16.086508753862,
"grad_norm": 0.18561235070228577,
"learning_rate": 4.707931059358783e-06,
"loss": 0.0282,
"step": 15620
},
{
"epoch": 16.096807415036047,
"grad_norm": 0.36796221137046814,
"learning_rate": 4.669094295888588e-06,
"loss": 0.0323,
"step": 15630
},
{
"epoch": 16.107106076210094,
"grad_norm": 0.21030554175376892,
"learning_rate": 4.630410536071006e-06,
"loss": 0.0271,
"step": 15640
},
{
"epoch": 16.117404737384142,
"grad_norm": 0.23774808645248413,
"learning_rate": 4.59187991047384e-06,
"loss": 0.0319,
"step": 15650
},
{
"epoch": 16.127703398558186,
"grad_norm": 0.16403083503246307,
"learning_rate": 4.553502549148009e-06,
"loss": 0.0339,
"step": 15660
},
{
"epoch": 16.138002059732234,
"grad_norm": 0.23186904191970825,
"learning_rate": 4.515278581627141e-06,
"loss": 0.0301,
"step": 15670
},
{
"epoch": 16.14830072090628,
"grad_norm": 0.24327369034290314,
"learning_rate": 4.477208136927119e-06,
"loss": 0.0308,
"step": 15680
},
{
"epoch": 16.15859938208033,
"grad_norm": 0.2953716814517975,
"learning_rate": 4.439291343545643e-06,
"loss": 0.0281,
"step": 15690
},
{
"epoch": 16.168898043254377,
"grad_norm": 0.24078382551670074,
"learning_rate": 4.401528329461779e-06,
"loss": 0.0304,
"step": 15700
},
{
"epoch": 16.179196704428424,
"grad_norm": 0.3598305583000183,
"learning_rate": 4.363919222135604e-06,
"loss": 0.0279,
"step": 15710
},
{
"epoch": 16.189495365602472,
"grad_norm": 0.18711034953594208,
"learning_rate": 4.326464148507647e-06,
"loss": 0.0289,
"step": 15720
},
{
"epoch": 16.19979402677652,
"grad_norm": 0.3203088045120239,
"learning_rate": 4.289163234998589e-06,
"loss": 0.0334,
"step": 15730
},
{
"epoch": 16.210092687950567,
"grad_norm": 0.2985017001628876,
"learning_rate": 4.2520166075087635e-06,
"loss": 0.0246,
"step": 15740
},
{
"epoch": 16.220391349124615,
"grad_norm": 0.25471287965774536,
"learning_rate": 4.2150243914177325e-06,
"loss": 0.029,
"step": 15750
},
{
"epoch": 16.230690010298662,
"grad_norm": 0.22707876563072205,
"learning_rate": 4.178186711583904e-06,
"loss": 0.0258,
"step": 15760
},
{
"epoch": 16.24098867147271,
"grad_norm": 0.2530466914176941,
"learning_rate": 4.141503692344062e-06,
"loss": 0.0324,
"step": 15770
},
{
"epoch": 16.251287332646754,
"grad_norm": 0.23593966662883759,
"learning_rate": 4.1049754575129935e-06,
"loss": 0.0299,
"step": 15780
},
{
"epoch": 16.261585993820802,
"grad_norm": 0.26746660470962524,
"learning_rate": 4.068602130383031e-06,
"loss": 0.025,
"step": 15790
},
{
"epoch": 16.27188465499485,
"grad_norm": 0.3687654733657837,
"learning_rate": 4.032383833723657e-06,
"loss": 0.0344,
"step": 15800
},
{
"epoch": 16.282183316168897,
"grad_norm": 0.26962026953697205,
"learning_rate": 3.99632068978108e-06,
"loss": 0.0315,
"step": 15810
},
{
"epoch": 16.292481977342945,
"grad_norm": 0.3096659779548645,
"learning_rate": 3.960412820277865e-06,
"loss": 0.0241,
"step": 15820
},
{
"epoch": 16.302780638516992,
"grad_norm": 0.3644077777862549,
"learning_rate": 3.924660346412418e-06,
"loss": 0.0348,
"step": 15830
},
{
"epoch": 16.31307929969104,
"grad_norm": 0.2755933701992035,
"learning_rate": 3.8890633888587046e-06,
"loss": 0.0309,
"step": 15840
},
{
"epoch": 16.323377960865088,
"grad_norm": 0.5915675163269043,
"learning_rate": 3.8536220677657495e-06,
"loss": 0.0314,
"step": 15850
},
{
"epoch": 16.333676622039135,
"grad_norm": 0.2403060346841812,
"learning_rate": 3.8183365027572805e-06,
"loss": 0.0304,
"step": 15860
},
{
"epoch": 16.343975283213183,
"grad_norm": 0.24288389086723328,
"learning_rate": 3.783206812931289e-06,
"loss": 0.0291,
"step": 15870
},
{
"epoch": 16.35427394438723,
"grad_norm": 0.3532700836658478,
"learning_rate": 3.7482331168596675e-06,
"loss": 0.0289,
"step": 15880
},
{
"epoch": 16.36457260556128,
"grad_norm": 0.18153394758701324,
"learning_rate": 3.7134155325877772e-06,
"loss": 0.0329,
"step": 15890
},
{
"epoch": 16.374871266735326,
"grad_norm": 0.4066762924194336,
"learning_rate": 3.678754177634053e-06,
"loss": 0.0293,
"step": 15900
},
{
"epoch": 16.38516992790937,
"grad_norm": 0.33672627806663513,
"learning_rate": 3.64424916898965e-06,
"loss": 0.0303,
"step": 15910
},
{
"epoch": 16.395468589083418,
"grad_norm": 0.273366242647171,
"learning_rate": 3.6099006231179622e-06,
"loss": 0.0307,
"step": 15920
},
{
"epoch": 16.405767250257465,
"grad_norm": 0.22325216233730316,
"learning_rate": 3.575708655954324e-06,
"loss": 0.0327,
"step": 15930
},
{
"epoch": 16.416065911431513,
"grad_norm": 0.18643653392791748,
"learning_rate": 3.541673382905558e-06,
"loss": 0.0346,
"step": 15940
},
{
"epoch": 16.42636457260556,
"grad_norm": 0.2503977119922638,
"learning_rate": 3.5077949188495996e-06,
"loss": 0.033,
"step": 15950
},
{
"epoch": 16.43666323377961,
"grad_norm": 0.29063940048217773,
"learning_rate": 3.474073378135123e-06,
"loss": 0.0286,
"step": 15960
},
{
"epoch": 16.446961894953656,
"grad_norm": 0.2275126725435257,
"learning_rate": 3.440508874581139e-06,
"loss": 0.0321,
"step": 15970
},
{
"epoch": 16.457260556127704,
"grad_norm": 0.24945175647735596,
"learning_rate": 3.4071015214766134e-06,
"loss": 0.0312,
"step": 15980
},
{
"epoch": 16.46755921730175,
"grad_norm": 0.4091668725013733,
"learning_rate": 3.3738514315800995e-06,
"loss": 0.0351,
"step": 15990
},
{
"epoch": 16.4778578784758,
"grad_norm": 0.20869703590869904,
"learning_rate": 3.3407587171193354e-06,
"loss": 0.0262,
"step": 16000
},
{
"epoch": 16.488156539649847,
"grad_norm": 0.19803866744041443,
"learning_rate": 3.3078234897908788e-06,
"loss": 0.0293,
"step": 16010
},
{
"epoch": 16.498455200823894,
"grad_norm": 0.24785685539245605,
"learning_rate": 3.2750458607597457e-06,
"loss": 0.0295,
"step": 16020
},
{
"epoch": 16.508753861997942,
"grad_norm": 0.23679105937480927,
"learning_rate": 3.2424259406589664e-06,
"loss": 0.0269,
"step": 16030
},
{
"epoch": 16.519052523171986,
"grad_norm": 0.21375852823257446,
"learning_rate": 3.209963839589325e-06,
"loss": 0.0236,
"step": 16040
},
{
"epoch": 16.529351184346034,
"grad_norm": 0.1723773181438446,
"learning_rate": 3.177659667118882e-06,
"loss": 0.0312,
"step": 16050
},
{
"epoch": 16.53964984552008,
"grad_norm": 0.24385997653007507,
"learning_rate": 3.1455135322826678e-06,
"loss": 0.0301,
"step": 16060
},
{
"epoch": 16.54994850669413,
"grad_norm": 0.2073340266942978,
"learning_rate": 3.1135255435822796e-06,
"loss": 0.0286,
"step": 16070
},
{
"epoch": 16.560247167868177,
"grad_norm": 0.2794674336910248,
"learning_rate": 3.0816958089855462e-06,
"loss": 0.0265,
"step": 16080
},
{
"epoch": 16.570545829042224,
"grad_norm": 0.2308894544839859,
"learning_rate": 3.0500244359261355e-06,
"loss": 0.0284,
"step": 16090
},
{
"epoch": 16.580844490216272,
"grad_norm": 0.2674751579761505,
"learning_rate": 3.018511531303203e-06,
"loss": 0.0282,
"step": 16100
},
{
"epoch": 16.59114315139032,
"grad_norm": 0.20278188586235046,
"learning_rate": 2.9871572014810555e-06,
"loss": 0.0272,
"step": 16110
},
{
"epoch": 16.601441812564367,
"grad_norm": 0.20840872824192047,
"learning_rate": 2.9559615522887273e-06,
"loss": 0.0358,
"step": 16120
},
{
"epoch": 16.611740473738415,
"grad_norm": 0.26591232419013977,
"learning_rate": 2.924924689019698e-06,
"loss": 0.0262,
"step": 16130
},
{
"epoch": 16.622039134912463,
"grad_norm": 0.22082144021987915,
"learning_rate": 2.8940467164314924e-06,
"loss": 0.0321,
"step": 16140
},
{
"epoch": 16.63233779608651,
"grad_norm": 0.2413538098335266,
"learning_rate": 2.8633277387453308e-06,
"loss": 0.0377,
"step": 16150
},
{
"epoch": 16.642636457260558,
"grad_norm": 0.2731287479400635,
"learning_rate": 2.8327678596457963e-06,
"loss": 0.031,
"step": 16160
},
{
"epoch": 16.652935118434602,
"grad_norm": 0.18613195419311523,
"learning_rate": 2.802367182280463e-06,
"loss": 0.0367,
"step": 16170
},
{
"epoch": 16.66323377960865,
"grad_norm": 0.19616888463497162,
"learning_rate": 2.7721258092595627e-06,
"loss": 0.0265,
"step": 16180
},
{
"epoch": 16.673532440782697,
"grad_norm": 0.20527370274066925,
"learning_rate": 2.7420438426556338e-06,
"loss": 0.0331,
"step": 16190
},
{
"epoch": 16.683831101956745,
"grad_norm": 0.21385008096694946,
"learning_rate": 2.712121384003169e-06,
"loss": 0.0271,
"step": 16200
},
{
"epoch": 16.694129763130793,
"grad_norm": 0.2785768210887909,
"learning_rate": 2.682358534298285e-06,
"loss": 0.0365,
"step": 16210
},
{
"epoch": 16.70442842430484,
"grad_norm": 0.2710186243057251,
"learning_rate": 2.652755393998396e-06,
"loss": 0.0245,
"step": 16220
},
{
"epoch": 16.714727085478888,
"grad_norm": 0.2453254610300064,
"learning_rate": 2.6233120630218045e-06,
"loss": 0.0327,
"step": 16230
},
{
"epoch": 16.725025746652936,
"grad_norm": 0.2788352072238922,
"learning_rate": 2.594028640747476e-06,
"loss": 0.0292,
"step": 16240
},
{
"epoch": 16.735324407826983,
"grad_norm": 0.4019950032234192,
"learning_rate": 2.564905226014597e-06,
"loss": 0.029,
"step": 16250
},
{
"epoch": 16.74562306900103,
"grad_norm": 0.2551436424255371,
"learning_rate": 2.5359419171223086e-06,
"loss": 0.0296,
"step": 16260
},
{
"epoch": 16.75592173017508,
"grad_norm": 0.2889397442340851,
"learning_rate": 2.507138811829346e-06,
"loss": 0.033,
"step": 16270
},
{
"epoch": 16.766220391349126,
"grad_norm": 0.25674816966056824,
"learning_rate": 2.4784960073537143e-06,
"loss": 0.0267,
"step": 16280
},
{
"epoch": 16.77651905252317,
"grad_norm": 0.21177352964878082,
"learning_rate": 2.4500136003723638e-06,
"loss": 0.0262,
"step": 16290
},
{
"epoch": 16.786817713697218,
"grad_norm": 0.21103815734386444,
"learning_rate": 2.421691687020855e-06,
"loss": 0.0295,
"step": 16300
},
{
"epoch": 16.797116374871266,
"grad_norm": 0.26780322194099426,
"learning_rate": 2.3935303628930707e-06,
"loss": 0.0327,
"step": 16310
},
{
"epoch": 16.807415036045313,
"grad_norm": 0.49311545491218567,
"learning_rate": 2.3655297230408045e-06,
"loss": 0.03,
"step": 16320
},
{
"epoch": 16.81771369721936,
"grad_norm": 0.2364225834608078,
"learning_rate": 2.3376898619735577e-06,
"loss": 0.0276,
"step": 16330
},
{
"epoch": 16.82801235839341,
"grad_norm": 0.29716435074806213,
"learning_rate": 2.3100108736581305e-06,
"loss": 0.027,
"step": 16340
},
{
"epoch": 16.838311019567456,
"grad_norm": 0.20759916305541992,
"learning_rate": 2.282492851518342e-06,
"loss": 0.0275,
"step": 16350
},
{
"epoch": 16.848609680741504,
"grad_norm": 0.1657613217830658,
"learning_rate": 2.2551358884347007e-06,
"loss": 0.0273,
"step": 16360
},
{
"epoch": 16.85890834191555,
"grad_norm": 0.16528256237506866,
"learning_rate": 2.227940076744117e-06,
"loss": 0.0309,
"step": 16370
},
{
"epoch": 16.8692070030896,
"grad_norm": 0.28386402130126953,
"learning_rate": 2.2009055082395537e-06,
"loss": 0.0324,
"step": 16380
},
{
"epoch": 16.879505664263647,
"grad_norm": 0.23188601434230804,
"learning_rate": 2.174032274169746e-06,
"loss": 0.0283,
"step": 16390
},
{
"epoch": 16.889804325437694,
"grad_norm": 0.34195181727409363,
"learning_rate": 2.1473204652388834e-06,
"loss": 0.031,
"step": 16400
},
{
"epoch": 16.900102986611742,
"grad_norm": 0.19225898385047913,
"learning_rate": 2.1207701716062956e-06,
"loss": 0.0374,
"step": 16410
},
{
"epoch": 16.910401647785786,
"grad_norm": 0.4472239911556244,
"learning_rate": 2.0943814828861762e-06,
"loss": 0.0304,
"step": 16420
},
{
"epoch": 16.920700308959834,
"grad_norm": 0.26532843708992004,
"learning_rate": 2.0681544881472283e-06,
"loss": 0.0291,
"step": 16430
},
{
"epoch": 16.93099897013388,
"grad_norm": 0.27116134762763977,
"learning_rate": 2.0420892759124176e-06,
"loss": 0.0224,
"step": 16440
},
{
"epoch": 16.94129763130793,
"grad_norm": 0.3424379825592041,
"learning_rate": 2.0161859341586597e-06,
"loss": 0.0274,
"step": 16450
},
{
"epoch": 16.951596292481977,
"grad_norm": 0.23772460222244263,
"learning_rate": 1.9904445503164838e-06,
"loss": 0.0308,
"step": 16460
},
{
"epoch": 16.961894953656024,
"grad_norm": 0.23013190925121307,
"learning_rate": 1.964865211269801e-06,
"loss": 0.0265,
"step": 16470
},
{
"epoch": 16.972193614830072,
"grad_norm": 0.2528025805950165,
"learning_rate": 1.939448003355554e-06,
"loss": 0.0342,
"step": 16480
},
{
"epoch": 16.98249227600412,
"grad_norm": 0.39106324315071106,
"learning_rate": 1.914193012363469e-06,
"loss": 0.0326,
"step": 16490
},
{
"epoch": 16.992790937178167,
"grad_norm": 0.4082978069782257,
"learning_rate": 1.8891003235357308e-06,
"loss": 0.0321,
"step": 16500
},
{
"epoch": 17.003089598352215,
"grad_norm": 0.1785215586423874,
"learning_rate": 1.8641700215667413e-06,
"loss": 0.0265,
"step": 16510
},
{
"epoch": 17.013388259526263,
"grad_norm": 0.5540566444396973,
"learning_rate": 1.839402190602757e-06,
"loss": 0.0281,
"step": 16520
},
{
"epoch": 17.02368692070031,
"grad_norm": 0.2588430941104889,
"learning_rate": 1.8147969142417066e-06,
"loss": 0.0284,
"step": 16530
},
{
"epoch": 17.033985581874358,
"grad_norm": 0.3563145399093628,
"learning_rate": 1.7903542755328073e-06,
"loss": 0.0308,
"step": 16540
},
{
"epoch": 17.044284243048402,
"grad_norm": 0.303353488445282,
"learning_rate": 1.766074356976366e-06,
"loss": 0.0302,
"step": 16550
},
{
"epoch": 17.05458290422245,
"grad_norm": 0.24329645931720734,
"learning_rate": 1.7419572405234453e-06,
"loss": 0.0282,
"step": 16560
},
{
"epoch": 17.064881565396497,
"grad_norm": 0.212374746799469,
"learning_rate": 1.7180030075756136e-06,
"loss": 0.0298,
"step": 16570
},
{
"epoch": 17.075180226570545,
"grad_norm": 0.22339214384555817,
"learning_rate": 1.6942117389846746e-06,
"loss": 0.0314,
"step": 16580
},
{
"epoch": 17.085478887744593,
"grad_norm": 0.2897525131702423,
"learning_rate": 1.6705835150523707e-06,
"loss": 0.0331,
"step": 16590
},
{
"epoch": 17.09577754891864,
"grad_norm": 0.20139732956886292,
"learning_rate": 1.6471184155301355e-06,
"loss": 0.0271,
"step": 16600
},
{
"epoch": 17.106076210092688,
"grad_norm": 0.30817776918411255,
"learning_rate": 1.6238165196188039e-06,
"loss": 0.0288,
"step": 16610
},
{
"epoch": 17.116374871266736,
"grad_norm": 0.23742049932479858,
"learning_rate": 1.6006779059683784e-06,
"loss": 0.0317,
"step": 16620
},
{
"epoch": 17.126673532440783,
"grad_norm": 0.2712803781032562,
"learning_rate": 1.5777026526777094e-06,
"loss": 0.029,
"step": 16630
},
{
"epoch": 17.13697219361483,
"grad_norm": 0.19828765094280243,
"learning_rate": 1.5548908372942983e-06,
"loss": 0.0315,
"step": 16640
},
{
"epoch": 17.14727085478888,
"grad_norm": 0.27912184596061707,
"learning_rate": 1.5322425368139714e-06,
"loss": 0.0293,
"step": 16650
},
{
"epoch": 17.157569515962926,
"grad_norm": 0.41649627685546875,
"learning_rate": 1.5097578276806633e-06,
"loss": 0.0299,
"step": 16660
},
{
"epoch": 17.167868177136974,
"grad_norm": 0.20297054946422577,
"learning_rate": 1.487436785786145e-06,
"loss": 0.0313,
"step": 16670
},
{
"epoch": 17.178166838311018,
"grad_norm": 0.38883742690086365,
"learning_rate": 1.4652794864697671e-06,
"loss": 0.0293,
"step": 16680
},
{
"epoch": 17.188465499485066,
"grad_norm": 0.2401762455701828,
"learning_rate": 1.4432860045182017e-06,
"loss": 0.0282,
"step": 16690
},
{
"epoch": 17.198764160659113,
"grad_norm": 0.3450429141521454,
"learning_rate": 1.4214564141651898e-06,
"loss": 0.0249,
"step": 16700
},
{
"epoch": 17.20906282183316,
"grad_norm": 0.17480014264583588,
"learning_rate": 1.3997907890913265e-06,
"loss": 0.0271,
"step": 16710
},
{
"epoch": 17.21936148300721,
"grad_norm": 0.2633569538593292,
"learning_rate": 1.3782892024237327e-06,
"loss": 0.0282,
"step": 16720
},
{
"epoch": 17.229660144181256,
"grad_norm": 0.22684310376644135,
"learning_rate": 1.3569517267359e-06,
"loss": 0.0325,
"step": 16730
},
{
"epoch": 17.239958805355304,
"grad_norm": 0.30432412028312683,
"learning_rate": 1.33577843404738e-06,
"loss": 0.027,
"step": 16740
},
{
"epoch": 17.25025746652935,
"grad_norm": 0.3308713734149933,
"learning_rate": 1.3147693958235618e-06,
"loss": 0.0296,
"step": 16750
},
{
"epoch": 17.2605561277034,
"grad_norm": 0.2591300904750824,
"learning_rate": 1.2939246829754503e-06,
"loss": 0.0191,
"step": 16760
},
{
"epoch": 17.270854788877447,
"grad_norm": 0.3229091763496399,
"learning_rate": 1.2732443658593884e-06,
"loss": 0.0278,
"step": 16770
},
{
"epoch": 17.281153450051495,
"grad_norm": 0.3232883810997009,
"learning_rate": 1.2527285142768574e-06,
"loss": 0.0308,
"step": 16780
},
{
"epoch": 17.291452111225542,
"grad_norm": 0.16374994814395905,
"learning_rate": 1.2323771974742104e-06,
"loss": 0.0285,
"step": 16790
},
{
"epoch": 17.301750772399586,
"grad_norm": 0.4016587734222412,
"learning_rate": 1.212190484142467e-06,
"loss": 0.0287,
"step": 16800
},
{
"epoch": 17.312049433573634,
"grad_norm": 0.7468344569206238,
"learning_rate": 1.192168442417052e-06,
"loss": 0.0318,
"step": 16810
},
{
"epoch": 17.32234809474768,
"grad_norm": 0.62845778465271,
"learning_rate": 1.1723111398776077e-06,
"loss": 0.0307,
"step": 16820
},
{
"epoch": 17.33264675592173,
"grad_norm": 0.29316961765289307,
"learning_rate": 1.1526186435476927e-06,
"loss": 0.0322,
"step": 16830
},
{
"epoch": 17.342945417095777,
"grad_norm": 0.2891688942909241,
"learning_rate": 1.1330910198946442e-06,
"loss": 0.0274,
"step": 16840
},
{
"epoch": 17.353244078269825,
"grad_norm": 0.28778383135795593,
"learning_rate": 1.1137283348292892e-06,
"loss": 0.0341,
"step": 16850
},
{
"epoch": 17.363542739443872,
"grad_norm": 0.17100463807582855,
"learning_rate": 1.0945306537057555e-06,
"loss": 0.0334,
"step": 16860
},
{
"epoch": 17.37384140061792,
"grad_norm": 0.17976661026477814,
"learning_rate": 1.0754980413212268e-06,
"loss": 0.0299,
"step": 16870
},
{
"epoch": 17.384140061791967,
"grad_norm": 0.2614526152610779,
"learning_rate": 1.0566305619157502e-06,
"loss": 0.0278,
"step": 16880
},
{
"epoch": 17.394438722966015,
"grad_norm": 0.195588618516922,
"learning_rate": 1.0379282791719958e-06,
"loss": 0.028,
"step": 16890
},
{
"epoch": 17.404737384140063,
"grad_norm": 1.0282113552093506,
"learning_rate": 1.0193912562150464e-06,
"loss": 0.0291,
"step": 16900
},
{
"epoch": 17.41503604531411,
"grad_norm": 0.2868080735206604,
"learning_rate": 1.0010195556122203e-06,
"loss": 0.0329,
"step": 16910
},
{
"epoch": 17.425334706488158,
"grad_norm": 0.2227233201265335,
"learning_rate": 9.828132393727875e-07,
"loss": 0.0262,
"step": 16920
},
{
"epoch": 17.435633367662202,
"grad_norm": 0.20315021276474,
"learning_rate": 9.647723689478305e-07,
"loss": 0.0324,
"step": 16930
},
{
"epoch": 17.44593202883625,
"grad_norm": 0.6371609568595886,
"learning_rate": 9.468970052300019e-07,
"loss": 0.0318,
"step": 16940
},
{
"epoch": 17.456230690010297,
"grad_norm": 0.18564990162849426,
"learning_rate": 9.291872085533227e-07,
"loss": 0.0289,
"step": 16950
},
{
"epoch": 17.466529351184345,
"grad_norm": 0.22705796360969543,
"learning_rate": 9.116430386929886e-07,
"loss": 0.0249,
"step": 16960
},
{
"epoch": 17.476828012358393,
"grad_norm": 0.2133428156375885,
"learning_rate": 8.942645548651541e-07,
"loss": 0.0376,
"step": 16970
},
{
"epoch": 17.48712667353244,
"grad_norm": 0.19329524040222168,
"learning_rate": 8.770518157267482e-07,
"loss": 0.0308,
"step": 16980
},
{
"epoch": 17.497425334706488,
"grad_norm": 0.2410387098789215,
"learning_rate": 8.60004879375259e-07,
"loss": 0.0273,
"step": 16990
},
{
"epoch": 17.507723995880536,
"grad_norm": 0.20141083002090454,
"learning_rate": 8.4312380334855e-07,
"loss": 0.0336,
"step": 17000
},
{
"epoch": 17.518022657054583,
"grad_norm": 0.27098795771598816,
"learning_rate": 8.264086446246655e-07,
"loss": 0.0313,
"step": 17010
},
{
"epoch": 17.52832131822863,
"grad_norm": 0.35340428352355957,
"learning_rate": 8.098594596216424e-07,
"loss": 0.0348,
"step": 17020
},
{
"epoch": 17.53861997940268,
"grad_norm": 0.3264867663383484,
"learning_rate": 7.934763041972937e-07,
"loss": 0.0302,
"step": 17030
},
{
"epoch": 17.548918640576726,
"grad_norm": 0.2895232141017914,
"learning_rate": 7.772592336490525e-07,
"loss": 0.0325,
"step": 17040
},
{
"epoch": 17.559217301750774,
"grad_norm": 0.24770499765872955,
"learning_rate": 7.612083027137728e-07,
"loss": 0.0319,
"step": 17050
},
{
"epoch": 17.569515962924818,
"grad_norm": 0.4487510323524475,
"learning_rate": 7.453235655675406e-07,
"loss": 0.0258,
"step": 17060
},
{
"epoch": 17.579814624098866,
"grad_norm": 0.38243043422698975,
"learning_rate": 7.296050758254957e-07,
"loss": 0.0308,
"step": 17070
},
{
"epoch": 17.590113285272913,
"grad_norm": 0.5216277837753296,
"learning_rate": 7.140528865416441e-07,
"loss": 0.0268,
"step": 17080
},
{
"epoch": 17.60041194644696,
"grad_norm": 0.300006240606308,
"learning_rate": 6.986670502086901e-07,
"loss": 0.0324,
"step": 17090
},
{
"epoch": 17.61071060762101,
"grad_norm": 0.22057189047336578,
"learning_rate": 6.834476187578543e-07,
"loss": 0.0282,
"step": 17100
},
{
"epoch": 17.621009268795056,
"grad_norm": 0.26959654688835144,
"learning_rate": 6.683946435586952e-07,
"loss": 0.0307,
"step": 17110
},
{
"epoch": 17.631307929969104,
"grad_norm": 0.28995075821876526,
"learning_rate": 6.535081754189321e-07,
"loss": 0.0318,
"step": 17120
},
{
"epoch": 17.64160659114315,
"grad_norm": 0.3135945200920105,
"learning_rate": 6.387882645842947e-07,
"loss": 0.0287,
"step": 17130
},
{
"epoch": 17.6519052523172,
"grad_norm": 0.26953238248825073,
"learning_rate": 6.24234960738318e-07,
"loss": 0.0292,
"step": 17140
},
{
"epoch": 17.662203913491247,
"grad_norm": 0.2764807343482971,
"learning_rate": 6.098483130022148e-07,
"loss": 0.027,
"step": 17150
},
{
"epoch": 17.672502574665295,
"grad_norm": 0.3281687796115875,
"learning_rate": 5.956283699346754e-07,
"loss": 0.0254,
"step": 17160
},
{
"epoch": 17.682801235839342,
"grad_norm": 0.17730310559272766,
"learning_rate": 5.815751795317237e-07,
"loss": 0.0277,
"step": 17170
},
{
"epoch": 17.69309989701339,
"grad_norm": 0.43514519929885864,
"learning_rate": 5.676887892265559e-07,
"loss": 0.0238,
"step": 17180
},
{
"epoch": 17.703398558187434,
"grad_norm": 0.31942808628082275,
"learning_rate": 5.539692458893575e-07,
"loss": 0.027,
"step": 17190
},
{
"epoch": 17.71369721936148,
"grad_norm": 1.2527509927749634,
"learning_rate": 5.404165958271811e-07,
"loss": 0.029,
"step": 17200
},
{
"epoch": 17.72399588053553,
"grad_norm": 0.2568182051181793,
"learning_rate": 5.270308847837579e-07,
"loss": 0.0316,
"step": 17210
},
{
"epoch": 17.734294541709577,
"grad_norm": 0.32886284589767456,
"learning_rate": 5.13812157939364e-07,
"loss": 0.0341,
"step": 17220
},
{
"epoch": 17.744593202883625,
"grad_norm": 0.1350669264793396,
"learning_rate": 5.007604599106486e-07,
"loss": 0.0279,
"step": 17230
},
{
"epoch": 17.754891864057672,
"grad_norm": 0.24451610445976257,
"learning_rate": 4.878758347505175e-07,
"loss": 0.0261,
"step": 17240
},
{
"epoch": 17.76519052523172,
"grad_norm": 0.23091380298137665,
"learning_rate": 4.751583259479331e-07,
"loss": 0.031,
"step": 17250
},
{
"epoch": 17.775489186405768,
"grad_norm": 0.311443030834198,
"learning_rate": 4.6260797642782014e-07,
"loss": 0.032,
"step": 17260
},
{
"epoch": 17.785787847579815,
"grad_norm": 0.2045062929391861,
"learning_rate": 4.5022482855088255e-07,
"loss": 0.0256,
"step": 17270
},
{
"epoch": 17.796086508753863,
"grad_norm": 0.339093953371048,
"learning_rate": 4.380089241134866e-07,
"loss": 0.0306,
"step": 17280
},
{
"epoch": 17.80638516992791,
"grad_norm": 0.3019813597202301,
"learning_rate": 4.259603043475002e-07,
"loss": 0.0302,
"step": 17290
},
{
"epoch": 17.816683831101958,
"grad_norm": 0.21195490658283234,
"learning_rate": 4.1407900992015414e-07,
"loss": 0.0318,
"step": 17300
},
{
"epoch": 17.826982492276002,
"grad_norm": 0.2570505142211914,
"learning_rate": 4.023650809339363e-07,
"loss": 0.0387,
"step": 17310
},
{
"epoch": 17.83728115345005,
"grad_norm": 0.36077165603637695,
"learning_rate": 3.9081855692640333e-07,
"loss": 0.0281,
"step": 17320
},
{
"epoch": 17.847579814624098,
"grad_norm": 0.24089422821998596,
"learning_rate": 3.7943947687010816e-07,
"loss": 0.0265,
"step": 17330
},
{
"epoch": 17.857878475798145,
"grad_norm": 0.3065880835056305,
"learning_rate": 3.6822787917240587e-07,
"loss": 0.0265,
"step": 17340
},
{
"epoch": 17.868177136972193,
"grad_norm": 0.20888155698776245,
"learning_rate": 3.571838016753759e-07,
"loss": 0.0345,
"step": 17350
},
{
"epoch": 17.87847579814624,
"grad_norm": 0.42461952567100525,
"learning_rate": 3.4630728165566117e-07,
"loss": 0.0334,
"step": 17360
},
{
"epoch": 17.888774459320288,
"grad_norm": 0.36267679929733276,
"learning_rate": 3.3559835582435695e-07,
"loss": 0.0306,
"step": 17370
},
{
"epoch": 17.899073120494336,
"grad_norm": 0.1654314249753952,
"learning_rate": 3.250570603268943e-07,
"loss": 0.0247,
"step": 17380
},
{
"epoch": 17.909371781668384,
"grad_norm": 0.2670270800590515,
"learning_rate": 3.1468343074290143e-07,
"loss": 0.032,
"step": 17390
},
{
"epoch": 17.91967044284243,
"grad_norm": 0.2694757878780365,
"learning_rate": 3.0447750208607573e-07,
"loss": 0.0269,
"step": 17400
},
{
"epoch": 17.92996910401648,
"grad_norm": 0.34293317794799805,
"learning_rate": 2.944393088041009e-07,
"loss": 0.0234,
"step": 17410
},
{
"epoch": 17.940267765190526,
"grad_norm": 0.25010308623313904,
"learning_rate": 2.8456888477850776e-07,
"loss": 0.0294,
"step": 17420
},
{
"epoch": 17.950566426364574,
"grad_norm": 0.34105420112609863,
"learning_rate": 2.7486626332455245e-07,
"loss": 0.0292,
"step": 17430
},
{
"epoch": 17.96086508753862,
"grad_norm": 0.2277262657880783,
"learning_rate": 2.653314771911108e-07,
"loss": 0.0398,
"step": 17440
},
{
"epoch": 17.971163748712666,
"grad_norm": 0.3880465030670166,
"learning_rate": 2.5596455856058963e-07,
"loss": 0.0323,
"step": 17450
},
{
"epoch": 17.981462409886714,
"grad_norm": 0.1923012137413025,
"learning_rate": 2.467655390487822e-07,
"loss": 0.0227,
"step": 17460
},
{
"epoch": 17.99176107106076,
"grad_norm": 0.24936918914318085,
"learning_rate": 2.3773444970477955e-07,
"loss": 0.0249,
"step": 17470
},
{
"epoch": 18.00205973223481,
"grad_norm": 0.2869769334793091,
"learning_rate": 2.2887132101087615e-07,
"loss": 0.0248,
"step": 17480
},
{
"epoch": 18.012358393408856,
"grad_norm": 0.25350290536880493,
"learning_rate": 2.201761828824367e-07,
"loss": 0.0327,
"step": 17490
},
{
"epoch": 18.022657054582904,
"grad_norm": 0.27213600277900696,
"learning_rate": 2.1164906466783485e-07,
"loss": 0.0285,
"step": 17500
},
{
"epoch": 18.03295571575695,
"grad_norm": 0.257794588804245,
"learning_rate": 2.032899951483147e-07,
"loss": 0.0281,
"step": 17510
},
{
"epoch": 18.043254376931,
"grad_norm": 0.2469080537557602,
"learning_rate": 1.9509900253792955e-07,
"loss": 0.0259,
"step": 17520
},
{
"epoch": 18.053553038105047,
"grad_norm": 0.2920747995376587,
"learning_rate": 1.870761144834088e-07,
"loss": 0.0287,
"step": 17530
},
{
"epoch": 18.063851699279095,
"grad_norm": 0.2282969057559967,
"learning_rate": 1.7922135806410778e-07,
"loss": 0.0277,
"step": 17540
},
{
"epoch": 18.074150360453142,
"grad_norm": 0.28502708673477173,
"learning_rate": 1.7153475979186927e-07,
"loss": 0.0345,
"step": 17550
},
{
"epoch": 18.08444902162719,
"grad_norm": 0.23902451992034912,
"learning_rate": 1.6401634561098444e-07,
"loss": 0.0335,
"step": 17560
},
{
"epoch": 18.094747682801234,
"grad_norm": 0.3159581124782562,
"learning_rate": 1.566661408980541e-07,
"loss": 0.0299,
"step": 17570
},
{
"epoch": 18.105046343975282,
"grad_norm": 0.12344943732023239,
"learning_rate": 1.4948417046194985e-07,
"loss": 0.0272,
"step": 17580
},
{
"epoch": 18.11534500514933,
"grad_norm": 0.3794369101524353,
"learning_rate": 1.42470458543692e-07,
"loss": 0.0338,
"step": 17590
},
{
"epoch": 18.125643666323377,
"grad_norm": 0.1987241804599762,
"learning_rate": 1.3562502881639404e-07,
"loss": 0.0223,
"step": 17600
},
{
"epoch": 18.135942327497425,
"grad_norm": 0.21883957087993622,
"learning_rate": 1.2894790438516824e-07,
"loss": 0.0275,
"step": 17610
},
{
"epoch": 18.146240988671472,
"grad_norm": 0.2665363550186157,
"learning_rate": 1.2243910778705348e-07,
"loss": 0.033,
"step": 17620
},
{
"epoch": 18.15653964984552,
"grad_norm": 0.15010571479797363,
"learning_rate": 1.1609866099094313e-07,
"loss": 0.0227,
"step": 17630
},
{
"epoch": 18.166838311019568,
"grad_norm": 0.19142857193946838,
"learning_rate": 1.0992658539750178e-07,
"loss": 0.0279,
"step": 17640
},
{
"epoch": 18.177136972193615,
"grad_norm": 0.2638980746269226,
"learning_rate": 1.0392290183909304e-07,
"loss": 0.0265,
"step": 17650
},
{
"epoch": 18.187435633367663,
"grad_norm": 0.19933411478996277,
"learning_rate": 9.808763057971849e-08,
"loss": 0.0294,
"step": 17660
},
{
"epoch": 18.19773429454171,
"grad_norm": 0.32049107551574707,
"learning_rate": 9.242079131495107e-08,
"loss": 0.0268,
"step": 17670
},
{
"epoch": 18.20803295571576,
"grad_norm": 0.22636005282402039,
"learning_rate": 8.69224031718463e-08,
"loss": 0.0359,
"step": 17680
},
{
"epoch": 18.218331616889806,
"grad_norm": 0.19072987139225006,
"learning_rate": 8.159248470890334e-08,
"loss": 0.0272,
"step": 17690
},
{
"epoch": 18.22863027806385,
"grad_norm": 0.5597253441810608,
"learning_rate": 7.643105391598737e-08,
"loss": 0.0296,
"step": 17700
},
{
"epoch": 18.238928939237898,
"grad_norm": 0.20172372460365295,
"learning_rate": 7.143812821427953e-08,
"loss": 0.0321,
"step": 17710
},
{
"epoch": 18.249227600411945,
"grad_norm": 0.49044567346572876,
"learning_rate": 6.661372445621039e-08,
"loss": 0.0284,
"step": 17720
},
{
"epoch": 18.259526261585993,
"grad_norm": 0.2032887190580368,
"learning_rate": 6.19578589253933e-08,
"loss": 0.03,
"step": 17730
},
{
"epoch": 18.26982492276004,
"grad_norm": 0.30425992608070374,
"learning_rate": 5.747054733660773e-08,
"loss": 0.0301,
"step": 17740
},
{
"epoch": 18.28012358393409,
"grad_norm": 0.2486412227153778,
"learning_rate": 5.3151804835688267e-08,
"loss": 0.0261,
"step": 17750
},
{
"epoch": 18.290422245108136,
"grad_norm": 0.21091780066490173,
"learning_rate": 4.9001645999524613e-08,
"loss": 0.0276,
"step": 17760
},
{
"epoch": 18.300720906282184,
"grad_norm": 0.36458486318588257,
"learning_rate": 4.502008483598941e-08,
"loss": 0.0277,
"step": 17770
},
{
"epoch": 18.31101956745623,
"grad_norm": 0.21798443794250488,
"learning_rate": 4.1207134783888265e-08,
"loss": 0.0307,
"step": 17780
},
{
"epoch": 18.32131822863028,
"grad_norm": 0.27093908190727234,
"learning_rate": 3.756280871293205e-08,
"loss": 0.0328,
"step": 17790
},
{
"epoch": 18.331616889804327,
"grad_norm": 0.1765187829732895,
"learning_rate": 3.4087118923659125e-08,
"loss": 0.0305,
"step": 17800
},
{
"epoch": 18.341915550978374,
"grad_norm": 0.9125376343727112,
"learning_rate": 3.078007714744646e-08,
"loss": 0.0408,
"step": 17810
},
{
"epoch": 18.352214212152422,
"grad_norm": 0.1739547997713089,
"learning_rate": 2.7641694546409746e-08,
"loss": 0.0282,
"step": 17820
},
{
"epoch": 18.362512873326466,
"grad_norm": 0.2467593103647232,
"learning_rate": 2.467198171342e-08,
"loss": 0.0266,
"step": 17830
},
{
"epoch": 18.372811534500514,
"grad_norm": 0.7820371389389038,
"learning_rate": 2.1870948672036984e-08,
"loss": 0.0263,
"step": 17840
},
{
"epoch": 18.38311019567456,
"grad_norm": 0.30878883600234985,
"learning_rate": 1.9238604876470334e-08,
"loss": 0.03,
"step": 17850
},
{
"epoch": 18.39340885684861,
"grad_norm": 0.2729048728942871,
"learning_rate": 1.6774959211568465e-08,
"loss": 0.035,
"step": 17860
},
{
"epoch": 18.403707518022657,
"grad_norm": 0.33503258228302,
"learning_rate": 1.4480019992785254e-08,
"loss": 0.0261,
"step": 17870
},
{
"epoch": 18.414006179196704,
"grad_norm": 0.24983762204647064,
"learning_rate": 1.2353794966135646e-08,
"loss": 0.0265,
"step": 17880
},
{
"epoch": 18.424304840370752,
"grad_norm": 0.24591587483882904,
"learning_rate": 1.0396291308190087e-08,
"loss": 0.0248,
"step": 17890
},
{
"epoch": 18.4346035015448,
"grad_norm": 0.24605391919612885,
"learning_rate": 8.607515626030128e-09,
"loss": 0.0289,
"step": 17900
},
{
"epoch": 18.444902162718847,
"grad_norm": 0.2520316541194916,
"learning_rate": 6.987473957242863e-09,
"loss": 0.0307,
"step": 17910
},
{
"epoch": 18.455200823892895,
"grad_norm": 0.46191495656967163,
"learning_rate": 5.536171769887632e-09,
"loss": 0.0303,
"step": 17920
},
{
"epoch": 18.465499485066942,
"grad_norm": 0.26452863216400146,
"learning_rate": 4.253613962496017e-09,
"loss": 0.0329,
"step": 17930
},
{
"epoch": 18.47579814624099,
"grad_norm": 0.3968678116798401,
"learning_rate": 3.1398048640385315e-09,
"loss": 0.0356,
"step": 17940
},
{
"epoch": 18.486096807415038,
"grad_norm": 0.19242151081562042,
"learning_rate": 2.1947482338968705e-09,
"loss": 0.0265,
"step": 17950
},
{
"epoch": 18.496395468589082,
"grad_norm": 0.20866911113262177,
"learning_rate": 1.4184472618972154e-09,
"loss": 0.0251,
"step": 17960
},
{
"epoch": 18.50669412976313,
"grad_norm": 0.17729917168617249,
"learning_rate": 8.109045682547223e-10,
"loss": 0.0264,
"step": 17970
},
{
"epoch": 18.516992790937177,
"grad_norm": 0.19232727587223053,
"learning_rate": 3.721222035846239e-10,
"loss": 0.0366,
"step": 17980
},
{
"epoch": 18.527291452111225,
"grad_norm": 0.41915977001190186,
"learning_rate": 1.0210164889112861e-10,
"loss": 0.0288,
"step": 17990
},
{
"epoch": 18.537590113285273,
"grad_norm": 0.742242693901062,
"learning_rate": 8.438155674195258e-13,
"loss": 0.0335,
"step": 18000
},
{
"epoch": 18.537590113285273,
"step": 18000,
"total_flos": 0.0,
"train_loss": 0.05001054983586073,
"train_runtime": 5749.6082,
"train_samples_per_second": 100.181,
"train_steps_per_second": 3.131
}
],
"logging_steps": 10,
"max_steps": 18000,
"num_input_tokens_seen": 0,
"num_train_epochs": 19,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}