gr15_open_pot_BASE / trainer_state.json
binhng's picture
Upload folder using huggingface_hub
0e8063c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 24.716786817713697,
"eval_steps": 500,
"global_step": 24000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010298661174047374,
"grad_norm": 13.149330139160156,
"learning_rate": 7.5e-07,
"loss": 1.0642,
"step": 10
},
{
"epoch": 0.02059732234809475,
"grad_norm": 10.384645462036133,
"learning_rate": 1.5833333333333336e-06,
"loss": 1.1435,
"step": 20
},
{
"epoch": 0.030895983522142123,
"grad_norm": 4.765048027038574,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.8064,
"step": 30
},
{
"epoch": 0.0411946446961895,
"grad_norm": 1.7030583620071411,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.4879,
"step": 40
},
{
"epoch": 0.05149330587023687,
"grad_norm": 1.4565768241882324,
"learning_rate": 4.083333333333334e-06,
"loss": 0.3388,
"step": 50
},
{
"epoch": 0.061791967044284246,
"grad_norm": 1.2755788564682007,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.2841,
"step": 60
},
{
"epoch": 0.07209062821833162,
"grad_norm": 1.058097004890442,
"learning_rate": 5.750000000000001e-06,
"loss": 0.2302,
"step": 70
},
{
"epoch": 0.082389289392379,
"grad_norm": 1.4532016515731812,
"learning_rate": 6.583333333333333e-06,
"loss": 0.2028,
"step": 80
},
{
"epoch": 0.09268795056642637,
"grad_norm": 1.1775424480438232,
"learning_rate": 7.416666666666668e-06,
"loss": 0.1856,
"step": 90
},
{
"epoch": 0.10298661174047374,
"grad_norm": 1.3665395975112915,
"learning_rate": 8.25e-06,
"loss": 0.181,
"step": 100
},
{
"epoch": 0.11328527291452112,
"grad_norm": 1.4632707834243774,
"learning_rate": 9.083333333333333e-06,
"loss": 0.1686,
"step": 110
},
{
"epoch": 0.12358393408856849,
"grad_norm": 1.2164292335510254,
"learning_rate": 9.916666666666668e-06,
"loss": 0.1575,
"step": 120
},
{
"epoch": 0.13388259526261587,
"grad_norm": 0.6911335587501526,
"learning_rate": 1.075e-05,
"loss": 0.1477,
"step": 130
},
{
"epoch": 0.14418125643666324,
"grad_norm": 0.7088552117347717,
"learning_rate": 1.1583333333333333e-05,
"loss": 0.1382,
"step": 140
},
{
"epoch": 0.15447991761071062,
"grad_norm": 1.046133041381836,
"learning_rate": 1.2416666666666667e-05,
"loss": 0.1411,
"step": 150
},
{
"epoch": 0.164778578784758,
"grad_norm": 1.1359339952468872,
"learning_rate": 1.3250000000000002e-05,
"loss": 0.1347,
"step": 160
},
{
"epoch": 0.17507723995880536,
"grad_norm": 0.8838634490966797,
"learning_rate": 1.4083333333333335e-05,
"loss": 0.1272,
"step": 170
},
{
"epoch": 0.18537590113285274,
"grad_norm": 1.1631752252578735,
"learning_rate": 1.4916666666666667e-05,
"loss": 0.1351,
"step": 180
},
{
"epoch": 0.1956745623069001,
"grad_norm": 1.1706782579421997,
"learning_rate": 1.575e-05,
"loss": 0.1273,
"step": 190
},
{
"epoch": 0.2059732234809475,
"grad_norm": 0.9792522192001343,
"learning_rate": 1.6583333333333334e-05,
"loss": 0.1253,
"step": 200
},
{
"epoch": 0.21627188465499486,
"grad_norm": 0.7125104069709778,
"learning_rate": 1.741666666666667e-05,
"loss": 0.1238,
"step": 210
},
{
"epoch": 0.22657054582904224,
"grad_norm": 0.849624514579773,
"learning_rate": 1.825e-05,
"loss": 0.1094,
"step": 220
},
{
"epoch": 0.2368692070030896,
"grad_norm": 1.1960203647613525,
"learning_rate": 1.9083333333333334e-05,
"loss": 0.1211,
"step": 230
},
{
"epoch": 0.24716786817713698,
"grad_norm": 0.7790117859840393,
"learning_rate": 1.9916666666666665e-05,
"loss": 0.1017,
"step": 240
},
{
"epoch": 0.25746652935118436,
"grad_norm": 0.8855171799659729,
"learning_rate": 2.075e-05,
"loss": 0.099,
"step": 250
},
{
"epoch": 0.26776519052523173,
"grad_norm": 0.6777231693267822,
"learning_rate": 2.1583333333333334e-05,
"loss": 0.1089,
"step": 260
},
{
"epoch": 0.2780638516992791,
"grad_norm": 0.9875190854072571,
"learning_rate": 2.2416666666666665e-05,
"loss": 0.1096,
"step": 270
},
{
"epoch": 0.2883625128733265,
"grad_norm": 1.0280433893203735,
"learning_rate": 2.3250000000000003e-05,
"loss": 0.0929,
"step": 280
},
{
"epoch": 0.29866117404737386,
"grad_norm": 0.7355198860168457,
"learning_rate": 2.4083333333333337e-05,
"loss": 0.1095,
"step": 290
},
{
"epoch": 0.30895983522142123,
"grad_norm": 0.8522012829780579,
"learning_rate": 2.4916666666666668e-05,
"loss": 0.0985,
"step": 300
},
{
"epoch": 0.3192584963954686,
"grad_norm": 0.547737717628479,
"learning_rate": 2.5750000000000002e-05,
"loss": 0.0926,
"step": 310
},
{
"epoch": 0.329557157569516,
"grad_norm": 0.8814008235931396,
"learning_rate": 2.6583333333333333e-05,
"loss": 0.0878,
"step": 320
},
{
"epoch": 0.33985581874356335,
"grad_norm": 1.0947939157485962,
"learning_rate": 2.7416666666666668e-05,
"loss": 0.0981,
"step": 330
},
{
"epoch": 0.35015447991761073,
"grad_norm": 0.799148678779602,
"learning_rate": 2.825e-05,
"loss": 0.0875,
"step": 340
},
{
"epoch": 0.3604531410916581,
"grad_norm": 0.6206310987472534,
"learning_rate": 2.9083333333333333e-05,
"loss": 0.0793,
"step": 350
},
{
"epoch": 0.3707518022657055,
"grad_norm": 0.6761384010314941,
"learning_rate": 2.991666666666667e-05,
"loss": 0.0765,
"step": 360
},
{
"epoch": 0.38105046343975285,
"grad_norm": 0.49738550186157227,
"learning_rate": 3.075e-05,
"loss": 0.0909,
"step": 370
},
{
"epoch": 0.3913491246138002,
"grad_norm": 0.9449822306632996,
"learning_rate": 3.158333333333334e-05,
"loss": 0.0767,
"step": 380
},
{
"epoch": 0.4016477857878476,
"grad_norm": 0.8295601606369019,
"learning_rate": 3.2416666666666664e-05,
"loss": 0.0837,
"step": 390
},
{
"epoch": 0.411946446961895,
"grad_norm": 0.9744588136672974,
"learning_rate": 3.325e-05,
"loss": 0.0778,
"step": 400
},
{
"epoch": 0.42224510813594235,
"grad_norm": 1.1776330471038818,
"learning_rate": 3.408333333333333e-05,
"loss": 0.0796,
"step": 410
},
{
"epoch": 0.4325437693099897,
"grad_norm": 0.7325335144996643,
"learning_rate": 3.491666666666667e-05,
"loss": 0.082,
"step": 420
},
{
"epoch": 0.4428424304840371,
"grad_norm": 0.9740093350410461,
"learning_rate": 3.575e-05,
"loss": 0.0844,
"step": 430
},
{
"epoch": 0.45314109165808447,
"grad_norm": 0.9867604970932007,
"learning_rate": 3.658333333333334e-05,
"loss": 0.074,
"step": 440
},
{
"epoch": 0.46343975283213185,
"grad_norm": 0.8307005763053894,
"learning_rate": 3.7416666666666664e-05,
"loss": 0.0763,
"step": 450
},
{
"epoch": 0.4737384140061792,
"grad_norm": 0.8879880905151367,
"learning_rate": 3.825e-05,
"loss": 0.0766,
"step": 460
},
{
"epoch": 0.4840370751802266,
"grad_norm": 0.8298061490058899,
"learning_rate": 3.908333333333333e-05,
"loss": 0.0808,
"step": 470
},
{
"epoch": 0.49433573635427397,
"grad_norm": 0.9233872294425964,
"learning_rate": 3.991666666666667e-05,
"loss": 0.0795,
"step": 480
},
{
"epoch": 0.5046343975283213,
"grad_norm": 0.9856144189834595,
"learning_rate": 4.075e-05,
"loss": 0.0764,
"step": 490
},
{
"epoch": 0.5149330587023687,
"grad_norm": 0.6428182721138,
"learning_rate": 4.158333333333333e-05,
"loss": 0.0804,
"step": 500
},
{
"epoch": 0.525231719876416,
"grad_norm": 0.7709551453590393,
"learning_rate": 4.241666666666667e-05,
"loss": 0.071,
"step": 510
},
{
"epoch": 0.5355303810504635,
"grad_norm": 0.5600109100341797,
"learning_rate": 4.325e-05,
"loss": 0.0766,
"step": 520
},
{
"epoch": 0.5458290422245108,
"grad_norm": 0.6735143065452576,
"learning_rate": 4.408333333333334e-05,
"loss": 0.0796,
"step": 530
},
{
"epoch": 0.5561277033985582,
"grad_norm": 0.7762349843978882,
"learning_rate": 4.491666666666667e-05,
"loss": 0.0816,
"step": 540
},
{
"epoch": 0.5664263645726055,
"grad_norm": 0.613395631313324,
"learning_rate": 4.575e-05,
"loss": 0.0663,
"step": 550
},
{
"epoch": 0.576725025746653,
"grad_norm": 0.5750753283500671,
"learning_rate": 4.658333333333333e-05,
"loss": 0.064,
"step": 560
},
{
"epoch": 0.5870236869207003,
"grad_norm": 0.7344352006912231,
"learning_rate": 4.741666666666667e-05,
"loss": 0.0708,
"step": 570
},
{
"epoch": 0.5973223480947477,
"grad_norm": 0.5173144340515137,
"learning_rate": 4.825e-05,
"loss": 0.0811,
"step": 580
},
{
"epoch": 0.607621009268795,
"grad_norm": 0.6798299551010132,
"learning_rate": 4.908333333333334e-05,
"loss": 0.0694,
"step": 590
},
{
"epoch": 0.6179196704428425,
"grad_norm": 0.931536853313446,
"learning_rate": 4.991666666666667e-05,
"loss": 0.085,
"step": 600
},
{
"epoch": 0.6282183316168898,
"grad_norm": 0.9364974498748779,
"learning_rate": 5.075e-05,
"loss": 0.081,
"step": 610
},
{
"epoch": 0.6385169927909372,
"grad_norm": 0.8403638005256653,
"learning_rate": 5.158333333333334e-05,
"loss": 0.0695,
"step": 620
},
{
"epoch": 0.6488156539649845,
"grad_norm": 0.5034735798835754,
"learning_rate": 5.241666666666667e-05,
"loss": 0.0732,
"step": 630
},
{
"epoch": 0.659114315139032,
"grad_norm": 0.7169724106788635,
"learning_rate": 5.325e-05,
"loss": 0.0735,
"step": 640
},
{
"epoch": 0.6694129763130793,
"grad_norm": 0.5860706567764282,
"learning_rate": 5.4083333333333345e-05,
"loss": 0.0678,
"step": 650
},
{
"epoch": 0.6797116374871267,
"grad_norm": 1.0557808876037598,
"learning_rate": 5.491666666666667e-05,
"loss": 0.073,
"step": 660
},
{
"epoch": 0.690010298661174,
"grad_norm": 0.5684497952461243,
"learning_rate": 5.575e-05,
"loss": 0.0681,
"step": 670
},
{
"epoch": 0.7003089598352215,
"grad_norm": 0.9891376495361328,
"learning_rate": 5.658333333333333e-05,
"loss": 0.079,
"step": 680
},
{
"epoch": 0.7106076210092688,
"grad_norm": 0.6732739210128784,
"learning_rate": 5.7416666666666675e-05,
"loss": 0.0677,
"step": 690
},
{
"epoch": 0.7209062821833162,
"grad_norm": 0.6489456295967102,
"learning_rate": 5.8250000000000006e-05,
"loss": 0.0742,
"step": 700
},
{
"epoch": 0.7312049433573635,
"grad_norm": 0.6407278776168823,
"learning_rate": 5.908333333333334e-05,
"loss": 0.0772,
"step": 710
},
{
"epoch": 0.741503604531411,
"grad_norm": 0.5639925599098206,
"learning_rate": 5.991666666666667e-05,
"loss": 0.0643,
"step": 720
},
{
"epoch": 0.7518022657054583,
"grad_norm": 0.9374951720237732,
"learning_rate": 6.0750000000000006e-05,
"loss": 0.0754,
"step": 730
},
{
"epoch": 0.7621009268795057,
"grad_norm": 0.6028985381126404,
"learning_rate": 6.158333333333334e-05,
"loss": 0.0659,
"step": 740
},
{
"epoch": 0.772399588053553,
"grad_norm": 0.7816867232322693,
"learning_rate": 6.241666666666666e-05,
"loss": 0.0781,
"step": 750
},
{
"epoch": 0.7826982492276005,
"grad_norm": 0.5676434636116028,
"learning_rate": 6.324999999999999e-05,
"loss": 0.0632,
"step": 760
},
{
"epoch": 0.7929969104016478,
"grad_norm": 0.5514146685600281,
"learning_rate": 6.408333333333334e-05,
"loss": 0.0653,
"step": 770
},
{
"epoch": 0.8032955715756952,
"grad_norm": 0.4813036322593689,
"learning_rate": 6.491666666666667e-05,
"loss": 0.0718,
"step": 780
},
{
"epoch": 0.8135942327497425,
"grad_norm": 0.7320117354393005,
"learning_rate": 6.575e-05,
"loss": 0.0668,
"step": 790
},
{
"epoch": 0.82389289392379,
"grad_norm": 0.8730434775352478,
"learning_rate": 6.658333333333334e-05,
"loss": 0.0732,
"step": 800
},
{
"epoch": 0.8341915550978373,
"grad_norm": 0.5224652886390686,
"learning_rate": 6.741666666666667e-05,
"loss": 0.0665,
"step": 810
},
{
"epoch": 0.8444902162718847,
"grad_norm": 0.7872908711433411,
"learning_rate": 6.825e-05,
"loss": 0.0675,
"step": 820
},
{
"epoch": 0.854788877445932,
"grad_norm": 0.3085736930370331,
"learning_rate": 6.908333333333334e-05,
"loss": 0.069,
"step": 830
},
{
"epoch": 0.8650875386199794,
"grad_norm": 0.45653483271598816,
"learning_rate": 6.991666666666668e-05,
"loss": 0.0654,
"step": 840
},
{
"epoch": 0.8753861997940268,
"grad_norm": 0.7160809636116028,
"learning_rate": 7.075e-05,
"loss": 0.073,
"step": 850
},
{
"epoch": 0.8856848609680742,
"grad_norm": 0.7236250042915344,
"learning_rate": 7.158333333333333e-05,
"loss": 0.068,
"step": 860
},
{
"epoch": 0.8959835221421215,
"grad_norm": 0.696873664855957,
"learning_rate": 7.241666666666666e-05,
"loss": 0.0612,
"step": 870
},
{
"epoch": 0.9062821833161689,
"grad_norm": 0.6647109389305115,
"learning_rate": 7.325e-05,
"loss": 0.0627,
"step": 880
},
{
"epoch": 0.9165808444902163,
"grad_norm": 0.6832423806190491,
"learning_rate": 7.408333333333334e-05,
"loss": 0.0634,
"step": 890
},
{
"epoch": 0.9268795056642637,
"grad_norm": 0.5562313795089722,
"learning_rate": 7.491666666666667e-05,
"loss": 0.0679,
"step": 900
},
{
"epoch": 0.937178166838311,
"grad_norm": 0.5476490259170532,
"learning_rate": 7.575e-05,
"loss": 0.0573,
"step": 910
},
{
"epoch": 0.9474768280123584,
"grad_norm": 0.5675793290138245,
"learning_rate": 7.658333333333334e-05,
"loss": 0.0643,
"step": 920
},
{
"epoch": 0.9577754891864058,
"grad_norm": 0.5144763588905334,
"learning_rate": 7.741666666666667e-05,
"loss": 0.0569,
"step": 930
},
{
"epoch": 0.9680741503604532,
"grad_norm": 0.6619398593902588,
"learning_rate": 7.825e-05,
"loss": 0.0649,
"step": 940
},
{
"epoch": 0.9783728115345005,
"grad_norm": 0.7253459692001343,
"learning_rate": 7.908333333333335e-05,
"loss": 0.069,
"step": 950
},
{
"epoch": 0.9886714727085479,
"grad_norm": 0.9297211766242981,
"learning_rate": 7.991666666666667e-05,
"loss": 0.0581,
"step": 960
},
{
"epoch": 0.9989701338825953,
"grad_norm": 0.3911210298538208,
"learning_rate": 8.075e-05,
"loss": 0.0621,
"step": 970
},
{
"epoch": 1.0092687950566426,
"grad_norm": 0.5856914520263672,
"learning_rate": 8.158333333333333e-05,
"loss": 0.0583,
"step": 980
},
{
"epoch": 1.01956745623069,
"grad_norm": 0.5337209701538086,
"learning_rate": 8.241666666666667e-05,
"loss": 0.0652,
"step": 990
},
{
"epoch": 1.0298661174047374,
"grad_norm": 0.6302834153175354,
"learning_rate": 8.325e-05,
"loss": 0.0639,
"step": 1000
},
{
"epoch": 1.0401647785787849,
"grad_norm": 0.6350454092025757,
"learning_rate": 8.408333333333334e-05,
"loss": 0.0644,
"step": 1010
},
{
"epoch": 1.050463439752832,
"grad_norm": 0.4556646943092346,
"learning_rate": 8.491666666666667e-05,
"loss": 0.0652,
"step": 1020
},
{
"epoch": 1.0607621009268795,
"grad_norm": 0.531972348690033,
"learning_rate": 8.575000000000001e-05,
"loss": 0.0638,
"step": 1030
},
{
"epoch": 1.071060762100927,
"grad_norm": 0.47181880474090576,
"learning_rate": 8.658333333333334e-05,
"loss": 0.0691,
"step": 1040
},
{
"epoch": 1.0813594232749741,
"grad_norm": 0.5719209313392639,
"learning_rate": 8.741666666666667e-05,
"loss": 0.062,
"step": 1050
},
{
"epoch": 1.0916580844490216,
"grad_norm": 0.5950759649276733,
"learning_rate": 8.825e-05,
"loss": 0.0751,
"step": 1060
},
{
"epoch": 1.101956745623069,
"grad_norm": 0.547822117805481,
"learning_rate": 8.908333333333333e-05,
"loss": 0.0611,
"step": 1070
},
{
"epoch": 1.1122554067971164,
"grad_norm": 0.8676643967628479,
"learning_rate": 8.991666666666667e-05,
"loss": 0.0692,
"step": 1080
},
{
"epoch": 1.1225540679711639,
"grad_norm": 0.6153643727302551,
"learning_rate": 9.075e-05,
"loss": 0.0622,
"step": 1090
},
{
"epoch": 1.132852729145211,
"grad_norm": 0.6054476499557495,
"learning_rate": 9.158333333333334e-05,
"loss": 0.0758,
"step": 1100
},
{
"epoch": 1.1431513903192585,
"grad_norm": 0.7495082020759583,
"learning_rate": 9.241666666666667e-05,
"loss": 0.0762,
"step": 1110
},
{
"epoch": 1.153450051493306,
"grad_norm": 0.7987021207809448,
"learning_rate": 9.325e-05,
"loss": 0.0672,
"step": 1120
},
{
"epoch": 1.1637487126673531,
"grad_norm": 0.5528245568275452,
"learning_rate": 9.408333333333333e-05,
"loss": 0.0719,
"step": 1130
},
{
"epoch": 1.1740473738414006,
"grad_norm": 0.6874306797981262,
"learning_rate": 9.491666666666668e-05,
"loss": 0.0743,
"step": 1140
},
{
"epoch": 1.184346035015448,
"grad_norm": 0.7587935924530029,
"learning_rate": 9.575000000000001e-05,
"loss": 0.0699,
"step": 1150
},
{
"epoch": 1.1946446961894954,
"grad_norm": 0.6630555391311646,
"learning_rate": 9.658333333333334e-05,
"loss": 0.0705,
"step": 1160
},
{
"epoch": 1.2049433573635429,
"grad_norm": 0.654225766658783,
"learning_rate": 9.741666666666667e-05,
"loss": 0.0755,
"step": 1170
},
{
"epoch": 1.21524201853759,
"grad_norm": 0.5464800000190735,
"learning_rate": 9.825e-05,
"loss": 0.0677,
"step": 1180
},
{
"epoch": 1.2255406797116375,
"grad_norm": 0.6069100499153137,
"learning_rate": 9.908333333333333e-05,
"loss": 0.0642,
"step": 1190
},
{
"epoch": 1.235839340885685,
"grad_norm": 0.4604052007198334,
"learning_rate": 9.991666666666666e-05,
"loss": 0.0669,
"step": 1200
},
{
"epoch": 1.2461380020597321,
"grad_norm": 0.6743689775466919,
"learning_rate": 9.999996155365815e-05,
"loss": 0.0642,
"step": 1210
},
{
"epoch": 1.2564366632337796,
"grad_norm": 0.7741529941558838,
"learning_rate": 9.999982865279924e-05,
"loss": 0.0634,
"step": 1220
},
{
"epoch": 1.266735324407827,
"grad_norm": 0.43897321820259094,
"learning_rate": 9.999960082302935e-05,
"loss": 0.0649,
"step": 1230
},
{
"epoch": 1.2770339855818744,
"grad_norm": 0.41431164741516113,
"learning_rate": 9.999927806478104e-05,
"loss": 0.0687,
"step": 1240
},
{
"epoch": 1.2873326467559219,
"grad_norm": 0.4003790020942688,
"learning_rate": 9.999886037866707e-05,
"loss": 0.072,
"step": 1250
},
{
"epoch": 1.297631307929969,
"grad_norm": 0.7988683581352234,
"learning_rate": 9.999834776548048e-05,
"loss": 0.0612,
"step": 1260
},
{
"epoch": 1.3079299691040165,
"grad_norm": 0.574458122253418,
"learning_rate": 9.99977402261945e-05,
"loss": 0.0676,
"step": 1270
},
{
"epoch": 1.318228630278064,
"grad_norm": 0.4979383945465088,
"learning_rate": 9.999703776196258e-05,
"loss": 0.0628,
"step": 1280
},
{
"epoch": 1.3285272914521111,
"grad_norm": 0.8276640772819519,
"learning_rate": 9.999624037411843e-05,
"loss": 0.0627,
"step": 1290
},
{
"epoch": 1.3388259526261586,
"grad_norm": 0.35682398080825806,
"learning_rate": 9.999534806417596e-05,
"loss": 0.06,
"step": 1300
},
{
"epoch": 1.349124613800206,
"grad_norm": 0.683525562286377,
"learning_rate": 9.999436083382927e-05,
"loss": 0.0602,
"step": 1310
},
{
"epoch": 1.3594232749742534,
"grad_norm": 0.6512396335601807,
"learning_rate": 9.999327868495272e-05,
"loss": 0.0592,
"step": 1320
},
{
"epoch": 1.3697219361483008,
"grad_norm": 0.7462307214736938,
"learning_rate": 9.999210161960084e-05,
"loss": 0.0623,
"step": 1330
},
{
"epoch": 1.380020597322348,
"grad_norm": 0.5599775910377502,
"learning_rate": 9.999082964000844e-05,
"loss": 0.0643,
"step": 1340
},
{
"epoch": 1.3903192584963955,
"grad_norm": 0.6953631639480591,
"learning_rate": 9.998946274859042e-05,
"loss": 0.0604,
"step": 1350
},
{
"epoch": 1.400617919670443,
"grad_norm": 0.42685210704803467,
"learning_rate": 9.998800094794197e-05,
"loss": 0.0603,
"step": 1360
},
{
"epoch": 1.4109165808444901,
"grad_norm": 0.5579178929328918,
"learning_rate": 9.998644424083843e-05,
"loss": 0.0591,
"step": 1370
},
{
"epoch": 1.4212152420185376,
"grad_norm": 0.525526225566864,
"learning_rate": 9.998479263023536e-05,
"loss": 0.0521,
"step": 1380
},
{
"epoch": 1.431513903192585,
"grad_norm": 0.6368007659912109,
"learning_rate": 9.998304611926846e-05,
"loss": 0.0544,
"step": 1390
},
{
"epoch": 1.4418125643666324,
"grad_norm": 0.7885141968727112,
"learning_rate": 9.998120471125365e-05,
"loss": 0.0606,
"step": 1400
},
{
"epoch": 1.4521112255406798,
"grad_norm": 0.44973626732826233,
"learning_rate": 9.997926840968699e-05,
"loss": 0.0608,
"step": 1410
},
{
"epoch": 1.462409886714727,
"grad_norm": 0.5216629505157471,
"learning_rate": 9.99772372182447e-05,
"loss": 0.0651,
"step": 1420
},
{
"epoch": 1.4727085478887745,
"grad_norm": 0.2978615164756775,
"learning_rate": 9.99751111407832e-05,
"loss": 0.0552,
"step": 1430
},
{
"epoch": 1.483007209062822,
"grad_norm": 0.4659578502178192,
"learning_rate": 9.9972890181339e-05,
"loss": 0.0612,
"step": 1440
},
{
"epoch": 1.4933058702368691,
"grad_norm": 0.5563666224479675,
"learning_rate": 9.997057434412878e-05,
"loss": 0.0574,
"step": 1450
},
{
"epoch": 1.5036045314109165,
"grad_norm": 0.6018580198287964,
"learning_rate": 9.996816363354937e-05,
"loss": 0.0599,
"step": 1460
},
{
"epoch": 1.513903192584964,
"grad_norm": 0.6983861327171326,
"learning_rate": 9.996565805417769e-05,
"loss": 0.0625,
"step": 1470
},
{
"epoch": 1.5242018537590112,
"grad_norm": 0.7207497358322144,
"learning_rate": 9.99630576107708e-05,
"loss": 0.0628,
"step": 1480
},
{
"epoch": 1.5345005149330588,
"grad_norm": 0.44567862153053284,
"learning_rate": 9.996036230826589e-05,
"loss": 0.057,
"step": 1490
},
{
"epoch": 1.544799176107106,
"grad_norm": 0.7830976843833923,
"learning_rate": 9.995757215178018e-05,
"loss": 0.0685,
"step": 1500
},
{
"epoch": 1.5550978372811535,
"grad_norm": 0.5960754752159119,
"learning_rate": 9.995468714661105e-05,
"loss": 0.06,
"step": 1510
},
{
"epoch": 1.565396498455201,
"grad_norm": 0.40101906657218933,
"learning_rate": 9.995170729823588e-05,
"loss": 0.0577,
"step": 1520
},
{
"epoch": 1.575695159629248,
"grad_norm": 0.46922755241394043,
"learning_rate": 9.994863261231224e-05,
"loss": 0.0668,
"step": 1530
},
{
"epoch": 1.5859938208032955,
"grad_norm": 0.42105957865715027,
"learning_rate": 9.99454630946776e-05,
"loss": 0.0651,
"step": 1540
},
{
"epoch": 1.596292481977343,
"grad_norm": 0.4200706481933594,
"learning_rate": 9.994219875134962e-05,
"loss": 0.0586,
"step": 1550
},
{
"epoch": 1.6065911431513902,
"grad_norm": 0.602203369140625,
"learning_rate": 9.99388395885259e-05,
"loss": 0.0523,
"step": 1560
},
{
"epoch": 1.6168898043254378,
"grad_norm": 0.5096521973609924,
"learning_rate": 9.993538561258411e-05,
"loss": 0.0578,
"step": 1570
},
{
"epoch": 1.627188465499485,
"grad_norm": 0.26851776242256165,
"learning_rate": 9.993183683008189e-05,
"loss": 0.0533,
"step": 1580
},
{
"epoch": 1.6374871266735325,
"grad_norm": 0.48737746477127075,
"learning_rate": 9.992819324775696e-05,
"loss": 0.0535,
"step": 1590
},
{
"epoch": 1.64778578784758,
"grad_norm": 0.7643722891807556,
"learning_rate": 9.992445487252691e-05,
"loss": 0.0606,
"step": 1600
},
{
"epoch": 1.658084449021627,
"grad_norm": 0.4624369740486145,
"learning_rate": 9.992062171148938e-05,
"loss": 0.0547,
"step": 1610
},
{
"epoch": 1.6683831101956745,
"grad_norm": 0.644801914691925,
"learning_rate": 9.991669377192196e-05,
"loss": 0.0516,
"step": 1620
},
{
"epoch": 1.678681771369722,
"grad_norm": 0.48216086626052856,
"learning_rate": 9.991267106128219e-05,
"loss": 0.051,
"step": 1630
},
{
"epoch": 1.6889804325437692,
"grad_norm": 0.5955299735069275,
"learning_rate": 9.99085535872075e-05,
"loss": 0.0576,
"step": 1640
},
{
"epoch": 1.6992790937178168,
"grad_norm": 0.650093674659729,
"learning_rate": 9.990434135751526e-05,
"loss": 0.0689,
"step": 1650
},
{
"epoch": 1.709577754891864,
"grad_norm": 0.4557111859321594,
"learning_rate": 9.990003438020276e-05,
"loss": 0.0499,
"step": 1660
},
{
"epoch": 1.7198764160659115,
"grad_norm": 0.49610280990600586,
"learning_rate": 9.989563266344718e-05,
"loss": 0.0575,
"step": 1670
},
{
"epoch": 1.730175077239959,
"grad_norm": 0.6158459782600403,
"learning_rate": 9.989113621560552e-05,
"loss": 0.0611,
"step": 1680
},
{
"epoch": 1.740473738414006,
"grad_norm": 0.6327665448188782,
"learning_rate": 9.988654504521469e-05,
"loss": 0.0689,
"step": 1690
},
{
"epoch": 1.7507723995880535,
"grad_norm": 0.5656455755233765,
"learning_rate": 9.988185916099139e-05,
"loss": 0.0596,
"step": 1700
},
{
"epoch": 1.761071060762101,
"grad_norm": 0.5649673938751221,
"learning_rate": 9.987707857183218e-05,
"loss": 0.0595,
"step": 1710
},
{
"epoch": 1.7713697219361482,
"grad_norm": 0.7367047071456909,
"learning_rate": 9.987220328681343e-05,
"loss": 0.0635,
"step": 1720
},
{
"epoch": 1.7816683831101958,
"grad_norm": 0.4531395435333252,
"learning_rate": 9.986723331519126e-05,
"loss": 0.0572,
"step": 1730
},
{
"epoch": 1.791967044284243,
"grad_norm": 0.7696741223335266,
"learning_rate": 9.986216866640159e-05,
"loss": 0.0477,
"step": 1740
},
{
"epoch": 1.8022657054582905,
"grad_norm": 0.8207795023918152,
"learning_rate": 9.985700935006009e-05,
"loss": 0.0602,
"step": 1750
},
{
"epoch": 1.8125643666323379,
"grad_norm": 0.4749796390533447,
"learning_rate": 9.985175537596216e-05,
"loss": 0.0581,
"step": 1760
},
{
"epoch": 1.822863027806385,
"grad_norm": 0.48487281799316406,
"learning_rate": 9.98464067540829e-05,
"loss": 0.0579,
"step": 1770
},
{
"epoch": 1.8331616889804325,
"grad_norm": 0.58293217420578,
"learning_rate": 9.984096349457714e-05,
"loss": 0.056,
"step": 1780
},
{
"epoch": 1.84346035015448,
"grad_norm": 0.5481081008911133,
"learning_rate": 9.983542560777935e-05,
"loss": 0.061,
"step": 1790
},
{
"epoch": 1.8537590113285272,
"grad_norm": 0.49807319045066833,
"learning_rate": 9.982979310420369e-05,
"loss": 0.0557,
"step": 1800
},
{
"epoch": 1.8640576725025748,
"grad_norm": 0.5096905827522278,
"learning_rate": 9.982406599454395e-05,
"loss": 0.0604,
"step": 1810
},
{
"epoch": 1.874356333676622,
"grad_norm": 0.3464685082435608,
"learning_rate": 9.981824428967352e-05,
"loss": 0.0583,
"step": 1820
},
{
"epoch": 1.8846549948506695,
"grad_norm": 0.4973633885383606,
"learning_rate": 9.981232800064542e-05,
"loss": 0.0608,
"step": 1830
},
{
"epoch": 1.8949536560247169,
"grad_norm": 0.4537138044834137,
"learning_rate": 9.980631713869219e-05,
"loss": 0.0554,
"step": 1840
},
{
"epoch": 1.905252317198764,
"grad_norm": 0.5278320908546448,
"learning_rate": 9.9800211715226e-05,
"loss": 0.0587,
"step": 1850
},
{
"epoch": 1.9155509783728115,
"grad_norm": 0.5589116811752319,
"learning_rate": 9.979401174183849e-05,
"loss": 0.0719,
"step": 1860
},
{
"epoch": 1.925849639546859,
"grad_norm": 0.4185578525066376,
"learning_rate": 9.978771723030084e-05,
"loss": 0.0641,
"step": 1870
},
{
"epoch": 1.9361483007209062,
"grad_norm": 0.6495696306228638,
"learning_rate": 9.978132819256371e-05,
"loss": 0.0666,
"step": 1880
},
{
"epoch": 1.9464469618949538,
"grad_norm": 0.8356088995933533,
"learning_rate": 9.977484464075725e-05,
"loss": 0.0584,
"step": 1890
},
{
"epoch": 1.956745623069001,
"grad_norm": 0.397126704454422,
"learning_rate": 9.9768266587191e-05,
"loss": 0.0735,
"step": 1900
},
{
"epoch": 1.9670442842430484,
"grad_norm": 0.52510005235672,
"learning_rate": 9.976159404435397e-05,
"loss": 0.0709,
"step": 1910
},
{
"epoch": 1.9773429454170959,
"grad_norm": 0.44712215662002563,
"learning_rate": 9.975482702491457e-05,
"loss": 0.0606,
"step": 1920
},
{
"epoch": 1.987641606591143,
"grad_norm": 0.8051986694335938,
"learning_rate": 9.974796554172052e-05,
"loss": 0.0598,
"step": 1930
},
{
"epoch": 1.9979402677651905,
"grad_norm": 0.5369552969932556,
"learning_rate": 9.974100960779892e-05,
"loss": 0.0689,
"step": 1940
},
{
"epoch": 2.008238928939238,
"grad_norm": 0.6968551278114319,
"learning_rate": 9.973395923635627e-05,
"loss": 0.0539,
"step": 1950
},
{
"epoch": 2.018537590113285,
"grad_norm": 0.6285783052444458,
"learning_rate": 9.972681444077823e-05,
"loss": 0.0589,
"step": 1960
},
{
"epoch": 2.028836251287333,
"grad_norm": 0.48109880089759827,
"learning_rate": 9.97195752346298e-05,
"loss": 0.064,
"step": 1970
},
{
"epoch": 2.03913491246138,
"grad_norm": 0.3831437826156616,
"learning_rate": 9.971224163165527e-05,
"loss": 0.0656,
"step": 1980
},
{
"epoch": 2.049433573635427,
"grad_norm": 0.4746626913547516,
"learning_rate": 9.970481364577808e-05,
"loss": 0.054,
"step": 1990
},
{
"epoch": 2.059732234809475,
"grad_norm": 0.5078164339065552,
"learning_rate": 9.96972912911009e-05,
"loss": 0.0576,
"step": 2000
},
{
"epoch": 2.070030895983522,
"grad_norm": 0.39853551983833313,
"learning_rate": 9.968967458190554e-05,
"loss": 0.0497,
"step": 2010
},
{
"epoch": 2.0803295571575697,
"grad_norm": 0.32678648829460144,
"learning_rate": 9.9681963532653e-05,
"loss": 0.0524,
"step": 2020
},
{
"epoch": 2.090628218331617,
"grad_norm": 0.575631856918335,
"learning_rate": 9.967415815798331e-05,
"loss": 0.057,
"step": 2030
},
{
"epoch": 2.100926879505664,
"grad_norm": 0.3121809661388397,
"learning_rate": 9.966625847271569e-05,
"loss": 0.0563,
"step": 2040
},
{
"epoch": 2.111225540679712,
"grad_norm": 0.4975629150867462,
"learning_rate": 9.965826449184832e-05,
"loss": 0.0533,
"step": 2050
},
{
"epoch": 2.121524201853759,
"grad_norm": 0.4501200318336487,
"learning_rate": 9.965017623055848e-05,
"loss": 0.0485,
"step": 2060
},
{
"epoch": 2.131822863027806,
"grad_norm": 0.4712199568748474,
"learning_rate": 9.96419937042024e-05,
"loss": 0.0462,
"step": 2070
},
{
"epoch": 2.142121524201854,
"grad_norm": 0.3075672388076782,
"learning_rate": 9.963371692831528e-05,
"loss": 0.0533,
"step": 2080
},
{
"epoch": 2.152420185375901,
"grad_norm": 0.7105554342269897,
"learning_rate": 9.962534591861135e-05,
"loss": 0.0492,
"step": 2090
},
{
"epoch": 2.1627188465499483,
"grad_norm": 0.2947746515274048,
"learning_rate": 9.961688069098362e-05,
"loss": 0.0506,
"step": 2100
},
{
"epoch": 2.173017507723996,
"grad_norm": 0.4763787090778351,
"learning_rate": 9.960832126150405e-05,
"loss": 0.0529,
"step": 2110
},
{
"epoch": 2.183316168898043,
"grad_norm": 0.5647318363189697,
"learning_rate": 9.959966764642346e-05,
"loss": 0.0538,
"step": 2120
},
{
"epoch": 2.193614830072091,
"grad_norm": 0.4324939250946045,
"learning_rate": 9.959091986217146e-05,
"loss": 0.0626,
"step": 2130
},
{
"epoch": 2.203913491246138,
"grad_norm": 0.5161953568458557,
"learning_rate": 9.958207792535647e-05,
"loss": 0.0548,
"step": 2140
},
{
"epoch": 2.214212152420185,
"grad_norm": 0.3394765257835388,
"learning_rate": 9.957314185276564e-05,
"loss": 0.0653,
"step": 2150
},
{
"epoch": 2.224510813594233,
"grad_norm": 0.5128792524337769,
"learning_rate": 9.956411166136488e-05,
"loss": 0.0579,
"step": 2160
},
{
"epoch": 2.23480947476828,
"grad_norm": 0.3266693353652954,
"learning_rate": 9.955498736829875e-05,
"loss": 0.05,
"step": 2170
},
{
"epoch": 2.2451081359423277,
"grad_norm": 0.32357707619667053,
"learning_rate": 9.954576899089049e-05,
"loss": 0.0567,
"step": 2180
},
{
"epoch": 2.255406797116375,
"grad_norm": 0.2624098062515259,
"learning_rate": 9.953645654664202e-05,
"loss": 0.0566,
"step": 2190
},
{
"epoch": 2.265705458290422,
"grad_norm": 0.43851861357688904,
"learning_rate": 9.952705005323375e-05,
"loss": 0.0575,
"step": 2200
},
{
"epoch": 2.27600411946447,
"grad_norm": 0.4381640553474426,
"learning_rate": 9.951754952852474e-05,
"loss": 0.048,
"step": 2210
},
{
"epoch": 2.286302780638517,
"grad_norm": 0.41920751333236694,
"learning_rate": 9.950795499055252e-05,
"loss": 0.0613,
"step": 2220
},
{
"epoch": 2.296601441812564,
"grad_norm": 0.32066893577575684,
"learning_rate": 9.949826645753314e-05,
"loss": 0.0542,
"step": 2230
},
{
"epoch": 2.306900102986612,
"grad_norm": 0.40712329745292664,
"learning_rate": 9.94884839478611e-05,
"loss": 0.0533,
"step": 2240
},
{
"epoch": 2.317198764160659,
"grad_norm": 0.558935284614563,
"learning_rate": 9.947860748010933e-05,
"loss": 0.062,
"step": 2250
},
{
"epoch": 2.3274974253347063,
"grad_norm": 0.6141128540039062,
"learning_rate": 9.946863707302913e-05,
"loss": 0.0531,
"step": 2260
},
{
"epoch": 2.337796086508754,
"grad_norm": 0.5253733396530151,
"learning_rate": 9.945857274555017e-05,
"loss": 0.0516,
"step": 2270
},
{
"epoch": 2.348094747682801,
"grad_norm": 0.5007606744766235,
"learning_rate": 9.944841451678043e-05,
"loss": 0.0648,
"step": 2280
},
{
"epoch": 2.358393408856849,
"grad_norm": 0.4784156382083893,
"learning_rate": 9.943816240600614e-05,
"loss": 0.0678,
"step": 2290
},
{
"epoch": 2.368692070030896,
"grad_norm": 0.5175110101699829,
"learning_rate": 9.942781643269186e-05,
"loss": 0.0584,
"step": 2300
},
{
"epoch": 2.378990731204943,
"grad_norm": 0.41106414794921875,
"learning_rate": 9.941737661648024e-05,
"loss": 0.0617,
"step": 2310
},
{
"epoch": 2.389289392378991,
"grad_norm": 0.372574120759964,
"learning_rate": 9.940684297719218e-05,
"loss": 0.0587,
"step": 2320
},
{
"epoch": 2.399588053553038,
"grad_norm": 0.7082722187042236,
"learning_rate": 9.939621553482669e-05,
"loss": 0.066,
"step": 2330
},
{
"epoch": 2.4098867147270857,
"grad_norm": 0.28144243359565735,
"learning_rate": 9.938549430956085e-05,
"loss": 0.0549,
"step": 2340
},
{
"epoch": 2.420185375901133,
"grad_norm": 0.4946507513523102,
"learning_rate": 9.937467932174985e-05,
"loss": 0.0557,
"step": 2350
},
{
"epoch": 2.43048403707518,
"grad_norm": 0.4784681499004364,
"learning_rate": 9.936377059192683e-05,
"loss": 0.0575,
"step": 2360
},
{
"epoch": 2.4407826982492278,
"grad_norm": 0.47319531440734863,
"learning_rate": 9.935276814080294e-05,
"loss": 0.0612,
"step": 2370
},
{
"epoch": 2.451081359423275,
"grad_norm": 0.3896636664867401,
"learning_rate": 9.934167198926729e-05,
"loss": 0.0554,
"step": 2380
},
{
"epoch": 2.461380020597322,
"grad_norm": 0.4796706736087799,
"learning_rate": 9.933048215838682e-05,
"loss": 0.0588,
"step": 2390
},
{
"epoch": 2.47167868177137,
"grad_norm": 0.3464389145374298,
"learning_rate": 9.931919866940642e-05,
"loss": 0.0667,
"step": 2400
},
{
"epoch": 2.481977342945417,
"grad_norm": 0.3699023425579071,
"learning_rate": 9.930782154374874e-05,
"loss": 0.0498,
"step": 2410
},
{
"epoch": 2.4922760041194643,
"grad_norm": 0.3829363286495209,
"learning_rate": 9.929635080301418e-05,
"loss": 0.0539,
"step": 2420
},
{
"epoch": 2.502574665293512,
"grad_norm": 0.47303831577301025,
"learning_rate": 9.928478646898096e-05,
"loss": 0.0439,
"step": 2430
},
{
"epoch": 2.512873326467559,
"grad_norm": 0.5764483213424683,
"learning_rate": 9.92731285636049e-05,
"loss": 0.0547,
"step": 2440
},
{
"epoch": 2.5231719876416063,
"grad_norm": 0.3862007260322571,
"learning_rate": 9.926137710901956e-05,
"loss": 0.0485,
"step": 2450
},
{
"epoch": 2.533470648815654,
"grad_norm": 0.35379621386528015,
"learning_rate": 9.924953212753607e-05,
"loss": 0.0555,
"step": 2460
},
{
"epoch": 2.543769309989701,
"grad_norm": 0.32140642404556274,
"learning_rate": 9.923759364164311e-05,
"loss": 0.0495,
"step": 2470
},
{
"epoch": 2.554067971163749,
"grad_norm": 0.530728816986084,
"learning_rate": 9.922556167400692e-05,
"loss": 0.0601,
"step": 2480
},
{
"epoch": 2.564366632337796,
"grad_norm": 0.3118293285369873,
"learning_rate": 9.92134362474712e-05,
"loss": 0.0536,
"step": 2490
},
{
"epoch": 2.5746652935118437,
"grad_norm": 0.5819094777107239,
"learning_rate": 9.920121738505713e-05,
"loss": 0.0556,
"step": 2500
},
{
"epoch": 2.584963954685891,
"grad_norm": 0.3461971580982208,
"learning_rate": 9.91889051099632e-05,
"loss": 0.0572,
"step": 2510
},
{
"epoch": 2.595262615859938,
"grad_norm": 0.4987310767173767,
"learning_rate": 9.917649944556536e-05,
"loss": 0.0501,
"step": 2520
},
{
"epoch": 2.6055612770339858,
"grad_norm": 0.4364427626132965,
"learning_rate": 9.91640004154168e-05,
"loss": 0.0493,
"step": 2530
},
{
"epoch": 2.615859938208033,
"grad_norm": 0.5407469868659973,
"learning_rate": 9.915140804324797e-05,
"loss": 0.0515,
"step": 2540
},
{
"epoch": 2.62615859938208,
"grad_norm": 0.47441163659095764,
"learning_rate": 9.913872235296657e-05,
"loss": 0.0525,
"step": 2550
},
{
"epoch": 2.636457260556128,
"grad_norm": 0.6901610493659973,
"learning_rate": 9.912594336865745e-05,
"loss": 0.0522,
"step": 2560
},
{
"epoch": 2.646755921730175,
"grad_norm": 0.29979637265205383,
"learning_rate": 9.911307111458262e-05,
"loss": 0.0521,
"step": 2570
},
{
"epoch": 2.6570545829042223,
"grad_norm": 0.44594720005989075,
"learning_rate": 9.910010561518112e-05,
"loss": 0.0502,
"step": 2580
},
{
"epoch": 2.66735324407827,
"grad_norm": 0.36554035544395447,
"learning_rate": 9.908704689506904e-05,
"loss": 0.0545,
"step": 2590
},
{
"epoch": 2.677651905252317,
"grad_norm": 0.3384472727775574,
"learning_rate": 9.907389497903949e-05,
"loss": 0.0466,
"step": 2600
},
{
"epoch": 2.6879505664263643,
"grad_norm": 0.6370285749435425,
"learning_rate": 9.906064989206249e-05,
"loss": 0.0521,
"step": 2610
},
{
"epoch": 2.698249227600412,
"grad_norm": 0.4321992099285126,
"learning_rate": 9.904731165928497e-05,
"loss": 0.0486,
"step": 2620
},
{
"epoch": 2.708547888774459,
"grad_norm": 0.35261237621307373,
"learning_rate": 9.903388030603067e-05,
"loss": 0.0529,
"step": 2630
},
{
"epoch": 2.718846549948507,
"grad_norm": 0.2982620894908905,
"learning_rate": 9.902035585780016e-05,
"loss": 0.0457,
"step": 2640
},
{
"epoch": 2.729145211122554,
"grad_norm": 0.286214143037796,
"learning_rate": 9.900673834027077e-05,
"loss": 0.0449,
"step": 2650
},
{
"epoch": 2.7394438722966017,
"grad_norm": 0.41237062215805054,
"learning_rate": 9.899302777929649e-05,
"loss": 0.0552,
"step": 2660
},
{
"epoch": 2.749742533470649,
"grad_norm": 0.2523784935474396,
"learning_rate": 9.897922420090799e-05,
"loss": 0.0501,
"step": 2670
},
{
"epoch": 2.760041194644696,
"grad_norm": 0.5662415027618408,
"learning_rate": 9.896532763131253e-05,
"loss": 0.0521,
"step": 2680
},
{
"epoch": 2.7703398558187438,
"grad_norm": 0.45098549127578735,
"learning_rate": 9.895133809689391e-05,
"loss": 0.0504,
"step": 2690
},
{
"epoch": 2.780638516992791,
"grad_norm": 0.41485512256622314,
"learning_rate": 9.893725562421249e-05,
"loss": 0.0535,
"step": 2700
},
{
"epoch": 2.790937178166838,
"grad_norm": 0.42439576983451843,
"learning_rate": 9.8923080240005e-05,
"loss": 0.046,
"step": 2710
},
{
"epoch": 2.801235839340886,
"grad_norm": 0.5364951491355896,
"learning_rate": 9.890881197118462e-05,
"loss": 0.0492,
"step": 2720
},
{
"epoch": 2.811534500514933,
"grad_norm": 0.6617226600646973,
"learning_rate": 9.889445084484086e-05,
"loss": 0.0526,
"step": 2730
},
{
"epoch": 2.8218331616889802,
"grad_norm": 0.572847843170166,
"learning_rate": 9.887999688823955e-05,
"loss": 0.0551,
"step": 2740
},
{
"epoch": 2.832131822863028,
"grad_norm": 0.5418044328689575,
"learning_rate": 9.886545012882272e-05,
"loss": 0.052,
"step": 2750
},
{
"epoch": 2.842430484037075,
"grad_norm": 0.3684234321117401,
"learning_rate": 9.885081059420866e-05,
"loss": 0.0497,
"step": 2760
},
{
"epoch": 2.8527291452111223,
"grad_norm": 0.32835209369659424,
"learning_rate": 9.883607831219176e-05,
"loss": 0.0501,
"step": 2770
},
{
"epoch": 2.86302780638517,
"grad_norm": 0.4320078194141388,
"learning_rate": 9.882125331074251e-05,
"loss": 0.0557,
"step": 2780
},
{
"epoch": 2.873326467559217,
"grad_norm": 0.3763574957847595,
"learning_rate": 9.880633561800739e-05,
"loss": 0.0466,
"step": 2790
},
{
"epoch": 2.883625128733265,
"grad_norm": 0.267978310585022,
"learning_rate": 9.879132526230895e-05,
"loss": 0.0488,
"step": 2800
},
{
"epoch": 2.893923789907312,
"grad_norm": 0.35433250665664673,
"learning_rate": 9.87762222721456e-05,
"loss": 0.0499,
"step": 2810
},
{
"epoch": 2.9042224510813597,
"grad_norm": 0.5067030787467957,
"learning_rate": 9.876102667619166e-05,
"loss": 0.0443,
"step": 2820
},
{
"epoch": 2.914521112255407,
"grad_norm": 1.2673553228378296,
"learning_rate": 9.874573850329724e-05,
"loss": 0.0547,
"step": 2830
},
{
"epoch": 2.924819773429454,
"grad_norm": 0.53081876039505,
"learning_rate": 9.873035778248826e-05,
"loss": 0.05,
"step": 2840
},
{
"epoch": 2.9351184346035017,
"grad_norm": 0.343241810798645,
"learning_rate": 9.871488454296629e-05,
"loss": 0.0508,
"step": 2850
},
{
"epoch": 2.945417095777549,
"grad_norm": 0.2269151359796524,
"learning_rate": 9.86993188141086e-05,
"loss": 0.0485,
"step": 2860
},
{
"epoch": 2.955715756951596,
"grad_norm": 0.3384031355381012,
"learning_rate": 9.868366062546803e-05,
"loss": 0.0456,
"step": 2870
},
{
"epoch": 2.966014418125644,
"grad_norm": 0.3758116662502289,
"learning_rate": 9.866791000677302e-05,
"loss": 0.0543,
"step": 2880
},
{
"epoch": 2.976313079299691,
"grad_norm": 0.3721799850463867,
"learning_rate": 9.86520669879274e-05,
"loss": 0.0496,
"step": 2890
},
{
"epoch": 2.9866117404737382,
"grad_norm": 0.4035572409629822,
"learning_rate": 9.863613159901053e-05,
"loss": 0.0488,
"step": 2900
},
{
"epoch": 2.996910401647786,
"grad_norm": 0.36414894461631775,
"learning_rate": 9.862010387027706e-05,
"loss": 0.0599,
"step": 2910
},
{
"epoch": 3.007209062821833,
"grad_norm": 0.40540945529937744,
"learning_rate": 9.860398383215701e-05,
"loss": 0.05,
"step": 2920
},
{
"epoch": 3.0175077239958807,
"grad_norm": 0.39733192324638367,
"learning_rate": 9.858777151525564e-05,
"loss": 0.0456,
"step": 2930
},
{
"epoch": 3.027806385169928,
"grad_norm": 0.4590163230895996,
"learning_rate": 9.857146695035339e-05,
"loss": 0.0452,
"step": 2940
},
{
"epoch": 3.038105046343975,
"grad_norm": 0.3180115222930908,
"learning_rate": 9.855507016840587e-05,
"loss": 0.0463,
"step": 2950
},
{
"epoch": 3.048403707518023,
"grad_norm": 0.4976675510406494,
"learning_rate": 9.853858120054377e-05,
"loss": 0.061,
"step": 2960
},
{
"epoch": 3.05870236869207,
"grad_norm": 0.3708176612854004,
"learning_rate": 9.852200007807278e-05,
"loss": 0.0544,
"step": 2970
},
{
"epoch": 3.0690010298661172,
"grad_norm": 0.3110915422439575,
"learning_rate": 9.850532683247355e-05,
"loss": 0.0538,
"step": 2980
},
{
"epoch": 3.079299691040165,
"grad_norm": 0.38979801535606384,
"learning_rate": 9.848856149540168e-05,
"loss": 0.052,
"step": 2990
},
{
"epoch": 3.089598352214212,
"grad_norm": 0.558873176574707,
"learning_rate": 9.847170409868758e-05,
"loss": 0.05,
"step": 3000
},
{
"epoch": 3.0998970133882597,
"grad_norm": 0.5149732828140259,
"learning_rate": 9.845475467433643e-05,
"loss": 0.0582,
"step": 3010
},
{
"epoch": 3.110195674562307,
"grad_norm": 0.6636197566986084,
"learning_rate": 9.843771325452815e-05,
"loss": 0.061,
"step": 3020
},
{
"epoch": 3.120494335736354,
"grad_norm": 0.47130295634269714,
"learning_rate": 9.842057987161731e-05,
"loss": 0.0511,
"step": 3030
},
{
"epoch": 3.130792996910402,
"grad_norm": 0.4013822674751282,
"learning_rate": 9.840335455813312e-05,
"loss": 0.0513,
"step": 3040
},
{
"epoch": 3.141091658084449,
"grad_norm": 0.3371870219707489,
"learning_rate": 9.838603734677926e-05,
"loss": 0.0498,
"step": 3050
},
{
"epoch": 3.151390319258496,
"grad_norm": 0.3246757686138153,
"learning_rate": 9.836862827043396e-05,
"loss": 0.0525,
"step": 3060
},
{
"epoch": 3.161688980432544,
"grad_norm": 0.5342622995376587,
"learning_rate": 9.835112736214982e-05,
"loss": 0.059,
"step": 3070
},
{
"epoch": 3.171987641606591,
"grad_norm": 0.27634263038635254,
"learning_rate": 9.833353465515376e-05,
"loss": 0.0454,
"step": 3080
},
{
"epoch": 3.1822863027806383,
"grad_norm": 0.5118007659912109,
"learning_rate": 9.831585018284707e-05,
"loss": 0.0571,
"step": 3090
},
{
"epoch": 3.192584963954686,
"grad_norm": 0.42241352796554565,
"learning_rate": 9.829807397880519e-05,
"loss": 0.0517,
"step": 3100
},
{
"epoch": 3.202883625128733,
"grad_norm": 0.5368523001670837,
"learning_rate": 9.828020607677775e-05,
"loss": 0.0595,
"step": 3110
},
{
"epoch": 3.213182286302781,
"grad_norm": 0.5250363349914551,
"learning_rate": 9.826224651068852e-05,
"loss": 0.0544,
"step": 3120
},
{
"epoch": 3.223480947476828,
"grad_norm": 0.3914489448070526,
"learning_rate": 9.82441953146352e-05,
"loss": 0.0504,
"step": 3130
},
{
"epoch": 3.233779608650875,
"grad_norm": 0.42789822816848755,
"learning_rate": 9.822605252288953e-05,
"loss": 0.0539,
"step": 3140
},
{
"epoch": 3.244078269824923,
"grad_norm": 0.2896466553211212,
"learning_rate": 9.820781816989715e-05,
"loss": 0.044,
"step": 3150
},
{
"epoch": 3.25437693099897,
"grad_norm": 0.3793918788433075,
"learning_rate": 9.818949229027752e-05,
"loss": 0.0498,
"step": 3160
},
{
"epoch": 3.2646755921730177,
"grad_norm": 0.39543598890304565,
"learning_rate": 9.817107491882388e-05,
"loss": 0.0542,
"step": 3170
},
{
"epoch": 3.274974253347065,
"grad_norm": 0.38835805654525757,
"learning_rate": 9.815256609050316e-05,
"loss": 0.0474,
"step": 3180
},
{
"epoch": 3.285272914521112,
"grad_norm": 0.37964895367622375,
"learning_rate": 9.813396584045596e-05,
"loss": 0.0541,
"step": 3190
},
{
"epoch": 3.29557157569516,
"grad_norm": 0.4047639071941376,
"learning_rate": 9.811527420399639e-05,
"loss": 0.0564,
"step": 3200
},
{
"epoch": 3.305870236869207,
"grad_norm": 0.598591148853302,
"learning_rate": 9.809649121661214e-05,
"loss": 0.0579,
"step": 3210
},
{
"epoch": 3.316168898043254,
"grad_norm": 0.5548397302627563,
"learning_rate": 9.807761691396429e-05,
"loss": 0.049,
"step": 3220
},
{
"epoch": 3.326467559217302,
"grad_norm": 0.7035659551620483,
"learning_rate": 9.805865133188731e-05,
"loss": 0.0518,
"step": 3230
},
{
"epoch": 3.336766220391349,
"grad_norm": 0.7861883044242859,
"learning_rate": 9.803959450638896e-05,
"loss": 0.0547,
"step": 3240
},
{
"epoch": 3.3470648815653963,
"grad_norm": 0.4890913665294647,
"learning_rate": 9.802044647365023e-05,
"loss": 0.0473,
"step": 3250
},
{
"epoch": 3.357363542739444,
"grad_norm": 0.34270229935646057,
"learning_rate": 9.800120727002529e-05,
"loss": 0.0575,
"step": 3260
},
{
"epoch": 3.367662203913491,
"grad_norm": 0.4263719320297241,
"learning_rate": 9.798187693204141e-05,
"loss": 0.0516,
"step": 3270
},
{
"epoch": 3.377960865087539,
"grad_norm": 0.31996503472328186,
"learning_rate": 9.796245549639885e-05,
"loss": 0.0565,
"step": 3280
},
{
"epoch": 3.388259526261586,
"grad_norm": 0.3111179769039154,
"learning_rate": 9.794294299997088e-05,
"loss": 0.0518,
"step": 3290
},
{
"epoch": 3.398558187435633,
"grad_norm": 0.44092559814453125,
"learning_rate": 9.792333947980359e-05,
"loss": 0.0506,
"step": 3300
},
{
"epoch": 3.408856848609681,
"grad_norm": 0.27052825689315796,
"learning_rate": 9.790364497311597e-05,
"loss": 0.0526,
"step": 3310
},
{
"epoch": 3.419155509783728,
"grad_norm": 0.4862545132637024,
"learning_rate": 9.788385951729966e-05,
"loss": 0.043,
"step": 3320
},
{
"epoch": 3.4294541709577757,
"grad_norm": 0.3908790647983551,
"learning_rate": 9.786398314991907e-05,
"loss": 0.0536,
"step": 3330
},
{
"epoch": 3.439752832131823,
"grad_norm": 0.5570793151855469,
"learning_rate": 9.784401590871112e-05,
"loss": 0.0543,
"step": 3340
},
{
"epoch": 3.45005149330587,
"grad_norm": 0.34768927097320557,
"learning_rate": 9.782395783158537e-05,
"loss": 0.0464,
"step": 3350
},
{
"epoch": 3.460350154479918,
"grad_norm": 0.45262208580970764,
"learning_rate": 9.780380895662373e-05,
"loss": 0.0476,
"step": 3360
},
{
"epoch": 3.470648815653965,
"grad_norm": 0.3095530569553375,
"learning_rate": 9.778356932208055e-05,
"loss": 0.0582,
"step": 3370
},
{
"epoch": 3.480947476828012,
"grad_norm": 0.3530728220939636,
"learning_rate": 9.776323896638251e-05,
"loss": 0.0506,
"step": 3380
},
{
"epoch": 3.49124613800206,
"grad_norm": 0.34243884682655334,
"learning_rate": 9.774281792812852e-05,
"loss": 0.057,
"step": 3390
},
{
"epoch": 3.501544799176107,
"grad_norm": 0.4583176076412201,
"learning_rate": 9.772230624608961e-05,
"loss": 0.0559,
"step": 3400
},
{
"epoch": 3.5118434603501543,
"grad_norm": 0.3211122751235962,
"learning_rate": 9.770170395920899e-05,
"loss": 0.0497,
"step": 3410
},
{
"epoch": 3.522142121524202,
"grad_norm": 0.6675431132316589,
"learning_rate": 9.768101110660182e-05,
"loss": 0.0588,
"step": 3420
},
{
"epoch": 3.532440782698249,
"grad_norm": 0.47643566131591797,
"learning_rate": 9.766022772755525e-05,
"loss": 0.052,
"step": 3430
},
{
"epoch": 3.5427394438722963,
"grad_norm": 0.3988637626171112,
"learning_rate": 9.763935386152827e-05,
"loss": 0.0479,
"step": 3440
},
{
"epoch": 3.553038105046344,
"grad_norm": 0.3347717225551605,
"learning_rate": 9.76183895481517e-05,
"loss": 0.0517,
"step": 3450
},
{
"epoch": 3.563336766220391,
"grad_norm": 0.4554286599159241,
"learning_rate": 9.759733482722806e-05,
"loss": 0.0671,
"step": 3460
},
{
"epoch": 3.573635427394439,
"grad_norm": 0.39437830448150635,
"learning_rate": 9.757618973873152e-05,
"loss": 0.0456,
"step": 3470
},
{
"epoch": 3.583934088568486,
"grad_norm": 0.3920309245586395,
"learning_rate": 9.755495432280781e-05,
"loss": 0.0553,
"step": 3480
},
{
"epoch": 3.5942327497425337,
"grad_norm": 0.38523244857788086,
"learning_rate": 9.75336286197742e-05,
"loss": 0.0506,
"step": 3490
},
{
"epoch": 3.604531410916581,
"grad_norm": 0.3649066984653473,
"learning_rate": 9.751221267011929e-05,
"loss": 0.0544,
"step": 3500
},
{
"epoch": 3.614830072090628,
"grad_norm": 0.5154473185539246,
"learning_rate": 9.749070651450314e-05,
"loss": 0.0556,
"step": 3510
},
{
"epoch": 3.6251287332646758,
"grad_norm": 0.4133189618587494,
"learning_rate": 9.746911019375695e-05,
"loss": 0.0483,
"step": 3520
},
{
"epoch": 3.635427394438723,
"grad_norm": 0.39976581931114197,
"learning_rate": 9.744742374888321e-05,
"loss": 0.0545,
"step": 3530
},
{
"epoch": 3.64572605561277,
"grad_norm": 0.3134078085422516,
"learning_rate": 9.742564722105543e-05,
"loss": 0.05,
"step": 3540
},
{
"epoch": 3.656024716786818,
"grad_norm": 0.5159580707550049,
"learning_rate": 9.740378065161823e-05,
"loss": 0.0546,
"step": 3550
},
{
"epoch": 3.666323377960865,
"grad_norm": 0.3549450933933258,
"learning_rate": 9.738182408208714e-05,
"loss": 0.057,
"step": 3560
},
{
"epoch": 3.6766220391349123,
"grad_norm": 0.46382319927215576,
"learning_rate": 9.735977755414855e-05,
"loss": 0.0603,
"step": 3570
},
{
"epoch": 3.68692070030896,
"grad_norm": 0.43110236525535583,
"learning_rate": 9.733764110965966e-05,
"loss": 0.0524,
"step": 3580
},
{
"epoch": 3.697219361483007,
"grad_norm": 0.4503861367702484,
"learning_rate": 9.73154147906484e-05,
"loss": 0.0541,
"step": 3590
},
{
"epoch": 3.7075180226570543,
"grad_norm": 0.39958786964416504,
"learning_rate": 9.729309863931333e-05,
"loss": 0.053,
"step": 3600
},
{
"epoch": 3.717816683831102,
"grad_norm": 0.3972841203212738,
"learning_rate": 9.727069269802352e-05,
"loss": 0.0445,
"step": 3610
},
{
"epoch": 3.728115345005149,
"grad_norm": 0.38955458998680115,
"learning_rate": 9.724819700931858e-05,
"loss": 0.0477,
"step": 3620
},
{
"epoch": 3.738414006179197,
"grad_norm": 0.4105938673019409,
"learning_rate": 9.722561161590845e-05,
"loss": 0.0537,
"step": 3630
},
{
"epoch": 3.748712667353244,
"grad_norm": 0.5155870914459229,
"learning_rate": 9.720293656067345e-05,
"loss": 0.0484,
"step": 3640
},
{
"epoch": 3.7590113285272917,
"grad_norm": 0.6213602423667908,
"learning_rate": 9.718017188666406e-05,
"loss": 0.0568,
"step": 3650
},
{
"epoch": 3.769309989701339,
"grad_norm": 0.5646706223487854,
"learning_rate": 9.715731763710097e-05,
"loss": 0.0531,
"step": 3660
},
{
"epoch": 3.779608650875386,
"grad_norm": 0.3185526430606842,
"learning_rate": 9.713437385537489e-05,
"loss": 0.0481,
"step": 3670
},
{
"epoch": 3.7899073120494338,
"grad_norm": 0.34746411442756653,
"learning_rate": 9.711134058504653e-05,
"loss": 0.0469,
"step": 3680
},
{
"epoch": 3.800205973223481,
"grad_norm": 0.6697479486465454,
"learning_rate": 9.708821786984652e-05,
"loss": 0.05,
"step": 3690
},
{
"epoch": 3.810504634397528,
"grad_norm": 0.4407866597175598,
"learning_rate": 9.70650057536753e-05,
"loss": 0.049,
"step": 3700
},
{
"epoch": 3.820803295571576,
"grad_norm": 0.41101008653640747,
"learning_rate": 9.7041704280603e-05,
"loss": 0.0496,
"step": 3710
},
{
"epoch": 3.831101956745623,
"grad_norm": 0.4155142307281494,
"learning_rate": 9.70183134948695e-05,
"loss": 0.046,
"step": 3720
},
{
"epoch": 3.8414006179196702,
"grad_norm": 0.2584857940673828,
"learning_rate": 9.699483344088414e-05,
"loss": 0.0457,
"step": 3730
},
{
"epoch": 3.851699279093718,
"grad_norm": 0.348079115152359,
"learning_rate": 9.697126416322582e-05,
"loss": 0.0515,
"step": 3740
},
{
"epoch": 3.861997940267765,
"grad_norm": 0.34925252199172974,
"learning_rate": 9.694760570664278e-05,
"loss": 0.0503,
"step": 3750
},
{
"epoch": 3.8722966014418123,
"grad_norm": 0.39806774258613586,
"learning_rate": 9.692385811605263e-05,
"loss": 0.0534,
"step": 3760
},
{
"epoch": 3.88259526261586,
"grad_norm": 0.5874518156051636,
"learning_rate": 9.690002143654218e-05,
"loss": 0.0506,
"step": 3770
},
{
"epoch": 3.892893923789907,
"grad_norm": 0.4007057547569275,
"learning_rate": 9.687609571336739e-05,
"loss": 0.0576,
"step": 3780
},
{
"epoch": 3.903192584963955,
"grad_norm": 0.22046445310115814,
"learning_rate": 9.685208099195327e-05,
"loss": 0.0414,
"step": 3790
},
{
"epoch": 3.913491246138002,
"grad_norm": 0.28014832735061646,
"learning_rate": 9.682797731789378e-05,
"loss": 0.052,
"step": 3800
},
{
"epoch": 3.9237899073120497,
"grad_norm": 0.4397921562194824,
"learning_rate": 9.680378473695181e-05,
"loss": 0.0532,
"step": 3810
},
{
"epoch": 3.934088568486097,
"grad_norm": 0.3825766444206238,
"learning_rate": 9.677950329505902e-05,
"loss": 0.0567,
"step": 3820
},
{
"epoch": 3.944387229660144,
"grad_norm": 0.25868862867355347,
"learning_rate": 9.675513303831579e-05,
"loss": 0.0559,
"step": 3830
},
{
"epoch": 3.9546858908341918,
"grad_norm": 0.2990769147872925,
"learning_rate": 9.673067401299112e-05,
"loss": 0.05,
"step": 3840
},
{
"epoch": 3.964984552008239,
"grad_norm": 0.48668310046195984,
"learning_rate": 9.670612626552251e-05,
"loss": 0.0439,
"step": 3850
},
{
"epoch": 3.975283213182286,
"grad_norm": 0.27801236510276794,
"learning_rate": 9.668148984251597e-05,
"loss": 0.055,
"step": 3860
},
{
"epoch": 3.985581874356334,
"grad_norm": 0.4435347318649292,
"learning_rate": 9.665676479074583e-05,
"loss": 0.0514,
"step": 3870
},
{
"epoch": 3.995880535530381,
"grad_norm": 0.28677383065223694,
"learning_rate": 9.663195115715471e-05,
"loss": 0.0459,
"step": 3880
},
{
"epoch": 4.006179196704428,
"grad_norm": 0.3085887134075165,
"learning_rate": 9.660704898885337e-05,
"loss": 0.0475,
"step": 3890
},
{
"epoch": 4.016477857878476,
"grad_norm": 0.37418079376220703,
"learning_rate": 9.65820583331207e-05,
"loss": 0.05,
"step": 3900
},
{
"epoch": 4.0267765190525235,
"grad_norm": 0.3386285901069641,
"learning_rate": 9.655697923740357e-05,
"loss": 0.0491,
"step": 3910
},
{
"epoch": 4.03707518022657,
"grad_norm": 0.36807170510292053,
"learning_rate": 9.653181174931676e-05,
"loss": 0.0445,
"step": 3920
},
{
"epoch": 4.047373841400618,
"grad_norm": 0.39420345425605774,
"learning_rate": 9.65065559166429e-05,
"loss": 0.0487,
"step": 3930
},
{
"epoch": 4.057672502574666,
"grad_norm": 0.45534148812294006,
"learning_rate": 9.648121178733233e-05,
"loss": 0.0475,
"step": 3940
},
{
"epoch": 4.067971163748712,
"grad_norm": 0.552058756351471,
"learning_rate": 9.6455779409503e-05,
"loss": 0.0517,
"step": 3950
},
{
"epoch": 4.07826982492276,
"grad_norm": 0.5828810334205627,
"learning_rate": 9.643025883144046e-05,
"loss": 0.0489,
"step": 3960
},
{
"epoch": 4.088568486096808,
"grad_norm": 0.4259772002696991,
"learning_rate": 9.640465010159767e-05,
"loss": 0.0544,
"step": 3970
},
{
"epoch": 4.098867147270854,
"grad_norm": 0.6728827953338623,
"learning_rate": 9.637895326859498e-05,
"loss": 0.0531,
"step": 3980
},
{
"epoch": 4.109165808444902,
"grad_norm": 0.4142812490463257,
"learning_rate": 9.635316838122002e-05,
"loss": 0.0513,
"step": 3990
},
{
"epoch": 4.11946446961895,
"grad_norm": 0.4978756904602051,
"learning_rate": 9.632729548842756e-05,
"loss": 0.0464,
"step": 4000
},
{
"epoch": 4.1297631307929965,
"grad_norm": 0.4538825750350952,
"learning_rate": 9.630133463933948e-05,
"loss": 0.0512,
"step": 4010
},
{
"epoch": 4.140061791967044,
"grad_norm": 0.5200012922286987,
"learning_rate": 9.627528588324469e-05,
"loss": 0.0461,
"step": 4020
},
{
"epoch": 4.150360453141092,
"grad_norm": 0.5186310410499573,
"learning_rate": 9.62491492695989e-05,
"loss": 0.0529,
"step": 4030
},
{
"epoch": 4.1606591143151395,
"grad_norm": 0.34113621711730957,
"learning_rate": 9.622292484802473e-05,
"loss": 0.0631,
"step": 4040
},
{
"epoch": 4.170957775489186,
"grad_norm": 0.33333924412727356,
"learning_rate": 9.619661266831145e-05,
"loss": 0.0468,
"step": 4050
},
{
"epoch": 4.181256436663234,
"grad_norm": 0.41653305292129517,
"learning_rate": 9.617021278041499e-05,
"loss": 0.0536,
"step": 4060
},
{
"epoch": 4.1915550978372815,
"grad_norm": 0.2886696755886078,
"learning_rate": 9.614372523445771e-05,
"loss": 0.0496,
"step": 4070
},
{
"epoch": 4.201853759011328,
"grad_norm": 0.29733291268348694,
"learning_rate": 9.611715008072853e-05,
"loss": 0.0473,
"step": 4080
},
{
"epoch": 4.212152420185376,
"grad_norm": 0.332317590713501,
"learning_rate": 9.609048736968259e-05,
"loss": 0.0446,
"step": 4090
},
{
"epoch": 4.222451081359424,
"grad_norm": 0.30169472098350525,
"learning_rate": 9.606373715194133e-05,
"loss": 0.0531,
"step": 4100
},
{
"epoch": 4.23274974253347,
"grad_norm": 0.4030070900917053,
"learning_rate": 9.603689947829226e-05,
"loss": 0.0502,
"step": 4110
},
{
"epoch": 4.243048403707518,
"grad_norm": 0.6141675710678101,
"learning_rate": 9.600997439968904e-05,
"loss": 0.0459,
"step": 4120
},
{
"epoch": 4.253347064881566,
"grad_norm": 0.3945510685443878,
"learning_rate": 9.598296196725117e-05,
"loss": 0.0495,
"step": 4130
},
{
"epoch": 4.263645726055612,
"grad_norm": 0.3716789186000824,
"learning_rate": 9.595586223226405e-05,
"loss": 0.0483,
"step": 4140
},
{
"epoch": 4.27394438722966,
"grad_norm": 0.31221863627433777,
"learning_rate": 9.592867524617882e-05,
"loss": 0.0501,
"step": 4150
},
{
"epoch": 4.284243048403708,
"grad_norm": 0.37691348791122437,
"learning_rate": 9.59014010606123e-05,
"loss": 0.0469,
"step": 4160
},
{
"epoch": 4.2945417095777545,
"grad_norm": 0.3065001666545868,
"learning_rate": 9.58740397273468e-05,
"loss": 0.0546,
"step": 4170
},
{
"epoch": 4.304840370751802,
"grad_norm": 0.48879119753837585,
"learning_rate": 9.584659129833018e-05,
"loss": 0.049,
"step": 4180
},
{
"epoch": 4.31513903192585,
"grad_norm": 0.5020750164985657,
"learning_rate": 9.581905582567557e-05,
"loss": 0.0538,
"step": 4190
},
{
"epoch": 4.325437693099897,
"grad_norm": 0.29413041472435,
"learning_rate": 9.579143336166142e-05,
"loss": 0.0422,
"step": 4200
},
{
"epoch": 4.335736354273944,
"grad_norm": 0.23740436136722565,
"learning_rate": 9.57637239587313e-05,
"loss": 0.0445,
"step": 4210
},
{
"epoch": 4.346035015447992,
"grad_norm": 0.34985145926475525,
"learning_rate": 9.573592766949387e-05,
"loss": 0.0459,
"step": 4220
},
{
"epoch": 4.3563336766220395,
"grad_norm": 0.396068811416626,
"learning_rate": 9.570804454672274e-05,
"loss": 0.0417,
"step": 4230
},
{
"epoch": 4.366632337796086,
"grad_norm": 0.45746490359306335,
"learning_rate": 9.568007464335637e-05,
"loss": 0.0532,
"step": 4240
},
{
"epoch": 4.376930998970134,
"grad_norm": 0.36292997002601624,
"learning_rate": 9.565201801249801e-05,
"loss": 0.0524,
"step": 4250
},
{
"epoch": 4.387229660144182,
"grad_norm": 0.27282604575157166,
"learning_rate": 9.562387470741554e-05,
"loss": 0.0488,
"step": 4260
},
{
"epoch": 4.397528321318228,
"grad_norm": 0.29837465286254883,
"learning_rate": 9.559564478154141e-05,
"loss": 0.0547,
"step": 4270
},
{
"epoch": 4.407826982492276,
"grad_norm": 0.43137961626052856,
"learning_rate": 9.556732828847253e-05,
"loss": 0.0453,
"step": 4280
},
{
"epoch": 4.418125643666324,
"grad_norm": 0.4074147343635559,
"learning_rate": 9.553892528197015e-05,
"loss": 0.0463,
"step": 4290
},
{
"epoch": 4.42842430484037,
"grad_norm": 0.3655324876308441,
"learning_rate": 9.551043581595979e-05,
"loss": 0.0501,
"step": 4300
},
{
"epoch": 4.438722966014418,
"grad_norm": 0.2569686472415924,
"learning_rate": 9.548185994453111e-05,
"loss": 0.0477,
"step": 4310
},
{
"epoch": 4.449021627188466,
"grad_norm": 0.27901747822761536,
"learning_rate": 9.545319772193784e-05,
"loss": 0.0444,
"step": 4320
},
{
"epoch": 4.4593202883625125,
"grad_norm": 0.6347026228904724,
"learning_rate": 9.542444920259763e-05,
"loss": 0.0483,
"step": 4330
},
{
"epoch": 4.46961894953656,
"grad_norm": 0.24210530519485474,
"learning_rate": 9.539561444109199e-05,
"loss": 0.0474,
"step": 4340
},
{
"epoch": 4.479917610710608,
"grad_norm": 0.7740889191627502,
"learning_rate": 9.536669349216613e-05,
"loss": 0.0513,
"step": 4350
},
{
"epoch": 4.490216271884655,
"grad_norm": 0.29452022910118103,
"learning_rate": 9.533768641072893e-05,
"loss": 0.0422,
"step": 4360
},
{
"epoch": 4.500514933058702,
"grad_norm": 0.42809170484542847,
"learning_rate": 9.530859325185282e-05,
"loss": 0.0531,
"step": 4370
},
{
"epoch": 4.51081359423275,
"grad_norm": 0.4382365047931671,
"learning_rate": 9.52794140707736e-05,
"loss": 0.0507,
"step": 4380
},
{
"epoch": 4.521112255406797,
"grad_norm": 0.2706674635410309,
"learning_rate": 9.52501489228904e-05,
"loss": 0.0406,
"step": 4390
},
{
"epoch": 4.531410916580844,
"grad_norm": 0.32525891065597534,
"learning_rate": 9.522079786376563e-05,
"loss": 0.0566,
"step": 4400
},
{
"epoch": 4.541709577754892,
"grad_norm": 0.4712689220905304,
"learning_rate": 9.519136094912473e-05,
"loss": 0.0502,
"step": 4410
},
{
"epoch": 4.55200823892894,
"grad_norm": 0.4823038578033447,
"learning_rate": 9.516183823485618e-05,
"loss": 0.048,
"step": 4420
},
{
"epoch": 4.562306900102986,
"grad_norm": 0.5360227227210999,
"learning_rate": 9.513222977701137e-05,
"loss": 0.0482,
"step": 4430
},
{
"epoch": 4.572605561277034,
"grad_norm": 0.5071778893470764,
"learning_rate": 9.510253563180447e-05,
"loss": 0.0483,
"step": 4440
},
{
"epoch": 4.582904222451082,
"grad_norm": 0.306821346282959,
"learning_rate": 9.507275585561229e-05,
"loss": 0.0535,
"step": 4450
},
{
"epoch": 4.593202883625128,
"grad_norm": 0.3385528326034546,
"learning_rate": 9.504289050497429e-05,
"loss": 0.0495,
"step": 4460
},
{
"epoch": 4.603501544799176,
"grad_norm": 0.3122556209564209,
"learning_rate": 9.501293963659237e-05,
"loss": 0.0497,
"step": 4470
},
{
"epoch": 4.613800205973224,
"grad_norm": 0.4538201093673706,
"learning_rate": 9.498290330733078e-05,
"loss": 0.0497,
"step": 4480
},
{
"epoch": 4.6240988671472705,
"grad_norm": 0.3034268617630005,
"learning_rate": 9.495278157421604e-05,
"loss": 0.0492,
"step": 4490
},
{
"epoch": 4.634397528321318,
"grad_norm": 0.3226306140422821,
"learning_rate": 9.492257449443683e-05,
"loss": 0.0479,
"step": 4500
},
{
"epoch": 4.644696189495366,
"grad_norm": 0.30091768503189087,
"learning_rate": 9.48922821253438e-05,
"loss": 0.0523,
"step": 4510
},
{
"epoch": 4.6549948506694125,
"grad_norm": 0.28747081756591797,
"learning_rate": 9.486190452444963e-05,
"loss": 0.0529,
"step": 4520
},
{
"epoch": 4.66529351184346,
"grad_norm": 0.352568656206131,
"learning_rate": 9.483144174942874e-05,
"loss": 0.0406,
"step": 4530
},
{
"epoch": 4.675592173017508,
"grad_norm": 0.7357869148254395,
"learning_rate": 9.480089385811733e-05,
"loss": 0.0519,
"step": 4540
},
{
"epoch": 4.6858908341915555,
"grad_norm": 0.38945066928863525,
"learning_rate": 9.477026090851311e-05,
"loss": 0.0535,
"step": 4550
},
{
"epoch": 4.696189495365602,
"grad_norm": 0.2964130640029907,
"learning_rate": 9.473954295877535e-05,
"loss": 0.0485,
"step": 4560
},
{
"epoch": 4.70648815653965,
"grad_norm": 0.3560560643672943,
"learning_rate": 9.470874006722468e-05,
"loss": 0.0498,
"step": 4570
},
{
"epoch": 4.716786817713698,
"grad_norm": 0.3421489894390106,
"learning_rate": 9.4677852292343e-05,
"loss": 0.0526,
"step": 4580
},
{
"epoch": 4.727085478887744,
"grad_norm": 0.36280402541160583,
"learning_rate": 9.464687969277338e-05,
"loss": 0.0424,
"step": 4590
},
{
"epoch": 4.737384140061792,
"grad_norm": 0.2949029505252838,
"learning_rate": 9.46158223273199e-05,
"loss": 0.0471,
"step": 4600
},
{
"epoch": 4.74768280123584,
"grad_norm": 0.20883090794086456,
"learning_rate": 9.458468025494763e-05,
"loss": 0.0514,
"step": 4610
},
{
"epoch": 4.757981462409886,
"grad_norm": 0.3708082139492035,
"learning_rate": 9.45534535347824e-05,
"loss": 0.0538,
"step": 4620
},
{
"epoch": 4.768280123583934,
"grad_norm": 0.2781459093093872,
"learning_rate": 9.452214222611079e-05,
"loss": 0.0508,
"step": 4630
},
{
"epoch": 4.778578784757982,
"grad_norm": 0.3967878818511963,
"learning_rate": 9.449074638837999e-05,
"loss": 0.0427,
"step": 4640
},
{
"epoch": 4.7888774459320285,
"grad_norm": 0.4552743434906006,
"learning_rate": 9.445926608119765e-05,
"loss": 0.0534,
"step": 4650
},
{
"epoch": 4.799176107106076,
"grad_norm": 0.27807968854904175,
"learning_rate": 9.442770136433181e-05,
"loss": 0.0615,
"step": 4660
},
{
"epoch": 4.809474768280124,
"grad_norm": 0.46061357855796814,
"learning_rate": 9.439605229771074e-05,
"loss": 0.0507,
"step": 4670
},
{
"epoch": 4.819773429454171,
"grad_norm": 0.22432656586170197,
"learning_rate": 9.436431894142288e-05,
"loss": 0.0447,
"step": 4680
},
{
"epoch": 4.830072090628218,
"grad_norm": 0.2655046284198761,
"learning_rate": 9.43325013557167e-05,
"loss": 0.0522,
"step": 4690
},
{
"epoch": 4.840370751802266,
"grad_norm": 0.30245622992515564,
"learning_rate": 9.430059960100057e-05,
"loss": 0.0573,
"step": 4700
},
{
"epoch": 4.850669412976313,
"grad_norm": 0.40526264905929565,
"learning_rate": 9.42686137378427e-05,
"loss": 0.0509,
"step": 4710
},
{
"epoch": 4.86096807415036,
"grad_norm": 0.5215529203414917,
"learning_rate": 9.423654382697096e-05,
"loss": 0.0525,
"step": 4720
},
{
"epoch": 4.871266735324408,
"grad_norm": 0.25576338171958923,
"learning_rate": 9.420438992927276e-05,
"loss": 0.0456,
"step": 4730
},
{
"epoch": 4.8815653964984556,
"grad_norm": 0.4159855544567108,
"learning_rate": 9.417215210579502e-05,
"loss": 0.047,
"step": 4740
},
{
"epoch": 4.891864057672502,
"grad_norm": 0.3707447946071625,
"learning_rate": 9.4139830417744e-05,
"loss": 0.0512,
"step": 4750
},
{
"epoch": 4.90216271884655,
"grad_norm": 1.0935066938400269,
"learning_rate": 9.41074249264851e-05,
"loss": 0.0463,
"step": 4760
},
{
"epoch": 4.912461380020598,
"grad_norm": 0.1729433387517929,
"learning_rate": 9.407493569354296e-05,
"loss": 0.052,
"step": 4770
},
{
"epoch": 4.922760041194644,
"grad_norm": 0.5101364850997925,
"learning_rate": 9.404236278060111e-05,
"loss": 0.0452,
"step": 4780
},
{
"epoch": 4.933058702368692,
"grad_norm": 0.4424050748348236,
"learning_rate": 9.400970624950198e-05,
"loss": 0.0469,
"step": 4790
},
{
"epoch": 4.94335736354274,
"grad_norm": 0.4668126106262207,
"learning_rate": 9.397696616224679e-05,
"loss": 0.0475,
"step": 4800
},
{
"epoch": 4.9536560247167865,
"grad_norm": 0.39517247676849365,
"learning_rate": 9.394414258099533e-05,
"loss": 0.0532,
"step": 4810
},
{
"epoch": 4.963954685890834,
"grad_norm": 0.30740824341773987,
"learning_rate": 9.391123556806598e-05,
"loss": 0.046,
"step": 4820
},
{
"epoch": 4.974253347064882,
"grad_norm": 0.5773882865905762,
"learning_rate": 9.387824518593546e-05,
"loss": 0.0518,
"step": 4830
},
{
"epoch": 4.9845520082389285,
"grad_norm": 0.35598331689834595,
"learning_rate": 9.384517149723885e-05,
"loss": 0.0596,
"step": 4840
},
{
"epoch": 4.994850669412976,
"grad_norm": 0.37489575147628784,
"learning_rate": 9.381201456476933e-05,
"loss": 0.051,
"step": 4850
},
{
"epoch": 5.005149330587024,
"grad_norm": 0.539053201675415,
"learning_rate": 9.377877445147812e-05,
"loss": 0.0502,
"step": 4860
},
{
"epoch": 5.0154479917610715,
"grad_norm": 0.26694029569625854,
"learning_rate": 9.374545122047443e-05,
"loss": 0.0509,
"step": 4870
},
{
"epoch": 5.025746652935118,
"grad_norm": 0.3624720573425293,
"learning_rate": 9.371204493502522e-05,
"loss": 0.0482,
"step": 4880
},
{
"epoch": 5.036045314109166,
"grad_norm": 0.480744332075119,
"learning_rate": 9.367855565855514e-05,
"loss": 0.0422,
"step": 4890
},
{
"epoch": 5.0463439752832135,
"grad_norm": 0.3989076018333435,
"learning_rate": 9.364498345464647e-05,
"loss": 0.0477,
"step": 4900
},
{
"epoch": 5.05664263645726,
"grad_norm": 0.23205599188804626,
"learning_rate": 9.361132838703882e-05,
"loss": 0.0494,
"step": 4910
},
{
"epoch": 5.066941297631308,
"grad_norm": 0.39161673188209534,
"learning_rate": 9.357759051962921e-05,
"loss": 0.0459,
"step": 4920
},
{
"epoch": 5.077239958805356,
"grad_norm": 0.32930952310562134,
"learning_rate": 9.354376991647184e-05,
"loss": 0.0457,
"step": 4930
},
{
"epoch": 5.087538619979402,
"grad_norm": 0.3652319014072418,
"learning_rate": 9.350986664177802e-05,
"loss": 0.0395,
"step": 4940
},
{
"epoch": 5.09783728115345,
"grad_norm": 0.27907899022102356,
"learning_rate": 9.347588075991596e-05,
"loss": 0.0557,
"step": 4950
},
{
"epoch": 5.108135942327498,
"grad_norm": 0.366009384393692,
"learning_rate": 9.344181233541072e-05,
"loss": 0.054,
"step": 4960
},
{
"epoch": 5.1184346035015444,
"grad_norm": 0.506394624710083,
"learning_rate": 9.340766143294412e-05,
"loss": 0.0492,
"step": 4970
},
{
"epoch": 5.128733264675592,
"grad_norm": 0.39846888184547424,
"learning_rate": 9.337342811735454e-05,
"loss": 0.0535,
"step": 4980
},
{
"epoch": 5.13903192584964,
"grad_norm": 0.4480404257774353,
"learning_rate": 9.333911245363682e-05,
"loss": 0.0464,
"step": 4990
},
{
"epoch": 5.1493305870236865,
"grad_norm": 0.30809465050697327,
"learning_rate": 9.330471450694216e-05,
"loss": 0.0524,
"step": 5000
},
{
"epoch": 5.159629248197734,
"grad_norm": 0.4051932692527771,
"learning_rate": 9.327023434257796e-05,
"loss": 0.0503,
"step": 5010
},
{
"epoch": 5.169927909371782,
"grad_norm": 0.5069543123245239,
"learning_rate": 9.323567202600776e-05,
"loss": 0.0507,
"step": 5020
},
{
"epoch": 5.1802265705458295,
"grad_norm": 0.4128740131855011,
"learning_rate": 9.320102762285103e-05,
"loss": 0.0549,
"step": 5030
},
{
"epoch": 5.190525231719876,
"grad_norm": 0.4237695038318634,
"learning_rate": 9.31663011988831e-05,
"loss": 0.0455,
"step": 5040
},
{
"epoch": 5.200823892893924,
"grad_norm": 0.6371134519577026,
"learning_rate": 9.313149282003507e-05,
"loss": 0.0524,
"step": 5050
},
{
"epoch": 5.2111225540679715,
"grad_norm": 0.4311446249485016,
"learning_rate": 9.309660255239352e-05,
"loss": 0.0404,
"step": 5060
},
{
"epoch": 5.221421215242018,
"grad_norm": 0.3251447081565857,
"learning_rate": 9.306163046220064e-05,
"loss": 0.0518,
"step": 5070
},
{
"epoch": 5.231719876416066,
"grad_norm": 0.4141363799571991,
"learning_rate": 9.30265766158539e-05,
"loss": 0.0512,
"step": 5080
},
{
"epoch": 5.242018537590114,
"grad_norm": 0.3799976408481598,
"learning_rate": 9.299144107990596e-05,
"loss": 0.0438,
"step": 5090
},
{
"epoch": 5.25231719876416,
"grad_norm": 0.3431903123855591,
"learning_rate": 9.295622392106466e-05,
"loss": 0.0594,
"step": 5100
},
{
"epoch": 5.262615859938208,
"grad_norm": 0.360916405916214,
"learning_rate": 9.292092520619273e-05,
"loss": 0.0506,
"step": 5110
},
{
"epoch": 5.272914521112256,
"grad_norm": 0.559694766998291,
"learning_rate": 9.288554500230777e-05,
"loss": 0.0517,
"step": 5120
},
{
"epoch": 5.283213182286302,
"grad_norm": 0.6030452251434326,
"learning_rate": 9.285008337658212e-05,
"loss": 0.0469,
"step": 5130
},
{
"epoch": 5.29351184346035,
"grad_norm": 0.4279625713825226,
"learning_rate": 9.281454039634265e-05,
"loss": 0.0489,
"step": 5140
},
{
"epoch": 5.303810504634398,
"grad_norm": 0.525072455406189,
"learning_rate": 9.277891612907074e-05,
"loss": 0.0522,
"step": 5150
},
{
"epoch": 5.3141091658084445,
"grad_norm": 0.35859811305999756,
"learning_rate": 9.274321064240207e-05,
"loss": 0.055,
"step": 5160
},
{
"epoch": 5.324407826982492,
"grad_norm": 0.33742043375968933,
"learning_rate": 9.270742400412653e-05,
"loss": 0.0408,
"step": 5170
},
{
"epoch": 5.33470648815654,
"grad_norm": 0.5135817527770996,
"learning_rate": 9.26715562821881e-05,
"loss": 0.0452,
"step": 5180
},
{
"epoch": 5.3450051493305875,
"grad_norm": 0.4040980041027069,
"learning_rate": 9.263560754468468e-05,
"loss": 0.0486,
"step": 5190
},
{
"epoch": 5.355303810504634,
"grad_norm": 0.5518568158149719,
"learning_rate": 9.259957785986799e-05,
"loss": 0.0526,
"step": 5200
},
{
"epoch": 5.365602471678682,
"grad_norm": 0.31775131821632385,
"learning_rate": 9.256346729614342e-05,
"loss": 0.0462,
"step": 5210
},
{
"epoch": 5.3759011328527295,
"grad_norm": 0.34002628922462463,
"learning_rate": 9.252727592206997e-05,
"loss": 0.0521,
"step": 5220
},
{
"epoch": 5.386199794026776,
"grad_norm": 0.4139520227909088,
"learning_rate": 9.249100380636003e-05,
"loss": 0.0476,
"step": 5230
},
{
"epoch": 5.396498455200824,
"grad_norm": 0.268572062253952,
"learning_rate": 9.245465101787927e-05,
"loss": 0.0432,
"step": 5240
},
{
"epoch": 5.406797116374872,
"grad_norm": 0.32383212447166443,
"learning_rate": 9.241821762564653e-05,
"loss": 0.0408,
"step": 5250
},
{
"epoch": 5.417095777548918,
"grad_norm": 0.38563185930252075,
"learning_rate": 9.23817036988337e-05,
"loss": 0.0497,
"step": 5260
},
{
"epoch": 5.427394438722966,
"grad_norm": 0.27567192912101746,
"learning_rate": 9.234510930676558e-05,
"loss": 0.0486,
"step": 5270
},
{
"epoch": 5.437693099897014,
"grad_norm": 0.563594400882721,
"learning_rate": 9.23084345189197e-05,
"loss": 0.05,
"step": 5280
},
{
"epoch": 5.44799176107106,
"grad_norm": 0.4761587381362915,
"learning_rate": 9.227167940492626e-05,
"loss": 0.0479,
"step": 5290
},
{
"epoch": 5.458290422245108,
"grad_norm": 0.38452261686325073,
"learning_rate": 9.223484403456797e-05,
"loss": 0.045,
"step": 5300
},
{
"epoch": 5.468589083419156,
"grad_norm": 0.32944151759147644,
"learning_rate": 9.219792847777989e-05,
"loss": 0.0496,
"step": 5310
},
{
"epoch": 5.4788877445932025,
"grad_norm": 0.4086971580982208,
"learning_rate": 9.216093280464933e-05,
"loss": 0.0497,
"step": 5320
},
{
"epoch": 5.48918640576725,
"grad_norm": 0.20752595365047455,
"learning_rate": 9.212385708541571e-05,
"loss": 0.0471,
"step": 5330
},
{
"epoch": 5.499485066941298,
"grad_norm": 0.38301628828048706,
"learning_rate": 9.208670139047041e-05,
"loss": 0.0502,
"step": 5340
},
{
"epoch": 5.509783728115345,
"grad_norm": 0.2867702543735504,
"learning_rate": 9.204946579035671e-05,
"loss": 0.0465,
"step": 5350
},
{
"epoch": 5.520082389289392,
"grad_norm": 0.3082829415798187,
"learning_rate": 9.201215035576953e-05,
"loss": 0.0509,
"step": 5360
},
{
"epoch": 5.53038105046344,
"grad_norm": 0.36042046546936035,
"learning_rate": 9.197475515755535e-05,
"loss": 0.047,
"step": 5370
},
{
"epoch": 5.5406797116374875,
"grad_norm": 0.39044928550720215,
"learning_rate": 9.193728026671218e-05,
"loss": 0.0482,
"step": 5380
},
{
"epoch": 5.550978372811534,
"grad_norm": 0.4395255148410797,
"learning_rate": 9.189972575438923e-05,
"loss": 0.0488,
"step": 5390
},
{
"epoch": 5.561277033985582,
"grad_norm": 0.35821837186813354,
"learning_rate": 9.186209169188695e-05,
"loss": 0.0509,
"step": 5400
},
{
"epoch": 5.57157569515963,
"grad_norm": 0.6066220998764038,
"learning_rate": 9.182437815065679e-05,
"loss": 0.0516,
"step": 5410
},
{
"epoch": 5.581874356333676,
"grad_norm": 0.3900398910045624,
"learning_rate": 9.178658520230108e-05,
"loss": 0.0489,
"step": 5420
},
{
"epoch": 5.592173017507724,
"grad_norm": 0.4040411114692688,
"learning_rate": 9.174871291857296e-05,
"loss": 0.0402,
"step": 5430
},
{
"epoch": 5.602471678681772,
"grad_norm": 0.49822893738746643,
"learning_rate": 9.171076137137617e-05,
"loss": 0.0479,
"step": 5440
},
{
"epoch": 5.612770339855818,
"grad_norm": 0.4336659610271454,
"learning_rate": 9.167273063276493e-05,
"loss": 0.0484,
"step": 5450
},
{
"epoch": 5.623069001029866,
"grad_norm": 0.5972550511360168,
"learning_rate": 9.163462077494382e-05,
"loss": 0.0459,
"step": 5460
},
{
"epoch": 5.633367662203914,
"grad_norm": 0.28320547938346863,
"learning_rate": 9.159643187026762e-05,
"loss": 0.0478,
"step": 5470
},
{
"epoch": 5.6436663233779605,
"grad_norm": 0.4079047739505768,
"learning_rate": 9.155816399124125e-05,
"loss": 0.0477,
"step": 5480
},
{
"epoch": 5.653964984552008,
"grad_norm": 0.6271510124206543,
"learning_rate": 9.151981721051946e-05,
"loss": 0.0388,
"step": 5490
},
{
"epoch": 5.664263645726056,
"grad_norm": 0.3322283625602722,
"learning_rate": 9.148139160090688e-05,
"loss": 0.0411,
"step": 5500
},
{
"epoch": 5.674562306900103,
"grad_norm": 0.4952861964702606,
"learning_rate": 9.144288723535781e-05,
"loss": 0.0494,
"step": 5510
},
{
"epoch": 5.68486096807415,
"grad_norm": 0.40626558661460876,
"learning_rate": 9.140430418697603e-05,
"loss": 0.0416,
"step": 5520
},
{
"epoch": 5.695159629248198,
"grad_norm": 0.2986306846141815,
"learning_rate": 9.136564252901474e-05,
"loss": 0.0453,
"step": 5530
},
{
"epoch": 5.705458290422245,
"grad_norm": 0.3145788013935089,
"learning_rate": 9.132690233487635e-05,
"loss": 0.0495,
"step": 5540
},
{
"epoch": 5.715756951596292,
"grad_norm": 0.4465279281139374,
"learning_rate": 9.128808367811241e-05,
"loss": 0.0465,
"step": 5550
},
{
"epoch": 5.72605561277034,
"grad_norm": 0.3666897714138031,
"learning_rate": 9.124918663242346e-05,
"loss": 0.0515,
"step": 5560
},
{
"epoch": 5.736354273944388,
"grad_norm": 0.5630766749382019,
"learning_rate": 9.121021127165878e-05,
"loss": 0.044,
"step": 5570
},
{
"epoch": 5.746652935118434,
"grad_norm": 0.3722570836544037,
"learning_rate": 9.117115766981644e-05,
"loss": 0.0508,
"step": 5580
},
{
"epoch": 5.756951596292482,
"grad_norm": 0.3674274682998657,
"learning_rate": 9.1132025901043e-05,
"loss": 0.0588,
"step": 5590
},
{
"epoch": 5.76725025746653,
"grad_norm": 0.535066545009613,
"learning_rate": 9.109281603963342e-05,
"loss": 0.0496,
"step": 5600
},
{
"epoch": 5.777548918640576,
"grad_norm": 0.44427838921546936,
"learning_rate": 9.105352816003098e-05,
"loss": 0.055,
"step": 5610
},
{
"epoch": 5.787847579814624,
"grad_norm": 0.3270188570022583,
"learning_rate": 9.101416233682701e-05,
"loss": 0.0543,
"step": 5620
},
{
"epoch": 5.798146240988672,
"grad_norm": 0.43694767355918884,
"learning_rate": 9.097471864476089e-05,
"loss": 0.0464,
"step": 5630
},
{
"epoch": 5.8084449021627185,
"grad_norm": 0.3801634609699249,
"learning_rate": 9.093519715871979e-05,
"loss": 0.0536,
"step": 5640
},
{
"epoch": 5.818743563336766,
"grad_norm": 0.3844929337501526,
"learning_rate": 9.089559795373862e-05,
"loss": 0.0543,
"step": 5650
},
{
"epoch": 5.829042224510814,
"grad_norm": 0.4263947904109955,
"learning_rate": 9.08559211049998e-05,
"loss": 0.0475,
"step": 5660
},
{
"epoch": 5.8393408856848605,
"grad_norm": 0.39681196212768555,
"learning_rate": 9.081616668783322e-05,
"loss": 0.0551,
"step": 5670
},
{
"epoch": 5.849639546858908,
"grad_norm": 0.33459895849227905,
"learning_rate": 9.077633477771599e-05,
"loss": 0.0448,
"step": 5680
},
{
"epoch": 5.859938208032956,
"grad_norm": 0.30157431960105896,
"learning_rate": 9.073642545027236e-05,
"loss": 0.0519,
"step": 5690
},
{
"epoch": 5.8702368692070035,
"grad_norm": 0.2790994644165039,
"learning_rate": 9.069643878127359e-05,
"loss": 0.0517,
"step": 5700
},
{
"epoch": 5.88053553038105,
"grad_norm": 0.4210405945777893,
"learning_rate": 9.065637484663773e-05,
"loss": 0.0444,
"step": 5710
},
{
"epoch": 5.890834191555098,
"grad_norm": 0.3264312148094177,
"learning_rate": 9.06162337224296e-05,
"loss": 0.0481,
"step": 5720
},
{
"epoch": 5.901132852729146,
"grad_norm": 0.32913386821746826,
"learning_rate": 9.057601548486047e-05,
"loss": 0.0446,
"step": 5730
},
{
"epoch": 5.911431513903192,
"grad_norm": 0.25155073404312134,
"learning_rate": 9.053572021028812e-05,
"loss": 0.0436,
"step": 5740
},
{
"epoch": 5.92173017507724,
"grad_norm": 0.25006964802742004,
"learning_rate": 9.04953479752165e-05,
"loss": 0.0479,
"step": 5750
},
{
"epoch": 5.932028836251288,
"grad_norm": 0.3676224648952484,
"learning_rate": 9.045489885629576e-05,
"loss": 0.0441,
"step": 5760
},
{
"epoch": 5.942327497425334,
"grad_norm": 0.22950799763202667,
"learning_rate": 9.041437293032195e-05,
"loss": 0.0459,
"step": 5770
},
{
"epoch": 5.952626158599382,
"grad_norm": 0.26980340480804443,
"learning_rate": 9.0373770274237e-05,
"loss": 0.0494,
"step": 5780
},
{
"epoch": 5.96292481977343,
"grad_norm": 0.4180351495742798,
"learning_rate": 9.033309096512846e-05,
"loss": 0.0513,
"step": 5790
},
{
"epoch": 5.9732234809474765,
"grad_norm": 0.39388883113861084,
"learning_rate": 9.029233508022947e-05,
"loss": 0.0522,
"step": 5800
},
{
"epoch": 5.983522142121524,
"grad_norm": 0.33429384231567383,
"learning_rate": 9.025150269691852e-05,
"loss": 0.0438,
"step": 5810
},
{
"epoch": 5.993820803295572,
"grad_norm": 0.2864847183227539,
"learning_rate": 9.021059389271935e-05,
"loss": 0.0488,
"step": 5820
},
{
"epoch": 6.0041194644696185,
"grad_norm": 0.2295219898223877,
"learning_rate": 9.01696087453008e-05,
"loss": 0.0477,
"step": 5830
},
{
"epoch": 6.014418125643666,
"grad_norm": 0.2532294690608978,
"learning_rate": 9.012854733247663e-05,
"loss": 0.0456,
"step": 5840
},
{
"epoch": 6.024716786817714,
"grad_norm": 0.37234383821487427,
"learning_rate": 9.008740973220542e-05,
"loss": 0.0444,
"step": 5850
},
{
"epoch": 6.0350154479917615,
"grad_norm": 0.26388946175575256,
"learning_rate": 9.004619602259038e-05,
"loss": 0.0498,
"step": 5860
},
{
"epoch": 6.045314109165808,
"grad_norm": 0.2828536629676819,
"learning_rate": 9.000490628187926e-05,
"loss": 0.044,
"step": 5870
},
{
"epoch": 6.055612770339856,
"grad_norm": 0.25269097089767456,
"learning_rate": 8.996354058846408e-05,
"loss": 0.0491,
"step": 5880
},
{
"epoch": 6.0659114315139036,
"grad_norm": 0.37286481261253357,
"learning_rate": 8.992209902088116e-05,
"loss": 0.0548,
"step": 5890
},
{
"epoch": 6.07621009268795,
"grad_norm": 0.34140411019325256,
"learning_rate": 8.988058165781081e-05,
"loss": 0.0444,
"step": 5900
},
{
"epoch": 6.086508753861998,
"grad_norm": 0.40900543332099915,
"learning_rate": 8.983898857807726e-05,
"loss": 0.0488,
"step": 5910
},
{
"epoch": 6.096807415036046,
"grad_norm": 0.27773845195770264,
"learning_rate": 8.979731986064849e-05,
"loss": 0.0424,
"step": 5920
},
{
"epoch": 6.107106076210092,
"grad_norm": 0.35663488507270813,
"learning_rate": 8.97555755846361e-05,
"loss": 0.0436,
"step": 5930
},
{
"epoch": 6.11740473738414,
"grad_norm": 0.5180099010467529,
"learning_rate": 8.971375582929513e-05,
"loss": 0.0517,
"step": 5940
},
{
"epoch": 6.127703398558188,
"grad_norm": 0.41418635845184326,
"learning_rate": 8.967186067402393e-05,
"loss": 0.0422,
"step": 5950
},
{
"epoch": 6.1380020597322344,
"grad_norm": 0.38915103673934937,
"learning_rate": 8.9629890198364e-05,
"loss": 0.042,
"step": 5960
},
{
"epoch": 6.148300720906282,
"grad_norm": 0.7760641574859619,
"learning_rate": 8.958784448199987e-05,
"loss": 0.0465,
"step": 5970
},
{
"epoch": 6.15859938208033,
"grad_norm": 0.31317341327667236,
"learning_rate": 8.954572360475886e-05,
"loss": 0.0508,
"step": 5980
},
{
"epoch": 6.1688980432543765,
"grad_norm": 0.5295110940933228,
"learning_rate": 8.950352764661103e-05,
"loss": 0.0459,
"step": 5990
},
{
"epoch": 6.179196704428424,
"grad_norm": 0.3669707477092743,
"learning_rate": 8.946125668766898e-05,
"loss": 0.0495,
"step": 6000
},
{
"epoch": 6.189495365602472,
"grad_norm": 0.3288549780845642,
"learning_rate": 8.941891080818773e-05,
"loss": 0.0463,
"step": 6010
},
{
"epoch": 6.1997940267765195,
"grad_norm": 0.37521734833717346,
"learning_rate": 8.93764900885645e-05,
"loss": 0.0432,
"step": 6020
},
{
"epoch": 6.210092687950566,
"grad_norm": 0.40569764375686646,
"learning_rate": 8.933399460933862e-05,
"loss": 0.0442,
"step": 6030
},
{
"epoch": 6.220391349124614,
"grad_norm": 0.43062832951545715,
"learning_rate": 8.929142445119137e-05,
"loss": 0.0424,
"step": 6040
},
{
"epoch": 6.2306900102986615,
"grad_norm": 0.25243639945983887,
"learning_rate": 8.924877969494578e-05,
"loss": 0.0431,
"step": 6050
},
{
"epoch": 6.240988671472708,
"grad_norm": 0.33310917019844055,
"learning_rate": 8.920606042156657e-05,
"loss": 0.0436,
"step": 6060
},
{
"epoch": 6.251287332646756,
"grad_norm": 0.3819611668586731,
"learning_rate": 8.916326671215987e-05,
"loss": 0.0416,
"step": 6070
},
{
"epoch": 6.261585993820804,
"grad_norm": 0.38392260670661926,
"learning_rate": 8.912039864797317e-05,
"loss": 0.0491,
"step": 6080
},
{
"epoch": 6.27188465499485,
"grad_norm": 0.29428112506866455,
"learning_rate": 8.907745631039511e-05,
"loss": 0.045,
"step": 6090
},
{
"epoch": 6.282183316168898,
"grad_norm": 0.301840215921402,
"learning_rate": 8.903443978095537e-05,
"loss": 0.0506,
"step": 6100
},
{
"epoch": 6.292481977342946,
"grad_norm": 0.33956289291381836,
"learning_rate": 8.89913491413245e-05,
"loss": 0.0423,
"step": 6110
},
{
"epoch": 6.302780638516992,
"grad_norm": 0.2992474138736725,
"learning_rate": 8.894818447331368e-05,
"loss": 0.0503,
"step": 6120
},
{
"epoch": 6.31307929969104,
"grad_norm": 0.2867885231971741,
"learning_rate": 8.890494585887472e-05,
"loss": 0.05,
"step": 6130
},
{
"epoch": 6.323377960865088,
"grad_norm": 0.40204593539237976,
"learning_rate": 8.886163338009978e-05,
"loss": 0.048,
"step": 6140
},
{
"epoch": 6.3336766220391345,
"grad_norm": 0.3581812083721161,
"learning_rate": 8.881824711922129e-05,
"loss": 0.0488,
"step": 6150
},
{
"epoch": 6.343975283213182,
"grad_norm": 0.30317041277885437,
"learning_rate": 8.877478715861173e-05,
"loss": 0.0467,
"step": 6160
},
{
"epoch": 6.35427394438723,
"grad_norm": 0.4209807515144348,
"learning_rate": 8.873125358078352e-05,
"loss": 0.0527,
"step": 6170
},
{
"epoch": 6.364572605561277,
"grad_norm": 0.2900068461894989,
"learning_rate": 8.868764646838883e-05,
"loss": 0.0461,
"step": 6180
},
{
"epoch": 6.374871266735324,
"grad_norm": 0.30829277634620667,
"learning_rate": 8.864396590421947e-05,
"loss": 0.0449,
"step": 6190
},
{
"epoch": 6.385169927909372,
"grad_norm": 0.20672224462032318,
"learning_rate": 8.86002119712067e-05,
"loss": 0.0412,
"step": 6200
},
{
"epoch": 6.3954685890834195,
"grad_norm": 0.2930947542190552,
"learning_rate": 8.855638475242107e-05,
"loss": 0.0424,
"step": 6210
},
{
"epoch": 6.405767250257466,
"grad_norm": 0.3625617027282715,
"learning_rate": 8.851248433107225e-05,
"loss": 0.0427,
"step": 6220
},
{
"epoch": 6.416065911431514,
"grad_norm": 0.31254348158836365,
"learning_rate": 8.846851079050892e-05,
"loss": 0.044,
"step": 6230
},
{
"epoch": 6.426364572605562,
"grad_norm": 0.388361394405365,
"learning_rate": 8.842446421421857e-05,
"loss": 0.0386,
"step": 6240
},
{
"epoch": 6.436663233779608,
"grad_norm": 0.37418097257614136,
"learning_rate": 8.838034468582737e-05,
"loss": 0.0519,
"step": 6250
},
{
"epoch": 6.446961894953656,
"grad_norm": 0.42668136954307556,
"learning_rate": 8.833615228909995e-05,
"loss": 0.0533,
"step": 6260
},
{
"epoch": 6.457260556127704,
"grad_norm": 0.4811553657054901,
"learning_rate": 8.829188710793937e-05,
"loss": 0.0478,
"step": 6270
},
{
"epoch": 6.46755921730175,
"grad_norm": 0.30195683240890503,
"learning_rate": 8.824754922638677e-05,
"loss": 0.046,
"step": 6280
},
{
"epoch": 6.477857878475798,
"grad_norm": 0.3815639913082123,
"learning_rate": 8.82031387286214e-05,
"loss": 0.0511,
"step": 6290
},
{
"epoch": 6.488156539649846,
"grad_norm": 0.4513855278491974,
"learning_rate": 8.815865569896038e-05,
"loss": 0.0487,
"step": 6300
},
{
"epoch": 6.4984552008238925,
"grad_norm": 0.3365342915058136,
"learning_rate": 8.811410022185847e-05,
"loss": 0.0444,
"step": 6310
},
{
"epoch": 6.50875386199794,
"grad_norm": 0.3500465154647827,
"learning_rate": 8.806947238190803e-05,
"loss": 0.0432,
"step": 6320
},
{
"epoch": 6.519052523171988,
"grad_norm": 0.4398089349269867,
"learning_rate": 8.802477226383881e-05,
"loss": 0.042,
"step": 6330
},
{
"epoch": 6.5293511843460355,
"grad_norm": 0.318387895822525,
"learning_rate": 8.797999995251777e-05,
"loss": 0.0483,
"step": 6340
},
{
"epoch": 6.539649845520082,
"grad_norm": 0.32100003957748413,
"learning_rate": 8.793515553294891e-05,
"loss": 0.0468,
"step": 6350
},
{
"epoch": 6.54994850669413,
"grad_norm": 0.292423814535141,
"learning_rate": 8.789023909027319e-05,
"loss": 0.0489,
"step": 6360
},
{
"epoch": 6.5602471678681775,
"grad_norm": 0.31183555722236633,
"learning_rate": 8.784525070976825e-05,
"loss": 0.0456,
"step": 6370
},
{
"epoch": 6.570545829042224,
"grad_norm": 0.25083911418914795,
"learning_rate": 8.780019047684837e-05,
"loss": 0.0524,
"step": 6380
},
{
"epoch": 6.580844490216272,
"grad_norm": 0.4049484133720398,
"learning_rate": 8.775505847706422e-05,
"loss": 0.0484,
"step": 6390
},
{
"epoch": 6.59114315139032,
"grad_norm": 0.46989813446998596,
"learning_rate": 8.770985479610273e-05,
"loss": 0.0409,
"step": 6400
},
{
"epoch": 6.601441812564366,
"grad_norm": 0.301536500453949,
"learning_rate": 8.766457951978687e-05,
"loss": 0.0435,
"step": 6410
},
{
"epoch": 6.611740473738414,
"grad_norm": 0.3885561227798462,
"learning_rate": 8.761923273407564e-05,
"loss": 0.0417,
"step": 6420
},
{
"epoch": 6.622039134912462,
"grad_norm": 0.2474498450756073,
"learning_rate": 8.757381452506374e-05,
"loss": 0.0439,
"step": 6430
},
{
"epoch": 6.632337796086508,
"grad_norm": 0.6527761816978455,
"learning_rate": 8.752832497898148e-05,
"loss": 0.0491,
"step": 6440
},
{
"epoch": 6.642636457260556,
"grad_norm": 0.4527409076690674,
"learning_rate": 8.748276418219463e-05,
"loss": 0.0491,
"step": 6450
},
{
"epoch": 6.652935118434604,
"grad_norm": 0.433944433927536,
"learning_rate": 8.743713222120421e-05,
"loss": 0.043,
"step": 6460
},
{
"epoch": 6.663233779608651,
"grad_norm": 0.5092076063156128,
"learning_rate": 8.739142918264638e-05,
"loss": 0.0487,
"step": 6470
},
{
"epoch": 6.673532440782698,
"grad_norm": 0.3467596471309662,
"learning_rate": 8.734565515329221e-05,
"loss": 0.0404,
"step": 6480
},
{
"epoch": 6.683831101956746,
"grad_norm": 0.23407290875911713,
"learning_rate": 8.729981022004758e-05,
"loss": 0.0446,
"step": 6490
},
{
"epoch": 6.6941297631307926,
"grad_norm": 0.33283254504203796,
"learning_rate": 8.7253894469953e-05,
"loss": 0.0441,
"step": 6500
},
{
"epoch": 6.70442842430484,
"grad_norm": 0.30493679642677307,
"learning_rate": 8.720790799018337e-05,
"loss": 0.0457,
"step": 6510
},
{
"epoch": 6.714727085478888,
"grad_norm": 0.4004342257976532,
"learning_rate": 8.716185086804798e-05,
"loss": 0.0448,
"step": 6520
},
{
"epoch": 6.7250257466529355,
"grad_norm": 0.5191412568092346,
"learning_rate": 8.71157231909901e-05,
"loss": 0.0528,
"step": 6530
},
{
"epoch": 6.735324407826982,
"grad_norm": 0.6779634356498718,
"learning_rate": 8.706952504658712e-05,
"loss": 0.0496,
"step": 6540
},
{
"epoch": 6.74562306900103,
"grad_norm": 0.3031466007232666,
"learning_rate": 8.702325652255005e-05,
"loss": 0.0473,
"step": 6550
},
{
"epoch": 6.755921730175078,
"grad_norm": 0.41129881143569946,
"learning_rate": 8.697691770672366e-05,
"loss": 0.0468,
"step": 6560
},
{
"epoch": 6.766220391349124,
"grad_norm": 0.278126984834671,
"learning_rate": 8.69305086870861e-05,
"loss": 0.0439,
"step": 6570
},
{
"epoch": 6.776519052523172,
"grad_norm": 0.3106745779514313,
"learning_rate": 8.688402955174881e-05,
"loss": 0.0487,
"step": 6580
},
{
"epoch": 6.78681771369722,
"grad_norm": 0.4737553596496582,
"learning_rate": 8.683748038895639e-05,
"loss": 0.0512,
"step": 6590
},
{
"epoch": 6.797116374871266,
"grad_norm": 0.3413580358028412,
"learning_rate": 8.679086128708636e-05,
"loss": 0.0485,
"step": 6600
},
{
"epoch": 6.807415036045314,
"grad_norm": 0.2869110703468323,
"learning_rate": 8.674417233464903e-05,
"loss": 0.0397,
"step": 6610
},
{
"epoch": 6.817713697219362,
"grad_norm": 0.295481413602829,
"learning_rate": 8.669741362028734e-05,
"loss": 0.0422,
"step": 6620
},
{
"epoch": 6.8280123583934085,
"grad_norm": 0.453259140253067,
"learning_rate": 8.665058523277667e-05,
"loss": 0.0456,
"step": 6630
},
{
"epoch": 6.838311019567456,
"grad_norm": 0.3729632794857025,
"learning_rate": 8.660368726102469e-05,
"loss": 0.0483,
"step": 6640
},
{
"epoch": 6.848609680741504,
"grad_norm": 0.40101921558380127,
"learning_rate": 8.655671979407112e-05,
"loss": 0.0462,
"step": 6650
},
{
"epoch": 6.858908341915551,
"grad_norm": 0.27768710255622864,
"learning_rate": 8.650968292108774e-05,
"loss": 0.0425,
"step": 6660
},
{
"epoch": 6.869207003089598,
"grad_norm": 0.24457606673240662,
"learning_rate": 8.646257673137803e-05,
"loss": 0.044,
"step": 6670
},
{
"epoch": 6.879505664263646,
"grad_norm": 0.49996355175971985,
"learning_rate": 8.641540131437705e-05,
"loss": 0.0506,
"step": 6680
},
{
"epoch": 6.889804325437693,
"grad_norm": 0.46634727716445923,
"learning_rate": 8.636815675965136e-05,
"loss": 0.0492,
"step": 6690
},
{
"epoch": 6.90010298661174,
"grad_norm": 0.3688158690929413,
"learning_rate": 8.632084315689872e-05,
"loss": 0.045,
"step": 6700
},
{
"epoch": 6.910401647785788,
"grad_norm": 0.5695337653160095,
"learning_rate": 8.627346059594802e-05,
"loss": 0.0427,
"step": 6710
},
{
"epoch": 6.920700308959836,
"grad_norm": 0.2298751175403595,
"learning_rate": 8.622600916675909e-05,
"loss": 0.0524,
"step": 6720
},
{
"epoch": 6.930998970133882,
"grad_norm": 0.33885377645492554,
"learning_rate": 8.617848895942247e-05,
"loss": 0.0436,
"step": 6730
},
{
"epoch": 6.94129763130793,
"grad_norm": 0.3807486295700073,
"learning_rate": 8.61309000641593e-05,
"loss": 0.0446,
"step": 6740
},
{
"epoch": 6.951596292481978,
"grad_norm": 0.37897372245788574,
"learning_rate": 8.608324257132114e-05,
"loss": 0.0411,
"step": 6750
},
{
"epoch": 6.961894953656024,
"grad_norm": 0.4584294855594635,
"learning_rate": 8.603551657138975e-05,
"loss": 0.0417,
"step": 6760
},
{
"epoch": 6.972193614830072,
"grad_norm": 0.31342461705207825,
"learning_rate": 8.598772215497703e-05,
"loss": 0.0411,
"step": 6770
},
{
"epoch": 6.98249227600412,
"grad_norm": 0.3062428832054138,
"learning_rate": 8.593985941282468e-05,
"loss": 0.0405,
"step": 6780
},
{
"epoch": 6.9927909371781665,
"grad_norm": 0.40509548783302307,
"learning_rate": 8.58919284358042e-05,
"loss": 0.0424,
"step": 6790
},
{
"epoch": 7.003089598352214,
"grad_norm": 0.4782925248146057,
"learning_rate": 8.584392931491662e-05,
"loss": 0.0513,
"step": 6800
},
{
"epoch": 7.013388259526262,
"grad_norm": 0.5316755771636963,
"learning_rate": 8.579586214129232e-05,
"loss": 0.0432,
"step": 6810
},
{
"epoch": 7.0236869207003085,
"grad_norm": 0.3078876733779907,
"learning_rate": 8.574772700619089e-05,
"loss": 0.0467,
"step": 6820
},
{
"epoch": 7.033985581874356,
"grad_norm": 0.3792472183704376,
"learning_rate": 8.569952400100099e-05,
"loss": 0.0396,
"step": 6830
},
{
"epoch": 7.044284243048404,
"grad_norm": 0.3639551103115082,
"learning_rate": 8.565125321724009e-05,
"loss": 0.0455,
"step": 6840
},
{
"epoch": 7.0545829042224515,
"grad_norm": 0.27368006110191345,
"learning_rate": 8.560291474655438e-05,
"loss": 0.0409,
"step": 6850
},
{
"epoch": 7.064881565396498,
"grad_norm": 0.28575336933135986,
"learning_rate": 8.555450868071852e-05,
"loss": 0.0432,
"step": 6860
},
{
"epoch": 7.075180226570546,
"grad_norm": 0.29869362711906433,
"learning_rate": 8.550603511163554e-05,
"loss": 0.0488,
"step": 6870
},
{
"epoch": 7.085478887744594,
"grad_norm": 0.4075066149234772,
"learning_rate": 8.545749413133662e-05,
"loss": 0.0529,
"step": 6880
},
{
"epoch": 7.09577754891864,
"grad_norm": 1.2341939210891724,
"learning_rate": 8.540888583198092e-05,
"loss": 0.0413,
"step": 6890
},
{
"epoch": 7.106076210092688,
"grad_norm": 0.4124568998813629,
"learning_rate": 8.536021030585541e-05,
"loss": 0.0421,
"step": 6900
},
{
"epoch": 7.116374871266736,
"grad_norm": 0.30242303013801575,
"learning_rate": 8.53114676453747e-05,
"loss": 0.0472,
"step": 6910
},
{
"epoch": 7.126673532440782,
"grad_norm": 0.48593899607658386,
"learning_rate": 8.526265794308089e-05,
"loss": 0.0512,
"step": 6920
},
{
"epoch": 7.13697219361483,
"grad_norm": 0.3725290894508362,
"learning_rate": 8.52137812916433e-05,
"loss": 0.0485,
"step": 6930
},
{
"epoch": 7.147270854788878,
"grad_norm": 0.38606998324394226,
"learning_rate": 8.51648377838584e-05,
"loss": 0.043,
"step": 6940
},
{
"epoch": 7.1575695159629245,
"grad_norm": 0.49902254343032837,
"learning_rate": 8.511582751264959e-05,
"loss": 0.0405,
"step": 6950
},
{
"epoch": 7.167868177136972,
"grad_norm": 0.46539393067359924,
"learning_rate": 8.506675057106704e-05,
"loss": 0.0458,
"step": 6960
},
{
"epoch": 7.17816683831102,
"grad_norm": 0.2839367687702179,
"learning_rate": 8.501760705228746e-05,
"loss": 0.0444,
"step": 6970
},
{
"epoch": 7.1884654994850665,
"grad_norm": 0.3735373914241791,
"learning_rate": 8.496839704961398e-05,
"loss": 0.0549,
"step": 6980
},
{
"epoch": 7.198764160659114,
"grad_norm": 0.28953662514686584,
"learning_rate": 8.491912065647599e-05,
"loss": 0.041,
"step": 6990
},
{
"epoch": 7.209062821833162,
"grad_norm": 0.3638463318347931,
"learning_rate": 8.486977796642886e-05,
"loss": 0.0443,
"step": 7000
},
{
"epoch": 7.2193614830072095,
"grad_norm": 0.35548821091651917,
"learning_rate": 8.482036907315388e-05,
"loss": 0.0399,
"step": 7010
},
{
"epoch": 7.229660144181256,
"grad_norm": 0.3023133873939514,
"learning_rate": 8.477089407045803e-05,
"loss": 0.0456,
"step": 7020
},
{
"epoch": 7.239958805355304,
"grad_norm": 0.32346412539482117,
"learning_rate": 8.47213530522738e-05,
"loss": 0.0456,
"step": 7030
},
{
"epoch": 7.2502574665293515,
"grad_norm": 0.30193185806274414,
"learning_rate": 8.467174611265898e-05,
"loss": 0.0436,
"step": 7040
},
{
"epoch": 7.260556127703398,
"grad_norm": 0.3131241798400879,
"learning_rate": 8.462207334579658e-05,
"loss": 0.0493,
"step": 7050
},
{
"epoch": 7.270854788877446,
"grad_norm": 0.38617610931396484,
"learning_rate": 8.457233484599454e-05,
"loss": 0.0389,
"step": 7060
},
{
"epoch": 7.281153450051494,
"grad_norm": 0.34710705280303955,
"learning_rate": 8.452253070768562e-05,
"loss": 0.0548,
"step": 7070
},
{
"epoch": 7.29145211122554,
"grad_norm": 0.4385312795639038,
"learning_rate": 8.447266102542719e-05,
"loss": 0.0414,
"step": 7080
},
{
"epoch": 7.301750772399588,
"grad_norm": 0.33167046308517456,
"learning_rate": 8.442272589390107e-05,
"loss": 0.0436,
"step": 7090
},
{
"epoch": 7.312049433573636,
"grad_norm": 0.3587557375431061,
"learning_rate": 8.437272540791337e-05,
"loss": 0.0491,
"step": 7100
},
{
"epoch": 7.3223480947476824,
"grad_norm": 0.4869999587535858,
"learning_rate": 8.432265966239419e-05,
"loss": 0.0412,
"step": 7110
},
{
"epoch": 7.33264675592173,
"grad_norm": 0.3566068410873413,
"learning_rate": 8.427252875239764e-05,
"loss": 0.0395,
"step": 7120
},
{
"epoch": 7.342945417095778,
"grad_norm": 0.27183136343955994,
"learning_rate": 8.422233277310145e-05,
"loss": 0.0458,
"step": 7130
},
{
"epoch": 7.3532440782698245,
"grad_norm": 0.6837592124938965,
"learning_rate": 8.4172071819807e-05,
"loss": 0.0486,
"step": 7140
},
{
"epoch": 7.363542739443872,
"grad_norm": 0.3837999403476715,
"learning_rate": 8.412174598793893e-05,
"loss": 0.0517,
"step": 7150
},
{
"epoch": 7.37384140061792,
"grad_norm": 0.3554346561431885,
"learning_rate": 8.40713553730451e-05,
"loss": 0.0458,
"step": 7160
},
{
"epoch": 7.3841400617919675,
"grad_norm": 0.43407344818115234,
"learning_rate": 8.402090007079636e-05,
"loss": 0.0513,
"step": 7170
},
{
"epoch": 7.394438722966014,
"grad_norm": 0.3473135232925415,
"learning_rate": 8.397038017698638e-05,
"loss": 0.0459,
"step": 7180
},
{
"epoch": 7.404737384140062,
"grad_norm": 0.3611428439617157,
"learning_rate": 8.391979578753146e-05,
"loss": 0.0493,
"step": 7190
},
{
"epoch": 7.4150360453141095,
"grad_norm": 0.43605300784111023,
"learning_rate": 8.386914699847037e-05,
"loss": 0.0441,
"step": 7200
},
{
"epoch": 7.425334706488156,
"grad_norm": 0.4645574688911438,
"learning_rate": 8.381843390596409e-05,
"loss": 0.0411,
"step": 7210
},
{
"epoch": 7.435633367662204,
"grad_norm": 0.40116310119628906,
"learning_rate": 8.376765660629574e-05,
"loss": 0.0424,
"step": 7220
},
{
"epoch": 7.445932028836252,
"grad_norm": 0.31686821579933167,
"learning_rate": 8.371681519587033e-05,
"loss": 0.0438,
"step": 7230
},
{
"epoch": 7.456230690010298,
"grad_norm": 0.34404608607292175,
"learning_rate": 8.366590977121457e-05,
"loss": 0.0476,
"step": 7240
},
{
"epoch": 7.466529351184346,
"grad_norm": 0.4584087133407593,
"learning_rate": 8.361494042897675e-05,
"loss": 0.0403,
"step": 7250
},
{
"epoch": 7.476828012358394,
"grad_norm": 0.4275796413421631,
"learning_rate": 8.356390726592645e-05,
"loss": 0.0431,
"step": 7260
},
{
"epoch": 7.48712667353244,
"grad_norm": 0.5235409140586853,
"learning_rate": 8.351281037895448e-05,
"loss": 0.0486,
"step": 7270
},
{
"epoch": 7.497425334706488,
"grad_norm": 0.4118786156177521,
"learning_rate": 8.346164986507258e-05,
"loss": 0.0438,
"step": 7280
},
{
"epoch": 7.507723995880536,
"grad_norm": 0.3873061239719391,
"learning_rate": 8.341042582141336e-05,
"loss": 0.0501,
"step": 7290
},
{
"epoch": 7.518022657054583,
"grad_norm": 0.2781210243701935,
"learning_rate": 8.335913834522999e-05,
"loss": 0.0496,
"step": 7300
},
{
"epoch": 7.52832131822863,
"grad_norm": 0.39076125621795654,
"learning_rate": 8.330778753389608e-05,
"loss": 0.0518,
"step": 7310
},
{
"epoch": 7.538619979402678,
"grad_norm": 0.24123363196849823,
"learning_rate": 8.32563734849055e-05,
"loss": 0.0419,
"step": 7320
},
{
"epoch": 7.548918640576725,
"grad_norm": 0.510017991065979,
"learning_rate": 8.320489629587218e-05,
"loss": 0.049,
"step": 7330
},
{
"epoch": 7.559217301750772,
"grad_norm": 0.5126045942306519,
"learning_rate": 8.315335606452992e-05,
"loss": 0.0433,
"step": 7340
},
{
"epoch": 7.56951596292482,
"grad_norm": 0.26801052689552307,
"learning_rate": 8.310175288873222e-05,
"loss": 0.043,
"step": 7350
},
{
"epoch": 7.5798146240988675,
"grad_norm": 0.27340441942214966,
"learning_rate": 8.305008686645209e-05,
"loss": 0.0453,
"step": 7360
},
{
"epoch": 7.590113285272914,
"grad_norm": 0.33311620354652405,
"learning_rate": 8.299835809578183e-05,
"loss": 0.046,
"step": 7370
},
{
"epoch": 7.600411946446962,
"grad_norm": 0.3186795115470886,
"learning_rate": 8.294656667493292e-05,
"loss": 0.0466,
"step": 7380
},
{
"epoch": 7.61071060762101,
"grad_norm": 0.418103963136673,
"learning_rate": 8.289471270223575e-05,
"loss": 0.0496,
"step": 7390
},
{
"epoch": 7.621009268795056,
"grad_norm": 0.36763474345207214,
"learning_rate": 8.284279627613948e-05,
"loss": 0.0466,
"step": 7400
},
{
"epoch": 7.631307929969104,
"grad_norm": 0.555157482624054,
"learning_rate": 8.279081749521182e-05,
"loss": 0.0438,
"step": 7410
},
{
"epoch": 7.641606591143152,
"grad_norm": 0.3913489282131195,
"learning_rate": 8.273877645813893e-05,
"loss": 0.0419,
"step": 7420
},
{
"epoch": 7.651905252317198,
"grad_norm": 0.22710680961608887,
"learning_rate": 8.26866732637251e-05,
"loss": 0.0399,
"step": 7430
},
{
"epoch": 7.662203913491246,
"grad_norm": 0.6678986549377441,
"learning_rate": 8.263450801089268e-05,
"loss": 0.0454,
"step": 7440
},
{
"epoch": 7.672502574665294,
"grad_norm": 0.3249244689941406,
"learning_rate": 8.25822807986818e-05,
"loss": 0.0439,
"step": 7450
},
{
"epoch": 7.6828012358393405,
"grad_norm": 0.4379661977291107,
"learning_rate": 8.252999172625026e-05,
"loss": 0.0489,
"step": 7460
},
{
"epoch": 7.693099897013388,
"grad_norm": 0.3780250549316406,
"learning_rate": 8.24776408928733e-05,
"loss": 0.0498,
"step": 7470
},
{
"epoch": 7.703398558187436,
"grad_norm": 0.3127906620502472,
"learning_rate": 8.242522839794338e-05,
"loss": 0.0444,
"step": 7480
},
{
"epoch": 7.7136972193614834,
"grad_norm": 0.47873997688293457,
"learning_rate": 8.237275434097012e-05,
"loss": 0.0451,
"step": 7490
},
{
"epoch": 7.72399588053553,
"grad_norm": 0.3613678216934204,
"learning_rate": 8.23202188215799e-05,
"loss": 0.0453,
"step": 7500
},
{
"epoch": 7.734294541709578,
"grad_norm": 0.43183133006095886,
"learning_rate": 8.22676219395159e-05,
"loss": 0.0415,
"step": 7510
},
{
"epoch": 7.7445932028836255,
"grad_norm": 0.5181182622909546,
"learning_rate": 8.221496379463774e-05,
"loss": 0.0451,
"step": 7520
},
{
"epoch": 7.754891864057672,
"grad_norm": 0.3400101661682129,
"learning_rate": 8.216224448692138e-05,
"loss": 0.0508,
"step": 7530
},
{
"epoch": 7.76519052523172,
"grad_norm": 0.3759165108203888,
"learning_rate": 8.210946411645887e-05,
"loss": 0.046,
"step": 7540
},
{
"epoch": 7.775489186405768,
"grad_norm": 0.46893036365509033,
"learning_rate": 8.205662278345823e-05,
"loss": 0.0464,
"step": 7550
},
{
"epoch": 7.785787847579814,
"grad_norm": 0.3439854383468628,
"learning_rate": 8.200372058824322e-05,
"loss": 0.0432,
"step": 7560
},
{
"epoch": 7.796086508753862,
"grad_norm": 0.28945380449295044,
"learning_rate": 8.19507576312531e-05,
"loss": 0.043,
"step": 7570
},
{
"epoch": 7.80638516992791,
"grad_norm": 0.39605122804641724,
"learning_rate": 8.189773401304259e-05,
"loss": 0.041,
"step": 7580
},
{
"epoch": 7.816683831101956,
"grad_norm": 0.312443345785141,
"learning_rate": 8.184464983428146e-05,
"loss": 0.0495,
"step": 7590
},
{
"epoch": 7.826982492276004,
"grad_norm": 0.2684251368045807,
"learning_rate": 8.179150519575456e-05,
"loss": 0.0455,
"step": 7600
},
{
"epoch": 7.837281153450052,
"grad_norm": 0.3516882658004761,
"learning_rate": 8.17383001983615e-05,
"loss": 0.0421,
"step": 7610
},
{
"epoch": 7.8475798146240985,
"grad_norm": 0.2721065580844879,
"learning_rate": 8.168503494311644e-05,
"loss": 0.0465,
"step": 7620
},
{
"epoch": 7.857878475798146,
"grad_norm": 0.36698025465011597,
"learning_rate": 8.163170953114798e-05,
"loss": 0.0436,
"step": 7630
},
{
"epoch": 7.868177136972194,
"grad_norm": 0.25991520285606384,
"learning_rate": 8.157832406369897e-05,
"loss": 0.0485,
"step": 7640
},
{
"epoch": 7.8784757981462405,
"grad_norm": 0.42714834213256836,
"learning_rate": 8.15248786421262e-05,
"loss": 0.0505,
"step": 7650
},
{
"epoch": 7.888774459320288,
"grad_norm": 0.29556575417518616,
"learning_rate": 8.147137336790036e-05,
"loss": 0.045,
"step": 7660
},
{
"epoch": 7.899073120494336,
"grad_norm": 0.7319002747535706,
"learning_rate": 8.141780834260575e-05,
"loss": 0.0392,
"step": 7670
},
{
"epoch": 7.9093717816683835,
"grad_norm": 0.3782273530960083,
"learning_rate": 8.136418366794008e-05,
"loss": 0.041,
"step": 7680
},
{
"epoch": 7.91967044284243,
"grad_norm": 0.8856772184371948,
"learning_rate": 8.131049944571437e-05,
"loss": 0.0406,
"step": 7690
},
{
"epoch": 7.929969104016478,
"grad_norm": 0.3390977084636688,
"learning_rate": 8.125675577785264e-05,
"loss": 0.0381,
"step": 7700
},
{
"epoch": 7.940267765190526,
"grad_norm": 0.29981207847595215,
"learning_rate": 8.120295276639181e-05,
"loss": 0.0434,
"step": 7710
},
{
"epoch": 7.950566426364572,
"grad_norm": 0.3433596193790436,
"learning_rate": 8.114909051348144e-05,
"loss": 0.0504,
"step": 7720
},
{
"epoch": 7.96086508753862,
"grad_norm": 0.41486749053001404,
"learning_rate": 8.109516912138361e-05,
"loss": 0.0415,
"step": 7730
},
{
"epoch": 7.971163748712668,
"grad_norm": 0.36171993613243103,
"learning_rate": 8.104118869247263e-05,
"loss": 0.0452,
"step": 7740
},
{
"epoch": 7.981462409886714,
"grad_norm": 0.32786816358566284,
"learning_rate": 8.09871493292349e-05,
"loss": 0.0412,
"step": 7750
},
{
"epoch": 7.991761071060762,
"grad_norm": 0.3479085862636566,
"learning_rate": 8.09330511342688e-05,
"loss": 0.0453,
"step": 7760
},
{
"epoch": 8.002059732234809,
"grad_norm": 0.34164348244667053,
"learning_rate": 8.087889421028424e-05,
"loss": 0.0388,
"step": 7770
},
{
"epoch": 8.012358393408856,
"grad_norm": 0.29838696122169495,
"learning_rate": 8.082467866010279e-05,
"loss": 0.0484,
"step": 7780
},
{
"epoch": 8.022657054582904,
"grad_norm": 0.2911168932914734,
"learning_rate": 8.077040458665725e-05,
"loss": 0.0419,
"step": 7790
},
{
"epoch": 8.032955715756952,
"grad_norm": 0.4936712980270386,
"learning_rate": 8.071607209299157e-05,
"loss": 0.0439,
"step": 7800
},
{
"epoch": 8.043254376931,
"grad_norm": 0.3870621919631958,
"learning_rate": 8.066168128226057e-05,
"loss": 0.0479,
"step": 7810
},
{
"epoch": 8.053553038105047,
"grad_norm": 0.46932512521743774,
"learning_rate": 8.06072322577298e-05,
"loss": 0.0475,
"step": 7820
},
{
"epoch": 8.063851699279093,
"grad_norm": 0.3050336241722107,
"learning_rate": 8.055272512277537e-05,
"loss": 0.0484,
"step": 7830
},
{
"epoch": 8.07415036045314,
"grad_norm": 0.4158398509025574,
"learning_rate": 8.049815998088368e-05,
"loss": 0.0498,
"step": 7840
},
{
"epoch": 8.084449021627188,
"grad_norm": 0.25001806020736694,
"learning_rate": 8.044353693565127e-05,
"loss": 0.043,
"step": 7850
},
{
"epoch": 8.094747682801236,
"grad_norm": 0.35990580916404724,
"learning_rate": 8.038885609078464e-05,
"loss": 0.0402,
"step": 7860
},
{
"epoch": 8.105046343975284,
"grad_norm": 0.32985004782676697,
"learning_rate": 8.033411755009999e-05,
"loss": 0.0359,
"step": 7870
},
{
"epoch": 8.115345005149331,
"grad_norm": 0.3465685546398163,
"learning_rate": 8.027932141752309e-05,
"loss": 0.0466,
"step": 7880
},
{
"epoch": 8.125643666323377,
"grad_norm": 0.2139962762594223,
"learning_rate": 8.022446779708903e-05,
"loss": 0.0391,
"step": 7890
},
{
"epoch": 8.135942327497425,
"grad_norm": 0.3868210017681122,
"learning_rate": 8.016955679294206e-05,
"loss": 0.0445,
"step": 7900
},
{
"epoch": 8.146240988671472,
"grad_norm": 0.3728034496307373,
"learning_rate": 8.011458850933538e-05,
"loss": 0.0467,
"step": 7910
},
{
"epoch": 8.15653964984552,
"grad_norm": 0.31305524706840515,
"learning_rate": 8.005956305063091e-05,
"loss": 0.0422,
"step": 7920
},
{
"epoch": 8.166838311019568,
"grad_norm": 0.4065026044845581,
"learning_rate": 8.000448052129917e-05,
"loss": 0.0421,
"step": 7930
},
{
"epoch": 8.177136972193615,
"grad_norm": 0.29192665219306946,
"learning_rate": 7.9949341025919e-05,
"loss": 0.0509,
"step": 7940
},
{
"epoch": 8.187435633367663,
"grad_norm": 0.3657580316066742,
"learning_rate": 7.989414466917741e-05,
"loss": 0.0437,
"step": 7950
},
{
"epoch": 8.197734294541709,
"grad_norm": 0.36567407846450806,
"learning_rate": 7.983889155586935e-05,
"loss": 0.0511,
"step": 7960
},
{
"epoch": 8.208032955715757,
"grad_norm": 0.45859163999557495,
"learning_rate": 7.978358179089755e-05,
"loss": 0.0518,
"step": 7970
},
{
"epoch": 8.218331616889804,
"grad_norm": 0.4686458706855774,
"learning_rate": 7.972821547927228e-05,
"loss": 0.0512,
"step": 7980
},
{
"epoch": 8.228630278063852,
"grad_norm": 0.530068039894104,
"learning_rate": 7.967279272611115e-05,
"loss": 0.0504,
"step": 7990
},
{
"epoch": 8.2389289392379,
"grad_norm": 0.7595759034156799,
"learning_rate": 7.961731363663898e-05,
"loss": 0.0422,
"step": 8000
},
{
"epoch": 8.249227600411947,
"grad_norm": 0.317537397146225,
"learning_rate": 7.956177831618752e-05,
"loss": 0.0464,
"step": 8010
},
{
"epoch": 8.259526261585993,
"grad_norm": 0.3107547163963318,
"learning_rate": 7.950618687019527e-05,
"loss": 0.0427,
"step": 8020
},
{
"epoch": 8.26982492276004,
"grad_norm": 0.29626405239105225,
"learning_rate": 7.945053940420732e-05,
"loss": 0.0374,
"step": 8030
},
{
"epoch": 8.280123583934088,
"grad_norm": 0.30061104893684387,
"learning_rate": 7.939483602387508e-05,
"loss": 0.0391,
"step": 8040
},
{
"epoch": 8.290422245108136,
"grad_norm": 0.45963454246520996,
"learning_rate": 7.933907683495614e-05,
"loss": 0.0451,
"step": 8050
},
{
"epoch": 8.300720906282184,
"grad_norm": 0.36147454380989075,
"learning_rate": 7.928326194331404e-05,
"loss": 0.0428,
"step": 8060
},
{
"epoch": 8.311019567456231,
"grad_norm": 0.3963719606399536,
"learning_rate": 7.922739145491809e-05,
"loss": 0.0439,
"step": 8070
},
{
"epoch": 8.321318228630279,
"grad_norm": 0.3437096178531647,
"learning_rate": 7.917146547584314e-05,
"loss": 0.0475,
"step": 8080
},
{
"epoch": 8.331616889804325,
"grad_norm": 0.36308997869491577,
"learning_rate": 7.911548411226941e-05,
"loss": 0.0476,
"step": 8090
},
{
"epoch": 8.341915550978372,
"grad_norm": 0.37580838799476624,
"learning_rate": 7.905944747048225e-05,
"loss": 0.0409,
"step": 8100
},
{
"epoch": 8.35221421215242,
"grad_norm": 0.3824920058250427,
"learning_rate": 7.900335565687198e-05,
"loss": 0.042,
"step": 8110
},
{
"epoch": 8.362512873326468,
"grad_norm": 0.2326115220785141,
"learning_rate": 7.894720877793365e-05,
"loss": 0.0427,
"step": 8120
},
{
"epoch": 8.372811534500515,
"grad_norm": 0.4421122074127197,
"learning_rate": 7.88910069402669e-05,
"loss": 0.0482,
"step": 8130
},
{
"epoch": 8.383110195674563,
"grad_norm": 0.3101021647453308,
"learning_rate": 7.883475025057563e-05,
"loss": 0.043,
"step": 8140
},
{
"epoch": 8.393408856848609,
"grad_norm": 0.32335329055786133,
"learning_rate": 7.877843881566795e-05,
"loss": 0.0403,
"step": 8150
},
{
"epoch": 8.403707518022657,
"grad_norm": 0.20502477884292603,
"learning_rate": 7.872207274245592e-05,
"loss": 0.0467,
"step": 8160
},
{
"epoch": 8.414006179196704,
"grad_norm": 0.4081539809703827,
"learning_rate": 7.866565213795527e-05,
"loss": 0.0451,
"step": 8170
},
{
"epoch": 8.424304840370752,
"grad_norm": 0.24273471534252167,
"learning_rate": 7.860917710928532e-05,
"loss": 0.0453,
"step": 8180
},
{
"epoch": 8.4346035015448,
"grad_norm": 0.6472697854042053,
"learning_rate": 7.855264776366868e-05,
"loss": 0.0442,
"step": 8190
},
{
"epoch": 8.444902162718847,
"grad_norm": 0.38166165351867676,
"learning_rate": 7.849606420843111e-05,
"loss": 0.0446,
"step": 8200
},
{
"epoch": 8.455200823892893,
"grad_norm": 0.2978123128414154,
"learning_rate": 7.843942655100129e-05,
"loss": 0.0418,
"step": 8210
},
{
"epoch": 8.46549948506694,
"grad_norm": 0.731920063495636,
"learning_rate": 7.838273489891059e-05,
"loss": 0.0449,
"step": 8220
},
{
"epoch": 8.475798146240988,
"grad_norm": 0.30535033345222473,
"learning_rate": 7.832598935979294e-05,
"loss": 0.0427,
"step": 8230
},
{
"epoch": 8.486096807415036,
"grad_norm": 0.2867630422115326,
"learning_rate": 7.826919004138453e-05,
"loss": 0.043,
"step": 8240
},
{
"epoch": 8.496395468589084,
"grad_norm": 0.28375399112701416,
"learning_rate": 7.821233705152371e-05,
"loss": 0.0359,
"step": 8250
},
{
"epoch": 8.506694129763131,
"grad_norm": 0.272128164768219,
"learning_rate": 7.815543049815066e-05,
"loss": 0.0386,
"step": 8260
},
{
"epoch": 8.516992790937179,
"grad_norm": 0.28272774815559387,
"learning_rate": 7.809847048930734e-05,
"loss": 0.0537,
"step": 8270
},
{
"epoch": 8.527291452111225,
"grad_norm": 0.3891587257385254,
"learning_rate": 7.80414571331371e-05,
"loss": 0.0445,
"step": 8280
},
{
"epoch": 8.537590113285273,
"grad_norm": 0.28232893347740173,
"learning_rate": 7.798439053788467e-05,
"loss": 0.0431,
"step": 8290
},
{
"epoch": 8.54788877445932,
"grad_norm": 0.3467990756034851,
"learning_rate": 7.792727081189579e-05,
"loss": 0.0393,
"step": 8300
},
{
"epoch": 8.558187435633368,
"grad_norm": 0.2620207965373993,
"learning_rate": 7.78700980636171e-05,
"loss": 0.0424,
"step": 8310
},
{
"epoch": 8.568486096807415,
"grad_norm": 0.35534194111824036,
"learning_rate": 7.781287240159592e-05,
"loss": 0.0478,
"step": 8320
},
{
"epoch": 8.578784757981463,
"grad_norm": 0.3371882140636444,
"learning_rate": 7.775559393448002e-05,
"loss": 0.0441,
"step": 8330
},
{
"epoch": 8.589083419155509,
"grad_norm": 0.2300422340631485,
"learning_rate": 7.76982627710174e-05,
"loss": 0.0445,
"step": 8340
},
{
"epoch": 8.599382080329557,
"grad_norm": 0.3695335388183594,
"learning_rate": 7.764087902005616e-05,
"loss": 0.0461,
"step": 8350
},
{
"epoch": 8.609680741503604,
"grad_norm": 0.35191041231155396,
"learning_rate": 7.75834427905442e-05,
"loss": 0.0417,
"step": 8360
},
{
"epoch": 8.619979402677652,
"grad_norm": 0.354792982339859,
"learning_rate": 7.752595419152905e-05,
"loss": 0.0519,
"step": 8370
},
{
"epoch": 8.6302780638517,
"grad_norm": 0.306573748588562,
"learning_rate": 7.746841333215772e-05,
"loss": 0.0403,
"step": 8380
},
{
"epoch": 8.640576725025747,
"grad_norm": 0.5979099869728088,
"learning_rate": 7.741082032167641e-05,
"loss": 0.0372,
"step": 8390
},
{
"epoch": 8.650875386199793,
"grad_norm": 0.3066156208515167,
"learning_rate": 7.735317526943029e-05,
"loss": 0.0452,
"step": 8400
},
{
"epoch": 8.66117404737384,
"grad_norm": 0.5244849324226379,
"learning_rate": 7.729547828486339e-05,
"loss": 0.0397,
"step": 8410
},
{
"epoch": 8.671472708547888,
"grad_norm": 0.4708474576473236,
"learning_rate": 7.723772947751834e-05,
"loss": 0.043,
"step": 8420
},
{
"epoch": 8.681771369721936,
"grad_norm": 0.26460787653923035,
"learning_rate": 7.717992895703612e-05,
"loss": 0.0381,
"step": 8430
},
{
"epoch": 8.692070030895984,
"grad_norm": 0.35859358310699463,
"learning_rate": 7.712207683315594e-05,
"loss": 0.0394,
"step": 8440
},
{
"epoch": 8.702368692070031,
"grad_norm": 0.6054845452308655,
"learning_rate": 7.706417321571496e-05,
"loss": 0.0452,
"step": 8450
},
{
"epoch": 8.712667353244079,
"grad_norm": 0.34088876843452454,
"learning_rate": 7.700621821464807e-05,
"loss": 0.0392,
"step": 8460
},
{
"epoch": 8.722966014418125,
"grad_norm": 0.3852469027042389,
"learning_rate": 7.694821193998779e-05,
"loss": 0.0334,
"step": 8470
},
{
"epoch": 8.733264675592173,
"grad_norm": 0.5442830324172974,
"learning_rate": 7.68901545018639e-05,
"loss": 0.0399,
"step": 8480
},
{
"epoch": 8.74356333676622,
"grad_norm": 0.33397114276885986,
"learning_rate": 7.68320460105034e-05,
"loss": 0.0453,
"step": 8490
},
{
"epoch": 8.753861997940268,
"grad_norm": 0.47606992721557617,
"learning_rate": 7.677388657623019e-05,
"loss": 0.0435,
"step": 8500
},
{
"epoch": 8.764160659114316,
"grad_norm": 0.2601523697376251,
"learning_rate": 7.671567630946488e-05,
"loss": 0.0367,
"step": 8510
},
{
"epoch": 8.774459320288363,
"grad_norm": 0.3622424304485321,
"learning_rate": 7.665741532072457e-05,
"loss": 0.0506,
"step": 8520
},
{
"epoch": 8.784757981462409,
"grad_norm": 0.34801945090293884,
"learning_rate": 7.659910372062268e-05,
"loss": 0.0404,
"step": 8530
},
{
"epoch": 8.795056642636457,
"grad_norm": 0.8768806457519531,
"learning_rate": 7.654074161986877e-05,
"loss": 0.0353,
"step": 8540
},
{
"epoch": 8.805355303810504,
"grad_norm": 0.5520436763763428,
"learning_rate": 7.648232912926821e-05,
"loss": 0.0422,
"step": 8550
},
{
"epoch": 8.815653964984552,
"grad_norm": 0.24350865185260773,
"learning_rate": 7.642386635972202e-05,
"loss": 0.0447,
"step": 8560
},
{
"epoch": 8.8259526261586,
"grad_norm": 0.3099444806575775,
"learning_rate": 7.636535342222679e-05,
"loss": 0.0417,
"step": 8570
},
{
"epoch": 8.836251287332647,
"grad_norm": 0.36553552746772766,
"learning_rate": 7.630679042787425e-05,
"loss": 0.0393,
"step": 8580
},
{
"epoch": 8.846549948506695,
"grad_norm": 0.24580466747283936,
"learning_rate": 7.624817748785122e-05,
"loss": 0.0415,
"step": 8590
},
{
"epoch": 8.85684860968074,
"grad_norm": 0.30042266845703125,
"learning_rate": 7.618951471343931e-05,
"loss": 0.0473,
"step": 8600
},
{
"epoch": 8.867147270854788,
"grad_norm": 0.30034321546554565,
"learning_rate": 7.613080221601479e-05,
"loss": 0.0436,
"step": 8610
},
{
"epoch": 8.877445932028836,
"grad_norm": 0.3541191518306732,
"learning_rate": 7.607204010704831e-05,
"loss": 0.047,
"step": 8620
},
{
"epoch": 8.887744593202884,
"grad_norm": 0.20642833411693573,
"learning_rate": 7.60132284981047e-05,
"loss": 0.0438,
"step": 8630
},
{
"epoch": 8.898043254376931,
"grad_norm": 0.44705307483673096,
"learning_rate": 7.59543675008428e-05,
"loss": 0.0469,
"step": 8640
},
{
"epoch": 8.908341915550979,
"grad_norm": 0.3301944434642792,
"learning_rate": 7.589545722701519e-05,
"loss": 0.0402,
"step": 8650
},
{
"epoch": 8.918640576725025,
"grad_norm": 0.2832626402378082,
"learning_rate": 7.583649778846801e-05,
"loss": 0.0389,
"step": 8660
},
{
"epoch": 8.928939237899073,
"grad_norm": 0.22809769213199615,
"learning_rate": 7.577748929714074e-05,
"loss": 0.0421,
"step": 8670
},
{
"epoch": 8.93923789907312,
"grad_norm": 0.5481482148170471,
"learning_rate": 7.571843186506604e-05,
"loss": 0.0433,
"step": 8680
},
{
"epoch": 8.949536560247168,
"grad_norm": 0.28562408685684204,
"learning_rate": 7.565932560436942e-05,
"loss": 0.0424,
"step": 8690
},
{
"epoch": 8.959835221421216,
"grad_norm": 0.44328007102012634,
"learning_rate": 7.560017062726912e-05,
"loss": 0.0494,
"step": 8700
},
{
"epoch": 8.970133882595263,
"grad_norm": 0.2741154730319977,
"learning_rate": 7.554096704607589e-05,
"loss": 0.0439,
"step": 8710
},
{
"epoch": 8.98043254376931,
"grad_norm": 0.2939404547214508,
"learning_rate": 7.548171497319274e-05,
"loss": 0.0397,
"step": 8720
},
{
"epoch": 8.990731204943357,
"grad_norm": 0.2603434920310974,
"learning_rate": 7.542241452111476e-05,
"loss": 0.0429,
"step": 8730
},
{
"epoch": 9.001029866117404,
"grad_norm": 0.24690425395965576,
"learning_rate": 7.536306580242886e-05,
"loss": 0.0416,
"step": 8740
},
{
"epoch": 9.011328527291452,
"grad_norm": 0.3746093213558197,
"learning_rate": 7.530366892981362e-05,
"loss": 0.0436,
"step": 8750
},
{
"epoch": 9.0216271884655,
"grad_norm": 0.4268042743206024,
"learning_rate": 7.524422401603905e-05,
"loss": 0.0431,
"step": 8760
},
{
"epoch": 9.031925849639547,
"grad_norm": 0.35786107182502747,
"learning_rate": 7.518473117396633e-05,
"loss": 0.0404,
"step": 8770
},
{
"epoch": 9.042224510813595,
"grad_norm": 0.31200069189071655,
"learning_rate": 7.51251905165477e-05,
"loss": 0.0406,
"step": 8780
},
{
"epoch": 9.052523171987641,
"grad_norm": 0.33136019110679626,
"learning_rate": 7.506560215682608e-05,
"loss": 0.0484,
"step": 8790
},
{
"epoch": 9.062821833161689,
"grad_norm": 0.6744624972343445,
"learning_rate": 7.500596620793508e-05,
"loss": 0.0424,
"step": 8800
},
{
"epoch": 9.073120494335736,
"grad_norm": 0.5903081893920898,
"learning_rate": 7.494628278309858e-05,
"loss": 0.0497,
"step": 8810
},
{
"epoch": 9.083419155509784,
"grad_norm": 0.4158842861652374,
"learning_rate": 7.488655199563062e-05,
"loss": 0.0386,
"step": 8820
},
{
"epoch": 9.093717816683832,
"grad_norm": 0.2558102011680603,
"learning_rate": 7.482677395893515e-05,
"loss": 0.0477,
"step": 8830
},
{
"epoch": 9.10401647785788,
"grad_norm": 0.27110162377357483,
"learning_rate": 7.476694878650586e-05,
"loss": 0.0392,
"step": 8840
},
{
"epoch": 9.114315139031925,
"grad_norm": 0.3769000768661499,
"learning_rate": 7.470707659192588e-05,
"loss": 0.0456,
"step": 8850
},
{
"epoch": 9.124613800205973,
"grad_norm": 0.327959269285202,
"learning_rate": 7.464715748886766e-05,
"loss": 0.0428,
"step": 8860
},
{
"epoch": 9.13491246138002,
"grad_norm": 0.26664164662361145,
"learning_rate": 7.458719159109269e-05,
"loss": 0.0397,
"step": 8870
},
{
"epoch": 9.145211122554068,
"grad_norm": 0.4833793044090271,
"learning_rate": 7.452717901245132e-05,
"loss": 0.0406,
"step": 8880
},
{
"epoch": 9.155509783728116,
"grad_norm": 0.5883243680000305,
"learning_rate": 7.44671198668825e-05,
"loss": 0.0454,
"step": 8890
},
{
"epoch": 9.165808444902163,
"grad_norm": 0.46850427985191345,
"learning_rate": 7.440701426841361e-05,
"loss": 0.0405,
"step": 8900
},
{
"epoch": 9.176107106076211,
"grad_norm": 0.2926773726940155,
"learning_rate": 7.434686233116022e-05,
"loss": 0.0485,
"step": 8910
},
{
"epoch": 9.186405767250257,
"grad_norm": 0.46925362944602966,
"learning_rate": 7.428666416932589e-05,
"loss": 0.0385,
"step": 8920
},
{
"epoch": 9.196704428424304,
"grad_norm": 0.4723007380962372,
"learning_rate": 7.422641989720193e-05,
"loss": 0.0378,
"step": 8930
},
{
"epoch": 9.207003089598352,
"grad_norm": 0.3344404995441437,
"learning_rate": 7.416612962916718e-05,
"loss": 0.0417,
"step": 8940
},
{
"epoch": 9.2173017507724,
"grad_norm": 0.43562179803848267,
"learning_rate": 7.410579347968782e-05,
"loss": 0.0451,
"step": 8950
},
{
"epoch": 9.227600411946447,
"grad_norm": 0.4855707287788391,
"learning_rate": 7.404541156331717e-05,
"loss": 0.037,
"step": 8960
},
{
"epoch": 9.237899073120495,
"grad_norm": 0.30603402853012085,
"learning_rate": 7.398498399469539e-05,
"loss": 0.0436,
"step": 8970
},
{
"epoch": 9.248197734294541,
"grad_norm": 0.2129911333322525,
"learning_rate": 7.392451088854934e-05,
"loss": 0.0522,
"step": 8980
},
{
"epoch": 9.258496395468589,
"grad_norm": 0.361104279756546,
"learning_rate": 7.386399235969235e-05,
"loss": 0.0476,
"step": 8990
},
{
"epoch": 9.268795056642636,
"grad_norm": 0.2940675616264343,
"learning_rate": 7.380342852302395e-05,
"loss": 0.041,
"step": 9000
},
{
"epoch": 9.279093717816684,
"grad_norm": 0.18984778225421906,
"learning_rate": 7.374281949352973e-05,
"loss": 0.0368,
"step": 9010
},
{
"epoch": 9.289392378990732,
"grad_norm": 0.31379929184913635,
"learning_rate": 7.368216538628108e-05,
"loss": 0.0383,
"step": 9020
},
{
"epoch": 9.29969104016478,
"grad_norm": 0.26660239696502686,
"learning_rate": 7.362146631643495e-05,
"loss": 0.0414,
"step": 9030
},
{
"epoch": 9.309989701338825,
"grad_norm": 0.5518432259559631,
"learning_rate": 7.356072239923366e-05,
"loss": 0.0414,
"step": 9040
},
{
"epoch": 9.320288362512873,
"grad_norm": 0.3368736207485199,
"learning_rate": 7.349993375000468e-05,
"loss": 0.0383,
"step": 9050
},
{
"epoch": 9.33058702368692,
"grad_norm": 1.1266578435897827,
"learning_rate": 7.343910048416043e-05,
"loss": 0.045,
"step": 9060
},
{
"epoch": 9.340885684860968,
"grad_norm": 0.39378392696380615,
"learning_rate": 7.337822271719802e-05,
"loss": 0.035,
"step": 9070
},
{
"epoch": 9.351184346035016,
"grad_norm": 0.30862686038017273,
"learning_rate": 7.331730056469901e-05,
"loss": 0.0405,
"step": 9080
},
{
"epoch": 9.361483007209063,
"grad_norm": 0.3060796856880188,
"learning_rate": 7.325633414232933e-05,
"loss": 0.0336,
"step": 9090
},
{
"epoch": 9.371781668383111,
"grad_norm": 0.33509689569473267,
"learning_rate": 7.319532356583884e-05,
"loss": 0.0371,
"step": 9100
},
{
"epoch": 9.382080329557157,
"grad_norm": 0.320268452167511,
"learning_rate": 7.313426895106133e-05,
"loss": 0.0397,
"step": 9110
},
{
"epoch": 9.392378990731205,
"grad_norm": 0.7082319259643555,
"learning_rate": 7.307317041391415e-05,
"loss": 0.0515,
"step": 9120
},
{
"epoch": 9.402677651905252,
"grad_norm": 0.29491856694221497,
"learning_rate": 7.301202807039801e-05,
"loss": 0.033,
"step": 9130
},
{
"epoch": 9.4129763130793,
"grad_norm": 0.5242739319801331,
"learning_rate": 7.295084203659689e-05,
"loss": 0.0426,
"step": 9140
},
{
"epoch": 9.423274974253347,
"grad_norm": 0.38009077310562134,
"learning_rate": 7.288961242867762e-05,
"loss": 0.0401,
"step": 9150
},
{
"epoch": 9.433573635427395,
"grad_norm": 0.3632734417915344,
"learning_rate": 7.282833936288981e-05,
"loss": 0.0409,
"step": 9160
},
{
"epoch": 9.443872296601441,
"grad_norm": 0.23923484981060028,
"learning_rate": 7.276702295556557e-05,
"loss": 0.0415,
"step": 9170
},
{
"epoch": 9.454170957775489,
"grad_norm": 0.2659699320793152,
"learning_rate": 7.27056633231193e-05,
"loss": 0.0472,
"step": 9180
},
{
"epoch": 9.464469618949536,
"grad_norm": 0.4790288805961609,
"learning_rate": 7.264426058204741e-05,
"loss": 0.0423,
"step": 9190
},
{
"epoch": 9.474768280123584,
"grad_norm": 0.6561058163642883,
"learning_rate": 7.258281484892829e-05,
"loss": 0.0388,
"step": 9200
},
{
"epoch": 9.485066941297632,
"grad_norm": 0.28043946623802185,
"learning_rate": 7.252132624042182e-05,
"loss": 0.0405,
"step": 9210
},
{
"epoch": 9.49536560247168,
"grad_norm": 0.2707328200340271,
"learning_rate": 7.245979487326933e-05,
"loss": 0.0387,
"step": 9220
},
{
"epoch": 9.505664263645727,
"grad_norm": 0.32571718096733093,
"learning_rate": 7.239822086429335e-05,
"loss": 0.0463,
"step": 9230
},
{
"epoch": 9.515962924819773,
"grad_norm": 0.25263553857803345,
"learning_rate": 7.233660433039734e-05,
"loss": 0.0379,
"step": 9240
},
{
"epoch": 9.52626158599382,
"grad_norm": 0.2631954848766327,
"learning_rate": 7.227494538856552e-05,
"loss": 0.0424,
"step": 9250
},
{
"epoch": 9.536560247167868,
"grad_norm": 0.40027284622192383,
"learning_rate": 7.221324415586261e-05,
"loss": 0.0386,
"step": 9260
},
{
"epoch": 9.546858908341916,
"grad_norm": 0.6129460334777832,
"learning_rate": 7.215150074943365e-05,
"loss": 0.0364,
"step": 9270
},
{
"epoch": 9.557157569515963,
"grad_norm": 0.21345844864845276,
"learning_rate": 7.20897152865037e-05,
"loss": 0.0417,
"step": 9280
},
{
"epoch": 9.567456230690011,
"grad_norm": 2.892314910888672,
"learning_rate": 7.20278878843777e-05,
"loss": 0.0385,
"step": 9290
},
{
"epoch": 9.577754891864057,
"grad_norm": 0.3310607969760895,
"learning_rate": 7.196601866044023e-05,
"loss": 0.0384,
"step": 9300
},
{
"epoch": 9.588053553038105,
"grad_norm": 0.20281566679477692,
"learning_rate": 7.190410773215524e-05,
"loss": 0.0521,
"step": 9310
},
{
"epoch": 9.598352214212152,
"grad_norm": 0.5886447429656982,
"learning_rate": 7.184215521706585e-05,
"loss": 0.0434,
"step": 9320
},
{
"epoch": 9.6086508753862,
"grad_norm": 0.3617643713951111,
"learning_rate": 7.178016123279421e-05,
"loss": 0.0376,
"step": 9330
},
{
"epoch": 9.618949536560248,
"grad_norm": 0.7849340438842773,
"learning_rate": 7.17181258970411e-05,
"loss": 0.0443,
"step": 9340
},
{
"epoch": 9.629248197734295,
"grad_norm": 0.23935212194919586,
"learning_rate": 7.16560493275859e-05,
"loss": 0.0358,
"step": 9350
},
{
"epoch": 9.639546858908343,
"grad_norm": 0.2504394054412842,
"learning_rate": 7.159393164228622e-05,
"loss": 0.0398,
"step": 9360
},
{
"epoch": 9.649845520082389,
"grad_norm": 0.27913835644721985,
"learning_rate": 7.153177295907774e-05,
"loss": 0.0478,
"step": 9370
},
{
"epoch": 9.660144181256436,
"grad_norm": 0.36944735050201416,
"learning_rate": 7.1469573395974e-05,
"loss": 0.0447,
"step": 9380
},
{
"epoch": 9.670442842430484,
"grad_norm": 0.3498212695121765,
"learning_rate": 7.140733307106615e-05,
"loss": 0.045,
"step": 9390
},
{
"epoch": 9.680741503604532,
"grad_norm": 0.5487031936645508,
"learning_rate": 7.13450521025227e-05,
"loss": 0.0436,
"step": 9400
},
{
"epoch": 9.69104016477858,
"grad_norm": 0.2848561108112335,
"learning_rate": 7.128273060858935e-05,
"loss": 0.043,
"step": 9410
},
{
"epoch": 9.701338825952627,
"grad_norm": 0.6054211258888245,
"learning_rate": 7.122036870758875e-05,
"loss": 0.0417,
"step": 9420
},
{
"epoch": 9.711637487126673,
"grad_norm": 0.40893781185150146,
"learning_rate": 7.115796651792023e-05,
"loss": 0.0449,
"step": 9430
},
{
"epoch": 9.72193614830072,
"grad_norm": 0.252646803855896,
"learning_rate": 7.109552415805964e-05,
"loss": 0.0411,
"step": 9440
},
{
"epoch": 9.732234809474768,
"grad_norm": 0.24804732203483582,
"learning_rate": 7.10330417465591e-05,
"loss": 0.0453,
"step": 9450
},
{
"epoch": 9.742533470648816,
"grad_norm": 0.315452516078949,
"learning_rate": 7.097051940204677e-05,
"loss": 0.0407,
"step": 9460
},
{
"epoch": 9.752832131822863,
"grad_norm": 0.2799887955188751,
"learning_rate": 7.090795724322661e-05,
"loss": 0.04,
"step": 9470
},
{
"epoch": 9.763130792996911,
"grad_norm": 0.38197386264801025,
"learning_rate": 7.084535538887816e-05,
"loss": 0.0421,
"step": 9480
},
{
"epoch": 9.773429454170957,
"grad_norm": 0.23834991455078125,
"learning_rate": 7.078271395785638e-05,
"loss": 0.0337,
"step": 9490
},
{
"epoch": 9.783728115345005,
"grad_norm": 0.3643531799316406,
"learning_rate": 7.07200330690913e-05,
"loss": 0.0503,
"step": 9500
},
{
"epoch": 9.794026776519052,
"grad_norm": 0.6071295142173767,
"learning_rate": 7.06573128415879e-05,
"loss": 0.0492,
"step": 9510
},
{
"epoch": 9.8043254376931,
"grad_norm": 0.4726833701133728,
"learning_rate": 7.059455339442589e-05,
"loss": 0.0437,
"step": 9520
},
{
"epoch": 9.814624098867148,
"grad_norm": 0.43971115350723267,
"learning_rate": 7.053175484675935e-05,
"loss": 0.0372,
"step": 9530
},
{
"epoch": 9.824922760041195,
"grad_norm": 0.372178316116333,
"learning_rate": 7.046891731781667e-05,
"loss": 0.0419,
"step": 9540
},
{
"epoch": 9.835221421215241,
"grad_norm": 0.36207082867622375,
"learning_rate": 7.04060409269002e-05,
"loss": 0.0414,
"step": 9550
},
{
"epoch": 9.845520082389289,
"grad_norm": 0.34154579043388367,
"learning_rate": 7.034312579338611e-05,
"loss": 0.044,
"step": 9560
},
{
"epoch": 9.855818743563336,
"grad_norm": 0.22793729603290558,
"learning_rate": 7.028017203672412e-05,
"loss": 0.0351,
"step": 9570
},
{
"epoch": 9.866117404737384,
"grad_norm": 0.24308715760707855,
"learning_rate": 7.021717977643726e-05,
"loss": 0.0386,
"step": 9580
},
{
"epoch": 9.876416065911432,
"grad_norm": 0.28684303164482117,
"learning_rate": 7.015414913212166e-05,
"loss": 0.0463,
"step": 9590
},
{
"epoch": 9.88671472708548,
"grad_norm": 0.38573402166366577,
"learning_rate": 7.009108022344637e-05,
"loss": 0.0395,
"step": 9600
},
{
"epoch": 9.897013388259527,
"grad_norm": 0.5661159157752991,
"learning_rate": 7.002797317015302e-05,
"loss": 0.0369,
"step": 9610
},
{
"epoch": 9.907312049433573,
"grad_norm": 0.4411841332912445,
"learning_rate": 6.996482809205574e-05,
"loss": 0.0464,
"step": 9620
},
{
"epoch": 9.91761071060762,
"grad_norm": 0.30994483828544617,
"learning_rate": 6.990164510904077e-05,
"loss": 0.0437,
"step": 9630
},
{
"epoch": 9.927909371781668,
"grad_norm": 0.24947911500930786,
"learning_rate": 6.983842434106637e-05,
"loss": 0.0385,
"step": 9640
},
{
"epoch": 9.938208032955716,
"grad_norm": 0.23983316123485565,
"learning_rate": 6.977516590816255e-05,
"loss": 0.0428,
"step": 9650
},
{
"epoch": 9.948506694129764,
"grad_norm": 0.35743358731269836,
"learning_rate": 6.971186993043076e-05,
"loss": 0.0391,
"step": 9660
},
{
"epoch": 9.958805355303811,
"grad_norm": 0.23140031099319458,
"learning_rate": 6.964853652804382e-05,
"loss": 0.0366,
"step": 9670
},
{
"epoch": 9.969104016477857,
"grad_norm": 0.2773256003856659,
"learning_rate": 6.958516582124552e-05,
"loss": 0.0414,
"step": 9680
},
{
"epoch": 9.979402677651905,
"grad_norm": 0.28996285796165466,
"learning_rate": 6.952175793035053e-05,
"loss": 0.0406,
"step": 9690
},
{
"epoch": 9.989701338825952,
"grad_norm": 0.4465389549732208,
"learning_rate": 6.945831297574414e-05,
"loss": 0.0388,
"step": 9700
},
{
"epoch": 10.0,
"grad_norm": 0.44371479749679565,
"learning_rate": 6.939483107788192e-05,
"loss": 0.0434,
"step": 9710
},
{
"epoch": 10.010298661174048,
"grad_norm": 0.26840487122535706,
"learning_rate": 6.933131235728967e-05,
"loss": 0.0426,
"step": 9720
},
{
"epoch": 10.020597322348095,
"grad_norm": 0.46961385011672974,
"learning_rate": 6.926775693456303e-05,
"loss": 0.0457,
"step": 9730
},
{
"epoch": 10.030895983522143,
"grad_norm": 0.2932722866535187,
"learning_rate": 6.920416493036737e-05,
"loss": 0.0444,
"step": 9740
},
{
"epoch": 10.041194644696189,
"grad_norm": 0.4758000671863556,
"learning_rate": 6.91405364654375e-05,
"loss": 0.0433,
"step": 9750
},
{
"epoch": 10.051493305870236,
"grad_norm": 0.960474967956543,
"learning_rate": 6.907687166057746e-05,
"loss": 0.0475,
"step": 9760
},
{
"epoch": 10.061791967044284,
"grad_norm": 0.3127216398715973,
"learning_rate": 6.901317063666025e-05,
"loss": 0.0428,
"step": 9770
},
{
"epoch": 10.072090628218332,
"grad_norm": 0.3544008433818817,
"learning_rate": 6.894943351462769e-05,
"loss": 0.0432,
"step": 9780
},
{
"epoch": 10.08238928939238,
"grad_norm": 0.3885089159011841,
"learning_rate": 6.888566041549008e-05,
"loss": 0.043,
"step": 9790
},
{
"epoch": 10.092687950566427,
"grad_norm": 0.4694571793079376,
"learning_rate": 6.882185146032607e-05,
"loss": 0.0426,
"step": 9800
},
{
"epoch": 10.102986611740473,
"grad_norm": 0.43533068895339966,
"learning_rate": 6.875800677028235e-05,
"loss": 0.0425,
"step": 9810
},
{
"epoch": 10.11328527291452,
"grad_norm": 0.2700258493423462,
"learning_rate": 6.869412646657352e-05,
"loss": 0.0404,
"step": 9820
},
{
"epoch": 10.123583934088568,
"grad_norm": 0.5093254446983337,
"learning_rate": 6.863021067048168e-05,
"loss": 0.0486,
"step": 9830
},
{
"epoch": 10.133882595262616,
"grad_norm": 0.20484302937984467,
"learning_rate": 6.856625950335645e-05,
"loss": 0.0425,
"step": 9840
},
{
"epoch": 10.144181256436664,
"grad_norm": 0.2024274617433548,
"learning_rate": 6.850227308661448e-05,
"loss": 0.0496,
"step": 9850
},
{
"epoch": 10.154479917610711,
"grad_norm": 0.33070307970046997,
"learning_rate": 6.843825154173945e-05,
"loss": 0.0469,
"step": 9860
},
{
"epoch": 10.164778578784759,
"grad_norm": 0.27297335863113403,
"learning_rate": 6.837419499028166e-05,
"loss": 0.0433,
"step": 9870
},
{
"epoch": 10.175077239958805,
"grad_norm": 0.3384619951248169,
"learning_rate": 6.831010355385791e-05,
"loss": 0.0493,
"step": 9880
},
{
"epoch": 10.185375901132852,
"grad_norm": 0.3151882588863373,
"learning_rate": 6.824597735415123e-05,
"loss": 0.0387,
"step": 9890
},
{
"epoch": 10.1956745623069,
"grad_norm": 0.36280104517936707,
"learning_rate": 6.818181651291062e-05,
"loss": 0.0366,
"step": 9900
},
{
"epoch": 10.205973223480948,
"grad_norm": 0.3145824670791626,
"learning_rate": 6.811762115195089e-05,
"loss": 0.0349,
"step": 9910
},
{
"epoch": 10.216271884654995,
"grad_norm": 0.5121240615844727,
"learning_rate": 6.805339139315239e-05,
"loss": 0.0444,
"step": 9920
},
{
"epoch": 10.226570545829043,
"grad_norm": 0.3912114202976227,
"learning_rate": 6.798912735846072e-05,
"loss": 0.0355,
"step": 9930
},
{
"epoch": 10.236869207003089,
"grad_norm": 0.3818724751472473,
"learning_rate": 6.792482916988661e-05,
"loss": 0.0355,
"step": 9940
},
{
"epoch": 10.247167868177137,
"grad_norm": 0.4238334894180298,
"learning_rate": 6.786049694950564e-05,
"loss": 0.0407,
"step": 9950
},
{
"epoch": 10.257466529351184,
"grad_norm": 0.34777218103408813,
"learning_rate": 6.779613081945795e-05,
"loss": 0.0414,
"step": 9960
},
{
"epoch": 10.267765190525232,
"grad_norm": 0.21516083180904388,
"learning_rate": 6.77317309019481e-05,
"loss": 0.0435,
"step": 9970
},
{
"epoch": 10.27806385169928,
"grad_norm": 0.5008741617202759,
"learning_rate": 6.766729731924481e-05,
"loss": 0.0428,
"step": 9980
},
{
"epoch": 10.288362512873327,
"grad_norm": 0.3099767863750458,
"learning_rate": 6.760283019368067e-05,
"loss": 0.041,
"step": 9990
},
{
"epoch": 10.298661174047373,
"grad_norm": 0.3440527319908142,
"learning_rate": 6.753832964765199e-05,
"loss": 0.043,
"step": 10000
},
{
"epoch": 10.30895983522142,
"grad_norm": 0.22595007717609406,
"learning_rate": 6.747379580361853e-05,
"loss": 0.0416,
"step": 10010
},
{
"epoch": 10.319258496395468,
"grad_norm": 0.41214779019355774,
"learning_rate": 6.740922878410324e-05,
"loss": 0.0429,
"step": 10020
},
{
"epoch": 10.329557157569516,
"grad_norm": 0.2798968553543091,
"learning_rate": 6.73446287116921e-05,
"loss": 0.0348,
"step": 10030
},
{
"epoch": 10.339855818743564,
"grad_norm": 0.4669380784034729,
"learning_rate": 6.727999570903381e-05,
"loss": 0.0401,
"step": 10040
},
{
"epoch": 10.350154479917611,
"grad_norm": 0.6160303950309753,
"learning_rate": 6.721532989883958e-05,
"loss": 0.0419,
"step": 10050
},
{
"epoch": 10.360453141091659,
"grad_norm": 0.45790204405784607,
"learning_rate": 6.715063140388297e-05,
"loss": 0.0407,
"step": 10060
},
{
"epoch": 10.370751802265705,
"grad_norm": 0.3356086015701294,
"learning_rate": 6.708590034699954e-05,
"loss": 0.0371,
"step": 10070
},
{
"epoch": 10.381050463439752,
"grad_norm": 0.2664395272731781,
"learning_rate": 6.702113685108668e-05,
"loss": 0.042,
"step": 10080
},
{
"epoch": 10.3913491246138,
"grad_norm": 0.452416330575943,
"learning_rate": 6.695634103910336e-05,
"loss": 0.0343,
"step": 10090
},
{
"epoch": 10.401647785787848,
"grad_norm": 0.25324398279190063,
"learning_rate": 6.689151303406995e-05,
"loss": 0.0387,
"step": 10100
},
{
"epoch": 10.411946446961895,
"grad_norm": 0.49538522958755493,
"learning_rate": 6.682665295906789e-05,
"loss": 0.0435,
"step": 10110
},
{
"epoch": 10.422245108135943,
"grad_norm": 0.4202151596546173,
"learning_rate": 6.676176093723952e-05,
"loss": 0.0379,
"step": 10120
},
{
"epoch": 10.432543769309989,
"grad_norm": 0.37350866198539734,
"learning_rate": 6.669683709178788e-05,
"loss": 0.0344,
"step": 10130
},
{
"epoch": 10.442842430484037,
"grad_norm": 0.33116838335990906,
"learning_rate": 6.663188154597635e-05,
"loss": 0.0402,
"step": 10140
},
{
"epoch": 10.453141091658084,
"grad_norm": 0.2758043706417084,
"learning_rate": 6.656689442312855e-05,
"loss": 0.0369,
"step": 10150
},
{
"epoch": 10.463439752832132,
"grad_norm": 0.7657129168510437,
"learning_rate": 6.650187584662804e-05,
"loss": 0.0384,
"step": 10160
},
{
"epoch": 10.47373841400618,
"grad_norm": 0.5935283899307251,
"learning_rate": 6.643682593991812e-05,
"loss": 0.042,
"step": 10170
},
{
"epoch": 10.484037075180227,
"grad_norm": 0.30886420607566833,
"learning_rate": 6.637174482650149e-05,
"loss": 0.0418,
"step": 10180
},
{
"epoch": 10.494335736354273,
"grad_norm": 0.3029436767101288,
"learning_rate": 6.630663262994023e-05,
"loss": 0.0347,
"step": 10190
},
{
"epoch": 10.50463439752832,
"grad_norm": 1.5523313283920288,
"learning_rate": 6.62414894738553e-05,
"loss": 0.0425,
"step": 10200
},
{
"epoch": 10.514933058702368,
"grad_norm": 0.42397624254226685,
"learning_rate": 6.617631548192654e-05,
"loss": 0.037,
"step": 10210
},
{
"epoch": 10.525231719876416,
"grad_norm": 0.22810333967208862,
"learning_rate": 6.611111077789229e-05,
"loss": 0.0363,
"step": 10220
},
{
"epoch": 10.535530381050464,
"grad_norm": 2.0810601711273193,
"learning_rate": 6.604587548554918e-05,
"loss": 0.0354,
"step": 10230
},
{
"epoch": 10.545829042224511,
"grad_norm": 0.3718070089817047,
"learning_rate": 6.598060972875197e-05,
"loss": 0.045,
"step": 10240
},
{
"epoch": 10.556127703398559,
"grad_norm": 0.4691748321056366,
"learning_rate": 6.591531363141322e-05,
"loss": 0.0437,
"step": 10250
},
{
"epoch": 10.566426364572605,
"grad_norm": 0.3207489252090454,
"learning_rate": 6.58499873175031e-05,
"loss": 0.0415,
"step": 10260
},
{
"epoch": 10.576725025746653,
"grad_norm": 0.3843994140625,
"learning_rate": 6.578463091104915e-05,
"loss": 0.0402,
"step": 10270
},
{
"epoch": 10.5870236869207,
"grad_norm": 0.3183903992176056,
"learning_rate": 6.571924453613604e-05,
"loss": 0.0408,
"step": 10280
},
{
"epoch": 10.597322348094748,
"grad_norm": 0.38627657294273376,
"learning_rate": 6.565382831690536e-05,
"loss": 0.0465,
"step": 10290
},
{
"epoch": 10.607621009268795,
"grad_norm": 0.402624249458313,
"learning_rate": 6.558838237755533e-05,
"loss": 0.0405,
"step": 10300
},
{
"epoch": 10.617919670442843,
"grad_norm": 0.43303969502449036,
"learning_rate": 6.55229068423406e-05,
"loss": 0.0425,
"step": 10310
},
{
"epoch": 10.628218331616889,
"grad_norm": 0.23877955973148346,
"learning_rate": 6.545740183557205e-05,
"loss": 0.0398,
"step": 10320
},
{
"epoch": 10.638516992790937,
"grad_norm": 0.39281123876571655,
"learning_rate": 6.539186748161647e-05,
"loss": 0.0333,
"step": 10330
},
{
"epoch": 10.648815653964984,
"grad_norm": 0.2784782350063324,
"learning_rate": 6.53263039048964e-05,
"loss": 0.0309,
"step": 10340
},
{
"epoch": 10.659114315139032,
"grad_norm": 0.30646565556526184,
"learning_rate": 6.526071122988981e-05,
"loss": 0.0377,
"step": 10350
},
{
"epoch": 10.66941297631308,
"grad_norm": 0.6558123230934143,
"learning_rate": 6.519508958112998e-05,
"loss": 0.038,
"step": 10360
},
{
"epoch": 10.679711637487127,
"grad_norm": 0.30704358220100403,
"learning_rate": 6.512943908320514e-05,
"loss": 0.0347,
"step": 10370
},
{
"epoch": 10.690010298661175,
"grad_norm": 0.3183159828186035,
"learning_rate": 6.506375986075838e-05,
"loss": 0.0351,
"step": 10380
},
{
"epoch": 10.70030895983522,
"grad_norm": 0.4113023579120636,
"learning_rate": 6.499805203848721e-05,
"loss": 0.042,
"step": 10390
},
{
"epoch": 10.710607621009268,
"grad_norm": 0.3853531777858734,
"learning_rate": 6.493231574114352e-05,
"loss": 0.0379,
"step": 10400
},
{
"epoch": 10.720906282183316,
"grad_norm": 0.2897813618183136,
"learning_rate": 6.486655109353326e-05,
"loss": 0.0401,
"step": 10410
},
{
"epoch": 10.731204943357364,
"grad_norm": 0.3457646369934082,
"learning_rate": 6.480075822051615e-05,
"loss": 0.0424,
"step": 10420
},
{
"epoch": 10.741503604531411,
"grad_norm": 0.25340861082077026,
"learning_rate": 6.473493724700554e-05,
"loss": 0.0452,
"step": 10430
},
{
"epoch": 10.751802265705459,
"grad_norm": 0.34103667736053467,
"learning_rate": 6.466908829796817e-05,
"loss": 0.0385,
"step": 10440
},
{
"epoch": 10.762100926879505,
"grad_norm": 0.4537379741668701,
"learning_rate": 6.460321149842376e-05,
"loss": 0.0374,
"step": 10450
},
{
"epoch": 10.772399588053553,
"grad_norm": 0.31676602363586426,
"learning_rate": 6.453730697344509e-05,
"loss": 0.0424,
"step": 10460
},
{
"epoch": 10.7826982492276,
"grad_norm": 0.2844392955303192,
"learning_rate": 6.447137484815742e-05,
"loss": 0.0386,
"step": 10470
},
{
"epoch": 10.792996910401648,
"grad_norm": 0.7473909258842468,
"learning_rate": 6.440541524773852e-05,
"loss": 0.0487,
"step": 10480
},
{
"epoch": 10.803295571575696,
"grad_norm": 0.2831460237503052,
"learning_rate": 6.433942829741825e-05,
"loss": 0.0422,
"step": 10490
},
{
"epoch": 10.813594232749743,
"grad_norm": 0.17480167746543884,
"learning_rate": 6.427341412247844e-05,
"loss": 0.0346,
"step": 10500
},
{
"epoch": 10.82389289392379,
"grad_norm": 0.2777688503265381,
"learning_rate": 6.420737284825257e-05,
"loss": 0.0364,
"step": 10510
},
{
"epoch": 10.834191555097837,
"grad_norm": 0.42611163854599,
"learning_rate": 6.414130460012563e-05,
"loss": 0.0419,
"step": 10520
},
{
"epoch": 10.844490216271884,
"grad_norm": 0.28869888186454773,
"learning_rate": 6.407520950353377e-05,
"loss": 0.039,
"step": 10530
},
{
"epoch": 10.854788877445932,
"grad_norm": 0.25100454688072205,
"learning_rate": 6.400908768396414e-05,
"loss": 0.041,
"step": 10540
},
{
"epoch": 10.86508753861998,
"grad_norm": 0.3695141673088074,
"learning_rate": 6.394293926695458e-05,
"loss": 0.0358,
"step": 10550
},
{
"epoch": 10.875386199794027,
"grad_norm": 0.3969493508338928,
"learning_rate": 6.387676437809352e-05,
"loss": 0.0425,
"step": 10560
},
{
"epoch": 10.885684860968075,
"grad_norm": 0.4601006805896759,
"learning_rate": 6.381056314301955e-05,
"loss": 0.0424,
"step": 10570
},
{
"epoch": 10.89598352214212,
"grad_norm": 0.2443699687719345,
"learning_rate": 6.374433568742135e-05,
"loss": 0.0356,
"step": 10580
},
{
"epoch": 10.906282183316168,
"grad_norm": 0.3879341781139374,
"learning_rate": 6.367808213703735e-05,
"loss": 0.0377,
"step": 10590
},
{
"epoch": 10.916580844490216,
"grad_norm": 0.47516438364982605,
"learning_rate": 6.361180261765551e-05,
"loss": 0.0403,
"step": 10600
},
{
"epoch": 10.926879505664264,
"grad_norm": 0.8033366799354553,
"learning_rate": 6.354549725511312e-05,
"loss": 0.0379,
"step": 10610
},
{
"epoch": 10.937178166838311,
"grad_norm": 0.31796321272850037,
"learning_rate": 6.347916617529655e-05,
"loss": 0.0344,
"step": 10620
},
{
"epoch": 10.947476828012359,
"grad_norm": 0.22474992275238037,
"learning_rate": 6.341280950414096e-05,
"loss": 0.0362,
"step": 10630
},
{
"epoch": 10.957775489186405,
"grad_norm": 0.3195785880088806,
"learning_rate": 6.334642736763011e-05,
"loss": 0.0433,
"step": 10640
},
{
"epoch": 10.968074150360453,
"grad_norm": 0.2575366199016571,
"learning_rate": 6.328001989179613e-05,
"loss": 0.0392,
"step": 10650
},
{
"epoch": 10.9783728115345,
"grad_norm": 0.2524723708629608,
"learning_rate": 6.321358720271921e-05,
"loss": 0.0361,
"step": 10660
},
{
"epoch": 10.988671472708548,
"grad_norm": 0.3837646245956421,
"learning_rate": 6.314712942652744e-05,
"loss": 0.0323,
"step": 10670
},
{
"epoch": 10.998970133882596,
"grad_norm": 0.28651994466781616,
"learning_rate": 6.308064668939656e-05,
"loss": 0.0338,
"step": 10680
},
{
"epoch": 11.009268795056643,
"grad_norm": 0.2467758059501648,
"learning_rate": 6.301413911754966e-05,
"loss": 0.041,
"step": 10690
},
{
"epoch": 11.019567456230691,
"grad_norm": 0.47968339920043945,
"learning_rate": 6.294760683725702e-05,
"loss": 0.0393,
"step": 10700
},
{
"epoch": 11.029866117404737,
"grad_norm": 0.43202805519104004,
"learning_rate": 6.28810499748358e-05,
"loss": 0.04,
"step": 10710
},
{
"epoch": 11.040164778578784,
"grad_norm": 0.27403122186660767,
"learning_rate": 6.281446865664984e-05,
"loss": 0.0445,
"step": 10720
},
{
"epoch": 11.050463439752832,
"grad_norm": 0.28393465280532837,
"learning_rate": 6.274786300910942e-05,
"loss": 0.0344,
"step": 10730
},
{
"epoch": 11.06076210092688,
"grad_norm": 0.2949467599391937,
"learning_rate": 6.2681233158671e-05,
"loss": 0.0396,
"step": 10740
},
{
"epoch": 11.071060762100927,
"grad_norm": 0.30573686957359314,
"learning_rate": 6.2614579231837e-05,
"loss": 0.0389,
"step": 10750
},
{
"epoch": 11.081359423274975,
"grad_norm": 0.32069993019104004,
"learning_rate": 6.254790135515554e-05,
"loss": 0.0417,
"step": 10760
},
{
"epoch": 11.091658084449021,
"grad_norm": 0.42498165369033813,
"learning_rate": 6.248119965522024e-05,
"loss": 0.0435,
"step": 10770
},
{
"epoch": 11.101956745623069,
"grad_norm": 0.2858854830265045,
"learning_rate": 6.241447425866988e-05,
"loss": 0.0437,
"step": 10780
},
{
"epoch": 11.112255406797116,
"grad_norm": 0.38175907731056213,
"learning_rate": 6.234772529218833e-05,
"loss": 0.0412,
"step": 10790
},
{
"epoch": 11.122554067971164,
"grad_norm": 0.7922874689102173,
"learning_rate": 6.228095288250415e-05,
"loss": 0.046,
"step": 10800
},
{
"epoch": 11.132852729145212,
"grad_norm": 0.25292420387268066,
"learning_rate": 6.22141571563904e-05,
"loss": 0.0372,
"step": 10810
},
{
"epoch": 11.14315139031926,
"grad_norm": 0.49811357259750366,
"learning_rate": 6.214733824066443e-05,
"loss": 0.0402,
"step": 10820
},
{
"epoch": 11.153450051493305,
"grad_norm": 0.33916985988616943,
"learning_rate": 6.208049626218761e-05,
"loss": 0.0413,
"step": 10830
},
{
"epoch": 11.163748712667353,
"grad_norm": 0.3031541109085083,
"learning_rate": 6.20136313478651e-05,
"loss": 0.0409,
"step": 10840
},
{
"epoch": 11.1740473738414,
"grad_norm": 0.34187036752700806,
"learning_rate": 6.194674362464563e-05,
"loss": 0.0387,
"step": 10850
},
{
"epoch": 11.184346035015448,
"grad_norm": 0.41674673557281494,
"learning_rate": 6.187983321952117e-05,
"loss": 0.0331,
"step": 10860
},
{
"epoch": 11.194644696189496,
"grad_norm": 0.39333340525627136,
"learning_rate": 6.181290025952684e-05,
"loss": 0.0355,
"step": 10870
},
{
"epoch": 11.204943357363543,
"grad_norm": 0.23343288898468018,
"learning_rate": 6.174594487174047e-05,
"loss": 0.037,
"step": 10880
},
{
"epoch": 11.215242018537591,
"grad_norm": 0.26009804010391235,
"learning_rate": 6.167896718328259e-05,
"loss": 0.036,
"step": 10890
},
{
"epoch": 11.225540679711637,
"grad_norm": 0.3702475428581238,
"learning_rate": 6.161196732131601e-05,
"loss": 0.038,
"step": 10900
},
{
"epoch": 11.235839340885684,
"grad_norm": 0.22221307456493378,
"learning_rate": 6.154494541304561e-05,
"loss": 0.033,
"step": 10910
},
{
"epoch": 11.246138002059732,
"grad_norm": 0.27125799655914307,
"learning_rate": 6.147790158571821e-05,
"loss": 0.0362,
"step": 10920
},
{
"epoch": 11.25643666323378,
"grad_norm": 0.32463979721069336,
"learning_rate": 6.141083596662218e-05,
"loss": 0.0412,
"step": 10930
},
{
"epoch": 11.266735324407827,
"grad_norm": 0.28501760959625244,
"learning_rate": 6.134374868308726e-05,
"loss": 0.0414,
"step": 10940
},
{
"epoch": 11.277033985581875,
"grad_norm": 0.39163145422935486,
"learning_rate": 6.127663986248434e-05,
"loss": 0.0323,
"step": 10950
},
{
"epoch": 11.287332646755921,
"grad_norm": 0.3159801959991455,
"learning_rate": 6.120950963222523e-05,
"loss": 0.0388,
"step": 10960
},
{
"epoch": 11.297631307929969,
"grad_norm": 0.25974059104919434,
"learning_rate": 6.114235811976235e-05,
"loss": 0.041,
"step": 10970
},
{
"epoch": 11.307929969104016,
"grad_norm": 0.38364288210868835,
"learning_rate": 6.107518545258853e-05,
"loss": 0.035,
"step": 10980
},
{
"epoch": 11.318228630278064,
"grad_norm": 0.4129471778869629,
"learning_rate": 6.100799175823678e-05,
"loss": 0.0455,
"step": 10990
},
{
"epoch": 11.328527291452112,
"grad_norm": 0.3018711805343628,
"learning_rate": 6.094077716428e-05,
"loss": 0.0431,
"step": 11000
},
{
"epoch": 11.33882595262616,
"grad_norm": 0.5106986165046692,
"learning_rate": 6.0873541798330814e-05,
"loss": 0.037,
"step": 11010
},
{
"epoch": 11.349124613800207,
"grad_norm": 0.3932558298110962,
"learning_rate": 6.080628578804125e-05,
"loss": 0.0344,
"step": 11020
},
{
"epoch": 11.359423274974253,
"grad_norm": 0.32844093441963196,
"learning_rate": 6.073900926110254e-05,
"loss": 0.0386,
"step": 11030
},
{
"epoch": 11.3697219361483,
"grad_norm": 0.22941184043884277,
"learning_rate": 6.067171234524488e-05,
"loss": 0.0295,
"step": 11040
},
{
"epoch": 11.380020597322348,
"grad_norm": 0.23273223638534546,
"learning_rate": 6.0604395168237174e-05,
"loss": 0.0334,
"step": 11050
},
{
"epoch": 11.390319258496396,
"grad_norm": 0.26568588614463806,
"learning_rate": 6.0537057857886755e-05,
"loss": 0.0378,
"step": 11060
},
{
"epoch": 11.400617919670443,
"grad_norm": 0.40793830156326294,
"learning_rate": 6.0469700542039234e-05,
"loss": 0.033,
"step": 11070
},
{
"epoch": 11.410916580844491,
"grad_norm": 0.308687299489975,
"learning_rate": 6.040232334857818e-05,
"loss": 0.0411,
"step": 11080
},
{
"epoch": 11.421215242018537,
"grad_norm": 0.359393835067749,
"learning_rate": 6.033492640542491e-05,
"loss": 0.0394,
"step": 11090
},
{
"epoch": 11.431513903192585,
"grad_norm": 0.37227872014045715,
"learning_rate": 6.026750984053821e-05,
"loss": 0.0394,
"step": 11100
},
{
"epoch": 11.441812564366632,
"grad_norm": 0.5043579339981079,
"learning_rate": 6.020007378191416e-05,
"loss": 0.0425,
"step": 11110
},
{
"epoch": 11.45211122554068,
"grad_norm": 0.33296504616737366,
"learning_rate": 6.013261835758581e-05,
"loss": 0.0358,
"step": 11120
},
{
"epoch": 11.462409886714727,
"grad_norm": 0.19659306108951569,
"learning_rate": 6.0065143695623016e-05,
"loss": 0.0352,
"step": 11130
},
{
"epoch": 11.472708547888775,
"grad_norm": 0.3573089838027954,
"learning_rate": 5.9997649924132146e-05,
"loss": 0.0417,
"step": 11140
},
{
"epoch": 11.483007209062821,
"grad_norm": 0.38887205719947815,
"learning_rate": 5.993013717125583e-05,
"loss": 0.0584,
"step": 11150
},
{
"epoch": 11.493305870236869,
"grad_norm": 0.2696877121925354,
"learning_rate": 5.986260556517276e-05,
"loss": 0.0335,
"step": 11160
},
{
"epoch": 11.503604531410916,
"grad_norm": 0.2923082411289215,
"learning_rate": 5.97950552340974e-05,
"loss": 0.0345,
"step": 11170
},
{
"epoch": 11.513903192584964,
"grad_norm": 0.4937894940376282,
"learning_rate": 5.972748630627978e-05,
"loss": 0.0393,
"step": 11180
},
{
"epoch": 11.524201853759012,
"grad_norm": 0.4129011034965515,
"learning_rate": 5.965989891000523e-05,
"loss": 0.0392,
"step": 11190
},
{
"epoch": 11.53450051493306,
"grad_norm": 1.712835669517517,
"learning_rate": 5.9592293173594174e-05,
"loss": 0.0431,
"step": 11200
},
{
"epoch": 11.544799176107105,
"grad_norm": 0.6095489859580994,
"learning_rate": 5.9524669225401794e-05,
"loss": 0.0481,
"step": 11210
},
{
"epoch": 11.555097837281153,
"grad_norm": 0.26651468873023987,
"learning_rate": 5.945702719381791e-05,
"loss": 0.0399,
"step": 11220
},
{
"epoch": 11.5653964984552,
"grad_norm": 0.7687448859214783,
"learning_rate": 5.9389367207266645e-05,
"loss": 0.0422,
"step": 11230
},
{
"epoch": 11.575695159629248,
"grad_norm": 0.3595399558544159,
"learning_rate": 5.9321689394206215e-05,
"loss": 0.0408,
"step": 11240
},
{
"epoch": 11.585993820803296,
"grad_norm": 0.2710624933242798,
"learning_rate": 5.9253993883128666e-05,
"loss": 0.0396,
"step": 11250
},
{
"epoch": 11.596292481977343,
"grad_norm": 0.22831162810325623,
"learning_rate": 5.918628080255969e-05,
"loss": 0.038,
"step": 11260
},
{
"epoch": 11.606591143151391,
"grad_norm": 0.43246155977249146,
"learning_rate": 5.9118550281058295e-05,
"loss": 0.0334,
"step": 11270
},
{
"epoch": 11.616889804325437,
"grad_norm": 0.6237396597862244,
"learning_rate": 5.9050802447216604e-05,
"loss": 0.0354,
"step": 11280
},
{
"epoch": 11.627188465499485,
"grad_norm": 0.4510927200317383,
"learning_rate": 5.898303742965964e-05,
"loss": 0.0408,
"step": 11290
},
{
"epoch": 11.637487126673532,
"grad_norm": 0.2852180004119873,
"learning_rate": 5.8915255357045006e-05,
"loss": 0.0422,
"step": 11300
},
{
"epoch": 11.64778578784758,
"grad_norm": 0.27295804023742676,
"learning_rate": 5.884745635806272e-05,
"loss": 0.0441,
"step": 11310
},
{
"epoch": 11.658084449021628,
"grad_norm": 0.3172348737716675,
"learning_rate": 5.8779640561434943e-05,
"loss": 0.0404,
"step": 11320
},
{
"epoch": 11.668383110195675,
"grad_norm": 0.30553555488586426,
"learning_rate": 5.87118080959157e-05,
"loss": 0.0391,
"step": 11330
},
{
"epoch": 11.678681771369721,
"grad_norm": 0.3673776388168335,
"learning_rate": 5.8643959090290653e-05,
"loss": 0.0362,
"step": 11340
},
{
"epoch": 11.688980432543769,
"grad_norm": 0.3422437012195587,
"learning_rate": 5.857609367337692e-05,
"loss": 0.0384,
"step": 11350
},
{
"epoch": 11.699279093717816,
"grad_norm": 0.2885243892669678,
"learning_rate": 5.850821197402272e-05,
"loss": 0.035,
"step": 11360
},
{
"epoch": 11.709577754891864,
"grad_norm": 0.30880603194236755,
"learning_rate": 5.844031412110722e-05,
"loss": 0.0395,
"step": 11370
},
{
"epoch": 11.719876416065912,
"grad_norm": 0.24245630204677582,
"learning_rate": 5.837240024354026e-05,
"loss": 0.0411,
"step": 11380
},
{
"epoch": 11.73017507723996,
"grad_norm": 0.29702940583229065,
"learning_rate": 5.830447047026206e-05,
"loss": 0.0362,
"step": 11390
},
{
"epoch": 11.740473738414007,
"grad_norm": 0.23466959595680237,
"learning_rate": 5.8236524930243075e-05,
"loss": 0.0318,
"step": 11400
},
{
"epoch": 11.750772399588053,
"grad_norm": 0.2884041666984558,
"learning_rate": 5.816856375248368e-05,
"loss": 0.035,
"step": 11410
},
{
"epoch": 11.7610710607621,
"grad_norm": 0.9963494539260864,
"learning_rate": 5.810058706601389e-05,
"loss": 0.0377,
"step": 11420
},
{
"epoch": 11.771369721936148,
"grad_norm": 0.6087297201156616,
"learning_rate": 5.803259499989323e-05,
"loss": 0.037,
"step": 11430
},
{
"epoch": 11.781668383110196,
"grad_norm": 0.41283634305000305,
"learning_rate": 5.79645876832104e-05,
"loss": 0.0373,
"step": 11440
},
{
"epoch": 11.791967044284243,
"grad_norm": 0.3417651355266571,
"learning_rate": 5.7896565245083035e-05,
"loss": 0.044,
"step": 11450
},
{
"epoch": 11.802265705458291,
"grad_norm": 0.42142239212989807,
"learning_rate": 5.782852781465751e-05,
"loss": 0.0419,
"step": 11460
},
{
"epoch": 11.812564366632337,
"grad_norm": 0.34466907382011414,
"learning_rate": 5.776047552110866e-05,
"loss": 0.042,
"step": 11470
},
{
"epoch": 11.822863027806385,
"grad_norm": 0.29393690824508667,
"learning_rate": 5.769240849363952e-05,
"loss": 0.0435,
"step": 11480
},
{
"epoch": 11.833161688980432,
"grad_norm": 0.45664796233177185,
"learning_rate": 5.7624326861481094e-05,
"loss": 0.0296,
"step": 11490
},
{
"epoch": 11.84346035015448,
"grad_norm": 0.276324063539505,
"learning_rate": 5.755623075389214e-05,
"loss": 0.0397,
"step": 11500
},
{
"epoch": 11.853759011328528,
"grad_norm": 0.35282203555107117,
"learning_rate": 5.748812030015891e-05,
"loss": 0.042,
"step": 11510
},
{
"epoch": 11.864057672502575,
"grad_norm": 0.8150110840797424,
"learning_rate": 5.7419995629594835e-05,
"loss": 0.0347,
"step": 11520
},
{
"epoch": 11.874356333676623,
"grad_norm": 0.7844158411026001,
"learning_rate": 5.735185687154039e-05,
"loss": 0.0458,
"step": 11530
},
{
"epoch": 11.884654994850669,
"grad_norm": 0.2974760830402374,
"learning_rate": 5.7283704155362796e-05,
"loss": 0.036,
"step": 11540
},
{
"epoch": 11.894953656024716,
"grad_norm": 0.25210148096084595,
"learning_rate": 5.7215537610455726e-05,
"loss": 0.0366,
"step": 11550
},
{
"epoch": 11.905252317198764,
"grad_norm": 0.30548202991485596,
"learning_rate": 5.7147357366239174e-05,
"loss": 0.0416,
"step": 11560
},
{
"epoch": 11.915550978372812,
"grad_norm": 0.3365536332130432,
"learning_rate": 5.70791635521591e-05,
"loss": 0.0371,
"step": 11570
},
{
"epoch": 11.92584963954686,
"grad_norm": 0.2530035674571991,
"learning_rate": 5.7010956297687215e-05,
"loss": 0.0331,
"step": 11580
},
{
"epoch": 11.936148300720907,
"grad_norm": 0.2957620918750763,
"learning_rate": 5.694273573232078e-05,
"loss": 0.0395,
"step": 11590
},
{
"epoch": 11.946446961894953,
"grad_norm": 0.2622826099395752,
"learning_rate": 5.6874501985582365e-05,
"loss": 0.0339,
"step": 11600
},
{
"epoch": 11.956745623069,
"grad_norm": 0.25079742074012756,
"learning_rate": 5.6806255187019456e-05,
"loss": 0.034,
"step": 11610
},
{
"epoch": 11.967044284243048,
"grad_norm": 0.28562131524086,
"learning_rate": 5.67379954662044e-05,
"loss": 0.0374,
"step": 11620
},
{
"epoch": 11.977342945417096,
"grad_norm": 0.2564544081687927,
"learning_rate": 5.666972295273409e-05,
"loss": 0.0362,
"step": 11630
},
{
"epoch": 11.987641606591144,
"grad_norm": 0.3891771733760834,
"learning_rate": 5.660143777622964e-05,
"loss": 0.0396,
"step": 11640
},
{
"epoch": 11.997940267765191,
"grad_norm": 0.27068471908569336,
"learning_rate": 5.653314006633625e-05,
"loss": 0.0332,
"step": 11650
},
{
"epoch": 12.008238928939237,
"grad_norm": 0.22479818761348724,
"learning_rate": 5.6464829952722955e-05,
"loss": 0.0401,
"step": 11660
},
{
"epoch": 12.018537590113285,
"grad_norm": 0.2226206511259079,
"learning_rate": 5.639650756508222e-05,
"loss": 0.0333,
"step": 11670
},
{
"epoch": 12.028836251287332,
"grad_norm": 0.3488923907279968,
"learning_rate": 5.6328173033129925e-05,
"loss": 0.0425,
"step": 11680
},
{
"epoch": 12.03913491246138,
"grad_norm": 0.21748347580432892,
"learning_rate": 5.6259826486604996e-05,
"loss": 0.0292,
"step": 11690
},
{
"epoch": 12.049433573635428,
"grad_norm": 0.3232283294200897,
"learning_rate": 5.619146805526908e-05,
"loss": 0.0368,
"step": 11700
},
{
"epoch": 12.059732234809475,
"grad_norm": 0.38965263962745667,
"learning_rate": 5.612309786890649e-05,
"loss": 0.041,
"step": 11710
},
{
"epoch": 12.070030895983523,
"grad_norm": 0.6188856959342957,
"learning_rate": 5.6054716057323816e-05,
"loss": 0.0379,
"step": 11720
},
{
"epoch": 12.080329557157569,
"grad_norm": 0.2538016736507416,
"learning_rate": 5.5986322750349716e-05,
"loss": 0.0313,
"step": 11730
},
{
"epoch": 12.090628218331616,
"grad_norm": 0.26488491892814636,
"learning_rate": 5.591791807783466e-05,
"loss": 0.0459,
"step": 11740
},
{
"epoch": 12.100926879505664,
"grad_norm": 0.3006492555141449,
"learning_rate": 5.584950216965076e-05,
"loss": 0.0324,
"step": 11750
},
{
"epoch": 12.111225540679712,
"grad_norm": 0.3942396342754364,
"learning_rate": 5.5781075155691376e-05,
"loss": 0.03,
"step": 11760
},
{
"epoch": 12.12152420185376,
"grad_norm": 0.27575209736824036,
"learning_rate": 5.571263716587099e-05,
"loss": 0.0304,
"step": 11770
},
{
"epoch": 12.131822863027807,
"grad_norm": 0.30952727794647217,
"learning_rate": 5.5644188330124944e-05,
"loss": 0.0354,
"step": 11780
},
{
"epoch": 12.142121524201853,
"grad_norm": 0.3153734505176544,
"learning_rate": 5.557572877840915e-05,
"loss": 0.0381,
"step": 11790
},
{
"epoch": 12.1524201853759,
"grad_norm": 0.522873044013977,
"learning_rate": 5.5507258640699856e-05,
"loss": 0.0402,
"step": 11800
},
{
"epoch": 12.162718846549948,
"grad_norm": 0.294101357460022,
"learning_rate": 5.5438778046993424e-05,
"loss": 0.0341,
"step": 11810
},
{
"epoch": 12.173017507723996,
"grad_norm": 0.28906193375587463,
"learning_rate": 5.537028712730606e-05,
"loss": 0.0434,
"step": 11820
},
{
"epoch": 12.183316168898044,
"grad_norm": 0.2495001256465912,
"learning_rate": 5.5301786011673586e-05,
"loss": 0.0379,
"step": 11830
},
{
"epoch": 12.193614830072091,
"grad_norm": 0.3273758292198181,
"learning_rate": 5.5233274830151175e-05,
"loss": 0.0387,
"step": 11840
},
{
"epoch": 12.203913491246137,
"grad_norm": 0.27889516949653625,
"learning_rate": 5.516475371281309e-05,
"loss": 0.0381,
"step": 11850
},
{
"epoch": 12.214212152420185,
"grad_norm": 0.314471960067749,
"learning_rate": 5.50962227897525e-05,
"loss": 0.0355,
"step": 11860
},
{
"epoch": 12.224510813594232,
"grad_norm": 0.29174938797950745,
"learning_rate": 5.502768219108118e-05,
"loss": 0.043,
"step": 11870
},
{
"epoch": 12.23480947476828,
"grad_norm": 0.2509532570838928,
"learning_rate": 5.495913204692923e-05,
"loss": 0.0349,
"step": 11880
},
{
"epoch": 12.245108135942328,
"grad_norm": 0.331087589263916,
"learning_rate": 5.489057248744491e-05,
"loss": 0.0358,
"step": 11890
},
{
"epoch": 12.255406797116375,
"grad_norm": 0.27980178594589233,
"learning_rate": 5.482200364279437e-05,
"loss": 0.0363,
"step": 11900
},
{
"epoch": 12.265705458290423,
"grad_norm": 0.24944637715816498,
"learning_rate": 5.475342564316137e-05,
"loss": 0.0359,
"step": 11910
},
{
"epoch": 12.276004119464469,
"grad_norm": 0.21886838972568512,
"learning_rate": 5.468483861874705e-05,
"loss": 0.0363,
"step": 11920
},
{
"epoch": 12.286302780638517,
"grad_norm": 1.8008060455322266,
"learning_rate": 5.461624269976967e-05,
"loss": 0.0328,
"step": 11930
},
{
"epoch": 12.296601441812564,
"grad_norm": 0.19407658278942108,
"learning_rate": 5.454763801646443e-05,
"loss": 0.0316,
"step": 11940
},
{
"epoch": 12.306900102986612,
"grad_norm": 0.28356480598449707,
"learning_rate": 5.44790246990831e-05,
"loss": 0.0386,
"step": 11950
},
{
"epoch": 12.31719876416066,
"grad_norm": 0.2647826075553894,
"learning_rate": 5.441040287789388e-05,
"loss": 0.0383,
"step": 11960
},
{
"epoch": 12.327497425334707,
"grad_norm": 0.30886611342430115,
"learning_rate": 5.4341772683181144e-05,
"loss": 0.0411,
"step": 11970
},
{
"epoch": 12.337796086508753,
"grad_norm": 0.2655414342880249,
"learning_rate": 5.4273134245245095e-05,
"loss": 0.0332,
"step": 11980
},
{
"epoch": 12.3480947476828,
"grad_norm": 0.2595519423484802,
"learning_rate": 5.420448769440163e-05,
"loss": 0.035,
"step": 11990
},
{
"epoch": 12.358393408856848,
"grad_norm": 0.6750108003616333,
"learning_rate": 5.413583316098206e-05,
"loss": 0.0362,
"step": 12000
},
{
"epoch": 12.368692070030896,
"grad_norm": 0.37275227904319763,
"learning_rate": 5.406717077533281e-05,
"loss": 0.0392,
"step": 12010
},
{
"epoch": 12.378990731204944,
"grad_norm": 0.4077666699886322,
"learning_rate": 5.399850066781526e-05,
"loss": 0.0344,
"step": 12020
},
{
"epoch": 12.389289392378991,
"grad_norm": 0.27722200751304626,
"learning_rate": 5.392982296880541e-05,
"loss": 0.0335,
"step": 12030
},
{
"epoch": 12.399588053553039,
"grad_norm": 0.47021421790122986,
"learning_rate": 5.3861137808693695e-05,
"loss": 0.034,
"step": 12040
},
{
"epoch": 12.409886714727085,
"grad_norm": 2.5477991104125977,
"learning_rate": 5.3792445317884696e-05,
"loss": 0.039,
"step": 12050
},
{
"epoch": 12.420185375901132,
"grad_norm": 0.2117021083831787,
"learning_rate": 5.372374562679697e-05,
"loss": 0.0347,
"step": 12060
},
{
"epoch": 12.43048403707518,
"grad_norm": 0.2672605514526367,
"learning_rate": 5.3655038865862664e-05,
"loss": 0.0404,
"step": 12070
},
{
"epoch": 12.440782698249228,
"grad_norm": 0.37322327494621277,
"learning_rate": 5.358632516552738e-05,
"loss": 0.0357,
"step": 12080
},
{
"epoch": 12.451081359423275,
"grad_norm": 0.22624556720256805,
"learning_rate": 5.351760465624993e-05,
"loss": 0.0368,
"step": 12090
},
{
"epoch": 12.461380020597323,
"grad_norm": 0.2701815366744995,
"learning_rate": 5.3448877468502e-05,
"loss": 0.0436,
"step": 12100
},
{
"epoch": 12.471678681771369,
"grad_norm": 0.20336881279945374,
"learning_rate": 5.3380143732768e-05,
"loss": 0.04,
"step": 12110
},
{
"epoch": 12.481977342945417,
"grad_norm": 0.258292555809021,
"learning_rate": 5.331140357954473e-05,
"loss": 0.0376,
"step": 12120
},
{
"epoch": 12.492276004119464,
"grad_norm": 0.30692732334136963,
"learning_rate": 5.324265713934119e-05,
"loss": 0.0409,
"step": 12130
},
{
"epoch": 12.502574665293512,
"grad_norm": 0.3250420391559601,
"learning_rate": 5.317390454267834e-05,
"loss": 0.0359,
"step": 12140
},
{
"epoch": 12.51287332646756,
"grad_norm": 1.7624558210372925,
"learning_rate": 5.310514592008882e-05,
"loss": 0.0358,
"step": 12150
},
{
"epoch": 12.523171987641607,
"grad_norm": 0.45035508275032043,
"learning_rate": 5.303638140211666e-05,
"loss": 0.0465,
"step": 12160
},
{
"epoch": 12.533470648815655,
"grad_norm": 0.30285102128982544,
"learning_rate": 5.296761111931715e-05,
"loss": 0.031,
"step": 12170
},
{
"epoch": 12.5437693099897,
"grad_norm": 0.3329470157623291,
"learning_rate": 5.289883520225651e-05,
"loss": 0.0367,
"step": 12180
},
{
"epoch": 12.554067971163748,
"grad_norm": 0.2198537141084671,
"learning_rate": 5.283005378151162e-05,
"loss": 0.044,
"step": 12190
},
{
"epoch": 12.564366632337796,
"grad_norm": 0.7126184105873108,
"learning_rate": 5.276126698766985e-05,
"loss": 0.0373,
"step": 12200
},
{
"epoch": 12.574665293511844,
"grad_norm": 0.25698626041412354,
"learning_rate": 5.269247495132877e-05,
"loss": 0.0336,
"step": 12210
},
{
"epoch": 12.584963954685891,
"grad_norm": 0.3503556549549103,
"learning_rate": 5.2623677803095864e-05,
"loss": 0.0389,
"step": 12220
},
{
"epoch": 12.595262615859939,
"grad_norm": 0.24767592549324036,
"learning_rate": 5.2554875673588334e-05,
"loss": 0.0337,
"step": 12230
},
{
"epoch": 12.605561277033985,
"grad_norm": 0.21992820501327515,
"learning_rate": 5.24860686934329e-05,
"loss": 0.0423,
"step": 12240
},
{
"epoch": 12.615859938208033,
"grad_norm": 0.8321926593780518,
"learning_rate": 5.2417256993265396e-05,
"loss": 0.0327,
"step": 12250
},
{
"epoch": 12.62615859938208,
"grad_norm": 0.7005571722984314,
"learning_rate": 5.234844070373069e-05,
"loss": 0.0346,
"step": 12260
},
{
"epoch": 12.636457260556128,
"grad_norm": 0.27472203969955444,
"learning_rate": 5.227961995548235e-05,
"loss": 0.0299,
"step": 12270
},
{
"epoch": 12.646755921730175,
"grad_norm": 0.3291379511356354,
"learning_rate": 5.2210794879182376e-05,
"loss": 0.0354,
"step": 12280
},
{
"epoch": 12.657054582904223,
"grad_norm": 0.21298809349536896,
"learning_rate": 5.214196560550101e-05,
"loss": 0.034,
"step": 12290
},
{
"epoch": 12.667353244078269,
"grad_norm": 0.34874945878982544,
"learning_rate": 5.2073132265116456e-05,
"loss": 0.0326,
"step": 12300
},
{
"epoch": 12.677651905252317,
"grad_norm": 0.3092985153198242,
"learning_rate": 5.2004294988714654e-05,
"loss": 0.0366,
"step": 12310
},
{
"epoch": 12.687950566426364,
"grad_norm": 0.23273812234401703,
"learning_rate": 5.1935453906989e-05,
"loss": 0.0344,
"step": 12320
},
{
"epoch": 12.698249227600412,
"grad_norm": 0.3094405233860016,
"learning_rate": 5.1866609150640114e-05,
"loss": 0.0343,
"step": 12330
},
{
"epoch": 12.70854788877446,
"grad_norm": 0.3472210764884949,
"learning_rate": 5.179776085037561e-05,
"loss": 0.0348,
"step": 12340
},
{
"epoch": 12.718846549948507,
"grad_norm": 0.39559873938560486,
"learning_rate": 5.172890913690981e-05,
"loss": 0.035,
"step": 12350
},
{
"epoch": 12.729145211122553,
"grad_norm": 0.34745925664901733,
"learning_rate": 5.166005414096353e-05,
"loss": 0.0364,
"step": 12360
},
{
"epoch": 12.7394438722966,
"grad_norm": 0.3649449646472931,
"learning_rate": 5.159119599326383e-05,
"loss": 0.0312,
"step": 12370
},
{
"epoch": 12.749742533470648,
"grad_norm": 0.26928043365478516,
"learning_rate": 5.152233482454369e-05,
"loss": 0.0303,
"step": 12380
},
{
"epoch": 12.760041194644696,
"grad_norm": 0.278518944978714,
"learning_rate": 5.145347076554192e-05,
"loss": 0.0391,
"step": 12390
},
{
"epoch": 12.770339855818744,
"grad_norm": 0.366487979888916,
"learning_rate": 5.1384603947002775e-05,
"loss": 0.0337,
"step": 12400
},
{
"epoch": 12.780638516992791,
"grad_norm": 0.47460800409317017,
"learning_rate": 5.131573449967571e-05,
"loss": 0.0435,
"step": 12410
},
{
"epoch": 12.790937178166839,
"grad_norm": 0.22989706695079803,
"learning_rate": 5.12468625543152e-05,
"loss": 0.0371,
"step": 12420
},
{
"epoch": 12.801235839340885,
"grad_norm": 0.3515235185623169,
"learning_rate": 5.117798824168052e-05,
"loss": 0.0378,
"step": 12430
},
{
"epoch": 12.811534500514933,
"grad_norm": 0.3192023038864136,
"learning_rate": 5.1109111692535335e-05,
"loss": 0.0335,
"step": 12440
},
{
"epoch": 12.82183316168898,
"grad_norm": 0.22734206914901733,
"learning_rate": 5.1040233037647636e-05,
"loss": 0.0357,
"step": 12450
},
{
"epoch": 12.832131822863028,
"grad_norm": 0.2719617784023285,
"learning_rate": 5.0971352407789396e-05,
"loss": 0.0306,
"step": 12460
},
{
"epoch": 12.842430484037076,
"grad_norm": 0.28744617104530334,
"learning_rate": 5.0902469933736295e-05,
"loss": 0.0351,
"step": 12470
},
{
"epoch": 12.852729145211123,
"grad_norm": 0.273926317691803,
"learning_rate": 5.0833585746267556e-05,
"loss": 0.032,
"step": 12480
},
{
"epoch": 12.863027806385169,
"grad_norm": 1.2121611833572388,
"learning_rate": 5.076469997616568e-05,
"loss": 0.035,
"step": 12490
},
{
"epoch": 12.873326467559217,
"grad_norm": 0.22267426550388336,
"learning_rate": 5.0695812754216076e-05,
"loss": 0.0339,
"step": 12500
},
{
"epoch": 12.883625128733264,
"grad_norm": 0.204677551984787,
"learning_rate": 5.0626924211207015e-05,
"loss": 0.0371,
"step": 12510
},
{
"epoch": 12.893923789907312,
"grad_norm": 0.25673428177833557,
"learning_rate": 5.055803447792924e-05,
"loss": 0.0355,
"step": 12520
},
{
"epoch": 12.90422245108136,
"grad_norm": 0.1868886500597,
"learning_rate": 5.0489143685175714e-05,
"loss": 0.0363,
"step": 12530
},
{
"epoch": 12.914521112255407,
"grad_norm": 0.27115195989608765,
"learning_rate": 5.042025196374145e-05,
"loss": 0.0303,
"step": 12540
},
{
"epoch": 12.924819773429455,
"grad_norm": 0.2909318804740906,
"learning_rate": 5.035135944442324e-05,
"loss": 0.0329,
"step": 12550
},
{
"epoch": 12.9351184346035,
"grad_norm": 0.2762157618999481,
"learning_rate": 5.028246625801935e-05,
"loss": 0.041,
"step": 12560
},
{
"epoch": 12.945417095777549,
"grad_norm": 0.3040686547756195,
"learning_rate": 5.0213572535329336e-05,
"loss": 0.0353,
"step": 12570
},
{
"epoch": 12.955715756951596,
"grad_norm": 0.1944282352924347,
"learning_rate": 5.014467840715378e-05,
"loss": 0.0383,
"step": 12580
},
{
"epoch": 12.966014418125644,
"grad_norm": 0.29223209619522095,
"learning_rate": 5.007578400429399e-05,
"loss": 0.0321,
"step": 12590
},
{
"epoch": 12.976313079299691,
"grad_norm": 0.27292895317077637,
"learning_rate": 5.0006889457551864e-05,
"loss": 0.0304,
"step": 12600
},
{
"epoch": 12.98661174047374,
"grad_norm": 0.20329241454601288,
"learning_rate": 4.9937994897729515e-05,
"loss": 0.039,
"step": 12610
},
{
"epoch": 12.996910401647785,
"grad_norm": 0.3806949257850647,
"learning_rate": 4.9869100455629105e-05,
"loss": 0.0359,
"step": 12620
},
{
"epoch": 13.007209062821833,
"grad_norm": 0.3294031322002411,
"learning_rate": 4.9800206262052574e-05,
"loss": 0.0339,
"step": 12630
},
{
"epoch": 13.01750772399588,
"grad_norm": 0.2774278223514557,
"learning_rate": 4.973131244780138e-05,
"loss": 0.0344,
"step": 12640
},
{
"epoch": 13.027806385169928,
"grad_norm": 4.895512104034424,
"learning_rate": 4.966241914367627e-05,
"loss": 0.0407,
"step": 12650
},
{
"epoch": 13.038105046343976,
"grad_norm": 0.21008244156837463,
"learning_rate": 4.9593526480476996e-05,
"loss": 0.0342,
"step": 12660
},
{
"epoch": 13.048403707518023,
"grad_norm": 0.1632891595363617,
"learning_rate": 4.9524634589002164e-05,
"loss": 0.0364,
"step": 12670
},
{
"epoch": 13.058702368692071,
"grad_norm": 0.8435172438621521,
"learning_rate": 4.945574360004883e-05,
"loss": 0.0402,
"step": 12680
},
{
"epoch": 13.069001029866117,
"grad_norm": 0.16290047764778137,
"learning_rate": 4.93868536444124e-05,
"loss": 0.0432,
"step": 12690
},
{
"epoch": 13.079299691040164,
"grad_norm": 0.22083142399787903,
"learning_rate": 4.9317964852886256e-05,
"loss": 0.0427,
"step": 12700
},
{
"epoch": 13.089598352214212,
"grad_norm": 0.24757379293441772,
"learning_rate": 4.924907735626164e-05,
"loss": 0.0364,
"step": 12710
},
{
"epoch": 13.09989701338826,
"grad_norm": 0.4580037593841553,
"learning_rate": 4.918019128532726e-05,
"loss": 0.0354,
"step": 12720
},
{
"epoch": 13.110195674562307,
"grad_norm": 0.6730195879936218,
"learning_rate": 4.911130677086921e-05,
"loss": 0.0408,
"step": 12730
},
{
"epoch": 13.120494335736355,
"grad_norm": 0.27834802865982056,
"learning_rate": 4.9042423943670536e-05,
"loss": 0.0281,
"step": 12740
},
{
"epoch": 13.130792996910401,
"grad_norm": 0.25800591707229614,
"learning_rate": 4.8973542934511145e-05,
"loss": 0.0328,
"step": 12750
},
{
"epoch": 13.141091658084449,
"grad_norm": 0.27142634987831116,
"learning_rate": 4.8904663874167456e-05,
"loss": 0.0322,
"step": 12760
},
{
"epoch": 13.151390319258496,
"grad_norm": 0.16801717877388,
"learning_rate": 4.8835786893412215e-05,
"loss": 0.0289,
"step": 12770
},
{
"epoch": 13.161688980432544,
"grad_norm": 0.25099271535873413,
"learning_rate": 4.8766912123014177e-05,
"loss": 0.0337,
"step": 12780
},
{
"epoch": 13.171987641606592,
"grad_norm": 0.26906880736351013,
"learning_rate": 4.869803969373796e-05,
"loss": 0.0406,
"step": 12790
},
{
"epoch": 13.18228630278064,
"grad_norm": 0.5079290866851807,
"learning_rate": 4.862916973634369e-05,
"loss": 0.0363,
"step": 12800
},
{
"epoch": 13.192584963954685,
"grad_norm": 0.30340585112571716,
"learning_rate": 4.8560302381586834e-05,
"loss": 0.0303,
"step": 12810
},
{
"epoch": 13.202883625128733,
"grad_norm": 0.2514529228210449,
"learning_rate": 4.849143776021787e-05,
"loss": 0.0334,
"step": 12820
},
{
"epoch": 13.21318228630278,
"grad_norm": 0.19497543573379517,
"learning_rate": 4.8422576002982146e-05,
"loss": 0.0303,
"step": 12830
},
{
"epoch": 13.223480947476828,
"grad_norm": 0.4469880759716034,
"learning_rate": 4.8353717240619506e-05,
"loss": 0.0398,
"step": 12840
},
{
"epoch": 13.233779608650876,
"grad_norm": 0.3474951684474945,
"learning_rate": 4.82848616038642e-05,
"loss": 0.0349,
"step": 12850
},
{
"epoch": 13.244078269824923,
"grad_norm": 0.26160264015197754,
"learning_rate": 4.821600922344443e-05,
"loss": 0.0304,
"step": 12860
},
{
"epoch": 13.254376930998971,
"grad_norm": 0.3258974552154541,
"learning_rate": 4.814716023008231e-05,
"loss": 0.0311,
"step": 12870
},
{
"epoch": 13.264675592173017,
"grad_norm": 0.3310175836086273,
"learning_rate": 4.8078314754493475e-05,
"loss": 0.0345,
"step": 12880
},
{
"epoch": 13.274974253347064,
"grad_norm": 0.26981380581855774,
"learning_rate": 4.800947292738691e-05,
"loss": 0.0341,
"step": 12890
},
{
"epoch": 13.285272914521112,
"grad_norm": 0.1864253133535385,
"learning_rate": 4.794063487946463e-05,
"loss": 0.0353,
"step": 12900
},
{
"epoch": 13.29557157569516,
"grad_norm": 0.26357418298721313,
"learning_rate": 4.7871800741421496e-05,
"loss": 0.0314,
"step": 12910
},
{
"epoch": 13.305870236869207,
"grad_norm": 0.30822688341140747,
"learning_rate": 4.7802970643945e-05,
"loss": 0.0345,
"step": 12920
},
{
"epoch": 13.316168898043255,
"grad_norm": 0.45302730798721313,
"learning_rate": 4.773414471771485e-05,
"loss": 0.0327,
"step": 12930
},
{
"epoch": 13.326467559217301,
"grad_norm": 0.3582782447338104,
"learning_rate": 4.7665323093402955e-05,
"loss": 0.0379,
"step": 12940
},
{
"epoch": 13.336766220391349,
"grad_norm": 0.19117462635040283,
"learning_rate": 4.759650590167296e-05,
"loss": 0.0319,
"step": 12950
},
{
"epoch": 13.347064881565396,
"grad_norm": 0.4505470097064972,
"learning_rate": 4.752769327318016e-05,
"loss": 0.036,
"step": 12960
},
{
"epoch": 13.357363542739444,
"grad_norm": 0.18662497401237488,
"learning_rate": 4.745888533857114e-05,
"loss": 0.0379,
"step": 12970
},
{
"epoch": 13.367662203913492,
"grad_norm": 0.6353393793106079,
"learning_rate": 4.739008222848362e-05,
"loss": 0.0364,
"step": 12980
},
{
"epoch": 13.37796086508754,
"grad_norm": 0.2027062177658081,
"learning_rate": 4.732128407354609e-05,
"loss": 0.0343,
"step": 12990
},
{
"epoch": 13.388259526261585,
"grad_norm": 0.3728000521659851,
"learning_rate": 4.725249100437773e-05,
"loss": 0.0394,
"step": 13000
},
{
"epoch": 13.398558187435633,
"grad_norm": 0.3568314015865326,
"learning_rate": 4.718370315158796e-05,
"loss": 0.041,
"step": 13010
},
{
"epoch": 13.40885684860968,
"grad_norm": 0.2146635800600052,
"learning_rate": 4.711492064577639e-05,
"loss": 0.0381,
"step": 13020
},
{
"epoch": 13.419155509783728,
"grad_norm": 2.543966770172119,
"learning_rate": 4.704614361753239e-05,
"loss": 0.0431,
"step": 13030
},
{
"epoch": 13.429454170957776,
"grad_norm": 0.30011627078056335,
"learning_rate": 4.6977372197435023e-05,
"loss": 0.0348,
"step": 13040
},
{
"epoch": 13.439752832131823,
"grad_norm": 0.36090970039367676,
"learning_rate": 4.690860651605263e-05,
"loss": 0.0307,
"step": 13050
},
{
"epoch": 13.450051493305871,
"grad_norm": 0.5852978229522705,
"learning_rate": 4.683984670394269e-05,
"loss": 0.0391,
"step": 13060
},
{
"epoch": 13.460350154479917,
"grad_norm": 0.18308031558990479,
"learning_rate": 4.677109289165152e-05,
"loss": 0.0301,
"step": 13070
},
{
"epoch": 13.470648815653965,
"grad_norm": 0.2781670391559601,
"learning_rate": 4.670234520971408e-05,
"loss": 0.0338,
"step": 13080
},
{
"epoch": 13.480947476828012,
"grad_norm": 0.32513490319252014,
"learning_rate": 4.6633603788653636e-05,
"loss": 0.0429,
"step": 13090
},
{
"epoch": 13.49124613800206,
"grad_norm": 0.2742452025413513,
"learning_rate": 4.656486875898164e-05,
"loss": 0.0373,
"step": 13100
},
{
"epoch": 13.501544799176108,
"grad_norm": 0.3207267224788666,
"learning_rate": 4.649614025119734e-05,
"loss": 0.0353,
"step": 13110
},
{
"epoch": 13.511843460350155,
"grad_norm": 0.3440451920032501,
"learning_rate": 4.6427418395787655e-05,
"loss": 0.0341,
"step": 13120
},
{
"epoch": 13.522142121524201,
"grad_norm": 0.2777320444583893,
"learning_rate": 4.635870332322682e-05,
"loss": 0.039,
"step": 13130
},
{
"epoch": 13.532440782698249,
"grad_norm": 0.41627272963523865,
"learning_rate": 4.628999516397625e-05,
"loss": 0.0329,
"step": 13140
},
{
"epoch": 13.542739443872296,
"grad_norm": 0.2555646598339081,
"learning_rate": 4.6221294048484174e-05,
"loss": 0.04,
"step": 13150
},
{
"epoch": 13.553038105046344,
"grad_norm": 0.2983425557613373,
"learning_rate": 4.615260010718553e-05,
"loss": 0.0374,
"step": 13160
},
{
"epoch": 13.563336766220392,
"grad_norm": 0.32177847623825073,
"learning_rate": 4.608391347050154e-05,
"loss": 0.0336,
"step": 13170
},
{
"epoch": 13.57363542739444,
"grad_norm": 0.27015167474746704,
"learning_rate": 4.601523426883963e-05,
"loss": 0.0285,
"step": 13180
},
{
"epoch": 13.583934088568487,
"grad_norm": 0.2573201358318329,
"learning_rate": 4.5946562632593066e-05,
"loss": 0.031,
"step": 13190
},
{
"epoch": 13.594232749742533,
"grad_norm": 0.32997414469718933,
"learning_rate": 4.587789869214079e-05,
"loss": 0.0336,
"step": 13200
},
{
"epoch": 13.60453141091658,
"grad_norm": 0.2865029275417328,
"learning_rate": 4.580924257784706e-05,
"loss": 0.0327,
"step": 13210
},
{
"epoch": 13.614830072090628,
"grad_norm": 0.2227432280778885,
"learning_rate": 4.5740594420061396e-05,
"loss": 0.0306,
"step": 13220
},
{
"epoch": 13.625128733264676,
"grad_norm": 0.2854247987270355,
"learning_rate": 4.5671954349118094e-05,
"loss": 0.0298,
"step": 13230
},
{
"epoch": 13.635427394438723,
"grad_norm": 0.46989086270332336,
"learning_rate": 4.560332249533617e-05,
"loss": 0.0386,
"step": 13240
},
{
"epoch": 13.645726055612771,
"grad_norm": 0.36545002460479736,
"learning_rate": 4.5534698989018984e-05,
"loss": 0.0411,
"step": 13250
},
{
"epoch": 13.656024716786817,
"grad_norm": 0.3201320469379425,
"learning_rate": 4.5466083960454117e-05,
"loss": 0.0367,
"step": 13260
},
{
"epoch": 13.666323377960865,
"grad_norm": 1.3402334451675415,
"learning_rate": 4.539747753991297e-05,
"loss": 0.0464,
"step": 13270
},
{
"epoch": 13.676622039134912,
"grad_norm": 0.17494896054267883,
"learning_rate": 4.5328879857650676e-05,
"loss": 0.0368,
"step": 13280
},
{
"epoch": 13.68692070030896,
"grad_norm": 0.27458542585372925,
"learning_rate": 4.5260291043905736e-05,
"loss": 0.037,
"step": 13290
},
{
"epoch": 13.697219361483008,
"grad_norm": 0.3501793444156647,
"learning_rate": 4.519171122889983e-05,
"loss": 0.0434,
"step": 13300
},
{
"epoch": 13.707518022657055,
"grad_norm": 0.33499833941459656,
"learning_rate": 4.512314054283755e-05,
"loss": 0.0377,
"step": 13310
},
{
"epoch": 13.717816683831103,
"grad_norm": 0.30087441205978394,
"learning_rate": 4.505457911590613e-05,
"loss": 0.0372,
"step": 13320
},
{
"epoch": 13.728115345005149,
"grad_norm": 0.22873124480247498,
"learning_rate": 4.498602707827528e-05,
"loss": 0.0283,
"step": 13330
},
{
"epoch": 13.738414006179196,
"grad_norm": 0.24429480731487274,
"learning_rate": 4.4917484560096804e-05,
"loss": 0.0333,
"step": 13340
},
{
"epoch": 13.748712667353244,
"grad_norm": 0.23670226335525513,
"learning_rate": 4.4848951691504555e-05,
"loss": 0.0331,
"step": 13350
},
{
"epoch": 13.759011328527292,
"grad_norm": 0.24842509627342224,
"learning_rate": 4.478042860261391e-05,
"loss": 0.037,
"step": 13360
},
{
"epoch": 13.76930998970134,
"grad_norm": 0.24097035825252533,
"learning_rate": 4.4711915423521816e-05,
"loss": 0.0276,
"step": 13370
},
{
"epoch": 13.779608650875387,
"grad_norm": 0.1889146864414215,
"learning_rate": 4.4643412284306324e-05,
"loss": 0.0333,
"step": 13380
},
{
"epoch": 13.789907312049433,
"grad_norm": 0.22404436767101288,
"learning_rate": 4.457491931502646e-05,
"loss": 0.0313,
"step": 13390
},
{
"epoch": 13.80020597322348,
"grad_norm": 0.20593509078025818,
"learning_rate": 4.45064366457219e-05,
"loss": 0.034,
"step": 13400
},
{
"epoch": 13.810504634397528,
"grad_norm": 0.2181590348482132,
"learning_rate": 4.4437964406412844e-05,
"loss": 0.0339,
"step": 13410
},
{
"epoch": 13.820803295571576,
"grad_norm": 0.23468917608261108,
"learning_rate": 4.436950272709959e-05,
"loss": 0.0323,
"step": 13420
},
{
"epoch": 13.831101956745623,
"grad_norm": 0.2851261794567108,
"learning_rate": 4.4301051737762466e-05,
"loss": 0.0341,
"step": 13430
},
{
"epoch": 13.841400617919671,
"grad_norm": 0.23932397365570068,
"learning_rate": 4.423261156836146e-05,
"loss": 0.0356,
"step": 13440
},
{
"epoch": 13.851699279093717,
"grad_norm": 0.37307506799697876,
"learning_rate": 4.4164182348836056e-05,
"loss": 0.0345,
"step": 13450
},
{
"epoch": 13.861997940267765,
"grad_norm": 0.29198941588401794,
"learning_rate": 4.409576420910488e-05,
"loss": 0.039,
"step": 13460
},
{
"epoch": 13.872296601441812,
"grad_norm": 0.3117142617702484,
"learning_rate": 4.402735727906564e-05,
"loss": 0.0402,
"step": 13470
},
{
"epoch": 13.88259526261586,
"grad_norm": 0.6684085726737976,
"learning_rate": 4.39589616885946e-05,
"loss": 0.0387,
"step": 13480
},
{
"epoch": 13.892893923789908,
"grad_norm": 0.48037102818489075,
"learning_rate": 4.389057756754665e-05,
"loss": 0.0378,
"step": 13490
},
{
"epoch": 13.903192584963955,
"grad_norm": 0.25196754932403564,
"learning_rate": 4.3822205045754804e-05,
"loss": 0.0422,
"step": 13500
},
{
"epoch": 13.913491246138001,
"grad_norm": 0.22298026084899902,
"learning_rate": 4.3753844253030115e-05,
"loss": 0.0333,
"step": 13510
},
{
"epoch": 13.923789907312049,
"grad_norm": 1.1777735948562622,
"learning_rate": 4.368549531916129e-05,
"loss": 0.0426,
"step": 13520
},
{
"epoch": 13.934088568486096,
"grad_norm": 1.4645673036575317,
"learning_rate": 4.361715837391465e-05,
"loss": 0.0312,
"step": 13530
},
{
"epoch": 13.944387229660144,
"grad_norm": 0.34733325242996216,
"learning_rate": 4.3548833547033585e-05,
"loss": 0.0314,
"step": 13540
},
{
"epoch": 13.954685890834192,
"grad_norm": 0.31025397777557373,
"learning_rate": 4.348052096823864e-05,
"loss": 0.0326,
"step": 13550
},
{
"epoch": 13.96498455200824,
"grad_norm": 0.29611408710479736,
"learning_rate": 4.3412220767227e-05,
"loss": 0.0386,
"step": 13560
},
{
"epoch": 13.975283213182287,
"grad_norm": 0.24702376127243042,
"learning_rate": 4.334393307367239e-05,
"loss": 0.0329,
"step": 13570
},
{
"epoch": 13.985581874356333,
"grad_norm": 0.24566444754600525,
"learning_rate": 4.327565801722477e-05,
"loss": 0.0353,
"step": 13580
},
{
"epoch": 13.99588053553038,
"grad_norm": 0.3815259039402008,
"learning_rate": 4.3207395727510156e-05,
"loss": 0.0337,
"step": 13590
},
{
"epoch": 14.006179196704428,
"grad_norm": 0.30025404691696167,
"learning_rate": 4.313914633413023e-05,
"loss": 0.027,
"step": 13600
},
{
"epoch": 14.016477857878476,
"grad_norm": 0.25736743211746216,
"learning_rate": 4.307090996666231e-05,
"loss": 0.0337,
"step": 13610
},
{
"epoch": 14.026776519052524,
"grad_norm": 0.39078590273857117,
"learning_rate": 4.300268675465888e-05,
"loss": 0.0331,
"step": 13620
},
{
"epoch": 14.037075180226571,
"grad_norm": 0.4594716727733612,
"learning_rate": 4.293447682764751e-05,
"loss": 0.033,
"step": 13630
},
{
"epoch": 14.047373841400617,
"grad_norm": 2.626016855239868,
"learning_rate": 4.286628031513049e-05,
"loss": 0.0307,
"step": 13640
},
{
"epoch": 14.057672502574665,
"grad_norm": 0.36267414689064026,
"learning_rate": 4.2798097346584745e-05,
"loss": 0.0359,
"step": 13650
},
{
"epoch": 14.067971163748712,
"grad_norm": 0.9793422818183899,
"learning_rate": 4.272992805146133e-05,
"loss": 0.0286,
"step": 13660
},
{
"epoch": 14.07826982492276,
"grad_norm": 0.3450843095779419,
"learning_rate": 4.2661772559185506e-05,
"loss": 0.0363,
"step": 13670
},
{
"epoch": 14.088568486096808,
"grad_norm": 0.24814942479133606,
"learning_rate": 4.2593630999156196e-05,
"loss": 0.0307,
"step": 13680
},
{
"epoch": 14.098867147270855,
"grad_norm": 0.34020665287971497,
"learning_rate": 4.252550350074597e-05,
"loss": 0.0333,
"step": 13690
},
{
"epoch": 14.109165808444903,
"grad_norm": 0.3881569504737854,
"learning_rate": 4.24573901933006e-05,
"loss": 0.0337,
"step": 13700
},
{
"epoch": 14.119464469618949,
"grad_norm": 0.1805485635995865,
"learning_rate": 4.238929120613903e-05,
"loss": 0.0377,
"step": 13710
},
{
"epoch": 14.129763130792997,
"grad_norm": 0.39343783259391785,
"learning_rate": 4.2321206668552934e-05,
"loss": 0.0363,
"step": 13720
},
{
"epoch": 14.140061791967044,
"grad_norm": 0.30404913425445557,
"learning_rate": 4.225313670980655e-05,
"loss": 0.032,
"step": 13730
},
{
"epoch": 14.150360453141092,
"grad_norm": 0.2011529803276062,
"learning_rate": 4.21850814591365e-05,
"loss": 0.026,
"step": 13740
},
{
"epoch": 14.16065911431514,
"grad_norm": 0.32636958360671997,
"learning_rate": 4.2117041045751416e-05,
"loss": 0.0391,
"step": 13750
},
{
"epoch": 14.170957775489187,
"grad_norm": 0.19938711822032928,
"learning_rate": 4.204901559883181e-05,
"loss": 0.0306,
"step": 13760
},
{
"epoch": 14.181256436663233,
"grad_norm": 0.3431154489517212,
"learning_rate": 4.1981005247529716e-05,
"loss": 0.0388,
"step": 13770
},
{
"epoch": 14.19155509783728,
"grad_norm": 0.28631865978240967,
"learning_rate": 4.191301012096861e-05,
"loss": 0.0304,
"step": 13780
},
{
"epoch": 14.201853759011328,
"grad_norm": 0.3022019863128662,
"learning_rate": 4.1845030348242945e-05,
"loss": 0.0347,
"step": 13790
},
{
"epoch": 14.212152420185376,
"grad_norm": 0.2202078104019165,
"learning_rate": 4.177706605841811e-05,
"loss": 0.0281,
"step": 13800
},
{
"epoch": 14.222451081359424,
"grad_norm": 0.2681058347225189,
"learning_rate": 4.170911738053006e-05,
"loss": 0.0368,
"step": 13810
},
{
"epoch": 14.232749742533471,
"grad_norm": 0.23820021748542786,
"learning_rate": 4.164118444358512e-05,
"loss": 0.0297,
"step": 13820
},
{
"epoch": 14.243048403707519,
"grad_norm": 0.3738692104816437,
"learning_rate": 4.1573267376559705e-05,
"loss": 0.032,
"step": 13830
},
{
"epoch": 14.253347064881565,
"grad_norm": 0.18206800520420074,
"learning_rate": 4.150536630840017e-05,
"loss": 0.0351,
"step": 13840
},
{
"epoch": 14.263645726055612,
"grad_norm": 0.3507537245750427,
"learning_rate": 4.143748136802238e-05,
"loss": 0.0325,
"step": 13850
},
{
"epoch": 14.27394438722966,
"grad_norm": 0.2590770125389099,
"learning_rate": 4.13696126843117e-05,
"loss": 0.0315,
"step": 13860
},
{
"epoch": 14.284243048403708,
"grad_norm": 0.4932098984718323,
"learning_rate": 4.130176038612256e-05,
"loss": 0.0367,
"step": 13870
},
{
"epoch": 14.294541709577755,
"grad_norm": 0.22558774054050446,
"learning_rate": 4.123392460227829e-05,
"loss": 0.03,
"step": 13880
},
{
"epoch": 14.304840370751803,
"grad_norm": 0.3039613962173462,
"learning_rate": 4.116610546157086e-05,
"loss": 0.0294,
"step": 13890
},
{
"epoch": 14.315139031925849,
"grad_norm": 0.26412516832351685,
"learning_rate": 4.1098303092760707e-05,
"loss": 0.0278,
"step": 13900
},
{
"epoch": 14.325437693099897,
"grad_norm": 0.24321669340133667,
"learning_rate": 4.1030517624576304e-05,
"loss": 0.0316,
"step": 13910
},
{
"epoch": 14.335736354273944,
"grad_norm": 0.1724211871623993,
"learning_rate": 4.0962749185714156e-05,
"loss": 0.0299,
"step": 13920
},
{
"epoch": 14.346035015447992,
"grad_norm": 0.37654179334640503,
"learning_rate": 4.0894997904838364e-05,
"loss": 0.0376,
"step": 13930
},
{
"epoch": 14.35633367662204,
"grad_norm": 0.24133779108524323,
"learning_rate": 4.082726391058048e-05,
"loss": 0.0361,
"step": 13940
},
{
"epoch": 14.366632337796087,
"grad_norm": 0.24687901139259338,
"learning_rate": 4.075954733153922e-05,
"loss": 0.034,
"step": 13950
},
{
"epoch": 14.376930998970133,
"grad_norm": 0.23451952636241913,
"learning_rate": 4.069184829628029e-05,
"loss": 0.0305,
"step": 13960
},
{
"epoch": 14.38722966014418,
"grad_norm": 0.4871678650379181,
"learning_rate": 4.062416693333598e-05,
"loss": 0.0333,
"step": 13970
},
{
"epoch": 14.397528321318228,
"grad_norm": 0.30757251381874084,
"learning_rate": 4.055650337120514e-05,
"loss": 0.0299,
"step": 13980
},
{
"epoch": 14.407826982492276,
"grad_norm": 0.22331511974334717,
"learning_rate": 4.0488857738352745e-05,
"loss": 0.0289,
"step": 13990
},
{
"epoch": 14.418125643666324,
"grad_norm": 0.24289913475513458,
"learning_rate": 4.042123016320979e-05,
"loss": 0.0352,
"step": 14000
},
{
"epoch": 14.428424304840371,
"grad_norm": 0.18522806465625763,
"learning_rate": 4.035362077417292e-05,
"loss": 0.0302,
"step": 14010
},
{
"epoch": 14.438722966014419,
"grad_norm": 0.23417025804519653,
"learning_rate": 4.028602969960434e-05,
"loss": 0.028,
"step": 14020
},
{
"epoch": 14.449021627188465,
"grad_norm": 0.27148130536079407,
"learning_rate": 4.021845706783138e-05,
"loss": 0.0303,
"step": 14030
},
{
"epoch": 14.459320288362512,
"grad_norm": 0.36462321877479553,
"learning_rate": 4.0150903007146434e-05,
"loss": 0.0305,
"step": 14040
},
{
"epoch": 14.46961894953656,
"grad_norm": 0.4786697328090668,
"learning_rate": 4.00833676458066e-05,
"loss": 0.0349,
"step": 14050
},
{
"epoch": 14.479917610710608,
"grad_norm": 0.4755783677101135,
"learning_rate": 4.00158511120335e-05,
"loss": 0.0335,
"step": 14060
},
{
"epoch": 14.490216271884655,
"grad_norm": 0.25529634952545166,
"learning_rate": 3.994835353401295e-05,
"loss": 0.0347,
"step": 14070
},
{
"epoch": 14.500514933058703,
"grad_norm": 0.4141525328159332,
"learning_rate": 3.988087503989489e-05,
"loss": 0.0328,
"step": 14080
},
{
"epoch": 14.510813594232749,
"grad_norm": 0.30694714188575745,
"learning_rate": 3.9813415757792885e-05,
"loss": 0.0306,
"step": 14090
},
{
"epoch": 14.521112255406797,
"grad_norm": 0.18365953862667084,
"learning_rate": 3.974597581578416e-05,
"loss": 0.029,
"step": 14100
},
{
"epoch": 14.531410916580844,
"grad_norm": 0.3182389438152313,
"learning_rate": 3.9678555341909125e-05,
"loss": 0.0355,
"step": 14110
},
{
"epoch": 14.541709577754892,
"grad_norm": 0.2846277952194214,
"learning_rate": 3.9611154464171255e-05,
"loss": 0.0349,
"step": 14120
},
{
"epoch": 14.55200823892894,
"grad_norm": 0.2074788361787796,
"learning_rate": 3.954377331053686e-05,
"loss": 0.036,
"step": 14130
},
{
"epoch": 14.562306900102987,
"grad_norm": 0.3968390226364136,
"learning_rate": 3.947641200893473e-05,
"loss": 0.0441,
"step": 14140
},
{
"epoch": 14.572605561277033,
"grad_norm": 0.3291980028152466,
"learning_rate": 3.940907068725603e-05,
"loss": 0.0367,
"step": 14150
},
{
"epoch": 14.58290422245108,
"grad_norm": 0.270810067653656,
"learning_rate": 3.934174947335394e-05,
"loss": 0.0361,
"step": 14160
},
{
"epoch": 14.593202883625128,
"grad_norm": 0.1831371933221817,
"learning_rate": 3.927444849504353e-05,
"loss": 0.0313,
"step": 14170
},
{
"epoch": 14.603501544799176,
"grad_norm": 0.30703842639923096,
"learning_rate": 3.920716788010137e-05,
"loss": 0.0361,
"step": 14180
},
{
"epoch": 14.613800205973224,
"grad_norm": 0.23841963708400726,
"learning_rate": 3.913990775626544e-05,
"loss": 0.027,
"step": 14190
},
{
"epoch": 14.624098867147271,
"grad_norm": 0.36122044920921326,
"learning_rate": 3.907266825123475e-05,
"loss": 0.0297,
"step": 14200
},
{
"epoch": 14.634397528321319,
"grad_norm": 0.4103507995605469,
"learning_rate": 3.9005449492669224e-05,
"loss": 0.0303,
"step": 14210
},
{
"epoch": 14.644696189495365,
"grad_norm": 0.4827253222465515,
"learning_rate": 3.893825160818935e-05,
"loss": 0.032,
"step": 14220
},
{
"epoch": 14.654994850669413,
"grad_norm": 0.3118212819099426,
"learning_rate": 3.887107472537602e-05,
"loss": 0.0319,
"step": 14230
},
{
"epoch": 14.66529351184346,
"grad_norm": 0.27819785475730896,
"learning_rate": 3.880391897177024e-05,
"loss": 0.0305,
"step": 14240
},
{
"epoch": 14.675592173017508,
"grad_norm": 0.24802176654338837,
"learning_rate": 3.873678447487289e-05,
"loss": 0.0376,
"step": 14250
},
{
"epoch": 14.685890834191555,
"grad_norm": 0.41126886010169983,
"learning_rate": 3.8669671362144485e-05,
"loss": 0.0435,
"step": 14260
},
{
"epoch": 14.696189495365603,
"grad_norm": 0.3224561810493469,
"learning_rate": 3.8602579761005e-05,
"loss": 0.034,
"step": 14270
},
{
"epoch": 14.706488156539649,
"grad_norm": 0.263100802898407,
"learning_rate": 3.853550979883346e-05,
"loss": 0.0287,
"step": 14280
},
{
"epoch": 14.716786817713697,
"grad_norm": 0.2607341706752777,
"learning_rate": 3.846846160296794e-05,
"loss": 0.0251,
"step": 14290
},
{
"epoch": 14.727085478887744,
"grad_norm": 0.2823236584663391,
"learning_rate": 3.840143530070507e-05,
"loss": 0.0328,
"step": 14300
},
{
"epoch": 14.737384140061792,
"grad_norm": 0.4001230001449585,
"learning_rate": 3.833443101929999e-05,
"loss": 0.0317,
"step": 14310
},
{
"epoch": 14.74768280123584,
"grad_norm": 0.29263654351234436,
"learning_rate": 3.8267448885965994e-05,
"loss": 0.0349,
"step": 14320
},
{
"epoch": 14.757981462409887,
"grad_norm": 0.3369698226451874,
"learning_rate": 3.820048902787435e-05,
"loss": 0.0409,
"step": 14330
},
{
"epoch": 14.768280123583935,
"grad_norm": 0.42798182368278503,
"learning_rate": 3.813355157215398e-05,
"loss": 0.0305,
"step": 14340
},
{
"epoch": 14.77857878475798,
"grad_norm": 0.26598209142684937,
"learning_rate": 3.806663664589138e-05,
"loss": 0.0312,
"step": 14350
},
{
"epoch": 14.788877445932028,
"grad_norm": 0.17724353075027466,
"learning_rate": 3.799974437613016e-05,
"loss": 0.0309,
"step": 14360
},
{
"epoch": 14.799176107106076,
"grad_norm": 0.24102918803691864,
"learning_rate": 3.793287488987098e-05,
"loss": 0.0353,
"step": 14370
},
{
"epoch": 14.809474768280124,
"grad_norm": 0.29433581233024597,
"learning_rate": 3.786602831407121e-05,
"loss": 0.03,
"step": 14380
},
{
"epoch": 14.819773429454171,
"grad_norm": 0.3096264898777008,
"learning_rate": 3.779920477564477e-05,
"loss": 0.032,
"step": 14390
},
{
"epoch": 14.830072090628219,
"grad_norm": 0.2747699022293091,
"learning_rate": 3.7732404401461764e-05,
"loss": 0.0329,
"step": 14400
},
{
"epoch": 14.840370751802265,
"grad_norm": 0.21906539797782898,
"learning_rate": 3.76656273183484e-05,
"loss": 0.0351,
"step": 14410
},
{
"epoch": 14.850669412976313,
"grad_norm": 0.32619962096214294,
"learning_rate": 3.759887365308661e-05,
"loss": 0.0354,
"step": 14420
},
{
"epoch": 14.86096807415036,
"grad_norm": 0.35620927810668945,
"learning_rate": 3.7532143532413886e-05,
"loss": 0.0307,
"step": 14430
},
{
"epoch": 14.871266735324408,
"grad_norm": 0.3073720335960388,
"learning_rate": 3.746543708302301e-05,
"loss": 0.0315,
"step": 14440
},
{
"epoch": 14.881565396498456,
"grad_norm": 0.28775277733802795,
"learning_rate": 3.739875443156186e-05,
"loss": 0.0286,
"step": 14450
},
{
"epoch": 14.891864057672503,
"grad_norm": 0.29448071122169495,
"learning_rate": 3.733209570463304e-05,
"loss": 0.0361,
"step": 14460
},
{
"epoch": 14.90216271884655,
"grad_norm": 0.2410167008638382,
"learning_rate": 3.726546102879386e-05,
"loss": 0.0279,
"step": 14470
},
{
"epoch": 14.912461380020597,
"grad_norm": 0.21059367060661316,
"learning_rate": 3.719885053055584e-05,
"loss": 0.0336,
"step": 14480
},
{
"epoch": 14.922760041194644,
"grad_norm": 1.691361665725708,
"learning_rate": 3.713226433638469e-05,
"loss": 0.0357,
"step": 14490
},
{
"epoch": 14.933058702368692,
"grad_norm": 0.2861790955066681,
"learning_rate": 3.706570257269991e-05,
"loss": 0.0308,
"step": 14500
},
{
"epoch": 14.94335736354274,
"grad_norm": 0.23796629905700684,
"learning_rate": 3.6999165365874696e-05,
"loss": 0.0294,
"step": 14510
},
{
"epoch": 14.953656024716787,
"grad_norm": 0.29010501503944397,
"learning_rate": 3.693265284223554e-05,
"loss": 0.0315,
"step": 14520
},
{
"epoch": 14.963954685890835,
"grad_norm": 0.4312698245048523,
"learning_rate": 3.686616512806208e-05,
"loss": 0.0318,
"step": 14530
},
{
"epoch": 14.97425334706488,
"grad_norm": 0.1826692372560501,
"learning_rate": 3.6799702349586914e-05,
"loss": 0.0299,
"step": 14540
},
{
"epoch": 14.984552008238929,
"grad_norm": 0.3241519033908844,
"learning_rate": 3.6733264632995254e-05,
"loss": 0.0308,
"step": 14550
},
{
"epoch": 14.994850669412976,
"grad_norm": 0.23148488998413086,
"learning_rate": 3.666685210442472e-05,
"loss": 0.0312,
"step": 14560
},
{
"epoch": 15.005149330587024,
"grad_norm": 0.20060890913009644,
"learning_rate": 3.660046488996513e-05,
"loss": 0.039,
"step": 14570
},
{
"epoch": 15.015447991761071,
"grad_norm": 0.25309333205223083,
"learning_rate": 3.6534103115658244e-05,
"loss": 0.0331,
"step": 14580
},
{
"epoch": 15.02574665293512,
"grad_norm": 0.32379934191703796,
"learning_rate": 3.646776690749748e-05,
"loss": 0.0319,
"step": 14590
},
{
"epoch": 15.036045314109165,
"grad_norm": 0.2242245078086853,
"learning_rate": 3.640145639142779e-05,
"loss": 0.0293,
"step": 14600
},
{
"epoch": 15.046343975283213,
"grad_norm": 0.37435922026634216,
"learning_rate": 3.6335171693345295e-05,
"loss": 0.0317,
"step": 14610
},
{
"epoch": 15.05664263645726,
"grad_norm": 0.4914971888065338,
"learning_rate": 3.62689129390971e-05,
"loss": 0.0297,
"step": 14620
},
{
"epoch": 15.066941297631308,
"grad_norm": 0.2628423571586609,
"learning_rate": 3.620268025448107e-05,
"loss": 0.0295,
"step": 14630
},
{
"epoch": 15.077239958805356,
"grad_norm": 0.22137194871902466,
"learning_rate": 3.6136473765245575e-05,
"loss": 0.0382,
"step": 14640
},
{
"epoch": 15.087538619979403,
"grad_norm": 0.33135750889778137,
"learning_rate": 3.6070293597089225e-05,
"loss": 0.0346,
"step": 14650
},
{
"epoch": 15.097837281153451,
"grad_norm": 0.2422097623348236,
"learning_rate": 3.60041398756607e-05,
"loss": 0.028,
"step": 14660
},
{
"epoch": 15.108135942327497,
"grad_norm": 0.25506383180618286,
"learning_rate": 3.593801272655842e-05,
"loss": 0.0292,
"step": 14670
},
{
"epoch": 15.118434603501544,
"grad_norm": 0.20777393877506256,
"learning_rate": 3.5871912275330415e-05,
"loss": 0.0256,
"step": 14680
},
{
"epoch": 15.128733264675592,
"grad_norm": 0.3608769476413727,
"learning_rate": 3.5805838647473956e-05,
"loss": 0.034,
"step": 14690
},
{
"epoch": 15.13903192584964,
"grad_norm": 0.24145326018333435,
"learning_rate": 3.573979196843546e-05,
"loss": 0.0293,
"step": 14700
},
{
"epoch": 15.149330587023687,
"grad_norm": 0.2989254891872406,
"learning_rate": 3.567377236361008e-05,
"loss": 0.0281,
"step": 14710
},
{
"epoch": 15.159629248197735,
"grad_norm": 0.19318531453609467,
"learning_rate": 3.560777995834171e-05,
"loss": 0.0426,
"step": 14720
},
{
"epoch": 15.169927909371781,
"grad_norm": 0.2559758722782135,
"learning_rate": 3.554181487792246e-05,
"loss": 0.0308,
"step": 14730
},
{
"epoch": 15.180226570545829,
"grad_norm": 0.20600715279579163,
"learning_rate": 3.547587724759267e-05,
"loss": 0.0312,
"step": 14740
},
{
"epoch": 15.190525231719876,
"grad_norm": 0.25616544485092163,
"learning_rate": 3.540996719254048e-05,
"loss": 0.0296,
"step": 14750
},
{
"epoch": 15.200823892893924,
"grad_norm": 0.1702621430158615,
"learning_rate": 3.5344084837901745e-05,
"loss": 0.0297,
"step": 14760
},
{
"epoch": 15.211122554067972,
"grad_norm": 0.26881083846092224,
"learning_rate": 3.527823030875966e-05,
"loss": 0.0338,
"step": 14770
},
{
"epoch": 15.22142121524202,
"grad_norm": 0.2374623417854309,
"learning_rate": 3.5212403730144674e-05,
"loss": 0.0322,
"step": 14780
},
{
"epoch": 15.231719876416065,
"grad_norm": 0.3395121991634369,
"learning_rate": 3.5146605227034076e-05,
"loss": 0.0352,
"step": 14790
},
{
"epoch": 15.242018537590113,
"grad_norm": 0.2763458490371704,
"learning_rate": 3.508083492435195e-05,
"loss": 0.0339,
"step": 14800
},
{
"epoch": 15.25231719876416,
"grad_norm": 0.24392758309841156,
"learning_rate": 3.501509294696874e-05,
"loss": 0.0298,
"step": 14810
},
{
"epoch": 15.262615859938208,
"grad_norm": 0.3314224183559418,
"learning_rate": 3.49493794197012e-05,
"loss": 0.0357,
"step": 14820
},
{
"epoch": 15.272914521112256,
"grad_norm": 0.40287965536117554,
"learning_rate": 3.488369446731198e-05,
"loss": 0.0301,
"step": 14830
},
{
"epoch": 15.283213182286303,
"grad_norm": 0.3372296392917633,
"learning_rate": 3.4818038214509584e-05,
"loss": 0.0336,
"step": 14840
},
{
"epoch": 15.293511843460351,
"grad_norm": 0.2584548890590668,
"learning_rate": 3.4752410785947937e-05,
"loss": 0.029,
"step": 14850
},
{
"epoch": 15.303810504634397,
"grad_norm": 0.34848877787590027,
"learning_rate": 3.468681230622629e-05,
"loss": 0.0318,
"step": 14860
},
{
"epoch": 15.314109165808445,
"grad_norm": 0.4821033477783203,
"learning_rate": 3.462124289988889e-05,
"loss": 0.0331,
"step": 14870
},
{
"epoch": 15.324407826982492,
"grad_norm": 0.3023509085178375,
"learning_rate": 3.4555702691424834e-05,
"loss": 0.0354,
"step": 14880
},
{
"epoch": 15.33470648815654,
"grad_norm": 0.23537398874759674,
"learning_rate": 3.449019180526774e-05,
"loss": 0.0282,
"step": 14890
},
{
"epoch": 15.345005149330587,
"grad_norm": 0.2854698598384857,
"learning_rate": 3.44247103657956e-05,
"loss": 0.0307,
"step": 14900
},
{
"epoch": 15.355303810504635,
"grad_norm": 0.23819385468959808,
"learning_rate": 3.435925849733045e-05,
"loss": 0.0312,
"step": 14910
},
{
"epoch": 15.365602471678681,
"grad_norm": 0.241379514336586,
"learning_rate": 3.429383632413823e-05,
"loss": 0.0301,
"step": 14920
},
{
"epoch": 15.375901132852729,
"grad_norm": 0.2703462839126587,
"learning_rate": 3.422844397042847e-05,
"loss": 0.0387,
"step": 14930
},
{
"epoch": 15.386199794026776,
"grad_norm": 0.2962131202220917,
"learning_rate": 3.416308156035409e-05,
"loss": 0.0278,
"step": 14940
},
{
"epoch": 15.396498455200824,
"grad_norm": 0.2346954643726349,
"learning_rate": 3.4097749218011174e-05,
"loss": 0.0285,
"step": 14950
},
{
"epoch": 15.406797116374872,
"grad_norm": 0.4202100336551666,
"learning_rate": 3.4032447067438674e-05,
"loss": 0.0279,
"step": 14960
},
{
"epoch": 15.41709577754892,
"grad_norm": 0.25344449281692505,
"learning_rate": 3.396717523261831e-05,
"loss": 0.029,
"step": 14970
},
{
"epoch": 15.427394438722967,
"grad_norm": 0.23738066852092743,
"learning_rate": 3.390193383747415e-05,
"loss": 0.0309,
"step": 14980
},
{
"epoch": 15.437693099897013,
"grad_norm": 2.6296916007995605,
"learning_rate": 3.383672300587254e-05,
"loss": 0.0366,
"step": 14990
},
{
"epoch": 15.44799176107106,
"grad_norm": 0.3654041290283203,
"learning_rate": 3.3771542861621736e-05,
"loss": 0.0387,
"step": 15000
},
{
"epoch": 15.458290422245108,
"grad_norm": 0.26136070489883423,
"learning_rate": 3.370639352847179e-05,
"loss": 0.0332,
"step": 15010
},
{
"epoch": 15.468589083419156,
"grad_norm": 0.26935380697250366,
"learning_rate": 3.3641275130114206e-05,
"loss": 0.0307,
"step": 15020
},
{
"epoch": 15.478887744593203,
"grad_norm": 0.2979789972305298,
"learning_rate": 3.357618779018182e-05,
"loss": 0.0255,
"step": 15030
},
{
"epoch": 15.489186405767251,
"grad_norm": 0.2572219967842102,
"learning_rate": 3.351113163224843e-05,
"loss": 0.0358,
"step": 15040
},
{
"epoch": 15.499485066941297,
"grad_norm": 0.2063932716846466,
"learning_rate": 3.344610677982867e-05,
"loss": 0.0322,
"step": 15050
},
{
"epoch": 15.509783728115345,
"grad_norm": 0.44023096561431885,
"learning_rate": 3.338111335637773e-05,
"loss": 0.0323,
"step": 15060
},
{
"epoch": 15.520082389289392,
"grad_norm": 0.3511848449707031,
"learning_rate": 3.3316151485291146e-05,
"loss": 0.0284,
"step": 15070
},
{
"epoch": 15.53038105046344,
"grad_norm": 0.3060377538204193,
"learning_rate": 3.32512212899045e-05,
"loss": 0.0299,
"step": 15080
},
{
"epoch": 15.540679711637488,
"grad_norm": 0.2702105939388275,
"learning_rate": 3.318632289349332e-05,
"loss": 0.0296,
"step": 15090
},
{
"epoch": 15.550978372811535,
"grad_norm": 0.13126692175865173,
"learning_rate": 3.312145641927265e-05,
"loss": 0.0283,
"step": 15100
},
{
"epoch": 15.561277033985581,
"grad_norm": 0.17313429713249207,
"learning_rate": 3.305662199039705e-05,
"loss": 0.032,
"step": 15110
},
{
"epoch": 15.571575695159629,
"grad_norm": 0.28625550866127014,
"learning_rate": 3.2991819729960136e-05,
"loss": 0.0339,
"step": 15120
},
{
"epoch": 15.581874356333676,
"grad_norm": 0.308564692735672,
"learning_rate": 3.29270497609945e-05,
"loss": 0.0357,
"step": 15130
},
{
"epoch": 15.592173017507724,
"grad_norm": 0.2243090718984604,
"learning_rate": 3.28623122064714e-05,
"loss": 0.03,
"step": 15140
},
{
"epoch": 15.602471678681772,
"grad_norm": 0.19758236408233643,
"learning_rate": 3.27976071893006e-05,
"loss": 0.0284,
"step": 15150
},
{
"epoch": 15.61277033985582,
"grad_norm": 0.2614839971065521,
"learning_rate": 3.2732934832330033e-05,
"loss": 0.0318,
"step": 15160
},
{
"epoch": 15.623069001029865,
"grad_norm": 0.22418425977230072,
"learning_rate": 3.2668295258345665e-05,
"loss": 0.0302,
"step": 15170
},
{
"epoch": 15.633367662203913,
"grad_norm": 0.2627883851528168,
"learning_rate": 3.260368859007119e-05,
"loss": 0.0257,
"step": 15180
},
{
"epoch": 15.64366632337796,
"grad_norm": 0.23288536071777344,
"learning_rate": 3.253911495016785e-05,
"loss": 0.0313,
"step": 15190
},
{
"epoch": 15.653964984552008,
"grad_norm": 0.3750770390033722,
"learning_rate": 3.247457446123415e-05,
"loss": 0.034,
"step": 15200
},
{
"epoch": 15.664263645726056,
"grad_norm": 0.7027446627616882,
"learning_rate": 3.2410067245805715e-05,
"loss": 0.0315,
"step": 15210
},
{
"epoch": 15.674562306900103,
"grad_norm": 0.2821480929851532,
"learning_rate": 3.234559342635493e-05,
"loss": 0.0259,
"step": 15220
},
{
"epoch": 15.684860968074151,
"grad_norm": 0.18873628973960876,
"learning_rate": 3.228115312529082e-05,
"loss": 0.0238,
"step": 15230
},
{
"epoch": 15.695159629248197,
"grad_norm": 0.25101834535598755,
"learning_rate": 3.221674646495874e-05,
"loss": 0.0293,
"step": 15240
},
{
"epoch": 15.705458290422245,
"grad_norm": 0.4093859791755676,
"learning_rate": 3.215237356764021e-05,
"loss": 0.0295,
"step": 15250
},
{
"epoch": 15.715756951596292,
"grad_norm": 0.26483431458473206,
"learning_rate": 3.208803455555259e-05,
"loss": 0.0349,
"step": 15260
},
{
"epoch": 15.72605561277034,
"grad_norm": 0.19289034605026245,
"learning_rate": 3.2023729550849e-05,
"loss": 0.0304,
"step": 15270
},
{
"epoch": 15.736354273944388,
"grad_norm": 0.2590263783931732,
"learning_rate": 3.195945867561791e-05,
"loss": 0.032,
"step": 15280
},
{
"epoch": 15.746652935118435,
"grad_norm": 0.29010581970214844,
"learning_rate": 3.189522205188303e-05,
"loss": 0.0296,
"step": 15290
},
{
"epoch": 15.756951596292481,
"grad_norm": 0.24612699449062347,
"learning_rate": 3.183101980160303e-05,
"loss": 0.0414,
"step": 15300
},
{
"epoch": 15.767250257466529,
"grad_norm": 0.281768798828125,
"learning_rate": 3.176685204667132e-05,
"loss": 0.0306,
"step": 15310
},
{
"epoch": 15.777548918640576,
"grad_norm": 0.16171668469905853,
"learning_rate": 3.1702718908915805e-05,
"loss": 0.0347,
"step": 15320
},
{
"epoch": 15.787847579814624,
"grad_norm": 0.34655749797821045,
"learning_rate": 3.1638620510098725e-05,
"loss": 0.0342,
"step": 15330
},
{
"epoch": 15.798146240988672,
"grad_norm": 0.30499929189682007,
"learning_rate": 3.157455697191629e-05,
"loss": 0.0298,
"step": 15340
},
{
"epoch": 15.80844490216272,
"grad_norm": 0.3088139593601227,
"learning_rate": 3.151052841599854e-05,
"loss": 0.0325,
"step": 15350
},
{
"epoch": 15.818743563336767,
"grad_norm": 0.24579790234565735,
"learning_rate": 3.1446534963909146e-05,
"loss": 0.0318,
"step": 15360
},
{
"epoch": 15.829042224510813,
"grad_norm": 0.1964549720287323,
"learning_rate": 3.138257673714507e-05,
"loss": 0.0283,
"step": 15370
},
{
"epoch": 15.83934088568486,
"grad_norm": 0.257536381483078,
"learning_rate": 3.131865385713645e-05,
"loss": 0.0314,
"step": 15380
},
{
"epoch": 15.849639546858908,
"grad_norm": 0.25062111020088196,
"learning_rate": 3.1254766445246255e-05,
"loss": 0.0323,
"step": 15390
},
{
"epoch": 15.859938208032956,
"grad_norm": 0.24054642021656036,
"learning_rate": 3.11909146227702e-05,
"loss": 0.0342,
"step": 15400
},
{
"epoch": 15.870236869207003,
"grad_norm": 0.2715880870819092,
"learning_rate": 3.1127098510936335e-05,
"loss": 0.0349,
"step": 15410
},
{
"epoch": 15.880535530381051,
"grad_norm": 0.39196503162384033,
"learning_rate": 3.106331823090498e-05,
"loss": 0.0313,
"step": 15420
},
{
"epoch": 15.890834191555097,
"grad_norm": 0.2922776937484741,
"learning_rate": 3.0999573903768386e-05,
"loss": 0.0355,
"step": 15430
},
{
"epoch": 15.901132852729145,
"grad_norm": 0.4503494203090668,
"learning_rate": 3.093586565055058e-05,
"loss": 0.0357,
"step": 15440
},
{
"epoch": 15.911431513903192,
"grad_norm": 0.2735413908958435,
"learning_rate": 3.0872193592207035e-05,
"loss": 0.0322,
"step": 15450
},
{
"epoch": 15.92173017507724,
"grad_norm": 0.2650955021381378,
"learning_rate": 3.080855784962461e-05,
"loss": 0.031,
"step": 15460
},
{
"epoch": 15.932028836251288,
"grad_norm": 0.3438953161239624,
"learning_rate": 3.07449585436211e-05,
"loss": 0.0309,
"step": 15470
},
{
"epoch": 15.942327497425335,
"grad_norm": 0.26952746510505676,
"learning_rate": 3.068139579494521e-05,
"loss": 0.0365,
"step": 15480
},
{
"epoch": 15.952626158599383,
"grad_norm": 0.33866754174232483,
"learning_rate": 3.061786972427618e-05,
"loss": 0.0296,
"step": 15490
},
{
"epoch": 15.962924819773429,
"grad_norm": 0.30630460381507874,
"learning_rate": 3.0554380452223666e-05,
"loss": 0.031,
"step": 15500
},
{
"epoch": 15.973223480947476,
"grad_norm": 0.27467259764671326,
"learning_rate": 3.0490928099327386e-05,
"loss": 0.0317,
"step": 15510
},
{
"epoch": 15.983522142121524,
"grad_norm": 0.23503737151622772,
"learning_rate": 3.0427512786057054e-05,
"loss": 0.0325,
"step": 15520
},
{
"epoch": 15.993820803295572,
"grad_norm": 0.28616487979888916,
"learning_rate": 3.0364134632811992e-05,
"loss": 0.0314,
"step": 15530
},
{
"epoch": 16.004119464469618,
"grad_norm": 0.20699192583560944,
"learning_rate": 3.0300793759921003e-05,
"loss": 0.041,
"step": 15540
},
{
"epoch": 16.014418125643665,
"grad_norm": 0.2688738703727722,
"learning_rate": 3.0237490287642088e-05,
"loss": 0.0329,
"step": 15550
},
{
"epoch": 16.024716786817713,
"grad_norm": 0.49202895164489746,
"learning_rate": 3.017422433616227e-05,
"loss": 0.0327,
"step": 15560
},
{
"epoch": 16.03501544799176,
"grad_norm": 0.2781137228012085,
"learning_rate": 3.011099602559728e-05,
"loss": 0.0343,
"step": 15570
},
{
"epoch": 16.04531410916581,
"grad_norm": 0.28886276483535767,
"learning_rate": 3.0047805475991487e-05,
"loss": 0.0298,
"step": 15580
},
{
"epoch": 16.055612770339856,
"grad_norm": 0.2826370596885681,
"learning_rate": 2.9984652807317442e-05,
"loss": 0.0308,
"step": 15590
},
{
"epoch": 16.065911431513904,
"grad_norm": 0.22490771114826202,
"learning_rate": 2.992153813947588e-05,
"loss": 0.0267,
"step": 15600
},
{
"epoch": 16.07621009268795,
"grad_norm": 0.17981351912021637,
"learning_rate": 2.9858461592295316e-05,
"loss": 0.0249,
"step": 15610
},
{
"epoch": 16.086508753862,
"grad_norm": 0.2858598828315735,
"learning_rate": 2.9795423285531942e-05,
"loss": 0.0287,
"step": 15620
},
{
"epoch": 16.096807415036047,
"grad_norm": 0.2903124988079071,
"learning_rate": 2.9732423338869286e-05,
"loss": 0.0339,
"step": 15630
},
{
"epoch": 16.107106076210094,
"grad_norm": 0.2045518010854721,
"learning_rate": 2.9669461871918143e-05,
"loss": 0.0262,
"step": 15640
},
{
"epoch": 16.117404737384142,
"grad_norm": 0.18494410812854767,
"learning_rate": 2.9606539004216142e-05,
"loss": 0.037,
"step": 15650
},
{
"epoch": 16.127703398558186,
"grad_norm": 0.2084999680519104,
"learning_rate": 2.954365485522771e-05,
"loss": 0.0321,
"step": 15660
},
{
"epoch": 16.138002059732234,
"grad_norm": 0.39265531301498413,
"learning_rate": 2.9480809544343713e-05,
"loss": 0.0293,
"step": 15670
},
{
"epoch": 16.14830072090628,
"grad_norm": 0.20841005444526672,
"learning_rate": 2.9418003190881314e-05,
"loss": 0.0303,
"step": 15680
},
{
"epoch": 16.15859938208033,
"grad_norm": 0.30363988876342773,
"learning_rate": 2.935523591408366e-05,
"loss": 0.0292,
"step": 15690
},
{
"epoch": 16.168898043254377,
"grad_norm": 0.24384894967079163,
"learning_rate": 2.9292507833119798e-05,
"loss": 0.0272,
"step": 15700
},
{
"epoch": 16.179196704428424,
"grad_norm": 0.2245059460401535,
"learning_rate": 2.9229819067084262e-05,
"loss": 0.0256,
"step": 15710
},
{
"epoch": 16.189495365602472,
"grad_norm": 0.2277413308620453,
"learning_rate": 2.9167169734997024e-05,
"loss": 0.0269,
"step": 15720
},
{
"epoch": 16.19979402677652,
"grad_norm": 0.2198115885257721,
"learning_rate": 2.91045599558031e-05,
"loss": 0.0348,
"step": 15730
},
{
"epoch": 16.210092687950567,
"grad_norm": 0.21734359860420227,
"learning_rate": 2.9041989848372497e-05,
"loss": 0.0241,
"step": 15740
},
{
"epoch": 16.220391349124615,
"grad_norm": 0.23658619821071625,
"learning_rate": 2.8979459531499876e-05,
"loss": 0.032,
"step": 15750
},
{
"epoch": 16.230690010298662,
"grad_norm": 0.33513155579566956,
"learning_rate": 2.891696912390429e-05,
"loss": 0.0301,
"step": 15760
},
{
"epoch": 16.24098867147271,
"grad_norm": 0.26320722699165344,
"learning_rate": 2.885451874422911e-05,
"loss": 0.0305,
"step": 15770
},
{
"epoch": 16.251287332646754,
"grad_norm": 0.26464515924453735,
"learning_rate": 2.8792108511041666e-05,
"loss": 0.0332,
"step": 15780
},
{
"epoch": 16.261585993820802,
"grad_norm": 0.30329862236976624,
"learning_rate": 2.8729738542833097e-05,
"loss": 0.0284,
"step": 15790
},
{
"epoch": 16.27188465499485,
"grad_norm": 0.237071231007576,
"learning_rate": 2.8667408958018e-05,
"loss": 0.0323,
"step": 15800
},
{
"epoch": 16.282183316168897,
"grad_norm": 0.22881901264190674,
"learning_rate": 2.860511987493446e-05,
"loss": 0.0342,
"step": 15810
},
{
"epoch": 16.292481977342945,
"grad_norm": 0.32153141498565674,
"learning_rate": 2.854287141184353e-05,
"loss": 0.0266,
"step": 15820
},
{
"epoch": 16.302780638516992,
"grad_norm": 0.27103251218795776,
"learning_rate": 2.8480663686929194e-05,
"loss": 0.0333,
"step": 15830
},
{
"epoch": 16.31307929969104,
"grad_norm": 0.3440805971622467,
"learning_rate": 2.8418496818298095e-05,
"loss": 0.0308,
"step": 15840
},
{
"epoch": 16.323377960865088,
"grad_norm": 0.3047653138637543,
"learning_rate": 2.8356370923979324e-05,
"loss": 0.0269,
"step": 15850
},
{
"epoch": 16.333676622039135,
"grad_norm": 0.19806207716464996,
"learning_rate": 2.8294286121924084e-05,
"loss": 0.0317,
"step": 15860
},
{
"epoch": 16.343975283213183,
"grad_norm": 0.1738138198852539,
"learning_rate": 2.8232242530005726e-05,
"loss": 0.0326,
"step": 15870
},
{
"epoch": 16.35427394438723,
"grad_norm": 0.3517332077026367,
"learning_rate": 2.8170240266019197e-05,
"loss": 0.0343,
"step": 15880
},
{
"epoch": 16.36457260556128,
"grad_norm": 0.28405335545539856,
"learning_rate": 2.8108279447681072e-05,
"loss": 0.0301,
"step": 15890
},
{
"epoch": 16.374871266735326,
"grad_norm": 0.34772422909736633,
"learning_rate": 2.8046360192629218e-05,
"loss": 0.0266,
"step": 15900
},
{
"epoch": 16.38516992790937,
"grad_norm": 0.26349806785583496,
"learning_rate": 2.7984482618422604e-05,
"loss": 0.0338,
"step": 15910
},
{
"epoch": 16.395468589083418,
"grad_norm": 0.4074728190898895,
"learning_rate": 2.7922646842540977e-05,
"loss": 0.0302,
"step": 15920
},
{
"epoch": 16.405767250257465,
"grad_norm": 0.3678358793258667,
"learning_rate": 2.7860852982384887e-05,
"loss": 0.0389,
"step": 15930
},
{
"epoch": 16.416065911431513,
"grad_norm": 0.22004730999469757,
"learning_rate": 2.7799101155275155e-05,
"loss": 0.0335,
"step": 15940
},
{
"epoch": 16.42636457260556,
"grad_norm": 0.41160476207733154,
"learning_rate": 2.773739147845286e-05,
"loss": 0.0335,
"step": 15950
},
{
"epoch": 16.43666323377961,
"grad_norm": 0.2567139267921448,
"learning_rate": 2.767572406907908e-05,
"loss": 0.0275,
"step": 15960
},
{
"epoch": 16.446961894953656,
"grad_norm": 0.3399168848991394,
"learning_rate": 2.761409904423461e-05,
"loss": 0.0326,
"step": 15970
},
{
"epoch": 16.457260556127704,
"grad_norm": 0.35717037320137024,
"learning_rate": 2.7552516520919734e-05,
"loss": 0.0346,
"step": 15980
},
{
"epoch": 16.46755921730175,
"grad_norm": 0.3658212721347809,
"learning_rate": 2.7490976616054177e-05,
"loss": 0.0353,
"step": 15990
},
{
"epoch": 16.4778578784758,
"grad_norm": 0.22349724173545837,
"learning_rate": 2.7429479446476587e-05,
"loss": 0.0285,
"step": 16000
},
{
"epoch": 16.488156539649847,
"grad_norm": 0.23875796794891357,
"learning_rate": 2.7368025128944576e-05,
"loss": 0.0276,
"step": 16010
},
{
"epoch": 16.498455200823894,
"grad_norm": 0.2834377586841583,
"learning_rate": 2.7306613780134376e-05,
"loss": 0.0325,
"step": 16020
},
{
"epoch": 16.508753861997942,
"grad_norm": 0.2421753704547882,
"learning_rate": 2.7245245516640656e-05,
"loss": 0.0275,
"step": 16030
},
{
"epoch": 16.519052523171986,
"grad_norm": 0.2751816213130951,
"learning_rate": 2.7183920454976196e-05,
"loss": 0.0263,
"step": 16040
},
{
"epoch": 16.529351184346034,
"grad_norm": 0.2655097544193268,
"learning_rate": 2.7122638711571912e-05,
"loss": 0.0266,
"step": 16050
},
{
"epoch": 16.53964984552008,
"grad_norm": 0.24982544779777527,
"learning_rate": 2.7061400402776328e-05,
"loss": 0.0293,
"step": 16060
},
{
"epoch": 16.54994850669413,
"grad_norm": 0.2736775577068329,
"learning_rate": 2.7000205644855557e-05,
"loss": 0.0326,
"step": 16070
},
{
"epoch": 16.560247167868177,
"grad_norm": 0.2747161388397217,
"learning_rate": 2.6939054553993065e-05,
"loss": 0.0253,
"step": 16080
},
{
"epoch": 16.570545829042224,
"grad_norm": 0.22185395658016205,
"learning_rate": 2.6877947246289376e-05,
"loss": 0.0312,
"step": 16090
},
{
"epoch": 16.580844490216272,
"grad_norm": 0.2517991364002228,
"learning_rate": 2.681688383776184e-05,
"loss": 0.027,
"step": 16100
},
{
"epoch": 16.59114315139032,
"grad_norm": 0.2567277252674103,
"learning_rate": 2.675586444434459e-05,
"loss": 0.03,
"step": 16110
},
{
"epoch": 16.601441812564367,
"grad_norm": 0.2631073296070099,
"learning_rate": 2.669488918188806e-05,
"loss": 0.0322,
"step": 16120
},
{
"epoch": 16.611740473738415,
"grad_norm": 0.46920910477638245,
"learning_rate": 2.6633958166158958e-05,
"loss": 0.0273,
"step": 16130
},
{
"epoch": 16.622039134912463,
"grad_norm": 0.1975933313369751,
"learning_rate": 2.6573071512839996e-05,
"loss": 0.0302,
"step": 16140
},
{
"epoch": 16.63233779608651,
"grad_norm": 0.34002429246902466,
"learning_rate": 2.651222933752965e-05,
"loss": 0.0284,
"step": 16150
},
{
"epoch": 16.642636457260558,
"grad_norm": 0.18480847775936127,
"learning_rate": 2.645143175574192e-05,
"loss": 0.0284,
"step": 16160
},
{
"epoch": 16.652935118434602,
"grad_norm": 0.1662713885307312,
"learning_rate": 2.6390678882906173e-05,
"loss": 0.0311,
"step": 16170
},
{
"epoch": 16.66323377960865,
"grad_norm": 0.26757383346557617,
"learning_rate": 2.6329970834366886e-05,
"loss": 0.0303,
"step": 16180
},
{
"epoch": 16.673532440782697,
"grad_norm": 0.2224883884191513,
"learning_rate": 2.626930772538344e-05,
"loss": 0.0331,
"step": 16190
},
{
"epoch": 16.683831101956745,
"grad_norm": 0.5324035882949829,
"learning_rate": 2.6208689671129898e-05,
"loss": 0.0277,
"step": 16200
},
{
"epoch": 16.694129763130793,
"grad_norm": 0.24052481353282928,
"learning_rate": 2.6148116786694743e-05,
"loss": 0.0362,
"step": 16210
},
{
"epoch": 16.70442842430484,
"grad_norm": 0.16978083550930023,
"learning_rate": 2.6087589187080742e-05,
"loss": 0.0249,
"step": 16220
},
{
"epoch": 16.714727085478888,
"grad_norm": 0.2954985499382019,
"learning_rate": 2.6027106987204676e-05,
"loss": 0.0304,
"step": 16230
},
{
"epoch": 16.725025746652936,
"grad_norm": 0.31481167674064636,
"learning_rate": 2.5966670301897116e-05,
"loss": 0.0301,
"step": 16240
},
{
"epoch": 16.735324407826983,
"grad_norm": 0.4357336461544037,
"learning_rate": 2.590627924590224e-05,
"loss": 0.0276,
"step": 16250
},
{
"epoch": 16.74562306900103,
"grad_norm": 0.22642803192138672,
"learning_rate": 2.58459339338776e-05,
"loss": 0.0307,
"step": 16260
},
{
"epoch": 16.75592173017508,
"grad_norm": 0.2144901007413864,
"learning_rate": 2.578563448039384e-05,
"loss": 0.0353,
"step": 16270
},
{
"epoch": 16.766220391349126,
"grad_norm": 0.19773432612419128,
"learning_rate": 2.57253809999346e-05,
"loss": 0.0325,
"step": 16280
},
{
"epoch": 16.77651905252317,
"grad_norm": 0.2523026168346405,
"learning_rate": 2.5665173606896233e-05,
"loss": 0.0312,
"step": 16290
},
{
"epoch": 16.786817713697218,
"grad_norm": 0.3775824010372162,
"learning_rate": 2.560501241558756e-05,
"loss": 0.0344,
"step": 16300
},
{
"epoch": 16.797116374871266,
"grad_norm": 0.24595195055007935,
"learning_rate": 2.5544897540229708e-05,
"loss": 0.032,
"step": 16310
},
{
"epoch": 16.807415036045313,
"grad_norm": 0.3129733204841614,
"learning_rate": 2.5484829094955876e-05,
"loss": 0.0317,
"step": 16320
},
{
"epoch": 16.81771369721936,
"grad_norm": 0.3071824610233307,
"learning_rate": 2.5424807193811055e-05,
"loss": 0.0322,
"step": 16330
},
{
"epoch": 16.82801235839341,
"grad_norm": 0.2712470591068268,
"learning_rate": 2.5364831950751945e-05,
"loss": 0.0298,
"step": 16340
},
{
"epoch": 16.838311019567456,
"grad_norm": 0.31381168961524963,
"learning_rate": 2.5304903479646603e-05,
"loss": 0.0305,
"step": 16350
},
{
"epoch": 16.848609680741504,
"grad_norm": 0.24024145305156708,
"learning_rate": 2.5245021894274333e-05,
"loss": 0.0309,
"step": 16360
},
{
"epoch": 16.85890834191555,
"grad_norm": 0.2446487545967102,
"learning_rate": 2.5185187308325375e-05,
"loss": 0.0336,
"step": 16370
},
{
"epoch": 16.8692070030896,
"grad_norm": 0.2759553790092468,
"learning_rate": 2.51253998354008e-05,
"loss": 0.0308,
"step": 16380
},
{
"epoch": 16.879505664263647,
"grad_norm": 0.23006874322891235,
"learning_rate": 2.506565958901214e-05,
"loss": 0.0261,
"step": 16390
},
{
"epoch": 16.889804325437694,
"grad_norm": 0.25269433856010437,
"learning_rate": 2.500596668258134e-05,
"loss": 0.0317,
"step": 16400
},
{
"epoch": 16.900102986611742,
"grad_norm": 0.33330032229423523,
"learning_rate": 2.4946321229440435e-05,
"loss": 0.0372,
"step": 16410
},
{
"epoch": 16.910401647785786,
"grad_norm": 0.38393720984458923,
"learning_rate": 2.4886723342831374e-05,
"loss": 0.0368,
"step": 16420
},
{
"epoch": 16.920700308959834,
"grad_norm": 0.254517525434494,
"learning_rate": 2.482717313590579e-05,
"loss": 0.0261,
"step": 16430
},
{
"epoch": 16.93099897013388,
"grad_norm": 0.21054412424564362,
"learning_rate": 2.4767670721724822e-05,
"loss": 0.0226,
"step": 16440
},
{
"epoch": 16.94129763130793,
"grad_norm": 0.2419450879096985,
"learning_rate": 2.4708216213258805e-05,
"loss": 0.0259,
"step": 16450
},
{
"epoch": 16.951596292481977,
"grad_norm": 0.220100998878479,
"learning_rate": 2.464880972338718e-05,
"loss": 0.0296,
"step": 16460
},
{
"epoch": 16.961894953656024,
"grad_norm": 0.2163473665714264,
"learning_rate": 2.4589451364898197e-05,
"loss": 0.0276,
"step": 16470
},
{
"epoch": 16.972193614830072,
"grad_norm": 0.30859941244125366,
"learning_rate": 2.453014125048873e-05,
"loss": 0.0332,
"step": 16480
},
{
"epoch": 16.98249227600412,
"grad_norm": 0.16488564014434814,
"learning_rate": 2.447087949276406e-05,
"loss": 0.0287,
"step": 16490
},
{
"epoch": 16.992790937178167,
"grad_norm": 1.141352891921997,
"learning_rate": 2.441166620423767e-05,
"loss": 0.0282,
"step": 16500
},
{
"epoch": 17.003089598352215,
"grad_norm": 0.6053497791290283,
"learning_rate": 2.435250149733097e-05,
"loss": 0.027,
"step": 16510
},
{
"epoch": 17.013388259526263,
"grad_norm": 0.4089854955673218,
"learning_rate": 2.4293385484373188e-05,
"loss": 0.03,
"step": 16520
},
{
"epoch": 17.02368692070031,
"grad_norm": 0.38011640310287476,
"learning_rate": 2.423431827760108e-05,
"loss": 0.0242,
"step": 16530
},
{
"epoch": 17.033985581874358,
"grad_norm": 0.15878790616989136,
"learning_rate": 2.417529998915875e-05,
"loss": 0.0295,
"step": 16540
},
{
"epoch": 17.044284243048402,
"grad_norm": 0.3160199820995331,
"learning_rate": 2.411633073109741e-05,
"loss": 0.0317,
"step": 16550
},
{
"epoch": 17.05458290422245,
"grad_norm": 0.4487075209617615,
"learning_rate": 2.4057410615375215e-05,
"loss": 0.0322,
"step": 16560
},
{
"epoch": 17.064881565396497,
"grad_norm": 0.14873281121253967,
"learning_rate": 2.3998539753856962e-05,
"loss": 0.0275,
"step": 16570
},
{
"epoch": 17.075180226570545,
"grad_norm": 0.26425817608833313,
"learning_rate": 2.393971825831398e-05,
"loss": 0.0332,
"step": 16580
},
{
"epoch": 17.085478887744593,
"grad_norm": 0.28254973888397217,
"learning_rate": 2.388094624042389e-05,
"loss": 0.0272,
"step": 16590
},
{
"epoch": 17.09577754891864,
"grad_norm": 0.2275743931531906,
"learning_rate": 2.3822223811770288e-05,
"loss": 0.0274,
"step": 16600
},
{
"epoch": 17.106076210092688,
"grad_norm": 0.2673308253288269,
"learning_rate": 2.3763551083842757e-05,
"loss": 0.0275,
"step": 16610
},
{
"epoch": 17.116374871266736,
"grad_norm": 0.25909948348999023,
"learning_rate": 2.370492816803638e-05,
"loss": 0.0327,
"step": 16620
},
{
"epoch": 17.126673532440783,
"grad_norm": 0.2639828026294708,
"learning_rate": 2.364635517565175e-05,
"loss": 0.0302,
"step": 16630
},
{
"epoch": 17.13697219361483,
"grad_norm": 0.2498447746038437,
"learning_rate": 2.358783221789466e-05,
"loss": 0.0332,
"step": 16640
},
{
"epoch": 17.14727085478888,
"grad_norm": 0.4615592658519745,
"learning_rate": 2.352935940587592e-05,
"loss": 0.0324,
"step": 16650
},
{
"epoch": 17.157569515962926,
"grad_norm": 0.3771480619907379,
"learning_rate": 2.3470936850611063e-05,
"loss": 0.0319,
"step": 16660
},
{
"epoch": 17.167868177136974,
"grad_norm": 0.15599535405635834,
"learning_rate": 2.3412564663020337e-05,
"loss": 0.0361,
"step": 16670
},
{
"epoch": 17.178166838311018,
"grad_norm": 0.26683464646339417,
"learning_rate": 2.335424295392822e-05,
"loss": 0.0297,
"step": 16680
},
{
"epoch": 17.188465499485066,
"grad_norm": 0.2283865511417389,
"learning_rate": 2.3295971834063446e-05,
"loss": 0.0285,
"step": 16690
},
{
"epoch": 17.198764160659113,
"grad_norm": 0.2537069022655487,
"learning_rate": 2.323775141405867e-05,
"loss": 0.0248,
"step": 16700
},
{
"epoch": 17.20906282183316,
"grad_norm": 0.40300917625427246,
"learning_rate": 2.3179581804450306e-05,
"loss": 0.0285,
"step": 16710
},
{
"epoch": 17.21936148300721,
"grad_norm": 0.32292699813842773,
"learning_rate": 2.3121463115678237e-05,
"loss": 0.0277,
"step": 16720
},
{
"epoch": 17.229660144181256,
"grad_norm": 0.2189875841140747,
"learning_rate": 2.3063395458085795e-05,
"loss": 0.0349,
"step": 16730
},
{
"epoch": 17.239958805355304,
"grad_norm": 0.24812109768390656,
"learning_rate": 2.3005378941919287e-05,
"loss": 0.0293,
"step": 16740
},
{
"epoch": 17.25025746652935,
"grad_norm": 0.46981245279312134,
"learning_rate": 2.2947413677328e-05,
"loss": 0.0297,
"step": 16750
},
{
"epoch": 17.2605561277034,
"grad_norm": 0.293948233127594,
"learning_rate": 2.2889499774363903e-05,
"loss": 0.0303,
"step": 16760
},
{
"epoch": 17.270854788877447,
"grad_norm": 1.4435584545135498,
"learning_rate": 2.283163734298147e-05,
"loss": 0.0281,
"step": 16770
},
{
"epoch": 17.281153450051495,
"grad_norm": 2.3254072666168213,
"learning_rate": 2.2773826493037377e-05,
"loss": 0.0307,
"step": 16780
},
{
"epoch": 17.291452111225542,
"grad_norm": 0.45493340492248535,
"learning_rate": 2.271606733429048e-05,
"loss": 0.0315,
"step": 16790
},
{
"epoch": 17.301750772399586,
"grad_norm": 0.3371964991092682,
"learning_rate": 2.2658359976401388e-05,
"loss": 0.0301,
"step": 16800
},
{
"epoch": 17.312049433573634,
"grad_norm": 0.2556536793708801,
"learning_rate": 2.260070452893243e-05,
"loss": 0.0324,
"step": 16810
},
{
"epoch": 17.32234809474768,
"grad_norm": 0.24934424459934235,
"learning_rate": 2.2543101101347357e-05,
"loss": 0.0289,
"step": 16820
},
{
"epoch": 17.33264675592173,
"grad_norm": 0.2703939974308014,
"learning_rate": 2.2485549803011175e-05,
"loss": 0.029,
"step": 16830
},
{
"epoch": 17.342945417095777,
"grad_norm": 0.24408842623233795,
"learning_rate": 2.2428050743189845e-05,
"loss": 0.0267,
"step": 16840
},
{
"epoch": 17.353244078269825,
"grad_norm": 0.3018171787261963,
"learning_rate": 2.237060403105027e-05,
"loss": 0.0286,
"step": 16850
},
{
"epoch": 17.363542739443872,
"grad_norm": 0.3476426899433136,
"learning_rate": 2.2313209775659854e-05,
"loss": 0.0299,
"step": 16860
},
{
"epoch": 17.37384140061792,
"grad_norm": 0.25924161076545715,
"learning_rate": 2.2255868085986476e-05,
"loss": 0.03,
"step": 16870
},
{
"epoch": 17.384140061791967,
"grad_norm": 0.3101584017276764,
"learning_rate": 2.219857907089818e-05,
"loss": 0.0256,
"step": 16880
},
{
"epoch": 17.394438722966015,
"grad_norm": 0.21111738681793213,
"learning_rate": 2.2141342839163038e-05,
"loss": 0.0262,
"step": 16890
},
{
"epoch": 17.404737384140063,
"grad_norm": 0.2484743893146515,
"learning_rate": 2.2084159499448833e-05,
"loss": 0.0295,
"step": 16900
},
{
"epoch": 17.41503604531411,
"grad_norm": 0.30068278312683105,
"learning_rate": 2.2027029160323053e-05,
"loss": 0.0309,
"step": 16910
},
{
"epoch": 17.425334706488158,
"grad_norm": 0.20330175757408142,
"learning_rate": 2.196995193025243e-05,
"loss": 0.0236,
"step": 16920
},
{
"epoch": 17.435633367662202,
"grad_norm": 0.31278255581855774,
"learning_rate": 2.1912927917602944e-05,
"loss": 0.0357,
"step": 16930
},
{
"epoch": 17.44593202883625,
"grad_norm": 0.5232197642326355,
"learning_rate": 2.1855957230639507e-05,
"loss": 0.0324,
"step": 16940
},
{
"epoch": 17.456230690010297,
"grad_norm": 0.2283019870519638,
"learning_rate": 2.179903997752582e-05,
"loss": 0.0271,
"step": 16950
},
{
"epoch": 17.466529351184345,
"grad_norm": 0.2504780888557434,
"learning_rate": 2.174217626632407e-05,
"loss": 0.0245,
"step": 16960
},
{
"epoch": 17.476828012358393,
"grad_norm": 0.40278464555740356,
"learning_rate": 2.168536620499485e-05,
"loss": 0.0371,
"step": 16970
},
{
"epoch": 17.48712667353244,
"grad_norm": 0.21591483056545258,
"learning_rate": 2.162860990139688e-05,
"loss": 0.0312,
"step": 16980
},
{
"epoch": 17.497425334706488,
"grad_norm": 0.33034050464630127,
"learning_rate": 2.15719074632868e-05,
"loss": 0.0281,
"step": 16990
},
{
"epoch": 17.507723995880536,
"grad_norm": 0.3057808578014374,
"learning_rate": 2.151525899831902e-05,
"loss": 0.0343,
"step": 17000
},
{
"epoch": 17.518022657054583,
"grad_norm": 0.344502717256546,
"learning_rate": 2.1458664614045415e-05,
"loss": 0.0341,
"step": 17010
},
{
"epoch": 17.52832131822863,
"grad_norm": 0.2241314798593521,
"learning_rate": 2.1402124417915226e-05,
"loss": 0.0326,
"step": 17020
},
{
"epoch": 17.53861997940268,
"grad_norm": 0.30178990960121155,
"learning_rate": 2.134563851727482e-05,
"loss": 0.0337,
"step": 17030
},
{
"epoch": 17.548918640576726,
"grad_norm": 0.39564597606658936,
"learning_rate": 2.128920701936745e-05,
"loss": 0.0309,
"step": 17040
},
{
"epoch": 17.559217301750774,
"grad_norm": 0.20545372366905212,
"learning_rate": 2.123283003133311e-05,
"loss": 0.031,
"step": 17050
},
{
"epoch": 17.569515962924818,
"grad_norm": 0.43441474437713623,
"learning_rate": 2.11765076602083e-05,
"loss": 0.0321,
"step": 17060
},
{
"epoch": 17.579814624098866,
"grad_norm": 0.4398522675037384,
"learning_rate": 2.1120240012925775e-05,
"loss": 0.0301,
"step": 17070
},
{
"epoch": 17.590113285272913,
"grad_norm": 0.3297673761844635,
"learning_rate": 2.1064027196314452e-05,
"loss": 0.0215,
"step": 17080
},
{
"epoch": 17.60041194644696,
"grad_norm": 0.3874305486679077,
"learning_rate": 2.1007869317099128e-05,
"loss": 0.029,
"step": 17090
},
{
"epoch": 17.61071060762101,
"grad_norm": 0.5168983340263367,
"learning_rate": 2.095176648190029e-05,
"loss": 0.0265,
"step": 17100
},
{
"epoch": 17.621009268795056,
"grad_norm": 0.29372426867485046,
"learning_rate": 2.0895718797233925e-05,
"loss": 0.0325,
"step": 17110
},
{
"epoch": 17.631307929969104,
"grad_norm": 0.33520805835723877,
"learning_rate": 2.0839726369511338e-05,
"loss": 0.0342,
"step": 17120
},
{
"epoch": 17.64160659114315,
"grad_norm": 0.2704266607761383,
"learning_rate": 2.0783789305038847e-05,
"loss": 0.0273,
"step": 17130
},
{
"epoch": 17.6519052523172,
"grad_norm": 0.29930296540260315,
"learning_rate": 2.0727907710017736e-05,
"loss": 0.0342,
"step": 17140
},
{
"epoch": 17.662203913491247,
"grad_norm": 0.2962852716445923,
"learning_rate": 2.067208169054394e-05,
"loss": 0.0322,
"step": 17150
},
{
"epoch": 17.672502574665295,
"grad_norm": 0.31568607687950134,
"learning_rate": 2.061631135260789e-05,
"loss": 0.0249,
"step": 17160
},
{
"epoch": 17.682801235839342,
"grad_norm": 0.2064095288515091,
"learning_rate": 2.0560596802094305e-05,
"loss": 0.0246,
"step": 17170
},
{
"epoch": 17.69309989701339,
"grad_norm": 0.3407458961009979,
"learning_rate": 2.0504938144781988e-05,
"loss": 0.0262,
"step": 17180
},
{
"epoch": 17.703398558187434,
"grad_norm": 0.273579478263855,
"learning_rate": 2.0449335486343584e-05,
"loss": 0.0273,
"step": 17190
},
{
"epoch": 17.71369721936148,
"grad_norm": 0.23050899803638458,
"learning_rate": 2.039378893234547e-05,
"loss": 0.0259,
"step": 17200
},
{
"epoch": 17.72399588053553,
"grad_norm": 0.287034809589386,
"learning_rate": 2.0338298588247485e-05,
"loss": 0.0339,
"step": 17210
},
{
"epoch": 17.734294541709577,
"grad_norm": 0.2856465280056,
"learning_rate": 2.028286455940274e-05,
"loss": 0.0331,
"step": 17220
},
{
"epoch": 17.744593202883625,
"grad_norm": 0.18497076630592346,
"learning_rate": 2.022748695105745e-05,
"loss": 0.0274,
"step": 17230
},
{
"epoch": 17.754891864057672,
"grad_norm": 0.27101799845695496,
"learning_rate": 2.0172165868350707e-05,
"loss": 0.031,
"step": 17240
},
{
"epoch": 17.76519052523172,
"grad_norm": 0.27160143852233887,
"learning_rate": 2.0116901416314234e-05,
"loss": 0.029,
"step": 17250
},
{
"epoch": 17.775489186405768,
"grad_norm": 0.24040678143501282,
"learning_rate": 2.0061693699872298e-05,
"loss": 0.0336,
"step": 17260
},
{
"epoch": 17.785787847579815,
"grad_norm": 0.2809303402900696,
"learning_rate": 2.0006542823841423e-05,
"loss": 0.0245,
"step": 17270
},
{
"epoch": 17.796086508753863,
"grad_norm": 0.33203157782554626,
"learning_rate": 1.9951448892930225e-05,
"loss": 0.032,
"step": 17280
},
{
"epoch": 17.80638516992791,
"grad_norm": 0.3012252151966095,
"learning_rate": 1.9896412011739197e-05,
"loss": 0.0295,
"step": 17290
},
{
"epoch": 17.816683831101958,
"grad_norm": 0.22999230027198792,
"learning_rate": 1.9841432284760537e-05,
"loss": 0.0294,
"step": 17300
},
{
"epoch": 17.826982492276002,
"grad_norm": 0.42275768518447876,
"learning_rate": 1.978650981637788e-05,
"loss": 0.0343,
"step": 17310
},
{
"epoch": 17.83728115345005,
"grad_norm": 0.29662439227104187,
"learning_rate": 1.9731644710866204e-05,
"loss": 0.0288,
"step": 17320
},
{
"epoch": 17.847579814624098,
"grad_norm": 0.19465483725070953,
"learning_rate": 1.967683707239156e-05,
"loss": 0.0263,
"step": 17330
},
{
"epoch": 17.857878475798145,
"grad_norm": 0.22264879941940308,
"learning_rate": 1.9622087005010902e-05,
"loss": 0.0263,
"step": 17340
},
{
"epoch": 17.868177136972193,
"grad_norm": 0.24109327793121338,
"learning_rate": 1.956739461267186e-05,
"loss": 0.0302,
"step": 17350
},
{
"epoch": 17.87847579814624,
"grad_norm": 0.2714434266090393,
"learning_rate": 1.9512759999212593e-05,
"loss": 0.0328,
"step": 17360
},
{
"epoch": 17.888774459320288,
"grad_norm": 0.18760699033737183,
"learning_rate": 1.945818326836151e-05,
"loss": 0.03,
"step": 17370
},
{
"epoch": 17.899073120494336,
"grad_norm": 0.19363541901111603,
"learning_rate": 1.940366452373718e-05,
"loss": 0.0256,
"step": 17380
},
{
"epoch": 17.909371781668384,
"grad_norm": 0.30826708674430847,
"learning_rate": 1.9349203868848077e-05,
"loss": 0.0314,
"step": 17390
},
{
"epoch": 17.91967044284243,
"grad_norm": 0.24862709641456604,
"learning_rate": 1.929480140709231e-05,
"loss": 0.0242,
"step": 17400
},
{
"epoch": 17.92996910401648,
"grad_norm": 0.2579440176486969,
"learning_rate": 1.9240457241757635e-05,
"loss": 0.0247,
"step": 17410
},
{
"epoch": 17.940267765190526,
"grad_norm": 0.2497265785932541,
"learning_rate": 1.9186171476021004e-05,
"loss": 0.0276,
"step": 17420
},
{
"epoch": 17.950566426364574,
"grad_norm": 0.2420155256986618,
"learning_rate": 1.9131944212948555e-05,
"loss": 0.029,
"step": 17430
},
{
"epoch": 17.96086508753862,
"grad_norm": 0.18265056610107422,
"learning_rate": 1.907777555549534e-05,
"loss": 0.0382,
"step": 17440
},
{
"epoch": 17.971163748712666,
"grad_norm": 0.2574191093444824,
"learning_rate": 1.9023665606505175e-05,
"loss": 0.0305,
"step": 17450
},
{
"epoch": 17.981462409886714,
"grad_norm": 0.3031885325908661,
"learning_rate": 1.8969614468710317e-05,
"loss": 0.022,
"step": 17460
},
{
"epoch": 17.99176107106076,
"grad_norm": 0.2563394010066986,
"learning_rate": 1.89156222447315e-05,
"loss": 0.0261,
"step": 17470
},
{
"epoch": 18.00205973223481,
"grad_norm": 0.23329629004001617,
"learning_rate": 1.8861689037077494e-05,
"loss": 0.0264,
"step": 17480
},
{
"epoch": 18.012358393408856,
"grad_norm": 0.24654603004455566,
"learning_rate": 1.880781494814508e-05,
"loss": 0.0289,
"step": 17490
},
{
"epoch": 18.022657054582904,
"grad_norm": 0.33546507358551025,
"learning_rate": 1.875400008021877e-05,
"loss": 0.0279,
"step": 17500
},
{
"epoch": 18.03295571575695,
"grad_norm": 0.28472423553466797,
"learning_rate": 1.8700244535470673e-05,
"loss": 0.0246,
"step": 17510
},
{
"epoch": 18.043254376931,
"grad_norm": 0.22415411472320557,
"learning_rate": 1.8646548415960197e-05,
"loss": 0.0262,
"step": 17520
},
{
"epoch": 18.053553038105047,
"grad_norm": 0.31146541237831116,
"learning_rate": 1.8592911823634034e-05,
"loss": 0.0316,
"step": 17530
},
{
"epoch": 18.063851699279095,
"grad_norm": 0.480960875749588,
"learning_rate": 1.8539334860325757e-05,
"loss": 0.0257,
"step": 17540
},
{
"epoch": 18.074150360453142,
"grad_norm": 0.23705746233463287,
"learning_rate": 1.8485817627755787e-05,
"loss": 0.0361,
"step": 17550
},
{
"epoch": 18.08444902162719,
"grad_norm": 0.2233453243970871,
"learning_rate": 1.8432360227531116e-05,
"loss": 0.0332,
"step": 17560
},
{
"epoch": 18.094747682801234,
"grad_norm": 0.19901172816753387,
"learning_rate": 1.837896276114517e-05,
"loss": 0.0271,
"step": 17570
},
{
"epoch": 18.105046343975282,
"grad_norm": 0.22876538336277008,
"learning_rate": 1.832562532997751e-05,
"loss": 0.0255,
"step": 17580
},
{
"epoch": 18.11534500514933,
"grad_norm": 0.33671191334724426,
"learning_rate": 1.827234803529384e-05,
"loss": 0.0312,
"step": 17590
},
{
"epoch": 18.125643666323377,
"grad_norm": 0.24380284547805786,
"learning_rate": 1.8219130978245563e-05,
"loss": 0.023,
"step": 17600
},
{
"epoch": 18.135942327497425,
"grad_norm": 0.26702800393104553,
"learning_rate": 1.816597425986979e-05,
"loss": 0.0239,
"step": 17610
},
{
"epoch": 18.146240988671472,
"grad_norm": 0.18801453709602356,
"learning_rate": 1.8112877981089055e-05,
"loss": 0.0291,
"step": 17620
},
{
"epoch": 18.15653964984552,
"grad_norm": 0.23468522727489471,
"learning_rate": 1.8059842242711156e-05,
"loss": 0.023,
"step": 17630
},
{
"epoch": 18.166838311019568,
"grad_norm": 0.23238340020179749,
"learning_rate": 1.800686714542889e-05,
"loss": 0.0267,
"step": 17640
},
{
"epoch": 18.177136972193615,
"grad_norm": 0.24705295264720917,
"learning_rate": 1.795395278982003e-05,
"loss": 0.0257,
"step": 17650
},
{
"epoch": 18.187435633367663,
"grad_norm": 0.21146944165229797,
"learning_rate": 1.7901099276346912e-05,
"loss": 0.0276,
"step": 17660
},
{
"epoch": 18.19773429454171,
"grad_norm": 0.28207865357398987,
"learning_rate": 1.7848306705356434e-05,
"loss": 0.0291,
"step": 17670
},
{
"epoch": 18.20803295571576,
"grad_norm": 0.2663029432296753,
"learning_rate": 1.7795575177079754e-05,
"loss": 0.0303,
"step": 17680
},
{
"epoch": 18.218331616889806,
"grad_norm": 0.2542501389980316,
"learning_rate": 1.7742904791632175e-05,
"loss": 0.0247,
"step": 17690
},
{
"epoch": 18.22863027806385,
"grad_norm": 0.2575390338897705,
"learning_rate": 1.769029564901282e-05,
"loss": 0.0256,
"step": 17700
},
{
"epoch": 18.238928939237898,
"grad_norm": 0.25764167308807373,
"learning_rate": 1.7637747849104692e-05,
"loss": 0.0308,
"step": 17710
},
{
"epoch": 18.249227600411945,
"grad_norm": 0.24213287234306335,
"learning_rate": 1.7585261491674175e-05,
"loss": 0.0277,
"step": 17720
},
{
"epoch": 18.259526261585993,
"grad_norm": 0.2765040099620819,
"learning_rate": 1.7532836676371083e-05,
"loss": 0.0284,
"step": 17730
},
{
"epoch": 18.26982492276004,
"grad_norm": 0.4227076768875122,
"learning_rate": 1.748047350272838e-05,
"loss": 0.0269,
"step": 17740
},
{
"epoch": 18.28012358393409,
"grad_norm": 0.30982914566993713,
"learning_rate": 1.7428172070161992e-05,
"loss": 0.027,
"step": 17750
},
{
"epoch": 18.290422245108136,
"grad_norm": 0.24797886610031128,
"learning_rate": 1.737593247797058e-05,
"loss": 0.0268,
"step": 17760
},
{
"epoch": 18.300720906282184,
"grad_norm": 0.25233253836631775,
"learning_rate": 1.7323754825335493e-05,
"loss": 0.0259,
"step": 17770
},
{
"epoch": 18.31101956745623,
"grad_norm": 0.18777655065059662,
"learning_rate": 1.72716392113204e-05,
"loss": 0.0306,
"step": 17780
},
{
"epoch": 18.32131822863028,
"grad_norm": 0.3482552766799927,
"learning_rate": 1.7219585734871175e-05,
"loss": 0.0283,
"step": 17790
},
{
"epoch": 18.331616889804327,
"grad_norm": 0.19487865269184113,
"learning_rate": 1.716759449481582e-05,
"loss": 0.0311,
"step": 17800
},
{
"epoch": 18.341915550978374,
"grad_norm": 0.5890926718711853,
"learning_rate": 1.7115665589864055e-05,
"loss": 0.0369,
"step": 17810
},
{
"epoch": 18.352214212152422,
"grad_norm": 0.27247244119644165,
"learning_rate": 1.706379911860732e-05,
"loss": 0.0257,
"step": 17820
},
{
"epoch": 18.362512873326466,
"grad_norm": 0.19774708151817322,
"learning_rate": 1.701199517951852e-05,
"loss": 0.0277,
"step": 17830
},
{
"epoch": 18.372811534500514,
"grad_norm": 0.21970653533935547,
"learning_rate": 1.6960253870951825e-05,
"loss": 0.0269,
"step": 17840
},
{
"epoch": 18.38311019567456,
"grad_norm": 0.25846680998802185,
"learning_rate": 1.6908575291142447e-05,
"loss": 0.0275,
"step": 17850
},
{
"epoch": 18.39340885684861,
"grad_norm": 0.16934412717819214,
"learning_rate": 1.6856959538206618e-05,
"loss": 0.0325,
"step": 17860
},
{
"epoch": 18.403707518022657,
"grad_norm": 0.34561848640441895,
"learning_rate": 1.6805406710141164e-05,
"loss": 0.0248,
"step": 17870
},
{
"epoch": 18.414006179196704,
"grad_norm": 0.21684630215168,
"learning_rate": 1.6753916904823518e-05,
"loss": 0.0273,
"step": 17880
},
{
"epoch": 18.424304840370752,
"grad_norm": 0.27160486578941345,
"learning_rate": 1.670249022001143e-05,
"loss": 0.0222,
"step": 17890
},
{
"epoch": 18.4346035015448,
"grad_norm": 0.24969789385795593,
"learning_rate": 1.6651126753342845e-05,
"loss": 0.0259,
"step": 17900
},
{
"epoch": 18.444902162718847,
"grad_norm": 0.28738731145858765,
"learning_rate": 1.65998266023356e-05,
"loss": 0.0292,
"step": 17910
},
{
"epoch": 18.455200823892895,
"grad_norm": 0.424589067697525,
"learning_rate": 1.654858986438745e-05,
"loss": 0.0318,
"step": 17920
},
{
"epoch": 18.465499485066942,
"grad_norm": 0.3065814971923828,
"learning_rate": 1.6497416636775625e-05,
"loss": 0.0357,
"step": 17930
},
{
"epoch": 18.47579814624099,
"grad_norm": 0.19018980860710144,
"learning_rate": 1.644630701665686e-05,
"loss": 0.0299,
"step": 17940
},
{
"epoch": 18.486096807415038,
"grad_norm": 0.29618680477142334,
"learning_rate": 1.6395261101067082e-05,
"loss": 0.0288,
"step": 17950
},
{
"epoch": 18.496395468589082,
"grad_norm": 0.18861742317676544,
"learning_rate": 1.6344278986921325e-05,
"loss": 0.0222,
"step": 17960
},
{
"epoch": 18.50669412976313,
"grad_norm": 0.25318944454193115,
"learning_rate": 1.629336077101339e-05,
"loss": 0.0256,
"step": 17970
},
{
"epoch": 18.516992790937177,
"grad_norm": 0.1487075686454773,
"learning_rate": 1.6242506550015896e-05,
"loss": 0.032,
"step": 17980
},
{
"epoch": 18.527291452111225,
"grad_norm": 0.1627105176448822,
"learning_rate": 1.619171642047983e-05,
"loss": 0.027,
"step": 17990
},
{
"epoch": 18.537590113285273,
"grad_norm": 0.21512849628925323,
"learning_rate": 1.6140990478834582e-05,
"loss": 0.0288,
"step": 18000
},
{
"epoch": 18.54788877445932,
"grad_norm": 0.2389346957206726,
"learning_rate": 1.609032882138765e-05,
"loss": 0.0242,
"step": 18010
},
{
"epoch": 18.558187435633368,
"grad_norm": 0.23594819009304047,
"learning_rate": 1.60397315443245e-05,
"loss": 0.0335,
"step": 18020
},
{
"epoch": 18.568486096807415,
"grad_norm": 0.31918737292289734,
"learning_rate": 1.5989198743708294e-05,
"loss": 0.0272,
"step": 18030
},
{
"epoch": 18.578784757981463,
"grad_norm": 0.40922242403030396,
"learning_rate": 1.5938730515479904e-05,
"loss": 0.0317,
"step": 18040
},
{
"epoch": 18.58908341915551,
"grad_norm": 0.506949782371521,
"learning_rate": 1.5888326955457487e-05,
"loss": 0.0348,
"step": 18050
},
{
"epoch": 18.59938208032956,
"grad_norm": 1.7606223821640015,
"learning_rate": 1.5837988159336493e-05,
"loss": 0.0329,
"step": 18060
},
{
"epoch": 18.609680741503606,
"grad_norm": 0.25751596689224243,
"learning_rate": 1.5787714222689386e-05,
"loss": 0.032,
"step": 18070
},
{
"epoch": 18.61997940267765,
"grad_norm": 0.22682945430278778,
"learning_rate": 1.5737505240965515e-05,
"loss": 0.0267,
"step": 18080
},
{
"epoch": 18.630278063851698,
"grad_norm": 0.32027962803840637,
"learning_rate": 1.5687361309490838e-05,
"loss": 0.0284,
"step": 18090
},
{
"epoch": 18.640576725025745,
"grad_norm": 0.18152746558189392,
"learning_rate": 1.5637282523467918e-05,
"loss": 0.0235,
"step": 18100
},
{
"epoch": 18.650875386199793,
"grad_norm": 0.30700448155403137,
"learning_rate": 1.5587268977975528e-05,
"loss": 0.0272,
"step": 18110
},
{
"epoch": 18.66117404737384,
"grad_norm": 0.2509545385837555,
"learning_rate": 1.553732076796863e-05,
"loss": 0.0256,
"step": 18120
},
{
"epoch": 18.67147270854789,
"grad_norm": 0.5025020241737366,
"learning_rate": 1.5487437988278142e-05,
"loss": 0.0252,
"step": 18130
},
{
"epoch": 18.681771369721936,
"grad_norm": 0.5062695145606995,
"learning_rate": 1.5437620733610757e-05,
"loss": 0.0241,
"step": 18140
},
{
"epoch": 18.692070030895984,
"grad_norm": 0.22402432560920715,
"learning_rate": 1.5387869098548713e-05,
"loss": 0.0276,
"step": 18150
},
{
"epoch": 18.70236869207003,
"grad_norm": 0.2398539036512375,
"learning_rate": 1.5338183177549763e-05,
"loss": 0.0277,
"step": 18160
},
{
"epoch": 18.71266735324408,
"grad_norm": 0.7876859903335571,
"learning_rate": 1.5288563064946793e-05,
"loss": 0.0251,
"step": 18170
},
{
"epoch": 18.722966014418127,
"grad_norm": 0.35239315032958984,
"learning_rate": 1.52390088549478e-05,
"loss": 0.0242,
"step": 18180
},
{
"epoch": 18.733264675592174,
"grad_norm": 0.2633828818798065,
"learning_rate": 1.5189520641635674e-05,
"loss": 0.0258,
"step": 18190
},
{
"epoch": 18.743563336766222,
"grad_norm": 0.28488120436668396,
"learning_rate": 1.514009851896795e-05,
"loss": 0.0273,
"step": 18200
},
{
"epoch": 18.753861997940266,
"grad_norm": 0.1860319823026657,
"learning_rate": 1.5090742580776723e-05,
"loss": 0.0227,
"step": 18210
},
{
"epoch": 18.764160659114314,
"grad_norm": 0.2818928062915802,
"learning_rate": 1.5041452920768423e-05,
"loss": 0.026,
"step": 18220
},
{
"epoch": 18.77445932028836,
"grad_norm": 0.9211439490318298,
"learning_rate": 1.4992229632523657e-05,
"loss": 0.0315,
"step": 18230
},
{
"epoch": 18.78475798146241,
"grad_norm": 0.29785358905792236,
"learning_rate": 1.4943072809497e-05,
"loss": 0.0287,
"step": 18240
},
{
"epoch": 18.795056642636457,
"grad_norm": 0.4559323489665985,
"learning_rate": 1.4893982545016866e-05,
"loss": 0.0257,
"step": 18250
},
{
"epoch": 18.805355303810504,
"grad_norm": 0.31199562549591064,
"learning_rate": 1.484495893228524e-05,
"loss": 0.0273,
"step": 18260
},
{
"epoch": 18.815653964984552,
"grad_norm": 0.23612628877162933,
"learning_rate": 1.4796002064377629e-05,
"loss": 0.0284,
"step": 18270
},
{
"epoch": 18.8259526261586,
"grad_norm": 0.3867475688457489,
"learning_rate": 1.4747112034242794e-05,
"loss": 0.0295,
"step": 18280
},
{
"epoch": 18.836251287332647,
"grad_norm": 0.2292865663766861,
"learning_rate": 1.4698288934702597e-05,
"loss": 0.0262,
"step": 18290
},
{
"epoch": 18.846549948506695,
"grad_norm": 0.2770872414112091,
"learning_rate": 1.4649532858451826e-05,
"loss": 0.0294,
"step": 18300
},
{
"epoch": 18.856848609680743,
"grad_norm": 0.2201216071844101,
"learning_rate": 1.4600843898058048e-05,
"loss": 0.0302,
"step": 18310
},
{
"epoch": 18.86714727085479,
"grad_norm": 0.26552289724349976,
"learning_rate": 1.4552222145961325e-05,
"loss": 0.0265,
"step": 18320
},
{
"epoch": 18.877445932028838,
"grad_norm": 0.22724245488643646,
"learning_rate": 1.4503667694474232e-05,
"loss": 0.0187,
"step": 18330
},
{
"epoch": 18.887744593202882,
"grad_norm": 0.18195529282093048,
"learning_rate": 1.4455180635781474e-05,
"loss": 0.0234,
"step": 18340
},
{
"epoch": 18.89804325437693,
"grad_norm": 0.2183389961719513,
"learning_rate": 1.4406761061939844e-05,
"loss": 0.0276,
"step": 18350
},
{
"epoch": 18.908341915550977,
"grad_norm": 0.3725976347923279,
"learning_rate": 1.4358409064878015e-05,
"loss": 0.0257,
"step": 18360
},
{
"epoch": 18.918640576725025,
"grad_norm": 0.30146896839141846,
"learning_rate": 1.4310124736396358e-05,
"loss": 0.0301,
"step": 18370
},
{
"epoch": 18.928939237899073,
"grad_norm": 0.21054169535636902,
"learning_rate": 1.4261908168166716e-05,
"loss": 0.0256,
"step": 18380
},
{
"epoch": 18.93923789907312,
"grad_norm": 0.22048494219779968,
"learning_rate": 1.4213759451732395e-05,
"loss": 0.0275,
"step": 18390
},
{
"epoch": 18.949536560247168,
"grad_norm": 0.15067796409130096,
"learning_rate": 1.416567867850776e-05,
"loss": 0.0282,
"step": 18400
},
{
"epoch": 18.959835221421216,
"grad_norm": 0.21609516441822052,
"learning_rate": 1.4117665939778257e-05,
"loss": 0.0299,
"step": 18410
},
{
"epoch": 18.970133882595263,
"grad_norm": 0.2782236933708191,
"learning_rate": 1.4069721326700131e-05,
"loss": 0.0291,
"step": 18420
},
{
"epoch": 18.98043254376931,
"grad_norm": 0.22176700830459595,
"learning_rate": 1.4021844930300315e-05,
"loss": 0.0305,
"step": 18430
},
{
"epoch": 18.99073120494336,
"grad_norm": 0.2350529432296753,
"learning_rate": 1.3974036841476146e-05,
"loss": 0.0236,
"step": 18440
},
{
"epoch": 19.001029866117406,
"grad_norm": 0.23732732236385345,
"learning_rate": 1.3926297150995404e-05,
"loss": 0.0271,
"step": 18450
},
{
"epoch": 19.011328527291454,
"grad_norm": 0.21267026662826538,
"learning_rate": 1.3878625949495883e-05,
"loss": 0.0247,
"step": 18460
},
{
"epoch": 19.021627188465498,
"grad_norm": 0.21228396892547607,
"learning_rate": 1.3831023327485416e-05,
"loss": 0.027,
"step": 18470
},
{
"epoch": 19.031925849639546,
"grad_norm": 0.3779662847518921,
"learning_rate": 1.3783489375341613e-05,
"loss": 0.0225,
"step": 18480
},
{
"epoch": 19.042224510813593,
"grad_norm": 0.19540618360042572,
"learning_rate": 1.373602418331173e-05,
"loss": 0.0201,
"step": 18490
},
{
"epoch": 19.05252317198764,
"grad_norm": 0.1852959394454956,
"learning_rate": 1.3688627841512402e-05,
"loss": 0.0285,
"step": 18500
},
{
"epoch": 19.06282183316169,
"grad_norm": 0.12700384855270386,
"learning_rate": 1.3641300439929666e-05,
"loss": 0.0256,
"step": 18510
},
{
"epoch": 19.073120494335736,
"grad_norm": 0.2065911442041397,
"learning_rate": 1.3594042068418555e-05,
"loss": 0.0238,
"step": 18520
},
{
"epoch": 19.083419155509784,
"grad_norm": 0.3038773238658905,
"learning_rate": 1.3546852816703109e-05,
"loss": 0.0303,
"step": 18530
},
{
"epoch": 19.09371781668383,
"grad_norm": 0.21155600249767303,
"learning_rate": 1.3499732774376118e-05,
"loss": 0.0243,
"step": 18540
},
{
"epoch": 19.10401647785788,
"grad_norm": 0.15863338112831116,
"learning_rate": 1.345268203089899e-05,
"loss": 0.0316,
"step": 18550
},
{
"epoch": 19.114315139031927,
"grad_norm": 0.20425492525100708,
"learning_rate": 1.3405700675601506e-05,
"loss": 0.0298,
"step": 18560
},
{
"epoch": 19.124613800205974,
"grad_norm": 0.3093377351760864,
"learning_rate": 1.3358788797681805e-05,
"loss": 0.0296,
"step": 18570
},
{
"epoch": 19.134912461380022,
"grad_norm": 0.3627874255180359,
"learning_rate": 1.3311946486206022e-05,
"loss": 0.0285,
"step": 18580
},
{
"epoch": 19.145211122554066,
"grad_norm": 0.5787222385406494,
"learning_rate": 1.326517383010827e-05,
"loss": 0.0274,
"step": 18590
},
{
"epoch": 19.155509783728114,
"grad_norm": 0.16981856524944305,
"learning_rate": 1.3218470918190401e-05,
"loss": 0.025,
"step": 18600
},
{
"epoch": 19.16580844490216,
"grad_norm": 0.1558324694633484,
"learning_rate": 1.3171837839121837e-05,
"loss": 0.0308,
"step": 18610
},
{
"epoch": 19.17610710607621,
"grad_norm": 0.19524988532066345,
"learning_rate": 1.3125274681439436e-05,
"loss": 0.0236,
"step": 18620
},
{
"epoch": 19.186405767250257,
"grad_norm": 0.4640062749385834,
"learning_rate": 1.3078781533547303e-05,
"loss": 0.0299,
"step": 18630
},
{
"epoch": 19.196704428424304,
"grad_norm": 0.26007869839668274,
"learning_rate": 1.3032358483716622e-05,
"loss": 0.032,
"step": 18640
},
{
"epoch": 19.207003089598352,
"grad_norm": 0.679685115814209,
"learning_rate": 1.2986005620085456e-05,
"loss": 0.0223,
"step": 18650
},
{
"epoch": 19.2173017507724,
"grad_norm": 0.1843012571334839,
"learning_rate": 1.2939723030658695e-05,
"loss": 0.0266,
"step": 18660
},
{
"epoch": 19.227600411946447,
"grad_norm": 0.18425314128398895,
"learning_rate": 1.2893510803307718e-05,
"loss": 0.0246,
"step": 18670
},
{
"epoch": 19.237899073120495,
"grad_norm": 0.26258182525634766,
"learning_rate": 1.2847369025770361e-05,
"loss": 0.032,
"step": 18680
},
{
"epoch": 19.248197734294543,
"grad_norm": 0.22664831578731537,
"learning_rate": 1.2801297785650706e-05,
"loss": 0.0211,
"step": 18690
},
{
"epoch": 19.25849639546859,
"grad_norm": 0.24430438876152039,
"learning_rate": 1.2755297170418912e-05,
"loss": 0.0284,
"step": 18700
},
{
"epoch": 19.268795056642638,
"grad_norm": 0.24674183130264282,
"learning_rate": 1.2709367267411004e-05,
"loss": 0.0299,
"step": 18710
},
{
"epoch": 19.279093717816682,
"grad_norm": 0.24077735841274261,
"learning_rate": 1.2663508163828857e-05,
"loss": 0.0316,
"step": 18720
},
{
"epoch": 19.28939237899073,
"grad_norm": 0.2905300259590149,
"learning_rate": 1.2617719946739814e-05,
"loss": 0.0245,
"step": 18730
},
{
"epoch": 19.299691040164777,
"grad_norm": 0.2111150026321411,
"learning_rate": 1.2572002703076708e-05,
"loss": 0.0232,
"step": 18740
},
{
"epoch": 19.309989701338825,
"grad_norm": 0.28864920139312744,
"learning_rate": 1.2526356519637588e-05,
"loss": 0.024,
"step": 18750
},
{
"epoch": 19.320288362512873,
"grad_norm": 0.1956787258386612,
"learning_rate": 1.248078148308563e-05,
"loss": 0.0257,
"step": 18760
},
{
"epoch": 19.33058702368692,
"grad_norm": 0.44175270199775696,
"learning_rate": 1.2435277679948842e-05,
"loss": 0.0242,
"step": 18770
},
{
"epoch": 19.340885684860968,
"grad_norm": 0.18450774252414703,
"learning_rate": 1.2389845196620121e-05,
"loss": 0.0257,
"step": 18780
},
{
"epoch": 19.351184346035016,
"grad_norm": 0.17851541936397552,
"learning_rate": 1.234448411935683e-05,
"loss": 0.0289,
"step": 18790
},
{
"epoch": 19.361483007209063,
"grad_norm": 0.3439672589302063,
"learning_rate": 1.2299194534280844e-05,
"loss": 0.0221,
"step": 18800
},
{
"epoch": 19.37178166838311,
"grad_norm": 0.2443213164806366,
"learning_rate": 1.2253976527378274e-05,
"loss": 0.0242,
"step": 18810
},
{
"epoch": 19.38208032955716,
"grad_norm": 0.23600997030735016,
"learning_rate": 1.2208830184499347e-05,
"loss": 0.0305,
"step": 18820
},
{
"epoch": 19.392378990731206,
"grad_norm": 0.3016259968280792,
"learning_rate": 1.2163755591358184e-05,
"loss": 0.0273,
"step": 18830
},
{
"epoch": 19.402677651905254,
"grad_norm": 0.306363970041275,
"learning_rate": 1.211875283353277e-05,
"loss": 0.0259,
"step": 18840
},
{
"epoch": 19.412976313079298,
"grad_norm": 0.23701894283294678,
"learning_rate": 1.2073821996464613e-05,
"loss": 0.0271,
"step": 18850
},
{
"epoch": 19.423274974253346,
"grad_norm": 0.3945840895175934,
"learning_rate": 1.2028963165458728e-05,
"loss": 0.022,
"step": 18860
},
{
"epoch": 19.433573635427393,
"grad_norm": 0.2171708345413208,
"learning_rate": 1.1984176425683408e-05,
"loss": 0.0234,
"step": 18870
},
{
"epoch": 19.44387229660144,
"grad_norm": 0.252113938331604,
"learning_rate": 1.1939461862170086e-05,
"loss": 0.0245,
"step": 18880
},
{
"epoch": 19.45417095777549,
"grad_norm": 0.5461235046386719,
"learning_rate": 1.1894819559813108e-05,
"loss": 0.0247,
"step": 18890
},
{
"epoch": 19.464469618949536,
"grad_norm": 0.23864491283893585,
"learning_rate": 1.1850249603369723e-05,
"loss": 0.024,
"step": 18900
},
{
"epoch": 19.474768280123584,
"grad_norm": 0.21564757823944092,
"learning_rate": 1.1805752077459725e-05,
"loss": 0.0308,
"step": 18910
},
{
"epoch": 19.48506694129763,
"grad_norm": 0.31438037753105164,
"learning_rate": 1.1761327066565452e-05,
"loss": 0.0282,
"step": 18920
},
{
"epoch": 19.49536560247168,
"grad_norm": 0.20203201472759247,
"learning_rate": 1.1716974655031554e-05,
"loss": 0.0246,
"step": 18930
},
{
"epoch": 19.505664263645727,
"grad_norm": 0.19842277467250824,
"learning_rate": 1.1672694927064858e-05,
"loss": 0.026,
"step": 18940
},
{
"epoch": 19.515962924819775,
"grad_norm": 0.23037093877792358,
"learning_rate": 1.162848796673413e-05,
"loss": 0.0327,
"step": 18950
},
{
"epoch": 19.526261585993822,
"grad_norm": 0.366129070520401,
"learning_rate": 1.1584353857970088e-05,
"loss": 0.0264,
"step": 18960
},
{
"epoch": 19.53656024716787,
"grad_norm": 0.31818097829818726,
"learning_rate": 1.154029268456504e-05,
"loss": 0.0243,
"step": 18970
},
{
"epoch": 19.546858908341914,
"grad_norm": 0.5746592879295349,
"learning_rate": 1.1496304530172863e-05,
"loss": 0.0235,
"step": 18980
},
{
"epoch": 19.55715756951596,
"grad_norm": 0.26987478137016296,
"learning_rate": 1.1452389478308806e-05,
"loss": 0.0215,
"step": 18990
},
{
"epoch": 19.56745623069001,
"grad_norm": 0.2493455708026886,
"learning_rate": 1.1408547612349318e-05,
"loss": 0.0239,
"step": 19000
},
{
"epoch": 19.577754891864057,
"grad_norm": 0.21429450809955597,
"learning_rate": 1.1364779015531873e-05,
"loss": 0.0206,
"step": 19010
},
{
"epoch": 19.588053553038105,
"grad_norm": 0.26744788885116577,
"learning_rate": 1.1321083770954871e-05,
"loss": 0.0274,
"step": 19020
},
{
"epoch": 19.598352214212152,
"grad_norm": 0.28107255697250366,
"learning_rate": 1.1277461961577446e-05,
"loss": 0.0308,
"step": 19030
},
{
"epoch": 19.6086508753862,
"grad_norm": 0.22350192070007324,
"learning_rate": 1.1233913670219287e-05,
"loss": 0.029,
"step": 19040
},
{
"epoch": 19.618949536560248,
"grad_norm": 0.39531412720680237,
"learning_rate": 1.1190438979560536e-05,
"loss": 0.0351,
"step": 19050
},
{
"epoch": 19.629248197734295,
"grad_norm": 0.19649939239025116,
"learning_rate": 1.1147037972141545e-05,
"loss": 0.0242,
"step": 19060
},
{
"epoch": 19.639546858908343,
"grad_norm": 0.1779136210680008,
"learning_rate": 1.1103710730362821e-05,
"loss": 0.0259,
"step": 19070
},
{
"epoch": 19.64984552008239,
"grad_norm": 0.2700199782848358,
"learning_rate": 1.1060457336484803e-05,
"loss": 0.0237,
"step": 19080
},
{
"epoch": 19.660144181256438,
"grad_norm": 0.19958068430423737,
"learning_rate": 1.1017277872627719e-05,
"loss": 0.0278,
"step": 19090
},
{
"epoch": 19.670442842430482,
"grad_norm": 0.24603743851184845,
"learning_rate": 1.0974172420771444e-05,
"loss": 0.0255,
"step": 19100
},
{
"epoch": 19.68074150360453,
"grad_norm": 0.25125035643577576,
"learning_rate": 1.0931141062755346e-05,
"loss": 0.0225,
"step": 19110
},
{
"epoch": 19.691040164778578,
"grad_norm": 0.24162299931049347,
"learning_rate": 1.0888183880278074e-05,
"loss": 0.0235,
"step": 19120
},
{
"epoch": 19.701338825952625,
"grad_norm": 0.23117221891880035,
"learning_rate": 1.0845300954897492e-05,
"loss": 0.0251,
"step": 19130
},
{
"epoch": 19.711637487126673,
"grad_norm": 0.4146316945552826,
"learning_rate": 1.0802492368030471e-05,
"loss": 0.0257,
"step": 19140
},
{
"epoch": 19.72193614830072,
"grad_norm": 0.3388553261756897,
"learning_rate": 1.0759758200952729e-05,
"loss": 0.0265,
"step": 19150
},
{
"epoch": 19.732234809474768,
"grad_norm": 0.3048183023929596,
"learning_rate": 1.0717098534798714e-05,
"loss": 0.0277,
"step": 19160
},
{
"epoch": 19.742533470648816,
"grad_norm": 0.7091302871704102,
"learning_rate": 1.0674513450561429e-05,
"loss": 0.0245,
"step": 19170
},
{
"epoch": 19.752832131822863,
"grad_norm": 0.2630675435066223,
"learning_rate": 1.0632003029092235e-05,
"loss": 0.032,
"step": 19180
},
{
"epoch": 19.76313079299691,
"grad_norm": 0.2844875752925873,
"learning_rate": 1.0589567351100782e-05,
"loss": 0.0303,
"step": 19190
},
{
"epoch": 19.77342945417096,
"grad_norm": 0.24576683342456818,
"learning_rate": 1.0547206497154798e-05,
"loss": 0.0244,
"step": 19200
},
{
"epoch": 19.783728115345006,
"grad_norm": 0.19090470671653748,
"learning_rate": 1.0504920547679958e-05,
"loss": 0.0283,
"step": 19210
},
{
"epoch": 19.794026776519054,
"grad_norm": 0.35600224137306213,
"learning_rate": 1.0462709582959718e-05,
"loss": 0.0212,
"step": 19220
},
{
"epoch": 19.8043254376931,
"grad_norm": 0.31343191862106323,
"learning_rate": 1.0420573683135187e-05,
"loss": 0.0216,
"step": 19230
},
{
"epoch": 19.814624098867146,
"grad_norm": 0.2817479074001312,
"learning_rate": 1.037851292820491e-05,
"loss": 0.0294,
"step": 19240
},
{
"epoch": 19.824922760041193,
"grad_norm": 0.24477481842041016,
"learning_rate": 1.0336527398024804e-05,
"loss": 0.0187,
"step": 19250
},
{
"epoch": 19.83522142121524,
"grad_norm": 0.2589147388935089,
"learning_rate": 1.0294617172307963e-05,
"loss": 0.0262,
"step": 19260
},
{
"epoch": 19.84552008238929,
"grad_norm": 0.5249900817871094,
"learning_rate": 1.02527823306245e-05,
"loss": 0.0271,
"step": 19270
},
{
"epoch": 19.855818743563336,
"grad_norm": 0.24145112931728363,
"learning_rate": 1.02110229524014e-05,
"loss": 0.0231,
"step": 19280
},
{
"epoch": 19.866117404737384,
"grad_norm": 0.28619223833084106,
"learning_rate": 1.01693391169224e-05,
"loss": 0.0278,
"step": 19290
},
{
"epoch": 19.87641606591143,
"grad_norm": 0.23199136555194855,
"learning_rate": 1.0127730903327765e-05,
"loss": 0.0277,
"step": 19300
},
{
"epoch": 19.88671472708548,
"grad_norm": 0.3432420492172241,
"learning_rate": 1.0086198390614227e-05,
"loss": 0.0266,
"step": 19310
},
{
"epoch": 19.897013388259527,
"grad_norm": 0.21095818281173706,
"learning_rate": 1.0044741657634782e-05,
"loss": 0.024,
"step": 19320
},
{
"epoch": 19.907312049433575,
"grad_norm": 0.1991989016532898,
"learning_rate": 1.0003360783098548e-05,
"loss": 0.0224,
"step": 19330
},
{
"epoch": 19.917610710607622,
"grad_norm": 0.2295091152191162,
"learning_rate": 9.962055845570622e-06,
"loss": 0.0277,
"step": 19340
},
{
"epoch": 19.92790937178167,
"grad_norm": 0.39027097821235657,
"learning_rate": 9.920826923471943e-06,
"loss": 0.0215,
"step": 19350
},
{
"epoch": 19.938208032955714,
"grad_norm": 0.2114678919315338,
"learning_rate": 9.879674095079083e-06,
"loss": 0.0292,
"step": 19360
},
{
"epoch": 19.94850669412976,
"grad_norm": 0.22323083877563477,
"learning_rate": 9.838597438524182e-06,
"loss": 0.0236,
"step": 19370
},
{
"epoch": 19.95880535530381,
"grad_norm": 0.5763716101646423,
"learning_rate": 9.797597031794763e-06,
"loss": 0.0275,
"step": 19380
},
{
"epoch": 19.969104016477857,
"grad_norm": 0.2158919721841812,
"learning_rate": 9.75667295273357e-06,
"loss": 0.0299,
"step": 19390
},
{
"epoch": 19.979402677651905,
"grad_norm": 0.25896155834198,
"learning_rate": 9.715825279038433e-06,
"loss": 0.0226,
"step": 19400
},
{
"epoch": 19.989701338825952,
"grad_norm": 0.19433017075061798,
"learning_rate": 9.675054088262125e-06,
"loss": 0.0188,
"step": 19410
},
{
"epoch": 20.0,
"grad_norm": 0.4307090640068054,
"learning_rate": 9.634359457812192e-06,
"loss": 0.0287,
"step": 19420
},
{
"epoch": 20.010298661174048,
"grad_norm": 0.2535577714443207,
"learning_rate": 9.59374146495085e-06,
"loss": 0.0283,
"step": 19430
},
{
"epoch": 20.020597322348095,
"grad_norm": 0.2701593041419983,
"learning_rate": 9.553200186794809e-06,
"loss": 0.0249,
"step": 19440
},
{
"epoch": 20.030895983522143,
"grad_norm": 0.42833298444747925,
"learning_rate": 9.51273570031508e-06,
"loss": 0.0305,
"step": 19450
},
{
"epoch": 20.04119464469619,
"grad_norm": 0.26018840074539185,
"learning_rate": 9.472348082336973e-06,
"loss": 0.0229,
"step": 19460
},
{
"epoch": 20.05149330587024,
"grad_norm": 0.1944461166858673,
"learning_rate": 9.432037409539768e-06,
"loss": 0.0288,
"step": 19470
},
{
"epoch": 20.061791967044286,
"grad_norm": 0.2379932403564453,
"learning_rate": 9.391803758456696e-06,
"loss": 0.0283,
"step": 19480
},
{
"epoch": 20.07209062821833,
"grad_norm": 0.24124853312969208,
"learning_rate": 9.351647205474762e-06,
"loss": 0.0273,
"step": 19490
},
{
"epoch": 20.082389289392378,
"grad_norm": 0.25228720903396606,
"learning_rate": 9.311567826834593e-06,
"loss": 0.0257,
"step": 19500
},
{
"epoch": 20.092687950566425,
"grad_norm": 0.16082525253295898,
"learning_rate": 9.271565698630246e-06,
"loss": 0.024,
"step": 19510
},
{
"epoch": 20.102986611740473,
"grad_norm": 0.132386714220047,
"learning_rate": 9.231640896809202e-06,
"loss": 0.0237,
"step": 19520
},
{
"epoch": 20.11328527291452,
"grad_norm": 0.2096797674894333,
"learning_rate": 9.191793497172041e-06,
"loss": 0.0261,
"step": 19530
},
{
"epoch": 20.12358393408857,
"grad_norm": 0.3422824740409851,
"learning_rate": 9.152023575372443e-06,
"loss": 0.0252,
"step": 19540
},
{
"epoch": 20.133882595262616,
"grad_norm": 0.2562166750431061,
"learning_rate": 9.112331206916968e-06,
"loss": 0.0271,
"step": 19550
},
{
"epoch": 20.144181256436664,
"grad_norm": 0.40402811765670776,
"learning_rate": 9.072716467164965e-06,
"loss": 0.0274,
"step": 19560
},
{
"epoch": 20.15447991761071,
"grad_norm": 0.16052035987377167,
"learning_rate": 9.033179431328326e-06,
"loss": 0.0283,
"step": 19570
},
{
"epoch": 20.16477857878476,
"grad_norm": 0.24665406346321106,
"learning_rate": 8.993720174471509e-06,
"loss": 0.0272,
"step": 19580
},
{
"epoch": 20.175077239958807,
"grad_norm": 0.27554023265838623,
"learning_rate": 8.954338771511234e-06,
"loss": 0.0253,
"step": 19590
},
{
"epoch": 20.185375901132854,
"grad_norm": 0.40893182158470154,
"learning_rate": 8.915035297216434e-06,
"loss": 0.0292,
"step": 19600
},
{
"epoch": 20.195674562306902,
"grad_norm": 0.4667159914970398,
"learning_rate": 8.875809826208082e-06,
"loss": 0.027,
"step": 19610
},
{
"epoch": 20.205973223480946,
"grad_norm": 0.14994700253009796,
"learning_rate": 8.83666243295908e-06,
"loss": 0.023,
"step": 19620
},
{
"epoch": 20.216271884654994,
"grad_norm": 0.28246450424194336,
"learning_rate": 8.797593191794024e-06,
"loss": 0.0266,
"step": 19630
},
{
"epoch": 20.22657054582904,
"grad_norm": 0.2889540195465088,
"learning_rate": 8.758602176889236e-06,
"loss": 0.0227,
"step": 19640
},
{
"epoch": 20.23686920700309,
"grad_norm": 0.3575446307659149,
"learning_rate": 8.719689462272417e-06,
"loss": 0.0267,
"step": 19650
},
{
"epoch": 20.247167868177137,
"grad_norm": 1.8250188827514648,
"learning_rate": 8.680855121822673e-06,
"loss": 0.0244,
"step": 19660
},
{
"epoch": 20.257466529351184,
"grad_norm": 0.3060987889766693,
"learning_rate": 8.642099229270284e-06,
"loss": 0.027,
"step": 19670
},
{
"epoch": 20.267765190525232,
"grad_norm": 0.3836202919483185,
"learning_rate": 8.603421858196615e-06,
"loss": 0.0265,
"step": 19680
},
{
"epoch": 20.27806385169928,
"grad_norm": 0.22838439047336578,
"learning_rate": 8.56482308203388e-06,
"loss": 0.0226,
"step": 19690
},
{
"epoch": 20.288362512873327,
"grad_norm": 0.33937060832977295,
"learning_rate": 8.526302974065193e-06,
"loss": 0.0242,
"step": 19700
},
{
"epoch": 20.298661174047375,
"grad_norm": 0.17305344343185425,
"learning_rate": 8.487861607424191e-06,
"loss": 0.032,
"step": 19710
},
{
"epoch": 20.308959835221422,
"grad_norm": 0.2143699824810028,
"learning_rate": 8.449499055095089e-06,
"loss": 0.0319,
"step": 19720
},
{
"epoch": 20.31925849639547,
"grad_norm": 0.20306874811649323,
"learning_rate": 8.41121538991243e-06,
"loss": 0.024,
"step": 19730
},
{
"epoch": 20.329557157569518,
"grad_norm": 0.2488834708929062,
"learning_rate": 8.373010684561022e-06,
"loss": 0.0234,
"step": 19740
},
{
"epoch": 20.339855818743562,
"grad_norm": 0.20350010693073273,
"learning_rate": 8.334885011575694e-06,
"loss": 0.0261,
"step": 19750
},
{
"epoch": 20.35015447991761,
"grad_norm": 0.3989735543727875,
"learning_rate": 8.296838443341314e-06,
"loss": 0.0261,
"step": 19760
},
{
"epoch": 20.360453141091657,
"grad_norm": 0.2657710313796997,
"learning_rate": 8.258871052092476e-06,
"loss": 0.024,
"step": 19770
},
{
"epoch": 20.370751802265705,
"grad_norm": 0.9721512794494629,
"learning_rate": 8.220982909913504e-06,
"loss": 0.0268,
"step": 19780
},
{
"epoch": 20.381050463439752,
"grad_norm": 0.17601211369037628,
"learning_rate": 8.183174088738248e-06,
"loss": 0.0286,
"step": 19790
},
{
"epoch": 20.3913491246138,
"grad_norm": 0.1754244714975357,
"learning_rate": 8.145444660349966e-06,
"loss": 0.0255,
"step": 19800
},
{
"epoch": 20.401647785787848,
"grad_norm": 0.3604333996772766,
"learning_rate": 8.107794696381155e-06,
"loss": 0.0288,
"step": 19810
},
{
"epoch": 20.411946446961895,
"grad_norm": 0.23770973086357117,
"learning_rate": 8.07022426831347e-06,
"loss": 0.0262,
"step": 19820
},
{
"epoch": 20.422245108135943,
"grad_norm": 0.2746269404888153,
"learning_rate": 8.032733447477552e-06,
"loss": 0.0225,
"step": 19830
},
{
"epoch": 20.43254376930999,
"grad_norm": 0.22414709627628326,
"learning_rate": 7.995322305052905e-06,
"loss": 0.0344,
"step": 19840
},
{
"epoch": 20.44284243048404,
"grad_norm": 0.2225753217935562,
"learning_rate": 7.95799091206776e-06,
"loss": 0.0309,
"step": 19850
},
{
"epoch": 20.453141091658086,
"grad_norm": 0.22550413012504578,
"learning_rate": 7.920739339398908e-06,
"loss": 0.0249,
"step": 19860
},
{
"epoch": 20.46343975283213,
"grad_norm": 0.1763533353805542,
"learning_rate": 7.883567657771623e-06,
"loss": 0.0294,
"step": 19870
},
{
"epoch": 20.473738414006178,
"grad_norm": 0.22323361039161682,
"learning_rate": 7.8464759377595e-06,
"loss": 0.031,
"step": 19880
},
{
"epoch": 20.484037075180225,
"grad_norm": 0.3230327069759369,
"learning_rate": 7.809464249784309e-06,
"loss": 0.0227,
"step": 19890
},
{
"epoch": 20.494335736354273,
"grad_norm": 0.17077143490314484,
"learning_rate": 7.772532664115872e-06,
"loss": 0.0203,
"step": 19900
},
{
"epoch": 20.50463439752832,
"grad_norm": 0.2937758266925812,
"learning_rate": 7.73568125087195e-06,
"loss": 0.0274,
"step": 19910
},
{
"epoch": 20.51493305870237,
"grad_norm": 0.2762782871723175,
"learning_rate": 7.698910080018046e-06,
"loss": 0.0182,
"step": 19920
},
{
"epoch": 20.525231719876416,
"grad_norm": 0.37982815504074097,
"learning_rate": 7.662219221367356e-06,
"loss": 0.0207,
"step": 19930
},
{
"epoch": 20.535530381050464,
"grad_norm": 0.18541328608989716,
"learning_rate": 7.625608744580587e-06,
"loss": 0.0187,
"step": 19940
},
{
"epoch": 20.54582904222451,
"grad_norm": 0.24123801290988922,
"learning_rate": 7.5890787191658265e-06,
"loss": 0.0232,
"step": 19950
},
{
"epoch": 20.55612770339856,
"grad_norm": 0.2651348114013672,
"learning_rate": 7.5526292144784235e-06,
"loss": 0.0267,
"step": 19960
},
{
"epoch": 20.566426364572607,
"grad_norm": 0.24495820701122284,
"learning_rate": 7.516260299720862e-06,
"loss": 0.0249,
"step": 19970
},
{
"epoch": 20.576725025746654,
"grad_norm": 0.3241025507450104,
"learning_rate": 7.47997204394259e-06,
"loss": 0.0233,
"step": 19980
},
{
"epoch": 20.587023686920702,
"grad_norm": 0.18341611325740814,
"learning_rate": 7.443764516039947e-06,
"loss": 0.024,
"step": 19990
},
{
"epoch": 20.597322348094746,
"grad_norm": 0.2776510715484619,
"learning_rate": 7.407637784755983e-06,
"loss": 0.024,
"step": 20000
},
{
"epoch": 20.607621009268794,
"grad_norm": 0.2483317106962204,
"learning_rate": 7.37159191868037e-06,
"loss": 0.0259,
"step": 20010
},
{
"epoch": 20.61791967044284,
"grad_norm": 0.35903045535087585,
"learning_rate": 7.3356269862492276e-06,
"loss": 0.0298,
"step": 20020
},
{
"epoch": 20.62821833161689,
"grad_norm": 0.1740478128194809,
"learning_rate": 7.299743055745051e-06,
"loss": 0.0205,
"step": 20030
},
{
"epoch": 20.638516992790937,
"grad_norm": 0.289324551820755,
"learning_rate": 7.263940195296487e-06,
"loss": 0.0261,
"step": 20040
},
{
"epoch": 20.648815653964984,
"grad_norm": 0.1669989377260208,
"learning_rate": 7.228218472878323e-06,
"loss": 0.0246,
"step": 20050
},
{
"epoch": 20.659114315139032,
"grad_norm": 0.21035151183605194,
"learning_rate": 7.192577956311264e-06,
"loss": 0.0252,
"step": 20060
},
{
"epoch": 20.66941297631308,
"grad_norm": 0.3138667941093445,
"learning_rate": 7.157018713261859e-06,
"loss": 0.0257,
"step": 20070
},
{
"epoch": 20.679711637487127,
"grad_norm": 0.26812276244163513,
"learning_rate": 7.121540811242339e-06,
"loss": 0.0265,
"step": 20080
},
{
"epoch": 20.690010298661175,
"grad_norm": 0.17188164591789246,
"learning_rate": 7.086144317610521e-06,
"loss": 0.0216,
"step": 20090
},
{
"epoch": 20.700308959835223,
"grad_norm": 0.2384611815214157,
"learning_rate": 7.050829299569622e-06,
"loss": 0.0211,
"step": 20100
},
{
"epoch": 20.71060762100927,
"grad_norm": 0.23628003895282745,
"learning_rate": 7.015595824168214e-06,
"loss": 0.0229,
"step": 20110
},
{
"epoch": 20.720906282183318,
"grad_norm": 0.41519197821617126,
"learning_rate": 6.9804439583000255e-06,
"loss": 0.0325,
"step": 20120
},
{
"epoch": 20.731204943357362,
"grad_norm": 0.29381370544433594,
"learning_rate": 6.945373768703856e-06,
"loss": 0.0253,
"step": 20130
},
{
"epoch": 20.74150360453141,
"grad_norm": 0.3895536959171295,
"learning_rate": 6.910385321963431e-06,
"loss": 0.0247,
"step": 20140
},
{
"epoch": 20.751802265705457,
"grad_norm": 0.26379773020744324,
"learning_rate": 6.875478684507297e-06,
"loss": 0.0231,
"step": 20150
},
{
"epoch": 20.762100926879505,
"grad_norm": 0.20624184608459473,
"learning_rate": 6.840653922608636e-06,
"loss": 0.0327,
"step": 20160
},
{
"epoch": 20.772399588053553,
"grad_norm": 0.3396904170513153,
"learning_rate": 6.805911102385221e-06,
"loss": 0.0271,
"step": 20170
},
{
"epoch": 20.7826982492276,
"grad_norm": 0.2959730625152588,
"learning_rate": 6.771250289799236e-06,
"loss": 0.0206,
"step": 20180
},
{
"epoch": 20.792996910401648,
"grad_norm": 0.25411221385002136,
"learning_rate": 6.736671550657181e-06,
"loss": 0.0267,
"step": 20190
},
{
"epoch": 20.803295571575696,
"grad_norm": 0.5632199645042419,
"learning_rate": 6.702174950609708e-06,
"loss": 0.0269,
"step": 20200
},
{
"epoch": 20.813594232749743,
"grad_norm": 0.23322194814682007,
"learning_rate": 6.667760555151559e-06,
"loss": 0.0272,
"step": 20210
},
{
"epoch": 20.82389289392379,
"grad_norm": 0.8834056854248047,
"learning_rate": 6.6334284296213524e-06,
"loss": 0.0268,
"step": 20220
},
{
"epoch": 20.83419155509784,
"grad_norm": 0.4093743860721588,
"learning_rate": 6.599178639201542e-06,
"loss": 0.0177,
"step": 20230
},
{
"epoch": 20.844490216271886,
"grad_norm": 0.3726256489753723,
"learning_rate": 6.565011248918279e-06,
"loss": 0.0283,
"step": 20240
},
{
"epoch": 20.854788877445934,
"grad_norm": 0.24083060026168823,
"learning_rate": 6.530926323641207e-06,
"loss": 0.0226,
"step": 20250
},
{
"epoch": 20.865087538619978,
"grad_norm": 0.2924061119556427,
"learning_rate": 6.496923928083493e-06,
"loss": 0.0201,
"step": 20260
},
{
"epoch": 20.875386199794026,
"grad_norm": 0.26381853222846985,
"learning_rate": 6.463004126801531e-06,
"loss": 0.0262,
"step": 20270
},
{
"epoch": 20.885684860968073,
"grad_norm": 0.29354435205459595,
"learning_rate": 6.429166984194945e-06,
"loss": 0.0251,
"step": 20280
},
{
"epoch": 20.89598352214212,
"grad_norm": 0.267516553401947,
"learning_rate": 6.395412564506426e-06,
"loss": 0.0178,
"step": 20290
},
{
"epoch": 20.90628218331617,
"grad_norm": 0.6406127214431763,
"learning_rate": 6.361740931821608e-06,
"loss": 0.0238,
"step": 20300
},
{
"epoch": 20.916580844490216,
"grad_norm": 0.38734951615333557,
"learning_rate": 6.3281521500689e-06,
"loss": 0.0268,
"step": 20310
},
{
"epoch": 20.926879505664264,
"grad_norm": 0.148049458861351,
"learning_rate": 6.2946462830195005e-06,
"loss": 0.0213,
"step": 20320
},
{
"epoch": 20.93717816683831,
"grad_norm": 0.15365584194660187,
"learning_rate": 6.261223394287097e-06,
"loss": 0.0263,
"step": 20330
},
{
"epoch": 20.94747682801236,
"grad_norm": 0.45060428977012634,
"learning_rate": 6.22788354732789e-06,
"loss": 0.0254,
"step": 20340
},
{
"epoch": 20.957775489186407,
"grad_norm": 0.35524290800094604,
"learning_rate": 6.194626805440407e-06,
"loss": 0.0288,
"step": 20350
},
{
"epoch": 20.968074150360454,
"grad_norm": 0.20348112285137177,
"learning_rate": 6.1614532317654015e-06,
"loss": 0.0291,
"step": 20360
},
{
"epoch": 20.978372811534502,
"grad_norm": 0.4701661467552185,
"learning_rate": 6.128362889285671e-06,
"loss": 0.0292,
"step": 20370
},
{
"epoch": 20.988671472708546,
"grad_norm": 0.34608376026153564,
"learning_rate": 6.095355840826089e-06,
"loss": 0.0262,
"step": 20380
},
{
"epoch": 20.998970133882594,
"grad_norm": 0.15542836487293243,
"learning_rate": 6.062432149053293e-06,
"loss": 0.0266,
"step": 20390
},
{
"epoch": 21.00926879505664,
"grad_norm": 0.15955375134944916,
"learning_rate": 6.029591876475721e-06,
"loss": 0.0277,
"step": 20400
},
{
"epoch": 21.01956745623069,
"grad_norm": 0.3163027763366699,
"learning_rate": 5.996835085443403e-06,
"loss": 0.0233,
"step": 20410
},
{
"epoch": 21.029866117404737,
"grad_norm": 0.33183568716049194,
"learning_rate": 5.964161838147897e-06,
"loss": 0.025,
"step": 20420
},
{
"epoch": 21.040164778578784,
"grad_norm": 0.192501500248909,
"learning_rate": 5.931572196622104e-06,
"loss": 0.0239,
"step": 20430
},
{
"epoch": 21.050463439752832,
"grad_norm": 0.2504754066467285,
"learning_rate": 5.899066222740257e-06,
"loss": 0.0202,
"step": 20440
},
{
"epoch": 21.06076210092688,
"grad_norm": 0.1719776839017868,
"learning_rate": 5.866643978217667e-06,
"loss": 0.0266,
"step": 20450
},
{
"epoch": 21.071060762100927,
"grad_norm": 0.2612786293029785,
"learning_rate": 5.834305524610728e-06,
"loss": 0.0232,
"step": 20460
},
{
"epoch": 21.081359423274975,
"grad_norm": 0.2384280562400818,
"learning_rate": 5.802050923316738e-06,
"loss": 0.0252,
"step": 20470
},
{
"epoch": 21.091658084449023,
"grad_norm": 0.36142706871032715,
"learning_rate": 5.769880235573788e-06,
"loss": 0.0235,
"step": 20480
},
{
"epoch": 21.10195674562307,
"grad_norm": 0.19037047028541565,
"learning_rate": 5.737793522460633e-06,
"loss": 0.0234,
"step": 20490
},
{
"epoch": 21.112255406797118,
"grad_norm": 0.30957984924316406,
"learning_rate": 5.705790844896658e-06,
"loss": 0.0232,
"step": 20500
},
{
"epoch": 21.122554067971162,
"grad_norm": 1.0624467134475708,
"learning_rate": 5.673872263641622e-06,
"loss": 0.0283,
"step": 20510
},
{
"epoch": 21.13285272914521,
"grad_norm": 0.43074896931648254,
"learning_rate": 5.642037839295666e-06,
"loss": 0.0267,
"step": 20520
},
{
"epoch": 21.143151390319257,
"grad_norm": 0.27735230326652527,
"learning_rate": 5.6102876322991495e-06,
"loss": 0.0251,
"step": 20530
},
{
"epoch": 21.153450051493305,
"grad_norm": 0.21708090603351593,
"learning_rate": 5.5786217029325295e-06,
"loss": 0.0201,
"step": 20540
},
{
"epoch": 21.163748712667353,
"grad_norm": 0.2916223406791687,
"learning_rate": 5.547040111316232e-06,
"loss": 0.0262,
"step": 20550
},
{
"epoch": 21.1740473738414,
"grad_norm": 0.2554883360862732,
"learning_rate": 5.515542917410627e-06,
"loss": 0.0267,
"step": 20560
},
{
"epoch": 21.184346035015448,
"grad_norm": 0.26386693120002747,
"learning_rate": 5.484130181015773e-06,
"loss": 0.0227,
"step": 20570
},
{
"epoch": 21.194644696189496,
"grad_norm": 0.3534632623195648,
"learning_rate": 5.4528019617714195e-06,
"loss": 0.0261,
"step": 20580
},
{
"epoch": 21.204943357363543,
"grad_norm": 0.3145328462123871,
"learning_rate": 5.42155831915685e-06,
"loss": 0.0234,
"step": 20590
},
{
"epoch": 21.21524201853759,
"grad_norm": 0.2311139851808548,
"learning_rate": 5.3903993124907736e-06,
"loss": 0.0253,
"step": 20600
},
{
"epoch": 21.22554067971164,
"grad_norm": 0.2479424774646759,
"learning_rate": 5.35932500093117e-06,
"loss": 0.0212,
"step": 20610
},
{
"epoch": 21.235839340885686,
"grad_norm": 0.20154082775115967,
"learning_rate": 5.328335443475302e-06,
"loss": 0.0282,
"step": 20620
},
{
"epoch": 21.246138002059734,
"grad_norm": 0.19295744597911835,
"learning_rate": 5.297430698959443e-06,
"loss": 0.0242,
"step": 20630
},
{
"epoch": 21.256436663233778,
"grad_norm": 0.2379661649465561,
"learning_rate": 5.266610826058854e-06,
"loss": 0.0228,
"step": 20640
},
{
"epoch": 21.266735324407826,
"grad_norm": 0.3034205436706543,
"learning_rate": 5.235875883287705e-06,
"loss": 0.0263,
"step": 20650
},
{
"epoch": 21.277033985581873,
"grad_norm": 0.3351058065891266,
"learning_rate": 5.205225928998874e-06,
"loss": 0.026,
"step": 20660
},
{
"epoch": 21.28733264675592,
"grad_norm": 0.17250697314739227,
"learning_rate": 5.174661021383898e-06,
"loss": 0.0225,
"step": 20670
},
{
"epoch": 21.29763130792997,
"grad_norm": 0.16241209208965302,
"learning_rate": 5.144181218472838e-06,
"loss": 0.0286,
"step": 20680
},
{
"epoch": 21.307929969104016,
"grad_norm": 0.189495250582695,
"learning_rate": 5.113786578134205e-06,
"loss": 0.0267,
"step": 20690
},
{
"epoch": 21.318228630278064,
"grad_norm": 0.3670301139354706,
"learning_rate": 5.083477158074757e-06,
"loss": 0.03,
"step": 20700
},
{
"epoch": 21.32852729145211,
"grad_norm": 0.545432448387146,
"learning_rate": 5.053253015839543e-06,
"loss": 0.0245,
"step": 20710
},
{
"epoch": 21.33882595262616,
"grad_norm": 0.281653493642807,
"learning_rate": 5.0231142088116245e-06,
"loss": 0.0321,
"step": 20720
},
{
"epoch": 21.349124613800207,
"grad_norm": 0.37861448526382446,
"learning_rate": 4.993060794212096e-06,
"loss": 0.0234,
"step": 20730
},
{
"epoch": 21.359423274974255,
"grad_norm": 0.9391881823539734,
"learning_rate": 4.9630928290999026e-06,
"loss": 0.0257,
"step": 20740
},
{
"epoch": 21.369721936148302,
"grad_norm": 0.20733687281608582,
"learning_rate": 4.933210370371783e-06,
"loss": 0.0194,
"step": 20750
},
{
"epoch": 21.38002059732235,
"grad_norm": 0.20302939414978027,
"learning_rate": 4.9034134747620805e-06,
"loss": 0.0223,
"step": 20760
},
{
"epoch": 21.390319258496394,
"grad_norm": 0.3713189363479614,
"learning_rate": 4.873702198842767e-06,
"loss": 0.0212,
"step": 20770
},
{
"epoch": 21.40061791967044,
"grad_norm": 0.6167316436767578,
"learning_rate": 4.844076599023195e-06,
"loss": 0.0234,
"step": 20780
},
{
"epoch": 21.41091658084449,
"grad_norm": 0.3855701982975006,
"learning_rate": 4.814536731550073e-06,
"loss": 0.0276,
"step": 20790
},
{
"epoch": 21.421215242018537,
"grad_norm": 0.24730083346366882,
"learning_rate": 4.785082652507355e-06,
"loss": 0.0252,
"step": 20800
},
{
"epoch": 21.431513903192585,
"grad_norm": 0.19756212830543518,
"learning_rate": 4.755714417816104e-06,
"loss": 0.0273,
"step": 20810
},
{
"epoch": 21.441812564366632,
"grad_norm": 0.3603731095790863,
"learning_rate": 4.726432083234383e-06,
"loss": 0.0204,
"step": 20820
},
{
"epoch": 21.45211122554068,
"grad_norm": 0.7706936597824097,
"learning_rate": 4.697235704357217e-06,
"loss": 0.0252,
"step": 20830
},
{
"epoch": 21.462409886714727,
"grad_norm": 0.23673802614212036,
"learning_rate": 4.66812533661638e-06,
"loss": 0.0229,
"step": 20840
},
{
"epoch": 21.472708547888775,
"grad_norm": 0.184868723154068,
"learning_rate": 4.6391010352803745e-06,
"loss": 0.0272,
"step": 20850
},
{
"epoch": 21.483007209062823,
"grad_norm": 0.2611936926841736,
"learning_rate": 4.610162855454303e-06,
"loss": 0.0295,
"step": 20860
},
{
"epoch": 21.49330587023687,
"grad_norm": 0.24588996171951294,
"learning_rate": 4.581310852079762e-06,
"loss": 0.0303,
"step": 20870
},
{
"epoch": 21.503604531410918,
"grad_norm": 0.31468459963798523,
"learning_rate": 4.552545079934689e-06,
"loss": 0.0237,
"step": 20880
},
{
"epoch": 21.513903192584962,
"grad_norm": 0.26411235332489014,
"learning_rate": 4.523865593633381e-06,
"loss": 0.025,
"step": 20890
},
{
"epoch": 21.52420185375901,
"grad_norm": 0.31742873787879944,
"learning_rate": 4.4952724476262475e-06,
"loss": 0.0194,
"step": 20900
},
{
"epoch": 21.534500514933058,
"grad_norm": 0.22004817426204681,
"learning_rate": 4.466765696199798e-06,
"loss": 0.0265,
"step": 20910
},
{
"epoch": 21.544799176107105,
"grad_norm": 0.247009739279747,
"learning_rate": 4.438345393476528e-06,
"loss": 0.0242,
"step": 20920
},
{
"epoch": 21.555097837281153,
"grad_norm": 0.34784770011901855,
"learning_rate": 4.410011593414792e-06,
"loss": 0.0204,
"step": 20930
},
{
"epoch": 21.5653964984552,
"grad_norm": 0.5714616775512695,
"learning_rate": 4.381764349808687e-06,
"loss": 0.0264,
"step": 20940
},
{
"epoch": 21.575695159629248,
"grad_norm": 0.24919095635414124,
"learning_rate": 4.35360371628803e-06,
"loss": 0.0202,
"step": 20950
},
{
"epoch": 21.585993820803296,
"grad_norm": 0.1831541508436203,
"learning_rate": 4.325529746318147e-06,
"loss": 0.0234,
"step": 20960
},
{
"epoch": 21.596292481977343,
"grad_norm": 0.28681278228759766,
"learning_rate": 4.297542493199852e-06,
"loss": 0.0265,
"step": 20970
},
{
"epoch": 21.60659114315139,
"grad_norm": 0.1981225460767746,
"learning_rate": 4.269642010069319e-06,
"loss": 0.029,
"step": 20980
},
{
"epoch": 21.61688980432544,
"grad_norm": 0.3072325587272644,
"learning_rate": 4.241828349897991e-06,
"loss": 0.0216,
"step": 20990
},
{
"epoch": 21.627188465499486,
"grad_norm": 0.24011924862861633,
"learning_rate": 4.214101565492429e-06,
"loss": 0.0215,
"step": 21000
},
{
"epoch": 21.637487126673534,
"grad_norm": 0.24705877900123596,
"learning_rate": 4.186461709494316e-06,
"loss": 0.0279,
"step": 21010
},
{
"epoch": 21.647785787847578,
"grad_norm": 0.33800575137138367,
"learning_rate": 4.158908834380237e-06,
"loss": 0.0233,
"step": 21020
},
{
"epoch": 21.658084449021626,
"grad_norm": 0.13398931920528412,
"learning_rate": 4.13144299246167e-06,
"loss": 0.0181,
"step": 21030
},
{
"epoch": 21.668383110195673,
"grad_norm": 0.39062628149986267,
"learning_rate": 4.104064235884847e-06,
"loss": 0.023,
"step": 21040
},
{
"epoch": 21.67868177136972,
"grad_norm": 0.23508042097091675,
"learning_rate": 4.076772616630642e-06,
"loss": 0.0227,
"step": 21050
},
{
"epoch": 21.68898043254377,
"grad_norm": 0.2864128649234772,
"learning_rate": 4.049568186514513e-06,
"loss": 0.0219,
"step": 21060
},
{
"epoch": 21.699279093717816,
"grad_norm": 0.30533504486083984,
"learning_rate": 4.022450997186378e-06,
"loss": 0.0262,
"step": 21070
},
{
"epoch": 21.709577754891864,
"grad_norm": 0.30080464482307434,
"learning_rate": 3.99542110013052e-06,
"loss": 0.0222,
"step": 21080
},
{
"epoch": 21.71987641606591,
"grad_norm": 0.263285368680954,
"learning_rate": 3.9684785466654885e-06,
"loss": 0.0258,
"step": 21090
},
{
"epoch": 21.73017507723996,
"grad_norm": 0.2877969741821289,
"learning_rate": 3.9416233879440046e-06,
"loss": 0.0269,
"step": 21100
},
{
"epoch": 21.740473738414007,
"grad_norm": 0.6060110330581665,
"learning_rate": 3.914855674952856e-06,
"loss": 0.0253,
"step": 21110
},
{
"epoch": 21.750772399588055,
"grad_norm": 0.27484217286109924,
"learning_rate": 3.888175458512816e-06,
"loss": 0.0243,
"step": 21120
},
{
"epoch": 21.761071060762102,
"grad_norm": 0.2235790491104126,
"learning_rate": 3.86158278927854e-06,
"loss": 0.0214,
"step": 21130
},
{
"epoch": 21.77136972193615,
"grad_norm": 0.18296927213668823,
"learning_rate": 3.835077717738461e-06,
"loss": 0.0247,
"step": 21140
},
{
"epoch": 21.781668383110194,
"grad_norm": 0.22553135454654694,
"learning_rate": 3.8086602942147053e-06,
"loss": 0.0219,
"step": 21150
},
{
"epoch": 21.79196704428424,
"grad_norm": 0.2685544788837433,
"learning_rate": 3.7823305688629907e-06,
"loss": 0.0244,
"step": 21160
},
{
"epoch": 21.80226570545829,
"grad_norm": 0.22941188514232635,
"learning_rate": 3.756088591672513e-06,
"loss": 0.0253,
"step": 21170
},
{
"epoch": 21.812564366632337,
"grad_norm": 0.24472026526927948,
"learning_rate": 3.729934412465924e-06,
"loss": 0.0276,
"step": 21180
},
{
"epoch": 21.822863027806385,
"grad_norm": 0.3415147364139557,
"learning_rate": 3.7038680808991255e-06,
"loss": 0.0253,
"step": 21190
},
{
"epoch": 21.833161688980432,
"grad_norm": 0.27968692779541016,
"learning_rate": 3.677889646461252e-06,
"loss": 0.0266,
"step": 21200
},
{
"epoch": 21.84346035015448,
"grad_norm": 0.3046143352985382,
"learning_rate": 3.6519991584745782e-06,
"loss": 0.0308,
"step": 21210
},
{
"epoch": 21.853759011328528,
"grad_norm": 0.20563410222530365,
"learning_rate": 3.626196666094389e-06,
"loss": 0.0225,
"step": 21220
},
{
"epoch": 21.864057672502575,
"grad_norm": 0.22085511684417725,
"learning_rate": 3.600482218308876e-06,
"loss": 0.0241,
"step": 21230
},
{
"epoch": 21.874356333676623,
"grad_norm": 0.2871539890766144,
"learning_rate": 3.574855863939136e-06,
"loss": 0.0307,
"step": 21240
},
{
"epoch": 21.88465499485067,
"grad_norm": 0.16943010687828064,
"learning_rate": 3.5493176516389447e-06,
"loss": 0.02,
"step": 21250
},
{
"epoch": 21.894953656024718,
"grad_norm": 0.30576252937316895,
"learning_rate": 3.5238676298947726e-06,
"loss": 0.026,
"step": 21260
},
{
"epoch": 21.905252317198766,
"grad_norm": 0.369899719953537,
"learning_rate": 3.4985058470256403e-06,
"loss": 0.0242,
"step": 21270
},
{
"epoch": 21.91555097837281,
"grad_norm": 0.17503631114959717,
"learning_rate": 3.473232351183048e-06,
"loss": 0.0273,
"step": 21280
},
{
"epoch": 21.925849639546858,
"grad_norm": 0.1711639016866684,
"learning_rate": 3.4480471903508505e-06,
"loss": 0.028,
"step": 21290
},
{
"epoch": 21.936148300720905,
"grad_norm": 0.7285293340682983,
"learning_rate": 3.422950412345238e-06,
"loss": 0.0254,
"step": 21300
},
{
"epoch": 21.946446961894953,
"grad_norm": 0.35878413915634155,
"learning_rate": 3.3979420648145465e-06,
"loss": 0.0307,
"step": 21310
},
{
"epoch": 21.956745623069,
"grad_norm": 0.20208248496055603,
"learning_rate": 3.3730221952392503e-06,
"loss": 0.0182,
"step": 21320
},
{
"epoch": 21.96704428424305,
"grad_norm": 0.1312950998544693,
"learning_rate": 3.3481908509318316e-06,
"loss": 0.0199,
"step": 21330
},
{
"epoch": 21.977342945417096,
"grad_norm": 0.24594810605049133,
"learning_rate": 3.323448079036712e-06,
"loss": 0.0274,
"step": 21340
},
{
"epoch": 21.987641606591144,
"grad_norm": 0.15972022712230682,
"learning_rate": 3.2987939265301137e-06,
"loss": 0.0204,
"step": 21350
},
{
"epoch": 21.99794026776519,
"grad_norm": 0.2519496977329254,
"learning_rate": 3.2742284402200674e-06,
"loss": 0.023,
"step": 21360
},
{
"epoch": 22.00823892893924,
"grad_norm": 0.4822291135787964,
"learning_rate": 3.249751666746209e-06,
"loss": 0.0269,
"step": 21370
},
{
"epoch": 22.018537590113286,
"grad_norm": 0.1985165923833847,
"learning_rate": 3.2253636525797715e-06,
"loss": 0.024,
"step": 21380
},
{
"epoch": 22.028836251287334,
"grad_norm": 0.2741225063800812,
"learning_rate": 3.201064444023466e-06,
"loss": 0.0238,
"step": 21390
},
{
"epoch": 22.039134912461382,
"grad_norm": 0.3621535301208496,
"learning_rate": 3.176854087211406e-06,
"loss": 0.0217,
"step": 21400
},
{
"epoch": 22.049433573635426,
"grad_norm": 0.15000490844249725,
"learning_rate": 3.1527326281089895e-06,
"loss": 0.0277,
"step": 21410
},
{
"epoch": 22.059732234809474,
"grad_norm": 0.2596382200717926,
"learning_rate": 3.128700112512867e-06,
"loss": 0.0171,
"step": 21420
},
{
"epoch": 22.07003089598352,
"grad_norm": 0.15466873347759247,
"learning_rate": 3.104756586050794e-06,
"loss": 0.0283,
"step": 21430
},
{
"epoch": 22.08032955715757,
"grad_norm": 0.15102769434452057,
"learning_rate": 3.080902094181587e-06,
"loss": 0.0219,
"step": 21440
},
{
"epoch": 22.090628218331616,
"grad_norm": 0.21250209212303162,
"learning_rate": 3.0571366821950274e-06,
"loss": 0.0203,
"step": 21450
},
{
"epoch": 22.100926879505664,
"grad_norm": 0.24926158785820007,
"learning_rate": 3.0334603952117513e-06,
"loss": 0.0298,
"step": 21460
},
{
"epoch": 22.111225540679712,
"grad_norm": 0.22714541852474213,
"learning_rate": 3.0098732781832005e-06,
"loss": 0.0253,
"step": 21470
},
{
"epoch": 22.12152420185376,
"grad_norm": 0.16168661415576935,
"learning_rate": 2.9863753758915204e-06,
"loss": 0.0216,
"step": 21480
},
{
"epoch": 22.131822863027807,
"grad_norm": 0.30570539832115173,
"learning_rate": 2.9629667329494683e-06,
"loss": 0.0231,
"step": 21490
},
{
"epoch": 22.142121524201855,
"grad_norm": 0.269822359085083,
"learning_rate": 2.9396473938003153e-06,
"loss": 0.0232,
"step": 21500
},
{
"epoch": 22.152420185375902,
"grad_norm": 0.2935786247253418,
"learning_rate": 2.9164174027178413e-06,
"loss": 0.024,
"step": 21510
},
{
"epoch": 22.16271884654995,
"grad_norm": 0.8317728638648987,
"learning_rate": 2.8932768038061163e-06,
"loss": 0.0204,
"step": 21520
},
{
"epoch": 22.173017507723994,
"grad_norm": 0.2997764050960541,
"learning_rate": 2.8702256409995466e-06,
"loss": 0.0279,
"step": 21530
},
{
"epoch": 22.183316168898042,
"grad_norm": 0.25958797335624695,
"learning_rate": 2.847263958062718e-06,
"loss": 0.0213,
"step": 21540
},
{
"epoch": 22.19361483007209,
"grad_norm": 0.27575650811195374,
"learning_rate": 2.8243917985903258e-06,
"loss": 0.0218,
"step": 21550
},
{
"epoch": 22.203913491246137,
"grad_norm": 0.28547245264053345,
"learning_rate": 2.801609206007094e-06,
"loss": 0.0244,
"step": 21560
},
{
"epoch": 22.214212152420185,
"grad_norm": 0.2561958134174347,
"learning_rate": 2.778916223567729e-06,
"loss": 0.0225,
"step": 21570
},
{
"epoch": 22.224510813594232,
"grad_norm": 0.3616337478160858,
"learning_rate": 2.7563128943567607e-06,
"loss": 0.0243,
"step": 21580
},
{
"epoch": 22.23480947476828,
"grad_norm": 0.9687033295631409,
"learning_rate": 2.7337992612885275e-06,
"loss": 0.0262,
"step": 21590
},
{
"epoch": 22.245108135942328,
"grad_norm": 0.5252371430397034,
"learning_rate": 2.7113753671070774e-06,
"loss": 0.0256,
"step": 21600
},
{
"epoch": 22.255406797116375,
"grad_norm": 0.19724826514720917,
"learning_rate": 2.689041254386071e-06,
"loss": 0.0265,
"step": 21610
},
{
"epoch": 22.265705458290423,
"grad_norm": 0.27360984683036804,
"learning_rate": 2.666796965528695e-06,
"loss": 0.0256,
"step": 21620
},
{
"epoch": 22.27600411946447,
"grad_norm": 0.17367063462734222,
"learning_rate": 2.6446425427676503e-06,
"loss": 0.0254,
"step": 21630
},
{
"epoch": 22.28630278063852,
"grad_norm": 0.2514844536781311,
"learning_rate": 2.6225780281649626e-06,
"loss": 0.0325,
"step": 21640
},
{
"epoch": 22.296601441812566,
"grad_norm": 0.37126439809799194,
"learning_rate": 2.6006034636119835e-06,
"loss": 0.0188,
"step": 21650
},
{
"epoch": 22.30690010298661,
"grad_norm": 0.21366749703884125,
"learning_rate": 2.5787188908292847e-06,
"loss": 0.0232,
"step": 21660
},
{
"epoch": 22.317198764160658,
"grad_norm": 0.18360739946365356,
"learning_rate": 2.5569243513666017e-06,
"loss": 0.0275,
"step": 21670
},
{
"epoch": 22.327497425334705,
"grad_norm": 0.4810870289802551,
"learning_rate": 2.53521988660268e-06,
"loss": 0.0218,
"step": 21680
},
{
"epoch": 22.337796086508753,
"grad_norm": 0.24920041859149933,
"learning_rate": 2.513605537745317e-06,
"loss": 0.0213,
"step": 21690
},
{
"epoch": 22.3480947476828,
"grad_norm": 0.24797523021697998,
"learning_rate": 2.492081345831171e-06,
"loss": 0.0257,
"step": 21700
},
{
"epoch": 22.35839340885685,
"grad_norm": 0.3930380642414093,
"learning_rate": 2.4706473517257413e-06,
"loss": 0.0313,
"step": 21710
},
{
"epoch": 22.368692070030896,
"grad_norm": 0.26522117853164673,
"learning_rate": 2.449303596123287e-06,
"loss": 0.0242,
"step": 21720
},
{
"epoch": 22.378990731204944,
"grad_norm": 0.27054446935653687,
"learning_rate": 2.4280501195467374e-06,
"loss": 0.0248,
"step": 21730
},
{
"epoch": 22.38928939237899,
"grad_norm": 0.3780190348625183,
"learning_rate": 2.4068869623476097e-06,
"loss": 0.0228,
"step": 21740
},
{
"epoch": 22.39958805355304,
"grad_norm": 0.2384350597858429,
"learning_rate": 2.3858141647059683e-06,
"loss": 0.022,
"step": 21750
},
{
"epoch": 22.409886714727087,
"grad_norm": 0.21788683533668518,
"learning_rate": 2.3648317666302823e-06,
"loss": 0.023,
"step": 21760
},
{
"epoch": 22.420185375901134,
"grad_norm": 0.3061414659023285,
"learning_rate": 2.343939807957429e-06,
"loss": 0.0216,
"step": 21770
},
{
"epoch": 22.430484037075182,
"grad_norm": 0.186418816447258,
"learning_rate": 2.3231383283525588e-06,
"loss": 0.0275,
"step": 21780
},
{
"epoch": 22.440782698249226,
"grad_norm": 0.25363844633102417,
"learning_rate": 2.302427367309046e-06,
"loss": 0.0204,
"step": 21790
},
{
"epoch": 22.451081359423274,
"grad_norm": 0.20051760971546173,
"learning_rate": 2.2818069641483864e-06,
"loss": 0.026,
"step": 21800
},
{
"epoch": 22.46138002059732,
"grad_norm": 0.29427051544189453,
"learning_rate": 2.2612771580201863e-06,
"loss": 0.0216,
"step": 21810
},
{
"epoch": 22.47167868177137,
"grad_norm": 0.14624521136283875,
"learning_rate": 2.2408379879020114e-06,
"loss": 0.0217,
"step": 21820
},
{
"epoch": 22.481977342945417,
"grad_norm": 0.21760720014572144,
"learning_rate": 2.2204894925993535e-06,
"loss": 0.0265,
"step": 21830
},
{
"epoch": 22.492276004119464,
"grad_norm": 0.23411710560321808,
"learning_rate": 2.200231710745565e-06,
"loss": 0.0258,
"step": 21840
},
{
"epoch": 22.502574665293512,
"grad_norm": 0.38780733942985535,
"learning_rate": 2.1800646808017576e-06,
"loss": 0.0243,
"step": 21850
},
{
"epoch": 22.51287332646756,
"grad_norm": 1.6846517324447632,
"learning_rate": 2.1599884410567427e-06,
"loss": 0.0293,
"step": 21860
},
{
"epoch": 22.523171987641607,
"grad_norm": 0.3452723026275635,
"learning_rate": 2.1400030296269633e-06,
"loss": 0.0248,
"step": 21870
},
{
"epoch": 22.533470648815655,
"grad_norm": 0.30434274673461914,
"learning_rate": 2.1201084844564124e-06,
"loss": 0.0267,
"step": 21880
},
{
"epoch": 22.543769309989703,
"grad_norm": 0.28344815969467163,
"learning_rate": 2.1003048433165806e-06,
"loss": 0.0226,
"step": 21890
},
{
"epoch": 22.55406797116375,
"grad_norm": 0.3896522521972656,
"learning_rate": 2.0805921438063593e-06,
"loss": 0.0248,
"step": 21900
},
{
"epoch": 22.564366632337794,
"grad_norm": 0.2550482749938965,
"learning_rate": 2.0609704233519657e-06,
"loss": 0.0226,
"step": 21910
},
{
"epoch": 22.574665293511842,
"grad_norm": 0.21991167962551117,
"learning_rate": 2.0414397192069003e-06,
"loss": 0.0258,
"step": 21920
},
{
"epoch": 22.58496395468589,
"grad_norm": 0.19379939138889313,
"learning_rate": 2.022000068451868e-06,
"loss": 0.0199,
"step": 21930
},
{
"epoch": 22.595262615859937,
"grad_norm": 0.21848715841770172,
"learning_rate": 2.0026515079946906e-06,
"loss": 0.0205,
"step": 21940
},
{
"epoch": 22.605561277033985,
"grad_norm": 0.22498224675655365,
"learning_rate": 1.983394074570244e-06,
"loss": 0.0223,
"step": 21950
},
{
"epoch": 22.615859938208033,
"grad_norm": 0.279851496219635,
"learning_rate": 1.9642278047404095e-06,
"loss": 0.0278,
"step": 21960
},
{
"epoch": 22.62615859938208,
"grad_norm": 0.22556227445602417,
"learning_rate": 1.9451527348939568e-06,
"loss": 0.0228,
"step": 21970
},
{
"epoch": 22.636457260556128,
"grad_norm": 0.13972289860248566,
"learning_rate": 1.926168901246539e-06,
"loss": 0.0228,
"step": 21980
},
{
"epoch": 22.646755921730175,
"grad_norm": 0.4148993194103241,
"learning_rate": 1.907276339840558e-06,
"loss": 0.0259,
"step": 21990
},
{
"epoch": 22.657054582904223,
"grad_norm": 0.33898934721946716,
"learning_rate": 1.8884750865451494e-06,
"loss": 0.0302,
"step": 22000
},
{
"epoch": 22.66735324407827,
"grad_norm": 0.2901424169540405,
"learning_rate": 1.8697651770560876e-06,
"loss": 0.021,
"step": 22010
},
{
"epoch": 22.67765190525232,
"grad_norm": 0.26389849185943604,
"learning_rate": 1.851146646895724e-06,
"loss": 0.0259,
"step": 22020
},
{
"epoch": 22.687950566426366,
"grad_norm": 0.43549320101737976,
"learning_rate": 1.8326195314129047e-06,
"loss": 0.0262,
"step": 22030
},
{
"epoch": 22.698249227600414,
"grad_norm": 0.17495097219944,
"learning_rate": 1.8141838657829313e-06,
"loss": 0.0253,
"step": 22040
},
{
"epoch": 22.708547888774458,
"grad_norm": 0.3065433204174042,
"learning_rate": 1.7958396850074832e-06,
"loss": 0.0218,
"step": 22050
},
{
"epoch": 22.718846549948505,
"grad_norm": 0.2452508509159088,
"learning_rate": 1.7775870239145398e-06,
"loss": 0.0183,
"step": 22060
},
{
"epoch": 22.729145211122553,
"grad_norm": 0.2773599326610565,
"learning_rate": 1.7594259171583195e-06,
"loss": 0.0234,
"step": 22070
},
{
"epoch": 22.7394438722966,
"grad_norm": 0.37604328989982605,
"learning_rate": 1.7413563992192294e-06,
"loss": 0.0187,
"step": 22080
},
{
"epoch": 22.74974253347065,
"grad_norm": 0.2879926264286041,
"learning_rate": 1.723378504403772e-06,
"loss": 0.0253,
"step": 22090
},
{
"epoch": 22.760041194644696,
"grad_norm": 0.6065763831138611,
"learning_rate": 1.7054922668445106e-06,
"loss": 0.0284,
"step": 22100
},
{
"epoch": 22.770339855818744,
"grad_norm": 0.2422868013381958,
"learning_rate": 1.687697720499981e-06,
"loss": 0.0286,
"step": 22110
},
{
"epoch": 22.78063851699279,
"grad_norm": 0.3824164569377899,
"learning_rate": 1.6699948991546366e-06,
"loss": 0.0218,
"step": 22120
},
{
"epoch": 22.79093717816684,
"grad_norm": 0.27847930788993835,
"learning_rate": 1.6523838364187806e-06,
"loss": 0.0289,
"step": 22130
},
{
"epoch": 22.801235839340887,
"grad_norm": 0.1401498168706894,
"learning_rate": 1.6348645657285166e-06,
"loss": 0.0207,
"step": 22140
},
{
"epoch": 22.811534500514934,
"grad_norm": 0.3438400328159332,
"learning_rate": 1.617437120345655e-06,
"loss": 0.0216,
"step": 22150
},
{
"epoch": 22.821833161688982,
"grad_norm": 0.18453820049762726,
"learning_rate": 1.6001015333576786e-06,
"loss": 0.0213,
"step": 22160
},
{
"epoch": 22.832131822863026,
"grad_norm": 0.38582921028137207,
"learning_rate": 1.5828578376776704e-06,
"loss": 0.0236,
"step": 22170
},
{
"epoch": 22.842430484037074,
"grad_norm": 0.2904150187969208,
"learning_rate": 1.565706066044248e-06,
"loss": 0.0225,
"step": 22180
},
{
"epoch": 22.85272914521112,
"grad_norm": 0.21802499890327454,
"learning_rate": 1.5486462510215016e-06,
"loss": 0.0253,
"step": 22190
},
{
"epoch": 22.86302780638517,
"grad_norm": 0.14389820396900177,
"learning_rate": 1.5316784249989447e-06,
"loss": 0.0253,
"step": 22200
},
{
"epoch": 22.873326467559217,
"grad_norm": 0.8343854546546936,
"learning_rate": 1.5148026201914134e-06,
"loss": 0.0231,
"step": 22210
},
{
"epoch": 22.883625128733264,
"grad_norm": 0.20952999591827393,
"learning_rate": 1.4980188686390672e-06,
"loss": 0.0233,
"step": 22220
},
{
"epoch": 22.893923789907312,
"grad_norm": 0.2580125331878662,
"learning_rate": 1.4813272022072778e-06,
"loss": 0.0239,
"step": 22230
},
{
"epoch": 22.90422245108136,
"grad_norm": 0.31164586544036865,
"learning_rate": 1.4647276525865894e-06,
"loss": 0.0282,
"step": 22240
},
{
"epoch": 22.914521112255407,
"grad_norm": 0.1792629510164261,
"learning_rate": 1.448220251292648e-06,
"loss": 0.0195,
"step": 22250
},
{
"epoch": 22.924819773429455,
"grad_norm": 0.37863343954086304,
"learning_rate": 1.431805029666161e-06,
"loss": 0.0265,
"step": 22260
},
{
"epoch": 22.935118434603503,
"grad_norm": 0.2690068781375885,
"learning_rate": 1.41548201887281e-06,
"loss": 0.024,
"step": 22270
},
{
"epoch": 22.94541709577755,
"grad_norm": 0.236667662858963,
"learning_rate": 1.3992512499032217e-06,
"loss": 0.0244,
"step": 22280
},
{
"epoch": 22.955715756951598,
"grad_norm": 0.2539116144180298,
"learning_rate": 1.3831127535728794e-06,
"loss": 0.023,
"step": 22290
},
{
"epoch": 22.966014418125642,
"grad_norm": 0.2247859090566635,
"learning_rate": 1.3670665605220845e-06,
"loss": 0.0203,
"step": 22300
},
{
"epoch": 22.97631307929969,
"grad_norm": 0.292784184217453,
"learning_rate": 1.3511127012159007e-06,
"loss": 0.0196,
"step": 22310
},
{
"epoch": 22.986611740473737,
"grad_norm": 0.22877749800682068,
"learning_rate": 1.3352512059440825e-06,
"loss": 0.0234,
"step": 22320
},
{
"epoch": 22.996910401647785,
"grad_norm": 0.3564964830875397,
"learning_rate": 1.3194821048210126e-06,
"loss": 0.0208,
"step": 22330
},
{
"epoch": 23.007209062821833,
"grad_norm": 0.2306360900402069,
"learning_rate": 1.3038054277856703e-06,
"loss": 0.0247,
"step": 22340
},
{
"epoch": 23.01750772399588,
"grad_norm": 0.37728771567344666,
"learning_rate": 1.2882212046015641e-06,
"loss": 0.0268,
"step": 22350
},
{
"epoch": 23.027806385169928,
"grad_norm": 0.21463976800441742,
"learning_rate": 1.2727294648566424e-06,
"loss": 0.0236,
"step": 22360
},
{
"epoch": 23.038105046343976,
"grad_norm": 0.24805015325546265,
"learning_rate": 1.2573302379633112e-06,
"loss": 0.0204,
"step": 22370
},
{
"epoch": 23.048403707518023,
"grad_norm": 0.20930248498916626,
"learning_rate": 1.2420235531582892e-06,
"loss": 0.0234,
"step": 22380
},
{
"epoch": 23.05870236869207,
"grad_norm": 0.32791492342948914,
"learning_rate": 1.2268094395026186e-06,
"loss": 0.0264,
"step": 22390
},
{
"epoch": 23.06900102986612,
"grad_norm": 0.1751231551170349,
"learning_rate": 1.2116879258815772e-06,
"loss": 0.0192,
"step": 22400
},
{
"epoch": 23.079299691040166,
"grad_norm": 0.12545523047447205,
"learning_rate": 1.1966590410046607e-06,
"loss": 0.022,
"step": 22410
},
{
"epoch": 23.089598352214214,
"grad_norm": 0.29402562975883484,
"learning_rate": 1.1817228134054502e-06,
"loss": 0.0277,
"step": 22420
},
{
"epoch": 23.099897013388258,
"grad_norm": 0.17338956892490387,
"learning_rate": 1.1668792714416676e-06,
"loss": 0.0271,
"step": 22430
},
{
"epoch": 23.110195674562306,
"grad_norm": 0.16912265121936798,
"learning_rate": 1.152128443295014e-06,
"loss": 0.0259,
"step": 22440
},
{
"epoch": 23.120494335736353,
"grad_norm": 0.2655620276927948,
"learning_rate": 1.1374703569711986e-06,
"loss": 0.0267,
"step": 22450
},
{
"epoch": 23.1307929969104,
"grad_norm": 0.23414306342601776,
"learning_rate": 1.1229050402998375e-06,
"loss": 0.0298,
"step": 22460
},
{
"epoch": 23.14109165808445,
"grad_norm": 0.28117451071739197,
"learning_rate": 1.1084325209344216e-06,
"loss": 0.0311,
"step": 22470
},
{
"epoch": 23.151390319258496,
"grad_norm": 0.2047118991613388,
"learning_rate": 1.0940528263522376e-06,
"loss": 0.0237,
"step": 22480
},
{
"epoch": 23.161688980432544,
"grad_norm": 0.2524803578853607,
"learning_rate": 1.0797659838543805e-06,
"loss": 0.0246,
"step": 22490
},
{
"epoch": 23.17198764160659,
"grad_norm": 0.16907493770122528,
"learning_rate": 1.0655720205656083e-06,
"loss": 0.0229,
"step": 22500
},
{
"epoch": 23.18228630278064,
"grad_norm": 0.2680252492427826,
"learning_rate": 1.0514709634343812e-06,
"loss": 0.0263,
"step": 22510
},
{
"epoch": 23.192584963954687,
"grad_norm": 0.24415595829486847,
"learning_rate": 1.0374628392327335e-06,
"loss": 0.0223,
"step": 22520
},
{
"epoch": 23.202883625128734,
"grad_norm": 0.2939612865447998,
"learning_rate": 1.0235476745562967e-06,
"loss": 0.023,
"step": 22530
},
{
"epoch": 23.213182286302782,
"grad_norm": 0.2137501835823059,
"learning_rate": 1.0097254958241653e-06,
"loss": 0.0198,
"step": 22540
},
{
"epoch": 23.22348094747683,
"grad_norm": 0.1515851467847824,
"learning_rate": 9.959963292789364e-07,
"loss": 0.022,
"step": 22550
},
{
"epoch": 23.233779608650874,
"grad_norm": 0.24298404157161713,
"learning_rate": 9.823602009865873e-07,
"loss": 0.0231,
"step": 22560
},
{
"epoch": 23.24407826982492,
"grad_norm": 0.3320057690143585,
"learning_rate": 9.68817136836464e-07,
"loss": 0.0219,
"step": 22570
},
{
"epoch": 23.25437693099897,
"grad_norm": 0.4774312674999237,
"learning_rate": 9.553671625412264e-07,
"loss": 0.0286,
"step": 22580
},
{
"epoch": 23.264675592173017,
"grad_norm": 0.3456204831600189,
"learning_rate": 9.420103036367811e-07,
"loss": 0.018,
"step": 22590
},
{
"epoch": 23.274974253347064,
"grad_norm": 0.1354740411043167,
"learning_rate": 9.287465854822597e-07,
"loss": 0.0247,
"step": 22600
},
{
"epoch": 23.285272914521112,
"grad_norm": 0.27582287788391113,
"learning_rate": 9.155760332599627e-07,
"loss": 0.0237,
"step": 22610
},
{
"epoch": 23.29557157569516,
"grad_norm": 0.23232264816761017,
"learning_rate": 9.024986719752881e-07,
"loss": 0.0229,
"step": 22620
},
{
"epoch": 23.305870236869207,
"grad_norm": 0.33807799220085144,
"learning_rate": 8.89514526456714e-07,
"loss": 0.0255,
"step": 22630
},
{
"epoch": 23.316168898043255,
"grad_norm": 0.2717418670654297,
"learning_rate": 8.766236213557544e-07,
"loss": 0.0157,
"step": 22640
},
{
"epoch": 23.326467559217303,
"grad_norm": 0.233852356672287,
"learning_rate": 8.638259811468708e-07,
"loss": 0.0235,
"step": 22650
},
{
"epoch": 23.33676622039135,
"grad_norm": 0.3999418318271637,
"learning_rate": 8.511216301274772e-07,
"loss": 0.0235,
"step": 22660
},
{
"epoch": 23.347064881565398,
"grad_norm": 0.258497029542923,
"learning_rate": 8.385105924178516e-07,
"loss": 0.0277,
"step": 22670
},
{
"epoch": 23.357363542739442,
"grad_norm": 0.27405309677124023,
"learning_rate": 8.259928919611248e-07,
"loss": 0.0222,
"step": 22680
},
{
"epoch": 23.36766220391349,
"grad_norm": 0.13552913069725037,
"learning_rate": 8.135685525232028e-07,
"loss": 0.0205,
"step": 22690
},
{
"epoch": 23.377960865087537,
"grad_norm": 0.4466046094894409,
"learning_rate": 8.012375976927611e-07,
"loss": 0.0293,
"step": 22700
},
{
"epoch": 23.388259526261585,
"grad_norm": 0.15895040333271027,
"learning_rate": 7.890000508811501e-07,
"loss": 0.0293,
"step": 22710
},
{
"epoch": 23.398558187435633,
"grad_norm": 0.27999603748321533,
"learning_rate": 7.768559353223958e-07,
"loss": 0.0244,
"step": 22720
},
{
"epoch": 23.40885684860968,
"grad_norm": 0.30583688616752625,
"learning_rate": 7.648052740731215e-07,
"loss": 0.0237,
"step": 22730
},
{
"epoch": 23.419155509783728,
"grad_norm": 0.16827332973480225,
"learning_rate": 7.528480900125368e-07,
"loss": 0.024,
"step": 22740
},
{
"epoch": 23.429454170957776,
"grad_norm": 0.24422600865364075,
"learning_rate": 7.409844058423709e-07,
"loss": 0.0298,
"step": 22750
},
{
"epoch": 23.439752832131823,
"grad_norm": 0.17749178409576416,
"learning_rate": 7.292142440868289e-07,
"loss": 0.0235,
"step": 22760
},
{
"epoch": 23.45005149330587,
"grad_norm": 0.3712615370750427,
"learning_rate": 7.175376270925571e-07,
"loss": 0.028,
"step": 22770
},
{
"epoch": 23.46035015447992,
"grad_norm": 0.3284103572368622,
"learning_rate": 7.059545770286058e-07,
"loss": 0.0234,
"step": 22780
},
{
"epoch": 23.470648815653966,
"grad_norm": 0.3628927767276764,
"learning_rate": 6.94465115886378e-07,
"loss": 0.0279,
"step": 22790
},
{
"epoch": 23.480947476828014,
"grad_norm": 0.24092569947242737,
"learning_rate": 6.830692654795856e-07,
"loss": 0.0295,
"step": 22800
},
{
"epoch": 23.491246138002058,
"grad_norm": 0.25664201378822327,
"learning_rate": 6.717670474442217e-07,
"loss": 0.0237,
"step": 22810
},
{
"epoch": 23.501544799176106,
"grad_norm": 0.31676584482192993,
"learning_rate": 6.605584832384992e-07,
"loss": 0.0247,
"step": 22820
},
{
"epoch": 23.511843460350153,
"grad_norm": 0.2529754936695099,
"learning_rate": 6.49443594142829e-07,
"loss": 0.0272,
"step": 22830
},
{
"epoch": 23.5221421215242,
"grad_norm": 0.224832683801651,
"learning_rate": 6.384224012597695e-07,
"loss": 0.0262,
"step": 22840
},
{
"epoch": 23.53244078269825,
"grad_norm": 0.35248956084251404,
"learning_rate": 6.274949255139883e-07,
"loss": 0.0233,
"step": 22850
},
{
"epoch": 23.542739443872296,
"grad_norm": 0.146409809589386,
"learning_rate": 6.166611876522288e-07,
"loss": 0.0214,
"step": 22860
},
{
"epoch": 23.553038105046344,
"grad_norm": 0.25351008772850037,
"learning_rate": 6.059212082432542e-07,
"loss": 0.0204,
"step": 22870
},
{
"epoch": 23.56333676622039,
"grad_norm": 0.22615984082221985,
"learning_rate": 5.952750076778312e-07,
"loss": 0.0197,
"step": 22880
},
{
"epoch": 23.57363542739444,
"grad_norm": 0.4564981162548065,
"learning_rate": 5.847226061686695e-07,
"loss": 0.0263,
"step": 22890
},
{
"epoch": 23.583934088568487,
"grad_norm": 0.2238938808441162,
"learning_rate": 5.742640237503927e-07,
"loss": 0.0202,
"step": 22900
},
{
"epoch": 23.594232749742535,
"grad_norm": 0.3683464825153351,
"learning_rate": 5.638992802795173e-07,
"loss": 0.0288,
"step": 22910
},
{
"epoch": 23.604531410916582,
"grad_norm": 0.21569469571113586,
"learning_rate": 5.536283954343747e-07,
"loss": 0.025,
"step": 22920
},
{
"epoch": 23.61483007209063,
"grad_norm": 0.24484778940677643,
"learning_rate": 5.434513887151216e-07,
"loss": 0.0241,
"step": 22930
},
{
"epoch": 23.625128733264674,
"grad_norm": 0.2003822773694992,
"learning_rate": 5.333682794436578e-07,
"loss": 0.0256,
"step": 22940
},
{
"epoch": 23.63542739443872,
"grad_norm": 0.20311187207698822,
"learning_rate": 5.233790867636257e-07,
"loss": 0.0253,
"step": 22950
},
{
"epoch": 23.64572605561277,
"grad_norm": 0.2250942438840866,
"learning_rate": 5.134838296403544e-07,
"loss": 0.0211,
"step": 22960
},
{
"epoch": 23.656024716786817,
"grad_norm": 0.21214169263839722,
"learning_rate": 5.03682526860827e-07,
"loss": 0.026,
"step": 22970
},
{
"epoch": 23.666323377960865,
"grad_norm": 0.23194220662117004,
"learning_rate": 4.939751970336415e-07,
"loss": 0.0224,
"step": 22980
},
{
"epoch": 23.676622039134912,
"grad_norm": 0.19989456236362457,
"learning_rate": 4.843618585889942e-07,
"loss": 0.0252,
"step": 22990
},
{
"epoch": 23.68692070030896,
"grad_norm": 0.23081596195697784,
"learning_rate": 4.748425297786241e-07,
"loss": 0.0309,
"step": 23000
},
{
"epoch": 23.697219361483008,
"grad_norm": 0.35934457182884216,
"learning_rate": 4.654172286757741e-07,
"loss": 0.0237,
"step": 23010
},
{
"epoch": 23.707518022657055,
"grad_norm": 0.3558262586593628,
"learning_rate": 4.5608597317517987e-07,
"loss": 0.0199,
"step": 23020
},
{
"epoch": 23.717816683831103,
"grad_norm": 0.22818619012832642,
"learning_rate": 4.468487809930255e-07,
"loss": 0.0208,
"step": 23030
},
{
"epoch": 23.72811534500515,
"grad_norm": 0.23627056181430817,
"learning_rate": 4.377056696668991e-07,
"loss": 0.0229,
"step": 23040
},
{
"epoch": 23.738414006179198,
"grad_norm": 0.4432663023471832,
"learning_rate": 4.286566565557759e-07,
"loss": 0.0208,
"step": 23050
},
{
"epoch": 23.748712667353246,
"grad_norm": 0.19427362084388733,
"learning_rate": 4.197017588399743e-07,
"loss": 0.0191,
"step": 23060
},
{
"epoch": 23.75901132852729,
"grad_norm": 0.3566558361053467,
"learning_rate": 4.108409935211166e-07,
"loss": 0.0251,
"step": 23070
},
{
"epoch": 23.769309989701338,
"grad_norm": 0.19104306399822235,
"learning_rate": 4.0207437742212363e-07,
"loss": 0.0285,
"step": 23080
},
{
"epoch": 23.779608650875385,
"grad_norm": 0.37885013222694397,
"learning_rate": 3.934019271871592e-07,
"loss": 0.0256,
"step": 23090
},
{
"epoch": 23.789907312049433,
"grad_norm": 0.20795543491840363,
"learning_rate": 3.8482365928160236e-07,
"loss": 0.0215,
"step": 23100
},
{
"epoch": 23.80020597322348,
"grad_norm": 0.14693936705589294,
"learning_rate": 3.7633958999202523e-07,
"loss": 0.0255,
"step": 23110
},
{
"epoch": 23.810504634397528,
"grad_norm": 0.19041191041469574,
"learning_rate": 3.679497354261485e-07,
"loss": 0.0257,
"step": 23120
},
{
"epoch": 23.820803295571576,
"grad_norm": 0.3607507050037384,
"learning_rate": 3.5965411151282493e-07,
"loss": 0.0211,
"step": 23130
},
{
"epoch": 23.831101956745623,
"grad_norm": 0.138556107878685,
"learning_rate": 3.5145273400200017e-07,
"loss": 0.019,
"step": 23140
},
{
"epoch": 23.84140061791967,
"grad_norm": 0.14976058900356293,
"learning_rate": 3.4334561846467995e-07,
"loss": 0.0217,
"step": 23150
},
{
"epoch": 23.85169927909372,
"grad_norm": 0.7945707440376282,
"learning_rate": 3.353327802929074e-07,
"loss": 0.027,
"step": 23160
},
{
"epoch": 23.861997940267766,
"grad_norm": 0.7725450992584229,
"learning_rate": 3.274142346997466e-07,
"loss": 0.0255,
"step": 23170
},
{
"epoch": 23.872296601441814,
"grad_norm": 0.25931257009506226,
"learning_rate": 3.195899967192162e-07,
"loss": 0.0243,
"step": 23180
},
{
"epoch": 23.882595262615858,
"grad_norm": 0.5646887421607971,
"learning_rate": 3.118600812063e-07,
"loss": 0.0265,
"step": 23190
},
{
"epoch": 23.892893923789906,
"grad_norm": 0.21446570754051208,
"learning_rate": 3.042245028368973e-07,
"loss": 0.024,
"step": 23200
},
{
"epoch": 23.903192584963953,
"grad_norm": 0.5765520930290222,
"learning_rate": 2.966832761077953e-07,
"loss": 0.0249,
"step": 23210
},
{
"epoch": 23.913491246138,
"grad_norm": 0.7060703039169312,
"learning_rate": 2.892364153366578e-07,
"loss": 0.0312,
"step": 23220
},
{
"epoch": 23.92378990731205,
"grad_norm": 0.3687019646167755,
"learning_rate": 2.818839346619806e-07,
"loss": 0.0231,
"step": 23230
},
{
"epoch": 23.934088568486096,
"grad_norm": 0.28896424174308777,
"learning_rate": 2.7462584804306966e-07,
"loss": 0.0291,
"step": 23240
},
{
"epoch": 23.944387229660144,
"grad_norm": 0.7275195121765137,
"learning_rate": 2.6746216926001875e-07,
"loss": 0.028,
"step": 23250
},
{
"epoch": 23.95468589083419,
"grad_norm": 0.3027587831020355,
"learning_rate": 2.603929119136761e-07,
"loss": 0.0276,
"step": 23260
},
{
"epoch": 23.96498455200824,
"grad_norm": 0.14704370498657227,
"learning_rate": 2.534180894256277e-07,
"loss": 0.0263,
"step": 23270
},
{
"epoch": 23.975283213182287,
"grad_norm": 0.1789170503616333,
"learning_rate": 2.4653771503816424e-07,
"loss": 0.0263,
"step": 23280
},
{
"epoch": 23.985581874356335,
"grad_norm": 0.20929020643234253,
"learning_rate": 2.3975180181426414e-07,
"loss": 0.0221,
"step": 23290
},
{
"epoch": 23.995880535530382,
"grad_norm": 0.18785648047924042,
"learning_rate": 2.3306036263754938e-07,
"loss": 0.0255,
"step": 23300
},
{
"epoch": 24.00617919670443,
"grad_norm": 0.8915624022483826,
"learning_rate": 2.264634102122909e-07,
"loss": 0.0232,
"step": 23310
},
{
"epoch": 24.016477857878474,
"grad_norm": 0.24266694486141205,
"learning_rate": 2.1996095706335872e-07,
"loss": 0.0242,
"step": 23320
},
{
"epoch": 24.02677651905252,
"grad_norm": 0.1327933520078659,
"learning_rate": 2.1355301553621644e-07,
"loss": 0.0304,
"step": 23330
},
{
"epoch": 24.03707518022657,
"grad_norm": 0.3175448179244995,
"learning_rate": 2.072395977968711e-07,
"loss": 0.0211,
"step": 23340
},
{
"epoch": 24.047373841400617,
"grad_norm": 0.19741173088550568,
"learning_rate": 2.0102071583190108e-07,
"loss": 0.0179,
"step": 23350
},
{
"epoch": 24.057672502574665,
"grad_norm": 0.363275408744812,
"learning_rate": 1.9489638144836176e-07,
"loss": 0.0255,
"step": 23360
},
{
"epoch": 24.067971163748712,
"grad_norm": 0.2827618420124054,
"learning_rate": 1.8886660627383534e-07,
"loss": 0.0216,
"step": 23370
},
{
"epoch": 24.07826982492276,
"grad_norm": 0.13472236692905426,
"learning_rate": 1.829314017563477e-07,
"loss": 0.0196,
"step": 23380
},
{
"epoch": 24.088568486096808,
"grad_norm": 0.25898340344429016,
"learning_rate": 1.7709077916440163e-07,
"loss": 0.0229,
"step": 23390
},
{
"epoch": 24.098867147270855,
"grad_norm": 0.5560616850852966,
"learning_rate": 1.7134474958689917e-07,
"loss": 0.0216,
"step": 23400
},
{
"epoch": 24.109165808444903,
"grad_norm": 0.32972094416618347,
"learning_rate": 1.6569332393317483e-07,
"loss": 0.0259,
"step": 23410
},
{
"epoch": 24.11946446961895,
"grad_norm": 0.2403629869222641,
"learning_rate": 1.6013651293293464e-07,
"loss": 0.0252,
"step": 23420
},
{
"epoch": 24.129763130793,
"grad_norm": 0.237924724817276,
"learning_rate": 1.5467432713625607e-07,
"loss": 0.0235,
"step": 23430
},
{
"epoch": 24.140061791967046,
"grad_norm": 0.21502567827701569,
"learning_rate": 1.4930677691356033e-07,
"loss": 0.0205,
"step": 23440
},
{
"epoch": 24.15036045314109,
"grad_norm": 0.27757346630096436,
"learning_rate": 1.4403387245560117e-07,
"loss": 0.0322,
"step": 23450
},
{
"epoch": 24.160659114315138,
"grad_norm": 0.2888646125793457,
"learning_rate": 1.3885562377343176e-07,
"loss": 0.0205,
"step": 23460
},
{
"epoch": 24.170957775489185,
"grad_norm": 0.25868770480155945,
"learning_rate": 1.3377204069839333e-07,
"loss": 0.0224,
"step": 23470
},
{
"epoch": 24.181256436663233,
"grad_norm": 0.25292280316352844,
"learning_rate": 1.2878313288209876e-07,
"loss": 0.0216,
"step": 23480
},
{
"epoch": 24.19155509783728,
"grad_norm": 0.8537740707397461,
"learning_rate": 1.2388890979641576e-07,
"loss": 0.031,
"step": 23490
},
{
"epoch": 24.20185375901133,
"grad_norm": 0.19652284681797028,
"learning_rate": 1.1908938073344477e-07,
"loss": 0.022,
"step": 23500
},
{
"epoch": 24.212152420185376,
"grad_norm": 0.2277028113603592,
"learning_rate": 1.1438455480549115e-07,
"loss": 0.0212,
"step": 23510
},
{
"epoch": 24.222451081359424,
"grad_norm": 0.28218087553977966,
"learning_rate": 1.0977444094506517e-07,
"loss": 0.0226,
"step": 23520
},
{
"epoch": 24.23274974253347,
"grad_norm": 0.3701043426990509,
"learning_rate": 1.0525904790485985e-07,
"loss": 0.0252,
"step": 23530
},
{
"epoch": 24.24304840370752,
"grad_norm": 0.19152191281318665,
"learning_rate": 1.0083838425773984e-07,
"loss": 0.0219,
"step": 23540
},
{
"epoch": 24.253347064881567,
"grad_norm": 0.31448763608932495,
"learning_rate": 9.651245839669698e-08,
"loss": 0.0294,
"step": 23550
},
{
"epoch": 24.263645726055614,
"grad_norm": 0.32265207171440125,
"learning_rate": 9.228127853487811e-08,
"loss": 0.0185,
"step": 23560
},
{
"epoch": 24.273944387229662,
"grad_norm": 0.2978125214576721,
"learning_rate": 8.814485270553508e-08,
"loss": 0.0245,
"step": 23570
},
{
"epoch": 24.284243048403706,
"grad_norm": 0.3098287582397461,
"learning_rate": 8.410318876201362e-08,
"loss": 0.0235,
"step": 23580
},
{
"epoch": 24.294541709577754,
"grad_norm": 0.2803812026977539,
"learning_rate": 8.01562943777645e-08,
"loss": 0.0249,
"step": 23590
},
{
"epoch": 24.3048403707518,
"grad_norm": 0.23657964169979095,
"learning_rate": 7.630417704630466e-08,
"loss": 0.0217,
"step": 23600
},
{
"epoch": 24.31513903192585,
"grad_norm": 0.14953146874904633,
"learning_rate": 7.254684408118939e-08,
"loss": 0.0242,
"step": 23610
},
{
"epoch": 24.325437693099897,
"grad_norm": 0.28647905588150024,
"learning_rate": 6.888430261605128e-08,
"loss": 0.0206,
"step": 23620
},
{
"epoch": 24.335736354273944,
"grad_norm": 0.18508291244506836,
"learning_rate": 6.531655960452243e-08,
"loss": 0.0239,
"step": 23630
},
{
"epoch": 24.346035015447992,
"grad_norm": 0.25578784942626953,
"learning_rate": 6.184362182026781e-08,
"loss": 0.0255,
"step": 23640
},
{
"epoch": 24.35633367662204,
"grad_norm": 0.30153825879096985,
"learning_rate": 5.8465495856963035e-08,
"loss": 0.0199,
"step": 23650
},
{
"epoch": 24.366632337796087,
"grad_norm": 0.21878878772258759,
"learning_rate": 5.5182188128261035e-08,
"loss": 0.0277,
"step": 23660
},
{
"epoch": 24.376930998970135,
"grad_norm": 0.25511273741722107,
"learning_rate": 5.199370486779209e-08,
"loss": 0.0293,
"step": 23670
},
{
"epoch": 24.387229660144182,
"grad_norm": 0.42204421758651733,
"learning_rate": 4.8900052129174924e-08,
"loss": 0.0244,
"step": 23680
},
{
"epoch": 24.39752832131823,
"grad_norm": 0.3273943066596985,
"learning_rate": 4.590123578596117e-08,
"loss": 0.024,
"step": 23690
},
{
"epoch": 24.407826982492274,
"grad_norm": 0.4405585825443268,
"learning_rate": 4.299726153166317e-08,
"loss": 0.0224,
"step": 23700
},
{
"epoch": 24.418125643666322,
"grad_norm": 0.3542063236236572,
"learning_rate": 4.0188134879715064e-08,
"loss": 0.0194,
"step": 23710
},
{
"epoch": 24.42842430484037,
"grad_norm": 0.48409783840179443,
"learning_rate": 3.747386116349505e-08,
"loss": 0.0234,
"step": 23720
},
{
"epoch": 24.438722966014417,
"grad_norm": 0.22048763930797577,
"learning_rate": 3.485444553626982e-08,
"loss": 0.0212,
"step": 23730
},
{
"epoch": 24.449021627188465,
"grad_norm": 0.2321694940328598,
"learning_rate": 3.232989297122791e-08,
"loss": 0.0286,
"step": 23740
},
{
"epoch": 24.459320288362512,
"grad_norm": 0.23533855378627777,
"learning_rate": 2.99002082614408e-08,
"loss": 0.0242,
"step": 23750
},
{
"epoch": 24.46961894953656,
"grad_norm": 0.2119673490524292,
"learning_rate": 2.7565396019879618e-08,
"loss": 0.0214,
"step": 23760
},
{
"epoch": 24.479917610710608,
"grad_norm": 0.2214546799659729,
"learning_rate": 2.5325460679376246e-08,
"loss": 0.0292,
"step": 23770
},
{
"epoch": 24.490216271884655,
"grad_norm": 0.751918375492096,
"learning_rate": 2.3180406492634422e-08,
"loss": 0.0278,
"step": 23780
},
{
"epoch": 24.500514933058703,
"grad_norm": 0.4512670934200287,
"learning_rate": 2.113023753222976e-08,
"loss": 0.02,
"step": 23790
},
{
"epoch": 24.51081359423275,
"grad_norm": 0.29734495282173157,
"learning_rate": 1.9174957690581972e-08,
"loss": 0.0282,
"step": 23800
},
{
"epoch": 24.5211122554068,
"grad_norm": 0.1899419128894806,
"learning_rate": 1.7314570679949347e-08,
"loss": 0.0237,
"step": 23810
},
{
"epoch": 24.531410916580846,
"grad_norm": 0.8968344330787659,
"learning_rate": 1.5549080032434273e-08,
"loss": 0.0189,
"step": 23820
},
{
"epoch": 24.54170957775489,
"grad_norm": 0.16808949410915375,
"learning_rate": 1.3878489099972136e-08,
"loss": 0.0253,
"step": 23830
},
{
"epoch": 24.552008238928938,
"grad_norm": 0.3582736849784851,
"learning_rate": 1.2302801054325797e-08,
"loss": 0.0213,
"step": 23840
},
{
"epoch": 24.562306900102985,
"grad_norm": 0.3200194239616394,
"learning_rate": 1.0822018887063357e-08,
"loss": 0.0253,
"step": 23850
},
{
"epoch": 24.572605561277033,
"grad_norm": 0.2794504463672638,
"learning_rate": 9.436145409585927e-09,
"loss": 0.0283,
"step": 23860
},
{
"epoch": 24.58290422245108,
"grad_norm": 0.2662137746810913,
"learning_rate": 8.145183253083222e-09,
"loss": 0.023,
"step": 23870
},
{
"epoch": 24.59320288362513,
"grad_norm": 0.4419270157814026,
"learning_rate": 6.9491348685613025e-09,
"loss": 0.0287,
"step": 23880
},
{
"epoch": 24.603501544799176,
"grad_norm": 0.24515603482723236,
"learning_rate": 5.848002526814833e-09,
"loss": 0.0226,
"step": 23890
},
{
"epoch": 24.613800205973224,
"grad_norm": 0.24852906167507172,
"learning_rate": 4.8417883184381784e-09,
"loss": 0.0265,
"step": 23900
},
{
"epoch": 24.62409886714727,
"grad_norm": 0.24117045104503632,
"learning_rate": 3.930494153819853e-09,
"loss": 0.0214,
"step": 23910
},
{
"epoch": 24.63439752832132,
"grad_norm": 0.41323116421699524,
"learning_rate": 3.1141217631203147e-09,
"loss": 0.0279,
"step": 23920
},
{
"epoch": 24.644696189495367,
"grad_norm": 0.2782849669456482,
"learning_rate": 2.3926726962997248e-09,
"loss": 0.0261,
"step": 23930
},
{
"epoch": 24.654994850669414,
"grad_norm": 0.2185438722372055,
"learning_rate": 1.7661483230846377e-09,
"loss": 0.0271,
"step": 23940
},
{
"epoch": 24.665293511843462,
"grad_norm": 0.4621998369693756,
"learning_rate": 1.234549832984655e-09,
"loss": 0.0255,
"step": 23950
},
{
"epoch": 24.675592173017506,
"grad_norm": 0.18405234813690186,
"learning_rate": 7.978782352924264e-10,
"loss": 0.0248,
"step": 23960
},
{
"epoch": 24.685890834191554,
"grad_norm": 0.18703693151474,
"learning_rate": 4.5613435905589305e-10,
"loss": 0.0235,
"step": 23970
},
{
"epoch": 24.6961894953656,
"grad_norm": 0.3629539906978607,
"learning_rate": 2.0931885311159526e-10,
"loss": 0.0247,
"step": 23980
},
{
"epoch": 24.70648815653965,
"grad_norm": 0.3040322959423065,
"learning_rate": 5.743218605136491e-11,
"loss": 0.0226,
"step": 23990
},
{
"epoch": 24.716786817713697,
"grad_norm": 0.2292143851518631,
"learning_rate": 4.746462556326492e-13,
"loss": 0.0208,
"step": 24000
},
{
"epoch": 24.716786817713697,
"step": 24000,
"total_flos": 0.0,
"train_loss": 0.04103807942320903,
"train_runtime": 10175.0236,
"train_samples_per_second": 75.479,
"train_steps_per_second": 2.359
}
],
"logging_steps": 10,
"max_steps": 24000,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}