{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03028773346794548, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 355.625, "epoch": 1.0095911155981828e-05, "grad_norm": 2.368061742658887, "kl": 0.0, "learning_rate": 9.999999997485042e-07, "loss": 0.0, "reward": 1.9210001230239868, "reward_std": 0.12013404071331024, "rewards/accuracy_reward": 0.814750075340271, "rewards/format_reward": 1.0, "step": 1 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 365.625, "epoch": 2.0191822311963656e-05, "grad_norm": 3.0170777777266298, "kl": 0.00067138671875, "learning_rate": 9.999999989940166e-07, "loss": 0.0, "reward": 1.9205312728881836, "reward_std": 0.12367895990610123, "rewards/accuracy_reward": 0.7580312490463257, "rewards/format_reward": 1.0, "step": 2 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 381.71875, "epoch": 3.028773346794548e-05, "grad_norm": 2.113975521801714, "kl": 0.000701904296875, "learning_rate": 9.999999977365373e-07, "loss": 0.0, "reward": 1.979875087738037, "reward_std": 0.11240961402654648, "rewards/accuracy_reward": 0.8048749566078186, "rewards/format_reward": 1.0, "step": 3 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 451.28125, "epoch": 4.038364462392731e-05, "grad_norm": 2.0051846414416525, "kl": 0.000640869140625, "learning_rate": 9.999999959760666e-07, "loss": 0.0, "reward": 1.8012187480926514, "reward_std": 0.35452377796173096, "rewards/accuracy_reward": 0.6887187361717224, "rewards/format_reward": 1.0, "step": 4 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.09375, "epoch": 5.047955577990914e-05, "grad_norm": 2.396231577448834, "kl": 0.000701904296875, "learning_rate": 9.999999937126042e-07, "loss": 0.0, "reward": 1.987375020980835, "reward_std": 0.1965545415878296, "rewards/accuracy_reward": 0.8811250329017639, "rewards/format_reward": 0.96875, "step": 5 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.5, "epoch": 6.057546693589096e-05, "grad_norm": 1.916522444486206, "kl": 0.000843048095703125, "learning_rate": 9.999999909461501e-07, "loss": 0.0, "reward": 1.742437481880188, "reward_std": 0.10457907617092133, "rewards/accuracy_reward": 0.6361874938011169, "rewards/format_reward": 1.0, "step": 6 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.375, "epoch": 7.06713780918728e-05, "grad_norm": 2.1652877663977317, "kl": 0.0009307861328125, "learning_rate": 9.999999876767042e-07, "loss": 0.0, "reward": 1.7994999885559082, "reward_std": 0.30147784948349, "rewards/accuracy_reward": 0.6495000123977661, "rewards/format_reward": 1.0, "step": 7 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 348.15625, "epoch": 8.076728924785463e-05, "grad_norm": 2.169181601204912, "kl": 0.00128936767578125, "learning_rate": 9.999999839042668e-07, "loss": 0.0001, "reward": 1.9187812805175781, "reward_std": 0.2009730041027069, "rewards/accuracy_reward": 0.7625312209129333, "rewards/format_reward": 1.0, "step": 8 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.5625, "epoch": 9.086320040383644e-05, "grad_norm": 2.085617536023113, "kl": 0.0012054443359375, "learning_rate": 9.999999796288377e-07, "loss": 0.0, "reward": 2.0006561279296875, "reward_std": 0.1299782246351242, "rewards/accuracy_reward": 0.8194062113761902, "rewards/format_reward": 1.0, "step": 9 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 390.53125, "epoch": 0.00010095911155981827, "grad_norm": 2.4551579423063936, "kl": 0.00148773193359375, "learning_rate": 9.99999974850417e-07, "loss": 0.0001, "reward": 1.9224687814712524, "reward_std": 0.20108568668365479, "rewards/accuracy_reward": 0.7537187337875366, "rewards/format_reward": 1.0, "step": 10 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 379.84375, "epoch": 0.0001110550227158001, "grad_norm": 4.00483703890344, "kl": 0.00177001953125, "learning_rate": 9.999999695690045e-07, "loss": 0.0001, "reward": 1.8878438472747803, "reward_std": 0.14083829522132874, "rewards/accuracy_reward": 0.7315937280654907, "rewards/format_reward": 1.0, "step": 11 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 401.15625, "epoch": 0.00012115093387178192, "grad_norm": 2.438620785162263, "kl": 0.0021820068359375, "learning_rate": 9.999999637846004e-07, "loss": 0.0001, "reward": 1.675874948501587, "reward_std": 0.13218040764331818, "rewards/accuracy_reward": 0.5696250200271606, "rewards/format_reward": 1.0, "step": 12 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 411.0, "epoch": 0.00013124684502776376, "grad_norm": 1.9821337074273262, "kl": 0.002777099609375, "learning_rate": 9.999999574972046e-07, "loss": 0.0001, "reward": 1.7541875839233398, "reward_std": 0.1342015564441681, "rewards/accuracy_reward": 0.6416875123977661, "rewards/format_reward": 1.0, "step": 13 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 440.28125, "epoch": 0.0001413427561837456, "grad_norm": 2.4101139741829027, "kl": 0.0027618408203125, "learning_rate": 9.999999507068174e-07, "loss": 0.0001, "reward": 1.6816563606262207, "reward_std": 0.09342411160469055, "rewards/accuracy_reward": 0.5441562533378601, "rewards/format_reward": 1.0, "step": 14 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 374.375, "epoch": 0.00015143866733972742, "grad_norm": 1.9630973446804931, "kl": 0.0031890869140625, "learning_rate": 9.999999434134385e-07, "loss": 0.0001, "reward": 1.5666249990463257, "reward_std": 0.15209373831748962, "rewards/accuracy_reward": 0.5103750228881836, "rewards/format_reward": 1.0, "step": 15 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 377.21875, "epoch": 0.00016153457849570925, "grad_norm": 2.3306965155436297, "kl": 0.00439453125, "learning_rate": 9.99999935617068e-07, "loss": 0.0002, "reward": 1.9698750972747803, "reward_std": 0.1319195032119751, "rewards/accuracy_reward": 0.8323749899864197, "rewards/format_reward": 1.0, "step": 16 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 454.90625, "epoch": 0.00017163048965169106, "grad_norm": 2.1403911834610443, "kl": 0.004119873046875, "learning_rate": 9.99999927317706e-07, "loss": 0.0002, "reward": 1.810156226158142, "reward_std": 0.2651243209838867, "rewards/accuracy_reward": 0.678906261920929, "rewards/format_reward": 1.0, "step": 17 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 443.3125, "epoch": 0.0001817264008076729, "grad_norm": 1.7140601857914162, "kl": 0.0042724609375, "learning_rate": 9.99999918515352e-07, "loss": 0.0002, "reward": 1.7944687604904175, "reward_std": 0.0802764892578125, "rewards/accuracy_reward": 0.6819686889648438, "rewards/format_reward": 1.0, "step": 18 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.00019182231196365472, "grad_norm": 2.13731652774118, "kl": 0.00537109375, "learning_rate": 9.99999909210007e-07, "loss": 0.0002, "reward": 1.9116876125335693, "reward_std": 0.15367430448532104, "rewards/accuracy_reward": 0.7366874814033508, "rewards/format_reward": 1.0, "step": 19 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 429.1875, "epoch": 0.00020191822311963655, "grad_norm": 1.9343829094793508, "kl": 0.005462646484375, "learning_rate": 9.999998994016701e-07, "loss": 0.0002, "reward": 1.8875312805175781, "reward_std": 0.15924495458602905, "rewards/accuracy_reward": 0.7250312566757202, "rewards/format_reward": 1.0, "step": 20 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.125, "epoch": 0.00021201413427561838, "grad_norm": 2.808299467248588, "kl": 0.006072998046875, "learning_rate": 9.999998890903417e-07, "loss": 0.0002, "reward": 2.073625087738037, "reward_std": 0.10110638290643692, "rewards/accuracy_reward": 0.8798750638961792, "rewards/format_reward": 1.0, "step": 21 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.53125, "epoch": 0.0002221100454316002, "grad_norm": 4.1370140391167345, "kl": 0.003875732421875, "learning_rate": 9.999998782760215e-07, "loss": 0.0002, "reward": 1.856406331062317, "reward_std": 0.22319602966308594, "rewards/accuracy_reward": 0.7189062833786011, "rewards/format_reward": 1.0, "step": 22 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.46875, "epoch": 0.00023220595658758202, "grad_norm": 1.9804023959478025, "kl": 0.004669189453125, "learning_rate": 9.9999986695871e-07, "loss": 0.0002, "reward": 1.8367812633514404, "reward_std": 0.1647380292415619, "rewards/accuracy_reward": 0.7117812633514404, "rewards/format_reward": 1.0, "step": 23 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 446.65625, "epoch": 0.00024230186774356385, "grad_norm": 1.7582732200797728, "kl": 0.00506591796875, "learning_rate": 9.99999855138407e-07, "loss": 0.0002, "reward": 2.0201876163482666, "reward_std": 0.18980096280574799, "rewards/accuracy_reward": 0.8576874732971191, "rewards/format_reward": 1.0, "step": 24 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 442.28125, "epoch": 0.0002523977788995457, "grad_norm": 2.0906811581063867, "kl": 0.0047607421875, "learning_rate": 9.999998428151123e-07, "loss": 0.0002, "reward": 1.753000020980835, "reward_std": 0.09714198112487793, "rewards/accuracy_reward": 0.6154999732971191, "rewards/format_reward": 1.0, "step": 25 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 431.71875, "epoch": 0.0002624936900555275, "grad_norm": 2.267487285904402, "kl": 0.0062255859375, "learning_rate": 9.999998299888263e-07, "loss": 0.0002, "reward": 1.9801876544952393, "reward_std": 0.12151825428009033, "rewards/accuracy_reward": 0.8176875114440918, "rewards/format_reward": 1.0, "step": 26 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.71875, "epoch": 0.00027258960121150934, "grad_norm": 1.8696928038407707, "kl": 0.0067138671875, "learning_rate": 9.999998166595487e-07, "loss": 0.0003, "reward": 2.0635311603546143, "reward_std": 0.10327333211898804, "rewards/accuracy_reward": 0.8885312676429749, "rewards/format_reward": 1.0, "step": 27 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.84375, "epoch": 0.0002826855123674912, "grad_norm": 2.160641762177654, "kl": 0.00701904296875, "learning_rate": 9.999998028272796e-07, "loss": 0.0003, "reward": 1.9147188663482666, "reward_std": 0.11533698439598083, "rewards/accuracy_reward": 0.7209687232971191, "rewards/format_reward": 1.0, "step": 28 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.4375, "epoch": 0.000292781423523473, "grad_norm": 1.8202301346194845, "kl": 0.004913330078125, "learning_rate": 9.99999788492019e-07, "loss": 0.0002, "reward": 1.843937635421753, "reward_std": 0.07164421677589417, "rewards/accuracy_reward": 0.7189375162124634, "rewards/format_reward": 1.0, "step": 29 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 441.375, "epoch": 0.00030287733467945484, "grad_norm": 1.8251324841509673, "kl": 0.00531005859375, "learning_rate": 9.99999773653767e-07, "loss": 0.0002, "reward": 2.0075936317443848, "reward_std": 0.19453352689743042, "rewards/accuracy_reward": 0.8325937986373901, "rewards/format_reward": 1.0, "step": 30 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 424.46875, "epoch": 0.00031297324583543667, "grad_norm": 2.314396144612979, "kl": 0.00592041015625, "learning_rate": 9.999997583125235e-07, "loss": 0.0002, "reward": 2.045468807220459, "reward_std": 0.1887637972831726, "rewards/accuracy_reward": 0.8767187595367432, "rewards/format_reward": 1.0, "step": 31 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 452.75, "epoch": 0.0003230691569914185, "grad_norm": 2.0988881605906466, "kl": 0.00421142578125, "learning_rate": 9.999997424682886e-07, "loss": 0.0002, "reward": 1.885312557220459, "reward_std": 0.25123435258865356, "rewards/accuracy_reward": 0.7478125095367432, "rewards/format_reward": 1.0, "step": 32 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.0625, "epoch": 0.0003331650681474003, "grad_norm": 2.255049319435565, "kl": 0.006439208984375, "learning_rate": 9.999997261210624e-07, "loss": 0.0003, "reward": 1.9644999504089355, "reward_std": 0.27120882272720337, "rewards/accuracy_reward": 0.8270000219345093, "rewards/format_reward": 1.0, "step": 33 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.75, "epoch": 0.0003432609793033821, "grad_norm": 4.939215667809582, "kl": 0.00830078125, "learning_rate": 9.999997092708448e-07, "loss": 0.0003, "reward": 1.924562692642212, "reward_std": 0.11284403502941132, "rewards/accuracy_reward": 0.7308124899864197, "rewards/format_reward": 1.0, "step": 34 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.3125, "epoch": 0.00035335689045936394, "grad_norm": 4.732491614625869, "kl": 0.00750732421875, "learning_rate": 9.999996919176357e-07, "loss": 0.0003, "reward": 1.9546562433242798, "reward_std": 0.15749983489513397, "rewards/accuracy_reward": 0.8046562671661377, "rewards/format_reward": 1.0, "step": 35 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 420.03125, "epoch": 0.0003634528016153458, "grad_norm": 2.1212833849670165, "kl": 0.00726318359375, "learning_rate": 9.999996740614353e-07, "loss": 0.0003, "reward": 1.9275625944137573, "reward_std": 0.23702609539031982, "rewards/accuracy_reward": 0.7775624990463257, "rewards/format_reward": 1.0, "step": 36 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 455.25, "epoch": 0.0003735487127713276, "grad_norm": 1.8481574725204435, "kl": 0.006134033203125, "learning_rate": 9.999996557022435e-07, "loss": 0.0002, "reward": 1.7318438291549683, "reward_std": 0.26594001054763794, "rewards/accuracy_reward": 0.5880937576293945, "rewards/format_reward": 1.0, "step": 37 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.6875, "epoch": 0.00038364462392730944, "grad_norm": 2.0009034699516386, "kl": 0.00750732421875, "learning_rate": 9.999996368400605e-07, "loss": 0.0003, "reward": 2.0066561698913574, "reward_std": 0.18551787734031677, "rewards/accuracy_reward": 0.8379062414169312, "rewards/format_reward": 1.0, "step": 38 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.46875, "epoch": 0.00039374053508329127, "grad_norm": 2.116385944038055, "kl": 0.00872802734375, "learning_rate": 9.999996174748862e-07, "loss": 0.0003, "reward": 2.0709376335144043, "reward_std": 0.0929776057600975, "rewards/accuracy_reward": 0.8709374666213989, "rewards/format_reward": 1.0, "step": 39 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.3125, "epoch": 0.0004038364462392731, "grad_norm": 1.7440213311831585, "kl": 0.007415771484375, "learning_rate": 9.999995976067207e-07, "loss": 0.0003, "reward": 1.7612812519073486, "reward_std": 0.08131412416696548, "rewards/accuracy_reward": 0.6362812519073486, "rewards/format_reward": 1.0, "step": 40 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 423.6875, "epoch": 0.00041393235739525493, "grad_norm": 1.6111585056456865, "kl": 0.008544921875, "learning_rate": 9.999995772355637e-07, "loss": 0.0003, "reward": 1.5077812671661377, "reward_std": 0.05196709930896759, "rewards/accuracy_reward": 0.4077812433242798, "rewards/format_reward": 1.0, "step": 41 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.125, "epoch": 0.00042402826855123676, "grad_norm": 2.302380666084948, "kl": 0.00634765625, "learning_rate": 9.999995563614154e-07, "loss": 0.0003, "reward": 1.85421884059906, "reward_std": 0.2088339477777481, "rewards/accuracy_reward": 0.7292187213897705, "rewards/format_reward": 1.0, "step": 42 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 457.875, "epoch": 0.0004341241797072186, "grad_norm": 1.4104286593656923, "kl": 0.006317138671875, "learning_rate": 9.999995349842762e-07, "loss": 0.0003, "reward": 1.2686874866485596, "reward_std": 0.11449584364891052, "rewards/accuracy_reward": 0.218687504529953, "rewards/format_reward": 1.0, "step": 43 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 451.78125, "epoch": 0.0004442200908632004, "grad_norm": 2.572056088450461, "kl": 0.009033203125, "learning_rate": 9.999995131041457e-07, "loss": 0.0004, "reward": 1.6210625171661377, "reward_std": 0.21261611580848694, "rewards/accuracy_reward": 0.5148124694824219, "rewards/format_reward": 1.0, "step": 44 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.5, "epoch": 0.00045431600201918226, "grad_norm": 2.4037139828397183, "kl": 0.0091552734375, "learning_rate": 9.999994907210239e-07, "loss": 0.0004, "reward": 2.039249897003174, "reward_std": 0.1292666345834732, "rewards/accuracy_reward": 0.8642500042915344, "rewards/format_reward": 1.0, "step": 45 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 432.8125, "epoch": 0.00046441191317516404, "grad_norm": 1.9552127943933195, "kl": 0.00958251953125, "learning_rate": 9.999994678349109e-07, "loss": 0.0004, "reward": 1.7881561517715454, "reward_std": 0.11242731660604477, "rewards/accuracy_reward": 0.656906247138977, "rewards/format_reward": 1.0, "step": 46 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 448.78125, "epoch": 0.00047450782433114587, "grad_norm": 2.049088983420436, "kl": 0.00897216796875, "learning_rate": 9.999994444458068e-07, "loss": 0.0004, "reward": 1.6592187881469727, "reward_std": 0.16417738795280457, "rewards/accuracy_reward": 0.52796870470047, "rewards/format_reward": 1.0, "step": 47 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 443.5625, "epoch": 0.0004846037354871277, "grad_norm": 2.151699303719901, "kl": 0.005859375, "learning_rate": 9.999994205537118e-07, "loss": 0.0002, "reward": 1.8533438444137573, "reward_std": 0.3451181650161743, "rewards/accuracy_reward": 0.7095937728881836, "rewards/format_reward": 1.0, "step": 48 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 466.34375, "epoch": 0.0004946996466431095, "grad_norm": 1.8868060930582558, "kl": 0.008056640625, "learning_rate": 9.999993961586256e-07, "loss": 0.0003, "reward": 1.5635312795639038, "reward_std": 0.296091228723526, "rewards/accuracy_reward": 0.4572812616825104, "rewards/format_reward": 1.0, "step": 49 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 441.96875, "epoch": 0.0005047955577990914, "grad_norm": 2.797136263177468, "kl": 0.00860595703125, "learning_rate": 9.999993712605483e-07, "loss": 0.0003, "reward": 1.7207812070846558, "reward_std": 0.083226278424263, "rewards/accuracy_reward": 0.5895312428474426, "rewards/format_reward": 1.0, "step": 50 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 436.59375, "epoch": 0.0005148914689550732, "grad_norm": 2.1252306243782746, "kl": 0.0084228515625, "learning_rate": 9.9999934585948e-07, "loss": 0.0003, "reward": 2.011906147003174, "reward_std": 0.21333274245262146, "rewards/accuracy_reward": 0.8556561470031738, "rewards/format_reward": 1.0, "step": 51 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 446.09375, "epoch": 0.000524987380111055, "grad_norm": 2.0180594760696544, "kl": 0.00921630859375, "learning_rate": 9.999993199554207e-07, "loss": 0.0004, "reward": 1.8533437252044678, "reward_std": 0.29387766122817993, "rewards/accuracy_reward": 0.7283437252044678, "rewards/format_reward": 1.0, "step": 52 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.3125, "epoch": 0.0005350832912670369, "grad_norm": 1.8890560526973472, "kl": 0.00750732421875, "learning_rate": 9.999992935483704e-07, "loss": 0.0003, "reward": 1.5239062309265137, "reward_std": 0.35616573691368103, "rewards/accuracy_reward": 0.4489062428474426, "rewards/format_reward": 1.0, "step": 53 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.28125, "epoch": 0.0005451792024230187, "grad_norm": 2.468191613790196, "kl": 0.012451171875, "learning_rate": 9.999992666383292e-07, "loss": 0.0005, "reward": 1.9348437786102295, "reward_std": 0.10478056967258453, "rewards/accuracy_reward": 0.7535936832427979, "rewards/format_reward": 1.0, "step": 54 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.0625, "epoch": 0.0005552751135790005, "grad_norm": 1.5486968487579935, "kl": 0.0089111328125, "learning_rate": 9.999992392252968e-07, "loss": 0.0004, "reward": 2.0166563987731934, "reward_std": 0.15882062911987305, "rewards/accuracy_reward": 0.8541562557220459, "rewards/format_reward": 1.0, "step": 55 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.5625, "epoch": 0.0005653710247349824, "grad_norm": 1.9487818046516763, "kl": 0.01055908203125, "learning_rate": 9.99999211309274e-07, "loss": 0.0004, "reward": 1.7640937566757202, "reward_std": 0.14476224780082703, "rewards/accuracy_reward": 0.6203437447547913, "rewards/format_reward": 1.0, "step": 56 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 441.875, "epoch": 0.0005754669358909642, "grad_norm": 2.183463498465026, "kl": 0.01251220703125, "learning_rate": 9.9999918289026e-07, "loss": 0.0005, "reward": 1.7802499532699585, "reward_std": 0.1628354787826538, "rewards/accuracy_reward": 0.6427500247955322, "rewards/format_reward": 1.0, "step": 57 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.71875, "epoch": 0.000585562847046946, "grad_norm": 1.8880694982806163, "kl": 0.011962890625, "learning_rate": 9.999991539682552e-07, "loss": 0.0005, "reward": 2.0869998931884766, "reward_std": 0.06415347009897232, "rewards/accuracy_reward": 0.8932501077651978, "rewards/format_reward": 1.0, "step": 58 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 440.625, "epoch": 0.0005956587582029278, "grad_norm": 1.6160560431867699, "kl": 0.01556396484375, "learning_rate": 9.999991245432596e-07, "loss": 0.0006, "reward": 1.8336875438690186, "reward_std": 0.1797502338886261, "rewards/accuracy_reward": 0.6836875081062317, "rewards/format_reward": 1.0, "step": 59 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 426.84375, "epoch": 0.0006057546693589097, "grad_norm": 1.941340505839146, "kl": 0.01239013671875, "learning_rate": 9.999990946152731e-07, "loss": 0.0005, "reward": 2.0028748512268066, "reward_std": 0.17697665095329285, "rewards/accuracy_reward": 0.827875018119812, "rewards/format_reward": 1.0, "step": 60 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 387.25, "epoch": 0.0006158505805148915, "grad_norm": 2.05670629296684, "kl": 0.0169677734375, "learning_rate": 9.999990641842959e-07, "loss": 0.0007, "reward": 1.9340312480926514, "reward_std": 0.24620959162712097, "rewards/accuracy_reward": 0.8027812242507935, "rewards/format_reward": 0.9375, "step": 61 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 419.78125, "epoch": 0.0006259464916708733, "grad_norm": 1.5801256428026726, "kl": 0.0108642578125, "learning_rate": 9.99999033250328e-07, "loss": 0.0004, "reward": 1.598062515258789, "reward_std": 0.18506810069084167, "rewards/accuracy_reward": 0.5043125152587891, "rewards/format_reward": 1.0, "step": 62 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 442.09375, "epoch": 0.0006360424028268552, "grad_norm": 1.5559182059723595, "kl": 0.01043701171875, "learning_rate": 9.999990018133696e-07, "loss": 0.0004, "reward": 1.6814687252044678, "reward_std": 0.24735836684703827, "rewards/accuracy_reward": 0.5689687132835388, "rewards/format_reward": 1.0, "step": 63 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.4375, "epoch": 0.000646138313982837, "grad_norm": 2.097352965134568, "kl": 0.01434326171875, "learning_rate": 9.999989698734203e-07, "loss": 0.0006, "reward": 1.9532500505447388, "reward_std": 0.09886159002780914, "rewards/accuracy_reward": 0.7720000147819519, "rewards/format_reward": 1.0, "step": 64 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 399.65625, "epoch": 0.0006562342251388187, "grad_norm": 1.7687555176871348, "kl": 0.014892578125, "learning_rate": 9.999989374304804e-07, "loss": 0.0006, "reward": 1.777500033378601, "reward_std": 0.03819815441966057, "rewards/accuracy_reward": 0.6274999976158142, "rewards/format_reward": 1.0, "step": 65 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.0, "epoch": 0.0006663301362948006, "grad_norm": 2.329611591824057, "kl": 0.017333984375, "learning_rate": 9.999989044845499e-07, "loss": 0.0007, "reward": 1.9665000438690186, "reward_std": 0.10015924274921417, "rewards/accuracy_reward": 0.7790000438690186, "rewards/format_reward": 1.0, "step": 66 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.59375, "epoch": 0.0006764260474507824, "grad_norm": 2.6884395250887074, "kl": 0.0194091796875, "learning_rate": 9.99998871035629e-07, "loss": 0.0008, "reward": 1.8846561908721924, "reward_std": 0.07940257340669632, "rewards/accuracy_reward": 0.690906286239624, "rewards/format_reward": 1.0, "step": 67 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.0006865219586067642, "grad_norm": 2.384756754867785, "kl": 0.0167236328125, "learning_rate": 9.999988370837174e-07, "loss": 0.0007, "reward": 2.0735936164855957, "reward_std": 0.06873105466365814, "rewards/accuracy_reward": 0.8985937833786011, "rewards/format_reward": 1.0, "step": 68 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.000696617869762746, "grad_norm": 2.1564378506986386, "kl": 0.0152587890625, "learning_rate": 9.999988026288153e-07, "loss": 0.0006, "reward": 1.8704688549041748, "reward_std": 0.07262495160102844, "rewards/accuracy_reward": 0.6704687476158142, "rewards/format_reward": 1.0, "step": 69 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 389.46875, "epoch": 0.0007067137809187279, "grad_norm": 2.101254532533232, "kl": 0.014892578125, "learning_rate": 9.999987676709228e-07, "loss": 0.0006, "reward": 1.8146250247955322, "reward_std": 0.25082337856292725, "rewards/accuracy_reward": 0.6833750009536743, "rewards/format_reward": 1.0, "step": 70 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.53125, "epoch": 0.0007168096920747097, "grad_norm": 7.033493831976905, "kl": 0.01458740234375, "learning_rate": 9.999987322100396e-07, "loss": 0.0006, "reward": 1.8094062805175781, "reward_std": 0.21911081671714783, "rewards/accuracy_reward": 0.6531562805175781, "rewards/format_reward": 1.0, "step": 71 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.34375, "epoch": 0.0007269056032306915, "grad_norm": 3.130533054307358, "kl": 0.0179443359375, "learning_rate": 9.999986962461663e-07, "loss": 0.0007, "reward": 1.990187644958496, "reward_std": 0.14125201106071472, "rewards/accuracy_reward": 0.8339374661445618, "rewards/format_reward": 0.96875, "step": 72 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.375, "epoch": 0.0007370015143866734, "grad_norm": 2.147476284135236, "kl": 0.0152587890625, "learning_rate": 9.999986597793026e-07, "loss": 0.0006, "reward": 2.0234062671661377, "reward_std": 0.10022676736116409, "rewards/accuracy_reward": 0.8296562433242798, "rewards/format_reward": 1.0, "step": 73 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 394.96875, "epoch": 0.0007470974255426552, "grad_norm": 2.1995824572437788, "kl": 0.0152587890625, "learning_rate": 9.999986228094487e-07, "loss": 0.0006, "reward": 1.9345312118530273, "reward_std": 0.11458326876163483, "rewards/accuracy_reward": 0.740781307220459, "rewards/format_reward": 1.0, "step": 74 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.9375, "epoch": 0.000757193336698637, "grad_norm": 2.781930673245086, "kl": 0.01422119140625, "learning_rate": 9.999985853366043e-07, "loss": 0.0006, "reward": 2.1572189331054688, "reward_std": 0.07687188684940338, "rewards/accuracy_reward": 0.9697187542915344, "rewards/format_reward": 1.0, "step": 75 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.03125, "epoch": 0.0007672892478546189, "grad_norm": 2.1594687879825814, "kl": 0.014892578125, "learning_rate": 9.999985473607698e-07, "loss": 0.0006, "reward": 1.9817500114440918, "reward_std": 0.10254956781864166, "rewards/accuracy_reward": 0.7880000472068787, "rewards/format_reward": 1.0, "step": 76 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.4375, "epoch": 0.0007773851590106007, "grad_norm": 2.0063140095312, "kl": 0.0184326171875, "learning_rate": 9.999985088819452e-07, "loss": 0.0007, "reward": 1.9738125801086426, "reward_std": 0.12812094390392303, "rewards/accuracy_reward": 0.8113124966621399, "rewards/format_reward": 1.0, "step": 77 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.0007874810701665825, "grad_norm": 2.4041101054154588, "kl": 0.0179443359375, "learning_rate": 9.999984699001303e-07, "loss": 0.0007, "reward": 2.093937397003174, "reward_std": 0.07433228939771652, "rewards/accuracy_reward": 0.8939375281333923, "rewards/format_reward": 1.0, "step": 78 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 431.5, "epoch": 0.0007975769813225644, "grad_norm": 2.0090020415309184, "kl": 0.015380859375, "learning_rate": 9.999984304153252e-07, "loss": 0.0006, "reward": 1.9322187900543213, "reward_std": 0.22017724812030792, "rewards/accuracy_reward": 0.7697187662124634, "rewards/format_reward": 1.0, "step": 79 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 441.21875, "epoch": 0.0008076728924785462, "grad_norm": 1.9576216517561602, "kl": 0.017333984375, "learning_rate": 9.9999839042753e-07, "loss": 0.0007, "reward": 2.0342812538146973, "reward_std": 0.10042206197977066, "rewards/accuracy_reward": 0.8592813014984131, "rewards/format_reward": 1.0, "step": 80 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.34375, "epoch": 0.000817768803634528, "grad_norm": 2.0423662692060387, "kl": 0.019775390625, "learning_rate": 9.999983499367447e-07, "loss": 0.0008, "reward": 1.863687515258789, "reward_std": 0.10455750674009323, "rewards/accuracy_reward": 0.6824374198913574, "rewards/format_reward": 1.0, "step": 81 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.90625, "epoch": 0.0008278647147905099, "grad_norm": 2.097569520804077, "kl": 0.0174560546875, "learning_rate": 9.999983089429698e-07, "loss": 0.0007, "reward": 1.9606250524520874, "reward_std": 0.08835773915052414, "rewards/accuracy_reward": 0.7793749570846558, "rewards/format_reward": 1.0, "step": 82 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 438.46875, "epoch": 0.0008379606259464917, "grad_norm": 1.5196659132620416, "kl": 0.012939453125, "learning_rate": 9.999982674462046e-07, "loss": 0.0005, "reward": 1.2775624990463257, "reward_std": 0.10693497955799103, "rewards/accuracy_reward": 0.22756250202655792, "rewards/format_reward": 1.0, "step": 83 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 444.84375, "epoch": 0.0008480565371024735, "grad_norm": 1.516043967292656, "kl": 0.013427734375, "learning_rate": 9.999982254464494e-07, "loss": 0.0005, "reward": 1.6209375858306885, "reward_std": 0.2147689312696457, "rewards/accuracy_reward": 0.4959375262260437, "rewards/format_reward": 1.0, "step": 84 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 433.6875, "epoch": 0.0008581524482584554, "grad_norm": 2.0329765230312593, "kl": 0.0191650390625, "learning_rate": 9.999981829437045e-07, "loss": 0.0008, "reward": 2.0855002403259277, "reward_std": 0.10030870139598846, "rewards/accuracy_reward": 0.9167499542236328, "rewards/format_reward": 1.0, "step": 85 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.0008682483594144372, "grad_norm": 1.7152088937707408, "kl": 0.01513671875, "learning_rate": 9.999981399379696e-07, "loss": 0.0006, "reward": 2.0687501430511475, "reward_std": 0.18542763590812683, "rewards/accuracy_reward": 0.887499988079071, "rewards/format_reward": 1.0, "step": 86 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 446.21875, "epoch": 0.000878344270570419, "grad_norm": 1.8522391337501503, "kl": 0.0205078125, "learning_rate": 9.999980964292452e-07, "loss": 0.0008, "reward": 2.0318126678466797, "reward_std": 0.0768652856349945, "rewards/accuracy_reward": 0.8443124294281006, "rewards/format_reward": 1.0, "step": 87 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 449.34375, "epoch": 0.0008884401817264009, "grad_norm": 16.90729234575583, "kl": 0.0196533203125, "learning_rate": 9.999980524175308e-07, "loss": 0.0008, "reward": 1.951562523841858, "reward_std": 0.07704654335975647, "rewards/accuracy_reward": 0.7828124761581421, "rewards/format_reward": 1.0, "step": 88 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 455.25, "epoch": 0.0008985360928823827, "grad_norm": 1.9468418238659153, "kl": 0.01531982421875, "learning_rate": 9.999980079028268e-07, "loss": 0.0006, "reward": 1.5295937061309814, "reward_std": 0.3098824918270111, "rewards/accuracy_reward": 0.41709375381469727, "rewards/format_reward": 1.0, "step": 89 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 452.25, "epoch": 0.0009086320040383645, "grad_norm": 1.447608553902692, "kl": 0.0174560546875, "learning_rate": 9.999979628851329e-07, "loss": 0.0007, "reward": 1.7379374504089355, "reward_std": 0.06024404615163803, "rewards/accuracy_reward": 0.6004374623298645, "rewards/format_reward": 1.0, "step": 90 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 436.3125, "epoch": 0.0009187279151943462, "grad_norm": 1.6505479497433888, "kl": 0.0179443359375, "learning_rate": 9.999979173644498e-07, "loss": 0.0007, "reward": 1.9446876049041748, "reward_std": 0.2149237096309662, "rewards/accuracy_reward": 0.7696874737739563, "rewards/format_reward": 1.0, "step": 91 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.09375, "epoch": 0.0009288238263503281, "grad_norm": 1.8512042437240182, "kl": 0.0179443359375, "learning_rate": 9.999978713407768e-07, "loss": 0.0007, "reward": 2.0464062690734863, "reward_std": 0.18979759514331818, "rewards/accuracy_reward": 0.8776562213897705, "rewards/format_reward": 1.0, "step": 92 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 444.75, "epoch": 0.0009389197375063099, "grad_norm": 1.653075159747783, "kl": 0.0191650390625, "learning_rate": 9.999978248141144e-07, "loss": 0.0008, "reward": 1.9172813892364502, "reward_std": 0.08200256526470184, "rewards/accuracy_reward": 0.7360312342643738, "rewards/format_reward": 1.0, "step": 93 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 425.28125, "epoch": 0.0009490156486622917, "grad_norm": 2.030175939205058, "kl": 0.0233154296875, "learning_rate": 9.999977777844625e-07, "loss": 0.0009, "reward": 2.016749858856201, "reward_std": 0.08863753080368042, "rewards/accuracy_reward": 0.8292499780654907, "rewards/format_reward": 1.0, "step": 94 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.65625, "epoch": 0.0009591115598182736, "grad_norm": 2.069873781559919, "kl": 0.0225830078125, "learning_rate": 9.999977302518212e-07, "loss": 0.0009, "reward": 1.7424062490463257, "reward_std": 0.09342799335718155, "rewards/accuracy_reward": 0.5549062490463257, "rewards/format_reward": 1.0, "step": 95 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 429.8125, "epoch": 0.0009692074709742554, "grad_norm": 3.837828805898913, "kl": 0.0145263671875, "learning_rate": 9.999976822161904e-07, "loss": 0.0006, "reward": 1.7484687566757202, "reward_std": 0.33927011489868164, "rewards/accuracy_reward": 0.6047187447547913, "rewards/format_reward": 1.0, "step": 96 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 447.78125, "epoch": 0.0009793033821302373, "grad_norm": 1.8700770067271284, "kl": 0.016357421875, "learning_rate": 9.999976336775704e-07, "loss": 0.0007, "reward": 1.917062520980835, "reward_std": 0.20710881054401398, "rewards/accuracy_reward": 0.7483124732971191, "rewards/format_reward": 1.0, "step": 97 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.09375, "epoch": 0.000989399293286219, "grad_norm": 2.8051931002317345, "kl": 0.0172119140625, "learning_rate": 9.999975846359609e-07, "loss": 0.0007, "reward": 1.9335312843322754, "reward_std": 0.1906992346048355, "rewards/accuracy_reward": 0.7585312128067017, "rewards/format_reward": 1.0, "step": 98 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.1875, "epoch": 0.000999495204442201, "grad_norm": 4.500047692609386, "kl": 0.0213623046875, "learning_rate": 9.999975350913625e-07, "loss": 0.0009, "reward": 1.9769062995910645, "reward_std": 0.06583760678768158, "rewards/accuracy_reward": 0.7769061326980591, "rewards/format_reward": 1.0, "step": 99 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.75, "epoch": 0.0010095911155981827, "grad_norm": 1.8518818186406065, "kl": 0.01708984375, "learning_rate": 9.999974850437747e-07, "loss": 0.0007, "reward": 2.080749988555908, "reward_std": 0.05636772885918617, "rewards/accuracy_reward": 0.8932499885559082, "rewards/format_reward": 1.0, "step": 100 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.0010196870267541647, "grad_norm": 2.6570028026149792, "kl": 0.0211181640625, "learning_rate": 9.999974344931978e-07, "loss": 0.0008, "reward": 2.0512499809265137, "reward_std": 0.09541171789169312, "rewards/accuracy_reward": 0.8762500286102295, "rewards/format_reward": 1.0, "step": 101 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 397.28125, "epoch": 0.0010297829379101464, "grad_norm": 2.558892973029303, "kl": 0.01904296875, "learning_rate": 9.999973834396318e-07, "loss": 0.0008, "reward": 1.4895000457763672, "reward_std": 0.0316481776535511, "rewards/accuracy_reward": 0.3957499861717224, "rewards/format_reward": 1.0, "step": 102 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 426.5625, "epoch": 0.0010398788490661283, "grad_norm": 13.749858510317859, "kl": 0.01904296875, "learning_rate": 9.999973318830769e-07, "loss": 0.0008, "reward": 1.8925626277923584, "reward_std": 0.06408590823411942, "rewards/accuracy_reward": 0.6988124847412109, "rewards/format_reward": 1.0, "step": 103 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.00104997476022211, "grad_norm": 2.129836190395455, "kl": 0.023681640625, "learning_rate": 9.999972798235328e-07, "loss": 0.0009, "reward": 1.9890000820159912, "reward_std": 0.06667684018611908, "rewards/accuracy_reward": 0.7952499985694885, "rewards/format_reward": 1.0, "step": 104 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.0625, "epoch": 0.0010600706713780918, "grad_norm": 4.750086710242227, "kl": 0.02294921875, "learning_rate": 9.999972272609998e-07, "loss": 0.0009, "reward": 1.9317500591278076, "reward_std": 0.09331730753183365, "rewards/accuracy_reward": 0.7505000233650208, "rewards/format_reward": 1.0, "step": 105 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 415.59375, "epoch": 0.0010701665825340737, "grad_norm": 5.7919323063532495, "kl": 0.018310546875, "learning_rate": 9.99997174195478e-07, "loss": 0.0007, "reward": 1.8676562309265137, "reward_std": 0.0781112015247345, "rewards/accuracy_reward": 0.6801562309265137, "rewards/format_reward": 1.0, "step": 106 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.0010802624936900554, "grad_norm": 1.8572219612501744, "kl": 0.021484375, "learning_rate": 9.999971206269673e-07, "loss": 0.0009, "reward": 1.9606250524520874, "reward_std": 0.0960540622472763, "rewards/accuracy_reward": 0.7793749570846558, "rewards/format_reward": 1.0, "step": 107 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 429.3125, "epoch": 0.0010903584048460374, "grad_norm": 0.8830955958016244, "kl": 0.01171875, "learning_rate": 9.99997066555468e-07, "loss": 0.0005, "reward": 1.5191874504089355, "reward_std": 0.02343750186264515, "rewards/accuracy_reward": 0.4191874861717224, "rewards/format_reward": 1.0, "step": 108 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.625, "epoch": 0.001100454316002019, "grad_norm": 2.928631907100439, "kl": 0.0198974609375, "learning_rate": 9.999970119809798e-07, "loss": 0.0008, "reward": 1.7919375896453857, "reward_std": 0.14680925011634827, "rewards/accuracy_reward": 0.6419374942779541, "rewards/format_reward": 1.0, "step": 109 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.0625, "epoch": 0.001110550227158001, "grad_norm": 2.0717329434632337, "kl": 0.02392578125, "learning_rate": 9.99996956903503e-07, "loss": 0.001, "reward": 1.9897812604904175, "reward_std": 0.06460793316364288, "rewards/accuracy_reward": 0.7960312366485596, "rewards/format_reward": 1.0, "step": 110 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.53125, "epoch": 0.0011206461383139828, "grad_norm": 1.6041310566199016, "kl": 0.02001953125, "learning_rate": 9.999969013230377e-07, "loss": 0.0008, "reward": 2.1037187576293945, "reward_std": 0.054493438452482224, "rewards/accuracy_reward": 0.9224687218666077, "rewards/format_reward": 1.0, "step": 111 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.28125, "epoch": 0.0011307420494699647, "grad_norm": 1.2575335328324215, "kl": 0.017578125, "learning_rate": 9.999968452395837e-07, "loss": 0.0007, "reward": 1.7625000476837158, "reward_std": 0.035355344414711, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 112 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.59375, "epoch": 0.0011408379606259464, "grad_norm": 2.2260612729045723, "kl": 0.024658203125, "learning_rate": 9.999967886531413e-07, "loss": 0.001, "reward": 2.0221874713897705, "reward_std": 0.11563746631145477, "rewards/accuracy_reward": 0.8409375548362732, "rewards/format_reward": 1.0, "step": 113 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 390.625, "epoch": 0.0011509338717819284, "grad_norm": 2.6830424114387754, "kl": 0.02294921875, "learning_rate": 9.999967315637107e-07, "loss": 0.0009, "reward": 1.666062593460083, "reward_std": 0.07836806774139404, "rewards/accuracy_reward": 0.5160624980926514, "rewards/format_reward": 1.0, "step": 114 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.00116102978293791, "grad_norm": 3.039618731755611, "kl": 0.022216796875, "learning_rate": 9.999966739712913e-07, "loss": 0.0009, "reward": 1.9405624866485596, "reward_std": 0.07479392737150192, "rewards/accuracy_reward": 0.7468124628067017, "rewards/format_reward": 1.0, "step": 115 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.4375, "epoch": 0.001171125694093892, "grad_norm": 3.1166048569057176, "kl": 0.0228271484375, "learning_rate": 9.999966158758836e-07, "loss": 0.0009, "reward": 1.9226875305175781, "reward_std": 0.08160136640071869, "rewards/accuracy_reward": 0.7289375066757202, "rewards/format_reward": 1.0, "step": 116 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.125, "epoch": 0.0011812216052498738, "grad_norm": 2.3148067108387465, "kl": 0.0252685546875, "learning_rate": 9.999965572774878e-07, "loss": 0.001, "reward": 1.9320625066757202, "reward_std": 0.09939878433942795, "rewards/accuracy_reward": 0.7445625066757202, "rewards/format_reward": 1.0, "step": 117 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.40625, "epoch": 0.0011913175164058557, "grad_norm": 2.094479057188616, "kl": 0.025634765625, "learning_rate": 9.999964981761038e-07, "loss": 0.001, "reward": 2.0046563148498535, "reward_std": 0.0608849823474884, "rewards/accuracy_reward": 0.8046561479568481, "rewards/format_reward": 1.0, "step": 118 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.25, "epoch": 0.0012014134275618374, "grad_norm": 1.9775966590518428, "kl": 0.0250244140625, "learning_rate": 9.999964385717317e-07, "loss": 0.001, "reward": 2.1077189445495605, "reward_std": 0.044057123363018036, "rewards/accuracy_reward": 0.9077187776565552, "rewards/format_reward": 1.0, "step": 119 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.6875, "epoch": 0.0012115093387178194, "grad_norm": 2.0244198357223304, "kl": 0.0238037109375, "learning_rate": 9.999963784643714e-07, "loss": 0.001, "reward": 2.023937702178955, "reward_std": 0.16357968747615814, "rewards/accuracy_reward": 0.8301874995231628, "rewards/format_reward": 1.0, "step": 120 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 428.625, "epoch": 0.001221605249873801, "grad_norm": 1.64744650783282, "kl": 0.021484375, "learning_rate": 9.999963178540231e-07, "loss": 0.0009, "reward": 2.008000135421753, "reward_std": 0.21919485926628113, "rewards/accuracy_reward": 0.8267499804496765, "rewards/format_reward": 1.0, "step": 121 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.53125, "epoch": 0.001231701161029783, "grad_norm": 1.7419547861250744, "kl": 0.0164794921875, "learning_rate": 9.999962567406868e-07, "loss": 0.0007, "reward": 1.7980000972747803, "reward_std": 0.31691592931747437, "rewards/accuracy_reward": 0.6667499542236328, "rewards/format_reward": 1.0, "step": 122 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 407.9375, "epoch": 0.0012417970721857647, "grad_norm": 1.504945498769008, "kl": 0.0157470703125, "learning_rate": 9.999961951243627e-07, "loss": 0.0006, "reward": 1.8472813367843628, "reward_std": 0.04272625595331192, "rewards/accuracy_reward": 0.7035312652587891, "rewards/format_reward": 1.0, "step": 123 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.84375, "epoch": 0.0012518929833417467, "grad_norm": 2.19518744793812, "kl": 0.02099609375, "learning_rate": 9.999961330050509e-07, "loss": 0.0008, "reward": 1.8815938234329224, "reward_std": 0.21747922897338867, "rewards/accuracy_reward": 0.7190937399864197, "rewards/format_reward": 1.0, "step": 124 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.8125, "epoch": 0.0012619888944977284, "grad_norm": 2.2199861956930396, "kl": 0.020751953125, "learning_rate": 9.99996070382751e-07, "loss": 0.0008, "reward": 1.975250005722046, "reward_std": 0.18760459125041962, "rewards/accuracy_reward": 0.8002500534057617, "rewards/format_reward": 1.0, "step": 125 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.78125, "epoch": 0.0012720848056537103, "grad_norm": 1.8764958481362592, "kl": 0.0272216796875, "learning_rate": 9.999960072574635e-07, "loss": 0.0011, "reward": 1.9010000228881836, "reward_std": 0.17675957083702087, "rewards/accuracy_reward": 0.7135000228881836, "rewards/format_reward": 1.0, "step": 126 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.65625, "epoch": 0.001282180716809692, "grad_norm": 1.859169215958819, "kl": 0.0242919921875, "learning_rate": 9.999959436291883e-07, "loss": 0.001, "reward": 1.9707499742507935, "reward_std": 0.08213190734386444, "rewards/accuracy_reward": 0.7770000100135803, "rewards/format_reward": 1.0, "step": 127 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.21875, "epoch": 0.001292276627965674, "grad_norm": 1.5500079627244439, "kl": 0.0238037109375, "learning_rate": 9.999958794979257e-07, "loss": 0.001, "reward": 1.8166874647140503, "reward_std": 0.056258976459503174, "rewards/accuracy_reward": 0.6666874885559082, "rewards/format_reward": 1.0, "step": 128 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.5625, "epoch": 0.0013023725391216557, "grad_norm": 2.301850030162296, "kl": 0.0230712890625, "learning_rate": 9.999958148636755e-07, "loss": 0.0009, "reward": 1.9743125438690186, "reward_std": 0.22149574756622314, "rewards/accuracy_reward": 0.7993125319480896, "rewards/format_reward": 1.0, "step": 129 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.9375, "epoch": 0.0013124684502776375, "grad_norm": 8.0086302732827, "kl": 0.0255126953125, "learning_rate": 9.999957497264378e-07, "loss": 0.001, "reward": 2.06068754196167, "reward_std": 0.07701534032821655, "rewards/accuracy_reward": 0.8731874823570251, "rewards/format_reward": 1.0, "step": 130 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 431.5625, "epoch": 0.0013225643614336194, "grad_norm": 2.199670937362219, "kl": 0.026123046875, "learning_rate": 9.999956840862127e-07, "loss": 0.001, "reward": 1.8201563358306885, "reward_std": 0.061138998717069626, "rewards/accuracy_reward": 0.6764062643051147, "rewards/format_reward": 1.0, "step": 131 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.40625, "epoch": 0.0013326602725896011, "grad_norm": 1.6249723038640274, "kl": 0.027099609375, "learning_rate": 9.999956179430003e-07, "loss": 0.0011, "reward": 1.7099062204360962, "reward_std": 0.06283078342676163, "rewards/accuracy_reward": 0.566156268119812, "rewards/format_reward": 1.0, "step": 132 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.001342756183745583, "grad_norm": 2.8326447581381387, "kl": 0.0279541015625, "learning_rate": 9.999955512968006e-07, "loss": 0.0011, "reward": 1.968656301498413, "reward_std": 0.10469064116477966, "rewards/accuracy_reward": 0.7749062180519104, "rewards/format_reward": 1.0, "step": 133 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.125, "epoch": 0.0013528520949015648, "grad_norm": 2.2129945772708037, "kl": 0.02392578125, "learning_rate": 9.999954841476138e-07, "loss": 0.001, "reward": 1.7151250839233398, "reward_std": 0.16833430528640747, "rewards/accuracy_reward": 0.5776249766349792, "rewards/format_reward": 1.0, "step": 134 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.78125, "epoch": 0.0013629480060575467, "grad_norm": 1.8715249721087774, "kl": 0.025390625, "learning_rate": 9.999954164954398e-07, "loss": 0.001, "reward": 1.9728751182556152, "reward_std": 0.14662247896194458, "rewards/accuracy_reward": 0.7853749990463257, "rewards/format_reward": 1.0, "step": 135 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.875, "epoch": 0.0013730439172135284, "grad_norm": 2.1611689644250633, "kl": 0.0299072265625, "learning_rate": 9.999953483402788e-07, "loss": 0.0012, "reward": 1.9200937747955322, "reward_std": 0.09479568898677826, "rewards/accuracy_reward": 0.7325937747955322, "rewards/format_reward": 1.0, "step": 136 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 424.09375, "epoch": 0.0013831398283695104, "grad_norm": 1.5274332735126275, "kl": 0.0201416015625, "learning_rate": 9.999952796821308e-07, "loss": 0.0008, "reward": 1.4781875610351562, "reward_std": 0.04334306716918945, "rewards/accuracy_reward": 0.37818750739097595, "rewards/format_reward": 1.0, "step": 137 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.3125, "epoch": 0.001393235739525492, "grad_norm": 2.504669618411833, "kl": 0.024658203125, "learning_rate": 9.999952105209957e-07, "loss": 0.001, "reward": 2.0244998931884766, "reward_std": 0.1495480239391327, "rewards/accuracy_reward": 0.8307499885559082, "rewards/format_reward": 1.0, "step": 138 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.6875, "epoch": 0.001403331650681474, "grad_norm": 24.402602642479096, "kl": 0.0262451171875, "learning_rate": 9.99995140856874e-07, "loss": 0.0011, "reward": 2.0547187328338623, "reward_std": 0.11395949870347977, "rewards/accuracy_reward": 0.854718804359436, "rewards/format_reward": 1.0, "step": 139 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.0014134275618374558, "grad_norm": 2.2599748674090536, "kl": 0.0284423828125, "learning_rate": 9.999950706897655e-07, "loss": 0.0011, "reward": 1.9945313930511475, "reward_std": 0.10190100967884064, "rewards/accuracy_reward": 0.7945312857627869, "rewards/format_reward": 1.0, "step": 140 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.0014235234729934377, "grad_norm": 1.7063166876746236, "kl": 0.0255126953125, "learning_rate": 9.999950000196703e-07, "loss": 0.001, "reward": 2.0680625438690186, "reward_std": 0.04601778835058212, "rewards/accuracy_reward": 0.8743125200271606, "rewards/format_reward": 1.0, "step": 141 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.65625, "epoch": 0.0014336193841494194, "grad_norm": 2.964195717222086, "kl": 0.025634765625, "learning_rate": 9.999949288465882e-07, "loss": 0.001, "reward": 1.9351249933242798, "reward_std": 0.09535897523164749, "rewards/accuracy_reward": 0.7351250052452087, "rewards/format_reward": 1.0, "step": 142 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.84375, "epoch": 0.0014437152953054014, "grad_norm": 2.123525821888702, "kl": 0.02685546875, "learning_rate": 9.999948571705198e-07, "loss": 0.0011, "reward": 2.038249969482422, "reward_std": 0.08730645477771759, "rewards/accuracy_reward": 0.8570000529289246, "rewards/format_reward": 1.0, "step": 143 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.71875, "epoch": 0.001453811206461383, "grad_norm": 1.8983601296972548, "kl": 0.0244140625, "learning_rate": 9.999947849914649e-07, "loss": 0.001, "reward": 2.093625068664551, "reward_std": 0.05947788804769516, "rewards/accuracy_reward": 0.893625020980835, "rewards/format_reward": 1.0, "step": 144 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 417.125, "epoch": 0.001463907117617365, "grad_norm": 2.034257976644004, "kl": 0.02490234375, "learning_rate": 9.999947123094235e-07, "loss": 0.001, "reward": 1.6932811737060547, "reward_std": 0.06595882773399353, "rewards/accuracy_reward": 0.5495312213897705, "rewards/format_reward": 1.0, "step": 145 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.25, "epoch": 0.0014740030287733468, "grad_norm": 1.8433113136242851, "kl": 0.0240478515625, "learning_rate": 9.999946391243956e-07, "loss": 0.001, "reward": 2.1123437881469727, "reward_std": 0.04923524707555771, "rewards/accuracy_reward": 0.9185937643051147, "rewards/format_reward": 1.0, "step": 146 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.90625, "epoch": 0.0014840989399293287, "grad_norm": 1.7087131123108459, "kl": 0.0203857421875, "learning_rate": 9.999945654363817e-07, "loss": 0.0008, "reward": 2.1578125953674316, "reward_std": 0.04723595455288887, "rewards/accuracy_reward": 0.9703124761581421, "rewards/format_reward": 1.0, "step": 147 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.46875, "epoch": 0.0014941948510853104, "grad_norm": 1.984039108669536, "kl": 0.027587890625, "learning_rate": 9.999944912453814e-07, "loss": 0.0011, "reward": 2.056906223297119, "reward_std": 0.06921645998954773, "rewards/accuracy_reward": 0.863156259059906, "rewards/format_reward": 1.0, "step": 148 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 419.53125, "epoch": 0.0015042907622412924, "grad_norm": 1.5644967405449237, "kl": 0.0216064453125, "learning_rate": 9.999944165513952e-07, "loss": 0.0009, "reward": 1.7611563205718994, "reward_std": 0.07086822390556335, "rewards/accuracy_reward": 0.6236562728881836, "rewards/format_reward": 1.0, "step": 149 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.1875, "epoch": 0.001514386673397274, "grad_norm": 1.8958249038778525, "kl": 0.0225830078125, "learning_rate": 9.999943413544228e-07, "loss": 0.0009, "reward": 2.164875030517578, "reward_std": 0.06488104164600372, "rewards/accuracy_reward": 0.9773750305175781, "rewards/format_reward": 1.0, "step": 150 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 426.125, "epoch": 0.001524482584553256, "grad_norm": 1.9746516232209996, "kl": 0.031982421875, "learning_rate": 9.999942656544645e-07, "loss": 0.0013, "reward": 1.987874984741211, "reward_std": 0.09698949754238129, "rewards/accuracy_reward": 0.8066250085830688, "rewards/format_reward": 1.0, "step": 151 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 432.5, "epoch": 0.0015345784957092377, "grad_norm": 1.7748929201996406, "kl": 0.01806640625, "learning_rate": 9.999941894515202e-07, "loss": 0.0007, "reward": 1.6029688119888306, "reward_std": 0.34945768117904663, "rewards/accuracy_reward": 0.5029687285423279, "rewards/format_reward": 1.0, "step": 152 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 388.0, "epoch": 0.0015446744068652197, "grad_norm": 2.234218996796586, "kl": 0.030517578125, "learning_rate": 9.999941127455901e-07, "loss": 0.0012, "reward": 1.9505937099456787, "reward_std": 0.07132446020841599, "rewards/accuracy_reward": 0.7568437457084656, "rewards/format_reward": 1.0, "step": 153 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 403.03125, "epoch": 0.0015547703180212014, "grad_norm": 1.2221115726747818, "kl": 0.0205078125, "learning_rate": 9.999940355366744e-07, "loss": 0.0008, "reward": 1.5637500286102295, "reward_std": 0.026553863659501076, "rewards/accuracy_reward": 0.4637499749660492, "rewards/format_reward": 1.0, "step": 154 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 447.71875, "epoch": 0.0015648662291771831, "grad_norm": 2.173276489710756, "kl": 0.02099609375, "learning_rate": 9.999939578247727e-07, "loss": 0.0008, "reward": 1.7375624179840088, "reward_std": 0.19605118036270142, "rewards/accuracy_reward": 0.6125624775886536, "rewards/format_reward": 1.0, "step": 155 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.5, "epoch": 0.001574962140333165, "grad_norm": 2.8650105338759895, "kl": 0.0284423828125, "learning_rate": 9.999938796098856e-07, "loss": 0.0011, "reward": 2.0414376258850098, "reward_std": 0.08801525831222534, "rewards/accuracy_reward": 0.8601875305175781, "rewards/format_reward": 1.0, "step": 156 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.46875, "epoch": 0.0015850580514891468, "grad_norm": 2.721660298974271, "kl": 0.034423828125, "learning_rate": 9.99993800892013e-07, "loss": 0.0014, "reward": 2.0940937995910645, "reward_std": 0.07940727472305298, "rewards/accuracy_reward": 0.9065937399864197, "rewards/format_reward": 1.0, "step": 157 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 399.5625, "epoch": 0.0015951539626451287, "grad_norm": 1.7514200235995971, "kl": 0.018310546875, "learning_rate": 9.999937216711552e-07, "loss": 0.0007, "reward": 1.7745938301086426, "reward_std": 0.26367875933647156, "rewards/accuracy_reward": 0.6495937705039978, "rewards/format_reward": 1.0, "step": 158 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.53125, "epoch": 0.0016052498738011105, "grad_norm": 2.147340943874515, "kl": 0.026123046875, "learning_rate": 9.999936419473118e-07, "loss": 0.001, "reward": 2.0520312786102295, "reward_std": 0.16783763468265533, "rewards/accuracy_reward": 0.8707811832427979, "rewards/format_reward": 1.0, "step": 159 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.0016153457849570924, "grad_norm": 2.3358326993408545, "kl": 0.02587890625, "learning_rate": 9.99993561720483e-07, "loss": 0.001, "reward": 2.0439376831054688, "reward_std": 0.0782424584031105, "rewards/accuracy_reward": 0.8626874685287476, "rewards/format_reward": 1.0, "step": 160 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 435.8125, "epoch": 0.0016254416961130741, "grad_norm": 2.000648863141328, "kl": 0.030029296875, "learning_rate": 9.999934809906692e-07, "loss": 0.0012, "reward": 1.7680624723434448, "reward_std": 0.0797620564699173, "rewards/accuracy_reward": 0.6243124604225159, "rewards/format_reward": 1.0, "step": 161 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.59375, "epoch": 0.001635537607269056, "grad_norm": 1.9546213448902698, "kl": 0.031494140625, "learning_rate": 9.999933997578703e-07, "loss": 0.0013, "reward": 2.016031265258789, "reward_std": 0.08243782818317413, "rewards/accuracy_reward": 0.8160312175750732, "rewards/format_reward": 1.0, "step": 162 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 435.0625, "epoch": 0.0016456335184250378, "grad_norm": 3.4429907635733774, "kl": 0.03466796875, "learning_rate": 9.999933180220862e-07, "loss": 0.0014, "reward": 2.029843807220459, "reward_std": 0.06949349492788315, "rewards/accuracy_reward": 0.8423437476158142, "rewards/format_reward": 1.0, "step": 163 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 441.59375, "epoch": 0.0016557294295810197, "grad_norm": 2.0154570750477716, "kl": 0.031005859375, "learning_rate": 9.999932357833173e-07, "loss": 0.0012, "reward": 1.8273125886917114, "reward_std": 0.2064659148454666, "rewards/accuracy_reward": 0.6648125052452087, "rewards/format_reward": 1.0, "step": 164 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.3125, "epoch": 0.0016658253407370014, "grad_norm": 2.025094351436452, "kl": 0.035400390625, "learning_rate": 9.999931530415637e-07, "loss": 0.0014, "reward": 1.9912188053131104, "reward_std": 0.12570369243621826, "rewards/accuracy_reward": 0.8037186861038208, "rewards/format_reward": 1.0, "step": 165 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 432.53125, "epoch": 0.0016759212518929834, "grad_norm": 2.424977260994002, "kl": 0.0267333984375, "learning_rate": 9.99993069796825e-07, "loss": 0.0011, "reward": 1.9264376163482666, "reward_std": 0.15449532866477966, "rewards/accuracy_reward": 0.7326875329017639, "rewards/format_reward": 1.0, "step": 166 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.4375, "epoch": 0.0016860171630489651, "grad_norm": 2.14330858374347, "kl": 0.0302734375, "learning_rate": 9.999929860491018e-07, "loss": 0.0012, "reward": 2.0904998779296875, "reward_std": 0.054400667548179626, "rewards/accuracy_reward": 0.8904999494552612, "rewards/format_reward": 1.0, "step": 167 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.4375, "epoch": 0.001696113074204947, "grad_norm": 1.2343600599309938, "kl": 0.0216064453125, "learning_rate": 9.99992901798394e-07, "loss": 0.0009, "reward": 2.13253116607666, "reward_std": 0.04429956525564194, "rewards/accuracy_reward": 0.9325311779975891, "rewards/format_reward": 1.0, "step": 168 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.65625, "epoch": 0.0017062089853609288, "grad_norm": 1.9418652809059123, "kl": 0.026123046875, "learning_rate": 9.999928170447016e-07, "loss": 0.001, "reward": 2.032437562942505, "reward_std": 0.12841936945915222, "rewards/accuracy_reward": 0.8449375033378601, "rewards/format_reward": 1.0, "step": 169 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.1875, "epoch": 0.0017163048965169107, "grad_norm": 2.108961027918085, "kl": 0.030517578125, "learning_rate": 9.999927317880248e-07, "loss": 0.0012, "reward": 2.072499990463257, "reward_std": 0.06516535580158234, "rewards/accuracy_reward": 0.8787499666213989, "rewards/format_reward": 1.0, "step": 170 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.9375, "epoch": 0.0017264008076728924, "grad_norm": 1.9522829070603067, "kl": 0.0283203125, "learning_rate": 9.999926460283637e-07, "loss": 0.0011, "reward": 2.0438437461853027, "reward_std": 0.07482236623764038, "rewards/accuracy_reward": 0.8500937223434448, "rewards/format_reward": 1.0, "step": 171 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.375, "epoch": 0.0017364967188288744, "grad_norm": 2.0492111324583973, "kl": 0.0294189453125, "learning_rate": 9.999925597657182e-07, "loss": 0.0012, "reward": 1.929531216621399, "reward_std": 0.05091296136379242, "rewards/accuracy_reward": 0.7295312881469727, "rewards/format_reward": 1.0, "step": 172 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 417.78125, "epoch": 0.001746592629984856, "grad_norm": 1.9809181490802987, "kl": 0.031982421875, "learning_rate": 9.999924730000886e-07, "loss": 0.0013, "reward": 1.861375093460083, "reward_std": 0.04932747036218643, "rewards/accuracy_reward": 0.6613749861717224, "rewards/format_reward": 1.0, "step": 173 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.6875, "epoch": 0.001756688541140838, "grad_norm": 2.6318042971602895, "kl": 0.031494140625, "learning_rate": 9.99992385731475e-07, "loss": 0.0013, "reward": 2.0784687995910645, "reward_std": 0.07872430980205536, "rewards/accuracy_reward": 0.8784687519073486, "rewards/format_reward": 1.0, "step": 174 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.40625, "epoch": 0.0017667844522968198, "grad_norm": 1.6419277918412174, "kl": 0.030029296875, "learning_rate": 9.99992297959877e-07, "loss": 0.0012, "reward": 1.9790937900543213, "reward_std": 0.05698253586888313, "rewards/accuracy_reward": 0.7790937423706055, "rewards/format_reward": 1.0, "step": 175 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.78125, "epoch": 0.0017768803634528017, "grad_norm": 2.392693219738747, "kl": 0.0302734375, "learning_rate": 9.999922096852956e-07, "loss": 0.0012, "reward": 2.0199062824249268, "reward_std": 0.10041315853595734, "rewards/accuracy_reward": 0.8199062347412109, "rewards/format_reward": 1.0, "step": 176 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.09375, "epoch": 0.0017869762746087834, "grad_norm": 1.936441985298189, "kl": 0.0235595703125, "learning_rate": 9.9999212090773e-07, "loss": 0.0009, "reward": 1.954281210899353, "reward_std": 0.19575932621955872, "rewards/accuracy_reward": 0.7855312824249268, "rewards/format_reward": 1.0, "step": 177 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.65625, "epoch": 0.0017970721857647654, "grad_norm": 1.339328880413479, "kl": 0.01904296875, "learning_rate": 9.999920316271808e-07, "loss": 0.0008, "reward": 1.61537504196167, "reward_std": 0.0975017100572586, "rewards/accuracy_reward": 0.515375018119812, "rewards/format_reward": 1.0, "step": 178 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 0.001807168096920747, "grad_norm": 2.578684713021007, "kl": 0.0302734375, "learning_rate": 9.99991941843648e-07, "loss": 0.0012, "reward": 1.9268438816070557, "reward_std": 0.09284500777721405, "rewards/accuracy_reward": 0.745593786239624, "rewards/format_reward": 1.0, "step": 179 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.03125, "epoch": 0.001817264008076729, "grad_norm": 2.729186372878441, "kl": 0.029052734375, "learning_rate": 9.999918515571314e-07, "loss": 0.0012, "reward": 2.073499917984009, "reward_std": 0.07762030512094498, "rewards/accuracy_reward": 0.8860000371932983, "rewards/format_reward": 1.0, "step": 180 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.375, "epoch": 0.0018273599192327108, "grad_norm": 1.7596850258805745, "kl": 0.0223388671875, "learning_rate": 9.999917607676316e-07, "loss": 0.0009, "reward": 2.03543758392334, "reward_std": 0.2036915123462677, "rewards/accuracy_reward": 0.8604375123977661, "rewards/format_reward": 1.0, "step": 181 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 401.96875, "epoch": 0.0018374558303886925, "grad_norm": 10.03903790645343, "kl": 0.026611328125, "learning_rate": 9.999916694751483e-07, "loss": 0.0011, "reward": 1.6688437461853027, "reward_std": 0.09356577694416046, "rewards/accuracy_reward": 0.5250937342643738, "rewards/format_reward": 1.0, "step": 182 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.5, "epoch": 0.0018475517415446744, "grad_norm": 2.0298072582841784, "kl": 0.026123046875, "learning_rate": 9.999915776796817e-07, "loss": 0.001, "reward": 1.8437812328338623, "reward_std": 0.18945524096488953, "rewards/accuracy_reward": 0.6875312328338623, "rewards/format_reward": 1.0, "step": 183 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.0018576476527006561, "grad_norm": 3.202152446404935, "kl": 0.0218505859375, "learning_rate": 9.99991485381232e-07, "loss": 0.0009, "reward": 1.9200313091278076, "reward_std": 0.3508338928222656, "rewards/accuracy_reward": 0.7762812376022339, "rewards/format_reward": 1.0, "step": 184 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.28125, "epoch": 0.001867743563856638, "grad_norm": 1.3778726368115708, "kl": 0.029541015625, "learning_rate": 9.999913925797992e-07, "loss": 0.0012, "reward": 2.050374984741211, "reward_std": 0.04554280638694763, "rewards/accuracy_reward": 0.8566250205039978, "rewards/format_reward": 1.0, "step": 185 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.28125, "epoch": 0.0018778394750126198, "grad_norm": 1.721367540531098, "kl": 0.028076171875, "learning_rate": 9.999912992753834e-07, "loss": 0.0011, "reward": 2.1304688453674316, "reward_std": 0.03188595175743103, "rewards/accuracy_reward": 0.9367188215255737, "rewards/format_reward": 1.0, "step": 186 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.90625, "epoch": 0.0018879353861686017, "grad_norm": 2.002217221953438, "kl": 0.0263671875, "learning_rate": 9.999912054679844e-07, "loss": 0.0011, "reward": 2.145718812942505, "reward_std": 0.032003004103899, "rewards/accuracy_reward": 0.9519687294960022, "rewards/format_reward": 1.0, "step": 187 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 439.75, "epoch": 0.0018980312973245835, "grad_norm": 1.668975473070088, "kl": 0.0294189453125, "learning_rate": 9.99991111157603e-07, "loss": 0.0012, "reward": 2.0931875705718994, "reward_std": 0.05857175961136818, "rewards/accuracy_reward": 0.8994375467300415, "rewards/format_reward": 1.0, "step": 188 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.0, "epoch": 0.0019081272084805654, "grad_norm": 2.210839933931341, "kl": 0.0283203125, "learning_rate": 9.999910163442386e-07, "loss": 0.0011, "reward": 2.086937427520752, "reward_std": 0.06097465753555298, "rewards/accuracy_reward": 0.8931875228881836, "rewards/format_reward": 1.0, "step": 189 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.625, "epoch": 0.0019182231196365471, "grad_norm": 2.182491781559845, "kl": 0.0291748046875, "learning_rate": 9.999909210278917e-07, "loss": 0.0012, "reward": 2.107781410217285, "reward_std": 0.07593752443790436, "rewards/accuracy_reward": 0.9140312075614929, "rewards/format_reward": 1.0, "step": 190 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 425.5625, "epoch": 0.001928319030792529, "grad_norm": 3.756206707180806, "kl": 0.02099609375, "learning_rate": 9.999908252085622e-07, "loss": 0.0008, "reward": 1.9683752059936523, "reward_std": 0.2500483989715576, "rewards/accuracy_reward": 0.780875027179718, "rewards/format_reward": 1.0, "step": 191 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 408.4375, "epoch": 0.0019384149419485108, "grad_norm": 6.743561179017026, "kl": 0.0206298828125, "learning_rate": 9.999907288862501e-07, "loss": 0.0008, "reward": 1.6155624389648438, "reward_std": 0.16764017939567566, "rewards/accuracy_reward": 0.5030624866485596, "rewards/format_reward": 1.0, "step": 192 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.3125, "epoch": 0.0019485108531044927, "grad_norm": 3.3908423990090846, "kl": 0.0283203125, "learning_rate": 9.99990632060956e-07, "loss": 0.0011, "reward": 1.9591562747955322, "reward_std": 0.23536549508571625, "rewards/accuracy_reward": 0.8029062747955322, "rewards/format_reward": 1.0, "step": 193 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.9375, "epoch": 0.0019586067642604747, "grad_norm": 3.4459488297153515, "kl": 0.0274658203125, "learning_rate": 9.999905347326794e-07, "loss": 0.0011, "reward": 2.0729689598083496, "reward_std": 0.15846076607704163, "rewards/accuracy_reward": 0.8792186975479126, "rewards/format_reward": 1.0, "step": 194 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 379.03125, "epoch": 0.0019687026754164564, "grad_norm": 2.049277680950866, "kl": 0.0264892578125, "learning_rate": 9.99990436901421e-07, "loss": 0.0011, "reward": 1.7955626249313354, "reward_std": 0.1479376256465912, "rewards/accuracy_reward": 0.6580624580383301, "rewards/format_reward": 1.0, "step": 195 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 428.5, "epoch": 0.001978798586572438, "grad_norm": 3.1790802249419348, "kl": 0.02783203125, "learning_rate": 9.9999033856718e-07, "loss": 0.0011, "reward": 1.7789061069488525, "reward_std": 0.0444352813065052, "rewards/accuracy_reward": 0.62890625, "rewards/format_reward": 1.0, "step": 196 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.00198889449772842, "grad_norm": 1.6022100166387856, "kl": 0.0240478515625, "learning_rate": 9.999902397299573e-07, "loss": 0.001, "reward": 1.9172186851501465, "reward_std": 0.16982658207416534, "rewards/accuracy_reward": 0.7547187805175781, "rewards/format_reward": 1.0, "step": 197 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 378.28125, "epoch": 0.001998990408884402, "grad_norm": 2.1128238150582046, "kl": 0.0286865234375, "learning_rate": 9.99990140389753e-07, "loss": 0.0011, "reward": 1.8735625743865967, "reward_std": 0.22881178557872772, "rewards/accuracy_reward": 0.723562479019165, "rewards/format_reward": 1.0, "step": 198 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 399.375, "epoch": 0.0020090863200403837, "grad_norm": 3.597536126053849, "kl": 0.0238037109375, "learning_rate": 9.999900405465667e-07, "loss": 0.001, "reward": 1.6848750114440918, "reward_std": 0.20204579830169678, "rewards/accuracy_reward": 0.5723749995231628, "rewards/format_reward": 1.0, "step": 199 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.90625, "epoch": 0.0020191822311963654, "grad_norm": 2.2136387428946955, "kl": 0.0322265625, "learning_rate": 9.999899402003988e-07, "loss": 0.0013, "reward": 1.913062572479248, "reward_std": 0.06727355718612671, "rewards/accuracy_reward": 0.7193124890327454, "rewards/format_reward": 1.0, "step": 200 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 388.1875, "epoch": 0.002029278142352347, "grad_norm": 3.879867299091889, "kl": 0.0302734375, "learning_rate": 9.999898393512494e-07, "loss": 0.0012, "reward": 1.8548123836517334, "reward_std": 0.12193457782268524, "rewards/accuracy_reward": 0.692312479019165, "rewards/format_reward": 1.0, "step": 201 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.34375, "epoch": 0.0020393740535083293, "grad_norm": 2.453810706809344, "kl": 0.03076171875, "learning_rate": 9.999897379991184e-07, "loss": 0.0012, "reward": 2.059812545776367, "reward_std": 0.08740681409835815, "rewards/accuracy_reward": 0.8660624623298645, "rewards/format_reward": 1.0, "step": 202 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.875, "epoch": 0.002049469964664311, "grad_norm": 1.8108638221842672, "kl": 0.02734375, "learning_rate": 9.999896361440062e-07, "loss": 0.0011, "reward": 2.0493435859680176, "reward_std": 0.0851055160164833, "rewards/accuracy_reward": 0.8555936813354492, "rewards/format_reward": 1.0, "step": 203 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 399.28125, "epoch": 0.0020595658758202928, "grad_norm": 1.795540553846869, "kl": 0.0286865234375, "learning_rate": 9.999895337859127e-07, "loss": 0.0011, "reward": 1.748500108718872, "reward_std": 0.048254288733005524, "rewards/accuracy_reward": 0.6110000014305115, "rewards/format_reward": 1.0, "step": 204 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.4375, "epoch": 0.0020696617869762745, "grad_norm": 2.6314327920575367, "kl": 0.025146484375, "learning_rate": 9.999894309248382e-07, "loss": 0.001, "reward": 1.8729686737060547, "reward_std": 0.18494708836078644, "rewards/accuracy_reward": 0.7104687690734863, "rewards/format_reward": 1.0, "step": 205 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.5, "epoch": 0.0020797576981322567, "grad_norm": 1.8222649652650973, "kl": 0.0174560546875, "learning_rate": 9.999893275607825e-07, "loss": 0.0007, "reward": 1.6796250343322754, "reward_std": 0.14738979935646057, "rewards/accuracy_reward": 0.5733749866485596, "rewards/format_reward": 1.0, "step": 206 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.625, "epoch": 0.0020898536092882384, "grad_norm": 2.0817049363152926, "kl": 0.0269775390625, "learning_rate": 9.99989223693746e-07, "loss": 0.0011, "reward": 2.0380001068115234, "reward_std": 0.17520453035831451, "rewards/accuracy_reward": 0.8505000472068787, "rewards/format_reward": 1.0, "step": 207 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 409.625, "epoch": 0.00209994952044422, "grad_norm": 1.4893084328427784, "kl": 0.02734375, "learning_rate": 9.999891193237286e-07, "loss": 0.0011, "reward": 1.4294999837875366, "reward_std": 0.03190597891807556, "rewards/accuracy_reward": 0.3294999897480011, "rewards/format_reward": 1.0, "step": 208 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 389.625, "epoch": 0.002110045431600202, "grad_norm": 2.3620856832903168, "kl": 0.03466796875, "learning_rate": 9.999890144507305e-07, "loss": 0.0014, "reward": 1.9431874752044678, "reward_std": 0.08629113435745239, "rewards/accuracy_reward": 0.7556874752044678, "rewards/format_reward": 1.0, "step": 209 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 368.3125, "epoch": 0.0021201413427561835, "grad_norm": 2.781709813090354, "kl": 0.0341796875, "learning_rate": 9.999889090747518e-07, "loss": 0.0014, "reward": 2.069187641143799, "reward_std": 0.12701305747032166, "rewards/accuracy_reward": 0.9066874980926514, "rewards/format_reward": 1.0, "step": 210 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.5, "epoch": 0.0021302372539121657, "grad_norm": 1.8350792375719134, "kl": 0.025634765625, "learning_rate": 9.999888031957926e-07, "loss": 0.001, "reward": 1.675437569618225, "reward_std": 0.2785819172859192, "rewards/accuracy_reward": 0.5504375100135803, "rewards/format_reward": 1.0, "step": 211 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.78125, "epoch": 0.0021403331650681474, "grad_norm": 1.859060578235632, "kl": 0.037109375, "learning_rate": 9.999886968138528e-07, "loss": 0.0015, "reward": 2.0893750190734863, "reward_std": 0.04767885059118271, "rewards/accuracy_reward": 0.8956249952316284, "rewards/format_reward": 1.0, "step": 212 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 374.65625, "epoch": 0.002150429076224129, "grad_norm": 1.512834779792765, "kl": 0.0277099609375, "learning_rate": 9.99988589928933e-07, "loss": 0.0011, "reward": 1.8680938482284546, "reward_std": 0.046478789299726486, "rewards/accuracy_reward": 0.7243437767028809, "rewards/format_reward": 1.0, "step": 213 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 402.1875, "epoch": 0.002160524987380111, "grad_norm": 1.9735326550514942, "kl": 0.029052734375, "learning_rate": 9.999884825410329e-07, "loss": 0.0012, "reward": 1.8633438348770142, "reward_std": 0.2325250804424286, "rewards/accuracy_reward": 0.7070937156677246, "rewards/format_reward": 1.0, "step": 214 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.002170620898536093, "grad_norm": 2.0624136992823754, "kl": 0.03759765625, "learning_rate": 9.999883746501528e-07, "loss": 0.0015, "reward": 2.0174999237060547, "reward_std": 0.06433632969856262, "rewards/accuracy_reward": 0.8174999952316284, "rewards/format_reward": 1.0, "step": 215 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.9375, "epoch": 0.0021807168096920748, "grad_norm": 2.9888106949545823, "kl": 0.032958984375, "learning_rate": 9.999882662562924e-07, "loss": 0.0013, "reward": 2.0421249866485596, "reward_std": 0.08821793645620346, "rewards/accuracy_reward": 0.8546249866485596, "rewards/format_reward": 1.0, "step": 216 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.375, "epoch": 0.0021908127208480565, "grad_norm": 2.009848483000481, "kl": 0.02978515625, "learning_rate": 9.999881573594525e-07, "loss": 0.0012, "reward": 2.1356563568115234, "reward_std": 0.06135120242834091, "rewards/accuracy_reward": 0.9544062614440918, "rewards/format_reward": 1.0, "step": 217 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.1875, "epoch": 0.002200908632004038, "grad_norm": 1.8341157790330318, "kl": 0.031005859375, "learning_rate": 9.999880479596327e-07, "loss": 0.0012, "reward": 1.9353437423706055, "reward_std": 0.05515395104885101, "rewards/accuracy_reward": 0.7415937185287476, "rewards/format_reward": 1.0, "step": 218 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.3125, "epoch": 0.0022110045431600204, "grad_norm": 3.5796451341878175, "kl": 0.0240478515625, "learning_rate": 9.999879380568332e-07, "loss": 0.001, "reward": 2.136531352996826, "reward_std": 0.049960941076278687, "rewards/accuracy_reward": 0.9427812099456787, "rewards/format_reward": 1.0, "step": 219 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.5, "epoch": 0.002221100454316002, "grad_norm": 2.0112129010567843, "kl": 0.0311279296875, "learning_rate": 9.999878276510542e-07, "loss": 0.0012, "reward": 2.1342811584472656, "reward_std": 0.04901735112071037, "rewards/accuracy_reward": 0.9405312538146973, "rewards/format_reward": 1.0, "step": 220 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.75, "epoch": 0.002231196365471984, "grad_norm": 1.8659722338827582, "kl": 0.02197265625, "learning_rate": 9.99987716742296e-07, "loss": 0.0009, "reward": 2.022031307220459, "reward_std": 0.18786795437335968, "rewards/accuracy_reward": 0.8407812118530273, "rewards/format_reward": 1.0, "step": 221 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.0022412922766279655, "grad_norm": 2.0774016947363223, "kl": 0.032958984375, "learning_rate": 9.999876053305582e-07, "loss": 0.0013, "reward": 1.8058750629425049, "reward_std": 0.08998537063598633, "rewards/accuracy_reward": 0.6308749914169312, "rewards/format_reward": 1.0, "step": 222 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 401.5, "epoch": 0.0022513881877839477, "grad_norm": 2.326127474729588, "kl": 0.0238037109375, "learning_rate": 9.999874934158413e-07, "loss": 0.001, "reward": 1.8050000667572021, "reward_std": 0.08354546129703522, "rewards/accuracy_reward": 0.6800000071525574, "rewards/format_reward": 1.0, "step": 223 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.8125, "epoch": 0.0022614840989399294, "grad_norm": 2.1173649170525497, "kl": 0.03369140625, "learning_rate": 9.999873809981452e-07, "loss": 0.0013, "reward": 1.9991874694824219, "reward_std": 0.06515098363161087, "rewards/accuracy_reward": 0.7991874814033508, "rewards/format_reward": 1.0, "step": 224 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.002271580010095911, "grad_norm": 2.7751259153123007, "kl": 0.033447265625, "learning_rate": 9.999872680774704e-07, "loss": 0.0013, "reward": 1.9655935764312744, "reward_std": 0.05174662545323372, "rewards/accuracy_reward": 0.7718437314033508, "rewards/format_reward": 1.0, "step": 225 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.28125, "epoch": 0.002281675921251893, "grad_norm": 2.391192779808617, "kl": 0.031005859375, "learning_rate": 9.999871546538166e-07, "loss": 0.0012, "reward": 2.044968605041504, "reward_std": 0.049869354814291, "rewards/accuracy_reward": 0.8449687957763672, "rewards/format_reward": 1.0, "step": 226 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.46875, "epoch": 0.002291771832407875, "grad_norm": 3.2748844580285352, "kl": 0.031494140625, "learning_rate": 9.999870407271839e-07, "loss": 0.0013, "reward": 2.0454063415527344, "reward_std": 0.07709353417158127, "rewards/accuracy_reward": 0.8516561985015869, "rewards/format_reward": 1.0, "step": 227 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 436.53125, "epoch": 0.0023018677435638567, "grad_norm": 2.06759590217931, "kl": 0.0291748046875, "learning_rate": 9.999869262975728e-07, "loss": 0.0012, "reward": 2.117968797683716, "reward_std": 0.06253287196159363, "rewards/accuracy_reward": 0.9242187738418579, "rewards/format_reward": 1.0, "step": 228 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 432.5625, "epoch": 0.0023119636547198385, "grad_norm": 2.18736465583612, "kl": 0.0306396484375, "learning_rate": 9.99986811364983e-07, "loss": 0.0012, "reward": 1.9782501459121704, "reward_std": 0.07546444237232208, "rewards/accuracy_reward": 0.7907500267028809, "rewards/format_reward": 1.0, "step": 229 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 451.34375, "epoch": 0.00232205956587582, "grad_norm": 2.178838080290174, "kl": 0.03173828125, "learning_rate": 9.99986695929415e-07, "loss": 0.0013, "reward": 2.0071563720703125, "reward_std": 0.06693597882986069, "rewards/accuracy_reward": 0.819656252861023, "rewards/format_reward": 1.0, "step": 230 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 427.46875, "epoch": 0.0023321554770318023, "grad_norm": 2.1672818275062906, "kl": 0.0341796875, "learning_rate": 9.999865799908685e-07, "loss": 0.0014, "reward": 1.8295937776565552, "reward_std": 0.07344529777765274, "rewards/accuracy_reward": 0.6358437538146973, "rewards/format_reward": 1.0, "step": 231 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.9375, "epoch": 0.002342251388187784, "grad_norm": 3.7004142979119563, "kl": 0.0289306640625, "learning_rate": 9.99986463549344e-07, "loss": 0.0012, "reward": 1.8362188339233398, "reward_std": 0.11675569415092468, "rewards/accuracy_reward": 0.6862187385559082, "rewards/format_reward": 1.0, "step": 232 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 446.78125, "epoch": 0.0023523472993437658, "grad_norm": 2.068853249951569, "kl": 0.0264892578125, "learning_rate": 9.999863466048414e-07, "loss": 0.0011, "reward": 1.996593713760376, "reward_std": 0.11324161291122437, "rewards/accuracy_reward": 0.8153437376022339, "rewards/format_reward": 1.0, "step": 233 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 437.6875, "epoch": 0.0023624432104997475, "grad_norm": 1.6425758207755907, "kl": 0.0252685546875, "learning_rate": 9.999862291573608e-07, "loss": 0.001, "reward": 1.8139688968658447, "reward_std": 0.1532040238380432, "rewards/accuracy_reward": 0.6827187538146973, "rewards/format_reward": 1.0, "step": 234 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 414.375, "epoch": 0.0023725391216557292, "grad_norm": 1.8632354187765083, "kl": 0.0260009765625, "learning_rate": 9.999861112069025e-07, "loss": 0.001, "reward": 1.5503437519073486, "reward_std": 0.11274170875549316, "rewards/accuracy_reward": 0.4503437280654907, "rewards/format_reward": 1.0, "step": 235 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.375, "epoch": 0.0023826350328117114, "grad_norm": 2.073545535139145, "kl": 0.03369140625, "learning_rate": 9.999859927534665e-07, "loss": 0.0013, "reward": 2.0657501220703125, "reward_std": 0.06220652535557747, "rewards/accuracy_reward": 0.8657500147819519, "rewards/format_reward": 1.0, "step": 236 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 439.4375, "epoch": 0.002392730943967693, "grad_norm": 1.3981503565527378, "kl": 0.0281982421875, "learning_rate": 9.999858737970528e-07, "loss": 0.0011, "reward": 2.0944063663482666, "reward_std": 0.042489178478717804, "rewards/accuracy_reward": 0.906906247138977, "rewards/format_reward": 1.0, "step": 237 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 446.96875, "epoch": 0.002402826855123675, "grad_norm": 1.7258229337565794, "kl": 0.026611328125, "learning_rate": 9.999857543376618e-07, "loss": 0.0011, "reward": 1.5747811794281006, "reward_std": 0.14079999923706055, "rewards/accuracy_reward": 0.48103126883506775, "rewards/format_reward": 1.0, "step": 238 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 451.25, "epoch": 0.0024129227662796566, "grad_norm": 1.95499584755001, "kl": 0.029541015625, "learning_rate": 9.999856343752933e-07, "loss": 0.0012, "reward": 1.7944061756134033, "reward_std": 0.03468593209981918, "rewards/accuracy_reward": 0.6506562829017639, "rewards/format_reward": 1.0, "step": 239 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 437.15625, "epoch": 0.0024230186774356387, "grad_norm": 1.8738553529888657, "kl": 0.03271484375, "learning_rate": 9.999855139099478e-07, "loss": 0.0013, "reward": 1.7656874656677246, "reward_std": 0.03981117904186249, "rewards/accuracy_reward": 0.6156874895095825, "rewards/format_reward": 1.0, "step": 240 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.21875, "epoch": 0.0024331145885916204, "grad_norm": 1.4453803853048037, "kl": 0.0302734375, "learning_rate": 9.99985392941625e-07, "loss": 0.0012, "reward": 1.7964999675750732, "reward_std": 0.014501243829727173, "rewards/accuracy_reward": 0.6464999914169312, "rewards/format_reward": 1.0, "step": 241 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 449.75, "epoch": 0.002443210499747602, "grad_norm": 1.7065724778583504, "kl": 0.0274658203125, "learning_rate": 9.999852714703254e-07, "loss": 0.0011, "reward": 1.8411874771118164, "reward_std": 0.048418715596199036, "rewards/accuracy_reward": 0.6974375247955322, "rewards/format_reward": 1.0, "step": 242 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 428.625, "epoch": 0.002453306410903584, "grad_norm": 2.2892396121119982, "kl": 0.03271484375, "learning_rate": 9.99985149496049e-07, "loss": 0.0013, "reward": 2.012718677520752, "reward_std": 0.05762708932161331, "rewards/accuracy_reward": 0.825218677520752, "rewards/format_reward": 1.0, "step": 243 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.46875, "epoch": 0.002463402322059566, "grad_norm": 2.2284445369000334, "kl": 0.0341796875, "learning_rate": 9.999850270187954e-07, "loss": 0.0014, "reward": 2.0610313415527344, "reward_std": 0.04507733881473541, "rewards/accuracy_reward": 0.8672812581062317, "rewards/format_reward": 1.0, "step": 244 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 439.9375, "epoch": 0.0024734982332155478, "grad_norm": 2.345177063706009, "kl": 0.032958984375, "learning_rate": 9.999849040385657e-07, "loss": 0.0013, "reward": 2.0174999237060547, "reward_std": 0.06579624116420746, "rewards/accuracy_reward": 0.8237500190734863, "rewards/format_reward": 1.0, "step": 245 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.8125, "epoch": 0.0024835941443715295, "grad_norm": 3.265410625745428, "kl": 0.033935546875, "learning_rate": 9.999847805553595e-07, "loss": 0.0014, "reward": 2.108062505722046, "reward_std": 0.03447720780968666, "rewards/accuracy_reward": 0.9080625176429749, "rewards/format_reward": 1.0, "step": 246 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.46875, "epoch": 0.002493690055527511, "grad_norm": 1.83310054260233, "kl": 0.032470703125, "learning_rate": 9.999846565691767e-07, "loss": 0.0013, "reward": 1.7603750228881836, "reward_std": 0.062065351754426956, "rewards/accuracy_reward": 0.6166250109672546, "rewards/format_reward": 1.0, "step": 247 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.96875, "epoch": 0.0025037859666834934, "grad_norm": 2.1202340492436806, "kl": 0.0303955078125, "learning_rate": 9.999845320800177e-07, "loss": 0.0012, "reward": 1.788156270980835, "reward_std": 0.15171734988689423, "rewards/accuracy_reward": 0.6444061994552612, "rewards/format_reward": 1.0, "step": 248 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.002513881877839475, "grad_norm": 2.2984142488492023, "kl": 0.037109375, "learning_rate": 9.999844070878827e-07, "loss": 0.0015, "reward": 2.0389060974121094, "reward_std": 0.05888991802930832, "rewards/accuracy_reward": 0.845156192779541, "rewards/format_reward": 1.0, "step": 249 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.15625, "epoch": 0.002523977788995457, "grad_norm": 2.157808682507455, "kl": 0.03271484375, "learning_rate": 9.999842815927717e-07, "loss": 0.0013, "reward": 1.921375036239624, "reward_std": 0.17803120613098145, "rewards/accuracy_reward": 0.733875036239624, "rewards/format_reward": 1.0, "step": 250 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 414.875, "epoch": 0.0025340737001514385, "grad_norm": 2.0120252399956593, "kl": 0.035888671875, "learning_rate": 9.999841555946846e-07, "loss": 0.0014, "reward": 1.73046875, "reward_std": 0.0788014680147171, "rewards/accuracy_reward": 0.592968761920929, "rewards/format_reward": 1.0, "step": 251 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.03125, "epoch": 0.0025441696113074207, "grad_norm": 2.028507477306097, "kl": 0.0361328125, "learning_rate": 9.99984029093622e-07, "loss": 0.0014, "reward": 2.1231560707092285, "reward_std": 0.04446343332529068, "rewards/accuracy_reward": 0.9294062852859497, "rewards/format_reward": 1.0, "step": 252 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.15625, "epoch": 0.0025542655224634024, "grad_norm": 3.2287886126564365, "kl": 0.03515625, "learning_rate": 9.99983902089584e-07, "loss": 0.0014, "reward": 2.144624948501587, "reward_std": 0.023849256336688995, "rewards/accuracy_reward": 0.9446249604225159, "rewards/format_reward": 1.0, "step": 253 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.0, "epoch": 0.002564361433619384, "grad_norm": 2.4713451593773508, "kl": 0.034912109375, "learning_rate": 9.999837745825704e-07, "loss": 0.0014, "reward": 1.772687554359436, "reward_std": 0.04757148027420044, "rewards/accuracy_reward": 0.6226874589920044, "rewards/format_reward": 1.0, "step": 254 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 425.875, "epoch": 0.002574457344775366, "grad_norm": 3.802262072353181, "kl": 0.037109375, "learning_rate": 9.999836465725813e-07, "loss": 0.0015, "reward": 1.9454374313354492, "reward_std": 0.07700834423303604, "rewards/accuracy_reward": 0.757937490940094, "rewards/format_reward": 1.0, "step": 255 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 431.65625, "epoch": 0.002584553255931348, "grad_norm": 2.23121345417488, "kl": 0.0289306640625, "learning_rate": 9.99983518059617e-07, "loss": 0.0012, "reward": 1.6819686889648438, "reward_std": 0.27919209003448486, "rewards/accuracy_reward": 0.5632187128067017, "rewards/format_reward": 1.0, "step": 256 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.75, "epoch": 0.0025946491670873297, "grad_norm": 5.665588214050738, "kl": 0.03955078125, "learning_rate": 9.999833890436778e-07, "loss": 0.0016, "reward": 1.9824376106262207, "reward_std": 0.1090034544467926, "rewards/accuracy_reward": 0.7824374437332153, "rewards/format_reward": 1.0, "step": 257 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 429.65625, "epoch": 0.0026047450782433115, "grad_norm": 2.651897044909848, "kl": 0.0281982421875, "learning_rate": 9.999832595247633e-07, "loss": 0.0011, "reward": 1.8380000591278076, "reward_std": 0.0501093752682209, "rewards/accuracy_reward": 0.700499951839447, "rewards/format_reward": 1.0, "step": 258 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 423.3125, "epoch": 0.002614840989399293, "grad_norm": 2.3331434176533037, "kl": 0.0299072265625, "learning_rate": 9.999831295028742e-07, "loss": 0.0012, "reward": 1.6471874713897705, "reward_std": 0.18214167654514313, "rewards/accuracy_reward": 0.5284374952316284, "rewards/format_reward": 1.0, "step": 259 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.40625, "epoch": 0.002624936900555275, "grad_norm": 1.7614465924821605, "kl": 0.0306396484375, "learning_rate": 9.999829989780104e-07, "loss": 0.0012, "reward": 1.671375036239624, "reward_std": 0.17743219435214996, "rewards/accuracy_reward": 0.5526250004768372, "rewards/format_reward": 1.0, "step": 260 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.59375, "epoch": 0.002635032811711257, "grad_norm": 2.427752196606172, "kl": 0.04052734375, "learning_rate": 9.99982867950172e-07, "loss": 0.0016, "reward": 2.0279688835144043, "reward_std": 0.07435256242752075, "rewards/accuracy_reward": 0.8404687643051147, "rewards/format_reward": 1.0, "step": 261 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.78125, "epoch": 0.002645128722867239, "grad_norm": 4.473523809385294, "kl": 0.040771484375, "learning_rate": 9.999827364193595e-07, "loss": 0.0016, "reward": 1.8446563482284546, "reward_std": 0.08135692775249481, "rewards/accuracy_reward": 0.644656240940094, "rewards/format_reward": 1.0, "step": 262 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.1875, "epoch": 0.0026552246340232205, "grad_norm": 4.139083707553521, "kl": 0.04248046875, "learning_rate": 9.99982604385572e-07, "loss": 0.0017, "reward": 2.0176563262939453, "reward_std": 0.07800976932048798, "rewards/accuracy_reward": 0.8176562190055847, "rewards/format_reward": 1.0, "step": 263 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.0625, "epoch": 0.0026653205451792022, "grad_norm": 87.21573892813068, "kl": 0.03515625, "learning_rate": 9.999824718488109e-07, "loss": 0.0014, "reward": 2.0803751945495605, "reward_std": 0.06504951417446136, "rewards/accuracy_reward": 0.8991249799728394, "rewards/format_reward": 1.0, "step": 264 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.25, "epoch": 0.0026754164563351844, "grad_norm": 1.986977031239722, "kl": 0.035888671875, "learning_rate": 9.999823388090755e-07, "loss": 0.0014, "reward": 1.697312593460083, "reward_std": 0.05557265132665634, "rewards/accuracy_reward": 0.5473124980926514, "rewards/format_reward": 1.0, "step": 265 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.59375, "epoch": 0.002685512367491166, "grad_norm": 3.100214093251657, "kl": 0.032958984375, "learning_rate": 9.999822052663665e-07, "loss": 0.0013, "reward": 1.6515312194824219, "reward_std": 0.23143237829208374, "rewards/accuracy_reward": 0.5452812910079956, "rewards/format_reward": 1.0, "step": 266 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.90625, "epoch": 0.002695608278647148, "grad_norm": 3.0602629093333773, "kl": 0.04296875, "learning_rate": 9.999820712206835e-07, "loss": 0.0017, "reward": 2.0338125228881836, "reward_std": 0.030625220388174057, "rewards/accuracy_reward": 0.8338124752044678, "rewards/format_reward": 1.0, "step": 267 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 414.46875, "epoch": 0.0027057041898031296, "grad_norm": 3.5094551240585266, "kl": 0.03466796875, "learning_rate": 9.999819366720267e-07, "loss": 0.0014, "reward": 1.7622500658035278, "reward_std": 0.06191035360097885, "rewards/accuracy_reward": 0.612250030040741, "rewards/format_reward": 1.0, "step": 268 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 389.4375, "epoch": 0.0027158001009591117, "grad_norm": 1.1527303748660147, "kl": 0.030517578125, "learning_rate": 9.99981801620397e-07, "loss": 0.0012, "reward": 1.5598437786102295, "reward_std": 0.014205710962414742, "rewards/accuracy_reward": 0.4598437547683716, "rewards/format_reward": 1.0, "step": 269 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.0027258960121150934, "grad_norm": 2.227016722970364, "kl": 0.0390625, "learning_rate": 9.999816660657934e-07, "loss": 0.0016, "reward": 2.1086251735687256, "reward_std": 0.030259674414992332, "rewards/accuracy_reward": 0.908625066280365, "rewards/format_reward": 1.0, "step": 270 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.65625, "epoch": 0.002735991923271075, "grad_norm": 2.3197918660087846, "kl": 0.0341796875, "learning_rate": 9.999815300082165e-07, "loss": 0.0014, "reward": 1.7922500371932983, "reward_std": 0.03627890348434448, "rewards/accuracy_reward": 0.6484999656677246, "rewards/format_reward": 1.0, "step": 271 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 384.8125, "epoch": 0.002746087834427057, "grad_norm": 5.0674037010958095, "kl": 0.037353515625, "learning_rate": 9.999813934476668e-07, "loss": 0.0015, "reward": 1.8055624961853027, "reward_std": 0.037203043699264526, "rewards/accuracy_reward": 0.6618124842643738, "rewards/format_reward": 1.0, "step": 272 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 401.53125, "epoch": 0.002756183745583039, "grad_norm": 1.7676325409080016, "kl": 0.0380859375, "learning_rate": 9.999812563841442e-07, "loss": 0.0015, "reward": 1.8206562995910645, "reward_std": 0.02506965398788452, "rewards/accuracy_reward": 0.6769062280654907, "rewards/format_reward": 1.0, "step": 273 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.28125, "epoch": 0.0027662796567390208, "grad_norm": 2.1305178758985934, "kl": 0.04248046875, "learning_rate": 9.999811188176486e-07, "loss": 0.0017, "reward": 2.0734686851501465, "reward_std": 0.039929211139678955, "rewards/accuracy_reward": 0.8734687566757202, "rewards/format_reward": 1.0, "step": 274 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.75, "epoch": 0.0027763755678950025, "grad_norm": 1.5471548924762188, "kl": 0.029296875, "learning_rate": 9.999809807481806e-07, "loss": 0.0012, "reward": 1.7031874656677246, "reward_std": 0.1582699567079544, "rewards/accuracy_reward": 0.5719375610351562, "rewards/format_reward": 1.0, "step": 275 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.59375, "epoch": 0.002786471479050984, "grad_norm": 1.4700988183492945, "kl": 0.02880859375, "learning_rate": 9.999808421757398e-07, "loss": 0.0011, "reward": 1.6957499980926514, "reward_std": 0.16113539040088654, "rewards/accuracy_reward": 0.5770000219345093, "rewards/format_reward": 1.0, "step": 276 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 430.65625, "epoch": 0.0027965673902069664, "grad_norm": 1.8644375097498793, "kl": 0.0264892578125, "learning_rate": 9.999807031003268e-07, "loss": 0.0011, "reward": 1.5590624809265137, "reward_std": 0.013951298780739307, "rewards/accuracy_reward": 0.45906248688697815, "rewards/format_reward": 1.0, "step": 277 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 434.0625, "epoch": 0.002806663301362948, "grad_norm": 2.1198232975180624, "kl": 0.038818359375, "learning_rate": 9.999805635219413e-07, "loss": 0.0016, "reward": 2.1063437461853027, "reward_std": 0.03339764475822449, "rewards/accuracy_reward": 0.9063437581062317, "rewards/format_reward": 1.0, "step": 278 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 395.5625, "epoch": 0.00281675921251893, "grad_norm": 1.7939329022943722, "kl": 0.0283203125, "learning_rate": 9.999804234405837e-07, "loss": 0.0011, "reward": 1.585437536239624, "reward_std": 0.29060253500938416, "rewards/accuracy_reward": 0.5416874885559082, "rewards/format_reward": 0.9375, "step": 279 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.3125, "epoch": 0.0028268551236749115, "grad_norm": 2.318395986401882, "kl": 0.039794921875, "learning_rate": 9.999802828562542e-07, "loss": 0.0016, "reward": 2.141437530517578, "reward_std": 0.052161622792482376, "rewards/accuracy_reward": 0.9476875066757202, "rewards/format_reward": 1.0, "step": 280 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.03125, "epoch": 0.0028369510348308937, "grad_norm": 1.976267879338169, "kl": 0.040283203125, "learning_rate": 9.99980141768953e-07, "loss": 0.0016, "reward": 2.0465312004089355, "reward_std": 0.06394835561513901, "rewards/accuracy_reward": 0.8465312719345093, "rewards/format_reward": 1.0, "step": 281 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.4375, "epoch": 0.0028470469459868754, "grad_norm": 3.6625174743698596, "kl": 0.04443359375, "learning_rate": 9.9998000017868e-07, "loss": 0.0018, "reward": 1.9847187995910645, "reward_std": 0.040490563958883286, "rewards/accuracy_reward": 0.7847187519073486, "rewards/format_reward": 1.0, "step": 282 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.375, "epoch": 0.002857142857142857, "grad_norm": 2.1807810352304857, "kl": 0.04345703125, "learning_rate": 9.999798580854355e-07, "loss": 0.0017, "reward": 1.998093843460083, "reward_std": 0.086199551820755, "rewards/accuracy_reward": 0.8043437600135803, "rewards/format_reward": 1.0, "step": 283 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.8125, "epoch": 0.002867238768298839, "grad_norm": 2.27195164903905, "kl": 0.03759765625, "learning_rate": 9.999797154892195e-07, "loss": 0.0015, "reward": 2.0962188243865967, "reward_std": 0.04261964559555054, "rewards/accuracy_reward": 0.9024688005447388, "rewards/format_reward": 1.0, "step": 284 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 435.53125, "epoch": 0.0028773346794548206, "grad_norm": 1.6408317037776325, "kl": 0.0341796875, "learning_rate": 9.999795723900324e-07, "loss": 0.0014, "reward": 1.4641562700271606, "reward_std": 0.037679560482501984, "rewards/accuracy_reward": 0.3766562342643738, "rewards/format_reward": 1.0, "step": 285 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.0028874305906108027, "grad_norm": 1.8096039866384002, "kl": 0.04248046875, "learning_rate": 9.99979428787874e-07, "loss": 0.0017, "reward": 2.121187686920166, "reward_std": 0.031107008457183838, "rewards/accuracy_reward": 0.927437424659729, "rewards/format_reward": 1.0, "step": 286 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.125, "epoch": 0.0028975265017667845, "grad_norm": 2.535092669838653, "kl": 0.041748046875, "learning_rate": 9.999792846827447e-07, "loss": 0.0017, "reward": 1.7396875619888306, "reward_std": 0.02728220634162426, "rewards/accuracy_reward": 0.5896875262260437, "rewards/format_reward": 1.0, "step": 287 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 387.28125, "epoch": 0.002907622412922766, "grad_norm": 2.5269756176785836, "kl": 0.046630859375, "learning_rate": 9.999791400746447e-07, "loss": 0.0019, "reward": 1.7024375200271606, "reward_std": 0.05353553220629692, "rewards/accuracy_reward": 0.5586875081062317, "rewards/format_reward": 1.0, "step": 288 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.6875, "epoch": 0.002917718324078748, "grad_norm": 2.04502217403442, "kl": 0.046875, "learning_rate": 9.999789949635739e-07, "loss": 0.0019, "reward": 2.0224063396453857, "reward_std": 0.03153660520911217, "rewards/accuracy_reward": 0.8286563158035278, "rewards/format_reward": 1.0, "step": 289 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.03125, "epoch": 0.00292781423523473, "grad_norm": 3.6724546889393, "kl": 0.0390625, "learning_rate": 9.999788493495324e-07, "loss": 0.0016, "reward": 1.8189687728881836, "reward_std": 0.15550631284713745, "rewards/accuracy_reward": 0.6627187728881836, "rewards/format_reward": 1.0, "step": 290 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 405.25, "epoch": 0.002937910146390712, "grad_norm": 2.511331856964022, "kl": 0.03125, "learning_rate": 9.999787032325208e-07, "loss": 0.0012, "reward": 1.544281244277954, "reward_std": 0.036298152059316635, "rewards/accuracy_reward": 0.4505312740802765, "rewards/format_reward": 1.0, "step": 291 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.75, "epoch": 0.0029480060575466935, "grad_norm": 2.3688180573015014, "kl": 0.043701171875, "learning_rate": 9.99978556612539e-07, "loss": 0.0017, "reward": 2.0683751106262207, "reward_std": 0.05073293671011925, "rewards/accuracy_reward": 0.8683750033378601, "rewards/format_reward": 1.0, "step": 292 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 413.5, "epoch": 0.0029581019687026752, "grad_norm": 3.5791397074637494, "kl": 0.033203125, "learning_rate": 9.999784094895868e-07, "loss": 0.0013, "reward": 1.7176562547683716, "reward_std": 0.1871662139892578, "rewards/accuracy_reward": 0.5926562547683716, "rewards/format_reward": 1.0, "step": 293 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 408.9375, "epoch": 0.0029681978798586574, "grad_norm": 2.351743800874848, "kl": 0.041015625, "learning_rate": 9.999782618636648e-07, "loss": 0.0016, "reward": 1.734531283378601, "reward_std": 0.050530292093753815, "rewards/accuracy_reward": 0.5845312476158142, "rewards/format_reward": 1.0, "step": 294 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.625, "epoch": 0.002978293791014639, "grad_norm": 2.1077781685886303, "kl": 0.041748046875, "learning_rate": 9.99978113734773e-07, "loss": 0.0017, "reward": 2.041062355041504, "reward_std": 0.036461204290390015, "rewards/accuracy_reward": 0.8410624861717224, "rewards/format_reward": 1.0, "step": 295 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.28125, "epoch": 0.002988389702170621, "grad_norm": 2.9282752366256464, "kl": 0.0291748046875, "learning_rate": 9.999779651029115e-07, "loss": 0.0012, "reward": 1.9020626544952393, "reward_std": 0.28277021646499634, "rewards/accuracy_reward": 0.7395625114440918, "rewards/format_reward": 1.0, "step": 296 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.0029984856133266026, "grad_norm": 2.1718704661829022, "kl": 0.039794921875, "learning_rate": 9.999778159680806e-07, "loss": 0.0016, "reward": 2.0559375286102295, "reward_std": 0.02439400553703308, "rewards/accuracy_reward": 0.8559374809265137, "rewards/format_reward": 1.0, "step": 297 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.75, "epoch": 0.0030085815244825847, "grad_norm": 2.1181420153300063, "kl": 0.032470703125, "learning_rate": 9.999776663302804e-07, "loss": 0.0013, "reward": 1.8386874198913574, "reward_std": 0.04226094111800194, "rewards/accuracy_reward": 0.6886874437332153, "rewards/format_reward": 1.0, "step": 298 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.90625, "epoch": 0.0030186774356385664, "grad_norm": 2.3848102751581153, "kl": 0.037353515625, "learning_rate": 9.999775161895108e-07, "loss": 0.0015, "reward": 1.9327187538146973, "reward_std": 0.1829453855752945, "rewards/accuracy_reward": 0.7577186822891235, "rewards/format_reward": 1.0, "step": 299 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.1875, "epoch": 0.003028773346794548, "grad_norm": 3.4730015512092693, "kl": 0.033447265625, "learning_rate": 9.999773655457722e-07, "loss": 0.0013, "reward": 2.1713125705718994, "reward_std": 0.04460848867893219, "rewards/accuracy_reward": 0.9838124513626099, "rewards/format_reward": 1.0, "step": 300 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.71875, "epoch": 0.00303886925795053, "grad_norm": 2.033501912073084, "kl": 0.04443359375, "learning_rate": 9.999772143990648e-07, "loss": 0.0018, "reward": 2.1075210571289062, "reward_std": 0.019194740802049637, "rewards/accuracy_reward": 0.9075208902359009, "rewards/format_reward": 1.0, "step": 301 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.03125, "epoch": 0.003048965169106512, "grad_norm": 2.033196942000885, "kl": 0.03759765625, "learning_rate": 9.999770627493887e-07, "loss": 0.0015, "reward": 2.13993763923645, "reward_std": 0.04851624742150307, "rewards/accuracy_reward": 0.9524375200271606, "rewards/format_reward": 1.0, "step": 302 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.34375, "epoch": 0.0030590610802624938, "grad_norm": 2.2571725988058446, "kl": 0.044677734375, "learning_rate": 9.999769105967439e-07, "loss": 0.0018, "reward": 2.076437473297119, "reward_std": 0.11565415561199188, "rewards/accuracy_reward": 0.895187497138977, "rewards/format_reward": 1.0, "step": 303 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.4375, "epoch": 0.0030691569914184755, "grad_norm": 2.3226767602098106, "kl": 0.036865234375, "learning_rate": 9.999767579411307e-07, "loss": 0.0015, "reward": 1.803781270980835, "reward_std": 0.16046705842018127, "rewards/accuracy_reward": 0.672531247138977, "rewards/format_reward": 1.0, "step": 304 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.0030792529025744572, "grad_norm": 4.704315641334519, "kl": 0.041259765625, "learning_rate": 9.999766047825492e-07, "loss": 0.0016, "reward": 1.7823125123977661, "reward_std": 0.13318413496017456, "rewards/accuracy_reward": 0.6385624408721924, "rewards/format_reward": 1.0, "step": 305 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.0030893488137304394, "grad_norm": 1.956420691931211, "kl": 0.04052734375, "learning_rate": 9.999764511209995e-07, "loss": 0.0016, "reward": 2.0189688205718994, "reward_std": 0.047731608152389526, "rewards/accuracy_reward": 0.8439687490463257, "rewards/format_reward": 1.0, "step": 306 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.40625, "epoch": 0.003099444724886421, "grad_norm": 2.434489857787716, "kl": 0.043212890625, "learning_rate": 9.999762969564818e-07, "loss": 0.0017, "reward": 2.0088748931884766, "reward_std": 0.05514276772737503, "rewards/accuracy_reward": 0.8213750720024109, "rewards/format_reward": 1.0, "step": 307 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.46875, "epoch": 0.003109540636042403, "grad_norm": 2.452647323223319, "kl": 0.040283203125, "learning_rate": 9.999761422889964e-07, "loss": 0.0016, "reward": 1.906093716621399, "reward_std": 0.16622231900691986, "rewards/accuracy_reward": 0.7498437762260437, "rewards/format_reward": 1.0, "step": 308 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.84375, "epoch": 0.0031196365471983845, "grad_norm": 2.5387679563905468, "kl": 0.041015625, "learning_rate": 9.999759871185433e-07, "loss": 0.0016, "reward": 2.121687650680542, "reward_std": 0.04791989177465439, "rewards/accuracy_reward": 0.9279375076293945, "rewards/format_reward": 1.0, "step": 309 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.84375, "epoch": 0.0031297324583543663, "grad_norm": 3.851284782420172, "kl": 0.044677734375, "learning_rate": 9.999758314451229e-07, "loss": 0.0018, "reward": 2.1167500019073486, "reward_std": 0.06037373095750809, "rewards/accuracy_reward": 0.9354999661445618, "rewards/format_reward": 1.0, "step": 310 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 390.4375, "epoch": 0.0031398283695103484, "grad_norm": 1.8132743748894933, "kl": 0.0419921875, "learning_rate": 9.999756752687347e-07, "loss": 0.0017, "reward": 1.7410937547683716, "reward_std": 0.024584993720054626, "rewards/accuracy_reward": 0.5910937786102295, "rewards/format_reward": 1.0, "step": 311 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.78125, "epoch": 0.00314992428066633, "grad_norm": 2.3989365358403476, "kl": 0.03125, "learning_rate": 9.999755185893796e-07, "loss": 0.0013, "reward": 1.843937635421753, "reward_std": 0.3415958285331726, "rewards/accuracy_reward": 0.7126874923706055, "rewards/format_reward": 1.0, "step": 312 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.28125, "epoch": 0.003160020191822312, "grad_norm": 1.8380430213577887, "kl": 0.046142578125, "learning_rate": 9.999753614070574e-07, "loss": 0.0018, "reward": 2.0744376182556152, "reward_std": 0.0369720533490181, "rewards/accuracy_reward": 0.8744375109672546, "rewards/format_reward": 1.0, "step": 313 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.0031701161029782936, "grad_norm": 1.9886275215098117, "kl": 0.0267333984375, "learning_rate": 9.999752037217683e-07, "loss": 0.0011, "reward": 1.9554375410079956, "reward_std": 0.2810952663421631, "rewards/accuracy_reward": 0.7866874933242798, "rewards/format_reward": 1.0, "step": 314 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 419.0, "epoch": 0.0031802120141342758, "grad_norm": 2.0995682493348045, "kl": 0.035400390625, "learning_rate": 9.999750455335126e-07, "loss": 0.0014, "reward": 1.8959063291549683, "reward_std": 0.3265809416770935, "rewards/accuracy_reward": 0.7334062457084656, "rewards/format_reward": 1.0, "step": 315 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.125, "epoch": 0.0031903079252902575, "grad_norm": 2.188603595241271, "kl": 0.04443359375, "learning_rate": 9.999748868422901e-07, "loss": 0.0018, "reward": 1.972000002861023, "reward_std": 0.13029327988624573, "rewards/accuracy_reward": 0.778249979019165, "rewards/format_reward": 1.0, "step": 316 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 391.40625, "epoch": 0.003200403836446239, "grad_norm": 1.4893714949579129, "kl": 0.0322265625, "learning_rate": 9.999747276481014e-07, "loss": 0.0013, "reward": 1.6914374828338623, "reward_std": 0.019432658329606056, "rewards/accuracy_reward": 0.5414375066757202, "rewards/format_reward": 1.0, "step": 317 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.003210499747602221, "grad_norm": 2.3850669681372354, "kl": 0.041748046875, "learning_rate": 9.999745679509464e-07, "loss": 0.0017, "reward": 2.011624813079834, "reward_std": 0.18696093559265137, "rewards/accuracy_reward": 0.8303750157356262, "rewards/format_reward": 1.0, "step": 318 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.84375, "epoch": 0.003220595658758203, "grad_norm": 2.0967407837039427, "kl": 0.044189453125, "learning_rate": 9.999744077508252e-07, "loss": 0.0018, "reward": 1.88881254196167, "reward_std": 0.10154133290052414, "rewards/accuracy_reward": 0.7013125419616699, "rewards/format_reward": 1.0, "step": 319 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.84375, "epoch": 0.003230691569914185, "grad_norm": 2.590624880962643, "kl": 0.039306640625, "learning_rate": 9.999742470477382e-07, "loss": 0.0016, "reward": 1.8192813396453857, "reward_std": 0.17971409857273102, "rewards/accuracy_reward": 0.6505312919616699, "rewards/format_reward": 1.0, "step": 320 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.1875, "epoch": 0.0032407874810701665, "grad_norm": 3.1314958940937445, "kl": 0.04443359375, "learning_rate": 9.999740858416855e-07, "loss": 0.0018, "reward": 2.1617188453674316, "reward_std": 0.027404412627220154, "rewards/accuracy_reward": 0.961718738079071, "rewards/format_reward": 1.0, "step": 321 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 388.4375, "epoch": 0.0032508833922261482, "grad_norm": 2.1021901010491684, "kl": 0.040771484375, "learning_rate": 9.99973924132667e-07, "loss": 0.0016, "reward": 1.8181562423706055, "reward_std": 0.05890439450740814, "rewards/accuracy_reward": 0.6806561946868896, "rewards/format_reward": 1.0, "step": 322 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.6875, "epoch": 0.0032609793033821304, "grad_norm": 2.0090902859865993, "kl": 0.03857421875, "learning_rate": 9.999737619206833e-07, "loss": 0.0015, "reward": 2.0102500915527344, "reward_std": 0.17923778295516968, "rewards/accuracy_reward": 0.8289999961853027, "rewards/format_reward": 1.0, "step": 323 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.71875, "epoch": 0.003271075214538112, "grad_norm": 2.3968316772896174, "kl": 0.039306640625, "learning_rate": 9.999735992057341e-07, "loss": 0.0016, "reward": 2.1199686527252197, "reward_std": 0.039076074957847595, "rewards/accuracy_reward": 0.9262187480926514, "rewards/format_reward": 1.0, "step": 324 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.28125, "epoch": 0.003281171125694094, "grad_norm": 2.3202611327077696, "kl": 0.04296875, "learning_rate": 9.9997343598782e-07, "loss": 0.0017, "reward": 1.8192813396453857, "reward_std": 0.12690407037734985, "rewards/accuracy_reward": 0.6692812442779541, "rewards/format_reward": 1.0, "step": 325 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.0032912670368500756, "grad_norm": 3.977206525250537, "kl": 0.02734375, "learning_rate": 9.99973272266941e-07, "loss": 0.0011, "reward": 1.7280625104904175, "reward_std": 0.2524643540382385, "rewards/accuracy_reward": 0.6093125343322754, "rewards/format_reward": 1.0, "step": 326 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.25, "epoch": 0.0033013629480060577, "grad_norm": 4.127660647085151, "kl": 0.03271484375, "learning_rate": 9.999731080430968e-07, "loss": 0.0013, "reward": 1.711593747138977, "reward_std": 0.26868894696235657, "rewards/accuracy_reward": 0.586593747138977, "rewards/format_reward": 1.0, "step": 327 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 387.59375, "epoch": 0.0033114588591620395, "grad_norm": 1.6959975243649514, "kl": 0.03759765625, "learning_rate": 9.999729433162884e-07, "loss": 0.0015, "reward": 1.5276875495910645, "reward_std": 0.039044737815856934, "rewards/accuracy_reward": 0.4401875138282776, "rewards/format_reward": 1.0, "step": 328 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.003321554770318021, "grad_norm": 2.5658222122536136, "kl": 0.04638671875, "learning_rate": 9.999727780865154e-07, "loss": 0.0019, "reward": 2.115781307220459, "reward_std": 0.03791564702987671, "rewards/accuracy_reward": 0.9157812595367432, "rewards/format_reward": 1.0, "step": 329 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 384.96875, "epoch": 0.003331650681474003, "grad_norm": 1.9837823862868669, "kl": 0.04541015625, "learning_rate": 9.999726123537783e-07, "loss": 0.0018, "reward": 1.862781286239624, "reward_std": 0.040977153927087784, "rewards/accuracy_reward": 0.7190312743186951, "rewards/format_reward": 1.0, "step": 330 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 408.1875, "epoch": 0.003341746592629985, "grad_norm": 3.4131969388608767, "kl": 0.048828125, "learning_rate": 9.999724461180768e-07, "loss": 0.002, "reward": 1.9553438425064087, "reward_std": 0.06003564968705177, "rewards/accuracy_reward": 0.7553436756134033, "rewards/format_reward": 1.0, "step": 331 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.375, "epoch": 0.0033518425037859668, "grad_norm": 1.6852024321022117, "kl": 0.0439453125, "learning_rate": 9.999722793794113e-07, "loss": 0.0018, "reward": 1.8514063358306885, "reward_std": 0.02887752279639244, "rewards/accuracy_reward": 0.7139062285423279, "rewards/format_reward": 1.0, "step": 332 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.15625, "epoch": 0.0033619384149419485, "grad_norm": 4.589231491567459, "kl": 0.036865234375, "learning_rate": 9.999721121377824e-07, "loss": 0.0015, "reward": 2.0177502632141113, "reward_std": 0.13193698227405548, "rewards/accuracy_reward": 0.8240000009536743, "rewards/format_reward": 1.0, "step": 333 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.0033720343260979302, "grad_norm": 2.091116351537923, "kl": 0.044677734375, "learning_rate": 9.999719443931898e-07, "loss": 0.0018, "reward": 1.9475938081741333, "reward_std": 0.05075972527265549, "rewards/accuracy_reward": 0.7538437247276306, "rewards/format_reward": 1.0, "step": 334 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 406.25, "epoch": 0.0033821302372539124, "grad_norm": 9.510168183747044, "kl": 0.045166015625, "learning_rate": 9.999717761456336e-07, "loss": 0.0018, "reward": 1.7303125858306885, "reward_std": 0.060959868133068085, "rewards/accuracy_reward": 0.5928125381469727, "rewards/format_reward": 1.0, "step": 335 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 432.25, "epoch": 0.003392226148409894, "grad_norm": 3.131277740869721, "kl": 0.050048828125, "learning_rate": 9.999716073951142e-07, "loss": 0.002, "reward": 1.9948126077651978, "reward_std": 0.05922267958521843, "rewards/accuracy_reward": 0.8073124885559082, "rewards/format_reward": 1.0, "step": 336 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.003402322059565876, "grad_norm": 1.7991690718688704, "kl": 0.04052734375, "learning_rate": 9.999714381416316e-07, "loss": 0.0016, "reward": 1.9251251220703125, "reward_std": 0.15655140578746796, "rewards/accuracy_reward": 0.762624979019165, "rewards/format_reward": 1.0, "step": 337 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.6875, "epoch": 0.0034124179707218576, "grad_norm": 2.627618017375596, "kl": 0.037353515625, "learning_rate": 9.99971268385186e-07, "loss": 0.0015, "reward": 1.8748126029968262, "reward_std": 0.2976238429546356, "rewards/accuracy_reward": 0.7248125076293945, "rewards/format_reward": 1.0, "step": 338 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.15625, "epoch": 0.0034225138818778393, "grad_norm": 1.755626171268834, "kl": 0.0301513671875, "learning_rate": 9.999710981257778e-07, "loss": 0.0012, "reward": 1.8375000953674316, "reward_std": 0.02231631986796856, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 339 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.53125, "epoch": 0.0034326097930338214, "grad_norm": 1.8990712300644876, "kl": 0.038330078125, "learning_rate": 9.99970927363407e-07, "loss": 0.0015, "reward": 1.7128437757492065, "reward_std": 0.1675553023815155, "rewards/accuracy_reward": 0.5815938115119934, "rewards/format_reward": 1.0, "step": 340 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 414.53125, "epoch": 0.003442705704189803, "grad_norm": 1.8689894364770485, "kl": 0.040771484375, "learning_rate": 9.999707560980739e-07, "loss": 0.0016, "reward": 1.7194688320159912, "reward_std": 0.03860195726156235, "rewards/accuracy_reward": 0.5694687366485596, "rewards/format_reward": 1.0, "step": 341 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.34375, "epoch": 0.003452801615345785, "grad_norm": 3.090038805366402, "kl": 0.044189453125, "learning_rate": 9.999705843297783e-07, "loss": 0.0018, "reward": 1.8334062099456787, "reward_std": 0.17600136995315552, "rewards/accuracy_reward": 0.6709063053131104, "rewards/format_reward": 1.0, "step": 342 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.9375, "epoch": 0.0034628975265017666, "grad_norm": 3.247091327393654, "kl": 0.03955078125, "learning_rate": 9.999704120585209e-07, "loss": 0.0016, "reward": 2.06962513923645, "reward_std": 0.04727005958557129, "rewards/accuracy_reward": 0.8821249604225159, "rewards/format_reward": 1.0, "step": 343 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 424.53125, "epoch": 0.0034729934376577488, "grad_norm": 2.7278772668222087, "kl": 0.04541015625, "learning_rate": 9.999702392843012e-07, "loss": 0.0018, "reward": 1.8432186841964722, "reward_std": 0.014334147796034813, "rewards/accuracy_reward": 0.6932187676429749, "rewards/format_reward": 1.0, "step": 344 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.1875, "epoch": 0.0034830893488137305, "grad_norm": 1.755820821276176, "kl": 0.0380859375, "learning_rate": 9.999700660071202e-07, "loss": 0.0015, "reward": 1.7376562356948853, "reward_std": 0.30137038230895996, "rewards/accuracy_reward": 0.6439062356948853, "rewards/format_reward": 0.96875, "step": 345 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.5625, "epoch": 0.003493185259969712, "grad_norm": 2.257590336542808, "kl": 0.048828125, "learning_rate": 9.999698922269773e-07, "loss": 0.002, "reward": 2.007999897003174, "reward_std": 0.04192490875720978, "rewards/accuracy_reward": 0.8080000281333923, "rewards/format_reward": 1.0, "step": 346 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.003503281171125694, "grad_norm": 2.632183610961355, "kl": 0.051025390625, "learning_rate": 9.999697179438733e-07, "loss": 0.002, "reward": 2.018249988555908, "reward_std": 0.03969603031873703, "rewards/accuracy_reward": 0.8245000243186951, "rewards/format_reward": 1.0, "step": 347 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.46875, "epoch": 0.003513377082281676, "grad_norm": 1.8233129135044241, "kl": 0.038818359375, "learning_rate": 9.999695431578079e-07, "loss": 0.0015, "reward": 1.7301874160766602, "reward_std": 0.17038437724113464, "rewards/accuracy_reward": 0.6051875352859497, "rewards/format_reward": 1.0, "step": 348 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.71875, "epoch": 0.003523472993437658, "grad_norm": 1.91925486842519, "kl": 0.041259765625, "learning_rate": 9.999693678687815e-07, "loss": 0.0017, "reward": 2.058406352996826, "reward_std": 0.03054901398718357, "rewards/accuracy_reward": 0.8646562099456787, "rewards/format_reward": 1.0, "step": 349 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.5, "epoch": 0.0035335689045936395, "grad_norm": 2.101055768923571, "kl": 0.04833984375, "learning_rate": 9.999691920767943e-07, "loss": 0.0019, "reward": 2.049781322479248, "reward_std": 0.0433051735162735, "rewards/accuracy_reward": 0.8497812747955322, "rewards/format_reward": 1.0, "step": 350 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.0035436648157496213, "grad_norm": 2.058558045152799, "kl": 0.038818359375, "learning_rate": 9.999690157818465e-07, "loss": 0.0016, "reward": 2.109499931335449, "reward_std": 0.04848100244998932, "rewards/accuracy_reward": 0.9157500267028809, "rewards/format_reward": 1.0, "step": 351 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.78125, "epoch": 0.0035537607269056034, "grad_norm": 2.3304951018186575, "kl": 0.046875, "learning_rate": 9.99968838983938e-07, "loss": 0.0019, "reward": 1.9908437728881836, "reward_std": 0.03133280575275421, "rewards/accuracy_reward": 0.7908437848091125, "rewards/format_reward": 1.0, "step": 352 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.3125, "epoch": 0.003563856638061585, "grad_norm": 2.218639417141315, "kl": 0.042236328125, "learning_rate": 9.999686616830693e-07, "loss": 0.0017, "reward": 1.8337500095367432, "reward_std": 0.1625032126903534, "rewards/accuracy_reward": 0.6712499856948853, "rewards/format_reward": 1.0, "step": 353 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.15625, "epoch": 0.003573952549217567, "grad_norm": 1.4451948386198863, "kl": 0.039306640625, "learning_rate": 9.999684838792406e-07, "loss": 0.0016, "reward": 1.946000099182129, "reward_std": 0.15379081666469574, "rewards/accuracy_reward": 0.7709999680519104, "rewards/format_reward": 1.0, "step": 354 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.46875, "epoch": 0.0035840484603735486, "grad_norm": 3.9681124621012733, "kl": 0.04541015625, "learning_rate": 9.999683055724516e-07, "loss": 0.0018, "reward": 2.0720624923706055, "reward_std": 0.03327076509594917, "rewards/accuracy_reward": 0.8720624446868896, "rewards/format_reward": 1.0, "step": 355 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.65625, "epoch": 0.0035941443715295307, "grad_norm": 2.7838320269994488, "kl": 0.042724609375, "learning_rate": 9.999681267627031e-07, "loss": 0.0017, "reward": 2.0917811393737793, "reward_std": 0.033484797924757004, "rewards/accuracy_reward": 0.8980312347412109, "rewards/format_reward": 1.0, "step": 356 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 411.4375, "epoch": 0.0036042402826855125, "grad_norm": 0.9830784066525207, "kl": 0.040771484375, "learning_rate": 9.99967947449995e-07, "loss": 0.0016, "reward": 1.566593885421753, "reward_std": 0.008496800437569618, "rewards/accuracy_reward": 0.46659374237060547, "rewards/format_reward": 1.0, "step": 357 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 421.46875, "epoch": 0.003614336193841494, "grad_norm": 1.7439481887670896, "kl": 0.032958984375, "learning_rate": 9.999677676343273e-07, "loss": 0.0013, "reward": 1.5715000629425049, "reward_std": 0.10478249192237854, "rewards/accuracy_reward": 0.4714999794960022, "rewards/format_reward": 1.0, "step": 358 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.09375, "epoch": 0.003624432104997476, "grad_norm": 1.9146666906609402, "kl": 0.041015625, "learning_rate": 9.999675873157006e-07, "loss": 0.0016, "reward": 1.790374994277954, "reward_std": 0.130710631608963, "rewards/accuracy_reward": 0.6466250419616699, "rewards/format_reward": 1.0, "step": 359 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.125, "epoch": 0.003634528016153458, "grad_norm": 2.795271066426364, "kl": 0.04736328125, "learning_rate": 9.999674064941146e-07, "loss": 0.0019, "reward": 2.0811874866485596, "reward_std": 0.030237672850489616, "rewards/accuracy_reward": 0.8811875581741333, "rewards/format_reward": 1.0, "step": 360 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.71875, "epoch": 0.00364462392730944, "grad_norm": 2.362661998088272, "kl": 0.043212890625, "learning_rate": 9.999672251695698e-07, "loss": 0.0017, "reward": 1.947156310081482, "reward_std": 0.04044144228100777, "rewards/accuracy_reward": 0.7471563220024109, "rewards/format_reward": 1.0, "step": 361 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 435.15625, "epoch": 0.0036547198384654215, "grad_norm": 2.372066665277187, "kl": 0.0390625, "learning_rate": 9.999670433420664e-07, "loss": 0.0016, "reward": 2.030843734741211, "reward_std": 0.15766532719135284, "rewards/accuracy_reward": 0.855843722820282, "rewards/format_reward": 1.0, "step": 362 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.3125, "epoch": 0.0036648157496214032, "grad_norm": 3.0590952661242543, "kl": 0.051025390625, "learning_rate": 9.999668610116043e-07, "loss": 0.002, "reward": 1.9535000324249268, "reward_std": 0.04784667119383812, "rewards/accuracy_reward": 0.7597500085830688, "rewards/format_reward": 1.0, "step": 363 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.71875, "epoch": 0.003674911660777385, "grad_norm": 1.9657774492304312, "kl": 0.046630859375, "learning_rate": 9.99966678178184e-07, "loss": 0.0019, "reward": 2.0630626678466797, "reward_std": 0.04238676652312279, "rewards/accuracy_reward": 0.8693125247955322, "rewards/format_reward": 1.0, "step": 364 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.03125, "epoch": 0.003685007571933367, "grad_norm": 2.287787603699243, "kl": 0.037109375, "learning_rate": 9.999664948418054e-07, "loss": 0.0015, "reward": 1.8613437414169312, "reward_std": 0.14712344110012054, "rewards/accuracy_reward": 0.717593789100647, "rewards/format_reward": 1.0, "step": 365 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.875, "epoch": 0.003695103483089349, "grad_norm": 2.2122316838166327, "kl": 0.04931640625, "learning_rate": 9.999663110024691e-07, "loss": 0.002, "reward": 2.074937343597412, "reward_std": 0.03703717887401581, "rewards/accuracy_reward": 0.8749374747276306, "rewards/format_reward": 1.0, "step": 366 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.875, "epoch": 0.0037051993942453306, "grad_norm": 1.7706070345282965, "kl": 0.044677734375, "learning_rate": 9.999661266601748e-07, "loss": 0.0018, "reward": 2.185499906539917, "reward_std": 0.011179214343428612, "rewards/accuracy_reward": 0.9854999780654907, "rewards/format_reward": 1.0, "step": 367 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.40625, "epoch": 0.0037152953054013123, "grad_norm": 1.9587930889469356, "kl": 0.044921875, "learning_rate": 9.999659418149232e-07, "loss": 0.0018, "reward": 1.9201562404632568, "reward_std": 0.18070945143699646, "rewards/accuracy_reward": 0.7576562166213989, "rewards/format_reward": 1.0, "step": 368 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 428.5, "epoch": 0.0037253912165572944, "grad_norm": 1.7453582670228156, "kl": 0.042236328125, "learning_rate": 9.99965756466714e-07, "loss": 0.0017, "reward": 1.823625087738037, "reward_std": 0.030431155115365982, "rewards/accuracy_reward": 0.6798750162124634, "rewards/format_reward": 1.0, "step": 369 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.84375, "epoch": 0.003735487127713276, "grad_norm": 1.9226805958585824, "kl": 0.0361328125, "learning_rate": 9.999655706155475e-07, "loss": 0.0014, "reward": 1.8385624885559082, "reward_std": 0.11210741847753525, "rewards/accuracy_reward": 0.694812536239624, "rewards/format_reward": 1.0, "step": 370 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.40625, "epoch": 0.003745583038869258, "grad_norm": 1.7196604765460224, "kl": 0.04345703125, "learning_rate": 9.99965384261424e-07, "loss": 0.0017, "reward": 2.076125144958496, "reward_std": 0.0346112959086895, "rewards/accuracy_reward": 0.8823750019073486, "rewards/format_reward": 1.0, "step": 371 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.21875, "epoch": 0.0037556789500252396, "grad_norm": 2.65883705624486, "kl": 0.05224609375, "learning_rate": 9.999651974043437e-07, "loss": 0.0021, "reward": 2.12290620803833, "reward_std": 0.042028963565826416, "rewards/accuracy_reward": 0.9291561841964722, "rewards/format_reward": 1.0, "step": 372 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 443.5625, "epoch": 0.0037657748611812218, "grad_norm": 2.157208588520911, "kl": 0.03955078125, "learning_rate": 9.999650100443067e-07, "loss": 0.0016, "reward": 1.9977188110351562, "reward_std": 0.057665757834911346, "rewards/accuracy_reward": 0.8289687037467957, "rewards/format_reward": 1.0, "step": 373 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 427.59375, "epoch": 0.0037758707723372035, "grad_norm": 1.6475407496483019, "kl": 0.04150390625, "learning_rate": 9.999648221813134e-07, "loss": 0.0017, "reward": 1.8437187671661377, "reward_std": 0.16530105471611023, "rewards/accuracy_reward": 0.6749687194824219, "rewards/format_reward": 1.0, "step": 374 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.09375, "epoch": 0.003785966683493185, "grad_norm": 8.78368604683135, "kl": 0.04345703125, "learning_rate": 9.999646338153635e-07, "loss": 0.0017, "reward": 1.8726874589920044, "reward_std": 0.1733321249485016, "rewards/accuracy_reward": 0.7039374709129333, "rewards/format_reward": 1.0, "step": 375 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.4375, "epoch": 0.003796062594649167, "grad_norm": 2.1209400806182135, "kl": 0.03955078125, "learning_rate": 9.999644449464578e-07, "loss": 0.0016, "reward": 1.9893125295639038, "reward_std": 0.1794608235359192, "rewards/accuracy_reward": 0.8080624938011169, "rewards/format_reward": 1.0, "step": 376 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 414.8125, "epoch": 0.003806158505805149, "grad_norm": 1.0396810931758045, "kl": 0.037841796875, "learning_rate": 9.999642555745962e-07, "loss": 0.0015, "reward": 1.5751874446868896, "reward_std": 0.016978323459625244, "rewards/accuracy_reward": 0.4814375042915344, "rewards/format_reward": 1.0, "step": 377 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.003816254416961131, "grad_norm": 3.104516472983755, "kl": 0.037841796875, "learning_rate": 9.999640656997787e-07, "loss": 0.0015, "reward": 2.1249687671661377, "reward_std": 0.06321851164102554, "rewards/accuracy_reward": 0.949968695640564, "rewards/format_reward": 1.0, "step": 378 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 428.46875, "epoch": 0.0038263503281171125, "grad_norm": 1.9636233190393502, "kl": 0.03857421875, "learning_rate": 9.999638753220058e-07, "loss": 0.0015, "reward": 1.7147188186645508, "reward_std": 0.2925739586353302, "rewards/accuracy_reward": 0.577218770980835, "rewards/format_reward": 1.0, "step": 379 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 427.78125, "epoch": 0.0038364462392730943, "grad_norm": 2.381894290856729, "kl": 0.042724609375, "learning_rate": 9.999636844412776e-07, "loss": 0.0017, "reward": 1.9335312843322754, "reward_std": 0.1359749138355255, "rewards/accuracy_reward": 0.7397812604904175, "rewards/format_reward": 1.0, "step": 380 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 421.03125, "epoch": 0.0038465421504290764, "grad_norm": 1.7040449150961554, "kl": 0.041259765625, "learning_rate": 9.999634930575942e-07, "loss": 0.0017, "reward": 1.8039376735687256, "reward_std": 0.04557451605796814, "rewards/accuracy_reward": 0.6664375066757202, "rewards/format_reward": 1.0, "step": 381 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 434.65625, "epoch": 0.003856638061585058, "grad_norm": 1.4480065457348894, "kl": 0.037841796875, "learning_rate": 9.99963301170956e-07, "loss": 0.0015, "reward": 1.5498437881469727, "reward_std": 0.03430645540356636, "rewards/accuracy_reward": 0.45609375834465027, "rewards/format_reward": 1.0, "step": 382 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.90625, "epoch": 0.00386673397274104, "grad_norm": 1.724216686330939, "kl": 0.046875, "learning_rate": 9.999631087813629e-07, "loss": 0.0019, "reward": 1.8334689140319824, "reward_std": 0.03555557131767273, "rewards/accuracy_reward": 0.6397187113761902, "rewards/format_reward": 1.0, "step": 383 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 421.9375, "epoch": 0.0038768298838970216, "grad_norm": 2.8703862474160213, "kl": 0.04541015625, "learning_rate": 9.999629158888152e-07, "loss": 0.0018, "reward": 2.0065627098083496, "reward_std": 0.05818052589893341, "rewards/accuracy_reward": 0.8253124952316284, "rewards/format_reward": 1.0, "step": 384 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.5625, "epoch": 0.0038869257950530037, "grad_norm": 2.065421897037824, "kl": 0.03857421875, "learning_rate": 9.99962722493313e-07, "loss": 0.0015, "reward": 1.9842188358306885, "reward_std": 0.05733494460582733, "rewards/accuracy_reward": 0.7967187166213989, "rewards/format_reward": 1.0, "step": 385 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 434.28125, "epoch": 0.0038970217062089855, "grad_norm": 2.610935747123381, "kl": 0.038818359375, "learning_rate": 9.99962528594857e-07, "loss": 0.0016, "reward": 1.616187572479248, "reward_std": 0.18023325502872467, "rewards/accuracy_reward": 0.45993751287460327, "rewards/format_reward": 1.0, "step": 386 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.78125, "epoch": 0.003907117617364967, "grad_norm": 2.0133883952016274, "kl": 0.0400390625, "learning_rate": 9.99962334193447e-07, "loss": 0.0016, "reward": 2.0302188396453857, "reward_std": 0.05079733580350876, "rewards/accuracy_reward": 0.842718780040741, "rewards/format_reward": 1.0, "step": 387 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.003917213528520949, "grad_norm": 2.396513676829873, "kl": 0.042724609375, "learning_rate": 9.999621392890832e-07, "loss": 0.0017, "reward": 2.0161561965942383, "reward_std": 0.06749871373176575, "rewards/accuracy_reward": 0.8224061727523804, "rewards/format_reward": 1.0, "step": 388 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.1875, "epoch": 0.003927309439676931, "grad_norm": 1.8421783053633838, "kl": 0.034423828125, "learning_rate": 9.999619438817656e-07, "loss": 0.0014, "reward": 1.7760626077651978, "reward_std": 0.149233877658844, "rewards/accuracy_reward": 0.6323124766349792, "rewards/format_reward": 1.0, "step": 389 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 431.5625, "epoch": 0.003937405350832913, "grad_norm": 2.2993498871993627, "kl": 0.044921875, "learning_rate": 9.999617479714947e-07, "loss": 0.0018, "reward": 2.0327811241149902, "reward_std": 0.05486287176609039, "rewards/accuracy_reward": 0.8452812433242798, "rewards/format_reward": 1.0, "step": 390 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 423.5625, "epoch": 0.003947501261988894, "grad_norm": 1.9227711512605243, "kl": 0.036376953125, "learning_rate": 9.999615515582707e-07, "loss": 0.0015, "reward": 1.7098125219345093, "reward_std": 0.1802482157945633, "rewards/accuracy_reward": 0.5848124623298645, "rewards/format_reward": 1.0, "step": 391 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.25, "epoch": 0.003957597173144876, "grad_norm": 6.057903353962193, "kl": 0.04150390625, "learning_rate": 9.999613546420937e-07, "loss": 0.0017, "reward": 2.074031352996826, "reward_std": 0.023350801318883896, "rewards/accuracy_reward": 0.8740312457084656, "rewards/format_reward": 1.0, "step": 392 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.78125, "epoch": 0.003967693084300858, "grad_norm": 2.3273285131067065, "kl": 0.042724609375, "learning_rate": 9.999611572229638e-07, "loss": 0.0017, "reward": 1.8935834169387817, "reward_std": 0.04860455542802811, "rewards/accuracy_reward": 0.6935833096504211, "rewards/format_reward": 1.0, "step": 393 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 401.6875, "epoch": 0.00397778899545684, "grad_norm": 1.9091350264058873, "kl": 0.04150390625, "learning_rate": 9.999609593008815e-07, "loss": 0.0017, "reward": 1.7873749732971191, "reward_std": 0.028913630172610283, "rewards/accuracy_reward": 0.637374997138977, "rewards/format_reward": 1.0, "step": 394 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.5625, "epoch": 0.003987884906612822, "grad_norm": 3.433288753887843, "kl": 0.04052734375, "learning_rate": 9.999607608758466e-07, "loss": 0.0016, "reward": 2.123281240463257, "reward_std": 0.022722914814949036, "rewards/accuracy_reward": 0.9232813119888306, "rewards/format_reward": 1.0, "step": 395 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.09375, "epoch": 0.003997980817768804, "grad_norm": 3.9707290203215986, "kl": 0.04150390625, "learning_rate": 9.999605619478597e-07, "loss": 0.0017, "reward": 2.0290000438690186, "reward_std": 0.024166546761989594, "rewards/accuracy_reward": 0.8289999961853027, "rewards/format_reward": 1.0, "step": 396 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.1875, "epoch": 0.004008076728924785, "grad_norm": 2.4034443365989184, "kl": 0.045654296875, "learning_rate": 9.999603625169207e-07, "loss": 0.0018, "reward": 2.023624897003174, "reward_std": 0.059515997767448425, "rewards/accuracy_reward": 0.8361249566078186, "rewards/format_reward": 1.0, "step": 397 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 0.0040181726400807674, "grad_norm": 2.2072986758077153, "kl": 0.04248046875, "learning_rate": 9.999601625830298e-07, "loss": 0.0017, "reward": 2.0331249237060547, "reward_std": 0.05726657435297966, "rewards/accuracy_reward": 0.8456249237060547, "rewards/format_reward": 1.0, "step": 398 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.25, "epoch": 0.004028268551236749, "grad_norm": 2.2112059961893595, "kl": 0.04052734375, "learning_rate": 9.999599621461875e-07, "loss": 0.0016, "reward": 2.092562437057495, "reward_std": 0.04207116365432739, "rewards/accuracy_reward": 0.898812472820282, "rewards/format_reward": 1.0, "step": 399 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 388.28125, "epoch": 0.004038364462392731, "grad_norm": 1.0327212167230393, "kl": 0.034423828125, "learning_rate": 9.999597612063938e-07, "loss": 0.0014, "reward": 1.8818750381469727, "reward_std": 0.00602228008210659, "rewards/accuracy_reward": 0.731874942779541, "rewards/format_reward": 1.0, "step": 400 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.004048460373548713, "grad_norm": 1.7434729315234176, "kl": 0.037109375, "learning_rate": 9.999595597636488e-07, "loss": 0.0015, "reward": 1.9089689254760742, "reward_std": 0.175084188580513, "rewards/accuracy_reward": 0.7402187585830688, "rewards/format_reward": 1.0, "step": 401 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.004058556284704694, "grad_norm": 1.742529446060553, "kl": 0.03515625, "learning_rate": 9.999593578179528e-07, "loss": 0.0014, "reward": 2.1479687690734863, "reward_std": 0.04170800745487213, "rewards/accuracy_reward": 0.9542187452316284, "rewards/format_reward": 1.0, "step": 402 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 409.09375, "epoch": 0.0040686521958606765, "grad_norm": 1.3906125368538222, "kl": 0.029296875, "learning_rate": 9.999591553693063e-07, "loss": 0.0012, "reward": 1.560906171798706, "reward_std": 0.15973177552223206, "rewards/accuracy_reward": 0.44840624928474426, "rewards/format_reward": 1.0, "step": 403 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.46875, "epoch": 0.004078748107016659, "grad_norm": 1.8674540621220834, "kl": 0.03173828125, "learning_rate": 9.99958952417709e-07, "loss": 0.0013, "reward": 2.062187433242798, "reward_std": 0.17494511604309082, "rewards/accuracy_reward": 0.8871874809265137, "rewards/format_reward": 1.0, "step": 404 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.75, "epoch": 0.00408884401817264, "grad_norm": 2.1700228131354877, "kl": 0.039794921875, "learning_rate": 9.999587489631612e-07, "loss": 0.0016, "reward": 1.9737188816070557, "reward_std": 0.054496072232723236, "rewards/accuracy_reward": 0.7799687385559082, "rewards/format_reward": 1.0, "step": 405 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.8125, "epoch": 0.004098939929328622, "grad_norm": 3.750700486175161, "kl": 0.037109375, "learning_rate": 9.999585450056633e-07, "loss": 0.0015, "reward": 1.9321249723434448, "reward_std": 0.19651833176612854, "rewards/accuracy_reward": 0.7508749961853027, "rewards/format_reward": 1.0, "step": 406 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.34375, "epoch": 0.004109035840484603, "grad_norm": 2.2278670612788884, "kl": 0.04248046875, "learning_rate": 9.999583405452157e-07, "loss": 0.0017, "reward": 2.017812728881836, "reward_std": 0.0551881343126297, "rewards/accuracy_reward": 0.8240624666213989, "rewards/format_reward": 1.0, "step": 407 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.0041191317516405855, "grad_norm": 2.6540131312665407, "kl": 0.03857421875, "learning_rate": 9.99958135581818e-07, "loss": 0.0015, "reward": 1.8275001049041748, "reward_std": 0.20179739594459534, "rewards/accuracy_reward": 0.6587499976158142, "rewards/format_reward": 1.0, "step": 408 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.004129227662796568, "grad_norm": 2.0487108012995123, "kl": 0.04296875, "learning_rate": 9.99957930115471e-07, "loss": 0.0017, "reward": 2.0878748893737793, "reward_std": 0.06044870987534523, "rewards/accuracy_reward": 0.8941249847412109, "rewards/format_reward": 1.0, "step": 409 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.78125, "epoch": 0.004139323573952549, "grad_norm": 2.6405214755039785, "kl": 0.043701171875, "learning_rate": 9.999577241461745e-07, "loss": 0.0018, "reward": 2.1064376831054688, "reward_std": 0.05271655693650246, "rewards/accuracy_reward": 0.9064374566078186, "rewards/format_reward": 1.0, "step": 410 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.71875, "epoch": 0.004149419485108531, "grad_norm": 2.044196300211959, "kl": 0.03466796875, "learning_rate": 9.99957517673929e-07, "loss": 0.0014, "reward": 2.08774995803833, "reward_std": 0.039116621017456055, "rewards/accuracy_reward": 0.8877500295639038, "rewards/format_reward": 1.0, "step": 411 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.0, "epoch": 0.004159515396264513, "grad_norm": 3.00097891762714, "kl": 0.0390625, "learning_rate": 9.999573106987343e-07, "loss": 0.0016, "reward": 1.988687515258789, "reward_std": 0.040497247129678726, "rewards/accuracy_reward": 0.7886874675750732, "rewards/format_reward": 1.0, "step": 412 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.21875, "epoch": 0.004169611307420495, "grad_norm": 4.958487058250473, "kl": 0.0390625, "learning_rate": 9.999571032205912e-07, "loss": 0.0016, "reward": 1.8968437910079956, "reward_std": 0.08784809708595276, "rewards/accuracy_reward": 0.7030937671661377, "rewards/format_reward": 1.0, "step": 413 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.0625, "epoch": 0.004179707218576477, "grad_norm": 1.869809723332022, "kl": 0.037109375, "learning_rate": 9.999568952394994e-07, "loss": 0.0015, "reward": 2.138446807861328, "reward_std": 0.01839783415198326, "rewards/accuracy_reward": 0.9384468793869019, "rewards/format_reward": 1.0, "step": 414 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 435.65625, "epoch": 0.004189803129732458, "grad_norm": 3.125132489468356, "kl": 0.03173828125, "learning_rate": 9.999566867554593e-07, "loss": 0.0013, "reward": 1.8317188024520874, "reward_std": 0.04579631984233856, "rewards/accuracy_reward": 0.7004687786102295, "rewards/format_reward": 1.0, "step": 415 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 413.34375, "epoch": 0.00419989904088844, "grad_norm": 2.418863099284301, "kl": 0.041259765625, "learning_rate": 9.999564777684712e-07, "loss": 0.0016, "reward": 1.7053749561309814, "reward_std": 0.058574121445417404, "rewards/accuracy_reward": 0.5553749799728394, "rewards/format_reward": 1.0, "step": 416 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.75, "epoch": 0.004209994952044422, "grad_norm": 6.605285130961833, "kl": 0.04248046875, "learning_rate": 9.999562682785353e-07, "loss": 0.0017, "reward": 2.0886874198913574, "reward_std": 0.05805312842130661, "rewards/accuracy_reward": 0.8949375152587891, "rewards/format_reward": 1.0, "step": 417 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.375, "epoch": 0.004220090863200404, "grad_norm": 2.4383665336055316, "kl": 0.04248046875, "learning_rate": 9.999560582856515e-07, "loss": 0.0017, "reward": 1.7866250276565552, "reward_std": 0.03825664520263672, "rewards/accuracy_reward": 0.6428750157356262, "rewards/format_reward": 1.0, "step": 418 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.65625, "epoch": 0.004230186774356386, "grad_norm": 1.6578478590375945, "kl": 0.0341796875, "learning_rate": 9.999558477898203e-07, "loss": 0.0014, "reward": 2.166874885559082, "reward_std": 0.025537550449371338, "rewards/accuracy_reward": 0.9668749570846558, "rewards/format_reward": 1.0, "step": 419 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 365.6875, "epoch": 0.004240282685512367, "grad_norm": 1.8167334027569397, "kl": 0.0400390625, "learning_rate": 9.999556367910417e-07, "loss": 0.0016, "reward": 1.7074687480926514, "reward_std": 0.05807764083147049, "rewards/accuracy_reward": 0.5762187242507935, "rewards/format_reward": 1.0, "step": 420 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 367.0625, "epoch": 0.004250378596668349, "grad_norm": 2.308723054091065, "kl": 0.03759765625, "learning_rate": 9.999554252893163e-07, "loss": 0.0015, "reward": 2.1468124389648438, "reward_std": 0.07297243922948837, "rewards/accuracy_reward": 0.9780625104904175, "rewards/format_reward": 1.0, "step": 421 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.6875, "epoch": 0.004260474507824331, "grad_norm": 1.784096149861139, "kl": 0.0390625, "learning_rate": 9.99955213284644e-07, "loss": 0.0016, "reward": 2.0177812576293945, "reward_std": 0.04542867839336395, "rewards/accuracy_reward": 0.8177813291549683, "rewards/format_reward": 1.0, "step": 422 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.125, "epoch": 0.004270570418980313, "grad_norm": 2.1574451351115433, "kl": 0.03564453125, "learning_rate": 9.999550007770252e-07, "loss": 0.0014, "reward": 1.8742812871932983, "reward_std": 0.14562676846981049, "rewards/accuracy_reward": 0.7242811918258667, "rewards/format_reward": 1.0, "step": 423 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.84375, "epoch": 0.004280666330136295, "grad_norm": 2.2766064831497177, "kl": 0.0439453125, "learning_rate": 9.9995478776646e-07, "loss": 0.0018, "reward": 2.0972187519073486, "reward_std": 0.048300210386514664, "rewards/accuracy_reward": 0.8972187638282776, "rewards/format_reward": 1.0, "step": 424 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.8125, "epoch": 0.004290762241292277, "grad_norm": 5.323014557609595, "kl": 0.04443359375, "learning_rate": 9.999545742529485e-07, "loss": 0.0018, "reward": 1.7512500286102295, "reward_std": 0.06952282041311264, "rewards/accuracy_reward": 0.6075000166893005, "rewards/format_reward": 1.0, "step": 425 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 399.9375, "epoch": 0.004300858152448258, "grad_norm": 2.37975507019116, "kl": 0.037109375, "learning_rate": 9.99954360236491e-07, "loss": 0.0015, "reward": 1.4950000047683716, "reward_std": 0.1721252202987671, "rewards/accuracy_reward": 0.3824999928474426, "rewards/format_reward": 1.0, "step": 426 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.0, "epoch": 0.0043109540636042405, "grad_norm": 1.9513118574873942, "kl": 0.04296875, "learning_rate": 9.999541457170877e-07, "loss": 0.0017, "reward": 2.0746874809265137, "reward_std": 0.05918693169951439, "rewards/accuracy_reward": 0.8809375762939453, "rewards/format_reward": 1.0, "step": 427 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.4375, "epoch": 0.004321049974760222, "grad_norm": 2.1009338204997645, "kl": 0.040771484375, "learning_rate": 9.99953930694739e-07, "loss": 0.0016, "reward": 2.0712811946868896, "reward_std": 0.036976560950279236, "rewards/accuracy_reward": 0.8712812662124634, "rewards/format_reward": 1.0, "step": 428 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.59375, "epoch": 0.004331145885916204, "grad_norm": 2.390935522325616, "kl": 0.03515625, "learning_rate": 9.99953715169445e-07, "loss": 0.0014, "reward": 1.929593801498413, "reward_std": 0.17729225754737854, "rewards/accuracy_reward": 0.7670937776565552, "rewards/format_reward": 1.0, "step": 429 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.96875, "epoch": 0.004341241797072186, "grad_norm": 2.1672984491499885, "kl": 0.04345703125, "learning_rate": 9.999534991412058e-07, "loss": 0.0017, "reward": 2.019437551498413, "reward_std": 0.07160348445177078, "rewards/accuracy_reward": 0.8194375038146973, "rewards/format_reward": 1.0, "step": 430 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.3125, "epoch": 0.004351337708228167, "grad_norm": 1.8522629405018316, "kl": 0.03564453125, "learning_rate": 9.999532826100218e-07, "loss": 0.0014, "reward": 1.9668750762939453, "reward_std": 0.05967790260910988, "rewards/accuracy_reward": 0.7793749570846558, "rewards/format_reward": 1.0, "step": 431 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 432.875, "epoch": 0.0043614336193841495, "grad_norm": 2.336510153570897, "kl": 0.04443359375, "learning_rate": 9.999530655758929e-07, "loss": 0.0018, "reward": 2.086656093597412, "reward_std": 0.07021024078130722, "rewards/accuracy_reward": 0.8929063081741333, "rewards/format_reward": 1.0, "step": 432 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 448.75, "epoch": 0.004371529530540132, "grad_norm": 2.6787752083749496, "kl": 0.0439453125, "learning_rate": 9.9995284803882e-07, "loss": 0.0018, "reward": 2.090749979019165, "reward_std": 0.05687155947089195, "rewards/accuracy_reward": 0.8970000743865967, "rewards/format_reward": 1.0, "step": 433 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 428.40625, "epoch": 0.004381625441696113, "grad_norm": 2.201989972393693, "kl": 0.035400390625, "learning_rate": 9.999526299988024e-07, "loss": 0.0014, "reward": 1.760968804359436, "reward_std": 0.17061977088451385, "rewards/accuracy_reward": 0.6297187805175781, "rewards/format_reward": 1.0, "step": 434 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.25, "epoch": 0.004391721352852095, "grad_norm": 3.703202968648756, "kl": 0.03515625, "learning_rate": 9.999524114558412e-07, "loss": 0.0014, "reward": 1.9695937633514404, "reward_std": 0.32357460260391235, "rewards/accuracy_reward": 0.8008437752723694, "rewards/format_reward": 1.0, "step": 435 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.03125, "epoch": 0.004401817264008076, "grad_norm": 1.499240010339218, "kl": 0.03466796875, "learning_rate": 9.999521924099357e-07, "loss": 0.0014, "reward": 1.7354062795639038, "reward_std": 0.13294315338134766, "rewards/accuracy_reward": 0.6041562557220459, "rewards/format_reward": 1.0, "step": 436 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 432.75, "epoch": 0.0044119131751640586, "grad_norm": 1.9592425891599745, "kl": 0.041259765625, "learning_rate": 9.99951972861087e-07, "loss": 0.0016, "reward": 2.0703437328338623, "reward_std": 0.05342741310596466, "rewards/accuracy_reward": 0.8828437328338623, "rewards/format_reward": 1.0, "step": 437 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 438.0625, "epoch": 0.004422009086320041, "grad_norm": 2.097183385469161, "kl": 0.041748046875, "learning_rate": 9.99951752809295e-07, "loss": 0.0017, "reward": 1.962499976158142, "reward_std": 0.058637361973524094, "rewards/accuracy_reward": 0.7749999761581421, "rewards/format_reward": 1.0, "step": 438 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.78125, "epoch": 0.004432104997476022, "grad_norm": 4.253420617294403, "kl": 0.043212890625, "learning_rate": 9.999515322545596e-07, "loss": 0.0017, "reward": 2.102062702178955, "reward_std": 0.07120099663734436, "rewards/accuracy_reward": 0.9020624160766602, "rewards/format_reward": 1.0, "step": 439 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.0625, "epoch": 0.004442200908632004, "grad_norm": 1.5236715380373445, "kl": 0.035888671875, "learning_rate": 9.999513111968814e-07, "loss": 0.0014, "reward": 1.8549375534057617, "reward_std": 0.025669125840067863, "rewards/accuracy_reward": 0.711187481880188, "rewards/format_reward": 1.0, "step": 440 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.0044522968197879854, "grad_norm": 2.013613514188762, "kl": 0.0419921875, "learning_rate": 9.999510896362605e-07, "loss": 0.0017, "reward": 1.9217500686645508, "reward_std": 0.030445443466305733, "rewards/accuracy_reward": 0.721750020980835, "rewards/format_reward": 1.0, "step": 441 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 446.875, "epoch": 0.004462392730943968, "grad_norm": 2.5608920953439083, "kl": 0.03564453125, "learning_rate": 9.999508675726971e-07, "loss": 0.0014, "reward": 2.0499377250671387, "reward_std": 0.1418786197900772, "rewards/accuracy_reward": 0.8624374866485596, "rewards/format_reward": 1.0, "step": 442 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.75, "epoch": 0.00447248864209995, "grad_norm": 2.669182352563376, "kl": 0.0380859375, "learning_rate": 9.999506450061916e-07, "loss": 0.0015, "reward": 2.0189061164855957, "reward_std": 0.17275750637054443, "rewards/accuracy_reward": 0.8376562595367432, "rewards/format_reward": 1.0, "step": 443 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 422.09375, "epoch": 0.004482584553255931, "grad_norm": 1.52243719498569, "kl": 0.0322265625, "learning_rate": 9.999504219367441e-07, "loss": 0.0013, "reward": 1.3669376373291016, "reward_std": 0.024415574967861176, "rewards/accuracy_reward": 0.2669374942779541, "rewards/format_reward": 1.0, "step": 444 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.65625, "epoch": 0.004492680464411913, "grad_norm": 2.6682328915566753, "kl": 0.0390625, "learning_rate": 9.999501983643548e-07, "loss": 0.0016, "reward": 1.9909999370574951, "reward_std": 0.19346319139003754, "rewards/accuracy_reward": 0.8097500801086426, "rewards/format_reward": 1.0, "step": 445 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.34375, "epoch": 0.004502776375567895, "grad_norm": 4.066495945897182, "kl": 0.0458984375, "learning_rate": 9.999499742890237e-07, "loss": 0.0018, "reward": 2.0219998359680176, "reward_std": 0.03823597729206085, "rewards/accuracy_reward": 0.8219999670982361, "rewards/format_reward": 1.0, "step": 446 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.4375, "epoch": 0.004512872286723877, "grad_norm": 2.164735537160354, "kl": 0.03515625, "learning_rate": 9.999497497107514e-07, "loss": 0.0014, "reward": 2.1238436698913574, "reward_std": 0.05307558923959732, "rewards/accuracy_reward": 0.9363437294960022, "rewards/format_reward": 1.0, "step": 447 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 0.004522968197879859, "grad_norm": 2.3598520360521364, "kl": 0.044921875, "learning_rate": 9.99949524629538e-07, "loss": 0.0018, "reward": 1.918562412261963, "reward_std": 0.053849756717681885, "rewards/accuracy_reward": 0.7185625433921814, "rewards/format_reward": 1.0, "step": 448 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.78125, "epoch": 0.00453306410903584, "grad_norm": 2.1589761464183566, "kl": 0.034912109375, "learning_rate": 9.999492990453838e-07, "loss": 0.0014, "reward": 2.1480937004089355, "reward_std": 0.04347701370716095, "rewards/accuracy_reward": 0.9543437957763672, "rewards/format_reward": 1.0, "step": 449 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.59375, "epoch": 0.004543160020191822, "grad_norm": 1.6361415987474213, "kl": 0.034423828125, "learning_rate": 9.999490729582887e-07, "loss": 0.0014, "reward": 2.108250141143799, "reward_std": 0.03578351438045502, "rewards/accuracy_reward": 0.9082499742507935, "rewards/format_reward": 1.0, "step": 450 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 407.34375, "epoch": 0.004553255931347804, "grad_norm": 1.8213592790383197, "kl": 0.037109375, "learning_rate": 9.999488463682534e-07, "loss": 0.0015, "reward": 1.709375023841858, "reward_std": 0.017225001007318497, "rewards/accuracy_reward": 0.5593750476837158, "rewards/format_reward": 1.0, "step": 451 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.004563351842503786, "grad_norm": 2.9429499323939994, "kl": 0.03662109375, "learning_rate": 9.999486192752777e-07, "loss": 0.0015, "reward": 2.0349063873291016, "reward_std": 0.05288638919591904, "rewards/accuracy_reward": 0.847406268119812, "rewards/format_reward": 1.0, "step": 452 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 418.65625, "epoch": 0.004573447753659768, "grad_norm": 4.311782862256814, "kl": 0.04248046875, "learning_rate": 9.999483916793624e-07, "loss": 0.0017, "reward": 1.9888750314712524, "reward_std": 0.045196257531642914, "rewards/accuracy_reward": 0.7888749837875366, "rewards/format_reward": 1.0, "step": 453 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.4375, "epoch": 0.00458354366481575, "grad_norm": 2.9762631866242355, "kl": 0.0380859375, "learning_rate": 9.99948163580507e-07, "loss": 0.0015, "reward": 1.8989686965942383, "reward_std": 0.18485060334205627, "rewards/accuracy_reward": 0.730218768119812, "rewards/format_reward": 1.0, "step": 454 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.0, "epoch": 0.004593639575971731, "grad_norm": 2.2497601196376604, "kl": 0.041015625, "learning_rate": 9.99947934978712e-07, "loss": 0.0016, "reward": 2.02665638923645, "reward_std": 0.06809619069099426, "rewards/accuracy_reward": 0.8329062461853027, "rewards/format_reward": 1.0, "step": 455 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.3125, "epoch": 0.0046037354871277135, "grad_norm": 1.979585852521353, "kl": 0.0400390625, "learning_rate": 9.99947705873978e-07, "loss": 0.0016, "reward": 2.1417500972747803, "reward_std": 0.022847013548016548, "rewards/accuracy_reward": 0.9417500495910645, "rewards/format_reward": 1.0, "step": 456 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.4375, "epoch": 0.004613831398283695, "grad_norm": 1.9530427791004392, "kl": 0.0390625, "learning_rate": 9.999474762663046e-07, "loss": 0.0016, "reward": 1.8001874685287476, "reward_std": 0.021833330392837524, "rewards/accuracy_reward": 0.6501875519752502, "rewards/format_reward": 1.0, "step": 457 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.65625, "epoch": 0.004623927309439677, "grad_norm": 3.5788688361078336, "kl": 0.037109375, "learning_rate": 9.999472461556927e-07, "loss": 0.0015, "reward": 1.8647500276565552, "reward_std": 0.19552898406982422, "rewards/accuracy_reward": 0.6959999799728394, "rewards/format_reward": 1.0, "step": 458 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.78125, "epoch": 0.004634023220595659, "grad_norm": 1.4551515781748299, "kl": 0.03369140625, "learning_rate": 9.99947015542142e-07, "loss": 0.0013, "reward": 1.972749948501587, "reward_std": 0.14127828180789948, "rewards/accuracy_reward": 0.7915000319480896, "rewards/format_reward": 1.0, "step": 459 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.28125, "epoch": 0.00464411913175164, "grad_norm": 2.3969877120427197, "kl": 0.037109375, "learning_rate": 9.999467844256532e-07, "loss": 0.0015, "reward": 2.0692501068115234, "reward_std": 0.04410139471292496, "rewards/accuracy_reward": 0.875499963760376, "rewards/format_reward": 1.0, "step": 460 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.0046542150429076225, "grad_norm": 2.034697965567258, "kl": 0.029296875, "learning_rate": 9.99946552806226e-07, "loss": 0.0012, "reward": 1.8456251621246338, "reward_std": 0.24120286107063293, "rewards/accuracy_reward": 0.7081249356269836, "rewards/format_reward": 1.0, "step": 461 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.875, "epoch": 0.004664310954063605, "grad_norm": 4.149182385189983, "kl": 0.03857421875, "learning_rate": 9.99946320683861e-07, "loss": 0.0015, "reward": 1.8132188320159912, "reward_std": 0.03038104623556137, "rewards/accuracy_reward": 0.6632187366485596, "rewards/format_reward": 1.0, "step": 462 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.59375, "epoch": 0.004674406865219586, "grad_norm": 2.146034930244977, "kl": 0.0361328125, "learning_rate": 9.999460880585582e-07, "loss": 0.0014, "reward": 2.055781364440918, "reward_std": 0.1443839818239212, "rewards/accuracy_reward": 0.8682812452316284, "rewards/format_reward": 1.0, "step": 463 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.15625, "epoch": 0.004684502776375568, "grad_norm": 6.601836865157849, "kl": 0.0419921875, "learning_rate": 9.999458549303181e-07, "loss": 0.0017, "reward": 2.069125175476074, "reward_std": 0.04121464118361473, "rewards/accuracy_reward": 0.8753750324249268, "rewards/format_reward": 1.0, "step": 464 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.875, "epoch": 0.004694598687531549, "grad_norm": 1.0651307369418526, "kl": 0.03271484375, "learning_rate": 9.999456212991409e-07, "loss": 0.0013, "reward": 2.1029062271118164, "reward_std": 0.010951122269034386, "rewards/accuracy_reward": 0.9029062390327454, "rewards/format_reward": 1.0, "step": 465 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 415.65625, "epoch": 0.0047046945986875316, "grad_norm": 3.3877271264314444, "kl": 0.037109375, "learning_rate": 9.999453871650265e-07, "loss": 0.0015, "reward": 2.0180888175964355, "reward_std": 0.2093382030725479, "rewards/accuracy_reward": 0.8368387222290039, "rewards/format_reward": 1.0, "step": 466 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 418.78125, "epoch": 0.004714790509843514, "grad_norm": 0.26100572254129484, "kl": 0.02685546875, "learning_rate": 9.999451525279756e-07, "loss": 0.0011, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 467 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.53125, "epoch": 0.004724886420999495, "grad_norm": 2.3040300527193813, "kl": 0.041748046875, "learning_rate": 9.99944917387988e-07, "loss": 0.0017, "reward": 2.0466561317443848, "reward_std": 0.0496186688542366, "rewards/accuracy_reward": 0.8591562509536743, "rewards/format_reward": 1.0, "step": 468 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.84375, "epoch": 0.004734982332155477, "grad_norm": 2.147434400747741, "kl": 0.034423828125, "learning_rate": 9.999446817450643e-07, "loss": 0.0014, "reward": 1.82296884059906, "reward_std": 0.023065567016601562, "rewards/accuracy_reward": 0.6729687452316284, "rewards/format_reward": 1.0, "step": 469 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.6875, "epoch": 0.0047450782433114585, "grad_norm": 2.4680581134624546, "kl": 0.035888671875, "learning_rate": 9.999444455992046e-07, "loss": 0.0014, "reward": 1.6621251106262207, "reward_std": 0.255985826253891, "rewards/accuracy_reward": 0.5558750033378601, "rewards/format_reward": 1.0, "step": 470 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.03125, "epoch": 0.004755174154467441, "grad_norm": 2.9600062064053976, "kl": 0.0361328125, "learning_rate": 9.999442089504089e-07, "loss": 0.0014, "reward": 1.8710311651229858, "reward_std": 0.11193837225437164, "rewards/accuracy_reward": 0.7210312485694885, "rewards/format_reward": 1.0, "step": 471 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.78125, "epoch": 0.004765270065623423, "grad_norm": 1.6544489705413659, "kl": 0.038330078125, "learning_rate": 9.99943971798678e-07, "loss": 0.0015, "reward": 1.7442187070846558, "reward_std": 0.019304607063531876, "rewards/accuracy_reward": 0.5942187309265137, "rewards/format_reward": 1.0, "step": 472 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.004775365976779404, "grad_norm": 3.2161075660245624, "kl": 0.042236328125, "learning_rate": 9.999437341440115e-07, "loss": 0.0017, "reward": 2.009000062942505, "reward_std": 0.053121842443943024, "rewards/accuracy_reward": 0.8090000152587891, "rewards/format_reward": 1.0, "step": 473 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.004785461887935386, "grad_norm": 3.380478403606823, "kl": 0.042724609375, "learning_rate": 9.999434959864102e-07, "loss": 0.0017, "reward": 1.9026561975479126, "reward_std": 0.19350938498973846, "rewards/accuracy_reward": 0.7276562452316284, "rewards/format_reward": 1.0, "step": 474 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.28125, "epoch": 0.004795557799091368, "grad_norm": 2.2216457786576593, "kl": 0.051025390625, "learning_rate": 9.999432573258737e-07, "loss": 0.002, "reward": 2.0541250705718994, "reward_std": 0.01837148703634739, "rewards/accuracy_reward": 0.8541250228881836, "rewards/format_reward": 1.0, "step": 475 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.34375, "epoch": 0.00480565371024735, "grad_norm": 2.80783523897221, "kl": 0.0439453125, "learning_rate": 9.999430181624028e-07, "loss": 0.0018, "reward": 1.9873437881469727, "reward_std": 0.05082353949546814, "rewards/accuracy_reward": 0.7873437404632568, "rewards/format_reward": 1.0, "step": 476 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.40625, "epoch": 0.004815749621403332, "grad_norm": 2.132103384765541, "kl": 0.0458984375, "learning_rate": 9.999427784959978e-07, "loss": 0.0018, "reward": 2.0236876010894775, "reward_std": 0.04613665118813515, "rewards/accuracy_reward": 0.836187481880188, "rewards/format_reward": 1.0, "step": 477 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.125, "epoch": 0.004825845532559313, "grad_norm": 1.3614862911723198, "kl": 0.0302734375, "learning_rate": 9.999425383266583e-07, "loss": 0.0012, "reward": 1.625, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 478 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.375, "epoch": 0.004835941443715295, "grad_norm": 1.5815697934464625, "kl": 0.04248046875, "learning_rate": 9.999422976543852e-07, "loss": 0.0017, "reward": 2.153437614440918, "reward_std": 0.02321607805788517, "rewards/accuracy_reward": 0.9596874713897705, "rewards/format_reward": 1.0, "step": 479 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.625, "epoch": 0.004846037354871277, "grad_norm": 2.0162159239092183, "kl": 0.044921875, "learning_rate": 9.999420564791784e-07, "loss": 0.0018, "reward": 2.086250066757202, "reward_std": 0.020622558891773224, "rewards/accuracy_reward": 0.8862499594688416, "rewards/format_reward": 1.0, "step": 480 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 403.78125, "epoch": 0.004856133266027259, "grad_norm": 2.244651748805665, "kl": 0.041015625, "learning_rate": 9.999418148010384e-07, "loss": 0.0016, "reward": 1.8279062509536743, "reward_std": 0.04994850233197212, "rewards/accuracy_reward": 0.6841562390327454, "rewards/format_reward": 1.0, "step": 481 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.875, "epoch": 0.004866229177183241, "grad_norm": 3.4711602998314253, "kl": 0.0361328125, "learning_rate": 9.99941572619965e-07, "loss": 0.0014, "reward": 1.7817811965942383, "reward_std": 0.02393220365047455, "rewards/accuracy_reward": 0.631781280040741, "rewards/format_reward": 1.0, "step": 482 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.59375, "epoch": 0.004876325088339223, "grad_norm": 1.6248288472794739, "kl": 0.03076171875, "learning_rate": 9.999413299359588e-07, "loss": 0.0012, "reward": 2.132406234741211, "reward_std": 0.11485415697097778, "rewards/accuracy_reward": 0.938656210899353, "rewards/format_reward": 1.0, "step": 483 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.40625, "epoch": 0.004886420999495204, "grad_norm": 1.7447787639864865, "kl": 0.0390625, "learning_rate": 9.9994108674902e-07, "loss": 0.0016, "reward": 2.001999855041504, "reward_std": 0.0515395812690258, "rewards/accuracy_reward": 0.8144999742507935, "rewards/format_reward": 1.0, "step": 484 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.0048965169106511865, "grad_norm": 4.018029367185827, "kl": 0.052734375, "learning_rate": 9.999408430591487e-07, "loss": 0.0021, "reward": 1.9769375324249268, "reward_std": 0.04614870995283127, "rewards/accuracy_reward": 0.7831875085830688, "rewards/format_reward": 1.0, "step": 485 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 432.6875, "epoch": 0.004906612821807168, "grad_norm": 1.1868324228631235, "kl": 0.0308837890625, "learning_rate": 9.999405988663453e-07, "loss": 0.0012, "reward": 1.7220001220703125, "reward_std": 0.261330783367157, "rewards/accuracy_reward": 0.597000002861023, "rewards/format_reward": 1.0, "step": 486 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.46875, "epoch": 0.00491670873296315, "grad_norm": 1.4404274870102962, "kl": 0.037353515625, "learning_rate": 9.9994035417061e-07, "loss": 0.0015, "reward": 2.050656318664551, "reward_std": 0.01310214214026928, "rewards/accuracy_reward": 0.850656270980835, "rewards/format_reward": 1.0, "step": 487 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 433.5, "epoch": 0.004926804644119132, "grad_norm": 10.001346355714762, "kl": 0.04736328125, "learning_rate": 9.99940108971943e-07, "loss": 0.0019, "reward": 1.7734999656677246, "reward_std": 0.04404474049806595, "rewards/accuracy_reward": 0.6360000371932983, "rewards/format_reward": 1.0, "step": 488 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.03125, "epoch": 0.004936900555275113, "grad_norm": 2.4273445923559906, "kl": 0.041259765625, "learning_rate": 9.999398632703446e-07, "loss": 0.0017, "reward": 2.1240313053131104, "reward_std": 0.09293318539857864, "rewards/accuracy_reward": 0.9427812695503235, "rewards/format_reward": 1.0, "step": 489 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 421.96875, "epoch": 0.0049469964664310955, "grad_norm": 2.989571247367063, "kl": 0.03271484375, "learning_rate": 9.99939617065815e-07, "loss": 0.0013, "reward": 1.5699687004089355, "reward_std": 0.16621221601963043, "rewards/accuracy_reward": 0.45121872425079346, "rewards/format_reward": 1.0, "step": 490 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 414.125, "epoch": 0.004957092377587077, "grad_norm": 1.6494862646130817, "kl": 0.0390625, "learning_rate": 9.999393703583547e-07, "loss": 0.0016, "reward": 1.696874976158142, "reward_std": 0.18334443867206573, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 491 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.25, "epoch": 0.004967188288743059, "grad_norm": 1.545843299766629, "kl": 0.036376953125, "learning_rate": 9.999391231479634e-07, "loss": 0.0015, "reward": 2.139812469482422, "reward_std": 0.03700418770313263, "rewards/accuracy_reward": 0.9460625052452087, "rewards/format_reward": 1.0, "step": 492 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.40625, "epoch": 0.004977284199899041, "grad_norm": 2.9802965226065044, "kl": 0.04541015625, "learning_rate": 9.999388754346419e-07, "loss": 0.0018, "reward": 2.0696873664855957, "reward_std": 0.020275074988603592, "rewards/accuracy_reward": 0.869687557220459, "rewards/format_reward": 1.0, "step": 493 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.6875, "epoch": 0.004987380111055022, "grad_norm": 3.701748501931677, "kl": 0.049072265625, "learning_rate": 9.9993862721839e-07, "loss": 0.002, "reward": 2.0414061546325684, "reward_std": 0.040925294160842896, "rewards/accuracy_reward": 0.84765625, "rewards/format_reward": 1.0, "step": 494 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.15625, "epoch": 0.0049974760222110046, "grad_norm": 2.733847433541248, "kl": 0.04443359375, "learning_rate": 9.999383784992084e-07, "loss": 0.0018, "reward": 1.9614686965942383, "reward_std": 0.035127755254507065, "rewards/accuracy_reward": 0.761468768119812, "rewards/format_reward": 1.0, "step": 495 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.15625, "epoch": 0.005007571933366987, "grad_norm": 1.8716879919597282, "kl": 0.0400390625, "learning_rate": 9.999381292770972e-07, "loss": 0.0016, "reward": 1.8155624866485596, "reward_std": 0.016679886728525162, "rewards/accuracy_reward": 0.6655625104904175, "rewards/format_reward": 1.0, "step": 496 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.34375, "epoch": 0.005017667844522968, "grad_norm": 9.043700649794955, "kl": 0.03955078125, "learning_rate": 9.999378795520563e-07, "loss": 0.0016, "reward": 1.7810312509536743, "reward_std": 0.03666093200445175, "rewards/accuracy_reward": 0.6372812390327454, "rewards/format_reward": 1.0, "step": 497 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 406.65625, "epoch": 0.00502776375567895, "grad_norm": 1.405645610648008, "kl": 0.03466796875, "learning_rate": 9.999376293240864e-07, "loss": 0.0014, "reward": 1.8442500829696655, "reward_std": 0.02381889522075653, "rewards/accuracy_reward": 0.7005000114440918, "rewards/format_reward": 1.0, "step": 498 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.3125, "epoch": 0.0050378596668349315, "grad_norm": 2.265259528784971, "kl": 0.040771484375, "learning_rate": 9.999373785931876e-07, "loss": 0.0016, "reward": 1.9888126850128174, "reward_std": 0.17538988590240479, "rewards/accuracy_reward": 0.820062518119812, "rewards/format_reward": 1.0, "step": 499 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.75, "epoch": 0.005047955577990914, "grad_norm": 9.590137828870175, "kl": 0.05029296875, "learning_rate": 9.9993712735936e-07, "loss": 0.002, "reward": 2.0205626487731934, "reward_std": 0.05022747069597244, "rewards/accuracy_reward": 0.8268125057220459, "rewards/format_reward": 1.0, "step": 500 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.125, "epoch": 0.005058051489146896, "grad_norm": 2.1016821452757006, "kl": 0.039794921875, "learning_rate": 9.99936875622604e-07, "loss": 0.0016, "reward": 1.733625054359436, "reward_std": 0.02326502650976181, "rewards/accuracy_reward": 0.5836250185966492, "rewards/format_reward": 1.0, "step": 501 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.005068147400302877, "grad_norm": 3.33476463637841, "kl": 0.048583984375, "learning_rate": 9.999366233829198e-07, "loss": 0.0019, "reward": 2.0716562271118164, "reward_std": 0.02915109694004059, "rewards/accuracy_reward": 0.8716562986373901, "rewards/format_reward": 1.0, "step": 502 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.4375, "epoch": 0.005078243311458859, "grad_norm": 2.2722706135793747, "kl": 0.04541015625, "learning_rate": 9.999363706403077e-07, "loss": 0.0018, "reward": 1.8105623722076416, "reward_std": 0.02333701401948929, "rewards/accuracy_reward": 0.6605625152587891, "rewards/format_reward": 1.0, "step": 503 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 409.125, "epoch": 0.005088339222614841, "grad_norm": 3.8140590746574032, "kl": 0.04638671875, "learning_rate": 9.99936117394768e-07, "loss": 0.0019, "reward": 1.8080313205718994, "reward_std": 0.020534541457891464, "rewards/accuracy_reward": 0.6580312848091125, "rewards/format_reward": 1.0, "step": 504 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.34375, "epoch": 0.005098435133770823, "grad_norm": 6.422326068074918, "kl": 0.048583984375, "learning_rate": 9.999358636463009e-07, "loss": 0.0019, "reward": 2.05496883392334, "reward_std": 0.03096778690814972, "rewards/accuracy_reward": 0.8549686670303345, "rewards/format_reward": 1.0, "step": 505 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 438.90625, "epoch": 0.005108531044926805, "grad_norm": 5.4288717456718825, "kl": 0.040771484375, "learning_rate": 9.999356093949067e-07, "loss": 0.0016, "reward": 1.697718858718872, "reward_std": 0.15842139720916748, "rewards/accuracy_reward": 0.5602187514305115, "rewards/format_reward": 1.0, "step": 506 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.3125, "epoch": 0.005118626956082786, "grad_norm": 1.258874624974841, "kl": 0.031005859375, "learning_rate": 9.999353546405855e-07, "loss": 0.0012, "reward": 1.8371875286102295, "reward_std": 0.0423724502325058, "rewards/accuracy_reward": 0.7121874690055847, "rewards/format_reward": 1.0, "step": 507 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.5625, "epoch": 0.005128722867238768, "grad_norm": 1.8329830566012986, "kl": 0.060791015625, "learning_rate": 9.999350993833378e-07, "loss": 0.0024, "reward": 2.015625, "reward_std": 0.019116312265396118, "rewards/accuracy_reward": 0.815625011920929, "rewards/format_reward": 1.0, "step": 508 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.1875, "epoch": 0.00513881877839475, "grad_norm": 1.8580606886025386, "kl": 0.0439453125, "learning_rate": 9.999348436231636e-07, "loss": 0.0018, "reward": 2.043093681335449, "reward_std": 0.03783680871129036, "rewards/accuracy_reward": 0.8493437170982361, "rewards/format_reward": 1.0, "step": 509 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.15625, "epoch": 0.005148914689550732, "grad_norm": 2.192798591148925, "kl": 0.044189453125, "learning_rate": 9.999345873600633e-07, "loss": 0.0018, "reward": 2.0369062423706055, "reward_std": 0.05943996459245682, "rewards/accuracy_reward": 0.8494062423706055, "rewards/format_reward": 1.0, "step": 510 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 421.96875, "epoch": 0.005159010600706714, "grad_norm": 1.662353693441982, "kl": 0.03466796875, "learning_rate": 9.999343305940373e-07, "loss": 0.0014, "reward": 1.5753438472747803, "reward_std": 0.10757441818714142, "rewards/accuracy_reward": 0.4753437638282776, "rewards/format_reward": 1.0, "step": 511 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 439.03125, "epoch": 0.005169106511862696, "grad_norm": 1.6229099555330502, "kl": 0.03759765625, "learning_rate": 9.999340733250855e-07, "loss": 0.0015, "reward": 1.8186249732971191, "reward_std": 0.031006766483187675, "rewards/accuracy_reward": 0.674875020980835, "rewards/format_reward": 1.0, "step": 512 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.28125, "epoch": 0.005179202423018677, "grad_norm": 3.7400935822217685, "kl": 0.050537109375, "learning_rate": 9.999338155532083e-07, "loss": 0.002, "reward": 2.035656452178955, "reward_std": 0.03728102520108223, "rewards/accuracy_reward": 0.8419061899185181, "rewards/format_reward": 1.0, "step": 513 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 405.09375, "epoch": 0.0051892983341746595, "grad_norm": 1.3546541163120671, "kl": 0.03857421875, "learning_rate": 9.999335572784062e-07, "loss": 0.0015, "reward": 1.8646249771118164, "reward_std": 0.012195724993944168, "rewards/accuracy_reward": 0.7146250009536743, "rewards/format_reward": 1.0, "step": 514 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.3125, "epoch": 0.005199394245330641, "grad_norm": 2.9005204836279135, "kl": 0.04541015625, "learning_rate": 9.999332985006793e-07, "loss": 0.0018, "reward": 1.8246874809265137, "reward_std": 0.10042648762464523, "rewards/accuracy_reward": 0.6746874451637268, "rewards/format_reward": 1.0, "step": 515 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.005209490156486623, "grad_norm": 2.143813248975468, "kl": 0.04296875, "learning_rate": 9.999330392200277e-07, "loss": 0.0017, "reward": 1.8960938453674316, "reward_std": 0.17142820358276367, "rewards/accuracy_reward": 0.7398437857627869, "rewards/format_reward": 1.0, "step": 516 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.84375, "epoch": 0.005219586067642605, "grad_norm": 2.192441962682335, "kl": 0.04296875, "learning_rate": 9.999327794364519e-07, "loss": 0.0017, "reward": 1.8365000486373901, "reward_std": 0.17597728967666626, "rewards/accuracy_reward": 0.6802500486373901, "rewards/format_reward": 1.0, "step": 517 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.875, "epoch": 0.005229681978798586, "grad_norm": 0.08050198156331721, "kl": 0.0299072265625, "learning_rate": 9.99932519149952e-07, "loss": 0.0012, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 518 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.71875, "epoch": 0.0052397778899545685, "grad_norm": 3.7687734476705135, "kl": 0.05908203125, "learning_rate": 9.999322583605282e-07, "loss": 0.0024, "reward": 1.9491875171661377, "reward_std": 0.043653152883052826, "rewards/accuracy_reward": 0.7491875290870667, "rewards/format_reward": 1.0, "step": 519 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.6875, "epoch": 0.00524987380111055, "grad_norm": 2.105459213723709, "kl": 0.05517578125, "learning_rate": 9.99931997068181e-07, "loss": 0.0022, "reward": 2.0309998989105225, "reward_std": 0.04779927432537079, "rewards/accuracy_reward": 0.8372499942779541, "rewards/format_reward": 1.0, "step": 520 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.40625, "epoch": 0.005259969712266532, "grad_norm": 1.953204058793752, "kl": 0.052734375, "learning_rate": 9.999317352729106e-07, "loss": 0.0021, "reward": 1.830625057220459, "reward_std": 0.017579972743988037, "rewards/accuracy_reward": 0.6806250214576721, "rewards/format_reward": 1.0, "step": 521 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.0, "epoch": 0.005270065623422514, "grad_norm": 1.027376937089553, "kl": 0.031005859375, "learning_rate": 9.99931472974717e-07, "loss": 0.0012, "reward": 1.6749999523162842, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 522 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.78125, "epoch": 0.005280161534578495, "grad_norm": 3.9636127500766065, "kl": 0.04443359375, "learning_rate": 9.999312101736008e-07, "loss": 0.0018, "reward": 1.809531331062317, "reward_std": 0.1848491132259369, "rewards/accuracy_reward": 0.6470312476158142, "rewards/format_reward": 1.0, "step": 523 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.0, "epoch": 0.005290257445734478, "grad_norm": 2.5951453381067693, "kl": 0.0478515625, "learning_rate": 9.999309468695623e-07, "loss": 0.0019, "reward": 1.9827812910079956, "reward_std": 0.04933669790625572, "rewards/accuracy_reward": 0.7827813029289246, "rewards/format_reward": 1.0, "step": 524 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.875, "epoch": 0.00530035335689046, "grad_norm": 1.6356320856278752, "kl": 0.0419921875, "learning_rate": 9.999306830626012e-07, "loss": 0.0017, "reward": 2.1587188243865967, "reward_std": 0.017733555287122726, "rewards/accuracy_reward": 0.9587187170982361, "rewards/format_reward": 1.0, "step": 525 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.03125, "epoch": 0.005310449268046441, "grad_norm": 2.260771988594512, "kl": 0.03173828125, "learning_rate": 9.999304187527185e-07, "loss": 0.0013, "reward": 1.9624063968658447, "reward_std": 0.1710212528705597, "rewards/accuracy_reward": 0.7936562299728394, "rewards/format_reward": 1.0, "step": 526 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.625, "epoch": 0.005320545179202423, "grad_norm": 2.604551398826538, "kl": 0.04638671875, "learning_rate": 9.99930153939914e-07, "loss": 0.0019, "reward": 1.8426249027252197, "reward_std": 0.16394788026809692, "rewards/accuracy_reward": 0.6801249980926514, "rewards/format_reward": 1.0, "step": 527 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.75, "epoch": 0.0053306410903584045, "grad_norm": 2.355855335905673, "kl": 0.0478515625, "learning_rate": 9.999298886241883e-07, "loss": 0.0019, "reward": 2.085343837738037, "reward_std": 0.022387497127056122, "rewards/accuracy_reward": 0.8853437900543213, "rewards/format_reward": 1.0, "step": 528 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.8125, "epoch": 0.005340737001514387, "grad_norm": 2.3331682013193307, "kl": 0.04296875, "learning_rate": 9.99929622805541e-07, "loss": 0.0017, "reward": 1.7112500667572021, "reward_std": 0.023236483335494995, "rewards/accuracy_reward": 0.5612499713897705, "rewards/format_reward": 1.0, "step": 529 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 422.4375, "epoch": 0.005350832912670369, "grad_norm": 2.050632201030964, "kl": 0.041015625, "learning_rate": 9.999293564839732e-07, "loss": 0.0016, "reward": 2.0229687690734863, "reward_std": 0.13758216798305511, "rewards/accuracy_reward": 0.8292188048362732, "rewards/format_reward": 1.0, "step": 530 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.8125, "epoch": 0.00536092882382635, "grad_norm": 2.343024694778481, "kl": 0.046875, "learning_rate": 9.99929089659485e-07, "loss": 0.0019, "reward": 2.0638437271118164, "reward_std": 0.0483272448182106, "rewards/accuracy_reward": 0.8638437390327454, "rewards/format_reward": 1.0, "step": 531 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.005371024734982332, "grad_norm": 4.572953302235144, "kl": 0.05029296875, "learning_rate": 9.999288223320762e-07, "loss": 0.002, "reward": 1.9837499856948853, "reward_std": 0.04374421387910843, "rewards/accuracy_reward": 0.7837499976158142, "rewards/format_reward": 1.0, "step": 532 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.34375, "epoch": 0.005381120646138314, "grad_norm": 2.6292619971612874, "kl": 0.04931640625, "learning_rate": 9.999285545017471e-07, "loss": 0.002, "reward": 2.058906316757202, "reward_std": 0.035565197467803955, "rewards/accuracy_reward": 0.8651562929153442, "rewards/format_reward": 1.0, "step": 533 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.005391216557294296, "grad_norm": 2.5555278916016517, "kl": 0.0498046875, "learning_rate": 9.999282861684986e-07, "loss": 0.002, "reward": 2.115468978881836, "reward_std": 0.03869961202144623, "rewards/accuracy_reward": 0.9154687523841858, "rewards/format_reward": 1.0, "step": 534 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.1875, "epoch": 0.005401312468450278, "grad_norm": 2.000022357463033, "kl": 0.039306640625, "learning_rate": 9.999280173323302e-07, "loss": 0.0016, "reward": 2.0683436393737793, "reward_std": 0.021310003474354744, "rewards/accuracy_reward": 0.868343710899353, "rewards/format_reward": 1.0, "step": 535 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 363.09375, "epoch": 0.005411408379606259, "grad_norm": 2.4136937086779313, "kl": 0.0439453125, "learning_rate": 9.999277479932428e-07, "loss": 0.0018, "reward": 2.1112186908721924, "reward_std": 0.04814226180315018, "rewards/accuracy_reward": 0.9424687623977661, "rewards/format_reward": 1.0, "step": 536 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.5625, "epoch": 0.005421504290762241, "grad_norm": 2.2747374371961278, "kl": 0.0439453125, "learning_rate": 9.999274781512366e-07, "loss": 0.0018, "reward": 1.991031289100647, "reward_std": 0.04675097391009331, "rewards/accuracy_reward": 0.7910312414169312, "rewards/format_reward": 1.0, "step": 537 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.03125, "epoch": 0.005431600201918223, "grad_norm": 1.0303299153480907, "kl": 0.03662109375, "learning_rate": 9.999272078063113e-07, "loss": 0.0015, "reward": 1.8693749904632568, "reward_std": 0.008964222855865955, "rewards/accuracy_reward": 0.7193750143051147, "rewards/format_reward": 1.0, "step": 538 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.5, "epoch": 0.005441696113074205, "grad_norm": 1.7107873138667926, "kl": 0.0419921875, "learning_rate": 9.999269369584678e-07, "loss": 0.0017, "reward": 1.9774374961853027, "reward_std": 0.16734501719474792, "rewards/accuracy_reward": 0.7961874604225159, "rewards/format_reward": 1.0, "step": 539 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.21875, "epoch": 0.005451792024230187, "grad_norm": 4.714345215750481, "kl": 0.047607421875, "learning_rate": 9.999266656077061e-07, "loss": 0.0019, "reward": 2.004687547683716, "reward_std": 0.03903834894299507, "rewards/accuracy_reward": 0.8046875, "rewards/format_reward": 1.0, "step": 540 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.15625, "epoch": 0.005461887935386169, "grad_norm": 13.012448495915448, "kl": 0.046630859375, "learning_rate": 9.999263937540263e-07, "loss": 0.0019, "reward": 1.766374945640564, "reward_std": 0.032262057065963745, "rewards/accuracy_reward": 0.6163749694824219, "rewards/format_reward": 1.0, "step": 541 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.25, "epoch": 0.00547198384654215, "grad_norm": 2.6975870411686724, "kl": 0.0380859375, "learning_rate": 9.99926121397429e-07, "loss": 0.0015, "reward": 1.8134688138961792, "reward_std": 0.238605797290802, "rewards/accuracy_reward": 0.6759687662124634, "rewards/format_reward": 1.0, "step": 542 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.09375, "epoch": 0.0054820797576981325, "grad_norm": 2.0446983144013293, "kl": 0.0400390625, "learning_rate": 9.999258485379145e-07, "loss": 0.0016, "reward": 1.9262187480926514, "reward_std": 0.17685961723327637, "rewards/accuracy_reward": 0.7574687600135803, "rewards/format_reward": 1.0, "step": 543 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 398.1875, "epoch": 0.005492175668854114, "grad_norm": 4.030621303860259, "kl": 0.0419921875, "learning_rate": 9.999255751754829e-07, "loss": 0.0017, "reward": 1.6174687147140503, "reward_std": 0.0339864082634449, "rewards/accuracy_reward": 0.4674687385559082, "rewards/format_reward": 1.0, "step": 544 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.005502271580010096, "grad_norm": 2.3635403181452395, "kl": 0.059326171875, "learning_rate": 9.999253013101345e-07, "loss": 0.0024, "reward": 2.02706241607666, "reward_std": 0.03099925071001053, "rewards/accuracy_reward": 0.8270624876022339, "rewards/format_reward": 1.0, "step": 545 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.005512367491166078, "grad_norm": 2.3994714369810755, "kl": 0.04638671875, "learning_rate": 9.999250269418694e-07, "loss": 0.0019, "reward": 1.6632187366485596, "reward_std": 0.10788824409246445, "rewards/accuracy_reward": 0.5132187604904175, "rewards/format_reward": 1.0, "step": 546 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.6875, "epoch": 0.005522463402322059, "grad_norm": 1.751585148481766, "kl": 0.038818359375, "learning_rate": 9.999247520706882e-07, "loss": 0.0016, "reward": 1.8167500495910645, "reward_std": 0.11508777737617493, "rewards/accuracy_reward": 0.6792500019073486, "rewards/format_reward": 1.0, "step": 547 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.5, "epoch": 0.0055325593134780415, "grad_norm": 2.186813354116569, "kl": 0.0458984375, "learning_rate": 9.99924476696591e-07, "loss": 0.0018, "reward": 2.120375156402588, "reward_std": 0.03426230698823929, "rewards/accuracy_reward": 0.9266250133514404, "rewards/format_reward": 1.0, "step": 548 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.90625, "epoch": 0.005542655224634023, "grad_norm": 2.0894490168273268, "kl": 0.04931640625, "learning_rate": 9.999242008195781e-07, "loss": 0.002, "reward": 2.085218906402588, "reward_std": 0.02944764867424965, "rewards/accuracy_reward": 0.8852187395095825, "rewards/format_reward": 1.0, "step": 549 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.6875, "epoch": 0.005552751135790005, "grad_norm": 2.479287727130821, "kl": 0.04052734375, "learning_rate": 9.999239244396498e-07, "loss": 0.0016, "reward": 2.044062614440918, "reward_std": 0.1350070834159851, "rewards/accuracy_reward": 0.8503125309944153, "rewards/format_reward": 1.0, "step": 550 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.78125, "epoch": 0.005562847046945987, "grad_norm": 2.8156611132755796, "kl": 0.0390625, "learning_rate": 9.999236475568063e-07, "loss": 0.0016, "reward": 1.8583438396453857, "reward_std": 0.18350283801555634, "rewards/accuracy_reward": 0.6895936727523804, "rewards/format_reward": 1.0, "step": 551 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.125, "epoch": 0.005572942958101968, "grad_norm": 1.5694739746293556, "kl": 0.03564453125, "learning_rate": 9.999233701710482e-07, "loss": 0.0014, "reward": 1.9492499828338623, "reward_std": 0.15655069053173065, "rewards/accuracy_reward": 0.7929999828338623, "rewards/format_reward": 1.0, "step": 552 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 434.71875, "epoch": 0.005583038869257951, "grad_norm": 1.7824272384293418, "kl": 0.034423828125, "learning_rate": 9.999230922823752e-07, "loss": 0.0014, "reward": 1.8021249771118164, "reward_std": 0.15202346444129944, "rewards/accuracy_reward": 0.6646249890327454, "rewards/format_reward": 1.0, "step": 553 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 406.53125, "epoch": 0.005593134780413933, "grad_norm": 0.9412600931074935, "kl": 0.0284423828125, "learning_rate": 9.99922813890788e-07, "loss": 0.0011, "reward": 1.8937499523162842, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 554 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.15625, "epoch": 0.005603230691569914, "grad_norm": 2.039010763158544, "kl": 0.03173828125, "learning_rate": 9.999225349962867e-07, "loss": 0.0013, "reward": 1.7452499866485596, "reward_std": 0.28024545311927795, "rewards/accuracy_reward": 0.6327499747276306, "rewards/format_reward": 1.0, "step": 555 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 393.03125, "epoch": 0.005613326602725896, "grad_norm": 1.4936746579015483, "kl": 0.03564453125, "learning_rate": 9.999222555988719e-07, "loss": 0.0014, "reward": 1.8476250171661377, "reward_std": 0.01788843423128128, "rewards/accuracy_reward": 0.697624921798706, "rewards/format_reward": 1.0, "step": 556 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.5625, "epoch": 0.0056234225138818775, "grad_norm": 2.7493741201148243, "kl": 0.046142578125, "learning_rate": 9.999219756985436e-07, "loss": 0.0018, "reward": 2.0249686241149902, "reward_std": 0.05684882402420044, "rewards/accuracy_reward": 0.8374687433242798, "rewards/format_reward": 1.0, "step": 557 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.71875, "epoch": 0.00563351842503786, "grad_norm": 3.13219158988568, "kl": 0.04345703125, "learning_rate": 9.99921695295302e-07, "loss": 0.0017, "reward": 1.9439375400543213, "reward_std": 0.03367071971297264, "rewards/accuracy_reward": 0.7439374923706055, "rewards/format_reward": 1.0, "step": 558 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.75, "epoch": 0.005643614336193842, "grad_norm": 1.8614957932153684, "kl": 0.04248046875, "learning_rate": 9.999214143891474e-07, "loss": 0.0017, "reward": 2.0458438396453857, "reward_std": 0.04599430039525032, "rewards/accuracy_reward": 0.8520936965942383, "rewards/format_reward": 1.0, "step": 559 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.90625, "epoch": 0.005653710247349823, "grad_norm": 6.9846349303263455, "kl": 0.03955078125, "learning_rate": 9.999211329800804e-07, "loss": 0.0016, "reward": 2.0372812747955322, "reward_std": 0.04128546640276909, "rewards/accuracy_reward": 0.8372812271118164, "rewards/format_reward": 1.0, "step": 560 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.1875, "epoch": 0.005663806158505805, "grad_norm": 7.4467119279450955, "kl": 0.03857421875, "learning_rate": 9.99920851068101e-07, "loss": 0.0016, "reward": 2.061875104904175, "reward_std": 0.17586159706115723, "rewards/accuracy_reward": 0.8806250095367432, "rewards/format_reward": 1.0, "step": 561 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.28125, "epoch": 0.005673902069661787, "grad_norm": 1.5901267264574328, "kl": 0.04736328125, "learning_rate": 9.999205686532096e-07, "loss": 0.0019, "reward": 1.7998125553131104, "reward_std": 0.011151323094964027, "rewards/accuracy_reward": 0.6498125195503235, "rewards/format_reward": 1.0, "step": 562 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 444.625, "epoch": 0.005683997980817769, "grad_norm": 1.6830627047285895, "kl": 0.03369140625, "learning_rate": 9.999202857354062e-07, "loss": 0.0014, "reward": 1.7750000953674316, "reward_std": 0.19062010943889618, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 563 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 432.34375, "epoch": 0.005694093891973751, "grad_norm": 2.0787771302812263, "kl": 0.044921875, "learning_rate": 9.999200023146917e-07, "loss": 0.0018, "reward": 1.7262500524520874, "reward_std": 0.2040889710187912, "rewards/accuracy_reward": 0.5949999690055847, "rewards/format_reward": 0.96875, "step": 564 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.21875, "epoch": 0.005704189803129732, "grad_norm": 2.764194503231047, "kl": 0.051025390625, "learning_rate": 9.99919718391066e-07, "loss": 0.002, "reward": 2.0648436546325684, "reward_std": 0.011774061247706413, "rewards/accuracy_reward": 0.8648437857627869, "rewards/format_reward": 1.0, "step": 565 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.125, "epoch": 0.005714285714285714, "grad_norm": 2.032220393259259, "kl": 0.040283203125, "learning_rate": 9.99919433964529e-07, "loss": 0.0016, "reward": 2.0612502098083496, "reward_std": 0.11878181248903275, "rewards/accuracy_reward": 0.8674999475479126, "rewards/format_reward": 1.0, "step": 566 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 436.28125, "epoch": 0.0057243816254416964, "grad_norm": 1.4987609266017408, "kl": 0.032470703125, "learning_rate": 9.999191490350816e-07, "loss": 0.0013, "reward": 1.5533125400543213, "reward_std": 0.10787743330001831, "rewards/accuracy_reward": 0.4595624804496765, "rewards/format_reward": 1.0, "step": 567 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.40625, "epoch": 0.005734477536597678, "grad_norm": 2.3351001363376382, "kl": 0.05126953125, "learning_rate": 9.99918863602724e-07, "loss": 0.0021, "reward": 2.102468729019165, "reward_std": 0.03274565190076828, "rewards/accuracy_reward": 0.9024688005447388, "rewards/format_reward": 1.0, "step": 568 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 444.90625, "epoch": 0.00574457344775366, "grad_norm": 1.3144974954701782, "kl": 0.039306640625, "learning_rate": 9.999185776674561e-07, "loss": 0.0016, "reward": 1.4892187118530273, "reward_std": 0.015898358076810837, "rewards/accuracy_reward": 0.39546874165534973, "rewards/format_reward": 1.0, "step": 569 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.03125, "epoch": 0.005754669358909641, "grad_norm": 5.06937153054347, "kl": 0.046875, "learning_rate": 9.999182912292786e-07, "loss": 0.0019, "reward": 2.065187454223633, "reward_std": 0.05792762339115143, "rewards/accuracy_reward": 0.8714375495910645, "rewards/format_reward": 1.0, "step": 570 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 424.84375, "epoch": 0.005764765270065623, "grad_norm": 1.747746704588723, "kl": 0.0419921875, "learning_rate": 9.999180042881916e-07, "loss": 0.0017, "reward": 1.6537500619888306, "reward_std": 0.027826767414808273, "rewards/accuracy_reward": 0.5037499666213989, "rewards/format_reward": 1.0, "step": 571 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 436.34375, "epoch": 0.0057748611812216055, "grad_norm": 1.691619577841178, "kl": 0.046142578125, "learning_rate": 9.999177168441954e-07, "loss": 0.0019, "reward": 2.010000228881836, "reward_std": 0.055292725563049316, "rewards/accuracy_reward": 0.8287500143051147, "rewards/format_reward": 1.0, "step": 572 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 430.15625, "epoch": 0.005784957092377587, "grad_norm": 2.3754620575886802, "kl": 0.04052734375, "learning_rate": 9.999174288972904e-07, "loss": 0.0016, "reward": 1.7645000219345093, "reward_std": 0.1257583498954773, "rewards/accuracy_reward": 0.6207500696182251, "rewards/format_reward": 1.0, "step": 573 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.59375, "epoch": 0.005795053003533569, "grad_norm": 2.226904075147024, "kl": 0.046142578125, "learning_rate": 9.999171404474768e-07, "loss": 0.0018, "reward": 2.085531234741211, "reward_std": 0.03569312393665314, "rewards/accuracy_reward": 0.891781210899353, "rewards/format_reward": 1.0, "step": 574 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.5, "epoch": 0.005805148914689551, "grad_norm": 8.076077879369048, "kl": 0.0458984375, "learning_rate": 9.999168514947547e-07, "loss": 0.0018, "reward": 2.1320624351501465, "reward_std": 0.02853696048259735, "rewards/accuracy_reward": 0.9383125305175781, "rewards/format_reward": 1.0, "step": 575 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.46875, "epoch": 0.005815244825845532, "grad_norm": 1.9187460269838794, "kl": 0.03955078125, "learning_rate": 9.999165620391248e-07, "loss": 0.0016, "reward": 2.1302502155303955, "reward_std": 0.13168096542358398, "rewards/accuracy_reward": 0.9427499771118164, "rewards/format_reward": 1.0, "step": 576 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.78125, "epoch": 0.0058253407370015145, "grad_norm": 1.9205963826792698, "kl": 0.050537109375, "learning_rate": 9.99916272080587e-07, "loss": 0.002, "reward": 1.8119688034057617, "reward_std": 0.171625018119812, "rewards/accuracy_reward": 0.6494687795639038, "rewards/format_reward": 1.0, "step": 577 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 447.34375, "epoch": 0.005835436648157496, "grad_norm": 2.0079631796543764, "kl": 0.044921875, "learning_rate": 9.99915981619142e-07, "loss": 0.0018, "reward": 1.8370625972747803, "reward_std": 0.18751122057437897, "rewards/accuracy_reward": 0.6683124899864197, "rewards/format_reward": 1.0, "step": 578 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.125, "epoch": 0.005845532559313478, "grad_norm": 2.2875027144090736, "kl": 0.052490234375, "learning_rate": 9.999156906547896e-07, "loss": 0.0021, "reward": 2.088562488555908, "reward_std": 0.02274107187986374, "rewards/accuracy_reward": 0.8885625004768372, "rewards/format_reward": 1.0, "step": 579 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.84375, "epoch": 0.00585562847046946, "grad_norm": 2.1107488838957207, "kl": 0.0419921875, "learning_rate": 9.999153991875304e-07, "loss": 0.0017, "reward": 2.102750062942505, "reward_std": 0.030405621975660324, "rewards/accuracy_reward": 0.909000039100647, "rewards/format_reward": 1.0, "step": 580 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.40625, "epoch": 0.005865724381625441, "grad_norm": 2.3420425660226667, "kl": 0.05126953125, "learning_rate": 9.999151072173645e-07, "loss": 0.002, "reward": 1.9154062271118164, "reward_std": 0.02632031962275505, "rewards/accuracy_reward": 0.7216562032699585, "rewards/format_reward": 1.0, "step": 581 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.005875820292781424, "grad_norm": 2.465243824833267, "kl": 0.05859375, "learning_rate": 9.999148147442926e-07, "loss": 0.0023, "reward": 2.0399999618530273, "reward_std": 0.04571272432804108, "rewards/accuracy_reward": 0.8400000333786011, "rewards/format_reward": 1.0, "step": 582 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.84375, "epoch": 0.005885916203937406, "grad_norm": 1.9412233696697134, "kl": 0.04736328125, "learning_rate": 9.999145217683148e-07, "loss": 0.0019, "reward": 2.1358437538146973, "reward_std": 0.046907082200050354, "rewards/accuracy_reward": 0.9420937299728394, "rewards/format_reward": 1.0, "step": 583 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 440.96875, "epoch": 0.005896012115093387, "grad_norm": 2.1293613321223877, "kl": 0.0458984375, "learning_rate": 9.99914228289431e-07, "loss": 0.0018, "reward": 1.9481250047683716, "reward_std": 0.19099999964237213, "rewards/accuracy_reward": 0.7793749570846558, "rewards/format_reward": 1.0, "step": 584 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 424.75, "epoch": 0.005906108026249369, "grad_norm": 1.98053818382057, "kl": 0.053466796875, "learning_rate": 9.999139343076422e-07, "loss": 0.0021, "reward": 1.7193751335144043, "reward_std": 0.01925925724208355, "rewards/accuracy_reward": 0.5693750381469727, "rewards/format_reward": 1.0, "step": 585 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.21875, "epoch": 0.0059162039374053505, "grad_norm": 1.929907525194977, "kl": 0.040771484375, "learning_rate": 9.99913639822948e-07, "loss": 0.0016, "reward": 2.1024999618530273, "reward_std": 0.04753885418176651, "rewards/accuracy_reward": 0.9149999618530273, "rewards/format_reward": 1.0, "step": 586 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.15625, "epoch": 0.005926299848561333, "grad_norm": 2.0817492302540805, "kl": 0.05224609375, "learning_rate": 9.99913344835349e-07, "loss": 0.0021, "reward": 1.88671875, "reward_std": 0.03681743144989014, "rewards/accuracy_reward": 0.6867187023162842, "rewards/format_reward": 1.0, "step": 587 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 434.625, "epoch": 0.005936395759717315, "grad_norm": 1.3328576821151887, "kl": 0.041015625, "learning_rate": 9.999130493448457e-07, "loss": 0.0016, "reward": 2.0500001907348633, "reward_std": 0.15223608911037445, "rewards/accuracy_reward": 0.862500011920929, "rewards/format_reward": 1.0, "step": 588 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 442.84375, "epoch": 0.005946491670873296, "grad_norm": 1.6835470165444206, "kl": 0.0458984375, "learning_rate": 9.999127533514381e-07, "loss": 0.0018, "reward": 1.8462188243865967, "reward_std": 0.10117337107658386, "rewards/accuracy_reward": 0.696218729019165, "rewards/format_reward": 1.0, "step": 589 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.65625, "epoch": 0.005956587582029278, "grad_norm": 1.9492783731467134, "kl": 0.042724609375, "learning_rate": 9.999124568551266e-07, "loss": 0.0017, "reward": 2.0580310821533203, "reward_std": 0.05188221111893654, "rewards/accuracy_reward": 0.8642812967300415, "rewards/format_reward": 1.0, "step": 590 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.25, "epoch": 0.00596668349318526, "grad_norm": 2.0211404625838982, "kl": 0.04736328125, "learning_rate": 9.999121598559114e-07, "loss": 0.0019, "reward": 1.8055312633514404, "reward_std": 0.04103612154722214, "rewards/accuracy_reward": 0.6617812514305115, "rewards/format_reward": 1.0, "step": 591 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.8125, "epoch": 0.005976779404341242, "grad_norm": 1.3296049293213963, "kl": 0.043701171875, "learning_rate": 9.999118623537932e-07, "loss": 0.0017, "reward": 2.149718761444092, "reward_std": 0.014747117646038532, "rewards/accuracy_reward": 0.9497187733650208, "rewards/format_reward": 1.0, "step": 592 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.40625, "epoch": 0.005986875315497224, "grad_norm": 2.685418022723183, "kl": 0.050537109375, "learning_rate": 9.999115643487717e-07, "loss": 0.002, "reward": 2.0176875591278076, "reward_std": 0.04628437012434006, "rewards/accuracy_reward": 0.8301874399185181, "rewards/format_reward": 1.0, "step": 593 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.59375, "epoch": 0.005996971226653205, "grad_norm": 4.0684856835360526, "kl": 0.048583984375, "learning_rate": 9.999112658408477e-07, "loss": 0.0019, "reward": 2.1600937843322754, "reward_std": 0.03357888013124466, "rewards/accuracy_reward": 0.9600937366485596, "rewards/format_reward": 1.0, "step": 594 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.0, "epoch": 0.006007067137809187, "grad_norm": 2.872538320553206, "kl": 0.06396484375, "learning_rate": 9.999109668300213e-07, "loss": 0.0026, "reward": 2.056000232696533, "reward_std": 0.04303734004497528, "rewards/accuracy_reward": 0.8559999465942383, "rewards/format_reward": 1.0, "step": 595 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.75, "epoch": 0.0060171630489651694, "grad_norm": 1.6924143144534673, "kl": 0.037353515625, "learning_rate": 9.999106673162927e-07, "loss": 0.0015, "reward": 2.1585311889648438, "reward_std": 0.01273481547832489, "rewards/accuracy_reward": 0.9585312604904175, "rewards/format_reward": 1.0, "step": 596 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 419.46875, "epoch": 0.006027258960121151, "grad_norm": 3.925640130782454, "kl": 0.0390625, "learning_rate": 9.999103672996622e-07, "loss": 0.0016, "reward": 1.7041563987731934, "reward_std": 0.17689348757266998, "rewards/accuracy_reward": 0.5791562795639038, "rewards/format_reward": 1.0, "step": 597 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.0, "epoch": 0.006037354871277133, "grad_norm": 1.3675491291774793, "kl": 0.0380859375, "learning_rate": 9.999100667801303e-07, "loss": 0.0015, "reward": 1.836125135421753, "reward_std": 0.11185237765312195, "rewards/accuracy_reward": 0.6923749446868896, "rewards/format_reward": 1.0, "step": 598 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.006047450782433114, "grad_norm": 3.419867386958238, "kl": 0.050048828125, "learning_rate": 9.999097657576973e-07, "loss": 0.002, "reward": 2.147125005722046, "reward_std": 0.023501748219132423, "rewards/accuracy_reward": 0.9471250176429749, "rewards/format_reward": 1.0, "step": 599 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.125, "epoch": 0.006057546693589096, "grad_norm": 1.8062360529654249, "kl": 0.0478515625, "learning_rate": 9.999094642323633e-07, "loss": 0.0019, "reward": 2.075000047683716, "reward_std": 0.05022291839122772, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 600 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.78125, "epoch": 0.0060676426047450785, "grad_norm": 2.570902395104723, "kl": 0.052978515625, "learning_rate": 9.999091622041287e-07, "loss": 0.0021, "reward": 2.105437755584717, "reward_std": 0.05751696228981018, "rewards/accuracy_reward": 0.9116874933242798, "rewards/format_reward": 1.0, "step": 601 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.00607773851590106, "grad_norm": 1.6714785761200663, "kl": 0.053466796875, "learning_rate": 9.99908859672994e-07, "loss": 0.0021, "reward": 1.9726874828338623, "reward_std": 0.011100973933935165, "rewards/accuracy_reward": 0.7726874947547913, "rewards/format_reward": 1.0, "step": 602 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 393.375, "epoch": 0.006087834427057042, "grad_norm": 1.730631663633356, "kl": 0.04931640625, "learning_rate": 9.99908556638959e-07, "loss": 0.002, "reward": 1.8186249732971191, "reward_std": 0.013719704002141953, "rewards/accuracy_reward": 0.6686249375343323, "rewards/format_reward": 1.0, "step": 603 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.90625, "epoch": 0.006097930338213024, "grad_norm": 1.5592093929758069, "kl": 0.0478515625, "learning_rate": 9.999082531020244e-07, "loss": 0.0019, "reward": 1.8394688367843628, "reward_std": 0.01915539801120758, "rewards/accuracy_reward": 0.6894687414169312, "rewards/format_reward": 1.0, "step": 604 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 435.46875, "epoch": 0.006108026249369005, "grad_norm": 3.9229552095365556, "kl": 0.048095703125, "learning_rate": 9.999079490621907e-07, "loss": 0.0019, "reward": 1.580625057220459, "reward_std": 0.007204905617982149, "rewards/accuracy_reward": 0.4806250035762787, "rewards/format_reward": 1.0, "step": 605 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 407.46875, "epoch": 0.0061181221605249875, "grad_norm": 1.5540994873143479, "kl": 0.05517578125, "learning_rate": 9.999076445194577e-07, "loss": 0.0022, "reward": 1.6345312595367432, "reward_std": 0.03080238588154316, "rewards/accuracy_reward": 0.4845312833786011, "rewards/format_reward": 1.0, "step": 606 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.40625, "epoch": 0.006128218071680969, "grad_norm": 1.424698385557782, "kl": 0.049560546875, "learning_rate": 9.99907339473826e-07, "loss": 0.002, "reward": 1.7839374542236328, "reward_std": 0.013379449024796486, "rewards/accuracy_reward": 0.6339375376701355, "rewards/format_reward": 1.0, "step": 607 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.71875, "epoch": 0.006138313982836951, "grad_norm": 2.0749222555728237, "kl": 0.05908203125, "learning_rate": 9.999070339252958e-07, "loss": 0.0024, "reward": 2.056187629699707, "reward_std": 0.03866884857416153, "rewards/accuracy_reward": 0.8561874628067017, "rewards/format_reward": 1.0, "step": 608 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.65625, "epoch": 0.006148409893992933, "grad_norm": 29.71694186481631, "kl": 0.043212890625, "learning_rate": 9.999067278738675e-07, "loss": 0.0017, "reward": 1.5906562805175781, "reward_std": 0.09526731818914413, "rewards/accuracy_reward": 0.4906562566757202, "rewards/format_reward": 1.0, "step": 609 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.21875, "epoch": 0.0061585058051489144, "grad_norm": 14.55561824251808, "kl": 0.04833984375, "learning_rate": 9.999064213195413e-07, "loss": 0.0019, "reward": 2.0241875648498535, "reward_std": 0.17728546261787415, "rewards/accuracy_reward": 0.8491874933242798, "rewards/format_reward": 1.0, "step": 610 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.0, "epoch": 0.006168601716304897, "grad_norm": 1.9073587115607409, "kl": 0.040283203125, "learning_rate": 9.999061142623177e-07, "loss": 0.0016, "reward": 1.9865312576293945, "reward_std": 0.201650470495224, "rewards/accuracy_reward": 0.8302813172340393, "rewards/format_reward": 1.0, "step": 611 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.09375, "epoch": 0.006178697627460879, "grad_norm": 3.2912963187890725, "kl": 0.05224609375, "learning_rate": 9.99905806702197e-07, "loss": 0.0021, "reward": 1.8215625286102295, "reward_std": 0.100054070353508, "rewards/accuracy_reward": 0.6715624928474426, "rewards/format_reward": 1.0, "step": 612 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.96875, "epoch": 0.00618879353861686, "grad_norm": 2.6187768675472416, "kl": 0.0634765625, "learning_rate": 9.99905498639179e-07, "loss": 0.0025, "reward": 2.03265643119812, "reward_std": 0.07009825110435486, "rewards/accuracy_reward": 0.845156192779541, "rewards/format_reward": 1.0, "step": 613 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 427.1875, "epoch": 0.006198889449772842, "grad_norm": 1.3968400324578722, "kl": 0.05615234375, "learning_rate": 9.999051900732647e-07, "loss": 0.0022, "reward": 1.7279374599456787, "reward_std": 0.007380537688732147, "rewards/accuracy_reward": 0.5779374837875366, "rewards/format_reward": 1.0, "step": 614 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 426.8125, "epoch": 0.0062089853609288235, "grad_norm": 1.434331609255181, "kl": 0.0546875, "learning_rate": 9.999048810044539e-07, "loss": 0.0022, "reward": 1.719843864440918, "reward_std": 0.015995243564248085, "rewards/accuracy_reward": 0.5698437094688416, "rewards/format_reward": 1.0, "step": 615 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.75, "epoch": 0.006219081272084806, "grad_norm": 3.393849777337403, "kl": 0.0625, "learning_rate": 9.999045714327473e-07, "loss": 0.0025, "reward": 2.0445001125335693, "reward_std": 0.03697941452264786, "rewards/accuracy_reward": 0.844499945640564, "rewards/format_reward": 1.0, "step": 616 }, { "all_correct": 0.0, "all_wrong": 0.75, "completion_length": 400.5625, "epoch": 0.006229177183240788, "grad_norm": 3.64362509005659, "kl": 0.046875, "learning_rate": 9.99904261358145e-07, "loss": 0.0019, "reward": 1.258406162261963, "reward_std": 0.01410352997481823, "rewards/accuracy_reward": 0.2084062397480011, "rewards/format_reward": 1.0, "step": 617 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.006239273094396769, "grad_norm": 2.3291560156958835, "kl": 0.053466796875, "learning_rate": 9.999039507806473e-07, "loss": 0.0021, "reward": 1.9123749732971191, "reward_std": 0.189693883061409, "rewards/accuracy_reward": 0.7561250329017639, "rewards/format_reward": 1.0, "step": 618 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.71875, "epoch": 0.006249369005552751, "grad_norm": 1.7041512416779054, "kl": 0.0537109375, "learning_rate": 9.999036397002545e-07, "loss": 0.0021, "reward": 2.1376874446868896, "reward_std": 0.0332685261964798, "rewards/accuracy_reward": 0.9439375400543213, "rewards/format_reward": 1.0, "step": 619 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.25, "epoch": 0.0062594649167087325, "grad_norm": 3.1421604272902366, "kl": 0.05859375, "learning_rate": 9.999033281169672e-07, "loss": 0.0024, "reward": 1.774531364440918, "reward_std": 0.016319990158081055, "rewards/accuracy_reward": 0.6245312690734863, "rewards/format_reward": 1.0, "step": 620 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.03125, "epoch": 0.006269560827864715, "grad_norm": 4.371122280821781, "kl": 0.048583984375, "learning_rate": 9.999030160307853e-07, "loss": 0.0019, "reward": 1.8339999914169312, "reward_std": 0.014625193551182747, "rewards/accuracy_reward": 0.6840000152587891, "rewards/format_reward": 1.0, "step": 621 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.5625, "epoch": 0.006279656739020697, "grad_norm": 1.9649037109179492, "kl": 0.05517578125, "learning_rate": 9.999027034417094e-07, "loss": 0.0022, "reward": 1.8354687690734863, "reward_std": 0.10525589436292648, "rewards/accuracy_reward": 0.6854687929153442, "rewards/format_reward": 1.0, "step": 622 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 428.21875, "epoch": 0.006289752650176678, "grad_norm": 1.755289925203769, "kl": 0.048583984375, "learning_rate": 9.999023903497396e-07, "loss": 0.0019, "reward": 1.7930313348770142, "reward_std": 0.027222732082009315, "rewards/accuracy_reward": 0.6430312395095825, "rewards/format_reward": 1.0, "step": 623 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.59375, "epoch": 0.00629984856133266, "grad_norm": 2.398196184617875, "kl": 0.0654296875, "learning_rate": 9.999020767548763e-07, "loss": 0.0026, "reward": 2.047374963760376, "reward_std": 0.04204847663640976, "rewards/accuracy_reward": 0.8536249995231628, "rewards/format_reward": 1.0, "step": 624 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.21875, "epoch": 0.0063099444724886425, "grad_norm": 2.026008395553164, "kl": 0.0498046875, "learning_rate": 9.9990176265712e-07, "loss": 0.002, "reward": 2.0088751316070557, "reward_std": 0.015385085716843605, "rewards/accuracy_reward": 0.8088750243186951, "rewards/format_reward": 1.0, "step": 625 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.0, "epoch": 0.006320040383644624, "grad_norm": 3.8242622750019506, "kl": 0.0556640625, "learning_rate": 9.999014480564707e-07, "loss": 0.0022, "reward": 2.1233439445495605, "reward_std": 0.04037685692310333, "rewards/accuracy_reward": 0.9295938014984131, "rewards/format_reward": 1.0, "step": 626 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 425.4375, "epoch": 0.006330136294800606, "grad_norm": 2.5119011257426065, "kl": 0.053466796875, "learning_rate": 9.99901132952929e-07, "loss": 0.0021, "reward": 1.756656289100647, "reward_std": 0.05429944023489952, "rewards/accuracy_reward": 0.6129062175750732, "rewards/format_reward": 1.0, "step": 627 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.125, "epoch": 0.006340232205956587, "grad_norm": 2.0934482083576174, "kl": 0.052734375, "learning_rate": 9.99900817346495e-07, "loss": 0.0021, "reward": 2.1000313758850098, "reward_std": 0.050975821912288666, "rewards/accuracy_reward": 0.9062812328338623, "rewards/format_reward": 1.0, "step": 628 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.71875, "epoch": 0.006350328117112569, "grad_norm": 0.998236839987285, "kl": 0.044189453125, "learning_rate": 9.99900501237169e-07, "loss": 0.0018, "reward": 2.1681251525878906, "reward_std": 0.009508472867310047, "rewards/accuracy_reward": 0.9681249856948853, "rewards/format_reward": 1.0, "step": 629 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.9375, "epoch": 0.0063604240282685515, "grad_norm": 5.261372966068228, "kl": 0.06396484375, "learning_rate": 9.999001846249517e-07, "loss": 0.0026, "reward": 2.0316874980926514, "reward_std": 0.03567042946815491, "rewards/accuracy_reward": 0.8316874504089355, "rewards/format_reward": 1.0, "step": 630 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.8125, "epoch": 0.006370519939424533, "grad_norm": 1.825985753881396, "kl": 0.04638671875, "learning_rate": 9.99899867509843e-07, "loss": 0.0019, "reward": 1.73646879196167, "reward_std": 0.1818573921918869, "rewards/accuracy_reward": 0.6177186965942383, "rewards/format_reward": 1.0, "step": 631 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.03125, "epoch": 0.006380615850580515, "grad_norm": 2.654298951336227, "kl": 0.05712890625, "learning_rate": 9.998995498918434e-07, "loss": 0.0023, "reward": 2.0391719341278076, "reward_std": 0.03314629942178726, "rewards/accuracy_reward": 0.8391718864440918, "rewards/format_reward": 1.0, "step": 632 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.3125, "epoch": 0.006390711761736497, "grad_norm": 5.10446243565055, "kl": 0.05517578125, "learning_rate": 9.99899231770953e-07, "loss": 0.0022, "reward": 2.0255000591278076, "reward_std": 0.03956074267625809, "rewards/accuracy_reward": 0.825499951839447, "rewards/format_reward": 1.0, "step": 633 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.53125, "epoch": 0.006400807672892478, "grad_norm": 2.3715948936644677, "kl": 0.06103515625, "learning_rate": 9.998989131471727e-07, "loss": 0.0024, "reward": 2.084812641143799, "reward_std": 0.033787623047828674, "rewards/accuracy_reward": 0.8910624980926514, "rewards/format_reward": 1.0, "step": 634 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 387.5, "epoch": 0.0064109035840484606, "grad_norm": 2.4773423062209665, "kl": 0.06787109375, "learning_rate": 9.998985940205021e-07, "loss": 0.0027, "reward": 1.9572187662124634, "reward_std": 0.029413744807243347, "rewards/accuracy_reward": 0.7572187781333923, "rewards/format_reward": 1.0, "step": 635 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.65625, "epoch": 0.006420999495204442, "grad_norm": 1.7419140362463432, "kl": 0.04296875, "learning_rate": 9.99898274390942e-07, "loss": 0.0017, "reward": 2.0684375762939453, "reward_std": 0.12398721277713776, "rewards/accuracy_reward": 0.8871874809265137, "rewards/format_reward": 1.0, "step": 636 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.65625, "epoch": 0.006431095406360424, "grad_norm": 1.6778140308444052, "kl": 0.050048828125, "learning_rate": 9.998979542584924e-07, "loss": 0.002, "reward": 2.142000198364258, "reward_std": 0.024314627051353455, "rewards/accuracy_reward": 0.9419999122619629, "rewards/format_reward": 1.0, "step": 637 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.0625, "epoch": 0.006441191317516406, "grad_norm": 1.487952892516055, "kl": 0.043701171875, "learning_rate": 9.99897633623154e-07, "loss": 0.0018, "reward": 2.0210001468658447, "reward_std": 0.16406269371509552, "rewards/accuracy_reward": 0.8585000038146973, "rewards/format_reward": 1.0, "step": 638 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.59375, "epoch": 0.0064512872286723874, "grad_norm": 1.73048960882215, "kl": 0.0498046875, "learning_rate": 9.998973124849267e-07, "loss": 0.002, "reward": 2.1822500228881836, "reward_std": 0.025514494627714157, "rewards/accuracy_reward": 0.9884998798370361, "rewards/format_reward": 1.0, "step": 639 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.78125, "epoch": 0.00646138313982837, "grad_norm": 1.935394471928049, "kl": 0.05615234375, "learning_rate": 9.998969908438113e-07, "loss": 0.0022, "reward": 1.7418125867843628, "reward_std": 0.03694805130362511, "rewards/accuracy_reward": 0.5980625152587891, "rewards/format_reward": 1.0, "step": 640 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.006471479050984352, "grad_norm": 1.9821204663759813, "kl": 0.04833984375, "learning_rate": 9.998966686998075e-07, "loss": 0.0019, "reward": 1.5422499179840088, "reward_std": 0.19639456272125244, "rewards/accuracy_reward": 0.44225001335144043, "rewards/format_reward": 1.0, "step": 641 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.0625, "epoch": 0.006481574962140333, "grad_norm": 2.2207987582111954, "kl": 0.050048828125, "learning_rate": 9.998963460529163e-07, "loss": 0.002, "reward": 2.0824999809265137, "reward_std": 0.014625193551182747, "rewards/accuracy_reward": 0.8824999332427979, "rewards/format_reward": 1.0, "step": 642 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.75, "epoch": 0.006491670873296315, "grad_norm": 2.390389219977804, "kl": 0.0654296875, "learning_rate": 9.998960229031378e-07, "loss": 0.0026, "reward": 1.9875311851501465, "reward_std": 0.021440941840410233, "rewards/accuracy_reward": 0.7875312566757202, "rewards/format_reward": 1.0, "step": 643 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.8125, "epoch": 0.0065017667844522965, "grad_norm": 2.655432750483889, "kl": 0.051513671875, "learning_rate": 9.99895699250472e-07, "loss": 0.0021, "reward": 1.7245937585830688, "reward_std": 0.17136916518211365, "rewards/accuracy_reward": 0.5995937585830688, "rewards/format_reward": 1.0, "step": 644 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.0625, "epoch": 0.006511862695608279, "grad_norm": 2.178099975406199, "kl": 0.059326171875, "learning_rate": 9.998953750949194e-07, "loss": 0.0024, "reward": 1.834625005722046, "reward_std": 0.036368805915117264, "rewards/accuracy_reward": 0.684624969959259, "rewards/format_reward": 1.0, "step": 645 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.96875, "epoch": 0.006521958606764261, "grad_norm": 2.649278923152512, "kl": 0.0595703125, "learning_rate": 9.998950504364805e-07, "loss": 0.0024, "reward": 2.153125047683716, "reward_std": 0.01493791677057743, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 646 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.90625, "epoch": 0.006532054517920242, "grad_norm": 1.5827692641540159, "kl": 0.0537109375, "learning_rate": 9.998947252751555e-07, "loss": 0.0021, "reward": 2.1128125190734863, "reward_std": 0.013258237391710281, "rewards/accuracy_reward": 0.9128124713897705, "rewards/format_reward": 1.0, "step": 647 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.21875, "epoch": 0.006542150429076224, "grad_norm": 2.5411691040670585, "kl": 0.055419921875, "learning_rate": 9.998943996109446e-07, "loss": 0.0022, "reward": 2.0405936241149902, "reward_std": 0.03276192396879196, "rewards/accuracy_reward": 0.8405937552452087, "rewards/format_reward": 1.0, "step": 648 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.25, "epoch": 0.0065522463402322055, "grad_norm": 1.862552614764634, "kl": 0.056884765625, "learning_rate": 9.998940734438484e-07, "loss": 0.0023, "reward": 1.7990624904632568, "reward_std": 0.015992192551493645, "rewards/accuracy_reward": 0.6490625143051147, "rewards/format_reward": 1.0, "step": 649 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 406.40625, "epoch": 0.006562342251388188, "grad_norm": 6.680434210897036, "kl": 0.056396484375, "learning_rate": 9.99893746773867e-07, "loss": 0.0023, "reward": 1.5820937156677246, "reward_std": 0.016664834693074226, "rewards/accuracy_reward": 0.4820937514305115, "rewards/format_reward": 1.0, "step": 650 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.46875, "epoch": 0.00657243816254417, "grad_norm": 3.077964068434753, "kl": 0.058349609375, "learning_rate": 9.99893419601001e-07, "loss": 0.0023, "reward": 1.7799687385559082, "reward_std": 0.03654696047306061, "rewards/accuracy_reward": 0.6299687623977661, "rewards/format_reward": 1.0, "step": 651 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.006582534073700151, "grad_norm": 1.6521337717764057, "kl": 0.052001953125, "learning_rate": 9.998930919252505e-07, "loss": 0.0021, "reward": 2.0337812900543213, "reward_std": 0.022239618003368378, "rewards/accuracy_reward": 0.8337812423706055, "rewards/format_reward": 1.0, "step": 652 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.71875, "epoch": 0.006592629984856133, "grad_norm": 4.425651978801568, "kl": 0.0556640625, "learning_rate": 9.998927637466158e-07, "loss": 0.0022, "reward": 2.0877811908721924, "reward_std": 0.12118736654520035, "rewards/accuracy_reward": 0.8940311670303345, "rewards/format_reward": 1.0, "step": 653 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.0066027258960121155, "grad_norm": 9.564038021191406, "kl": 0.05615234375, "learning_rate": 9.998924350650972e-07, "loss": 0.0023, "reward": 2.1193125247955322, "reward_std": 0.03917395696043968, "rewards/accuracy_reward": 0.9318125247955322, "rewards/format_reward": 1.0, "step": 654 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.25, "epoch": 0.006612821807168097, "grad_norm": 2.6136325036672554, "kl": 0.05517578125, "learning_rate": 9.998921058806953e-07, "loss": 0.0022, "reward": 2.119687557220459, "reward_std": 0.02853281982243061, "rewards/accuracy_reward": 0.9196875095367432, "rewards/format_reward": 1.0, "step": 655 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.125, "epoch": 0.006622917718324079, "grad_norm": 1.6642639035673887, "kl": 0.0546875, "learning_rate": 9.998917761934104e-07, "loss": 0.0022, "reward": 1.7953126430511475, "reward_std": 0.018732057884335518, "rewards/accuracy_reward": 0.645312488079071, "rewards/format_reward": 1.0, "step": 656 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.9375, "epoch": 0.00663301362948006, "grad_norm": 2.0771068729082556, "kl": 0.0595703125, "learning_rate": 9.998914460032425e-07, "loss": 0.0024, "reward": 2.1538126468658447, "reward_std": 0.02276553213596344, "rewards/accuracy_reward": 0.9538124799728394, "rewards/format_reward": 1.0, "step": 657 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.84375, "epoch": 0.006643109540636042, "grad_norm": 2.811064287727442, "kl": 0.0537109375, "learning_rate": 9.998911153101922e-07, "loss": 0.0022, "reward": 2.15443754196167, "reward_std": 0.01886827126145363, "rewards/accuracy_reward": 0.9544374942779541, "rewards/format_reward": 1.0, "step": 658 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.0066532054517920245, "grad_norm": 1.5230734105832004, "kl": 0.049072265625, "learning_rate": 9.998907841142598e-07, "loss": 0.002, "reward": 2.0875000953674316, "reward_std": 0.24493902921676636, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 659 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.5, "epoch": 0.006663301362948006, "grad_norm": 2.285032953647014, "kl": 0.0546875, "learning_rate": 9.998904524154456e-07, "loss": 0.0022, "reward": 1.9442501068115234, "reward_std": 0.15087777376174927, "rewards/accuracy_reward": 0.7942500114440918, "rewards/format_reward": 1.0, "step": 660 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.375, "epoch": 0.006673397274103988, "grad_norm": 3.113863189727635, "kl": 0.0517578125, "learning_rate": 9.9989012021375e-07, "loss": 0.0021, "reward": 2.0895938873291016, "reward_std": 0.024337399750947952, "rewards/accuracy_reward": 0.889593780040741, "rewards/format_reward": 1.0, "step": 661 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 420.1875, "epoch": 0.00668349318525997, "grad_norm": 2.1486824982895754, "kl": 0.058837890625, "learning_rate": 9.99889787509173e-07, "loss": 0.0024, "reward": 1.7614374160766602, "reward_std": 0.14133094251155853, "rewards/accuracy_reward": 0.6051874756813049, "rewards/format_reward": 1.0, "step": 662 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.6875, "epoch": 0.006693589096415951, "grad_norm": 1.5854833607772243, "kl": 0.05615234375, "learning_rate": 9.998894543017155e-07, "loss": 0.0022, "reward": 2.013624906539917, "reward_std": 0.16261444985866547, "rewards/accuracy_reward": 0.8323749899864197, "rewards/format_reward": 1.0, "step": 663 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 438.9375, "epoch": 0.0067036850075719336, "grad_norm": 1.6648620377963932, "kl": 0.05078125, "learning_rate": 9.998891205913773e-07, "loss": 0.002, "reward": 1.9207500219345093, "reward_std": 0.145677387714386, "rewards/accuracy_reward": 0.7707500457763672, "rewards/format_reward": 1.0, "step": 664 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.90625, "epoch": 0.006713780918727915, "grad_norm": 2.868888402977442, "kl": 0.057373046875, "learning_rate": 9.99888786378159e-07, "loss": 0.0023, "reward": 1.9500000476837158, "reward_std": 0.030459992587566376, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 665 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 448.21875, "epoch": 0.006723876829883897, "grad_norm": 1.828200855329521, "kl": 0.052734375, "learning_rate": 9.998884516620612e-07, "loss": 0.0021, "reward": 1.9356250762939453, "reward_std": 0.1676950752735138, "rewards/accuracy_reward": 0.7668750286102295, "rewards/format_reward": 1.0, "step": 666 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.006733972741039879, "grad_norm": 1.9624109730654642, "kl": 0.051025390625, "learning_rate": 9.998881164430836e-07, "loss": 0.002, "reward": 2.058718681335449, "reward_std": 0.02815668284893036, "rewards/accuracy_reward": 0.858718752861023, "rewards/format_reward": 1.0, "step": 667 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 442.9375, "epoch": 0.0067440686521958604, "grad_norm": 3.457049601714292, "kl": 0.049560546875, "learning_rate": 9.99887780721227e-07, "loss": 0.002, "reward": 1.904250144958496, "reward_std": 0.25545430183410645, "rewards/accuracy_reward": 0.7542500495910645, "rewards/format_reward": 1.0, "step": 668 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 432.84375, "epoch": 0.006754164563351843, "grad_norm": 2.2493650508683722, "kl": 0.0576171875, "learning_rate": 9.998874444964919e-07, "loss": 0.0023, "reward": 2.15625, "reward_std": 0.017570294439792633, "rewards/accuracy_reward": 0.9562499523162842, "rewards/format_reward": 1.0, "step": 669 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.625, "epoch": 0.006764260474507825, "grad_norm": 1.677884019859005, "kl": 0.050048828125, "learning_rate": 9.998871077688781e-07, "loss": 0.002, "reward": 2.1316561698913574, "reward_std": 0.017828866839408875, "rewards/accuracy_reward": 0.9316562414169312, "rewards/format_reward": 1.0, "step": 670 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.625, "epoch": 0.006774356385663806, "grad_norm": 2.8904463762907113, "kl": 0.05810546875, "learning_rate": 9.998867705383862e-07, "loss": 0.0023, "reward": 2.1229374408721924, "reward_std": 0.01742449216544628, "rewards/accuracy_reward": 0.9229375123977661, "rewards/format_reward": 1.0, "step": 671 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 438.375, "epoch": 0.006784452296819788, "grad_norm": 3.180447193557989, "kl": 0.052978515625, "learning_rate": 9.998864328050166e-07, "loss": 0.0021, "reward": 1.5552186965942383, "reward_std": 0.012292103841900826, "rewards/accuracy_reward": 0.45521876215934753, "rewards/format_reward": 1.0, "step": 672 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.6875, "epoch": 0.0067945482079757695, "grad_norm": 2.3088640270353977, "kl": 0.0556640625, "learning_rate": 9.998860945687694e-07, "loss": 0.0022, "reward": 2.140406370162964, "reward_std": 0.02593454346060753, "rewards/accuracy_reward": 0.9529062509536743, "rewards/format_reward": 1.0, "step": 673 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.84375, "epoch": 0.006804644119131752, "grad_norm": 2.2691221892307154, "kl": 0.0517578125, "learning_rate": 9.998857558296453e-07, "loss": 0.0021, "reward": 2.0274689197540283, "reward_std": 0.017982356250286102, "rewards/accuracy_reward": 0.827468752861023, "rewards/format_reward": 1.0, "step": 674 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 419.84375, "epoch": 0.006814740030287734, "grad_norm": 3.2574218550264042, "kl": 0.05859375, "learning_rate": 9.998854165876445e-07, "loss": 0.0023, "reward": 2.0008440017700195, "reward_std": 0.06016597896814346, "rewards/accuracy_reward": 0.8195936679840088, "rewards/format_reward": 1.0, "step": 675 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 434.125, "epoch": 0.006824835941443715, "grad_norm": 14.407032002392999, "kl": 0.052734375, "learning_rate": 9.998850768427673e-07, "loss": 0.0021, "reward": 1.792625069618225, "reward_std": 0.05385936051607132, "rewards/accuracy_reward": 0.6613750457763672, "rewards/format_reward": 1.0, "step": 676 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.875, "epoch": 0.006834931852599697, "grad_norm": 1.0002514154148687, "kl": 0.05810546875, "learning_rate": 9.99884736595014e-07, "loss": 0.0023, "reward": 2.0977187156677246, "reward_std": 0.020991669967770576, "rewards/accuracy_reward": 0.9039687514305115, "rewards/format_reward": 1.0, "step": 677 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.625, "epoch": 0.0068450277637556785, "grad_norm": 5.604398340254341, "kl": 0.05419921875, "learning_rate": 9.99884395844385e-07, "loss": 0.0022, "reward": 2.1167187690734863, "reward_std": 0.1325872242450714, "rewards/accuracy_reward": 0.9292187094688416, "rewards/format_reward": 1.0, "step": 678 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.90625, "epoch": 0.006855123674911661, "grad_norm": 4.38885614552897, "kl": 0.057861328125, "learning_rate": 9.998840545908806e-07, "loss": 0.0023, "reward": 2.0851094722747803, "reward_std": 0.04569800943136215, "rewards/accuracy_reward": 0.8976093530654907, "rewards/format_reward": 1.0, "step": 679 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 444.625, "epoch": 0.006865219586067643, "grad_norm": 2.033329797171011, "kl": 0.052734375, "learning_rate": 9.998837128345012e-07, "loss": 0.0021, "reward": 1.6663750410079956, "reward_std": 0.26167917251586914, "rewards/accuracy_reward": 0.5538750290870667, "rewards/format_reward": 1.0, "step": 680 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.21875, "epoch": 0.006875315497223624, "grad_norm": 5.622149377276499, "kl": 0.0556640625, "learning_rate": 9.998833705752472e-07, "loss": 0.0022, "reward": 2.104562520980835, "reward_std": 0.019932463765144348, "rewards/accuracy_reward": 0.9045624732971191, "rewards/format_reward": 1.0, "step": 681 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.006885411408379606, "grad_norm": 2.054658742017684, "kl": 0.0478515625, "learning_rate": 9.998830278131187e-07, "loss": 0.0019, "reward": 2.1273436546325684, "reward_std": 0.03600040078163147, "rewards/accuracy_reward": 0.9273436665534973, "rewards/format_reward": 1.0, "step": 682 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 404.09375, "epoch": 0.0068955073195355885, "grad_norm": 1.7646546910625345, "kl": 0.05322265625, "learning_rate": 9.99882684548116e-07, "loss": 0.0021, "reward": 1.81459379196167, "reward_std": 0.012947091832756996, "rewards/accuracy_reward": 0.6645938158035278, "rewards/format_reward": 1.0, "step": 683 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 412.40625, "epoch": 0.00690560323069157, "grad_norm": 2.5754593731202324, "kl": 0.052001953125, "learning_rate": 9.998823407802402e-07, "loss": 0.0021, "reward": 1.7976250648498535, "reward_std": 0.032844800502061844, "rewards/accuracy_reward": 0.6538750529289246, "rewards/format_reward": 1.0, "step": 684 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.96875, "epoch": 0.006915699141847552, "grad_norm": 1.96853356936911, "kl": 0.05322265625, "learning_rate": 9.998819965094906e-07, "loss": 0.0021, "reward": 1.9418751001358032, "reward_std": 0.045087769627571106, "rewards/accuracy_reward": 0.7543749809265137, "rewards/format_reward": 1.0, "step": 685 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 406.46875, "epoch": 0.006925795053003533, "grad_norm": 0.08443923467439143, "kl": 0.049072265625, "learning_rate": 9.998816517358683e-07, "loss": 0.002, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 686 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.03125, "epoch": 0.006935890964159515, "grad_norm": 2.418989259309181, "kl": 0.0517578125, "learning_rate": 9.998813064593734e-07, "loss": 0.0021, "reward": 1.721343755722046, "reward_std": 0.031172411516308784, "rewards/accuracy_reward": 0.5775938034057617, "rewards/format_reward": 1.0, "step": 687 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.0069459868753154975, "grad_norm": 2.257410092474916, "kl": 0.060791015625, "learning_rate": 9.99880960680006e-07, "loss": 0.0024, "reward": 2.064218759536743, "reward_std": 0.027613256126642227, "rewards/accuracy_reward": 0.8642187118530273, "rewards/format_reward": 1.0, "step": 688 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.006956082786471479, "grad_norm": 5.100745051040443, "kl": 0.061767578125, "learning_rate": 9.99880614397767e-07, "loss": 0.0025, "reward": 1.9768123626708984, "reward_std": 0.043186310678720474, "rewards/accuracy_reward": 0.7768125534057617, "rewards/format_reward": 1.0, "step": 689 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.90625, "epoch": 0.006966178697627461, "grad_norm": 2.5991745781394937, "kl": 0.04833984375, "learning_rate": 9.998802676126562e-07, "loss": 0.0019, "reward": 2.0755624771118164, "reward_std": 0.12105938047170639, "rewards/accuracy_reward": 0.8818124532699585, "rewards/format_reward": 1.0, "step": 690 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.40625, "epoch": 0.006976274608783443, "grad_norm": 1.654563843166401, "kl": 0.04931640625, "learning_rate": 9.998799203246745e-07, "loss": 0.002, "reward": 1.8270313739776611, "reward_std": 0.01154839526861906, "rewards/accuracy_reward": 0.6770312786102295, "rewards/format_reward": 1.0, "step": 691 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.46875, "epoch": 0.006986370519939424, "grad_norm": 3.9945095885150286, "kl": 0.05126953125, "learning_rate": 9.998795725338215e-07, "loss": 0.0021, "reward": 1.9909999370574951, "reward_std": 0.009449097327888012, "rewards/accuracy_reward": 0.7910000085830688, "rewards/format_reward": 1.0, "step": 692 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.59375, "epoch": 0.0069964664310954066, "grad_norm": 1.476130463379837, "kl": 0.05126953125, "learning_rate": 9.998792242400982e-07, "loss": 0.0021, "reward": 1.7963751554489136, "reward_std": 0.016748521476984024, "rewards/accuracy_reward": 0.6463750004768372, "rewards/format_reward": 1.0, "step": 693 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.46875, "epoch": 0.007006562342251388, "grad_norm": 2.136207385468365, "kl": 0.054443359375, "learning_rate": 9.998788754435048e-07, "loss": 0.0022, "reward": 2.0172812938690186, "reward_std": 0.020562410354614258, "rewards/accuracy_reward": 0.8172812461853027, "rewards/format_reward": 1.0, "step": 694 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 409.9375, "epoch": 0.00701665825340737, "grad_norm": 2.120757518100125, "kl": 0.052490234375, "learning_rate": 9.998785261440415e-07, "loss": 0.0021, "reward": 1.8651875257492065, "reward_std": 0.01804208755493164, "rewards/accuracy_reward": 0.7151875495910645, "rewards/format_reward": 1.0, "step": 695 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.21875, "epoch": 0.007026754164563352, "grad_norm": 2.0259726666856315, "kl": 0.0498046875, "learning_rate": 9.998781763417087e-07, "loss": 0.002, "reward": 1.7834376096725464, "reward_std": 0.046271346509456635, "rewards/accuracy_reward": 0.6396874785423279, "rewards/format_reward": 1.0, "step": 696 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.28125, "epoch": 0.0070368500757193335, "grad_norm": 2.779776684393619, "kl": 0.0615234375, "learning_rate": 9.998778260365069e-07, "loss": 0.0025, "reward": 2.0723438262939453, "reward_std": 0.031011858955025673, "rewards/accuracy_reward": 0.8723437190055847, "rewards/format_reward": 1.0, "step": 697 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 377.21875, "epoch": 0.007046945986875316, "grad_norm": 2.5832794217536335, "kl": 0.064453125, "learning_rate": 9.998774752284362e-07, "loss": 0.0026, "reward": 1.9155001640319824, "reward_std": 0.05494452267885208, "rewards/accuracy_reward": 0.7280000448226929, "rewards/format_reward": 1.0, "step": 698 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 380.3125, "epoch": 0.007057041898031297, "grad_norm": 2.285869905358857, "kl": 0.0546875, "learning_rate": 9.99877123917497e-07, "loss": 0.0022, "reward": 1.7779375314712524, "reward_std": 0.04144357889890671, "rewards/accuracy_reward": 0.6341875195503235, "rewards/format_reward": 1.0, "step": 699 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 378.21875, "epoch": 0.007067137809187279, "grad_norm": 2.187701195011851, "kl": 0.058349609375, "learning_rate": 9.998767721036901e-07, "loss": 0.0023, "reward": 1.625437617301941, "reward_std": 0.038510873913764954, "rewards/accuracy_reward": 0.4816875457763672, "rewards/format_reward": 1.0, "step": 700 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.21875, "epoch": 0.007077233720343261, "grad_norm": 2.8921510115972895, "kl": 0.052490234375, "learning_rate": 9.998764197870152e-07, "loss": 0.0021, "reward": 2.0499062538146973, "reward_std": 0.025859687477350235, "rewards/accuracy_reward": 0.8561562299728394, "rewards/format_reward": 1.0, "step": 701 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.15625, "epoch": 0.0070873296314992425, "grad_norm": 3.701838689163377, "kl": 0.04931640625, "learning_rate": 9.998760669674731e-07, "loss": 0.002, "reward": 1.785843849182129, "reward_std": 0.03211021423339844, "rewards/accuracy_reward": 0.6358437538146973, "rewards/format_reward": 1.0, "step": 702 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.28125, "epoch": 0.007097425542655225, "grad_norm": 2.325000275313603, "kl": 0.04931640625, "learning_rate": 9.99875713645064e-07, "loss": 0.002, "reward": 2.1484999656677246, "reward_std": 0.0703163594007492, "rewards/accuracy_reward": 0.9672499895095825, "rewards/format_reward": 1.0, "step": 703 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.007107521453811207, "grad_norm": 4.697121444053384, "kl": 0.0556640625, "learning_rate": 9.998753598197882e-07, "loss": 0.0022, "reward": 1.9413750171661377, "reward_std": 0.02366369217634201, "rewards/accuracy_reward": 0.7413749694824219, "rewards/format_reward": 1.0, "step": 704 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.34375, "epoch": 0.007117617364967188, "grad_norm": 1.9162549703809229, "kl": 0.046630859375, "learning_rate": 9.99875005491646e-07, "loss": 0.0019, "reward": 2.098249912261963, "reward_std": 0.022627420723438263, "rewards/accuracy_reward": 0.9045000076293945, "rewards/format_reward": 1.0, "step": 705 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.75, "epoch": 0.00712771327612317, "grad_norm": 2.242491480352827, "kl": 0.052490234375, "learning_rate": 9.99874650660638e-07, "loss": 0.0021, "reward": 2.0904688835144043, "reward_std": 0.039197057485580444, "rewards/accuracy_reward": 0.8967187404632568, "rewards/format_reward": 1.0, "step": 706 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.28125, "epoch": 0.0071378091872791516, "grad_norm": 2.5667721437830333, "kl": 0.060546875, "learning_rate": 9.998742953267645e-07, "loss": 0.0024, "reward": 1.716343879699707, "reward_std": 0.026789672672748566, "rewards/accuracy_reward": 0.5725938081741333, "rewards/format_reward": 1.0, "step": 707 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.625, "epoch": 0.007147905098435134, "grad_norm": 1.4960716171429973, "kl": 0.04736328125, "learning_rate": 9.998739394900257e-07, "loss": 0.0019, "reward": 2.081437587738037, "reward_std": 0.023359324783086777, "rewards/accuracy_reward": 0.8876874446868896, "rewards/format_reward": 1.0, "step": 708 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.0, "epoch": 0.007158001009591116, "grad_norm": 2.0214338938642196, "kl": 0.05126953125, "learning_rate": 9.998735831504222e-07, "loss": 0.0021, "reward": 1.8400312662124634, "reward_std": 0.043065641075372696, "rewards/accuracy_reward": 0.6900312304496765, "rewards/format_reward": 1.0, "step": 709 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 417.78125, "epoch": 0.007168096920747097, "grad_norm": 2.0262516888090674, "kl": 0.054931640625, "learning_rate": 9.99873226307954e-07, "loss": 0.0022, "reward": 1.8008124828338623, "reward_std": 0.023399490863084793, "rewards/accuracy_reward": 0.6508125066757202, "rewards/format_reward": 1.0, "step": 710 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.6875, "epoch": 0.007178192831903079, "grad_norm": 2.8043831971859605, "kl": 0.05859375, "learning_rate": 9.998728689626216e-07, "loss": 0.0023, "reward": 1.981156349182129, "reward_std": 0.04058556631207466, "rewards/accuracy_reward": 0.7811562418937683, "rewards/format_reward": 1.0, "step": 711 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.0071882887430590615, "grad_norm": 4.306119655083141, "kl": 0.05126953125, "learning_rate": 9.998725111144258e-07, "loss": 0.002, "reward": 1.9840937852859497, "reward_std": 0.05257803201675415, "rewards/accuracy_reward": 0.7840937972068787, "rewards/format_reward": 1.0, "step": 712 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.34375, "epoch": 0.007198384654215043, "grad_norm": 1.5500161707181985, "kl": 0.054931640625, "learning_rate": 9.998721527633661e-07, "loss": 0.0022, "reward": 2.1584062576293945, "reward_std": 0.014598570764064789, "rewards/accuracy_reward": 0.9584063291549683, "rewards/format_reward": 1.0, "step": 713 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.09375, "epoch": 0.007208480565371025, "grad_norm": 1.8148372977074991, "kl": 0.052001953125, "learning_rate": 9.998717939094438e-07, "loss": 0.0021, "reward": 2.0919687747955322, "reward_std": 0.044501807540655136, "rewards/accuracy_reward": 0.8982187509536743, "rewards/format_reward": 1.0, "step": 714 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 440.40625, "epoch": 0.007218576476527006, "grad_norm": 1.85113743521993, "kl": 0.058837890625, "learning_rate": 9.998714345526586e-07, "loss": 0.0023, "reward": 1.7995624542236328, "reward_std": 0.04202977195382118, "rewards/accuracy_reward": 0.6620625257492065, "rewards/format_reward": 1.0, "step": 715 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.03125, "epoch": 0.007228672387682988, "grad_norm": 1.9808783162930081, "kl": 0.052734375, "learning_rate": 9.99871074693011e-07, "loss": 0.0021, "reward": 2.0832812786102295, "reward_std": 0.06513816118240356, "rewards/accuracy_reward": 0.8832812309265137, "rewards/format_reward": 1.0, "step": 716 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.6875, "epoch": 0.0072387682988389705, "grad_norm": 1.8816797336295736, "kl": 0.048583984375, "learning_rate": 9.998707143305017e-07, "loss": 0.0019, "reward": 2.1288437843322754, "reward_std": 0.0355895571410656, "rewards/accuracy_reward": 0.9350937604904175, "rewards/format_reward": 1.0, "step": 717 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.90625, "epoch": 0.007248864209994952, "grad_norm": 2.8475779033811337, "kl": 0.06201171875, "learning_rate": 9.998703534651304e-07, "loss": 0.0025, "reward": 1.98046875, "reward_std": 0.05403219908475876, "rewards/accuracy_reward": 0.7867187261581421, "rewards/format_reward": 1.0, "step": 718 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 425.625, "epoch": 0.007258960121150934, "grad_norm": 1.3959799597887492, "kl": 0.055419921875, "learning_rate": 9.998699920968982e-07, "loss": 0.0022, "reward": 1.6738126277923584, "reward_std": 0.029878929257392883, "rewards/accuracy_reward": 0.5300624966621399, "rewards/format_reward": 1.0, "step": 719 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.125, "epoch": 0.007269056032306916, "grad_norm": 1.8186480455361456, "kl": 0.05859375, "learning_rate": 9.99869630225805e-07, "loss": 0.0023, "reward": 1.7550625801086426, "reward_std": 0.02119395136833191, "rewards/accuracy_reward": 0.6050624847412109, "rewards/format_reward": 1.0, "step": 720 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 428.875, "epoch": 0.007279151943462897, "grad_norm": 1.9172665506086557, "kl": 0.056640625, "learning_rate": 9.998692678518512e-07, "loss": 0.0023, "reward": 1.4309375286102295, "reward_std": 0.018196381628513336, "rewards/accuracy_reward": 0.3309374749660492, "rewards/format_reward": 1.0, "step": 721 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.78125, "epoch": 0.00728924785461888, "grad_norm": 2.7563905516345675, "kl": 0.0478515625, "learning_rate": 9.998689049750372e-07, "loss": 0.0019, "reward": 2.0989065170288086, "reward_std": 0.03250107541680336, "rewards/accuracy_reward": 0.9051563143730164, "rewards/format_reward": 1.0, "step": 722 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.28125, "epoch": 0.007299343765774861, "grad_norm": 7.080444497739649, "kl": 0.048583984375, "learning_rate": 9.998685415953637e-07, "loss": 0.0019, "reward": 2.1173439025878906, "reward_std": 0.022001463919878006, "rewards/accuracy_reward": 0.91734379529953, "rewards/format_reward": 1.0, "step": 723 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.875, "epoch": 0.007309439676930843, "grad_norm": 1.8484920014841266, "kl": 0.05029296875, "learning_rate": 9.998681777128306e-07, "loss": 0.002, "reward": 2.0646250247955322, "reward_std": 0.043352462351322174, "rewards/accuracy_reward": 0.8708750605583191, "rewards/format_reward": 1.0, "step": 724 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.8125, "epoch": 0.007319535588086825, "grad_norm": 3.2064493896698543, "kl": 0.0537109375, "learning_rate": 9.998678133274383e-07, "loss": 0.0021, "reward": 2.0492501258850098, "reward_std": 0.054910775274038315, "rewards/accuracy_reward": 0.8555000424385071, "rewards/format_reward": 1.0, "step": 725 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.71875, "epoch": 0.0073296314992428065, "grad_norm": 2.5634155778392036, "kl": 0.049560546875, "learning_rate": 9.998674484391877e-07, "loss": 0.002, "reward": 2.0573437213897705, "reward_std": 0.03810930252075195, "rewards/accuracy_reward": 0.8635936975479126, "rewards/format_reward": 1.0, "step": 726 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 427.21875, "epoch": 0.007339727410398789, "grad_norm": 1.7700964304530749, "kl": 0.050048828125, "learning_rate": 9.998670830480785e-07, "loss": 0.002, "reward": 1.8648124933242798, "reward_std": 0.011256733909249306, "rewards/accuracy_reward": 0.7148125171661377, "rewards/format_reward": 1.0, "step": 727 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.09375, "epoch": 0.00734982332155477, "grad_norm": 1.470264634993751, "kl": 0.046142578125, "learning_rate": 9.998667171541113e-07, "loss": 0.0018, "reward": 2.177781105041504, "reward_std": 0.01581816002726555, "rewards/accuracy_reward": 0.9777811765670776, "rewards/format_reward": 1.0, "step": 728 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.21875, "epoch": 0.007359919232710752, "grad_norm": 5.634367214596626, "kl": 0.05322265625, "learning_rate": 9.998663507572867e-07, "loss": 0.0021, "reward": 2.0432188510894775, "reward_std": 0.021332476288080215, "rewards/accuracy_reward": 0.8432188034057617, "rewards/format_reward": 1.0, "step": 729 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 413.8125, "epoch": 0.007370015143866734, "grad_norm": 1.8835492575732653, "kl": 0.05126953125, "learning_rate": 9.998659838576047e-07, "loss": 0.0021, "reward": 1.5544687509536743, "reward_std": 0.009603959508240223, "rewards/accuracy_reward": 0.4544687569141388, "rewards/format_reward": 1.0, "step": 730 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.09375, "epoch": 0.0073801110550227155, "grad_norm": 2.52694643064182, "kl": 0.051025390625, "learning_rate": 9.998656164550659e-07, "loss": 0.002, "reward": 2.0993125438690186, "reward_std": 0.05236663669347763, "rewards/accuracy_reward": 0.9118125438690186, "rewards/format_reward": 1.0, "step": 731 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.625, "epoch": 0.007390206966178698, "grad_norm": 2.0424643536387523, "kl": 0.046875, "learning_rate": 9.998652485496705e-07, "loss": 0.0019, "reward": 1.8508750200271606, "reward_std": 0.02456575818359852, "rewards/accuracy_reward": 0.7008750438690186, "rewards/format_reward": 1.0, "step": 732 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.34375, "epoch": 0.00740030287733468, "grad_norm": 4.08348515973685, "kl": 0.0546875, "learning_rate": 9.998648801414194e-07, "loss": 0.0022, "reward": 1.8307498693466187, "reward_std": 0.020929958671331406, "rewards/accuracy_reward": 0.6807500123977661, "rewards/format_reward": 1.0, "step": 733 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 0.007410398788490661, "grad_norm": 3.2346795893500535, "kl": 0.05224609375, "learning_rate": 9.998645112303121e-07, "loss": 0.0021, "reward": 1.7518749237060547, "reward_std": 0.343801885843277, "rewards/accuracy_reward": 0.5956249833106995, "rewards/format_reward": 1.0, "step": 734 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.75, "epoch": 0.007420494699646643, "grad_norm": 2.4531833219057857, "kl": 0.0439453125, "learning_rate": 9.998641418163498e-07, "loss": 0.0018, "reward": 1.9567813873291016, "reward_std": 0.1734793782234192, "rewards/accuracy_reward": 0.7817811965942383, "rewards/format_reward": 1.0, "step": 735 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 405.09375, "epoch": 0.0074305906108026246, "grad_norm": 1.89261437233514, "kl": 0.05615234375, "learning_rate": 9.998637718995324e-07, "loss": 0.0022, "reward": 1.5914063453674316, "reward_std": 0.18255221843719482, "rewards/accuracy_reward": 0.47265625, "rewards/format_reward": 1.0, "step": 736 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.007440686521958607, "grad_norm": 2.3089345779979022, "kl": 0.055419921875, "learning_rate": 9.998634014798601e-07, "loss": 0.0022, "reward": 2.13643741607666, "reward_std": 0.01795058697462082, "rewards/accuracy_reward": 0.9364375472068787, "rewards/format_reward": 1.0, "step": 737 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 401.625, "epoch": 0.007450782433114589, "grad_norm": 2.3974592780825805, "kl": 0.053466796875, "learning_rate": 9.998630305573338e-07, "loss": 0.0021, "reward": 1.842156171798706, "reward_std": 0.03744799643754959, "rewards/accuracy_reward": 0.6921562552452087, "rewards/format_reward": 1.0, "step": 738 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.21875, "epoch": 0.00746087834427057, "grad_norm": 1.7441394036286608, "kl": 0.058349609375, "learning_rate": 9.998626591319537e-07, "loss": 0.0023, "reward": 1.8369375467300415, "reward_std": 0.025194572284817696, "rewards/accuracy_reward": 0.6931874752044678, "rewards/format_reward": 1.0, "step": 739 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 381.375, "epoch": 0.007470974255426552, "grad_norm": 3.6703851028939423, "kl": 0.055419921875, "learning_rate": 9.9986228720372e-07, "loss": 0.0022, "reward": 1.8151562213897705, "reward_std": 0.03440900519490242, "rewards/accuracy_reward": 0.6714062690734863, "rewards/format_reward": 1.0, "step": 740 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.84375, "epoch": 0.0074810701665825345, "grad_norm": 18.11874361396192, "kl": 0.061767578125, "learning_rate": 9.99861914772633e-07, "loss": 0.0025, "reward": 2.1052498817443848, "reward_std": 0.056119490414857864, "rewards/accuracy_reward": 0.9177500009536743, "rewards/format_reward": 1.0, "step": 741 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.375, "epoch": 0.007491166077738516, "grad_norm": 2.7485730833619706, "kl": 0.055908203125, "learning_rate": 9.998615418386936e-07, "loss": 0.0022, "reward": 1.8598124980926514, "reward_std": 0.13527856767177582, "rewards/accuracy_reward": 0.7223124504089355, "rewards/format_reward": 1.0, "step": 742 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 389.875, "epoch": 0.007501261988894498, "grad_norm": 1.404221719979842, "kl": 0.0537109375, "learning_rate": 9.998611684019016e-07, "loss": 0.0021, "reward": 1.8455312252044678, "reward_std": 0.016109276562929153, "rewards/accuracy_reward": 0.6955312490463257, "rewards/format_reward": 1.0, "step": 743 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 380.03125, "epoch": 0.007511357900050479, "grad_norm": 2.6657447855786582, "kl": 0.0673828125, "learning_rate": 9.998607944622575e-07, "loss": 0.0027, "reward": 1.944906234741211, "reward_std": 0.043771810829639435, "rewards/accuracy_reward": 0.751156210899353, "rewards/format_reward": 1.0, "step": 744 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 374.5625, "epoch": 0.007521453811206461, "grad_norm": 4.4957628860071255, "kl": 0.052978515625, "learning_rate": 9.998604200197619e-07, "loss": 0.0021, "reward": 2.0346875190734863, "reward_std": 0.05587739124894142, "rewards/accuracy_reward": 0.8471875190734863, "rewards/format_reward": 1.0, "step": 745 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.0625, "epoch": 0.0075315497223624435, "grad_norm": 1.847764801522661, "kl": 0.053466796875, "learning_rate": 9.99860045074415e-07, "loss": 0.0021, "reward": 1.8393750190734863, "reward_std": 0.02336427941918373, "rewards/accuracy_reward": 0.6956250071525574, "rewards/format_reward": 1.0, "step": 746 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.625, "epoch": 0.007541645633518425, "grad_norm": 1.9748215349126426, "kl": 0.05908203125, "learning_rate": 9.998596696262175e-07, "loss": 0.0024, "reward": 2.051875114440918, "reward_std": 0.042688239365816116, "rewards/accuracy_reward": 0.8581249713897705, "rewards/format_reward": 1.0, "step": 747 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.0, "epoch": 0.007551741544674407, "grad_norm": 3.184919428967933, "kl": 0.052001953125, "learning_rate": 9.99859293675169e-07, "loss": 0.0021, "reward": 2.0373125076293945, "reward_std": 0.036510858684778214, "rewards/accuracy_reward": 0.8373124599456787, "rewards/format_reward": 1.0, "step": 748 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.03125, "epoch": 0.007561837455830388, "grad_norm": 1.9577088800732672, "kl": 0.054443359375, "learning_rate": 9.998589172212707e-07, "loss": 0.0022, "reward": 2.1425623893737793, "reward_std": 0.03807096928358078, "rewards/accuracy_reward": 0.9488126039505005, "rewards/format_reward": 1.0, "step": 749 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.03125, "epoch": 0.00757193336698637, "grad_norm": 2.0898486391999493, "kl": 0.057373046875, "learning_rate": 9.998585402645226e-07, "loss": 0.0023, "reward": 2.0703125, "reward_std": 0.028612695634365082, "rewards/accuracy_reward": 0.8703125715255737, "rewards/format_reward": 1.0, "step": 750 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.21875, "epoch": 0.007582029278142353, "grad_norm": 3.165542776027233, "kl": 0.0546875, "learning_rate": 9.998581628049253e-07, "loss": 0.0022, "reward": 1.8857499361038208, "reward_std": 0.03282430022954941, "rewards/accuracy_reward": 0.6857500076293945, "rewards/format_reward": 1.0, "step": 751 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.8125, "epoch": 0.007592125189298334, "grad_norm": 2.2003957512624863, "kl": 0.046875, "learning_rate": 9.998577848424788e-07, "loss": 0.0019, "reward": 2.1396875381469727, "reward_std": 0.04333290457725525, "rewards/accuracy_reward": 0.9459375143051147, "rewards/format_reward": 1.0, "step": 752 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.1875, "epoch": 0.007602221100454316, "grad_norm": 1.61154885365186, "kl": 0.04931640625, "learning_rate": 9.998574063771836e-07, "loss": 0.002, "reward": 2.149125099182129, "reward_std": 0.024684591218829155, "rewards/accuracy_reward": 0.9491249918937683, "rewards/format_reward": 1.0, "step": 753 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.1875, "epoch": 0.007612317011610298, "grad_norm": 2.0217139834097404, "kl": 0.0517578125, "learning_rate": 9.998570274090405e-07, "loss": 0.0021, "reward": 2.139312505722046, "reward_std": 0.047279104590415955, "rewards/accuracy_reward": 0.945562481880188, "rewards/format_reward": 1.0, "step": 754 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.1875, "epoch": 0.0076224129227662795, "grad_norm": 14.98594896519749, "kl": 0.0498046875, "learning_rate": 9.998566479380493e-07, "loss": 0.002, "reward": 2.0895626544952393, "reward_std": 0.05125473439693451, "rewards/accuracy_reward": 0.9083124399185181, "rewards/format_reward": 1.0, "step": 755 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.007632508833922262, "grad_norm": 2.1691883950357846, "kl": 0.053955078125, "learning_rate": 9.998562679642109e-07, "loss": 0.0022, "reward": 1.9140626192092896, "reward_std": 0.2968519330024719, "rewards/accuracy_reward": 0.745312511920929, "rewards/format_reward": 1.0, "step": 756 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.1875, "epoch": 0.007642604745078243, "grad_norm": 1.6131416494344974, "kl": 0.05810546875, "learning_rate": 9.998558874875252e-07, "loss": 0.0023, "reward": 2.060812473297119, "reward_std": 0.02564765140414238, "rewards/accuracy_reward": 0.8608124256134033, "rewards/format_reward": 1.0, "step": 757 }, { "all_correct": 0.25, "all_wrong": 0.75, "completion_length": 406.3125, "epoch": 0.007652700656234225, "grad_norm": 0.7813210038617496, "kl": 0.049072265625, "learning_rate": 9.998555065079927e-07, "loss": 0.002, "reward": 1.2937500476837158, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 758 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 383.90625, "epoch": 0.007662796567390207, "grad_norm": 2.4376105835371287, "kl": 0.0546875, "learning_rate": 9.998551250256142e-07, "loss": 0.0022, "reward": 1.880250096321106, "reward_std": 0.011651305481791496, "rewards/accuracy_reward": 0.7302500009536743, "rewards/format_reward": 1.0, "step": 759 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.6875, "epoch": 0.0076728924785461885, "grad_norm": 2.119674442093289, "kl": 0.05078125, "learning_rate": 9.998547430403897e-07, "loss": 0.002, "reward": 2.0314688682556152, "reward_std": 0.053694888949394226, "rewards/accuracy_reward": 0.8314687609672546, "rewards/format_reward": 1.0, "step": 760 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.3125, "epoch": 0.007682988389702171, "grad_norm": 3.2366648117919072, "kl": 0.06201171875, "learning_rate": 9.998543605523195e-07, "loss": 0.0025, "reward": 2.018312454223633, "reward_std": 0.046777334064245224, "rewards/accuracy_reward": 0.8245624899864197, "rewards/format_reward": 1.0, "step": 761 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.59375, "epoch": 0.007693084300858153, "grad_norm": 13.577760729749807, "kl": 0.06640625, "learning_rate": 9.998539775614043e-07, "loss": 0.0027, "reward": 2.1015937328338623, "reward_std": 0.0311477892100811, "rewards/accuracy_reward": 0.9078437089920044, "rewards/format_reward": 1.0, "step": 762 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.15625, "epoch": 0.007703180212014134, "grad_norm": 5.889696048200557, "kl": 0.05859375, "learning_rate": 9.998535940676442e-07, "loss": 0.0023, "reward": 2.0547187328338623, "reward_std": 0.030958808958530426, "rewards/accuracy_reward": 0.8547187447547913, "rewards/format_reward": 1.0, "step": 763 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.4375, "epoch": 0.007713276123170116, "grad_norm": 1.7841155633298809, "kl": 0.05615234375, "learning_rate": 9.998532100710399e-07, "loss": 0.0023, "reward": 1.8505001068115234, "reward_std": 0.006060127168893814, "rewards/accuracy_reward": 0.7005000114440918, "rewards/format_reward": 1.0, "step": 764 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.03125, "epoch": 0.007723372034326098, "grad_norm": 1.510331882096646, "kl": 0.056396484375, "learning_rate": 9.998528255715914e-07, "loss": 0.0023, "reward": 2.1855626106262207, "reward_std": 0.010314828716218472, "rewards/accuracy_reward": 0.9855625629425049, "rewards/format_reward": 1.0, "step": 765 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.00773346794548208, "grad_norm": 3.0755876339577473, "kl": 0.05712890625, "learning_rate": 9.998524405692993e-07, "loss": 0.0023, "reward": 2.141218662261963, "reward_std": 0.03317878395318985, "rewards/accuracy_reward": 0.9474686980247498, "rewards/format_reward": 1.0, "step": 766 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.8125, "epoch": 0.007743563856638062, "grad_norm": 5.91082780869544, "kl": 0.060302734375, "learning_rate": 9.998520550641642e-07, "loss": 0.0024, "reward": 2.1127500534057617, "reward_std": 0.04787202924489975, "rewards/accuracy_reward": 0.918999969959259, "rewards/format_reward": 1.0, "step": 767 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.0625, "epoch": 0.007753659767794043, "grad_norm": 2.3331323642946447, "kl": 0.05908203125, "learning_rate": 9.998516690561863e-07, "loss": 0.0024, "reward": 1.9311094284057617, "reward_std": 0.1492132544517517, "rewards/accuracy_reward": 0.7686094045639038, "rewards/format_reward": 1.0, "step": 768 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.84375, "epoch": 0.007763755678950025, "grad_norm": 4.864364521834072, "kl": 0.059814453125, "learning_rate": 9.998512825453656e-07, "loss": 0.0024, "reward": 2.1357812881469727, "reward_std": 0.04075434431433678, "rewards/accuracy_reward": 0.9420313239097595, "rewards/format_reward": 1.0, "step": 769 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 430.59375, "epoch": 0.0077738515901060075, "grad_norm": 0.9932401099461322, "kl": 0.053955078125, "learning_rate": 9.998508955317031e-07, "loss": 0.0022, "reward": 1.4420000314712524, "reward_std": 0.13611315190792084, "rewards/accuracy_reward": 0.3607500195503235, "rewards/format_reward": 1.0, "step": 770 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 424.46875, "epoch": 0.007783947501261989, "grad_norm": 4.333894049023528, "kl": 0.060546875, "learning_rate": 9.99850508015199e-07, "loss": 0.0024, "reward": 1.7843124866485596, "reward_std": 0.021321747452020645, "rewards/accuracy_reward": 0.6343125104904175, "rewards/format_reward": 1.0, "step": 771 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 422.6875, "epoch": 0.007794043412417971, "grad_norm": 1.4797514442477162, "kl": 0.06103515625, "learning_rate": 9.998501199958534e-07, "loss": 0.0024, "reward": 1.8602501153945923, "reward_std": 0.004998024087399244, "rewards/accuracy_reward": 0.7102500200271606, "rewards/format_reward": 1.0, "step": 772 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.15625, "epoch": 0.007804139323573952, "grad_norm": 3.1329551225867154, "kl": 0.05224609375, "learning_rate": 9.99849731473667e-07, "loss": 0.0021, "reward": 2.0359063148498535, "reward_std": 0.055698931217193604, "rewards/accuracy_reward": 0.8421563506126404, "rewards/format_reward": 1.0, "step": 773 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 433.875, "epoch": 0.007814235234729934, "grad_norm": 2.6363521887560095, "kl": 0.055419921875, "learning_rate": 9.998493424486399e-07, "loss": 0.0022, "reward": 1.5082499980926514, "reward_std": 0.03566336631774902, "rewards/accuracy_reward": 0.40825000405311584, "rewards/format_reward": 1.0, "step": 774 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.21875, "epoch": 0.007824331145885917, "grad_norm": 2.104203975762311, "kl": 0.05712890625, "learning_rate": 9.99848952920773e-07, "loss": 0.0023, "reward": 2.1260313987731934, "reward_std": 0.03627306967973709, "rewards/accuracy_reward": 0.9322812557220459, "rewards/format_reward": 1.0, "step": 775 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.875, "epoch": 0.007834427057041899, "grad_norm": 2.219920694568044, "kl": 0.056640625, "learning_rate": 9.998485628900662e-07, "loss": 0.0023, "reward": 1.9305312633514404, "reward_std": 0.17818361520767212, "rewards/accuracy_reward": 0.7555312514305115, "rewards/format_reward": 1.0, "step": 776 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 426.5, "epoch": 0.007844522968197879, "grad_norm": 1.6815074481483037, "kl": 0.0517578125, "learning_rate": 9.998481723565202e-07, "loss": 0.0021, "reward": 2.144124984741211, "reward_std": 0.03280142694711685, "rewards/accuracy_reward": 0.9503750205039978, "rewards/format_reward": 1.0, "step": 777 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 443.625, "epoch": 0.007854618879353861, "grad_norm": 1.5725349812886742, "kl": 0.05810546875, "learning_rate": 9.998477813201352e-07, "loss": 0.0023, "reward": 1.83774995803833, "reward_std": 0.0323854461312294, "rewards/accuracy_reward": 0.6940000057220459, "rewards/format_reward": 1.0, "step": 778 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.875, "epoch": 0.007864714790509843, "grad_norm": 1.8801165786209342, "kl": 0.058349609375, "learning_rate": 9.998473897809114e-07, "loss": 0.0023, "reward": 2.0635313987731934, "reward_std": 0.12117913365364075, "rewards/accuracy_reward": 0.8760311603546143, "rewards/format_reward": 1.0, "step": 779 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 430.4375, "epoch": 0.007874810701665826, "grad_norm": 3.3396099324100956, "kl": 0.060791015625, "learning_rate": 9.9984699773885e-07, "loss": 0.0024, "reward": 1.8615938425064087, "reward_std": 0.01470961794257164, "rewards/accuracy_reward": 0.711593747138977, "rewards/format_reward": 1.0, "step": 780 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.375, "epoch": 0.007884906612821808, "grad_norm": 2.3687669970417407, "kl": 0.0615234375, "learning_rate": 9.998466051939505e-07, "loss": 0.0025, "reward": 1.9600937366485596, "reward_std": 0.20282196998596191, "rewards/accuracy_reward": 0.7850937247276306, "rewards/format_reward": 1.0, "step": 781 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 430.875, "epoch": 0.007895002523977788, "grad_norm": 2.8957039418297748, "kl": 0.0537109375, "learning_rate": 9.998462121462136e-07, "loss": 0.0021, "reward": 1.550624966621399, "reward_std": 0.020529191941022873, "rewards/accuracy_reward": 0.4506250023841858, "rewards/format_reward": 1.0, "step": 782 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.125, "epoch": 0.00790509843513377, "grad_norm": 1.7197169361695837, "kl": 0.0556640625, "learning_rate": 9.9984581859564e-07, "loss": 0.0022, "reward": 2.1535937786102295, "reward_std": 0.014123471453785896, "rewards/accuracy_reward": 0.9535937309265137, "rewards/format_reward": 1.0, "step": 783 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.8125, "epoch": 0.007915194346289752, "grad_norm": 2.809770137765933, "kl": 0.0576171875, "learning_rate": 9.998454245422296e-07, "loss": 0.0023, "reward": 1.778999924659729, "reward_std": 0.04218413308262825, "rewards/accuracy_reward": 0.6352500319480896, "rewards/format_reward": 1.0, "step": 784 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.5625, "epoch": 0.007925290257445735, "grad_norm": 2.593542000637018, "kl": 0.05419921875, "learning_rate": 9.998450299859831e-07, "loss": 0.0022, "reward": 2.053781509399414, "reward_std": 0.048881735652685165, "rewards/accuracy_reward": 0.8600311875343323, "rewards/format_reward": 1.0, "step": 785 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 435.625, "epoch": 0.007935386168601717, "grad_norm": 1.9761116616869632, "kl": 0.054931640625, "learning_rate": 9.99844634926901e-07, "loss": 0.0022, "reward": 1.8680624961853027, "reward_std": 0.11577572673559189, "rewards/accuracy_reward": 0.7243124842643738, "rewards/format_reward": 1.0, "step": 786 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.75, "epoch": 0.007945482079757699, "grad_norm": 1.9210385451253367, "kl": 0.0537109375, "learning_rate": 9.998442393649833e-07, "loss": 0.0021, "reward": 2.048093795776367, "reward_std": 0.045306652784347534, "rewards/accuracy_reward": 0.8605936765670776, "rewards/format_reward": 1.0, "step": 787 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.53125, "epoch": 0.00795557799091368, "grad_norm": 1.5724306908111354, "kl": 0.053955078125, "learning_rate": 9.998438433002307e-07, "loss": 0.0022, "reward": 2.1388282775878906, "reward_std": 0.010958120226860046, "rewards/accuracy_reward": 0.9388281106948853, "rewards/format_reward": 1.0, "step": 788 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.007965673902069662, "grad_norm": 2.515654994972178, "kl": 0.056884765625, "learning_rate": 9.998434467326435e-07, "loss": 0.0023, "reward": 2.032156229019165, "reward_std": 0.02643660083413124, "rewards/accuracy_reward": 0.8321563005447388, "rewards/format_reward": 1.0, "step": 789 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.875, "epoch": 0.007975769813225644, "grad_norm": 2.458897291026721, "kl": 0.0615234375, "learning_rate": 9.998430496622221e-07, "loss": 0.0025, "reward": 2.122593879699707, "reward_std": 0.021252810955047607, "rewards/accuracy_reward": 0.9225937128067017, "rewards/format_reward": 1.0, "step": 790 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 408.03125, "epoch": 0.007985865724381626, "grad_norm": 1.4042713593111171, "kl": 0.0634765625, "learning_rate": 9.998426520889671e-07, "loss": 0.0025, "reward": 1.527500033378601, "reward_std": 0.0046348790638148785, "rewards/accuracy_reward": 0.42750000953674316, "rewards/format_reward": 1.0, "step": 791 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.875, "epoch": 0.007995961635537608, "grad_norm": 2.328923528948126, "kl": 0.06103515625, "learning_rate": 9.998422540128786e-07, "loss": 0.0024, "reward": 2.118687629699707, "reward_std": 0.015096098184585571, "rewards/accuracy_reward": 0.9186875820159912, "rewards/format_reward": 1.0, "step": 792 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.34375, "epoch": 0.008006057546693588, "grad_norm": 3.121156349473336, "kl": 0.064453125, "learning_rate": 9.998418554339573e-07, "loss": 0.0026, "reward": 2.121500015258789, "reward_std": 0.03913139924407005, "rewards/accuracy_reward": 0.921500027179718, "rewards/format_reward": 1.0, "step": 793 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.90625, "epoch": 0.00801615345784957, "grad_norm": 1.594434493634004, "kl": 0.048828125, "learning_rate": 9.99841456352203e-07, "loss": 0.002, "reward": 2.061906337738037, "reward_std": 0.010462536476552486, "rewards/accuracy_reward": 0.8619061708450317, "rewards/format_reward": 1.0, "step": 794 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.375, "epoch": 0.008026249369005553, "grad_norm": 1.7151551607842763, "kl": 0.05859375, "learning_rate": 9.998410567676168e-07, "loss": 0.0023, "reward": 2.081031322479248, "reward_std": 0.15463325381278992, "rewards/accuracy_reward": 0.8935312628746033, "rewards/format_reward": 1.0, "step": 795 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 404.03125, "epoch": 0.008036345280161535, "grad_norm": 1.2628915938496141, "kl": 0.062255859375, "learning_rate": 9.998406566801988e-07, "loss": 0.0025, "reward": 1.873499870300293, "reward_std": 0.0066815330646932125, "rewards/accuracy_reward": 0.7235000133514404, "rewards/format_reward": 1.0, "step": 796 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.78125, "epoch": 0.008046441191317517, "grad_norm": 2.5980507012249183, "kl": 0.0517578125, "learning_rate": 9.998402560899495e-07, "loss": 0.0021, "reward": 2.1248438358306885, "reward_std": 0.0403231680393219, "rewards/accuracy_reward": 0.9310938119888306, "rewards/format_reward": 1.0, "step": 797 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.78125, "epoch": 0.008056537102473497, "grad_norm": 1.610066720047672, "kl": 0.052978515625, "learning_rate": 9.998398549968691e-07, "loss": 0.0021, "reward": 2.1134064197540283, "reward_std": 0.040091291069984436, "rewards/accuracy_reward": 0.9321562647819519, "rewards/format_reward": 1.0, "step": 798 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 384.625, "epoch": 0.00806663301362948, "grad_norm": 2.0483944871931263, "kl": 0.060546875, "learning_rate": 9.998394534009583e-07, "loss": 0.0024, "reward": 1.8440625667572021, "reward_std": 0.05717966705560684, "rewards/accuracy_reward": 0.7003124952316284, "rewards/format_reward": 1.0, "step": 799 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.59375, "epoch": 0.008076728924785462, "grad_norm": 7.34364463946595, "kl": 0.05419921875, "learning_rate": 9.998390513022172e-07, "loss": 0.0022, "reward": 2.0329999923706055, "reward_std": 0.02279065176844597, "rewards/accuracy_reward": 0.8330000042915344, "rewards/format_reward": 1.0, "step": 800 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.375, "epoch": 0.008086824835941444, "grad_norm": 2.0375105265866287, "kl": 0.0595703125, "learning_rate": 9.998386487006465e-07, "loss": 0.0024, "reward": 1.889312505722046, "reward_std": 0.10717283934354782, "rewards/accuracy_reward": 0.7393125295639038, "rewards/format_reward": 1.0, "step": 801 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 416.0, "epoch": 0.008096920747097426, "grad_norm": 3.246365079997074, "kl": 0.06103515625, "learning_rate": 9.998382455962463e-07, "loss": 0.0024, "reward": 1.7032500505447388, "reward_std": 0.16662459075450897, "rewards/accuracy_reward": 0.578249990940094, "rewards/format_reward": 1.0, "step": 802 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.008107016658253407, "grad_norm": 2.5727909420169217, "kl": 0.051513671875, "learning_rate": 9.998378419890172e-07, "loss": 0.0021, "reward": 2.1274375915527344, "reward_std": 0.03704307973384857, "rewards/accuracy_reward": 0.9274374842643738, "rewards/format_reward": 1.0, "step": 803 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.008117112569409389, "grad_norm": 2.896054784386168, "kl": 0.05615234375, "learning_rate": 9.998374378789596e-07, "loss": 0.0022, "reward": 2.0470314025878906, "reward_std": 0.0380992591381073, "rewards/accuracy_reward": 0.8532812595367432, "rewards/format_reward": 1.0, "step": 804 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.8125, "epoch": 0.00812720848056537, "grad_norm": 1.8835249470843622, "kl": 0.06396484375, "learning_rate": 9.998370332660737e-07, "loss": 0.0026, "reward": 2.1000001430511475, "reward_std": 0.056500211358070374, "rewards/accuracy_reward": 0.9125000238418579, "rewards/format_reward": 1.0, "step": 805 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.78125, "epoch": 0.008137304391721353, "grad_norm": 1.5357396859706782, "kl": 0.061767578125, "learning_rate": 9.998366281503602e-07, "loss": 0.0025, "reward": 2.182312488555908, "reward_std": 0.004656968638300896, "rewards/accuracy_reward": 0.9823124408721924, "rewards/format_reward": 1.0, "step": 806 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.9375, "epoch": 0.008147400302877335, "grad_norm": 1.3755895007950432, "kl": 0.05078125, "learning_rate": 9.998362225318194e-07, "loss": 0.002, "reward": 2.0991873741149902, "reward_std": 0.016933679580688477, "rewards/accuracy_reward": 0.8991875648498535, "rewards/format_reward": 1.0, "step": 807 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.008157496214033317, "grad_norm": 2.178087068832983, "kl": 0.047119140625, "learning_rate": 9.998358164104517e-07, "loss": 0.0019, "reward": 2.0449061393737793, "reward_std": 0.047607142478227615, "rewards/accuracy_reward": 0.8511562347412109, "rewards/format_reward": 1.0, "step": 808 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 409.21875, "epoch": 0.008167592125189298, "grad_norm": 0.9674809407444123, "kl": 0.06201171875, "learning_rate": 9.998354097862574e-07, "loss": 0.0025, "reward": 1.4874999523162842, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 809 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.5625, "epoch": 0.00817768803634528, "grad_norm": 2.1742678332009033, "kl": 0.0498046875, "learning_rate": 9.998350026592373e-07, "loss": 0.002, "reward": 1.9842500686645508, "reward_std": 0.020999809727072716, "rewards/accuracy_reward": 0.784250020980835, "rewards/format_reward": 1.0, "step": 810 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.25, "epoch": 0.008187783947501262, "grad_norm": 3.026030912583236, "kl": 0.052490234375, "learning_rate": 9.998345950293912e-07, "loss": 0.0021, "reward": 2.035531520843506, "reward_std": 0.28332918882369995, "rewards/accuracy_reward": 0.8667812347412109, "rewards/format_reward": 1.0, "step": 811 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.53125, "epoch": 0.008197879858657244, "grad_norm": 2.574952454541151, "kl": 0.050048828125, "learning_rate": 9.9983418689672e-07, "loss": 0.002, "reward": 2.0242812633514404, "reward_std": 0.026201022788882256, "rewards/accuracy_reward": 0.8242812156677246, "rewards/format_reward": 1.0, "step": 812 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 422.21875, "epoch": 0.008207975769813226, "grad_norm": 3.9314846748793433, "kl": 0.05224609375, "learning_rate": 9.998337782612238e-07, "loss": 0.0021, "reward": 1.944156289100647, "reward_std": 0.052794117480516434, "rewards/accuracy_reward": 0.7504062652587891, "rewards/format_reward": 1.0, "step": 813 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.21875, "epoch": 0.008218071680969207, "grad_norm": 1.952717875732758, "kl": 0.05419921875, "learning_rate": 9.998333691229032e-07, "loss": 0.0022, "reward": 1.956687569618225, "reward_std": 0.18824023008346558, "rewards/accuracy_reward": 0.7879374027252197, "rewards/format_reward": 1.0, "step": 814 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.9375, "epoch": 0.008228167592125189, "grad_norm": 1.7554180157844113, "kl": 0.06494140625, "learning_rate": 9.998329594817584e-07, "loss": 0.0026, "reward": 1.9229062795639038, "reward_std": 0.3146037459373474, "rewards/accuracy_reward": 0.7729063034057617, "rewards/format_reward": 1.0, "step": 815 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.5625, "epoch": 0.008238263503281171, "grad_norm": 2.409512098918597, "kl": 0.055908203125, "learning_rate": 9.998325493377903e-07, "loss": 0.0022, "reward": 2.0075623989105225, "reward_std": 0.029075918719172478, "rewards/accuracy_reward": 0.8075624704360962, "rewards/format_reward": 1.0, "step": 816 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.59375, "epoch": 0.008248359414437153, "grad_norm": 2.7213100174721485, "kl": 0.05078125, "learning_rate": 9.99832138690999e-07, "loss": 0.002, "reward": 1.7508125305175781, "reward_std": 0.02907530404627323, "rewards/accuracy_reward": 0.6008124947547913, "rewards/format_reward": 1.0, "step": 817 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 435.71875, "epoch": 0.008258455325593135, "grad_norm": 2.311354361397183, "kl": 0.0625, "learning_rate": 9.998317275413846e-07, "loss": 0.0025, "reward": 1.8100312948226929, "reward_std": 0.13382215797901154, "rewards/accuracy_reward": 0.672531247138977, "rewards/format_reward": 1.0, "step": 818 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.3125, "epoch": 0.008268551236749116, "grad_norm": 1.881559213815454, "kl": 0.05224609375, "learning_rate": 9.998313158889479e-07, "loss": 0.0021, "reward": 2.119374990463257, "reward_std": 0.026030711829662323, "rewards/accuracy_reward": 0.9193750023841858, "rewards/format_reward": 1.0, "step": 819 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 415.625, "epoch": 0.008278647147905098, "grad_norm": 1.858468615840649, "kl": 0.060791015625, "learning_rate": 9.998309037336895e-07, "loss": 0.0024, "reward": 1.5482187271118164, "reward_std": 0.027638111263513565, "rewards/accuracy_reward": 0.4544687569141388, "rewards/format_reward": 1.0, "step": 820 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.1875, "epoch": 0.00828874305906108, "grad_norm": 4.702984635997848, "kl": 0.059326171875, "learning_rate": 9.99830491075609e-07, "loss": 0.0024, "reward": 2.096750020980835, "reward_std": 0.03509029000997543, "rewards/accuracy_reward": 0.902999997138977, "rewards/format_reward": 1.0, "step": 821 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 425.75, "epoch": 0.008298838970217062, "grad_norm": 2.0361793327912383, "kl": 0.053955078125, "learning_rate": 9.998300779147078e-07, "loss": 0.0022, "reward": 1.7856249809265137, "reward_std": 0.04771796241402626, "rewards/accuracy_reward": 0.6356250643730164, "rewards/format_reward": 1.0, "step": 822 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 422.3125, "epoch": 0.008308934881373044, "grad_norm": 5.259475338007134, "kl": 0.055908203125, "learning_rate": 9.998296642509858e-07, "loss": 0.0022, "reward": 1.6964061260223389, "reward_std": 0.031188230961561203, "rewards/accuracy_reward": 0.5526562929153442, "rewards/format_reward": 1.0, "step": 823 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.1875, "epoch": 0.008319030792529027, "grad_norm": 1.8844753714106122, "kl": 0.054443359375, "learning_rate": 9.998292500844434e-07, "loss": 0.0022, "reward": 1.9882187843322754, "reward_std": 0.01870114356279373, "rewards/accuracy_reward": 0.7882187962532043, "rewards/format_reward": 1.0, "step": 824 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 416.78125, "epoch": 0.008329126703685007, "grad_norm": 3.1221044752471716, "kl": 0.06005859375, "learning_rate": 9.998288354150811e-07, "loss": 0.0024, "reward": 1.2706875801086426, "reward_std": 0.19267860054969788, "rewards/accuracy_reward": 0.22068750858306885, "rewards/format_reward": 1.0, "step": 825 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.625, "epoch": 0.00833922261484099, "grad_norm": 2.817584928817178, "kl": 0.06640625, "learning_rate": 9.998284202428995e-07, "loss": 0.0027, "reward": 1.8034687042236328, "reward_std": 0.16415970027446747, "rewards/accuracy_reward": 0.6409687995910645, "rewards/format_reward": 1.0, "step": 826 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.375, "epoch": 0.008349318525996971, "grad_norm": 2.7151514694149035, "kl": 0.051513671875, "learning_rate": 9.998280045678986e-07, "loss": 0.0021, "reward": 2.046781301498413, "reward_std": 0.06326673924922943, "rewards/accuracy_reward": 0.8467812538146973, "rewards/format_reward": 1.0, "step": 827 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 427.78125, "epoch": 0.008359414437152954, "grad_norm": 2.2139839699122095, "kl": 0.05078125, "learning_rate": 9.998275883900793e-07, "loss": 0.002, "reward": 1.741281270980835, "reward_std": 0.02183566614985466, "rewards/accuracy_reward": 0.5912812352180481, "rewards/format_reward": 1.0, "step": 828 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.008369510348308936, "grad_norm": 1.3490883923748824, "kl": 0.046142578125, "learning_rate": 9.998271717094415e-07, "loss": 0.0018, "reward": 2.1047186851501465, "reward_std": 0.0130130834877491, "rewards/accuracy_reward": 0.9047187566757202, "rewards/format_reward": 1.0, "step": 829 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.90625, "epoch": 0.008379606259464916, "grad_norm": 4.7345714607848395, "kl": 0.0517578125, "learning_rate": 9.99826754525986e-07, "loss": 0.0021, "reward": 1.9987499713897705, "reward_std": 0.17720037698745728, "rewards/accuracy_reward": 0.8237500190734863, "rewards/format_reward": 1.0, "step": 830 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 431.3125, "epoch": 0.008389702170620898, "grad_norm": 2.6705245640937774, "kl": 0.048828125, "learning_rate": 9.998263368397133e-07, "loss": 0.0019, "reward": 1.7742189168930054, "reward_std": 0.04191289097070694, "rewards/accuracy_reward": 0.6304687261581421, "rewards/format_reward": 1.0, "step": 831 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.00839979808177688, "grad_norm": 1.408672763700071, "kl": 0.046142578125, "learning_rate": 9.998259186506233e-07, "loss": 0.0018, "reward": 2.1645936965942383, "reward_std": 0.04852624610066414, "rewards/accuracy_reward": 0.9708437323570251, "rewards/format_reward": 1.0, "step": 832 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.46875, "epoch": 0.008409893992932863, "grad_norm": 2.3778953332204518, "kl": 0.048828125, "learning_rate": 9.99825499958717e-07, "loss": 0.002, "reward": 2.0818750858306885, "reward_std": 0.05131559073925018, "rewards/accuracy_reward": 0.8943749666213989, "rewards/format_reward": 1.0, "step": 833 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.90625, "epoch": 0.008419989904088845, "grad_norm": 1.710850559190705, "kl": 0.04931640625, "learning_rate": 9.998250807639946e-07, "loss": 0.002, "reward": 2.1619062423706055, "reward_std": 0.019314264878630638, "rewards/accuracy_reward": 0.9681562185287476, "rewards/format_reward": 1.0, "step": 834 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 424.8125, "epoch": 0.008430085815244825, "grad_norm": 1.839203593011165, "kl": 0.052734375, "learning_rate": 9.998246610664564e-07, "loss": 0.0021, "reward": 1.7120625972747803, "reward_std": 0.029671182855963707, "rewards/accuracy_reward": 0.5683124661445618, "rewards/format_reward": 1.0, "step": 835 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 395.8125, "epoch": 0.008440181726400807, "grad_norm": 2.1094314963728, "kl": 0.05419921875, "learning_rate": 9.998242408661028e-07, "loss": 0.0022, "reward": 1.6352500915527344, "reward_std": 0.032451238483190536, "rewards/accuracy_reward": 0.4852500259876251, "rewards/format_reward": 1.0, "step": 836 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.75, "epoch": 0.00845027763755679, "grad_norm": 4.431292005868651, "kl": 0.0478515625, "learning_rate": 9.998238201629346e-07, "loss": 0.0019, "reward": 2.103749990463257, "reward_std": 0.01952960714697838, "rewards/accuracy_reward": 0.9037500619888306, "rewards/format_reward": 1.0, "step": 837 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.09375, "epoch": 0.008460373548712772, "grad_norm": 2.6872033255329244, "kl": 0.05078125, "learning_rate": 9.998233989569517e-07, "loss": 0.002, "reward": 1.807281255722046, "reward_std": 0.034374311566352844, "rewards/accuracy_reward": 0.6635313034057617, "rewards/format_reward": 1.0, "step": 838 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.9375, "epoch": 0.008470469459868754, "grad_norm": 1.9837054917569708, "kl": 0.047119140625, "learning_rate": 9.99822977248155e-07, "loss": 0.0019, "reward": 2.1370937824249268, "reward_std": 0.06482705473899841, "rewards/accuracy_reward": 0.9558437466621399, "rewards/format_reward": 1.0, "step": 839 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.75, "epoch": 0.008480565371024734, "grad_norm": 4.806524764062593, "kl": 0.04931640625, "learning_rate": 9.998225550365447e-07, "loss": 0.002, "reward": 2.0795626640319824, "reward_std": 0.050442956387996674, "rewards/accuracy_reward": 0.885812520980835, "rewards/format_reward": 1.0, "step": 840 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 409.40625, "epoch": 0.008490661282180716, "grad_norm": 1.1414157459496377, "kl": 0.04638671875, "learning_rate": 9.998221323221213e-07, "loss": 0.0018, "reward": 1.4937188625335693, "reward_std": 0.011563059873878956, "rewards/accuracy_reward": 0.3937187194824219, "rewards/format_reward": 1.0, "step": 841 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.90625, "epoch": 0.008500757193336698, "grad_norm": 1.721670497845263, "kl": 0.048828125, "learning_rate": 9.99821709104885e-07, "loss": 0.002, "reward": 2.063812732696533, "reward_std": 0.15641315281391144, "rewards/accuracy_reward": 0.8763125538825989, "rewards/format_reward": 1.0, "step": 842 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.1875, "epoch": 0.00851085310449268, "grad_norm": 1.5998639022687515, "kl": 0.04541015625, "learning_rate": 9.998212853848365e-07, "loss": 0.0018, "reward": 2.0363125801086426, "reward_std": 0.011419091373682022, "rewards/accuracy_reward": 0.8363125324249268, "rewards/format_reward": 1.0, "step": 843 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.15625, "epoch": 0.008520949015648663, "grad_norm": 2.5051487799735597, "kl": 0.0478515625, "learning_rate": 9.99820861161976e-07, "loss": 0.0019, "reward": 2.0406250953674316, "reward_std": 0.017679745331406593, "rewards/accuracy_reward": 0.8406249284744263, "rewards/format_reward": 1.0, "step": 844 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.75, "epoch": 0.008531044926804645, "grad_norm": 2.485822913670357, "kl": 0.049072265625, "learning_rate": 9.99820436436304e-07, "loss": 0.002, "reward": 1.870750069618225, "reward_std": 0.15145952999591827, "rewards/accuracy_reward": 0.7082500457763672, "rewards/format_reward": 1.0, "step": 845 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 402.78125, "epoch": 0.008541140837960625, "grad_norm": 2.941015627661736, "kl": 0.043701171875, "learning_rate": 9.998200112078212e-07, "loss": 0.0017, "reward": 1.5968749523162842, "reward_std": 0.008838832378387451, "rewards/accuracy_reward": 0.49687498807907104, "rewards/format_reward": 1.0, "step": 846 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.125, "epoch": 0.008551236749116608, "grad_norm": 1.8060483070572841, "kl": 0.048095703125, "learning_rate": 9.998195854765278e-07, "loss": 0.0019, "reward": 1.7949999570846558, "reward_std": 0.08973664790391922, "rewards/accuracy_reward": 0.6449999809265137, "rewards/format_reward": 1.0, "step": 847 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.375, "epoch": 0.00856133266027259, "grad_norm": 3.2547371222037476, "kl": 0.046142578125, "learning_rate": 9.998191592424239e-07, "loss": 0.0018, "reward": 2.069499969482422, "reward_std": 0.042390670627355576, "rewards/accuracy_reward": 0.869499921798706, "rewards/format_reward": 1.0, "step": 848 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.375, "epoch": 0.008571428571428572, "grad_norm": 2.5575702696788087, "kl": 0.04833984375, "learning_rate": 9.998187325055104e-07, "loss": 0.0019, "reward": 2.0915937423706055, "reward_std": 0.058445557951927185, "rewards/accuracy_reward": 0.8915938138961792, "rewards/format_reward": 1.0, "step": 849 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.4375, "epoch": 0.008581524482584554, "grad_norm": 2.2332127029471893, "kl": 0.052490234375, "learning_rate": 9.998183052657879e-07, "loss": 0.0021, "reward": 1.9036458730697632, "reward_std": 0.12231126427650452, "rewards/accuracy_reward": 0.7536457777023315, "rewards/format_reward": 1.0, "step": 850 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 413.59375, "epoch": 0.008591620393740534, "grad_norm": 4.457803344098139, "kl": 0.0517578125, "learning_rate": 9.998178775232562e-07, "loss": 0.0021, "reward": 1.7605626583099365, "reward_std": 0.020212415605783463, "rewards/accuracy_reward": 0.6105624437332153, "rewards/format_reward": 1.0, "step": 851 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.008601716304896517, "grad_norm": 6.497497609697782, "kl": 0.048828125, "learning_rate": 9.998174492779162e-07, "loss": 0.002, "reward": 2.1219687461853027, "reward_std": 0.040639154613018036, "rewards/accuracy_reward": 0.9219686985015869, "rewards/format_reward": 1.0, "step": 852 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.46875, "epoch": 0.008611812216052499, "grad_norm": 2.3009752017208656, "kl": 0.052978515625, "learning_rate": 9.99817020529768e-07, "loss": 0.0021, "reward": 2.0205938816070557, "reward_std": 0.17768199741840363, "rewards/accuracy_reward": 0.839343786239624, "rewards/format_reward": 1.0, "step": 853 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.0625, "epoch": 0.008621908127208481, "grad_norm": 1.9514002548603129, "kl": 0.05517578125, "learning_rate": 9.998165912788122e-07, "loss": 0.0022, "reward": 1.9841251373291016, "reward_std": 0.14990471303462982, "rewards/accuracy_reward": 0.7966249585151672, "rewards/format_reward": 1.0, "step": 854 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.03125, "epoch": 0.008632004038364463, "grad_norm": 2.9643943804872643, "kl": 0.051025390625, "learning_rate": 9.998161615250493e-07, "loss": 0.002, "reward": 1.956375002861023, "reward_std": 0.060635197907686234, "rewards/accuracy_reward": 0.7626250386238098, "rewards/format_reward": 1.0, "step": 855 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.5, "epoch": 0.008642099949520443, "grad_norm": 23.383798129808078, "kl": 0.054931640625, "learning_rate": 9.9981573126848e-07, "loss": 0.0022, "reward": 2.091249942779541, "reward_std": 0.04478508234024048, "rewards/accuracy_reward": 0.8912500143051147, "rewards/format_reward": 1.0, "step": 856 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 410.34375, "epoch": 0.008652195860676426, "grad_norm": 0.9467416699418654, "kl": 0.05029296875, "learning_rate": 9.998153005091038e-07, "loss": 0.002, "reward": 1.5901250839233398, "reward_std": 0.006057760212570429, "rewards/accuracy_reward": 0.49012500047683716, "rewards/format_reward": 1.0, "step": 857 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.34375, "epoch": 0.008662291771832408, "grad_norm": 1.6922286032374827, "kl": 0.0498046875, "learning_rate": 9.998148692469222e-07, "loss": 0.002, "reward": 2.0687499046325684, "reward_std": 0.03661306947469711, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 858 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.0, "epoch": 0.00867238768298839, "grad_norm": 2.615762032663738, "kl": 0.060791015625, "learning_rate": 9.99814437481935e-07, "loss": 0.0024, "reward": 1.7511093616485596, "reward_std": 0.04538225755095482, "rewards/accuracy_reward": 0.6073593497276306, "rewards/format_reward": 1.0, "step": 859 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.375, "epoch": 0.008682483594144372, "grad_norm": 1.9047537186333705, "kl": 0.053955078125, "learning_rate": 9.998140052141429e-07, "loss": 0.0022, "reward": 2.1199374198913574, "reward_std": 0.008818360045552254, "rewards/accuracy_reward": 0.9199374914169312, "rewards/format_reward": 1.0, "step": 860 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.65625, "epoch": 0.008692579505300353, "grad_norm": 4.095658856758433, "kl": 0.04833984375, "learning_rate": 9.99813572443546e-07, "loss": 0.0019, "reward": 2.1319375038146973, "reward_std": 0.049154914915561676, "rewards/accuracy_reward": 0.9381875395774841, "rewards/format_reward": 1.0, "step": 861 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.9375, "epoch": 0.008702675416456335, "grad_norm": 4.769845549507378, "kl": 0.055908203125, "learning_rate": 9.998131391701453e-07, "loss": 0.0022, "reward": 2.107062578201294, "reward_std": 0.0556693933904171, "rewards/accuracy_reward": 0.9070625305175781, "rewards/format_reward": 1.0, "step": 862 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.008712771327612317, "grad_norm": 2.4796567728729695, "kl": 0.051513671875, "learning_rate": 9.998127053939406e-07, "loss": 0.0021, "reward": 1.8909687995910645, "reward_std": 0.052700113505125046, "rewards/accuracy_reward": 0.6972187757492065, "rewards/format_reward": 1.0, "step": 863 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.75, "epoch": 0.008722867238768299, "grad_norm": 2.7125139526309607, "kl": 0.058837890625, "learning_rate": 9.99812271114933e-07, "loss": 0.0024, "reward": 1.7560625076293945, "reward_std": 0.02785191871225834, "rewards/accuracy_reward": 0.6123124957084656, "rewards/format_reward": 1.0, "step": 864 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.008732963149924281, "grad_norm": 1.9429385860338093, "kl": 0.056396484375, "learning_rate": 9.998118363331223e-07, "loss": 0.0023, "reward": 1.7612812519073486, "reward_std": 0.10643378645181656, "rewards/accuracy_reward": 0.6112812757492065, "rewards/format_reward": 1.0, "step": 865 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.5625, "epoch": 0.008743059061080263, "grad_norm": 21.479036535877107, "kl": 0.05517578125, "learning_rate": 9.998114010485093e-07, "loss": 0.0022, "reward": 2.050875186920166, "reward_std": 0.046250373125076294, "rewards/accuracy_reward": 0.8571249842643738, "rewards/format_reward": 1.0, "step": 866 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.40625, "epoch": 0.008753154972236244, "grad_norm": 2.0379782633032657, "kl": 0.058837890625, "learning_rate": 9.998109652610943e-07, "loss": 0.0024, "reward": 1.8206562995910645, "reward_std": 0.02034204825758934, "rewards/accuracy_reward": 0.6706562638282776, "rewards/format_reward": 1.0, "step": 867 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.65625, "epoch": 0.008763250883392226, "grad_norm": 3.190515354783493, "kl": 0.057861328125, "learning_rate": 9.998105289708779e-07, "loss": 0.0023, "reward": 2.041374921798706, "reward_std": 0.06388790160417557, "rewards/accuracy_reward": 0.8601250052452087, "rewards/format_reward": 1.0, "step": 868 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.0625, "epoch": 0.008773346794548208, "grad_norm": 2.806047629064524, "kl": 0.05517578125, "learning_rate": 9.998100921778603e-07, "loss": 0.0022, "reward": 1.9924376010894775, "reward_std": 0.05703843757510185, "rewards/accuracy_reward": 0.7924374938011169, "rewards/format_reward": 1.0, "step": 869 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.00878344270570419, "grad_norm": 2.3654795196412253, "kl": 0.060546875, "learning_rate": 9.998096548820423e-07, "loss": 0.0024, "reward": 2.0454375743865967, "reward_std": 0.03741580992937088, "rewards/accuracy_reward": 0.8454375267028809, "rewards/format_reward": 1.0, "step": 870 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.71875, "epoch": 0.008793538616860172, "grad_norm": 2.1562612651952766, "kl": 0.0546875, "learning_rate": 9.99809217083424e-07, "loss": 0.0022, "reward": 2.126593828201294, "reward_std": 0.03185024484992027, "rewards/accuracy_reward": 0.9265937209129333, "rewards/format_reward": 1.0, "step": 871 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.5625, "epoch": 0.008803634528016153, "grad_norm": 1.1851345569228025, "kl": 0.06298828125, "learning_rate": 9.99808778782006e-07, "loss": 0.0025, "reward": 1.7404062747955322, "reward_std": 0.16630885004997253, "rewards/accuracy_reward": 0.6154062151908875, "rewards/format_reward": 1.0, "step": 872 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.375, "epoch": 0.008813730439172135, "grad_norm": 1.8086277012010623, "kl": 0.05517578125, "learning_rate": 9.998083399777886e-07, "loss": 0.0022, "reward": 2.0475311279296875, "reward_std": 0.019611572846770287, "rewards/accuracy_reward": 0.8475311994552612, "rewards/format_reward": 1.0, "step": 873 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.96875, "epoch": 0.008823826350328117, "grad_norm": 1.3959208462849524, "kl": 0.050048828125, "learning_rate": 9.998079006707724e-07, "loss": 0.002, "reward": 2.096437692642212, "reward_std": 0.01707465574145317, "rewards/accuracy_reward": 0.8964375257492065, "rewards/format_reward": 1.0, "step": 874 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 413.90625, "epoch": 0.0088339222614841, "grad_norm": 3.142331222767786, "kl": 0.056884765625, "learning_rate": 9.998074608609579e-07, "loss": 0.0023, "reward": 1.792625069618225, "reward_std": 0.04018377140164375, "rewards/accuracy_reward": 0.6488749980926514, "rewards/format_reward": 1.0, "step": 875 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 385.84375, "epoch": 0.008844018172640081, "grad_norm": 1.2002443639714997, "kl": 0.060791015625, "learning_rate": 9.998070205483452e-07, "loss": 0.0024, "reward": 1.622187614440918, "reward_std": 0.09139057248830795, "rewards/accuracy_reward": 0.5221874713897705, "rewards/format_reward": 1.0, "step": 876 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.0, "epoch": 0.008854114083796062, "grad_norm": 2.7051806682048527, "kl": 0.052001953125, "learning_rate": 9.99806579732935e-07, "loss": 0.0021, "reward": 2.061812400817871, "reward_std": 0.04384909197688103, "rewards/accuracy_reward": 0.8618124723434448, "rewards/format_reward": 1.0, "step": 877 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.8125, "epoch": 0.008864209994952044, "grad_norm": 1.7221172486555256, "kl": 0.06005859375, "learning_rate": 9.998061384147278e-07, "loss": 0.0024, "reward": 1.793375015258789, "reward_std": 0.16251686215400696, "rewards/accuracy_reward": 0.655875027179718, "rewards/format_reward": 1.0, "step": 878 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.96875, "epoch": 0.008874305906108026, "grad_norm": 2.2142091311395085, "kl": 0.052001953125, "learning_rate": 9.99805696593724e-07, "loss": 0.0021, "reward": 1.8506875038146973, "reward_std": 0.045636821538209915, "rewards/accuracy_reward": 0.6506875157356262, "rewards/format_reward": 1.0, "step": 879 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.8125, "epoch": 0.008884401817264008, "grad_norm": 2.2184099915078788, "kl": 0.0576171875, "learning_rate": 9.99805254269924e-07, "loss": 0.0023, "reward": 1.882062554359436, "reward_std": 0.166863352060318, "rewards/accuracy_reward": 0.7195624709129333, "rewards/format_reward": 1.0, "step": 880 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.0, "epoch": 0.00889449772841999, "grad_norm": 1.421316273411413, "kl": 0.05224609375, "learning_rate": 9.99804811443328e-07, "loss": 0.0021, "reward": 2.062234401702881, "reward_std": 0.017981041222810745, "rewards/accuracy_reward": 0.8622344136238098, "rewards/format_reward": 1.0, "step": 881 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.59375, "epoch": 0.008904593639575971, "grad_norm": 7.172832290134371, "kl": 0.056640625, "learning_rate": 9.99804368113937e-07, "loss": 0.0023, "reward": 1.990875005722046, "reward_std": 0.02557540126144886, "rewards/accuracy_reward": 0.7908750176429749, "rewards/format_reward": 1.0, "step": 882 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.008914689550731953, "grad_norm": 9.39300139585317, "kl": 0.05859375, "learning_rate": 9.99803924281751e-07, "loss": 0.0023, "reward": 2.1250624656677246, "reward_std": 0.04131350666284561, "rewards/accuracy_reward": 0.9250625371932983, "rewards/format_reward": 1.0, "step": 883 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 407.6875, "epoch": 0.008924785461887935, "grad_norm": 3.0445293651649674, "kl": 0.05908203125, "learning_rate": 9.998034799467708e-07, "loss": 0.0024, "reward": 1.7512500286102295, "reward_std": 0.03937630355358124, "rewards/accuracy_reward": 0.6012499928474426, "rewards/format_reward": 1.0, "step": 884 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.03125, "epoch": 0.008934881373043917, "grad_norm": 4.566250892613651, "kl": 0.056396484375, "learning_rate": 9.998030351089962e-07, "loss": 0.0023, "reward": 2.131093740463257, "reward_std": 0.03168313577771187, "rewards/accuracy_reward": 0.9310937523841858, "rewards/format_reward": 1.0, "step": 885 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.25, "epoch": 0.0089449772841999, "grad_norm": 2.5498383390875143, "kl": 0.060302734375, "learning_rate": 9.998025897684284e-07, "loss": 0.0024, "reward": 1.8192812204360962, "reward_std": 0.01678788661956787, "rewards/accuracy_reward": 0.6692811846733093, "rewards/format_reward": 1.0, "step": 886 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.008955073195355882, "grad_norm": 3.9574364965116526, "kl": 0.046630859375, "learning_rate": 9.998021439250675e-07, "loss": 0.0019, "reward": 1.9537501335144043, "reward_std": 0.04404892399907112, "rewards/accuracy_reward": 0.7537500858306885, "rewards/format_reward": 1.0, "step": 887 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.84375, "epoch": 0.008965169106511862, "grad_norm": 2.2494701642525294, "kl": 0.057861328125, "learning_rate": 9.998016975789138e-07, "loss": 0.0023, "reward": 2.109187602996826, "reward_std": 0.0230390802025795, "rewards/accuracy_reward": 0.9091875553131104, "rewards/format_reward": 1.0, "step": 888 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 389.78125, "epoch": 0.008975265017667844, "grad_norm": 2.3708531753856033, "kl": 0.05419921875, "learning_rate": 9.99801250729968e-07, "loss": 0.0022, "reward": 1.9539687633514404, "reward_std": 0.05545176565647125, "rewards/accuracy_reward": 0.7602187395095825, "rewards/format_reward": 1.0, "step": 889 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.0, "epoch": 0.008985360928823826, "grad_norm": 1.7445214412210004, "kl": 0.051513671875, "learning_rate": 9.998008033782305e-07, "loss": 0.0021, "reward": 1.8458750247955322, "reward_std": 0.03623118996620178, "rewards/accuracy_reward": 0.7021249532699585, "rewards/format_reward": 1.0, "step": 890 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.3125, "epoch": 0.008995456839979809, "grad_norm": 3.560141844714833, "kl": 0.055419921875, "learning_rate": 9.998003555237017e-07, "loss": 0.0022, "reward": 2.121875047683716, "reward_std": 0.03971966356039047, "rewards/accuracy_reward": 0.9218749403953552, "rewards/format_reward": 1.0, "step": 891 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 380.6875, "epoch": 0.00900555275113579, "grad_norm": 4.446296585351229, "kl": 0.052734375, "learning_rate": 9.997999071663822e-07, "loss": 0.0021, "reward": 1.8613437414169312, "reward_std": 0.029906127601861954, "rewards/accuracy_reward": 0.7175937294960022, "rewards/format_reward": 1.0, "step": 892 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.625, "epoch": 0.009015648662291771, "grad_norm": 3.1320137899441205, "kl": 0.0498046875, "learning_rate": 9.997994583062722e-07, "loss": 0.002, "reward": 1.8351562023162842, "reward_std": 0.04320047050714493, "rewards/accuracy_reward": 0.6976563334465027, "rewards/format_reward": 1.0, "step": 893 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.625, "epoch": 0.009025744573447753, "grad_norm": 1.7429879672265738, "kl": 0.0498046875, "learning_rate": 9.997990089433723e-07, "loss": 0.002, "reward": 2.1406564712524414, "reward_std": 0.021406814455986023, "rewards/accuracy_reward": 0.940656304359436, "rewards/format_reward": 1.0, "step": 894 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.53125, "epoch": 0.009035840484603735, "grad_norm": 2.3951935932396435, "kl": 0.059814453125, "learning_rate": 9.997985590776828e-07, "loss": 0.0024, "reward": 2.116468667984009, "reward_std": 0.04250234365463257, "rewards/accuracy_reward": 0.9227187037467957, "rewards/format_reward": 1.0, "step": 895 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.21875, "epoch": 0.009045936395759718, "grad_norm": 4.01895882903814, "kl": 0.058349609375, "learning_rate": 9.997981087092044e-07, "loss": 0.0023, "reward": 2.1060314178466797, "reward_std": 0.04161589592695236, "rewards/accuracy_reward": 0.9122812747955322, "rewards/format_reward": 1.0, "step": 896 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.0090560323069157, "grad_norm": 2.3162182720379034, "kl": 0.06201171875, "learning_rate": 9.997976578379375e-07, "loss": 0.0025, "reward": 2.0149688720703125, "reward_std": 0.03269442543387413, "rewards/accuracy_reward": 0.821218729019165, "rewards/format_reward": 1.0, "step": 897 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.00906612821807168, "grad_norm": 2.2706830979692127, "kl": 0.059814453125, "learning_rate": 9.997972064638822e-07, "loss": 0.0024, "reward": 2.091218948364258, "reward_std": 0.04398360848426819, "rewards/accuracy_reward": 0.9037187695503235, "rewards/format_reward": 1.0, "step": 898 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.009076224129227662, "grad_norm": 3.034438112836575, "kl": 0.0498046875, "learning_rate": 9.997967545870396e-07, "loss": 0.002, "reward": 2.0547189712524414, "reward_std": 0.17546001076698303, "rewards/accuracy_reward": 0.8734687566757202, "rewards/format_reward": 1.0, "step": 899 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.0, "epoch": 0.009086320040383645, "grad_norm": 1.9711359606142378, "kl": 0.0595703125, "learning_rate": 9.997963022074096e-07, "loss": 0.0024, "reward": 2.0228750705718994, "reward_std": 0.16375020146369934, "rewards/accuracy_reward": 0.8416250348091125, "rewards/format_reward": 1.0, "step": 900 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.375, "epoch": 0.009096415951539627, "grad_norm": 2.204692220549978, "kl": 0.068359375, "learning_rate": 9.99795849324993e-07, "loss": 0.0027, "reward": 2.0854687690734863, "reward_std": 0.04230019450187683, "rewards/accuracy_reward": 0.8917187452316284, "rewards/format_reward": 1.0, "step": 901 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 422.71875, "epoch": 0.009106511862695609, "grad_norm": 2.537545647162328, "kl": 0.052001953125, "learning_rate": 9.9979539593979e-07, "loss": 0.0021, "reward": 1.5061249732971191, "reward_std": 0.0219787135720253, "rewards/accuracy_reward": 0.4123750329017639, "rewards/format_reward": 1.0, "step": 902 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.00911660777385159, "grad_norm": 2.571481102289498, "kl": 0.0654296875, "learning_rate": 9.99794942051801e-07, "loss": 0.0026, "reward": 2.0499374866485596, "reward_std": 0.020483683794736862, "rewards/accuracy_reward": 0.8499374985694885, "rewards/format_reward": 1.0, "step": 903 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 432.75, "epoch": 0.009126703685007571, "grad_norm": 2.21891829624773, "kl": 0.054443359375, "learning_rate": 9.99794487661027e-07, "loss": 0.0022, "reward": 2.1367812156677246, "reward_std": 0.02499227225780487, "rewards/accuracy_reward": 0.9367812871932983, "rewards/format_reward": 1.0, "step": 904 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.9375, "epoch": 0.009136799596163554, "grad_norm": 2.8926937847817036, "kl": 0.05517578125, "learning_rate": 9.997940327674676e-07, "loss": 0.0022, "reward": 2.0357813835144043, "reward_std": 0.0367196686565876, "rewards/accuracy_reward": 0.8420313000679016, "rewards/format_reward": 1.0, "step": 905 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 446.53125, "epoch": 0.009146895507319536, "grad_norm": 1.6638024331408177, "kl": 0.054443359375, "learning_rate": 9.997935773711241e-07, "loss": 0.0022, "reward": 1.706437587738037, "reward_std": 0.1548319309949875, "rewards/accuracy_reward": 0.5939375162124634, "rewards/format_reward": 1.0, "step": 906 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.84375, "epoch": 0.009156991418475518, "grad_norm": 2.069005565534888, "kl": 0.0634765625, "learning_rate": 9.997931214719965e-07, "loss": 0.0025, "reward": 1.8489999771118164, "reward_std": 0.02954774722456932, "rewards/accuracy_reward": 0.6990000009536743, "rewards/format_reward": 1.0, "step": 907 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 447.84375, "epoch": 0.0091670873296315, "grad_norm": 2.1536948146483206, "kl": 0.049072265625, "learning_rate": 9.997926650700853e-07, "loss": 0.002, "reward": 2.0126876831054688, "reward_std": 0.19206655025482178, "rewards/accuracy_reward": 0.8501874804496765, "rewards/format_reward": 1.0, "step": 908 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.28125, "epoch": 0.00917718324078748, "grad_norm": 2.484708147557248, "kl": 0.0673828125, "learning_rate": 9.99792208165391e-07, "loss": 0.0027, "reward": 2.0314688682556152, "reward_std": 0.04468701779842377, "rewards/accuracy_reward": 0.8377187252044678, "rewards/format_reward": 1.0, "step": 909 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.25, "epoch": 0.009187279151943463, "grad_norm": 2.432775775518575, "kl": 0.064453125, "learning_rate": 9.997917507579142e-07, "loss": 0.0026, "reward": 2.1469063758850098, "reward_std": 0.04915037751197815, "rewards/accuracy_reward": 0.9594062566757202, "rewards/format_reward": 1.0, "step": 910 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 433.65625, "epoch": 0.009197375063099445, "grad_norm": 2.8988863166442527, "kl": 0.064453125, "learning_rate": 9.99791292847655e-07, "loss": 0.0026, "reward": 2.0317187309265137, "reward_std": 0.06022502854466438, "rewards/accuracy_reward": 0.8504688143730164, "rewards/format_reward": 1.0, "step": 911 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 441.65625, "epoch": 0.009207470974255427, "grad_norm": 2.383650908711806, "kl": 0.057861328125, "learning_rate": 9.997908344346142e-07, "loss": 0.0023, "reward": 1.725406289100647, "reward_std": 0.03919292241334915, "rewards/accuracy_reward": 0.5754062533378601, "rewards/format_reward": 1.0, "step": 912 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 436.15625, "epoch": 0.009217566885411409, "grad_norm": 1.7328601858037378, "kl": 0.05517578125, "learning_rate": 9.997903755187923e-07, "loss": 0.0022, "reward": 1.5904687643051147, "reward_std": 0.007217860780656338, "rewards/accuracy_reward": 0.49046874046325684, "rewards/format_reward": 1.0, "step": 913 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.6875, "epoch": 0.00922766279656739, "grad_norm": 2.2947598201763433, "kl": 0.0595703125, "learning_rate": 9.997899161001894e-07, "loss": 0.0024, "reward": 2.101968765258789, "reward_std": 0.035636428743600845, "rewards/accuracy_reward": 0.9082187414169312, "rewards/format_reward": 1.0, "step": 914 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 440.96875, "epoch": 0.009237758707723372, "grad_norm": 3.810701623833797, "kl": 0.06298828125, "learning_rate": 9.997894561788063e-07, "loss": 0.0025, "reward": 1.992281198501587, "reward_std": 0.026126816868782043, "rewards/accuracy_reward": 0.7922812700271606, "rewards/format_reward": 1.0, "step": 915 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 435.1875, "epoch": 0.009247854618879354, "grad_norm": 1.7614967586269883, "kl": 0.05224609375, "learning_rate": 9.997889957546432e-07, "loss": 0.0021, "reward": 1.7449063062667847, "reward_std": 0.009561268612742424, "rewards/accuracy_reward": 0.594906210899353, "rewards/format_reward": 1.0, "step": 916 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 432.65625, "epoch": 0.009257950530035336, "grad_norm": 2.444228305388804, "kl": 0.062255859375, "learning_rate": 9.99788534827701e-07, "loss": 0.0025, "reward": 1.7058438062667847, "reward_std": 0.17731893062591553, "rewards/accuracy_reward": 0.5808437466621399, "rewards/format_reward": 1.0, "step": 917 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 443.6875, "epoch": 0.009268046441191318, "grad_norm": 2.4423476629581655, "kl": 0.0556640625, "learning_rate": 9.997880733979795e-07, "loss": 0.0022, "reward": 1.9882187843322754, "reward_std": 0.17056973278522491, "rewards/accuracy_reward": 0.8069686889648438, "rewards/format_reward": 1.0, "step": 918 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.25, "epoch": 0.009278142352347299, "grad_norm": 2.214230132620995, "kl": 0.0546875, "learning_rate": 9.997876114654798e-07, "loss": 0.0022, "reward": 2.049093723297119, "reward_std": 0.019009632989764214, "rewards/accuracy_reward": 0.8490937352180481, "rewards/format_reward": 1.0, "step": 919 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 437.75, "epoch": 0.00928823826350328, "grad_norm": 1.795794664347093, "kl": 0.055419921875, "learning_rate": 9.997871490302018e-07, "loss": 0.0022, "reward": 1.7952187061309814, "reward_std": 0.06039809063076973, "rewards/accuracy_reward": 0.6577187180519104, "rewards/format_reward": 1.0, "step": 920 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.8125, "epoch": 0.009298334174659263, "grad_norm": 2.9294110781404785, "kl": 0.058837890625, "learning_rate": 9.997866860921465e-07, "loss": 0.0024, "reward": 2.0499062538146973, "reward_std": 0.03382597118616104, "rewards/accuracy_reward": 0.8499062061309814, "rewards/format_reward": 1.0, "step": 921 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 431.8125, "epoch": 0.009308430085815245, "grad_norm": 2.113261956002569, "kl": 0.0625, "learning_rate": 9.99786222651314e-07, "loss": 0.0025, "reward": 1.843406319618225, "reward_std": 0.021154407411813736, "rewards/accuracy_reward": 0.6934062838554382, "rewards/format_reward": 1.0, "step": 922 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.78125, "epoch": 0.009318525996971227, "grad_norm": 1.700313389956953, "kl": 0.05859375, "learning_rate": 9.99785758707705e-07, "loss": 0.0024, "reward": 1.7074999809265137, "reward_std": 0.014774818904697895, "rewards/accuracy_reward": 0.5575000047683716, "rewards/format_reward": 1.0, "step": 923 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.96875, "epoch": 0.00932862190812721, "grad_norm": 2.9872065195978506, "kl": 0.05126953125, "learning_rate": 9.997852942613198e-07, "loss": 0.0021, "reward": 2.0074377059936523, "reward_std": 0.022495286539196968, "rewards/accuracy_reward": 0.8074374198913574, "rewards/format_reward": 1.0, "step": 924 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.5, "epoch": 0.00933871781928319, "grad_norm": 2.50106687371404, "kl": 0.0478515625, "learning_rate": 9.99784829312159e-07, "loss": 0.0019, "reward": 1.9256561994552612, "reward_std": 0.16856028139591217, "rewards/accuracy_reward": 0.756906270980835, "rewards/format_reward": 1.0, "step": 925 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.71875, "epoch": 0.009348813730439172, "grad_norm": 2.353318570061069, "kl": 0.040283203125, "learning_rate": 9.99784363860223e-07, "loss": 0.0016, "reward": 1.7752811908721924, "reward_std": 0.15701444447040558, "rewards/accuracy_reward": 0.6440312266349792, "rewards/format_reward": 1.0, "step": 926 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.65625, "epoch": 0.009358909641595154, "grad_norm": 1.7984480844441642, "kl": 0.04736328125, "learning_rate": 9.997838979055122e-07, "loss": 0.0019, "reward": 1.8339998722076416, "reward_std": 0.15074408054351807, "rewards/accuracy_reward": 0.671500027179718, "rewards/format_reward": 1.0, "step": 927 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.90625, "epoch": 0.009369005552751136, "grad_norm": 5.943453781491926, "kl": 0.06005859375, "learning_rate": 9.997834314480271e-07, "loss": 0.0024, "reward": 2.083625078201294, "reward_std": 0.12485168874263763, "rewards/accuracy_reward": 0.8961249589920044, "rewards/format_reward": 1.0, "step": 928 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 413.5, "epoch": 0.009379101463907118, "grad_norm": 4.63936121453321, "kl": 0.04833984375, "learning_rate": 9.997829644877681e-07, "loss": 0.0019, "reward": 1.746500015258789, "reward_std": 0.12001600861549377, "rewards/accuracy_reward": 0.6027500033378601, "rewards/format_reward": 1.0, "step": 929 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.09375, "epoch": 0.009389197375063099, "grad_norm": 1.840461895168125, "kl": 0.05419921875, "learning_rate": 9.99782497024736e-07, "loss": 0.0022, "reward": 2.067906379699707, "reward_std": 0.018247373402118683, "rewards/accuracy_reward": 0.8679062724113464, "rewards/format_reward": 1.0, "step": 930 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 404.78125, "epoch": 0.009399293286219081, "grad_norm": 2.9859778720920027, "kl": 0.0615234375, "learning_rate": 9.99782029058931e-07, "loss": 0.0025, "reward": 1.8185312747955322, "reward_std": 0.021518679335713387, "rewards/accuracy_reward": 0.6685311794281006, "rewards/format_reward": 1.0, "step": 931 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.6875, "epoch": 0.009409389197375063, "grad_norm": 2.3397536173829283, "kl": 0.045654296875, "learning_rate": 9.997815605903536e-07, "loss": 0.0018, "reward": 1.8960938453674316, "reward_std": 0.1652773916721344, "rewards/accuracy_reward": 0.7398437857627869, "rewards/format_reward": 1.0, "step": 932 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.5625, "epoch": 0.009419485108531045, "grad_norm": 2.615187728195696, "kl": 0.04833984375, "learning_rate": 9.99781091619004e-07, "loss": 0.0019, "reward": 1.5997188091278076, "reward_std": 0.19341713190078735, "rewards/accuracy_reward": 0.4997187554836273, "rewards/format_reward": 1.0, "step": 933 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 405.75, "epoch": 0.009429581019687027, "grad_norm": 2.2916133637143887, "kl": 0.05224609375, "learning_rate": 9.99780622144883e-07, "loss": 0.0021, "reward": 1.7827813625335693, "reward_std": 0.036504779011011124, "rewards/accuracy_reward": 0.6390312910079956, "rewards/format_reward": 1.0, "step": 934 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.09375, "epoch": 0.009439676930843008, "grad_norm": 1.994215276922213, "kl": 0.05078125, "learning_rate": 9.997801521679913e-07, "loss": 0.002, "reward": 1.9593126773834229, "reward_std": 0.1774912327528, "rewards/accuracy_reward": 0.7843124866485596, "rewards/format_reward": 1.0, "step": 935 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.5625, "epoch": 0.00944977284199899, "grad_norm": 2.1270568200321476, "kl": 0.04345703125, "learning_rate": 9.99779681688329e-07, "loss": 0.0017, "reward": 1.9592812061309814, "reward_std": 0.3033108115196228, "rewards/accuracy_reward": 0.8030312657356262, "rewards/format_reward": 1.0, "step": 936 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.3125, "epoch": 0.009459868753154972, "grad_norm": 2.6298695562551244, "kl": 0.04833984375, "learning_rate": 9.997792107058966e-07, "loss": 0.0019, "reward": 1.9961563348770142, "reward_std": 0.19640599191188812, "rewards/accuracy_reward": 0.8149062395095825, "rewards/format_reward": 1.0, "step": 937 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.0625, "epoch": 0.009469964664310954, "grad_norm": 1.9812050917605162, "kl": 0.050537109375, "learning_rate": 9.997787392206946e-07, "loss": 0.002, "reward": 1.8731249570846558, "reward_std": 0.009165996685624123, "rewards/accuracy_reward": 0.7231249809265137, "rewards/format_reward": 1.0, "step": 938 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.09375, "epoch": 0.009480060575466936, "grad_norm": 2.767972563280163, "kl": 0.060546875, "learning_rate": 9.997782672327238e-07, "loss": 0.0024, "reward": 2.0759687423706055, "reward_std": 0.01714373752474785, "rewards/accuracy_reward": 0.8759686946868896, "rewards/format_reward": 1.0, "step": 939 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.9375, "epoch": 0.009490156486622917, "grad_norm": 3.0471747435934815, "kl": 0.06396484375, "learning_rate": 9.99777794741984e-07, "loss": 0.0026, "reward": 1.8886563777923584, "reward_std": 0.05006755515933037, "rewards/accuracy_reward": 0.6949062347412109, "rewards/format_reward": 1.0, "step": 940 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.28125, "epoch": 0.009500252397778899, "grad_norm": 2.2648375200294435, "kl": 0.058349609375, "learning_rate": 9.997773217484764e-07, "loss": 0.0023, "reward": 2.0002188682556152, "reward_std": 0.023515615612268448, "rewards/accuracy_reward": 0.8002187609672546, "rewards/format_reward": 1.0, "step": 941 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.009510348308934881, "grad_norm": 1.9612839133129945, "kl": 0.0439453125, "learning_rate": 9.997768482522011e-07, "loss": 0.0018, "reward": 1.9771875143051147, "reward_std": 0.12008969485759735, "rewards/accuracy_reward": 0.7834374904632568, "rewards/format_reward": 1.0, "step": 942 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.28125, "epoch": 0.009520444220090863, "grad_norm": 1.143037470070718, "kl": 0.04443359375, "learning_rate": 9.997763742531585e-07, "loss": 0.0018, "reward": 2.1590938568115234, "reward_std": 0.007211021613329649, "rewards/accuracy_reward": 0.9590938091278076, "rewards/format_reward": 1.0, "step": 943 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 401.8125, "epoch": 0.009530540131246846, "grad_norm": 2.0394845821143317, "kl": 0.052734375, "learning_rate": 9.997758997513492e-07, "loss": 0.0021, "reward": 1.688906192779541, "reward_std": 0.02035578154027462, "rewards/accuracy_reward": 0.5389062166213989, "rewards/format_reward": 1.0, "step": 944 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.90625, "epoch": 0.009540636042402828, "grad_norm": 2.66323403026248, "kl": 0.0517578125, "learning_rate": 9.997754247467736e-07, "loss": 0.0021, "reward": 2.0002188682556152, "reward_std": 0.02322804369032383, "rewards/accuracy_reward": 0.8002188205718994, "rewards/format_reward": 1.0, "step": 945 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.009550731953558808, "grad_norm": 2.353565938139793, "kl": 0.05908203125, "learning_rate": 9.997749492394325e-07, "loss": 0.0024, "reward": 2.01924991607666, "reward_std": 0.02157290279865265, "rewards/accuracy_reward": 0.8192499876022339, "rewards/format_reward": 1.0, "step": 946 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.03125, "epoch": 0.00956082786471479, "grad_norm": 2.3927026229733728, "kl": 0.056396484375, "learning_rate": 9.99774473229326e-07, "loss": 0.0023, "reward": 1.9575937986373901, "reward_std": 0.04343781620264053, "rewards/accuracy_reward": 0.7638437151908875, "rewards/format_reward": 1.0, "step": 947 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.15625, "epoch": 0.009570923775870772, "grad_norm": 3.0867844807546057, "kl": 0.05126953125, "learning_rate": 9.997739967164546e-07, "loss": 0.0021, "reward": 1.8471875190734863, "reward_std": 0.04654403775930405, "rewards/accuracy_reward": 0.7096875309944153, "rewards/format_reward": 1.0, "step": 948 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.009581019687026755, "grad_norm": 4.122196513029733, "kl": 0.05859375, "learning_rate": 9.99773519700819e-07, "loss": 0.0023, "reward": 2.1596875190734863, "reward_std": 0.016676165163517, "rewards/accuracy_reward": 0.9596874713897705, "rewards/format_reward": 1.0, "step": 949 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.65625, "epoch": 0.009591115598182737, "grad_norm": 3.303493083565302, "kl": 0.05078125, "learning_rate": 9.997730421824195e-07, "loss": 0.002, "reward": 1.9800938367843628, "reward_std": 0.03776086866855621, "rewards/accuracy_reward": 0.780093789100647, "rewards/format_reward": 1.0, "step": 950 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.009601211509338717, "grad_norm": 3.315148578305322, "kl": 0.06494140625, "learning_rate": 9.997725641612568e-07, "loss": 0.0026, "reward": 2.034343719482422, "reward_std": 0.036925043910741806, "rewards/accuracy_reward": 0.840593695640564, "rewards/format_reward": 1.0, "step": 951 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.0096113074204947, "grad_norm": 9.460680943115205, "kl": 0.05615234375, "learning_rate": 9.99772085637331e-07, "loss": 0.0022, "reward": 2.083718776702881, "reward_std": 0.031084399670362473, "rewards/accuracy_reward": 0.883718729019165, "rewards/format_reward": 1.0, "step": 952 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.009621403331650681, "grad_norm": 1.916939455519062, "kl": 0.05224609375, "learning_rate": 9.99771606610643e-07, "loss": 0.0021, "reward": 2.1756250858306885, "reward_std": 0.03140364587306976, "rewards/accuracy_reward": 0.9818750023841858, "rewards/format_reward": 1.0, "step": 953 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.5, "epoch": 0.009631499242806664, "grad_norm": 2.2880517808143543, "kl": 0.048583984375, "learning_rate": 9.997711270811931e-07, "loss": 0.0019, "reward": 2.143812417984009, "reward_std": 0.018591120839118958, "rewards/accuracy_reward": 0.9438124895095825, "rewards/format_reward": 1.0, "step": 954 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.09375, "epoch": 0.009641595153962646, "grad_norm": 1.8888012361098185, "kl": 0.048828125, "learning_rate": 9.997706470489817e-07, "loss": 0.0019, "reward": 2.128000020980835, "reward_std": 0.04288956895470619, "rewards/accuracy_reward": 0.940500020980835, "rewards/format_reward": 1.0, "step": 955 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.96875, "epoch": 0.009651691065118626, "grad_norm": 2.6686472444455753, "kl": 0.0556640625, "learning_rate": 9.997701665140094e-07, "loss": 0.0022, "reward": 2.069093704223633, "reward_std": 0.027659865096211433, "rewards/accuracy_reward": 0.8753437995910645, "rewards/format_reward": 1.0, "step": 956 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.03125, "epoch": 0.009661786976274608, "grad_norm": 4.535113718290894, "kl": 0.0556640625, "learning_rate": 9.997696854762764e-07, "loss": 0.0022, "reward": 2.1370937824249268, "reward_std": 0.011078841052949429, "rewards/accuracy_reward": 0.9370937943458557, "rewards/format_reward": 1.0, "step": 957 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.03125, "epoch": 0.00967188288743059, "grad_norm": 1.6402130318477708, "kl": 0.054443359375, "learning_rate": 9.997692039357837e-07, "loss": 0.0022, "reward": 2.050062417984009, "reward_std": 0.02518017590045929, "rewards/accuracy_reward": 0.8500624895095825, "rewards/format_reward": 1.0, "step": 958 }, { "all_correct": 0.0, "all_wrong": 0.75, "completion_length": 425.21875, "epoch": 0.009681978798586573, "grad_norm": 0.9612217668184577, "kl": 0.0380859375, "learning_rate": 9.997687218925314e-07, "loss": 0.0015, "reward": 1.2625000476837158, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 1.0, "step": 959 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.9375, "epoch": 0.009692074709742555, "grad_norm": 4.854756350792371, "kl": 0.055908203125, "learning_rate": 9.997682393465204e-07, "loss": 0.0022, "reward": 2.090531349182129, "reward_std": 0.01142137125134468, "rewards/accuracy_reward": 0.8905313014984131, "rewards/format_reward": 1.0, "step": 960 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.009702170620898535, "grad_norm": 3.3829693565423598, "kl": 0.048583984375, "learning_rate": 9.997677562977504e-07, "loss": 0.0019, "reward": 1.996250033378601, "reward_std": 0.16485801339149475, "rewards/accuracy_reward": 0.82750004529953, "rewards/format_reward": 1.0, "step": 961 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.03125, "epoch": 0.009712266532054517, "grad_norm": 10.020282449620844, "kl": 0.047607421875, "learning_rate": 9.997672727462227e-07, "loss": 0.0019, "reward": 2.0953750610351562, "reward_std": 0.04625902324914932, "rewards/accuracy_reward": 0.9016250371932983, "rewards/format_reward": 1.0, "step": 962 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.5625, "epoch": 0.0097223624432105, "grad_norm": 2.417639528358192, "kl": 0.054443359375, "learning_rate": 9.997667886919374e-07, "loss": 0.0022, "reward": 2.0590624809265137, "reward_std": 0.0759117603302002, "rewards/accuracy_reward": 0.8840624690055847, "rewards/format_reward": 1.0, "step": 963 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.90625, "epoch": 0.009732458354366482, "grad_norm": 2.057798458055306, "kl": 0.06005859375, "learning_rate": 9.99766304134895e-07, "loss": 0.0024, "reward": 1.9119594097137451, "reward_std": 0.041536882519721985, "rewards/accuracy_reward": 0.7119593620300293, "rewards/format_reward": 1.0, "step": 964 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.46875, "epoch": 0.009742554265522464, "grad_norm": 1.928059327162116, "kl": 0.05126953125, "learning_rate": 9.99765819075096e-07, "loss": 0.002, "reward": 1.797562599182129, "reward_std": 0.03990975394845009, "rewards/accuracy_reward": 0.6475625038146973, "rewards/format_reward": 1.0, "step": 965 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.25, "epoch": 0.009752650176678446, "grad_norm": 2.1930730382623103, "kl": 0.057861328125, "learning_rate": 9.99765333512541e-07, "loss": 0.0023, "reward": 1.9202500581741333, "reward_std": 0.20106805860996246, "rewards/accuracy_reward": 0.7515000104904175, "rewards/format_reward": 1.0, "step": 966 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.009762746087834426, "grad_norm": 4.030345491537256, "kl": 0.048828125, "learning_rate": 9.997648474472302e-07, "loss": 0.002, "reward": 2.1703438758850098, "reward_std": 0.00809373427182436, "rewards/accuracy_reward": 0.9703437089920044, "rewards/format_reward": 1.0, "step": 967 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.125, "epoch": 0.009772841998990409, "grad_norm": 3.4556756367889023, "kl": 0.034423828125, "learning_rate": 9.997643608791643e-07, "loss": 0.0014, "reward": 1.8301875591278076, "reward_std": 0.05761440098285675, "rewards/accuracy_reward": 0.6926875114440918, "rewards/format_reward": 1.0, "step": 968 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.59375, "epoch": 0.00978293791014639, "grad_norm": 8.51772608999231, "kl": 0.056640625, "learning_rate": 9.99763873808344e-07, "loss": 0.0023, "reward": 1.8722189664840698, "reward_std": 0.15943707525730133, "rewards/accuracy_reward": 0.7159687876701355, "rewards/format_reward": 1.0, "step": 969 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.125, "epoch": 0.009793033821302373, "grad_norm": 2.145414620928799, "kl": 0.0498046875, "learning_rate": 9.997633862347695e-07, "loss": 0.002, "reward": 1.7206876277923584, "reward_std": 0.15233366191387177, "rewards/accuracy_reward": 0.583187460899353, "rewards/format_reward": 1.0, "step": 970 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.84375, "epoch": 0.009803129732458355, "grad_norm": 1.8226753115614929, "kl": 0.061767578125, "learning_rate": 9.997628981584414e-07, "loss": 0.0025, "reward": 1.9569687843322754, "reward_std": 0.02007569558918476, "rewards/accuracy_reward": 0.7569687366485596, "rewards/format_reward": 1.0, "step": 971 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.15625, "epoch": 0.009813225643614336, "grad_norm": 1.5373553121552326, "kl": 0.0546875, "learning_rate": 9.9976240957936e-07, "loss": 0.0022, "reward": 1.794281244277954, "reward_std": 0.015348419547080994, "rewards/accuracy_reward": 0.644281268119812, "rewards/format_reward": 1.0, "step": 972 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.3125, "epoch": 0.009823321554770318, "grad_norm": 2.6194502722723554, "kl": 0.054931640625, "learning_rate": 9.99761920497526e-07, "loss": 0.0022, "reward": 2.02315616607666, "reward_std": 0.17349904775619507, "rewards/accuracy_reward": 0.8419061899185181, "rewards/format_reward": 1.0, "step": 973 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.90625, "epoch": 0.0098334174659263, "grad_norm": 5.308859864475841, "kl": 0.0517578125, "learning_rate": 9.9976143091294e-07, "loss": 0.0021, "reward": 1.8197813034057617, "reward_std": 0.02622481808066368, "rewards/accuracy_reward": 0.6697812080383301, "rewards/format_reward": 1.0, "step": 974 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.625, "epoch": 0.009843513377082282, "grad_norm": 2.151841228886061, "kl": 0.06103515625, "learning_rate": 9.997609408256023e-07, "loss": 0.0024, "reward": 2.148562431335449, "reward_std": 0.03071713075041771, "rewards/accuracy_reward": 0.9548125267028809, "rewards/format_reward": 1.0, "step": 975 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.875, "epoch": 0.009853609288238264, "grad_norm": 1.96938603077192, "kl": 0.056396484375, "learning_rate": 9.997604502355133e-07, "loss": 0.0023, "reward": 2.074031114578247, "reward_std": 0.023630715906620026, "rewards/accuracy_reward": 0.8740312457084656, "rewards/format_reward": 1.0, "step": 976 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 440.125, "epoch": 0.009863705199394245, "grad_norm": 2.1572727812300796, "kl": 0.049560546875, "learning_rate": 9.997599591426734e-07, "loss": 0.002, "reward": 1.7907500267028809, "reward_std": 0.12280425429344177, "rewards/accuracy_reward": 0.653249979019165, "rewards/format_reward": 1.0, "step": 977 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.3125, "epoch": 0.009873801110550227, "grad_norm": 4.087430718660842, "kl": 0.06396484375, "learning_rate": 9.997594675470837e-07, "loss": 0.0026, "reward": 2.062000036239624, "reward_std": 0.048345256596803665, "rewards/accuracy_reward": 0.8682499527931213, "rewards/format_reward": 1.0, "step": 978 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.5, "epoch": 0.009883897021706209, "grad_norm": 1.9062750874227639, "kl": 0.06201171875, "learning_rate": 9.997589754487442e-07, "loss": 0.0025, "reward": 2.1643126010894775, "reward_std": 0.0178428515791893, "rewards/accuracy_reward": 0.9643124938011169, "rewards/format_reward": 1.0, "step": 979 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.34375, "epoch": 0.009893992932862191, "grad_norm": 1.6444836588534537, "kl": 0.0517578125, "learning_rate": 9.997584828476554e-07, "loss": 0.0021, "reward": 1.884124994277954, "reward_std": 0.10040219128131866, "rewards/accuracy_reward": 0.734125018119812, "rewards/format_reward": 1.0, "step": 980 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 436.90625, "epoch": 0.009904088844018173, "grad_norm": 1.2942986300056134, "kl": 0.046875, "learning_rate": 9.99757989743818e-07, "loss": 0.0019, "reward": 1.760218858718872, "reward_std": 0.1735367327928543, "rewards/accuracy_reward": 0.6352187395095825, "rewards/format_reward": 1.0, "step": 981 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 425.78125, "epoch": 0.009914184755174154, "grad_norm": 1.9633208272591571, "kl": 0.06396484375, "learning_rate": 9.997574961372324e-07, "loss": 0.0026, "reward": 1.711593747138977, "reward_std": 0.04220384359359741, "rewards/accuracy_reward": 0.5678437948226929, "rewards/format_reward": 1.0, "step": 982 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.21875, "epoch": 0.009924280666330136, "grad_norm": 1.9350896654772571, "kl": 0.0546875, "learning_rate": 9.99757002027899e-07, "loss": 0.0022, "reward": 2.1040310859680176, "reward_std": 0.02122448943555355, "rewards/accuracy_reward": 0.9040312170982361, "rewards/format_reward": 1.0, "step": 983 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.0625, "epoch": 0.009934376577486118, "grad_norm": 2.1652349900451457, "kl": 0.053466796875, "learning_rate": 9.997565074158185e-07, "loss": 0.0021, "reward": 2.105593681335449, "reward_std": 0.031690362840890884, "rewards/accuracy_reward": 0.9118437767028809, "rewards/format_reward": 1.0, "step": 984 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.375, "epoch": 0.0099444724886421, "grad_norm": 2.5040421766431504, "kl": 0.0556640625, "learning_rate": 9.99756012300991e-07, "loss": 0.0022, "reward": 1.8513126373291016, "reward_std": 0.19162538647651672, "rewards/accuracy_reward": 0.7013125419616699, "rewards/format_reward": 1.0, "step": 985 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.625, "epoch": 0.009954568399798082, "grad_norm": 2.103180927587535, "kl": 0.052001953125, "learning_rate": 9.997555166834177e-07, "loss": 0.0021, "reward": 1.9310625791549683, "reward_std": 0.19653166830539703, "rewards/accuracy_reward": 0.7623125314712524, "rewards/format_reward": 1.0, "step": 986 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.4375, "epoch": 0.009964664310954064, "grad_norm": 2.0799948488077424, "kl": 0.055419921875, "learning_rate": 9.997550205630984e-07, "loss": 0.0022, "reward": 1.8023126125335693, "reward_std": 0.022043148055672646, "rewards/accuracy_reward": 0.6523125171661377, "rewards/format_reward": 1.0, "step": 987 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 420.25, "epoch": 0.009974760222110045, "grad_norm": 2.023853669934376, "kl": 0.058349609375, "learning_rate": 9.99754523940034e-07, "loss": 0.0023, "reward": 1.8484375476837158, "reward_std": 0.02682449482381344, "rewards/accuracy_reward": 0.6984374523162842, "rewards/format_reward": 1.0, "step": 988 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 438.65625, "epoch": 0.009984856133266027, "grad_norm": 2.111227520505629, "kl": 0.05322265625, "learning_rate": 9.997540268142249e-07, "loss": 0.0021, "reward": 1.5134999752044678, "reward_std": 0.013160059228539467, "rewards/accuracy_reward": 0.41350001096725464, "rewards/format_reward": 1.0, "step": 989 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.3125, "epoch": 0.009994952044422009, "grad_norm": 2.6387591523420135, "kl": 0.06591796875, "learning_rate": 9.997535291856713e-07, "loss": 0.0026, "reward": 1.9561876058578491, "reward_std": 0.033555276691913605, "rewards/accuracy_reward": 0.7624375224113464, "rewards/format_reward": 1.0, "step": 990 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.65625, "epoch": 0.010005047955577991, "grad_norm": 3.640957325465327, "kl": 0.0673828125, "learning_rate": 9.997530310543744e-07, "loss": 0.0027, "reward": 2.0639686584472656, "reward_std": 0.02422749064862728, "rewards/accuracy_reward": 0.8639687299728394, "rewards/format_reward": 1.0, "step": 991 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.34375, "epoch": 0.010015143866733973, "grad_norm": 2.6919399778999415, "kl": 0.06396484375, "learning_rate": 9.997525324203339e-07, "loss": 0.0026, "reward": 2.101531505584717, "reward_std": 0.04684119671583176, "rewards/accuracy_reward": 0.9140312671661377, "rewards/format_reward": 1.0, "step": 992 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.15625, "epoch": 0.010025239777889954, "grad_norm": 4.371037142004815, "kl": 0.06005859375, "learning_rate": 9.997520332835507e-07, "loss": 0.0024, "reward": 1.8511874675750732, "reward_std": 0.10854290425777435, "rewards/accuracy_reward": 0.7011875510215759, "rewards/format_reward": 1.0, "step": 993 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.9375, "epoch": 0.010035335689045936, "grad_norm": 2.089817008852679, "kl": 0.0771484375, "learning_rate": 9.997515336440256e-07, "loss": 0.0031, "reward": 2.029656410217285, "reward_std": 0.019295968115329742, "rewards/accuracy_reward": 0.8296562433242798, "rewards/format_reward": 1.0, "step": 994 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.28125, "epoch": 0.010045431600201918, "grad_norm": 1.8831946182639734, "kl": 0.050048828125, "learning_rate": 9.997510335017587e-07, "loss": 0.002, "reward": 1.8863751888275146, "reward_std": 0.12467120587825775, "rewards/accuracy_reward": 0.7426249980926514, "rewards/format_reward": 1.0, "step": 995 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 424.25, "epoch": 0.0100555275113579, "grad_norm": 1.6530896176208452, "kl": 0.0703125, "learning_rate": 9.997505328567504e-07, "loss": 0.0028, "reward": 1.7680312395095825, "reward_std": 0.02194797620177269, "rewards/accuracy_reward": 0.6180312633514404, "rewards/format_reward": 1.0, "step": 996 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.65625, "epoch": 0.010065623422513882, "grad_norm": 2.0195511913497066, "kl": 0.064453125, "learning_rate": 9.997500317090016e-07, "loss": 0.0026, "reward": 1.9364376068115234, "reward_std": 0.04032362625002861, "rewards/accuracy_reward": 0.742687463760376, "rewards/format_reward": 1.0, "step": 997 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.010075719333669863, "grad_norm": 2.2614856672148944, "kl": 0.0751953125, "learning_rate": 9.997495300585124e-07, "loss": 0.003, "reward": 2.009093761444092, "reward_std": 0.024159850552678108, "rewards/accuracy_reward": 0.8090937733650208, "rewards/format_reward": 1.0, "step": 998 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.010085815244825845, "grad_norm": 2.2293130545807056, "kl": 0.059326171875, "learning_rate": 9.997490279052836e-07, "loss": 0.0024, "reward": 1.9506561756134033, "reward_std": 0.1803741455078125, "rewards/accuracy_reward": 0.7756562829017639, "rewards/format_reward": 1.0, "step": 999 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.71875, "epoch": 0.010095911155981827, "grad_norm": 3.9419263262462017, "kl": 0.0703125, "learning_rate": 9.997485252493156e-07, "loss": 0.0028, "reward": 2.067312240600586, "reward_std": 0.033549271523952484, "rewards/accuracy_reward": 0.8673125505447388, "rewards/format_reward": 1.0, "step": 1000 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.15625, "epoch": 0.01010600706713781, "grad_norm": 1.8177418032982533, "kl": 0.058349609375, "learning_rate": 9.997480220906089e-07, "loss": 0.0023, "reward": 1.9240624904632568, "reward_std": 0.1714363694190979, "rewards/accuracy_reward": 0.7553125023841858, "rewards/format_reward": 1.0, "step": 1001 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.010116102978293792, "grad_norm": 1.395922547607847, "kl": 0.043212890625, "learning_rate": 9.99747518429164e-07, "loss": 0.0017, "reward": 1.9031250476837158, "reward_std": 0.1513730138540268, "rewards/accuracy_reward": 0.7406250238418579, "rewards/format_reward": 1.0, "step": 1002 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.875, "epoch": 0.010126198889449774, "grad_norm": 5.705038559961498, "kl": 0.0732421875, "learning_rate": 9.997470142649816e-07, "loss": 0.0029, "reward": 2.067578077316284, "reward_std": 0.04160212352871895, "rewards/accuracy_reward": 0.873828113079071, "rewards/format_reward": 1.0, "step": 1003 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.875, "epoch": 0.010136294800605754, "grad_norm": 1.9153399655257646, "kl": 0.056884765625, "learning_rate": 9.997465095980618e-07, "loss": 0.0023, "reward": 1.8131719827651978, "reward_std": 0.024223053827881813, "rewards/accuracy_reward": 0.6631718277931213, "rewards/format_reward": 1.0, "step": 1004 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 433.34375, "epoch": 0.010146390711761736, "grad_norm": 1.165958014281874, "kl": 0.031494140625, "learning_rate": 9.997460044284055e-07, "loss": 0.0013, "reward": 1.6312499046325684, "reward_std": 0.2913779020309448, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1005 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.28125, "epoch": 0.010156486622917718, "grad_norm": 4.484139160855573, "kl": 0.07177734375, "learning_rate": 9.99745498756013e-07, "loss": 0.0029, "reward": 1.8581563234329224, "reward_std": 0.014882848598062992, "rewards/accuracy_reward": 0.7081562280654907, "rewards/format_reward": 1.0, "step": 1006 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.1875, "epoch": 0.0101665825340737, "grad_norm": 2.2797656180128203, "kl": 0.06591796875, "learning_rate": 9.997449925808847e-07, "loss": 0.0026, "reward": 2.100250005722046, "reward_std": 0.04276353865861893, "rewards/accuracy_reward": 0.9127500057220459, "rewards/format_reward": 1.0, "step": 1007 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.90625, "epoch": 0.010176678445229683, "grad_norm": 7.747778430494908, "kl": 0.06494140625, "learning_rate": 9.997444859030214e-07, "loss": 0.0026, "reward": 2.1174373626708984, "reward_std": 0.015153594315052032, "rewards/accuracy_reward": 0.9174374938011169, "rewards/format_reward": 1.0, "step": 1008 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.15625, "epoch": 0.010186774356385663, "grad_norm": 2.617460893655679, "kl": 0.058837890625, "learning_rate": 9.997439787224236e-07, "loss": 0.0024, "reward": 2.152750015258789, "reward_std": 0.029285015538334846, "rewards/accuracy_reward": 0.9589999914169312, "rewards/format_reward": 1.0, "step": 1009 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 375.65625, "epoch": 0.010196870267541645, "grad_norm": 2.384134166288652, "kl": 0.07275390625, "learning_rate": 9.997434710390916e-07, "loss": 0.0029, "reward": 1.9476875066757202, "reward_std": 0.07156717777252197, "rewards/accuracy_reward": 0.7601875066757202, "rewards/format_reward": 1.0, "step": 1010 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 401.21875, "epoch": 0.010206966178697627, "grad_norm": 2.337008201407582, "kl": 0.0673828125, "learning_rate": 9.997429628530258e-07, "loss": 0.0027, "reward": 1.5933749675750732, "reward_std": 0.030534669756889343, "rewards/accuracy_reward": 0.44962501525878906, "rewards/format_reward": 1.0, "step": 1011 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.34375, "epoch": 0.01021706208985361, "grad_norm": 2.893920643488398, "kl": 0.06787109375, "learning_rate": 9.997424541642272e-07, "loss": 0.0027, "reward": 1.96274995803833, "reward_std": 0.031503766775131226, "rewards/accuracy_reward": 0.7689999938011169, "rewards/format_reward": 1.0, "step": 1012 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.28125, "epoch": 0.010227158001009592, "grad_norm": 2.335475093830041, "kl": 0.07080078125, "learning_rate": 9.997419449726958e-07, "loss": 0.0028, "reward": 2.141343593597412, "reward_std": 0.027621503919363022, "rewards/accuracy_reward": 0.9413437247276306, "rewards/format_reward": 1.0, "step": 1013 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.15625, "epoch": 0.010237253912165572, "grad_norm": 1.1092886738824603, "kl": 0.0537109375, "learning_rate": 9.997414352784324e-07, "loss": 0.0022, "reward": 1.8876874446868896, "reward_std": 0.005036931950598955, "rewards/accuracy_reward": 0.7376874685287476, "rewards/format_reward": 1.0, "step": 1014 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 422.15625, "epoch": 0.010247349823321554, "grad_norm": 1.6194746372396898, "kl": 0.0576171875, "learning_rate": 9.997409250814374e-07, "loss": 0.0023, "reward": 1.5827500820159912, "reward_std": 0.007989945821464062, "rewards/accuracy_reward": 0.4827499985694885, "rewards/format_reward": 1.0, "step": 1015 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 427.25, "epoch": 0.010257445734477537, "grad_norm": 0.9462017001823253, "kl": 0.054931640625, "learning_rate": 9.997404143817116e-07, "loss": 0.0022, "reward": 1.8209062814712524, "reward_std": 0.008005502633750439, "rewards/accuracy_reward": 0.6709062457084656, "rewards/format_reward": 1.0, "step": 1016 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.1875, "epoch": 0.010267541645633519, "grad_norm": 2.3484565764596255, "kl": 0.0712890625, "learning_rate": 9.99739903179255e-07, "loss": 0.0028, "reward": 2.045593738555908, "reward_std": 0.021777626127004623, "rewards/accuracy_reward": 0.8455938100814819, "rewards/format_reward": 1.0, "step": 1017 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.5, "epoch": 0.0102776375567895, "grad_norm": 50.59410837168576, "kl": 0.06787109375, "learning_rate": 9.997393914740685e-07, "loss": 0.0027, "reward": 1.9982500076293945, "reward_std": 0.03544563800096512, "rewards/accuracy_reward": 0.8045000433921814, "rewards/format_reward": 1.0, "step": 1018 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 444.25, "epoch": 0.010287733467945481, "grad_norm": 2.485442113249834, "kl": 0.04833984375, "learning_rate": 9.997388792661526e-07, "loss": 0.0019, "reward": 2.0249063968658447, "reward_std": 0.06259691715240479, "rewards/accuracy_reward": 0.849906325340271, "rewards/format_reward": 1.0, "step": 1019 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.010297829379101463, "grad_norm": 2.373039717298499, "kl": 0.0634765625, "learning_rate": 9.997383665555075e-07, "loss": 0.0025, "reward": 2.145343780517578, "reward_std": 0.046093568205833435, "rewards/accuracy_reward": 0.9453437328338623, "rewards/format_reward": 1.0, "step": 1020 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 424.03125, "epoch": 0.010307925290257446, "grad_norm": 2.4089304715323667, "kl": 0.06201171875, "learning_rate": 9.99737853342134e-07, "loss": 0.0025, "reward": 1.788421869277954, "reward_std": 0.029839131981134415, "rewards/accuracy_reward": 0.6446718573570251, "rewards/format_reward": 1.0, "step": 1021 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.59375, "epoch": 0.010318021201413428, "grad_norm": 2.658942731912003, "kl": 0.064453125, "learning_rate": 9.997373396260327e-07, "loss": 0.0026, "reward": 1.9084374904632568, "reward_std": 0.027999596670269966, "rewards/accuracy_reward": 0.7084375023841858, "rewards/format_reward": 1.0, "step": 1022 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.28125, "epoch": 0.01032811711256941, "grad_norm": 2.42512552122032, "kl": 0.056884765625, "learning_rate": 9.997368254072039e-07, "loss": 0.0023, "reward": 1.7765624523162842, "reward_std": 0.17556647956371307, "rewards/accuracy_reward": 0.6203124523162842, "rewards/format_reward": 1.0, "step": 1023 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.375, "epoch": 0.010338213023725392, "grad_norm": 2.0086702955656017, "kl": 0.068359375, "learning_rate": 9.99736310685648e-07, "loss": 0.0027, "reward": 2.0874686241149902, "reward_std": 0.039818521589040756, "rewards/accuracy_reward": 0.8937187790870667, "rewards/format_reward": 1.0, "step": 1024 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.375, "epoch": 0.010348308934881372, "grad_norm": 1.5764606590745704, "kl": 0.0712890625, "learning_rate": 9.997357954613658e-07, "loss": 0.0028, "reward": 2.104968786239624, "reward_std": 0.010702201165258884, "rewards/accuracy_reward": 0.9049687385559082, "rewards/format_reward": 1.0, "step": 1025 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.75, "epoch": 0.010358404846037355, "grad_norm": 2.8372860520475887, "kl": 0.07568359375, "learning_rate": 9.997352797343578e-07, "loss": 0.003, "reward": 2.082437515258789, "reward_std": 0.04537351429462433, "rewards/accuracy_reward": 0.8886874914169312, "rewards/format_reward": 1.0, "step": 1026 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.625, "epoch": 0.010368500757193337, "grad_norm": 1.9348684230427131, "kl": 0.056884765625, "learning_rate": 9.997347635046245e-07, "loss": 0.0023, "reward": 2.1435937881469727, "reward_std": 0.12521390616893768, "rewards/accuracy_reward": 0.9560937881469727, "rewards/format_reward": 1.0, "step": 1027 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.1875, "epoch": 0.010378596668349319, "grad_norm": 8.21816806897499, "kl": 0.045654296875, "learning_rate": 9.99734246772166e-07, "loss": 0.0018, "reward": 2.174656391143799, "reward_std": 0.04236802086234093, "rewards/accuracy_reward": 0.9871562719345093, "rewards/format_reward": 1.0, "step": 1028 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.65625, "epoch": 0.010388692579505301, "grad_norm": 2.3832934616581873, "kl": 0.0576171875, "learning_rate": 9.997337295369835e-07, "loss": 0.0023, "reward": 2.1490626335144043, "reward_std": 0.03189530968666077, "rewards/accuracy_reward": 0.9553124904632568, "rewards/format_reward": 1.0, "step": 1029 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.375, "epoch": 0.010398788490661282, "grad_norm": 5.127334679047149, "kl": 0.061767578125, "learning_rate": 9.997332117990771e-07, "loss": 0.0025, "reward": 2.016437530517578, "reward_std": 0.02175021916627884, "rewards/accuracy_reward": 0.8164374828338623, "rewards/format_reward": 1.0, "step": 1030 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.5, "epoch": 0.010408884401817264, "grad_norm": 2.405349124721101, "kl": 0.0693359375, "learning_rate": 9.997326935584474e-07, "loss": 0.0028, "reward": 2.1195311546325684, "reward_std": 0.03535009175539017, "rewards/accuracy_reward": 0.9257811903953552, "rewards/format_reward": 1.0, "step": 1031 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 433.625, "epoch": 0.010418980312973246, "grad_norm": 1.738281367523508, "kl": 0.05517578125, "learning_rate": 9.997321748150951e-07, "loss": 0.0022, "reward": 1.8479063510894775, "reward_std": 0.023513702675700188, "rewards/accuracy_reward": 0.7041562795639038, "rewards/format_reward": 1.0, "step": 1032 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.9375, "epoch": 0.010429076224129228, "grad_norm": 2.280826484042376, "kl": 0.06787109375, "learning_rate": 9.997316555690203e-07, "loss": 0.0027, "reward": 2.014437437057495, "reward_std": 0.18443317711353302, "rewards/accuracy_reward": 0.8394374847412109, "rewards/format_reward": 1.0, "step": 1033 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.34375, "epoch": 0.01043917213528521, "grad_norm": 2.636340008735747, "kl": 0.07373046875, "learning_rate": 9.99731135820224e-07, "loss": 0.0029, "reward": 2.031437397003174, "reward_std": 0.0363328643143177, "rewards/accuracy_reward": 0.8376874923706055, "rewards/format_reward": 1.0, "step": 1034 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 417.1875, "epoch": 0.01044926804644119, "grad_norm": 2.6770857625748, "kl": 0.0654296875, "learning_rate": 9.997306155687063e-07, "loss": 0.0026, "reward": 1.5789999961853027, "reward_std": 0.03451245278120041, "rewards/accuracy_reward": 0.43525001406669617, "rewards/format_reward": 1.0, "step": 1035 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.375, "epoch": 0.010459363957597173, "grad_norm": 1.7180425205290426, "kl": 0.05078125, "learning_rate": 9.997300948144682e-07, "loss": 0.002, "reward": 1.7570312023162842, "reward_std": 0.01619434356689453, "rewards/accuracy_reward": 0.6070312261581421, "rewards/format_reward": 1.0, "step": 1036 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 426.625, "epoch": 0.010469459868753155, "grad_norm": 4.548143540993093, "kl": 0.06591796875, "learning_rate": 9.997295735575098e-07, "loss": 0.0026, "reward": 1.7448437213897705, "reward_std": 0.01296511571854353, "rewards/accuracy_reward": 0.5948437452316284, "rewards/format_reward": 1.0, "step": 1037 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.0625, "epoch": 0.010479555779909137, "grad_norm": 3.3214980066727704, "kl": 0.06884765625, "learning_rate": 9.99729051797832e-07, "loss": 0.0027, "reward": 2.0721564292907715, "reward_std": 0.040169890969991684, "rewards/accuracy_reward": 0.8846561908721924, "rewards/format_reward": 1.0, "step": 1038 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.1875, "epoch": 0.01048965169106512, "grad_norm": 2.2405865358507144, "kl": 0.062255859375, "learning_rate": 9.997285295354347e-07, "loss": 0.0025, "reward": 2.015531301498413, "reward_std": 0.16789986193180084, "rewards/accuracy_reward": 0.8405312299728394, "rewards/format_reward": 1.0, "step": 1039 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 424.0, "epoch": 0.0104997476022211, "grad_norm": 0.1059823618958552, "kl": 0.052978515625, "learning_rate": 9.99728006770319e-07, "loss": 0.0021, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1040 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.75, "epoch": 0.010509843513377082, "grad_norm": 2.7762286070196933, "kl": 0.062255859375, "learning_rate": 9.997274835024853e-07, "loss": 0.0025, "reward": 1.8150312900543213, "reward_std": 0.012834074907004833, "rewards/accuracy_reward": 0.6650312542915344, "rewards/format_reward": 1.0, "step": 1041 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.625, "epoch": 0.010519939424533064, "grad_norm": 4.463826017566376, "kl": 0.06494140625, "learning_rate": 9.99726959731934e-07, "loss": 0.0026, "reward": 2.113093614578247, "reward_std": 0.012695521116256714, "rewards/accuracy_reward": 0.9130937457084656, "rewards/format_reward": 1.0, "step": 1042 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.75, "epoch": 0.010530035335689046, "grad_norm": 1.1858056271629123, "kl": 0.043212890625, "learning_rate": 9.997264354586658e-07, "loss": 0.0017, "reward": 1.88490629196167, "reward_std": 0.02270304597914219, "rewards/accuracy_reward": 0.7349061965942383, "rewards/format_reward": 1.0, "step": 1043 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 419.9375, "epoch": 0.010540131246845028, "grad_norm": 2.692679587032961, "kl": 0.050537109375, "learning_rate": 9.997259106826811e-07, "loss": 0.002, "reward": 1.8651562929153442, "reward_std": 0.014491287991404533, "rewards/accuracy_reward": 0.7151562571525574, "rewards/format_reward": 1.0, "step": 1044 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 429.53125, "epoch": 0.01055022715800101, "grad_norm": 2.051064655773259, "kl": 0.05224609375, "learning_rate": 9.997253854039805e-07, "loss": 0.0021, "reward": 1.8679375648498535, "reward_std": 0.031231097877025604, "rewards/accuracy_reward": 0.7304375171661377, "rewards/format_reward": 1.0, "step": 1045 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.9375, "epoch": 0.01056032306915699, "grad_norm": 2.2046602648008933, "kl": 0.05859375, "learning_rate": 9.997248596225646e-07, "loss": 0.0024, "reward": 2.133718967437744, "reward_std": 0.046141598373651505, "rewards/accuracy_reward": 0.946218729019165, "rewards/format_reward": 1.0, "step": 1046 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 422.125, "epoch": 0.010570418980312973, "grad_norm": 4.061187610908673, "kl": 0.062255859375, "learning_rate": 9.997243333384335e-07, "loss": 0.0025, "reward": 1.815093755722046, "reward_std": 0.02887376956641674, "rewards/accuracy_reward": 0.6713437438011169, "rewards/format_reward": 1.0, "step": 1047 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.65625, "epoch": 0.010580514891468955, "grad_norm": 2.39210696362523, "kl": 0.06298828125, "learning_rate": 9.997238065515882e-07, "loss": 0.0025, "reward": 2.122093677520752, "reward_std": 0.018251553177833557, "rewards/accuracy_reward": 0.9220937490463257, "rewards/format_reward": 1.0, "step": 1048 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.28125, "epoch": 0.010590610802624937, "grad_norm": 4.406816340200238, "kl": 0.062255859375, "learning_rate": 9.997232792620292e-07, "loss": 0.0025, "reward": 1.8793751001358032, "reward_std": 0.04885557293891907, "rewards/accuracy_reward": 0.6856250166893005, "rewards/format_reward": 1.0, "step": 1049 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.5, "epoch": 0.01060070671378092, "grad_norm": 1.8939299467339425, "kl": 0.0654296875, "learning_rate": 9.997227514697566e-07, "loss": 0.0026, "reward": 1.7661561965942383, "reward_std": 0.018088113516569138, "rewards/accuracy_reward": 0.616156280040741, "rewards/format_reward": 1.0, "step": 1050 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 425.25, "epoch": 0.0106108026249369, "grad_norm": 1.274359624473157, "kl": 0.05419921875, "learning_rate": 9.997222231747715e-07, "loss": 0.0022, "reward": 1.5783438682556152, "reward_std": 0.021635007113218307, "rewards/accuracy_reward": 0.48459377884864807, "rewards/format_reward": 1.0, "step": 1051 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.8125, "epoch": 0.010620898536092882, "grad_norm": 1.6956399098811108, "kl": 0.060546875, "learning_rate": 9.997216943770741e-07, "loss": 0.0024, "reward": 1.9177188873291016, "reward_std": 0.026088416576385498, "rewards/accuracy_reward": 0.717718780040741, "rewards/format_reward": 1.0, "step": 1052 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 397.0, "epoch": 0.010630994447248864, "grad_norm": 2.2756813393929005, "kl": 0.05224609375, "learning_rate": 9.997211650766651e-07, "loss": 0.0021, "reward": 1.7617499828338623, "reward_std": 0.12825649976730347, "rewards/accuracy_reward": 0.6179999709129333, "rewards/format_reward": 1.0, "step": 1053 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.09375, "epoch": 0.010641090358404846, "grad_norm": 10.241446067053497, "kl": 0.05126953125, "learning_rate": 9.997206352735448e-07, "loss": 0.0021, "reward": 2.007718801498413, "reward_std": 0.1678062528371811, "rewards/accuracy_reward": 0.8264687657356262, "rewards/format_reward": 1.0, "step": 1054 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.28125, "epoch": 0.010651186269560829, "grad_norm": 2.140584350445316, "kl": 0.058837890625, "learning_rate": 9.99720104967714e-07, "loss": 0.0024, "reward": 1.817406415939331, "reward_std": 0.02284877374768257, "rewards/accuracy_reward": 0.6674062013626099, "rewards/format_reward": 1.0, "step": 1055 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.46875, "epoch": 0.010661282180716809, "grad_norm": 1.890719984647997, "kl": 0.0576171875, "learning_rate": 9.997195741591732e-07, "loss": 0.0023, "reward": 1.8098437786102295, "reward_std": 0.02320878952741623, "rewards/accuracy_reward": 0.6598438024520874, "rewards/format_reward": 1.0, "step": 1056 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.46875, "epoch": 0.010671378091872791, "grad_norm": 1.4510771434916871, "kl": 0.05224609375, "learning_rate": 9.997190428479225e-07, "loss": 0.0021, "reward": 2.0377187728881836, "reward_std": 0.024558888748288155, "rewards/accuracy_reward": 0.8439687490463257, "rewards/format_reward": 1.0, "step": 1057 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.59375, "epoch": 0.010681474003028773, "grad_norm": 3.9648188445616492, "kl": 0.058837890625, "learning_rate": 9.99718511033963e-07, "loss": 0.0024, "reward": 2.0195000171661377, "reward_std": 0.13117118179798126, "rewards/accuracy_reward": 0.8257499933242798, "rewards/format_reward": 1.0, "step": 1058 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.0625, "epoch": 0.010691569914184755, "grad_norm": 3.1479502876904673, "kl": 0.056884765625, "learning_rate": 9.99717978717295e-07, "loss": 0.0023, "reward": 1.7968437671661377, "reward_std": 0.2528594136238098, "rewards/accuracy_reward": 0.6655937433242798, "rewards/format_reward": 1.0, "step": 1059 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.0, "epoch": 0.010701665825340738, "grad_norm": 2.1359732705528804, "kl": 0.0654296875, "learning_rate": 9.997174458979189e-07, "loss": 0.0026, "reward": 2.075437545776367, "reward_std": 0.02729148231446743, "rewards/accuracy_reward": 0.8754374980926514, "rewards/format_reward": 1.0, "step": 1060 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 387.21875, "epoch": 0.010711761736496718, "grad_norm": 1.3966336725423505, "kl": 0.04296875, "learning_rate": 9.997169125758353e-07, "loss": 0.0017, "reward": 1.8496249914169312, "reward_std": 0.010642802342772484, "rewards/accuracy_reward": 0.6996250152587891, "rewards/format_reward": 1.0, "step": 1061 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 370.5625, "epoch": 0.0107218576476527, "grad_norm": 1.561939797723383, "kl": 0.04638671875, "learning_rate": 9.997163787510451e-07, "loss": 0.0019, "reward": 1.8714687824249268, "reward_std": 0.03068048134446144, "rewards/accuracy_reward": 0.7339687347412109, "rewards/format_reward": 1.0, "step": 1062 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 382.5, "epoch": 0.010731953558808682, "grad_norm": 1.5493755987901083, "kl": 0.043212890625, "learning_rate": 9.997158444235485e-07, "loss": 0.0017, "reward": 1.3646249771118164, "reward_std": 0.14473296701908112, "rewards/accuracy_reward": 0.3021249771118164, "rewards/format_reward": 1.0, "step": 1063 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.46875, "epoch": 0.010742049469964664, "grad_norm": 1.4224248233025443, "kl": 0.061767578125, "learning_rate": 9.99715309593346e-07, "loss": 0.0025, "reward": 2.150624990463257, "reward_std": 0.011761946603655815, "rewards/accuracy_reward": 0.950624942779541, "rewards/format_reward": 1.0, "step": 1064 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 371.28125, "epoch": 0.010752145381120647, "grad_norm": 2.1721895695416773, "kl": 0.056396484375, "learning_rate": 9.997147742604382e-07, "loss": 0.0023, "reward": 2.0575313568115234, "reward_std": 0.17302070558071136, "rewards/accuracy_reward": 0.8825312256813049, "rewards/format_reward": 1.0, "step": 1065 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 381.875, "epoch": 0.010762241292276629, "grad_norm": 1.5234404446161542, "kl": 0.055419921875, "learning_rate": 9.997142384248257e-07, "loss": 0.0022, "reward": 1.8539376258850098, "reward_std": 0.010434264317154884, "rewards/accuracy_reward": 0.7039375305175781, "rewards/format_reward": 1.0, "step": 1066 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.40625, "epoch": 0.01077233720343261, "grad_norm": 2.605609253806264, "kl": 0.06640625, "learning_rate": 9.99713702086509e-07, "loss": 0.0027, "reward": 2.11328125, "reward_std": 0.0288308747112751, "rewards/accuracy_reward": 0.9132812023162842, "rewards/format_reward": 1.0, "step": 1067 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.40625, "epoch": 0.010782433114588591, "grad_norm": 1.7812348440809347, "kl": 0.06298828125, "learning_rate": 9.997131652454887e-07, "loss": 0.0025, "reward": 2.1514687538146973, "reward_std": 0.027175545692443848, "rewards/accuracy_reward": 0.9577187299728394, "rewards/format_reward": 1.0, "step": 1068 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.53125, "epoch": 0.010792529025744573, "grad_norm": 3.7554537033168316, "kl": 0.0498046875, "learning_rate": 9.997126279017654e-07, "loss": 0.002, "reward": 1.7802188396453857, "reward_std": 0.029977109283208847, "rewards/accuracy_reward": 0.636468768119812, "rewards/format_reward": 1.0, "step": 1069 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.90625, "epoch": 0.010802624936900556, "grad_norm": 1.996054562291082, "kl": 0.068359375, "learning_rate": 9.997120900553394e-07, "loss": 0.0027, "reward": 2.093062400817871, "reward_std": 0.03707395866513252, "rewards/accuracy_reward": 0.8930624723434448, "rewards/format_reward": 1.0, "step": 1070 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.25, "epoch": 0.010812720848056538, "grad_norm": 1.7962721635163332, "kl": 0.03662109375, "learning_rate": 9.997115517062112e-07, "loss": 0.0015, "reward": 1.8274376392364502, "reward_std": 0.043551329523324966, "rewards/accuracy_reward": 0.6899374723434448, "rewards/format_reward": 1.0, "step": 1071 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.625, "epoch": 0.010822816759212518, "grad_norm": 2.6065223304400327, "kl": 0.062255859375, "learning_rate": 9.997110128543817e-07, "loss": 0.0025, "reward": 1.7483124732971191, "reward_std": 0.02569950371980667, "rewards/accuracy_reward": 0.598312497138977, "rewards/format_reward": 1.0, "step": 1072 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.75, "epoch": 0.0108329126703685, "grad_norm": 1.4356715435042824, "kl": 0.059814453125, "learning_rate": 9.997104734998512e-07, "loss": 0.0024, "reward": 2.1649062633514404, "reward_std": 0.01230381615459919, "rewards/accuracy_reward": 0.9649062156677246, "rewards/format_reward": 1.0, "step": 1073 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.40625, "epoch": 0.010843008581524483, "grad_norm": 3.2765074912886067, "kl": 0.060791015625, "learning_rate": 9.997099336426203e-07, "loss": 0.0024, "reward": 2.0775623321533203, "reward_std": 0.02683315798640251, "rewards/accuracy_reward": 0.8775624632835388, "rewards/format_reward": 1.0, "step": 1074 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.8125, "epoch": 0.010853104492680465, "grad_norm": 2.2830718301958495, "kl": 0.068359375, "learning_rate": 9.997093932826894e-07, "loss": 0.0027, "reward": 2.063406229019165, "reward_std": 0.03946654871106148, "rewards/accuracy_reward": 0.8696563243865967, "rewards/format_reward": 1.0, "step": 1075 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.21875, "epoch": 0.010863200403836447, "grad_norm": 2.4804695556606986, "kl": 0.058349609375, "learning_rate": 9.997088524200594e-07, "loss": 0.0023, "reward": 2.100375175476074, "reward_std": 0.12873885035514832, "rewards/accuracy_reward": 0.9253749847412109, "rewards/format_reward": 1.0, "step": 1076 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.4375, "epoch": 0.010873296314992427, "grad_norm": 2.1687690560826485, "kl": 0.06103515625, "learning_rate": 9.997083110547305e-07, "loss": 0.0024, "reward": 2.0793750286102295, "reward_std": 0.04513493925333023, "rewards/accuracy_reward": 0.8856250047683716, "rewards/format_reward": 1.0, "step": 1077 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.65625, "epoch": 0.01088339222614841, "grad_norm": 1.7673356320756175, "kl": 0.053955078125, "learning_rate": 9.997077691867033e-07, "loss": 0.0022, "reward": 2.1430625915527344, "reward_std": 0.04000706598162651, "rewards/accuracy_reward": 0.9555625319480896, "rewards/format_reward": 1.0, "step": 1078 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 404.4375, "epoch": 0.010893488137304392, "grad_norm": 2.8600796002506637, "kl": 0.05517578125, "learning_rate": 9.997072268159785e-07, "loss": 0.0022, "reward": 1.4469375610351562, "reward_std": 0.017521033063530922, "rewards/accuracy_reward": 0.34693747758865356, "rewards/format_reward": 1.0, "step": 1079 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.65625, "epoch": 0.010903584048460374, "grad_norm": 2.428509692853485, "kl": 0.0673828125, "learning_rate": 9.997066839425566e-07, "loss": 0.0027, "reward": 2.1129374504089355, "reward_std": 0.04916621744632721, "rewards/accuracy_reward": 0.9191875457763672, "rewards/format_reward": 1.0, "step": 1080 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.3125, "epoch": 0.010913679959616356, "grad_norm": 1.951598813737259, "kl": 0.054443359375, "learning_rate": 9.99706140566438e-07, "loss": 0.0022, "reward": 1.8638125658035278, "reward_std": 0.027878202497959137, "rewards/accuracy_reward": 0.7200624942779541, "rewards/format_reward": 1.0, "step": 1081 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.0625, "epoch": 0.010923775870772338, "grad_norm": 2.867520650632193, "kl": 0.06591796875, "learning_rate": 9.997055966876236e-07, "loss": 0.0026, "reward": 2.1077189445495605, "reward_std": 0.03181684762239456, "rewards/accuracy_reward": 0.9139686822891235, "rewards/format_reward": 1.0, "step": 1082 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.5, "epoch": 0.010933871781928318, "grad_norm": 1.428077121155995, "kl": 0.061279296875, "learning_rate": 9.997050523061135e-07, "loss": 0.0025, "reward": 2.0923123359680176, "reward_std": 0.016804035753011703, "rewards/accuracy_reward": 0.8923125267028809, "rewards/format_reward": 1.0, "step": 1083 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.9375, "epoch": 0.0109439676930843, "grad_norm": 2.1904343214077, "kl": 0.06396484375, "learning_rate": 9.997045074219085e-07, "loss": 0.0026, "reward": 2.118906259536743, "reward_std": 0.03659765422344208, "rewards/accuracy_reward": 0.9251562356948853, "rewards/format_reward": 1.0, "step": 1084 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 420.46875, "epoch": 0.010954063604240283, "grad_norm": 2.4094623896158622, "kl": 0.05224609375, "learning_rate": 9.99703962035009e-07, "loss": 0.0021, "reward": 1.7045625448226929, "reward_std": 0.15402284264564514, "rewards/accuracy_reward": 0.573312520980835, "rewards/format_reward": 1.0, "step": 1085 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.78125, "epoch": 0.010964159515396265, "grad_norm": 1.8529067151905974, "kl": 0.040771484375, "learning_rate": 9.997034161454157e-07, "loss": 0.0016, "reward": 2.1690001487731934, "reward_std": 0.009783023037016392, "rewards/accuracy_reward": 0.9690000414848328, "rewards/format_reward": 1.0, "step": 1086 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.09375, "epoch": 0.010974255426552247, "grad_norm": 2.937677227335512, "kl": 0.055908203125, "learning_rate": 9.99702869753129e-07, "loss": 0.0022, "reward": 2.0578126907348633, "reward_std": 0.029766233637928963, "rewards/accuracy_reward": 0.864062488079071, "rewards/format_reward": 1.0, "step": 1087 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 431.84375, "epoch": 0.010984351337708228, "grad_norm": 4.7733990406253515, "kl": 0.04541015625, "learning_rate": 9.997023228581497e-07, "loss": 0.0018, "reward": 2.146843671798706, "reward_std": 0.029211651533842087, "rewards/accuracy_reward": 0.9530936479568481, "rewards/format_reward": 1.0, "step": 1088 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.5625, "epoch": 0.01099444724886421, "grad_norm": 1.8781827204492292, "kl": 0.06689453125, "learning_rate": 9.997017754604782e-07, "loss": 0.0027, "reward": 2.093031167984009, "reward_std": 0.018142158165574074, "rewards/accuracy_reward": 0.8930312395095825, "rewards/format_reward": 1.0, "step": 1089 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 417.375, "epoch": 0.011004543160020192, "grad_norm": 2.752580716578766, "kl": 0.0595703125, "learning_rate": 9.99701227560115e-07, "loss": 0.0024, "reward": 1.7817500829696655, "reward_std": 0.020694710314273834, "rewards/accuracy_reward": 0.6317499876022339, "rewards/format_reward": 1.0, "step": 1090 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 420.90625, "epoch": 0.011014639071176174, "grad_norm": 3.0317871187853047, "kl": 0.04736328125, "learning_rate": 9.997006791570606e-07, "loss": 0.0019, "reward": 1.8377188444137573, "reward_std": 0.015257144346833229, "rewards/accuracy_reward": 0.6877187490463257, "rewards/format_reward": 1.0, "step": 1091 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 431.09375, "epoch": 0.011024734982332156, "grad_norm": 2.5083649866781554, "kl": 0.056640625, "learning_rate": 9.997001302513157e-07, "loss": 0.0023, "reward": 1.8352186679840088, "reward_std": 0.017005400732159615, "rewards/accuracy_reward": 0.6852187514305115, "rewards/format_reward": 1.0, "step": 1092 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.59375, "epoch": 0.011034830893488137, "grad_norm": 2.1354026828900956, "kl": 0.0634765625, "learning_rate": 9.996995808428807e-07, "loss": 0.0025, "reward": 2.173468828201294, "reward_std": 0.034799862653017044, "rewards/accuracy_reward": 0.9797186851501465, "rewards/format_reward": 1.0, "step": 1093 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 422.40625, "epoch": 0.011044926804644119, "grad_norm": 5.566226853161453, "kl": 0.0654296875, "learning_rate": 9.996990309317564e-07, "loss": 0.0026, "reward": 1.970343828201294, "reward_std": 0.041719261556863785, "rewards/accuracy_reward": 0.776593804359436, "rewards/format_reward": 1.0, "step": 1094 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.1875, "epoch": 0.011055022715800101, "grad_norm": 2.7829224004301216, "kl": 0.05810546875, "learning_rate": 9.996984805179432e-07, "loss": 0.0023, "reward": 2.1005938053131104, "reward_std": 0.01987951248884201, "rewards/accuracy_reward": 0.9005938172340393, "rewards/format_reward": 1.0, "step": 1095 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.625, "epoch": 0.011065118626956083, "grad_norm": 4.327359572101519, "kl": 0.05322265625, "learning_rate": 9.996979296014416e-07, "loss": 0.0021, "reward": 2.10965633392334, "reward_std": 0.02484958991408348, "rewards/accuracy_reward": 0.9096562266349792, "rewards/format_reward": 1.0, "step": 1096 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.5, "epoch": 0.011075214538112065, "grad_norm": 3.454982480519229, "kl": 0.0615234375, "learning_rate": 9.996973781822522e-07, "loss": 0.0025, "reward": 2.045687675476074, "reward_std": 0.04910445958375931, "rewards/accuracy_reward": 0.8581874966621399, "rewards/format_reward": 1.0, "step": 1097 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 429.125, "epoch": 0.011085310449268046, "grad_norm": 1.9230088260613172, "kl": 0.050537109375, "learning_rate": 9.996968262603755e-07, "loss": 0.002, "reward": 1.85756254196167, "reward_std": 0.029434585943818092, "rewards/accuracy_reward": 0.7138125896453857, "rewards/format_reward": 1.0, "step": 1098 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 432.5625, "epoch": 0.011095406360424028, "grad_norm": 4.8242821657838615, "kl": 0.0654296875, "learning_rate": 9.996962738358124e-07, "loss": 0.0026, "reward": 2.056375026702881, "reward_std": 0.025957541540265083, "rewards/accuracy_reward": 0.8563750386238098, "rewards/format_reward": 1.0, "step": 1099 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.28125, "epoch": 0.01110550227158001, "grad_norm": 1.36986102716583, "kl": 0.0634765625, "learning_rate": 9.99695720908563e-07, "loss": 0.0025, "reward": 2.0273749828338623, "reward_std": 0.025717440992593765, "rewards/accuracy_reward": 0.833625078201294, "rewards/format_reward": 1.0, "step": 1100 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 426.40625, "epoch": 0.011115598182735992, "grad_norm": 1.5337604227893677, "kl": 0.046142578125, "learning_rate": 9.99695167478628e-07, "loss": 0.0018, "reward": 1.8729689121246338, "reward_std": 0.03589898720383644, "rewards/accuracy_reward": 0.7354687452316284, "rewards/format_reward": 1.0, "step": 1101 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 440.375, "epoch": 0.011125694093891974, "grad_norm": 2.896869357977033, "kl": 0.06396484375, "learning_rate": 9.99694613546008e-07, "loss": 0.0026, "reward": 1.8784375190734863, "reward_std": 0.015362969599664211, "rewards/accuracy_reward": 0.7284374833106995, "rewards/format_reward": 1.0, "step": 1102 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 441.34375, "epoch": 0.011135790005047956, "grad_norm": 2.1763230375821307, "kl": 0.062255859375, "learning_rate": 9.996940591107035e-07, "loss": 0.0025, "reward": 2.0490312576293945, "reward_std": 0.049232978373765945, "rewards/accuracy_reward": 0.8615312576293945, "rewards/format_reward": 1.0, "step": 1103 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 435.15625, "epoch": 0.011145885916203937, "grad_norm": 2.2089873681827252, "kl": 0.05908203125, "learning_rate": 9.996935041727152e-07, "loss": 0.0024, "reward": 1.9965312480926514, "reward_std": 0.029564041644334793, "rewards/accuracy_reward": 0.7965312600135803, "rewards/format_reward": 1.0, "step": 1104 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.875, "epoch": 0.011155981827359919, "grad_norm": 1.7334369522474744, "kl": 0.047607421875, "learning_rate": 9.996929487320435e-07, "loss": 0.0019, "reward": 1.8562812805175781, "reward_std": 0.011345298029482365, "rewards/accuracy_reward": 0.7062812447547913, "rewards/format_reward": 1.0, "step": 1105 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.15625, "epoch": 0.011166077738515901, "grad_norm": 1.850824004383503, "kl": 0.05224609375, "learning_rate": 9.99692392788689e-07, "loss": 0.0021, "reward": 2.0830624103546143, "reward_std": 0.033153075724840164, "rewards/accuracy_reward": 0.8893125057220459, "rewards/format_reward": 1.0, "step": 1106 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.15625, "epoch": 0.011176173649671883, "grad_norm": 5.186176920487931, "kl": 0.06640625, "learning_rate": 9.996918363426523e-07, "loss": 0.0027, "reward": 2.031874895095825, "reward_std": 0.036645933985710144, "rewards/accuracy_reward": 0.8318749666213989, "rewards/format_reward": 1.0, "step": 1107 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.0, "epoch": 0.011186269560827865, "grad_norm": 3.2588956053685196, "kl": 0.06787109375, "learning_rate": 9.99691279393934e-07, "loss": 0.0027, "reward": 2.085625171661377, "reward_std": 0.04039035737514496, "rewards/accuracy_reward": 0.8918749094009399, "rewards/format_reward": 1.0, "step": 1108 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 418.125, "epoch": 0.011196365471983846, "grad_norm": 2.3335617111233082, "kl": 0.052490234375, "learning_rate": 9.996907219425345e-07, "loss": 0.0021, "reward": 1.8070937395095825, "reward_std": 0.020809903740882874, "rewards/accuracy_reward": 0.6570937633514404, "rewards/format_reward": 1.0, "step": 1109 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 421.125, "epoch": 0.011206461383139828, "grad_norm": 2.7496373716697997, "kl": 0.05078125, "learning_rate": 9.996901639884546e-07, "loss": 0.002, "reward": 1.5264062881469727, "reward_std": 0.02800973504781723, "rewards/accuracy_reward": 0.43265625834465027, "rewards/format_reward": 1.0, "step": 1110 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.96875, "epoch": 0.01121655729429581, "grad_norm": 2.260783375806057, "kl": 0.06201171875, "learning_rate": 9.996896055316947e-07, "loss": 0.0025, "reward": 1.831343650817871, "reward_std": 0.029564231634140015, "rewards/accuracy_reward": 0.6875937581062317, "rewards/format_reward": 1.0, "step": 1111 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.96875, "epoch": 0.011226653205451792, "grad_norm": 1.6990215243155755, "kl": 0.0498046875, "learning_rate": 9.996890465722555e-07, "loss": 0.002, "reward": 1.8267500400543213, "reward_std": 0.017038581892848015, "rewards/accuracy_reward": 0.6767500042915344, "rewards/format_reward": 1.0, "step": 1112 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.625, "epoch": 0.011236749116607775, "grad_norm": 2.438515262080818, "kl": 0.056640625, "learning_rate": 9.996884871101371e-07, "loss": 0.0023, "reward": 2.054468870162964, "reward_std": 0.019908327609300613, "rewards/accuracy_reward": 0.8544687032699585, "rewards/format_reward": 1.0, "step": 1113 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.4375, "epoch": 0.011246845027763755, "grad_norm": 3.196086512210512, "kl": 0.06494140625, "learning_rate": 9.996879271453408e-07, "loss": 0.0026, "reward": 2.105062484741211, "reward_std": 0.025452125817537308, "rewards/accuracy_reward": 0.9050624370574951, "rewards/format_reward": 1.0, "step": 1114 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.09375, "epoch": 0.011256940938919737, "grad_norm": 6.530153249687395, "kl": 0.06103515625, "learning_rate": 9.996873666778665e-07, "loss": 0.0024, "reward": 2.042343854904175, "reward_std": 0.15893198549747467, "rewards/accuracy_reward": 0.8548437356948853, "rewards/format_reward": 1.0, "step": 1115 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.71875, "epoch": 0.01126703685007572, "grad_norm": 3.154398222016349, "kl": 0.0693359375, "learning_rate": 9.996868057077151e-07, "loss": 0.0028, "reward": 1.9943437576293945, "reward_std": 0.02551095001399517, "rewards/accuracy_reward": 0.7943437099456787, "rewards/format_reward": 1.0, "step": 1116 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.25, "epoch": 0.011277132761231701, "grad_norm": 5.964159865567021, "kl": 0.07080078125, "learning_rate": 9.996862442348871e-07, "loss": 0.0028, "reward": 2.0103750228881836, "reward_std": 0.017030611634254456, "rewards/accuracy_reward": 0.8103749752044678, "rewards/format_reward": 1.0, "step": 1117 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 384.4375, "epoch": 0.011287228672387684, "grad_norm": 1.5611269896383995, "kl": 0.0634765625, "learning_rate": 9.99685682259383e-07, "loss": 0.0025, "reward": 1.7031874656677246, "reward_std": 0.00706908293068409, "rewards/accuracy_reward": 0.5531874895095825, "rewards/format_reward": 1.0, "step": 1118 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 376.90625, "epoch": 0.011297324583543664, "grad_norm": 3.8269588489554476, "kl": 0.0595703125, "learning_rate": 9.996851197812035e-07, "loss": 0.0024, "reward": 1.7475625276565552, "reward_std": 0.03279083967208862, "rewards/accuracy_reward": 0.6038125157356262, "rewards/format_reward": 1.0, "step": 1119 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 368.3125, "epoch": 0.011307420494699646, "grad_norm": 1.5573965444630038, "kl": 0.058349609375, "learning_rate": 9.996845568003492e-07, "loss": 0.0023, "reward": 1.872406244277954, "reward_std": 0.012232225388288498, "rewards/accuracy_reward": 0.722406268119812, "rewards/format_reward": 1.0, "step": 1120 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.375, "epoch": 0.011317516405855628, "grad_norm": 3.213555978955212, "kl": 0.0751953125, "learning_rate": 9.996839933168204e-07, "loss": 0.003, "reward": 2.1523749828338623, "reward_std": 0.044646840542554855, "rewards/accuracy_reward": 0.9586250185966492, "rewards/format_reward": 1.0, "step": 1121 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 364.96875, "epoch": 0.01132761231701161, "grad_norm": 3.6989033716732767, "kl": 0.0517578125, "learning_rate": 9.996834293306178e-07, "loss": 0.0021, "reward": 2.10965633392334, "reward_std": 0.06192972883582115, "rewards/accuracy_reward": 0.9284062385559082, "rewards/format_reward": 1.0, "step": 1122 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 374.875, "epoch": 0.011337708228167593, "grad_norm": 2.3037321796217727, "kl": 0.041259765625, "learning_rate": 9.996828648417422e-07, "loss": 0.0016, "reward": 2.1722187995910645, "reward_std": 0.023096617311239243, "rewards/accuracy_reward": 0.9784687757492065, "rewards/format_reward": 1.0, "step": 1123 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.625, "epoch": 0.011347804139323575, "grad_norm": 1.9297568647285743, "kl": 0.0556640625, "learning_rate": 9.996822998501939e-07, "loss": 0.0022, "reward": 2.151343822479248, "reward_std": 0.024603109806776047, "rewards/accuracy_reward": 0.9575937390327454, "rewards/format_reward": 1.0, "step": 1124 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.75, "epoch": 0.011357900050479555, "grad_norm": 2.5848025777969252, "kl": 0.06494140625, "learning_rate": 9.996817343559734e-07, "loss": 0.0026, "reward": 2.1085314750671387, "reward_std": 0.04809747636318207, "rewards/accuracy_reward": 0.9210312366485596, "rewards/format_reward": 1.0, "step": 1125 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 373.375, "epoch": 0.011367995961635537, "grad_norm": 2.349087128578623, "kl": 0.0751953125, "learning_rate": 9.996811683590816e-07, "loss": 0.003, "reward": 2.048187494277954, "reward_std": 0.03527325391769409, "rewards/accuracy_reward": 0.8544374704360962, "rewards/format_reward": 1.0, "step": 1126 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 379.15625, "epoch": 0.01137809187279152, "grad_norm": 3.3071799500094254, "kl": 0.056640625, "learning_rate": 9.996806018595188e-07, "loss": 0.0023, "reward": 2.03125, "reward_std": 0.162931889295578, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 1127 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.011388187783947502, "grad_norm": 2.3711950274789912, "kl": 0.042724609375, "learning_rate": 9.996800348572856e-07, "loss": 0.0017, "reward": 1.9463437795639038, "reward_std": 0.14142341911792755, "rewards/accuracy_reward": 0.7900937795639038, "rewards/format_reward": 1.0, "step": 1128 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 375.1875, "epoch": 0.011398283695103484, "grad_norm": 2.416054490581382, "kl": 0.06396484375, "learning_rate": 9.996794673523825e-07, "loss": 0.0026, "reward": 2.153468608856201, "reward_std": 0.028529657050967216, "rewards/accuracy_reward": 0.9597187638282776, "rewards/format_reward": 1.0, "step": 1129 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.625, "epoch": 0.011408379606259464, "grad_norm": 2.6512292801344626, "kl": 0.05712890625, "learning_rate": 9.996788993448103e-07, "loss": 0.0023, "reward": 2.0799689292907715, "reward_std": 0.04148423671722412, "rewards/accuracy_reward": 0.8924687504768372, "rewards/format_reward": 1.0, "step": 1130 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.71875, "epoch": 0.011418475517415446, "grad_norm": 2.4355557506091334, "kl": 0.06640625, "learning_rate": 9.996783308345696e-07, "loss": 0.0027, "reward": 2.0461251735687256, "reward_std": 0.05493931472301483, "rewards/accuracy_reward": 0.8586249351501465, "rewards/format_reward": 1.0, "step": 1131 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.8125, "epoch": 0.011428571428571429, "grad_norm": 2.8920039198709686, "kl": 0.06494140625, "learning_rate": 9.996777618216605e-07, "loss": 0.0026, "reward": 2.060281276702881, "reward_std": 0.042081475257873535, "rewards/accuracy_reward": 0.8727812767028809, "rewards/format_reward": 1.0, "step": 1132 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.90625, "epoch": 0.01143866733972741, "grad_norm": 2.2015515744195038, "kl": 0.056640625, "learning_rate": 9.996771923060842e-07, "loss": 0.0023, "reward": 2.1624999046325684, "reward_std": 0.020082835108041763, "rewards/accuracy_reward": 0.9625000357627869, "rewards/format_reward": 1.0, "step": 1133 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.46875, "epoch": 0.011448763250883393, "grad_norm": 1.8852524058870836, "kl": 0.06494140625, "learning_rate": 9.99676622287841e-07, "loss": 0.0026, "reward": 2.165562629699707, "reward_std": 0.04335293173789978, "rewards/accuracy_reward": 0.9780625104904175, "rewards/format_reward": 1.0, "step": 1134 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.78125, "epoch": 0.011458859162039373, "grad_norm": 2.0140593422250053, "kl": 0.0458984375, "learning_rate": 9.996760517669311e-07, "loss": 0.0018, "reward": 1.8890001773834229, "reward_std": 0.1630314290523529, "rewards/accuracy_reward": 0.7327499389648438, "rewards/format_reward": 1.0, "step": 1135 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 417.125, "epoch": 0.011468955073195355, "grad_norm": 1.633424067238898, "kl": 0.053955078125, "learning_rate": 9.996754807433558e-07, "loss": 0.0022, "reward": 1.8375937938690186, "reward_std": 0.022025197744369507, "rewards/accuracy_reward": 0.6875936985015869, "rewards/format_reward": 1.0, "step": 1136 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 424.15625, "epoch": 0.011479050984351338, "grad_norm": 1.3179600944662562, "kl": 0.04443359375, "learning_rate": 9.996749092171152e-07, "loss": 0.0018, "reward": 1.8657188415527344, "reward_std": 0.007247470319271088, "rewards/accuracy_reward": 0.7157187461853027, "rewards/format_reward": 1.0, "step": 1137 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.8125, "epoch": 0.01148914689550732, "grad_norm": 5.513412116392377, "kl": 0.05859375, "learning_rate": 9.996743371882098e-07, "loss": 0.0023, "reward": 1.920312523841858, "reward_std": 0.17427870631217957, "rewards/accuracy_reward": 0.7515625357627869, "rewards/format_reward": 1.0, "step": 1138 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 419.46875, "epoch": 0.011499242806663302, "grad_norm": 1.7704233260074114, "kl": 0.0673828125, "learning_rate": 9.996737646566405e-07, "loss": 0.0027, "reward": 1.864187479019165, "reward_std": 0.02974187396466732, "rewards/accuracy_reward": 0.7204374670982361, "rewards/format_reward": 1.0, "step": 1139 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 412.625, "epoch": 0.011509338717819282, "grad_norm": 2.0296068210799465, "kl": 0.06689453125, "learning_rate": 9.996731916224076e-07, "loss": 0.0027, "reward": 1.7363123893737793, "reward_std": 0.12942148745059967, "rewards/accuracy_reward": 0.6363124847412109, "rewards/format_reward": 0.96875, "step": 1140 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.84375, "epoch": 0.011519434628975265, "grad_norm": 2.7995220486570154, "kl": 0.072265625, "learning_rate": 9.99672618085512e-07, "loss": 0.0029, "reward": 2.0654375553131104, "reward_std": 0.05987148731946945, "rewards/accuracy_reward": 0.8841874599456787, "rewards/format_reward": 1.0, "step": 1141 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 435.65625, "epoch": 0.011529530540131247, "grad_norm": 2.2891377067509584, "kl": 0.06103515625, "learning_rate": 9.996720440459538e-07, "loss": 0.0024, "reward": 1.8393751382827759, "reward_std": 0.031177915632724762, "rewards/accuracy_reward": 0.7018749713897705, "rewards/format_reward": 1.0, "step": 1142 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 416.8125, "epoch": 0.011539626451287229, "grad_norm": 1.3318287739059507, "kl": 0.047119140625, "learning_rate": 9.99671469503734e-07, "loss": 0.0019, "reward": 1.4318125247955322, "reward_std": 0.16403280198574066, "rewards/accuracy_reward": 0.3568125069141388, "rewards/format_reward": 1.0, "step": 1143 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.21875, "epoch": 0.011549722362443211, "grad_norm": 2.098495541556782, "kl": 0.07568359375, "learning_rate": 9.99670894458853e-07, "loss": 0.003, "reward": 2.1045000553131104, "reward_std": 0.016686413437128067, "rewards/accuracy_reward": 0.9045000076293945, "rewards/format_reward": 1.0, "step": 1144 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 430.03125, "epoch": 0.011559818273599193, "grad_norm": 1.7860739966350159, "kl": 0.05078125, "learning_rate": 9.996703189113114e-07, "loss": 0.002, "reward": 1.6204687356948853, "reward_std": 0.09755448997020721, "rewards/accuracy_reward": 0.5204687714576721, "rewards/format_reward": 1.0, "step": 1145 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.011569914184755174, "grad_norm": 2.3952087462955576, "kl": 0.06103515625, "learning_rate": 9.996697428611099e-07, "loss": 0.0024, "reward": 1.83412504196167, "reward_std": 0.12277218699455261, "rewards/accuracy_reward": 0.6966249942779541, "rewards/format_reward": 1.0, "step": 1146 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.4375, "epoch": 0.011580010095911156, "grad_norm": 3.4723047518194416, "kl": 0.072265625, "learning_rate": 9.996691663082487e-07, "loss": 0.0029, "reward": 1.8244061470031738, "reward_std": 0.02611856535077095, "rewards/accuracy_reward": 0.6744062304496765, "rewards/format_reward": 1.0, "step": 1147 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 405.9375, "epoch": 0.011590106007067138, "grad_norm": 1.4950120332297558, "kl": 0.05322265625, "learning_rate": 9.996685892527288e-07, "loss": 0.0021, "reward": 1.8772187232971191, "reward_std": 0.036123376339673996, "rewards/accuracy_reward": 0.7397186756134033, "rewards/format_reward": 1.0, "step": 1148 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.625, "epoch": 0.01160020191822312, "grad_norm": 1.757323927520073, "kl": 0.06103515625, "learning_rate": 9.996680116945508e-07, "loss": 0.0024, "reward": 2.154656410217285, "reward_std": 0.04220881313085556, "rewards/accuracy_reward": 0.9609062671661377, "rewards/format_reward": 1.0, "step": 1149 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.011610297829379102, "grad_norm": 6.958849782828155, "kl": 0.05322265625, "learning_rate": 9.99667433633715e-07, "loss": 0.0021, "reward": 2.1486873626708984, "reward_std": 0.007126886863261461, "rewards/accuracy_reward": 0.9486874938011169, "rewards/format_reward": 1.0, "step": 1150 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 418.1875, "epoch": 0.011620393740535083, "grad_norm": 1.6457323948089966, "kl": 0.059326171875, "learning_rate": 9.996668550702219e-07, "loss": 0.0024, "reward": 1.8076250553131104, "reward_std": 0.009628363884985447, "rewards/accuracy_reward": 0.6576249599456787, "rewards/format_reward": 1.0, "step": 1151 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.4375, "epoch": 0.011630489651691065, "grad_norm": 2.0774959927320373, "kl": 0.0654296875, "learning_rate": 9.996662760040725e-07, "loss": 0.0026, "reward": 2.1497812271118164, "reward_std": 0.019013358280062675, "rewards/accuracy_reward": 0.9560312032699585, "rewards/format_reward": 1.0, "step": 1152 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.625, "epoch": 0.011640585562847047, "grad_norm": 2.206979820427835, "kl": 0.05517578125, "learning_rate": 9.99665696435267e-07, "loss": 0.0022, "reward": 2.1137187480926514, "reward_std": 0.0281074196100235, "rewards/accuracy_reward": 0.9199687242507935, "rewards/format_reward": 1.0, "step": 1153 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.28125, "epoch": 0.011650681474003029, "grad_norm": 2.2116118788883368, "kl": 0.0791015625, "learning_rate": 9.996651163638064e-07, "loss": 0.0032, "reward": 2.1031875610351562, "reward_std": 0.028136996552348137, "rewards/accuracy_reward": 0.9094374775886536, "rewards/format_reward": 1.0, "step": 1154 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 389.53125, "epoch": 0.011660777385159011, "grad_norm": 0.9943311095265612, "kl": 0.0498046875, "learning_rate": 9.996645357896906e-07, "loss": 0.002, "reward": 1.8937499523162842, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1155 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 406.0625, "epoch": 0.011670873296314992, "grad_norm": 3.594650591917159, "kl": 0.06884765625, "learning_rate": 9.996639547129208e-07, "loss": 0.0027, "reward": 1.7418124675750732, "reward_std": 0.019877921789884567, "rewards/accuracy_reward": 0.5918124914169312, "rewards/format_reward": 1.0, "step": 1156 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.875, "epoch": 0.011680969207470974, "grad_norm": 2.7035892410916387, "kl": 0.059814453125, "learning_rate": 9.996633731334976e-07, "loss": 0.0024, "reward": 2.0825626850128174, "reward_std": 0.059246841818094254, "rewards/accuracy_reward": 0.8888124823570251, "rewards/format_reward": 1.0, "step": 1157 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.0625, "epoch": 0.011691065118626956, "grad_norm": 3.7825412276376955, "kl": 0.061279296875, "learning_rate": 9.996627910514212e-07, "loss": 0.0024, "reward": 2.1615939140319824, "reward_std": 0.03964172303676605, "rewards/accuracy_reward": 0.9678437113761902, "rewards/format_reward": 1.0, "step": 1158 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.011701161029782938, "grad_norm": 1.9674039567015011, "kl": 0.0751953125, "learning_rate": 9.996622084666923e-07, "loss": 0.003, "reward": 2.102343797683716, "reward_std": 0.0260118767619133, "rewards/accuracy_reward": 0.9085937738418579, "rewards/format_reward": 1.0, "step": 1159 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.8125, "epoch": 0.01171125694093892, "grad_norm": 1.9000398364836022, "kl": 0.06689453125, "learning_rate": 9.996616253793117e-07, "loss": 0.0027, "reward": 2.067718982696533, "reward_std": 0.0546828992664814, "rewards/accuracy_reward": 0.886468768119812, "rewards/format_reward": 1.0, "step": 1160 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.28125, "epoch": 0.0117213528520949, "grad_norm": 1.6727259465322355, "kl": 0.0537109375, "learning_rate": 9.996610417892797e-07, "loss": 0.0021, "reward": 1.8223750591278076, "reward_std": 0.007815131917595863, "rewards/accuracy_reward": 0.672374963760376, "rewards/format_reward": 1.0, "step": 1161 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.53125, "epoch": 0.011731448763250883, "grad_norm": 2.3386516976920837, "kl": 0.059814453125, "learning_rate": 9.99660457696597e-07, "loss": 0.0024, "reward": 2.0414376258850098, "reward_std": 0.010793445631861687, "rewards/accuracy_reward": 0.8414374589920044, "rewards/format_reward": 1.0, "step": 1162 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.75, "epoch": 0.011741544674406865, "grad_norm": 2.3652329434788455, "kl": 0.07421875, "learning_rate": 9.996598731012642e-07, "loss": 0.003, "reward": 2.054999828338623, "reward_std": 0.019358085468411446, "rewards/accuracy_reward": 0.8549999594688416, "rewards/format_reward": 1.0, "step": 1163 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.71875, "epoch": 0.011751640585562847, "grad_norm": 2.180907005321984, "kl": 0.06982421875, "learning_rate": 9.99659288003282e-07, "loss": 0.0028, "reward": 2.1208126544952393, "reward_std": 0.017455464228987694, "rewards/accuracy_reward": 0.9208124876022339, "rewards/format_reward": 1.0, "step": 1164 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.75, "epoch": 0.01176173649671883, "grad_norm": 1.939684825074168, "kl": 0.07666015625, "learning_rate": 9.99658702402651e-07, "loss": 0.0031, "reward": 1.9560000896453857, "reward_std": 0.017781060189008713, "rewards/accuracy_reward": 0.7559999823570251, "rewards/format_reward": 1.0, "step": 1165 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.8125, "epoch": 0.011771832407874811, "grad_norm": 2.1070133094507337, "kl": 0.06494140625, "learning_rate": 9.996581162993713e-07, "loss": 0.0026, "reward": 1.9757813215255737, "reward_std": 0.17556682229042053, "rewards/accuracy_reward": 0.8382812738418579, "rewards/format_reward": 0.9375, "step": 1166 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.1875, "epoch": 0.011781928319030792, "grad_norm": 1.6736591408485935, "kl": 0.0654296875, "learning_rate": 9.996575296934442e-07, "loss": 0.0026, "reward": 2.119812488555908, "reward_std": 0.009419754147529602, "rewards/accuracy_reward": 0.9198125004768372, "rewards/format_reward": 1.0, "step": 1167 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 408.8125, "epoch": 0.011792024230186774, "grad_norm": 2.1200866904229305, "kl": 0.0576171875, "learning_rate": 9.996569425848697e-07, "loss": 0.0023, "reward": 1.846343755722046, "reward_std": 0.011700832284986973, "rewards/accuracy_reward": 0.696343719959259, "rewards/format_reward": 1.0, "step": 1168 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.011802120141342756, "grad_norm": 20.849195524585827, "kl": 0.0703125, "learning_rate": 9.99656354973649e-07, "loss": 0.0028, "reward": 2.058093786239624, "reward_std": 0.023537002503871918, "rewards/accuracy_reward": 0.8580937385559082, "rewards/format_reward": 1.0, "step": 1169 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.21875, "epoch": 0.011812216052498738, "grad_norm": 1.8564220033253445, "kl": 0.04638671875, "learning_rate": 9.99655766859782e-07, "loss": 0.0019, "reward": 1.9203437566757202, "reward_std": 0.19587121903896332, "rewards/accuracy_reward": 0.7953437566757202, "rewards/format_reward": 1.0, "step": 1170 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.4375, "epoch": 0.01182231196365472, "grad_norm": 2.446348505841797, "kl": 0.057861328125, "learning_rate": 9.996551782432697e-07, "loss": 0.0023, "reward": 1.8291562795639038, "reward_std": 0.04030795022845268, "rewards/accuracy_reward": 0.6854062676429749, "rewards/format_reward": 1.0, "step": 1171 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.0625, "epoch": 0.011832407874810701, "grad_norm": 2.1357274380382596, "kl": 0.06298828125, "learning_rate": 9.996545891241126e-07, "loss": 0.0025, "reward": 2.169093608856201, "reward_std": 0.0163727980107069, "rewards/accuracy_reward": 0.9690937995910645, "rewards/format_reward": 1.0, "step": 1172 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.011842503785966683, "grad_norm": 2.151123657620057, "kl": 0.0654296875, "learning_rate": 9.996539995023115e-07, "loss": 0.0026, "reward": 2.0717501640319824, "reward_std": 0.025783751159906387, "rewards/accuracy_reward": 0.871749997138977, "rewards/format_reward": 1.0, "step": 1173 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.0625, "epoch": 0.011852599697122665, "grad_norm": 2.305287935904835, "kl": 0.0830078125, "learning_rate": 9.996534093778665e-07, "loss": 0.0033, "reward": 2.021218776702881, "reward_std": 0.02288401499390602, "rewards/accuracy_reward": 0.821218729019165, "rewards/format_reward": 1.0, "step": 1174 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.011862695608278647, "grad_norm": 1.9467379781913645, "kl": 0.06884765625, "learning_rate": 9.996528187507788e-07, "loss": 0.0027, "reward": 1.9602500200271606, "reward_std": 0.022400427609682083, "rewards/accuracy_reward": 0.7602499723434448, "rewards/format_reward": 1.0, "step": 1175 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.625, "epoch": 0.01187279151943463, "grad_norm": 1.914923940413103, "kl": 0.056396484375, "learning_rate": 9.996522276210485e-07, "loss": 0.0023, "reward": 2.096156120300293, "reward_std": 0.03728471323847771, "rewards/accuracy_reward": 0.9024062156677246, "rewards/format_reward": 1.0, "step": 1176 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.125, "epoch": 0.01188288743059061, "grad_norm": 3.0680768687634346, "kl": 0.06640625, "learning_rate": 9.996516359886765e-07, "loss": 0.0027, "reward": 2.02734375, "reward_std": 0.02607749029994011, "rewards/accuracy_reward": 0.827343761920929, "rewards/format_reward": 1.0, "step": 1177 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.0, "epoch": 0.011892983341746592, "grad_norm": 4.175651239467256, "kl": 0.06201171875, "learning_rate": 9.996510438536632e-07, "loss": 0.0025, "reward": 2.1487812995910645, "reward_std": 0.010495551861822605, "rewards/accuracy_reward": 0.9487812519073486, "rewards/format_reward": 1.0, "step": 1178 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 426.46875, "epoch": 0.011903079252902574, "grad_norm": 2.321866286465882, "kl": 0.053466796875, "learning_rate": 9.996504512160094e-07, "loss": 0.0021, "reward": 2.135906219482422, "reward_std": 0.007814259268343449, "rewards/accuracy_reward": 0.9359062314033508, "rewards/format_reward": 1.0, "step": 1179 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.375, "epoch": 0.011913175164058556, "grad_norm": 2.669637471916329, "kl": 0.06640625, "learning_rate": 9.996498580757154e-07, "loss": 0.0027, "reward": 2.012406349182129, "reward_std": 0.024893544614315033, "rewards/accuracy_reward": 0.8124063014984131, "rewards/format_reward": 1.0, "step": 1180 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.15625, "epoch": 0.011923271075214539, "grad_norm": 2.623756473903068, "kl": 0.0615234375, "learning_rate": 9.99649264432782e-07, "loss": 0.0025, "reward": 1.8530937433242798, "reward_std": 0.024264290928840637, "rewards/accuracy_reward": 0.7030937671661377, "rewards/format_reward": 1.0, "step": 1181 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.4375, "epoch": 0.01193336698637052, "grad_norm": 2.908585544976124, "kl": 0.06103515625, "learning_rate": 9.9964867028721e-07, "loss": 0.0024, "reward": 2.0981249809265137, "reward_std": 0.01925787329673767, "rewards/accuracy_reward": 0.8981250524520874, "rewards/format_reward": 1.0, "step": 1182 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.6875, "epoch": 0.011943462897526501, "grad_norm": 1.4912338273762888, "kl": 0.033935546875, "learning_rate": 9.996480756389995e-07, "loss": 0.0014, "reward": 1.787500023841858, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1183 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.8125, "epoch": 0.011953558808682483, "grad_norm": 11.16745414457489, "kl": 0.056396484375, "learning_rate": 9.996474804881514e-07, "loss": 0.0023, "reward": 2.152531385421753, "reward_std": 0.0452931672334671, "rewards/accuracy_reward": 0.9650312662124634, "rewards/format_reward": 1.0, "step": 1184 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.9375, "epoch": 0.011963654719838466, "grad_norm": 3.197245507779291, "kl": 0.06396484375, "learning_rate": 9.996468848346662e-07, "loss": 0.0026, "reward": 2.1304686069488525, "reward_std": 0.01742878556251526, "rewards/accuracy_reward": 0.9304687976837158, "rewards/format_reward": 1.0, "step": 1185 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.011973750630994448, "grad_norm": 3.814577940005035, "kl": 0.0654296875, "learning_rate": 9.996462886785449e-07, "loss": 0.0026, "reward": 1.986718773841858, "reward_std": 0.028380166739225388, "rewards/accuracy_reward": 0.7867187857627869, "rewards/format_reward": 1.0, "step": 1186 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.78125, "epoch": 0.01198384654215043, "grad_norm": 2.23344722773561, "kl": 0.053955078125, "learning_rate": 9.996456920197874e-07, "loss": 0.0022, "reward": 2.122093677520752, "reward_std": 0.03131597861647606, "rewards/accuracy_reward": 0.9220936894416809, "rewards/format_reward": 1.0, "step": 1187 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.6875, "epoch": 0.01199394245330641, "grad_norm": 1.7074074071060221, "kl": 0.0625, "learning_rate": 9.996450948583949e-07, "loss": 0.0025, "reward": 1.794531226158142, "reward_std": 0.00569998798891902, "rewards/accuracy_reward": 0.64453125, "rewards/format_reward": 1.0, "step": 1188 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.96875, "epoch": 0.012004038364462392, "grad_norm": 1.4505544920540314, "kl": 0.052978515625, "learning_rate": 9.996444971943677e-07, "loss": 0.0021, "reward": 2.188499927520752, "reward_std": 0.020188363268971443, "rewards/accuracy_reward": 0.9947499632835388, "rewards/format_reward": 1.0, "step": 1189 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.6875, "epoch": 0.012014134275618375, "grad_norm": 2.350698352940319, "kl": 0.06982421875, "learning_rate": 9.996438990277063e-07, "loss": 0.0028, "reward": 2.042187452316284, "reward_std": 0.05816667899489403, "rewards/accuracy_reward": 0.8421875238418579, "rewards/format_reward": 1.0, "step": 1190 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.25, "epoch": 0.012024230186774357, "grad_norm": 1.6229934273823934, "kl": 0.05908203125, "learning_rate": 9.996433003584115e-07, "loss": 0.0024, "reward": 1.7907187938690186, "reward_std": 0.013365913182497025, "rewards/accuracy_reward": 0.6407187581062317, "rewards/format_reward": 1.0, "step": 1191 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.34375, "epoch": 0.012034326097930339, "grad_norm": 3.135952995596165, "kl": 0.052978515625, "learning_rate": 9.996427011864842e-07, "loss": 0.0021, "reward": 1.981562614440918, "reward_std": 0.16551752388477325, "rewards/accuracy_reward": 0.8065624833106995, "rewards/format_reward": 1.0, "step": 1192 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.09375, "epoch": 0.01204442200908632, "grad_norm": 2.704372251308204, "kl": 0.0693359375, "learning_rate": 9.996421015119243e-07, "loss": 0.0028, "reward": 2.1309688091278076, "reward_std": 0.012155761942267418, "rewards/accuracy_reward": 0.9309687614440918, "rewards/format_reward": 1.0, "step": 1193 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.53125, "epoch": 0.012054517920242301, "grad_norm": 3.7001990819618604, "kl": 0.060302734375, "learning_rate": 9.996415013347328e-07, "loss": 0.0024, "reward": 1.7922968864440918, "reward_std": 0.012749942019581795, "rewards/accuracy_reward": 0.6422968506813049, "rewards/format_reward": 1.0, "step": 1194 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.84375, "epoch": 0.012064613831398284, "grad_norm": 2.9868620119243414, "kl": 0.064453125, "learning_rate": 9.996409006549103e-07, "loss": 0.0026, "reward": 1.8121249675750732, "reward_std": 0.03271017596125603, "rewards/accuracy_reward": 0.6621249914169312, "rewards/format_reward": 1.0, "step": 1195 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.8125, "epoch": 0.012074709742554266, "grad_norm": 1.8581320455399055, "kl": 0.060791015625, "learning_rate": 9.996402994724575e-07, "loss": 0.0024, "reward": 1.92368745803833, "reward_std": 0.17178243398666382, "rewards/accuracy_reward": 0.7611875534057617, "rewards/format_reward": 1.0, "step": 1196 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.34375, "epoch": 0.012084805653710248, "grad_norm": 3.8558937990265814, "kl": 0.05322265625, "learning_rate": 9.99639697787375e-07, "loss": 0.0021, "reward": 1.7658125162124634, "reward_std": 0.16849391162395477, "rewards/accuracy_reward": 0.6345624923706055, "rewards/format_reward": 1.0, "step": 1197 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.8125, "epoch": 0.012094901564866228, "grad_norm": 3.0420555994961247, "kl": 0.0693359375, "learning_rate": 9.996390955996629e-07, "loss": 0.0028, "reward": 1.8248437643051147, "reward_std": 0.01885915733873844, "rewards/accuracy_reward": 0.6748437881469727, "rewards/format_reward": 1.0, "step": 1198 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.01210499747602221, "grad_norm": 2.8326016474529077, "kl": 0.06884765625, "learning_rate": 9.996384929093225e-07, "loss": 0.0028, "reward": 2.1185312271118164, "reward_std": 0.027592845261096954, "rewards/accuracy_reward": 0.9185312986373901, "rewards/format_reward": 1.0, "step": 1199 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.09375, "epoch": 0.012115093387178193, "grad_norm": 1.09973668964695, "kl": 0.055908203125, "learning_rate": 9.99637889716354e-07, "loss": 0.0022, "reward": 2.0877697467803955, "reward_std": 0.025914184749126434, "rewards/accuracy_reward": 0.9065198302268982, "rewards/format_reward": 1.0, "step": 1200 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.21875, "epoch": 0.012125189298334175, "grad_norm": 32.36397875179463, "kl": 0.07177734375, "learning_rate": 9.996372860207583e-07, "loss": 0.0029, "reward": 1.8480623960494995, "reward_std": 0.026422059163451195, "rewards/accuracy_reward": 0.7043125033378601, "rewards/format_reward": 1.0, "step": 1201 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.375, "epoch": 0.012135285209490157, "grad_norm": 4.586849333148527, "kl": 0.0712890625, "learning_rate": 9.996366818225354e-07, "loss": 0.0029, "reward": 2.0582499504089355, "reward_std": 0.030358608812093735, "rewards/accuracy_reward": 0.858250081539154, "rewards/format_reward": 1.0, "step": 1202 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.9375, "epoch": 0.01214538112064614, "grad_norm": 2.837629149553923, "kl": 0.0654296875, "learning_rate": 9.996360771216867e-07, "loss": 0.0026, "reward": 2.0018436908721924, "reward_std": 0.1686616986989975, "rewards/accuracy_reward": 0.8205937147140503, "rewards/format_reward": 1.0, "step": 1203 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.21875, "epoch": 0.01215547703180212, "grad_norm": 2.9765930508085607, "kl": 0.0732421875, "learning_rate": 9.996354719182123e-07, "loss": 0.0029, "reward": 2.0810625553131104, "reward_std": 0.012865906581282616, "rewards/accuracy_reward": 0.8810625076293945, "rewards/format_reward": 1.0, "step": 1204 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.0, "epoch": 0.012165572942958102, "grad_norm": 1.5757051394198796, "kl": 0.0771484375, "learning_rate": 9.99634866212113e-07, "loss": 0.0031, "reward": 2.1681251525878906, "reward_std": 0.020223677158355713, "rewards/accuracy_reward": 0.9743750095367432, "rewards/format_reward": 1.0, "step": 1205 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 404.8125, "epoch": 0.012175668854114084, "grad_norm": 3.3424203039227995, "kl": 0.0712890625, "learning_rate": 9.996342600033893e-07, "loss": 0.0029, "reward": 1.6804063320159912, "reward_std": 0.01897042617201805, "rewards/accuracy_reward": 0.5304062366485596, "rewards/format_reward": 1.0, "step": 1206 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.78125, "epoch": 0.012185764765270066, "grad_norm": 4.031678095067514, "kl": 0.06494140625, "learning_rate": 9.996336532920418e-07, "loss": 0.0026, "reward": 2.1460418701171875, "reward_std": 0.049446482211351395, "rewards/accuracy_reward": 0.9585416913032532, "rewards/format_reward": 1.0, "step": 1207 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.09375, "epoch": 0.012195860676426048, "grad_norm": 4.716231304068122, "kl": 0.061767578125, "learning_rate": 9.996330460780714e-07, "loss": 0.0025, "reward": 2.162656307220459, "reward_std": 0.025869492441415787, "rewards/accuracy_reward": 0.9626562595367432, "rewards/format_reward": 1.0, "step": 1208 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.012205956587582029, "grad_norm": 3.9130212114950105, "kl": 0.07275390625, "learning_rate": 9.996324383614782e-07, "loss": 0.0029, "reward": 2.0718750953674316, "reward_std": 0.02505643106997013, "rewards/accuracy_reward": 0.871874988079071, "rewards/format_reward": 1.0, "step": 1209 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.53125, "epoch": 0.01221605249873801, "grad_norm": 2.647164873947581, "kl": 0.0634765625, "learning_rate": 9.996318301422632e-07, "loss": 0.0026, "reward": 1.7629687786102295, "reward_std": 0.03527074307203293, "rewards/accuracy_reward": 0.6254687905311584, "rewards/format_reward": 1.0, "step": 1210 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.40625, "epoch": 0.012226148409893993, "grad_norm": 4.176855543023321, "kl": 0.0791015625, "learning_rate": 9.99631221420427e-07, "loss": 0.0032, "reward": 2.02134370803833, "reward_std": 0.030285019427537918, "rewards/accuracy_reward": 0.821343719959259, "rewards/format_reward": 1.0, "step": 1211 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.40625, "epoch": 0.012236244321049975, "grad_norm": 2.8890851255086587, "kl": 0.078125, "learning_rate": 9.9963061219597e-07, "loss": 0.0031, "reward": 2.013625144958496, "reward_std": 0.03576172515749931, "rewards/accuracy_reward": 0.8261249661445618, "rewards/format_reward": 1.0, "step": 1212 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.21875, "epoch": 0.012246340232205957, "grad_norm": 6.28184365366074, "kl": 0.07421875, "learning_rate": 9.99630002468893e-07, "loss": 0.003, "reward": 2.000781297683716, "reward_std": 0.01965649239718914, "rewards/accuracy_reward": 0.8007813096046448, "rewards/format_reward": 1.0, "step": 1213 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.78125, "epoch": 0.012256436143361938, "grad_norm": 2.3935347849051363, "kl": 0.0673828125, "learning_rate": 9.996293922391964e-07, "loss": 0.0027, "reward": 2.0140938758850098, "reward_std": 0.12342230975627899, "rewards/accuracy_reward": 0.8515937328338623, "rewards/format_reward": 0.96875, "step": 1214 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.0625, "epoch": 0.01226653205451792, "grad_norm": 2.8562632623063213, "kl": 0.0703125, "learning_rate": 9.99628781506881e-07, "loss": 0.0028, "reward": 2.039228916168213, "reward_std": 0.01275044959038496, "rewards/accuracy_reward": 0.839229166507721, "rewards/format_reward": 1.0, "step": 1215 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.0, "epoch": 0.012276627965673902, "grad_norm": 2.3310717811688635, "kl": 0.057861328125, "learning_rate": 9.996281702719476e-07, "loss": 0.0023, "reward": 1.5756561756134033, "reward_std": 0.09994722157716751, "rewards/accuracy_reward": 0.47565627098083496, "rewards/format_reward": 1.0, "step": 1216 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.25, "epoch": 0.012286723876829884, "grad_norm": 2.3604150430482003, "kl": 0.064453125, "learning_rate": 9.996275585343964e-07, "loss": 0.0026, "reward": 2.0929999351501465, "reward_std": 0.02667102962732315, "rewards/accuracy_reward": 0.8992500305175781, "rewards/format_reward": 1.0, "step": 1217 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.4375, "epoch": 0.012296819787985866, "grad_norm": 1.0540296773045545, "kl": 0.054443359375, "learning_rate": 9.996269462942284e-07, "loss": 0.0022, "reward": 1.8644375801086426, "reward_std": 0.011946863494813442, "rewards/accuracy_reward": 0.7144374847412109, "rewards/format_reward": 1.0, "step": 1218 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.09375, "epoch": 0.012306915699141847, "grad_norm": 1.5950791018824413, "kl": 0.0673828125, "learning_rate": 9.996263335514437e-07, "loss": 0.0027, "reward": 2.0138437747955322, "reward_std": 0.013898077420890331, "rewards/accuracy_reward": 0.8138437271118164, "rewards/format_reward": 1.0, "step": 1219 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.34375, "epoch": 0.012317011610297829, "grad_norm": 2.0546152084216676, "kl": 0.0732421875, "learning_rate": 9.996257203060434e-07, "loss": 0.0029, "reward": 2.1383748054504395, "reward_std": 0.0525890588760376, "rewards/accuracy_reward": 0.950874924659729, "rewards/format_reward": 1.0, "step": 1220 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.78125, "epoch": 0.012327107521453811, "grad_norm": 1.1752370610381564, "kl": 0.059814453125, "learning_rate": 9.99625106558028e-07, "loss": 0.0024, "reward": 2.085249900817871, "reward_std": 0.0064807310700416565, "rewards/accuracy_reward": 0.8852499723434448, "rewards/format_reward": 1.0, "step": 1221 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.59375, "epoch": 0.012337203432609793, "grad_norm": 2.3351269995865684, "kl": 0.06591796875, "learning_rate": 9.99624492307398e-07, "loss": 0.0026, "reward": 2.021843910217285, "reward_std": 0.0418701171875, "rewards/accuracy_reward": 0.8218437433242798, "rewards/format_reward": 1.0, "step": 1222 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.28125, "epoch": 0.012347299343765775, "grad_norm": 2.293266188346439, "kl": 0.07470703125, "learning_rate": 9.996238775541542e-07, "loss": 0.003, "reward": 2.1017813682556152, "reward_std": 0.01308034360408783, "rewards/accuracy_reward": 0.9017812013626099, "rewards/format_reward": 1.0, "step": 1223 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.8125, "epoch": 0.012357395254921757, "grad_norm": 5.912592879973718, "kl": 0.0625, "learning_rate": 9.996232622982969e-07, "loss": 0.0025, "reward": 2.1228437423706055, "reward_std": 0.012293796986341476, "rewards/accuracy_reward": 0.9228437542915344, "rewards/format_reward": 1.0, "step": 1224 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.1875, "epoch": 0.012367491166077738, "grad_norm": 2.9122596922718755, "kl": 0.06787109375, "learning_rate": 9.996226465398271e-07, "loss": 0.0027, "reward": 2.108375072479248, "reward_std": 0.02066243439912796, "rewards/accuracy_reward": 0.9083750247955322, "rewards/format_reward": 1.0, "step": 1225 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.625, "epoch": 0.01237758707723372, "grad_norm": 1.6866966569454849, "kl": 0.06298828125, "learning_rate": 9.996220302787452e-07, "loss": 0.0025, "reward": 2.1033124923706055, "reward_std": 0.025958046317100525, "rewards/accuracy_reward": 0.9095625281333923, "rewards/format_reward": 1.0, "step": 1226 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.71875, "epoch": 0.012387682988389702, "grad_norm": 3.755421108468443, "kl": 0.06298828125, "learning_rate": 9.996214135150517e-07, "loss": 0.0025, "reward": 1.8263437747955322, "reward_std": 0.021616032347083092, "rewards/accuracy_reward": 0.6763437986373901, "rewards/format_reward": 1.0, "step": 1227 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.28125, "epoch": 0.012397778899545684, "grad_norm": 3.3554877718098925, "kl": 0.06640625, "learning_rate": 9.996207962487477e-07, "loss": 0.0027, "reward": 1.9912188053131104, "reward_std": 0.05111008882522583, "rewards/accuracy_reward": 0.7974687218666077, "rewards/format_reward": 1.0, "step": 1228 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.59375, "epoch": 0.012407874810701667, "grad_norm": 2.520678502819356, "kl": 0.06689453125, "learning_rate": 9.996201784798332e-07, "loss": 0.0027, "reward": 1.9869375228881836, "reward_std": 0.030550027266144753, "rewards/accuracy_reward": 0.7931874990463257, "rewards/format_reward": 1.0, "step": 1229 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 411.625, "epoch": 0.012417970721857647, "grad_norm": 2.713147825887974, "kl": 0.0712890625, "learning_rate": 9.996195602083095e-07, "loss": 0.0028, "reward": 1.738781213760376, "reward_std": 0.025612134486436844, "rewards/accuracy_reward": 0.5887812972068787, "rewards/format_reward": 1.0, "step": 1230 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.9375, "epoch": 0.012428066633013629, "grad_norm": 5.150648361424884, "kl": 0.076171875, "learning_rate": 9.996189414341765e-07, "loss": 0.0031, "reward": 2.027562379837036, "reward_std": 0.020763028413057327, "rewards/accuracy_reward": 0.8275625705718994, "rewards/format_reward": 1.0, "step": 1231 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.8125, "epoch": 0.012438162544169611, "grad_norm": 2.3313416622490384, "kl": 0.0625, "learning_rate": 9.996183221574352e-07, "loss": 0.0025, "reward": 2.1274688243865967, "reward_std": 0.04007156193256378, "rewards/accuracy_reward": 0.9399687647819519, "rewards/format_reward": 1.0, "step": 1232 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.4375, "epoch": 0.012448258455325593, "grad_norm": 2.2271068549650623, "kl": 0.072265625, "learning_rate": 9.996177023780863e-07, "loss": 0.0029, "reward": 2.1133127212524414, "reward_std": 0.014969069510698318, "rewards/accuracy_reward": 0.9133124947547913, "rewards/format_reward": 1.0, "step": 1233 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.375, "epoch": 0.012458354366481576, "grad_norm": 2.4811522556545933, "kl": 0.0625, "learning_rate": 9.996170820961305e-07, "loss": 0.0025, "reward": 1.8024063110351562, "reward_std": 0.10786203294992447, "rewards/accuracy_reward": 0.6524062156677246, "rewards/format_reward": 1.0, "step": 1234 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.4375, "epoch": 0.012468450277637556, "grad_norm": 1.7732570620936448, "kl": 0.061279296875, "learning_rate": 9.996164613115678e-07, "loss": 0.0025, "reward": 2.0885000228881836, "reward_std": 0.018276948481798172, "rewards/accuracy_reward": 0.8885000944137573, "rewards/format_reward": 1.0, "step": 1235 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.375, "epoch": 0.012478546188793538, "grad_norm": 2.088535251986974, "kl": 0.0703125, "learning_rate": 9.996158400243996e-07, "loss": 0.0028, "reward": 1.8749375343322754, "reward_std": 0.03127970173954964, "rewards/accuracy_reward": 0.7374375462532043, "rewards/format_reward": 1.0, "step": 1236 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.3125, "epoch": 0.01248864209994952, "grad_norm": 2.0354817497769018, "kl": 0.0751953125, "learning_rate": 9.99615218234626e-07, "loss": 0.003, "reward": 1.9521563053131104, "reward_std": 0.01239472534507513, "rewards/accuracy_reward": 0.7521562576293945, "rewards/format_reward": 1.0, "step": 1237 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.71875, "epoch": 0.012498738011105502, "grad_norm": 1.8353199135526161, "kl": 0.0703125, "learning_rate": 9.996145959422482e-07, "loss": 0.0028, "reward": 2.105156421661377, "reward_std": 0.011955379508435726, "rewards/accuracy_reward": 0.9051561951637268, "rewards/format_reward": 1.0, "step": 1238 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.78125, "epoch": 0.012508833922261485, "grad_norm": 2.2150159551085693, "kl": 0.072265625, "learning_rate": 9.996139731472659e-07, "loss": 0.0029, "reward": 2.0734376907348633, "reward_std": 0.023725636303424835, "rewards/accuracy_reward": 0.8734375238418579, "rewards/format_reward": 1.0, "step": 1239 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.96875, "epoch": 0.012518929833417465, "grad_norm": 2.17649135913275, "kl": 0.053955078125, "learning_rate": 9.996133498496805e-07, "loss": 0.0022, "reward": 2.135999917984009, "reward_std": 0.016405686736106873, "rewards/accuracy_reward": 0.9359999895095825, "rewards/format_reward": 1.0, "step": 1240 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.03125, "epoch": 0.012529025744573447, "grad_norm": 2.4212598541608092, "kl": 0.06640625, "learning_rate": 9.996127260494925e-07, "loss": 0.0027, "reward": 1.84375, "reward_std": 0.16964933276176453, "rewards/accuracy_reward": 0.6812499761581421, "rewards/format_reward": 1.0, "step": 1241 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.0, "epoch": 0.01253912165572943, "grad_norm": 2.490525378640112, "kl": 0.06787109375, "learning_rate": 9.996121017467024e-07, "loss": 0.0027, "reward": 2.0666251182556152, "reward_std": 0.14795340597629547, "rewards/accuracy_reward": 0.8791249990463257, "rewards/format_reward": 1.0, "step": 1242 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.5, "epoch": 0.012549217566885412, "grad_norm": 2.336343150555205, "kl": 0.0654296875, "learning_rate": 9.99611476941311e-07, "loss": 0.0026, "reward": 2.06459379196167, "reward_std": 0.04782000184059143, "rewards/accuracy_reward": 0.8833438158035278, "rewards/format_reward": 1.0, "step": 1243 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.96875, "epoch": 0.012559313478041394, "grad_norm": 1.8793687772725802, "kl": 0.0654296875, "learning_rate": 9.996108516333183e-07, "loss": 0.0026, "reward": 2.104062557220459, "reward_std": 0.014029006473720074, "rewards/accuracy_reward": 0.9040625095367432, "rewards/format_reward": 1.0, "step": 1244 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.03125, "epoch": 0.012569409389197376, "grad_norm": 2.489502067333611, "kl": 0.0625, "learning_rate": 9.996102258227257e-07, "loss": 0.0025, "reward": 1.968093752861023, "reward_std": 0.01926513761281967, "rewards/accuracy_reward": 0.7680937647819519, "rewards/format_reward": 1.0, "step": 1245 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 439.40625, "epoch": 0.012579505300353356, "grad_norm": 2.021251736704663, "kl": 0.057861328125, "learning_rate": 9.996095995095336e-07, "loss": 0.0023, "reward": 2.128718852996826, "reward_std": 0.029451079666614532, "rewards/accuracy_reward": 0.9412187933921814, "rewards/format_reward": 1.0, "step": 1246 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.46875, "epoch": 0.012589601211509338, "grad_norm": 2.5172806008519992, "kl": 0.06005859375, "learning_rate": 9.996089726937425e-07, "loss": 0.0024, "reward": 2.1328125, "reward_std": 0.024756183847784996, "rewards/accuracy_reward": 0.9328125715255737, "rewards/format_reward": 1.0, "step": 1247 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 437.46875, "epoch": 0.01259969712266532, "grad_norm": 5.907395183936658, "kl": 0.06396484375, "learning_rate": 9.996083453753531e-07, "loss": 0.0026, "reward": 2.01924991607666, "reward_std": 0.1794995814561844, "rewards/accuracy_reward": 0.8442500233650208, "rewards/format_reward": 1.0, "step": 1248 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.9375, "epoch": 0.012609793033821303, "grad_norm": 5.215318012889353, "kl": 0.05712890625, "learning_rate": 9.99607717554366e-07, "loss": 0.0023, "reward": 2.091531276702881, "reward_std": 0.017721835523843765, "rewards/accuracy_reward": 0.891531229019165, "rewards/format_reward": 1.0, "step": 1249 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.1875, "epoch": 0.012619888944977285, "grad_norm": 2.1442469076262447, "kl": 0.0517578125, "learning_rate": 9.996070892307819e-07, "loss": 0.0021, "reward": 2.070000171661377, "reward_std": 0.15854652225971222, "rewards/accuracy_reward": 0.8825000524520874, "rewards/format_reward": 1.0, "step": 1250 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.65625, "epoch": 0.012629984856133265, "grad_norm": 1.8752813721881731, "kl": 0.060546875, "learning_rate": 9.996064604046014e-07, "loss": 0.0024, "reward": 2.1455469131469727, "reward_std": 0.015906188637018204, "rewards/accuracy_reward": 0.9455468654632568, "rewards/format_reward": 1.0, "step": 1251 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 419.9375, "epoch": 0.012640080767289247, "grad_norm": 1.6638932198685714, "kl": 0.04345703125, "learning_rate": 9.996058310758251e-07, "loss": 0.0017, "reward": 1.740687608718872, "reward_std": 0.16251376271247864, "rewards/accuracy_reward": 0.6156874895095825, "rewards/format_reward": 1.0, "step": 1252 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 442.8125, "epoch": 0.01265017667844523, "grad_norm": 3.411487898641729, "kl": 0.053466796875, "learning_rate": 9.996052012444538e-07, "loss": 0.0021, "reward": 1.6040937900543213, "reward_std": 0.14136207103729248, "rewards/accuracy_reward": 0.49784374237060547, "rewards/format_reward": 1.0, "step": 1253 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 434.90625, "epoch": 0.012660272589601212, "grad_norm": 5.90608657834744, "kl": 0.06396484375, "learning_rate": 9.99604570910488e-07, "loss": 0.0026, "reward": 2.18359375, "reward_std": 0.027389660477638245, "rewards/accuracy_reward": 0.9898437261581421, "rewards/format_reward": 1.0, "step": 1254 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 450.34375, "epoch": 0.012670368500757194, "grad_norm": 2.4402550059975776, "kl": 0.0615234375, "learning_rate": 9.99603940073928e-07, "loss": 0.0025, "reward": 1.6985937356948853, "reward_std": 0.14449535310268402, "rewards/accuracy_reward": 0.5735937356948853, "rewards/format_reward": 1.0, "step": 1255 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 447.625, "epoch": 0.012680464411913174, "grad_norm": 2.1671065449150504, "kl": 0.06689453125, "learning_rate": 9.99603308734775e-07, "loss": 0.0027, "reward": 1.7938125133514404, "reward_std": 0.04852333664894104, "rewards/accuracy_reward": 0.6563124656677246, "rewards/format_reward": 1.0, "step": 1256 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 456.21875, "epoch": 0.012690560323069157, "grad_norm": 2.29052043558352, "kl": 0.0712890625, "learning_rate": 9.996026768930295e-07, "loss": 0.0029, "reward": 2.0022501945495605, "reward_std": 0.04705357551574707, "rewards/accuracy_reward": 0.814750075340271, "rewards/format_reward": 1.0, "step": 1257 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 435.03125, "epoch": 0.012700656234225139, "grad_norm": 1.8165720936676637, "kl": 0.05224609375, "learning_rate": 9.99602044548692e-07, "loss": 0.0021, "reward": 1.318750023841858, "reward_std": 0.11153386533260345, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 1258 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 438.59375, "epoch": 0.01271075214538112, "grad_norm": 3.303048161907047, "kl": 0.06591796875, "learning_rate": 9.996014117017628e-07, "loss": 0.0026, "reward": 2.178875207901001, "reward_std": 0.010654782876372337, "rewards/accuracy_reward": 0.9788750410079956, "rewards/format_reward": 1.0, "step": 1259 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.125, "epoch": 0.012720848056537103, "grad_norm": 2.423063172770966, "kl": 0.0751953125, "learning_rate": 9.996007783522433e-07, "loss": 0.003, "reward": 2.0682811737060547, "reward_std": 0.03398808091878891, "rewards/accuracy_reward": 0.8745312094688416, "rewards/format_reward": 1.0, "step": 1260 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.96875, "epoch": 0.012730943967693085, "grad_norm": 3.0169363205732425, "kl": 0.078125, "learning_rate": 9.996001445001336e-07, "loss": 0.0031, "reward": 2.119406223297119, "reward_std": 0.03935016319155693, "rewards/accuracy_reward": 0.9256563186645508, "rewards/format_reward": 1.0, "step": 1261 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.0625, "epoch": 0.012741039878849066, "grad_norm": 1.8021274749014053, "kl": 0.057861328125, "learning_rate": 9.995995101454343e-07, "loss": 0.0023, "reward": 2.140625, "reward_std": 0.016232838854193687, "rewards/accuracy_reward": 0.9406250715255737, "rewards/format_reward": 1.0, "step": 1262 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.0625, "epoch": 0.012751135790005048, "grad_norm": 2.123526786785987, "kl": 0.07470703125, "learning_rate": 9.995988752881466e-07, "loss": 0.003, "reward": 1.8775312900543213, "reward_std": 0.01725463569164276, "rewards/accuracy_reward": 0.6837812662124634, "rewards/format_reward": 1.0, "step": 1263 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.5, "epoch": 0.01276123170116103, "grad_norm": 5.442977512323666, "kl": 0.057861328125, "learning_rate": 9.995982399282706e-07, "loss": 0.0023, "reward": 1.795968770980835, "reward_std": 0.011354215443134308, "rewards/accuracy_reward": 0.6459687352180481, "rewards/format_reward": 1.0, "step": 1264 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.84375, "epoch": 0.012771327612317012, "grad_norm": 2.84358677082522, "kl": 0.06982421875, "learning_rate": 9.99597604065807e-07, "loss": 0.0028, "reward": 2.0719687938690186, "reward_std": 0.030653994530439377, "rewards/accuracy_reward": 0.8782187700271606, "rewards/format_reward": 1.0, "step": 1265 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.375, "epoch": 0.012781423523472994, "grad_norm": 1.5943928878234783, "kl": 0.060546875, "learning_rate": 9.995969677007567e-07, "loss": 0.0024, "reward": 2.156437397003174, "reward_std": 0.039048485457897186, "rewards/accuracy_reward": 0.9626874923706055, "rewards/format_reward": 1.0, "step": 1266 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.012791519434628975, "grad_norm": 2.2684586891746172, "kl": 0.06201171875, "learning_rate": 9.995963308331202e-07, "loss": 0.0025, "reward": 2.07059383392334, "reward_std": 0.033191271126270294, "rewards/accuracy_reward": 0.8705937266349792, "rewards/format_reward": 1.0, "step": 1267 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.125, "epoch": 0.012801615345784957, "grad_norm": 2.5200611561492487, "kl": 0.0556640625, "learning_rate": 9.995956934628979e-07, "loss": 0.0022, "reward": 1.7798750400543213, "reward_std": 0.012680601328611374, "rewards/accuracy_reward": 0.6298750042915344, "rewards/format_reward": 1.0, "step": 1268 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 384.1875, "epoch": 0.012811711256940939, "grad_norm": 6.106934884987925, "kl": 0.0634765625, "learning_rate": 9.995950555900907e-07, "loss": 0.0025, "reward": 1.7724688053131104, "reward_std": 0.037547819316387177, "rewards/accuracy_reward": 0.6224687099456787, "rewards/format_reward": 1.0, "step": 1269 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.9375, "epoch": 0.012821807168096921, "grad_norm": 1.4772199316654464, "kl": 0.0517578125, "learning_rate": 9.995944172146992e-07, "loss": 0.0021, "reward": 1.872093677520752, "reward_std": 0.00679505430161953, "rewards/accuracy_reward": 0.7220937013626099, "rewards/format_reward": 1.0, "step": 1270 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.6875, "epoch": 0.012831903079252903, "grad_norm": 2.9737475148295878, "kl": 0.058837890625, "learning_rate": 9.995937783367244e-07, "loss": 0.0024, "reward": 1.8698437213897705, "reward_std": 0.027287179604172707, "rewards/accuracy_reward": 0.7260937690734863, "rewards/format_reward": 1.0, "step": 1271 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.96875, "epoch": 0.012841998990408884, "grad_norm": 2.712146000957133, "kl": 0.06494140625, "learning_rate": 9.995931389561662e-07, "loss": 0.0026, "reward": 2.1028122901916504, "reward_std": 0.022742610424757004, "rewards/accuracy_reward": 0.9028124809265137, "rewards/format_reward": 1.0, "step": 1272 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.012852094901564866, "grad_norm": 1.9331344935810126, "kl": 0.0595703125, "learning_rate": 9.995924990730256e-07, "loss": 0.0024, "reward": 2.1141250133514404, "reward_std": 0.019881583750247955, "rewards/accuracy_reward": 0.9141249656677246, "rewards/format_reward": 1.0, "step": 1273 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.25, "epoch": 0.012862190812720848, "grad_norm": 2.6984186198616364, "kl": 0.0498046875, "learning_rate": 9.995918586873035e-07, "loss": 0.002, "reward": 1.9558438062667847, "reward_std": 0.1597076952457428, "rewards/accuracy_reward": 0.7933437824249268, "rewards/format_reward": 1.0, "step": 1274 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.6875, "epoch": 0.01287228672387683, "grad_norm": 1.1825778715258841, "kl": 0.059326171875, "learning_rate": 9.995912177990002e-07, "loss": 0.0024, "reward": 2.000093936920166, "reward_std": 0.007885274477303028, "rewards/accuracy_reward": 0.8000937700271606, "rewards/format_reward": 1.0, "step": 1275 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.625, "epoch": 0.012882382635032812, "grad_norm": 1.520774580539589, "kl": 0.05712890625, "learning_rate": 9.995905764081164e-07, "loss": 0.0023, "reward": 1.857812523841858, "reward_std": 0.0036978540010750294, "rewards/accuracy_reward": 0.707812488079071, "rewards/format_reward": 1.0, "step": 1276 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 373.625, "epoch": 0.012892478546188793, "grad_norm": 1.5582207642416894, "kl": 0.04931640625, "learning_rate": 9.99589934514653e-07, "loss": 0.002, "reward": 2.1659374237060547, "reward_std": 0.022556865587830544, "rewards/accuracy_reward": 0.9721875786781311, "rewards/format_reward": 1.0, "step": 1277 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.5, "epoch": 0.012902574457344775, "grad_norm": 1.650594341017154, "kl": 0.05517578125, "learning_rate": 9.995892921186103e-07, "loss": 0.0022, "reward": 2.153343915939331, "reward_std": 0.02546044811606407, "rewards/accuracy_reward": 0.9595937728881836, "rewards/format_reward": 1.0, "step": 1278 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.1875, "epoch": 0.012912670368500757, "grad_norm": 1.5118075484262194, "kl": 0.06201171875, "learning_rate": 9.99588649219989e-07, "loss": 0.0025, "reward": 2.03725004196167, "reward_std": 0.01036485843360424, "rewards/accuracy_reward": 0.8372499346733093, "rewards/format_reward": 1.0, "step": 1279 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.65625, "epoch": 0.01292276627965674, "grad_norm": 2.6114488832636606, "kl": 0.06640625, "learning_rate": 9.9958800581879e-07, "loss": 0.0026, "reward": 2.0344061851501465, "reward_std": 0.0651211142539978, "rewards/accuracy_reward": 0.8344062566757202, "rewards/format_reward": 1.0, "step": 1280 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.5, "epoch": 0.012932862190812721, "grad_norm": 2.2850545065151073, "kl": 0.0615234375, "learning_rate": 9.995873619150138e-07, "loss": 0.0025, "reward": 1.9709062576293945, "reward_std": 0.16299158334732056, "rewards/accuracy_reward": 0.8021562099456787, "rewards/format_reward": 1.0, "step": 1281 }, { "all_correct": 0.25, "all_wrong": 0.75, "completion_length": 391.3125, "epoch": 0.012942958101968704, "grad_norm": 0.0999395896813252, "kl": 0.03759765625, "learning_rate": 9.99586717508661e-07, "loss": 0.0015, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 1282 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 380.3125, "epoch": 0.012953054013124684, "grad_norm": 5.302240309467636, "kl": 0.059326171875, "learning_rate": 9.995860725997324e-07, "loss": 0.0024, "reward": 1.7904374599456787, "reward_std": 0.017597293481230736, "rewards/accuracy_reward": 0.6404374837875366, "rewards/format_reward": 1.0, "step": 1283 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.71875, "epoch": 0.012963149924280666, "grad_norm": 2.4494338340472903, "kl": 0.0595703125, "learning_rate": 9.995854271882284e-07, "loss": 0.0024, "reward": 1.7582812309265137, "reward_std": 0.1068640798330307, "rewards/accuracy_reward": 0.6082811951637268, "rewards/format_reward": 1.0, "step": 1284 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.012973245835436648, "grad_norm": 1.4416831782580526, "kl": 0.04443359375, "learning_rate": 9.995847812741497e-07, "loss": 0.0018, "reward": 2.122499942779541, "reward_std": 0.1237436980009079, "rewards/accuracy_reward": 0.934999942779541, "rewards/format_reward": 1.0, "step": 1285 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.3125, "epoch": 0.01298334174659263, "grad_norm": 3.0176711454959255, "kl": 0.061279296875, "learning_rate": 9.995841348574974e-07, "loss": 0.0024, "reward": 2.1606249809265137, "reward_std": 0.009771675802767277, "rewards/accuracy_reward": 0.9606249928474426, "rewards/format_reward": 1.0, "step": 1286 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.28125, "epoch": 0.012993437657748613, "grad_norm": 2.9844472073090076, "kl": 0.062255859375, "learning_rate": 9.995834879382714e-07, "loss": 0.0025, "reward": 2.1000313758850098, "reward_std": 0.05344241112470627, "rewards/accuracy_reward": 0.9125311970710754, "rewards/format_reward": 1.0, "step": 1287 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 375.78125, "epoch": 0.013003533568904593, "grad_norm": 2.14147269571123, "kl": 0.04931640625, "learning_rate": 9.99582840516473e-07, "loss": 0.002, "reward": 2.1377813816070557, "reward_std": 0.056748099625110626, "rewards/accuracy_reward": 0.9627812504768372, "rewards/format_reward": 1.0, "step": 1288 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 375.78125, "epoch": 0.013013629480060575, "grad_norm": 2.087914435104956, "kl": 0.06298828125, "learning_rate": 9.995821925921024e-07, "loss": 0.0025, "reward": 2.1467812061309814, "reward_std": 0.013432761654257774, "rewards/accuracy_reward": 0.9467812180519104, "rewards/format_reward": 1.0, "step": 1289 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.9375, "epoch": 0.013023725391216557, "grad_norm": 2.2694094316900126, "kl": 0.0625, "learning_rate": 9.995815441651605e-07, "loss": 0.0025, "reward": 2.102937698364258, "reward_std": 0.04673750698566437, "rewards/accuracy_reward": 0.9154375195503235, "rewards/format_reward": 1.0, "step": 1290 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.03125, "epoch": 0.01303382130237254, "grad_norm": 2.089823883582369, "kl": 0.0556640625, "learning_rate": 9.99580895235648e-07, "loss": 0.0022, "reward": 1.835296869277954, "reward_std": 0.014595989137887955, "rewards/accuracy_reward": 0.685296893119812, "rewards/format_reward": 1.0, "step": 1291 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.125, "epoch": 0.013043917213528522, "grad_norm": 1.4775953705645026, "kl": 0.05029296875, "learning_rate": 9.995802458035653e-07, "loss": 0.002, "reward": 2.12081241607666, "reward_std": 0.018950629979372025, "rewards/accuracy_reward": 0.9208124279975891, "rewards/format_reward": 1.0, "step": 1292 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.3125, "epoch": 0.013054013124684502, "grad_norm": 2.3509646265361495, "kl": 0.06591796875, "learning_rate": 9.995795958689132e-07, "loss": 0.0026, "reward": 2.1381564140319824, "reward_std": 0.01633395254611969, "rewards/accuracy_reward": 0.938156247138977, "rewards/format_reward": 1.0, "step": 1293 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.03125, "epoch": 0.013064109035840484, "grad_norm": 1.410273077102216, "kl": 0.05615234375, "learning_rate": 9.995789454316926e-07, "loss": 0.0022, "reward": 1.846250057220459, "reward_std": 0.008210497908294201, "rewards/accuracy_reward": 0.6962500214576721, "rewards/format_reward": 1.0, "step": 1294 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.84375, "epoch": 0.013074204946996466, "grad_norm": 5.909247798342069, "kl": 0.0751953125, "learning_rate": 9.995782944919035e-07, "loss": 0.003, "reward": 2.0627188682556152, "reward_std": 0.02784672938287258, "rewards/accuracy_reward": 0.8627187013626099, "rewards/format_reward": 1.0, "step": 1295 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.8125, "epoch": 0.013084300858152449, "grad_norm": 1.587889552803011, "kl": 0.0634765625, "learning_rate": 9.995776430495472e-07, "loss": 0.0025, "reward": 2.060281276702881, "reward_std": 0.006898585706949234, "rewards/accuracy_reward": 0.860281229019165, "rewards/format_reward": 1.0, "step": 1296 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.5625, "epoch": 0.01309439676930843, "grad_norm": 2.2565648190529974, "kl": 0.05419921875, "learning_rate": 9.99576991104624e-07, "loss": 0.0022, "reward": 2.018937587738037, "reward_std": 0.172810360789299, "rewards/accuracy_reward": 0.8439375162124634, "rewards/format_reward": 1.0, "step": 1297 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 387.59375, "epoch": 0.013104492680464411, "grad_norm": 2.291625749958945, "kl": 0.0595703125, "learning_rate": 9.99576338657135e-07, "loss": 0.0024, "reward": 1.7867969274520874, "reward_std": 0.029625754803419113, "rewards/accuracy_reward": 0.6430468559265137, "rewards/format_reward": 1.0, "step": 1298 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 404.34375, "epoch": 0.013114588591620393, "grad_norm": 2.202782328398808, "kl": 0.0625, "learning_rate": 9.995756857070802e-07, "loss": 0.0025, "reward": 1.7937812805175781, "reward_std": 0.05346707999706268, "rewards/accuracy_reward": 0.6437812447547913, "rewards/format_reward": 1.0, "step": 1299 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 387.875, "epoch": 0.013124684502776375, "grad_norm": 1.8239728484514555, "kl": 0.0595703125, "learning_rate": 9.995750322544608e-07, "loss": 0.0024, "reward": 1.7804062366485596, "reward_std": 0.016080312430858612, "rewards/accuracy_reward": 0.6304062604904175, "rewards/format_reward": 1.0, "step": 1300 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.15625, "epoch": 0.013134780413932358, "grad_norm": 1.0927025984983665, "kl": 0.0498046875, "learning_rate": 9.99574378299277e-07, "loss": 0.002, "reward": 2.131999969482422, "reward_std": 0.0061731114983558655, "rewards/accuracy_reward": 0.9320000410079956, "rewards/format_reward": 1.0, "step": 1301 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 411.84375, "epoch": 0.01314487632508834, "grad_norm": 1.3585694616358102, "kl": 0.043212890625, "learning_rate": 9.995737238415298e-07, "loss": 0.0017, "reward": 1.8793749809265137, "reward_std": 0.005318422336131334, "rewards/accuracy_reward": 0.7293750047683716, "rewards/format_reward": 1.0, "step": 1302 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 395.8125, "epoch": 0.013154972236244322, "grad_norm": 1.3531095972717069, "kl": 0.056640625, "learning_rate": 9.995730688812199e-07, "loss": 0.0023, "reward": 1.8904062509536743, "reward_std": 0.007244979962706566, "rewards/accuracy_reward": 0.7404062747955322, "rewards/format_reward": 1.0, "step": 1303 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.25, "epoch": 0.013165068147400302, "grad_norm": 1.1384731066106242, "kl": 0.0478515625, "learning_rate": 9.995724134183476e-07, "loss": 0.0019, "reward": 1.888562560081482, "reward_std": 0.020672937855124474, "rewards/accuracy_reward": 0.7448124885559082, "rewards/format_reward": 1.0, "step": 1304 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.1875, "epoch": 0.013175164058556284, "grad_norm": 1.8114717242137632, "kl": 0.057373046875, "learning_rate": 9.995717574529139e-07, "loss": 0.0023, "reward": 1.8740313053131104, "reward_std": 0.013084547594189644, "rewards/accuracy_reward": 0.7240312099456787, "rewards/format_reward": 1.0, "step": 1305 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.28125, "epoch": 0.013185259969712267, "grad_norm": 2.4127416433785784, "kl": 0.064453125, "learning_rate": 9.995711009849192e-07, "loss": 0.0026, "reward": 2.0222811698913574, "reward_std": 0.04140947014093399, "rewards/accuracy_reward": 0.834781289100647, "rewards/format_reward": 1.0, "step": 1306 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.5625, "epoch": 0.013195355880868249, "grad_norm": 2.101923039331027, "kl": 0.05908203125, "learning_rate": 9.995704440143645e-07, "loss": 0.0024, "reward": 1.8351876735687256, "reward_std": 0.01242642942816019, "rewards/accuracy_reward": 0.6851875185966492, "rewards/format_reward": 1.0, "step": 1307 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.46875, "epoch": 0.013205451792024231, "grad_norm": 2.146419126647421, "kl": 0.06982421875, "learning_rate": 9.995697865412503e-07, "loss": 0.0028, "reward": 2.1181564331054688, "reward_std": 0.021742530167102814, "rewards/accuracy_reward": 0.9181562662124634, "rewards/format_reward": 1.0, "step": 1308 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.3125, "epoch": 0.013215547703180211, "grad_norm": 2.1478328350111173, "kl": 0.0634765625, "learning_rate": 9.99569128565577e-07, "loss": 0.0025, "reward": 1.8609373569488525, "reward_std": 0.026062097400426865, "rewards/accuracy_reward": 0.7171875238418579, "rewards/format_reward": 1.0, "step": 1309 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 399.78125, "epoch": 0.013225643614336193, "grad_norm": 1.4607780564692614, "kl": 0.0537109375, "learning_rate": 9.995684700873457e-07, "loss": 0.0021, "reward": 1.366781234741211, "reward_std": 0.14307427406311035, "rewards/accuracy_reward": 0.30428123474121094, "rewards/format_reward": 1.0, "step": 1310 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.9375, "epoch": 0.013235739525492176, "grad_norm": 2.5149949870261974, "kl": 0.06396484375, "learning_rate": 9.995678111065567e-07, "loss": 0.0026, "reward": 2.0809688568115234, "reward_std": 0.017543349415063858, "rewards/accuracy_reward": 0.8809688091278076, "rewards/format_reward": 1.0, "step": 1311 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.90625, "epoch": 0.013245835436648158, "grad_norm": 2.1574004127111714, "kl": 0.056640625, "learning_rate": 9.995671516232107e-07, "loss": 0.0023, "reward": 2.0263750553131104, "reward_std": 0.03523516654968262, "rewards/accuracy_reward": 0.8388749957084656, "rewards/format_reward": 1.0, "step": 1312 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.40625, "epoch": 0.01325593134780414, "grad_norm": 3.124754792819082, "kl": 0.07080078125, "learning_rate": 9.995664916373087e-07, "loss": 0.0028, "reward": 2.0395936965942383, "reward_std": 0.023279765620827675, "rewards/accuracy_reward": 0.8395938277244568, "rewards/format_reward": 1.0, "step": 1313 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 421.125, "epoch": 0.01326602725896012, "grad_norm": 1.0714710386094264, "kl": 0.0439453125, "learning_rate": 9.99565831148851e-07, "loss": 0.0018, "reward": 1.3624999523162842, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 1314 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.013276123170116103, "grad_norm": 2.295707649718421, "kl": 0.068359375, "learning_rate": 9.995651701578384e-07, "loss": 0.0027, "reward": 2.0631251335144043, "reward_std": 0.016491370275616646, "rewards/accuracy_reward": 0.8631250858306885, "rewards/format_reward": 1.0, "step": 1315 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.875, "epoch": 0.013286219081272085, "grad_norm": 2.4955599588478172, "kl": 0.0712890625, "learning_rate": 9.995645086642718e-07, "loss": 0.0028, "reward": 2.1765313148498535, "reward_std": 0.024239113554358482, "rewards/accuracy_reward": 0.9827812314033508, "rewards/format_reward": 1.0, "step": 1316 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.75, "epoch": 0.013296314992428067, "grad_norm": 5.798029059152878, "kl": 0.06884765625, "learning_rate": 9.995638466681513e-07, "loss": 0.0028, "reward": 1.8841875791549683, "reward_std": 0.03121303766965866, "rewards/accuracy_reward": 0.6904374361038208, "rewards/format_reward": 1.0, "step": 1317 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.75, "epoch": 0.013306410903584049, "grad_norm": 2.371402486875247, "kl": 0.06494140625, "learning_rate": 9.99563184169478e-07, "loss": 0.0026, "reward": 2.094437599182129, "reward_std": 0.03184982389211655, "rewards/accuracy_reward": 0.8944374918937683, "rewards/format_reward": 1.0, "step": 1318 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.25, "epoch": 0.01331650681474003, "grad_norm": 2.354995783289872, "kl": 0.06396484375, "learning_rate": 9.995625211682525e-07, "loss": 0.0026, "reward": 1.7578749656677246, "reward_std": 0.019864343106746674, "rewards/accuracy_reward": 0.6078749895095825, "rewards/format_reward": 1.0, "step": 1319 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.6875, "epoch": 0.013326602725896012, "grad_norm": 1.7502390114319624, "kl": 0.06298828125, "learning_rate": 9.995618576644755e-07, "loss": 0.0025, "reward": 1.8251874446868896, "reward_std": 0.008528182283043861, "rewards/accuracy_reward": 0.6751875281333923, "rewards/format_reward": 1.0, "step": 1320 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.125, "epoch": 0.013336698637051994, "grad_norm": 2.332436105662824, "kl": 0.0546875, "learning_rate": 9.995611936581474e-07, "loss": 0.0022, "reward": 1.8387501239776611, "reward_std": 0.29255953431129456, "rewards/accuracy_reward": 0.7012499570846558, "rewards/format_reward": 1.0, "step": 1321 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.013346794548207976, "grad_norm": 2.1486807788099607, "kl": 0.05322265625, "learning_rate": 9.995605291492692e-07, "loss": 0.0021, "reward": 2.1381874084472656, "reward_std": 0.03112347424030304, "rewards/accuracy_reward": 0.9444374442100525, "rewards/format_reward": 1.0, "step": 1322 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 420.0, "epoch": 0.013356890459363958, "grad_norm": 1.6655382235357368, "kl": 0.0615234375, "learning_rate": 9.995598641378415e-07, "loss": 0.0025, "reward": 1.72865629196167, "reward_std": 0.008527582511305809, "rewards/accuracy_reward": 0.5786562561988831, "rewards/format_reward": 1.0, "step": 1323 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.8125, "epoch": 0.01336698637051994, "grad_norm": 2.345025289960375, "kl": 0.0615234375, "learning_rate": 9.99559198623865e-07, "loss": 0.0025, "reward": 2.144437551498413, "reward_std": 0.02286599949002266, "rewards/accuracy_reward": 0.9444375038146973, "rewards/format_reward": 1.0, "step": 1324 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.8125, "epoch": 0.01337708228167592, "grad_norm": 3.1672876508411925, "kl": 0.0498046875, "learning_rate": 9.995585326073398e-07, "loss": 0.002, "reward": 1.9906251430511475, "reward_std": 0.18123088777065277, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1325 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.0625, "epoch": 0.013387178192831903, "grad_norm": 3.640498874591153, "kl": 0.06298828125, "learning_rate": 9.995578660882676e-07, "loss": 0.0025, "reward": 2.108687400817871, "reward_std": 0.0057426802814006805, "rewards/accuracy_reward": 0.9086874723434448, "rewards/format_reward": 1.0, "step": 1326 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.013397274103987885, "grad_norm": 2.1408499552459364, "kl": 0.060302734375, "learning_rate": 9.995571990666481e-07, "loss": 0.0024, "reward": 2.1137499809265137, "reward_std": 0.04597185179591179, "rewards/accuracy_reward": 0.9325000047683716, "rewards/format_reward": 1.0, "step": 1327 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.65625, "epoch": 0.013407370015143867, "grad_norm": 2.4453540987074143, "kl": 0.0654296875, "learning_rate": 9.995565315424826e-07, "loss": 0.0026, "reward": 2.0816562175750732, "reward_std": 0.02369699813425541, "rewards/accuracy_reward": 0.881656289100647, "rewards/format_reward": 1.0, "step": 1328 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.5, "epoch": 0.01341746592629985, "grad_norm": 2.0521049642158995, "kl": 0.06103515625, "learning_rate": 9.995558635157715e-07, "loss": 0.0024, "reward": 1.8802499771118164, "reward_std": 0.010018551722168922, "rewards/accuracy_reward": 0.7302500009536743, "rewards/format_reward": 1.0, "step": 1329 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.71875, "epoch": 0.01342756183745583, "grad_norm": 2.5295828170619172, "kl": 0.0654296875, "learning_rate": 9.995551949865156e-07, "loss": 0.0026, "reward": 2.107468843460083, "reward_std": 0.009504757821559906, "rewards/accuracy_reward": 0.9074687361717224, "rewards/format_reward": 1.0, "step": 1330 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.90625, "epoch": 0.013437657748611812, "grad_norm": 1.822726448456917, "kl": 0.06640625, "learning_rate": 9.995545259547154e-07, "loss": 0.0027, "reward": 1.7805938720703125, "reward_std": 0.010086748749017715, "rewards/accuracy_reward": 0.6305937170982361, "rewards/format_reward": 1.0, "step": 1331 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.25, "epoch": 0.013447753659767794, "grad_norm": 4.6055699580648115, "kl": 0.06591796875, "learning_rate": 9.995538564203717e-07, "loss": 0.0026, "reward": 2.158719062805176, "reward_std": 0.030975300818681717, "rewards/accuracy_reward": 0.9649688005447388, "rewards/format_reward": 1.0, "step": 1332 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.5, "epoch": 0.013457849570923776, "grad_norm": 2.310612184361774, "kl": 0.06494140625, "learning_rate": 9.99553186383485e-07, "loss": 0.0026, "reward": 1.9410936832427979, "reward_std": 0.011746946722269058, "rewards/accuracy_reward": 0.7410937547683716, "rewards/format_reward": 1.0, "step": 1333 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.46875, "epoch": 0.013467945482079758, "grad_norm": 2.0382801331959333, "kl": 0.062255859375, "learning_rate": 9.995525158440564e-07, "loss": 0.0025, "reward": 2.0116562843322754, "reward_std": 0.03873772546648979, "rewards/accuracy_reward": 0.8304062485694885, "rewards/format_reward": 1.0, "step": 1334 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.125, "epoch": 0.013478041393235739, "grad_norm": 3.850207418997893, "kl": 0.07470703125, "learning_rate": 9.995518448020861e-07, "loss": 0.003, "reward": 2.1156249046325684, "reward_std": 0.01911686360836029, "rewards/accuracy_reward": 0.9156249761581421, "rewards/format_reward": 1.0, "step": 1335 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.15625, "epoch": 0.013488137304391721, "grad_norm": 2.3403761209569165, "kl": 0.0703125, "learning_rate": 9.99551173257575e-07, "loss": 0.0028, "reward": 1.7744687795639038, "reward_std": 0.009561605751514435, "rewards/accuracy_reward": 0.6244687438011169, "rewards/format_reward": 1.0, "step": 1336 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.5625, "epoch": 0.013498233215547703, "grad_norm": 2.650652799783469, "kl": 0.07373046875, "learning_rate": 9.995505012105237e-07, "loss": 0.003, "reward": 2.025624990463257, "reward_std": 0.02145022340118885, "rewards/accuracy_reward": 0.8256250619888306, "rewards/format_reward": 1.0, "step": 1337 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.53125, "epoch": 0.013508329126703685, "grad_norm": 2.9473171863217025, "kl": 0.06884765625, "learning_rate": 9.99549828660933e-07, "loss": 0.0027, "reward": 2.0462188720703125, "reward_std": 0.015459578484296799, "rewards/accuracy_reward": 0.8462187051773071, "rewards/format_reward": 1.0, "step": 1338 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.21875, "epoch": 0.013518425037859667, "grad_norm": 1.9196053879160953, "kl": 0.0693359375, "learning_rate": 9.995491556088037e-07, "loss": 0.0028, "reward": 2.1302499771118164, "reward_std": 0.014291658997535706, "rewards/accuracy_reward": 0.9302499890327454, "rewards/format_reward": 1.0, "step": 1339 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 431.84375, "epoch": 0.01352852094901565, "grad_norm": 6.182541336963471, "kl": 0.056396484375, "learning_rate": 9.99548482054136e-07, "loss": 0.0023, "reward": 1.8590625524520874, "reward_std": 0.03014282137155533, "rewards/accuracy_reward": 0.7215625047683716, "rewards/format_reward": 1.0, "step": 1340 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.875, "epoch": 0.01353861686017163, "grad_norm": 2.7391126409391253, "kl": 0.06494140625, "learning_rate": 9.99547807996931e-07, "loss": 0.0026, "reward": 1.8554375171661377, "reward_std": 0.10700251162052155, "rewards/accuracy_reward": 0.7054374814033508, "rewards/format_reward": 1.0, "step": 1341 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.4375, "epoch": 0.013548712771327612, "grad_norm": 2.806618998206371, "kl": 0.0673828125, "learning_rate": 9.99547133437189e-07, "loss": 0.0027, "reward": 1.9460937976837158, "reward_std": 0.01850486360490322, "rewards/accuracy_reward": 0.74609375, "rewards/format_reward": 1.0, "step": 1342 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.84375, "epoch": 0.013558808682483594, "grad_norm": 2.4180239271558017, "kl": 0.068359375, "learning_rate": 9.99546458374911e-07, "loss": 0.0027, "reward": 2.080749988555908, "reward_std": 0.03277762979269028, "rewards/accuracy_reward": 0.8932499289512634, "rewards/format_reward": 1.0, "step": 1343 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.25, "epoch": 0.013568904593639576, "grad_norm": 3.5957451723406866, "kl": 0.06884765625, "learning_rate": 9.995457828100976e-07, "loss": 0.0028, "reward": 1.7915936708450317, "reward_std": 0.006602992303669453, "rewards/accuracy_reward": 0.6415937542915344, "rewards/format_reward": 1.0, "step": 1344 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.125, "epoch": 0.013579000504795559, "grad_norm": 2.32580889125143, "kl": 0.07861328125, "learning_rate": 9.995451067427497e-07, "loss": 0.0031, "reward": 2.0139689445495605, "reward_std": 0.024254029616713524, "rewards/accuracy_reward": 0.8202187418937683, "rewards/format_reward": 1.0, "step": 1345 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.3125, "epoch": 0.013589096415951539, "grad_norm": 6.080450163695951, "kl": 0.0625, "learning_rate": 9.995444301728675e-07, "loss": 0.0025, "reward": 2.008906364440918, "reward_std": 0.05269595980644226, "rewards/accuracy_reward": 0.8151562213897705, "rewards/format_reward": 1.0, "step": 1346 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.3125, "epoch": 0.013599192327107521, "grad_norm": 2.2122585416119005, "kl": 0.0703125, "learning_rate": 9.99543753100452e-07, "loss": 0.0028, "reward": 2.153656244277954, "reward_std": 0.016984492540359497, "rewards/accuracy_reward": 0.9536562561988831, "rewards/format_reward": 1.0, "step": 1347 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.9375, "epoch": 0.013609288238263503, "grad_norm": 7.451325271667042, "kl": 0.07421875, "learning_rate": 9.995430755255037e-07, "loss": 0.003, "reward": 2.142500162124634, "reward_std": 0.015737200155854225, "rewards/accuracy_reward": 0.9424999952316284, "rewards/format_reward": 1.0, "step": 1348 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.013619384149419485, "grad_norm": 2.8660460001053996, "kl": 0.06494140625, "learning_rate": 9.995423974480235e-07, "loss": 0.0026, "reward": 1.8169686794281006, "reward_std": 0.11265237629413605, "rewards/accuracy_reward": 0.6669687628746033, "rewards/format_reward": 1.0, "step": 1349 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 415.5625, "epoch": 0.013629480060575468, "grad_norm": 2.942351871255615, "kl": 0.060546875, "learning_rate": 9.995417188680119e-07, "loss": 0.0024, "reward": 1.8509376049041748, "reward_std": 0.026690710335969925, "rewards/accuracy_reward": 0.7071875333786011, "rewards/format_reward": 1.0, "step": 1350 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 417.53125, "epoch": 0.013639575971731448, "grad_norm": 6.530380504867731, "kl": 0.064453125, "learning_rate": 9.9954103978547e-07, "loss": 0.0026, "reward": 1.8404062986373901, "reward_std": 0.018604429438710213, "rewards/accuracy_reward": 0.6904062628746033, "rewards/format_reward": 1.0, "step": 1351 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.21875, "epoch": 0.01364967188288743, "grad_norm": 1.905840978406955, "kl": 0.0654296875, "learning_rate": 9.995403602003978e-07, "loss": 0.0026, "reward": 2.1116249561309814, "reward_std": 0.03204455226659775, "rewards/accuracy_reward": 0.9178750514984131, "rewards/format_reward": 1.0, "step": 1352 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.9375, "epoch": 0.013659767794043412, "grad_norm": 1.9938277502082327, "kl": 0.0703125, "learning_rate": 9.995396801127963e-07, "loss": 0.0028, "reward": 2.119093894958496, "reward_std": 0.039861202239990234, "rewards/accuracy_reward": 0.9315937757492065, "rewards/format_reward": 1.0, "step": 1353 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.03125, "epoch": 0.013669863705199395, "grad_norm": 5.45966372841722, "kl": 0.0615234375, "learning_rate": 9.995389995226664e-07, "loss": 0.0025, "reward": 2.110875129699707, "reward_std": 0.02091435343027115, "rewards/accuracy_reward": 0.9108749628067017, "rewards/format_reward": 1.0, "step": 1354 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.013679959616355377, "grad_norm": 1.418516929390674, "kl": 0.050537109375, "learning_rate": 9.995383184300083e-07, "loss": 0.002, "reward": 2.104750156402588, "reward_std": 0.15655069053173065, "rewards/accuracy_reward": 0.9235000014305115, "rewards/format_reward": 1.0, "step": 1355 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.28125, "epoch": 0.013690055527511357, "grad_norm": 1.7600829572665468, "kl": 0.05126953125, "learning_rate": 9.995376368348232e-07, "loss": 0.002, "reward": 2.1408748626708984, "reward_std": 0.028846224769949913, "rewards/accuracy_reward": 0.9471250176429749, "rewards/format_reward": 1.0, "step": 1356 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.78125, "epoch": 0.01370015143866734, "grad_norm": 2.0697081637695818, "kl": 0.060546875, "learning_rate": 9.995369547371115e-07, "loss": 0.0024, "reward": 1.7988125085830688, "reward_std": 0.020362956449389458, "rewards/accuracy_reward": 0.6488125324249268, "rewards/format_reward": 1.0, "step": 1357 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.78125, "epoch": 0.013710247349823321, "grad_norm": 2.153312442251815, "kl": 0.06591796875, "learning_rate": 9.995362721368742e-07, "loss": 0.0026, "reward": 1.9628437757492065, "reward_std": 0.013093446381390095, "rewards/accuracy_reward": 0.7628437280654907, "rewards/format_reward": 1.0, "step": 1358 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 416.125, "epoch": 0.013720343260979304, "grad_norm": 9.471845879541213, "kl": 0.0673828125, "learning_rate": 9.995355890341114e-07, "loss": 0.0027, "reward": 1.765625, "reward_std": 0.021077603101730347, "rewards/accuracy_reward": 0.6156250238418579, "rewards/format_reward": 1.0, "step": 1359 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.125, "epoch": 0.013730439172135286, "grad_norm": 1.5638581437969032, "kl": 0.04931640625, "learning_rate": 9.995349054288242e-07, "loss": 0.002, "reward": 2.1296563148498535, "reward_std": 0.004615576937794685, "rewards/accuracy_reward": 0.9296562671661377, "rewards/format_reward": 1.0, "step": 1360 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.8125, "epoch": 0.013740535083291268, "grad_norm": 3.058989733651704, "kl": 0.072265625, "learning_rate": 9.995342213210133e-07, "loss": 0.0029, "reward": 2.0526561737060547, "reward_std": 0.009518875740468502, "rewards/accuracy_reward": 0.8526562452316284, "rewards/format_reward": 1.0, "step": 1361 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.71875, "epoch": 0.013750630994447248, "grad_norm": 2.281953772163749, "kl": 0.0615234375, "learning_rate": 9.995335367106793e-07, "loss": 0.0025, "reward": 2.1592812538146973, "reward_std": 0.00875149480998516, "rewards/accuracy_reward": 0.9592812657356262, "rewards/format_reward": 1.0, "step": 1362 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 408.46875, "epoch": 0.01376072690560323, "grad_norm": 1.3447514334721198, "kl": 0.05810546875, "learning_rate": 9.995328515978227e-07, "loss": 0.0023, "reward": 1.8902499675750732, "reward_std": 0.0036030716728419065, "rewards/accuracy_reward": 0.7402499914169312, "rewards/format_reward": 1.0, "step": 1363 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.46875, "epoch": 0.013770822816759213, "grad_norm": 2.339693172886897, "kl": 0.0576171875, "learning_rate": 9.995321659824445e-07, "loss": 0.0023, "reward": 1.9390623569488525, "reward_std": 0.149374321103096, "rewards/accuracy_reward": 0.7765624523162842, "rewards/format_reward": 1.0, "step": 1364 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.013780918727915195, "grad_norm": 2.0223773727611274, "kl": 0.0673828125, "learning_rate": 9.995314798645454e-07, "loss": 0.0027, "reward": 1.796968698501587, "reward_std": 0.09522370249032974, "rewards/accuracy_reward": 0.6469687223434448, "rewards/format_reward": 1.0, "step": 1365 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.28125, "epoch": 0.013791014639071177, "grad_norm": 1.4383191617117586, "kl": 0.054443359375, "learning_rate": 9.995307932441256e-07, "loss": 0.0022, "reward": 2.158031463623047, "reward_std": 0.007827048189938068, "rewards/accuracy_reward": 0.9580312371253967, "rewards/format_reward": 1.0, "step": 1366 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 412.5625, "epoch": 0.013801110550227157, "grad_norm": 1.5882562224413985, "kl": 0.055419921875, "learning_rate": 9.995301061211866e-07, "loss": 0.0022, "reward": 1.473156213760376, "reward_std": 0.010786047205328941, "rewards/accuracy_reward": 0.37315624952316284, "rewards/format_reward": 1.0, "step": 1367 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.46875, "epoch": 0.01381120646138314, "grad_norm": 2.1738824732787667, "kl": 0.06298828125, "learning_rate": 9.995294184957283e-07, "loss": 0.0025, "reward": 2.095531463623047, "reward_std": 0.022937538102269173, "rewards/accuracy_reward": 0.8955312967300415, "rewards/format_reward": 1.0, "step": 1368 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.875, "epoch": 0.013821302372539122, "grad_norm": 1.3259308455628604, "kl": 0.058837890625, "learning_rate": 9.99528730367752e-07, "loss": 0.0024, "reward": 2.189406394958496, "reward_std": 0.0039049747865647078, "rewards/accuracy_reward": 0.9894062280654907, "rewards/format_reward": 1.0, "step": 1369 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.013831398283695104, "grad_norm": 2.3662415989146415, "kl": 0.0712890625, "learning_rate": 9.99528041737258e-07, "loss": 0.0029, "reward": 2.1072187423706055, "reward_std": 0.021365713328123093, "rewards/accuracy_reward": 0.9072187542915344, "rewards/format_reward": 1.0, "step": 1370 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 411.5625, "epoch": 0.013841494194851086, "grad_norm": 1.4280754920705352, "kl": 0.048095703125, "learning_rate": 9.995273526042469e-07, "loss": 0.0019, "reward": 1.565999984741211, "reward_std": 0.005203029606491327, "rewards/accuracy_reward": 0.4659999907016754, "rewards/format_reward": 1.0, "step": 1371 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.5625, "epoch": 0.013851590106007066, "grad_norm": 2.253184145322214, "kl": 0.052001953125, "learning_rate": 9.995266629687199e-07, "loss": 0.0021, "reward": 1.8474375009536743, "reward_std": 0.1196213811635971, "rewards/accuracy_reward": 0.7036874890327454, "rewards/format_reward": 1.0, "step": 1372 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.4375, "epoch": 0.013861686017163049, "grad_norm": 2.2319634757659186, "kl": 0.064453125, "learning_rate": 9.995259728306773e-07, "loss": 0.0026, "reward": 2.1043124198913574, "reward_std": 0.018397368490695953, "rewards/accuracy_reward": 0.9043124914169312, "rewards/format_reward": 1.0, "step": 1373 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.01387178192831903, "grad_norm": 1.7463708284309238, "kl": 0.059326171875, "learning_rate": 9.995252821901199e-07, "loss": 0.0024, "reward": 2.1432814598083496, "reward_std": 0.008465337567031384, "rewards/accuracy_reward": 0.9432812929153442, "rewards/format_reward": 1.0, "step": 1374 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 417.03125, "epoch": 0.013881877839475013, "grad_norm": 3.4620562743208576, "kl": 0.0654296875, "learning_rate": 9.995245910470483e-07, "loss": 0.0026, "reward": 1.7989375591278076, "reward_std": 0.032212331891059875, "rewards/accuracy_reward": 0.648937463760376, "rewards/format_reward": 1.0, "step": 1375 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.34375, "epoch": 0.013891973750630995, "grad_norm": 1.8427729666982222, "kl": 0.055419921875, "learning_rate": 9.995238994014634e-07, "loss": 0.0022, "reward": 1.5824999809265137, "reward_std": 0.10675019025802612, "rewards/accuracy_reward": 0.48250001668930054, "rewards/format_reward": 1.0, "step": 1376 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.875, "epoch": 0.013902069661786975, "grad_norm": 1.864244594575504, "kl": 0.062255859375, "learning_rate": 9.99523207253366e-07, "loss": 0.0025, "reward": 2.153031349182129, "reward_std": 0.028932834044098854, "rewards/accuracy_reward": 0.9530313014984131, "rewards/format_reward": 1.0, "step": 1377 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 422.1875, "epoch": 0.013912165572942958, "grad_norm": 2.063075319665281, "kl": 0.0625, "learning_rate": 9.995225146027561e-07, "loss": 0.0025, "reward": 1.7740623950958252, "reward_std": 0.023695586249232292, "rewards/accuracy_reward": 0.6240625381469727, "rewards/format_reward": 1.0, "step": 1378 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 428.71875, "epoch": 0.01392226148409894, "grad_norm": 1.6239559121329665, "kl": 0.053955078125, "learning_rate": 9.995218214496351e-07, "loss": 0.0022, "reward": 1.764625072479248, "reward_std": 0.006815388333052397, "rewards/accuracy_reward": 0.6146249771118164, "rewards/format_reward": 1.0, "step": 1379 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.5, "epoch": 0.013932357395254922, "grad_norm": 2.0289984319106726, "kl": 0.056640625, "learning_rate": 9.995211277940034e-07, "loss": 0.0023, "reward": 1.9963749647140503, "reward_std": 0.1667252629995346, "rewards/accuracy_reward": 0.827625036239624, "rewards/format_reward": 1.0, "step": 1380 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.25, "epoch": 0.013942453306410904, "grad_norm": 2.317365142752682, "kl": 0.06494140625, "learning_rate": 9.99520433635862e-07, "loss": 0.0026, "reward": 1.9417500495910645, "reward_std": 0.02664189599454403, "rewards/accuracy_reward": 0.7417500615119934, "rewards/format_reward": 1.0, "step": 1381 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.1875, "epoch": 0.013952549217566886, "grad_norm": 2.2250528779963052, "kl": 0.060302734375, "learning_rate": 9.995197389752112e-07, "loss": 0.0024, "reward": 2.091031312942505, "reward_std": 0.02113710343837738, "rewards/accuracy_reward": 0.8910312652587891, "rewards/format_reward": 1.0, "step": 1382 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.65625, "epoch": 0.013962645128722867, "grad_norm": 2.234489687617691, "kl": 0.054443359375, "learning_rate": 9.995190438120518e-07, "loss": 0.0022, "reward": 2.11146879196167, "reward_std": 0.023635348305106163, "rewards/accuracy_reward": 0.917718768119812, "rewards/format_reward": 1.0, "step": 1383 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 435.28125, "epoch": 0.013972741039878849, "grad_norm": 4.032315032011662, "kl": 0.0693359375, "learning_rate": 9.995183481463846e-07, "loss": 0.0028, "reward": 2.027719020843506, "reward_std": 0.01574305072426796, "rewards/accuracy_reward": 0.8277187347412109, "rewards/format_reward": 1.0, "step": 1384 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.53125, "epoch": 0.013982836951034831, "grad_norm": 2.49446741587907, "kl": 0.056884765625, "learning_rate": 9.995176519782105e-07, "loss": 0.0023, "reward": 1.773937463760376, "reward_std": 0.024576958268880844, "rewards/accuracy_reward": 0.6239374876022339, "rewards/format_reward": 1.0, "step": 1385 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.8125, "epoch": 0.013992932862190813, "grad_norm": 2.383180741970792, "kl": 0.068359375, "learning_rate": 9.995169553075298e-07, "loss": 0.0027, "reward": 2.0235748291015625, "reward_std": 0.08641839027404785, "rewards/accuracy_reward": 0.823574960231781, "rewards/format_reward": 1.0, "step": 1386 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 426.5625, "epoch": 0.014003028773346795, "grad_norm": 2.4945918618872023, "kl": 0.06689453125, "learning_rate": 9.995162581343433e-07, "loss": 0.0027, "reward": 2.121812582015991, "reward_std": 0.017744995653629303, "rewards/accuracy_reward": 0.9218124747276306, "rewards/format_reward": 1.0, "step": 1387 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.0625, "epoch": 0.014013124684502776, "grad_norm": 1.7413244605255889, "kl": 0.06298828125, "learning_rate": 9.995155604586519e-07, "loss": 0.0025, "reward": 2.133718729019165, "reward_std": 0.028590453788638115, "rewards/accuracy_reward": 0.9399687051773071, "rewards/format_reward": 1.0, "step": 1388 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 436.6875, "epoch": 0.014023220595658758, "grad_norm": 1.1320721356145378, "kl": 0.052978515625, "learning_rate": 9.99514862280456e-07, "loss": 0.0021, "reward": 1.4360156059265137, "reward_std": 0.009851661510765553, "rewards/accuracy_reward": 0.33601564168930054, "rewards/format_reward": 1.0, "step": 1389 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.78125, "epoch": 0.01403331650681474, "grad_norm": 1.3246975641653014, "kl": 0.053466796875, "learning_rate": 9.995141635997567e-07, "loss": 0.0021, "reward": 1.8822500705718994, "reward_std": 0.022295210510492325, "rewards/accuracy_reward": 0.7384999990463257, "rewards/format_reward": 1.0, "step": 1390 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.71875, "epoch": 0.014043412417970722, "grad_norm": 2.1097183729151827, "kl": 0.064453125, "learning_rate": 9.995134644165543e-07, "loss": 0.0026, "reward": 2.0648281574249268, "reward_std": 0.05622754991054535, "rewards/accuracy_reward": 0.8835780620574951, "rewards/format_reward": 1.0, "step": 1391 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 426.8125, "epoch": 0.014053508329126704, "grad_norm": 1.3633795710985457, "kl": 0.057373046875, "learning_rate": 9.995127647308497e-07, "loss": 0.0023, "reward": 1.5404062271118164, "reward_std": 0.02409602701663971, "rewards/accuracy_reward": 0.4466562271118164, "rewards/format_reward": 1.0, "step": 1392 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.875, "epoch": 0.014063604240282685, "grad_norm": 2.774873071099857, "kl": 0.0693359375, "learning_rate": 9.995120645426437e-07, "loss": 0.0028, "reward": 2.0864062309265137, "reward_std": 0.01608436182141304, "rewards/accuracy_reward": 0.8864063024520874, "rewards/format_reward": 1.0, "step": 1393 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.4375, "epoch": 0.014073700151438667, "grad_norm": 2.062389627910603, "kl": 0.05615234375, "learning_rate": 9.995113638519368e-07, "loss": 0.0023, "reward": 1.7429687976837158, "reward_std": 0.03688301891088486, "rewards/accuracy_reward": 0.5992187261581421, "rewards/format_reward": 1.0, "step": 1394 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.09375, "epoch": 0.014083796062594649, "grad_norm": 2.5721932116308093, "kl": 0.06591796875, "learning_rate": 9.995106626587297e-07, "loss": 0.0026, "reward": 2.073312520980835, "reward_std": 0.020025525242090225, "rewards/accuracy_reward": 0.8733124732971191, "rewards/format_reward": 1.0, "step": 1395 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.75, "epoch": 0.014093891973750631, "grad_norm": 1.6380305296481994, "kl": 0.05615234375, "learning_rate": 9.995099609630234e-07, "loss": 0.0022, "reward": 1.8510937690734863, "reward_std": 0.09646789729595184, "rewards/accuracy_reward": 0.7010937929153442, "rewards/format_reward": 1.0, "step": 1396 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.09375, "epoch": 0.014103987884906613, "grad_norm": 2.1774107417110944, "kl": 0.0576171875, "learning_rate": 9.995092587648183e-07, "loss": 0.0023, "reward": 2.0868124961853027, "reward_std": 0.04521206393837929, "rewards/accuracy_reward": 0.8930625915527344, "rewards/format_reward": 1.0, "step": 1397 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.65625, "epoch": 0.014114083796062594, "grad_norm": 2.0596917204258074, "kl": 0.06982421875, "learning_rate": 9.995085560641151e-07, "loss": 0.0028, "reward": 1.7785625457763672, "reward_std": 0.044484492391347885, "rewards/accuracy_reward": 0.6285625100135803, "rewards/format_reward": 1.0, "step": 1398 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.75, "epoch": 0.014124179707218576, "grad_norm": 2.043458014896638, "kl": 0.0703125, "learning_rate": 9.995078528609148e-07, "loss": 0.0028, "reward": 1.8212499618530273, "reward_std": 0.013459824956953526, "rewards/accuracy_reward": 0.67125004529953, "rewards/format_reward": 1.0, "step": 1399 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.875, "epoch": 0.014134275618374558, "grad_norm": 3.0401744679131353, "kl": 0.06640625, "learning_rate": 9.995071491552179e-07, "loss": 0.0026, "reward": 2.000781297683716, "reward_std": 0.02972412109375, "rewards/accuracy_reward": 0.80078125, "rewards/format_reward": 1.0, "step": 1400 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.59375, "epoch": 0.01414437152953054, "grad_norm": 3.193253255551408, "kl": 0.0712890625, "learning_rate": 9.99506444947025e-07, "loss": 0.0029, "reward": 2.0355000495910645, "reward_std": 0.0429573692381382, "rewards/accuracy_reward": 0.8355000019073486, "rewards/format_reward": 1.0, "step": 1401 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.625, "epoch": 0.014154467440686522, "grad_norm": 1.7701875018583428, "kl": 0.0439453125, "learning_rate": 9.99505740236337e-07, "loss": 0.0018, "reward": 1.7954063415527344, "reward_std": 0.15619760751724243, "rewards/accuracy_reward": 0.6579062938690186, "rewards/format_reward": 1.0, "step": 1402 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.5625, "epoch": 0.014164563351842505, "grad_norm": 2.8686544237747857, "kl": 0.05810546875, "learning_rate": 9.995050350231546e-07, "loss": 0.0023, "reward": 1.909468650817871, "reward_std": 0.1599748283624649, "rewards/accuracy_reward": 0.7469687461853027, "rewards/format_reward": 1.0, "step": 1403 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 392.6875, "epoch": 0.014174659262998485, "grad_norm": 2.6680998952087505, "kl": 0.06396484375, "learning_rate": 9.995043293074785e-07, "loss": 0.0026, "reward": 1.7681562900543213, "reward_std": 0.11966650187969208, "rewards/accuracy_reward": 0.6306562423706055, "rewards/format_reward": 1.0, "step": 1404 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.59375, "epoch": 0.014184755174154467, "grad_norm": 2.6100914431906825, "kl": 0.0654296875, "learning_rate": 9.995036230893093e-07, "loss": 0.0026, "reward": 2.020718574523926, "reward_std": 0.027115482836961746, "rewards/accuracy_reward": 0.8207187652587891, "rewards/format_reward": 1.0, "step": 1405 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.625, "epoch": 0.01419485108531045, "grad_norm": 1.1230846204930927, "kl": 0.04541015625, "learning_rate": 9.995029163686477e-07, "loss": 0.0018, "reward": 1.8797500133514404, "reward_std": 0.017677664756774902, "rewards/accuracy_reward": 0.7360000610351562, "rewards/format_reward": 1.0, "step": 1406 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.90625, "epoch": 0.014204946996466431, "grad_norm": 2.6577710930632223, "kl": 0.064453125, "learning_rate": 9.995022091454946e-07, "loss": 0.0026, "reward": 2.096656322479248, "reward_std": 0.02105616219341755, "rewards/accuracy_reward": 0.8966562151908875, "rewards/format_reward": 1.0, "step": 1407 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.78125, "epoch": 0.014215042907622414, "grad_norm": 2.2789284438110045, "kl": 0.050537109375, "learning_rate": 9.995015014198507e-07, "loss": 0.002, "reward": 1.9709843397140503, "reward_std": 0.1716291904449463, "rewards/accuracy_reward": 0.7959843873977661, "rewards/format_reward": 1.0, "step": 1408 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.8125, "epoch": 0.014225138818778394, "grad_norm": 2.2937400686491536, "kl": 0.060546875, "learning_rate": 9.995007931917165e-07, "loss": 0.0024, "reward": 2.119874954223633, "reward_std": 0.043077193200588226, "rewards/accuracy_reward": 0.9261249899864197, "rewards/format_reward": 1.0, "step": 1409 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.1875, "epoch": 0.014235234729934376, "grad_norm": 2.603605571157805, "kl": 0.062255859375, "learning_rate": 9.995000844610928e-07, "loss": 0.0025, "reward": 2.165750026702881, "reward_std": 0.033014580607414246, "rewards/accuracy_reward": 0.972000002861023, "rewards/format_reward": 1.0, "step": 1410 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.28125, "epoch": 0.014245330641090358, "grad_norm": 3.5213623988867506, "kl": 0.06298828125, "learning_rate": 9.994993752279803e-07, "loss": 0.0025, "reward": 2.0867815017700195, "reward_std": 0.012738799676299095, "rewards/accuracy_reward": 0.8867812156677246, "rewards/format_reward": 1.0, "step": 1411 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.3125, "epoch": 0.01425542655224634, "grad_norm": 2.1348046694394047, "kl": 0.056640625, "learning_rate": 9.994986654923799e-07, "loss": 0.0023, "reward": 1.8180937767028809, "reward_std": 0.022855423390865326, "rewards/accuracy_reward": 0.668093740940094, "rewards/format_reward": 1.0, "step": 1412 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.5, "epoch": 0.014265522463402323, "grad_norm": 3.51023772158857, "kl": 0.06787109375, "learning_rate": 9.99497955254292e-07, "loss": 0.0027, "reward": 2.031343936920166, "reward_std": 0.01822936348617077, "rewards/accuracy_reward": 0.8313437104225159, "rewards/format_reward": 1.0, "step": 1413 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.8125, "epoch": 0.014275618374558303, "grad_norm": 5.127273115434666, "kl": 0.06591796875, "learning_rate": 9.994972445137175e-07, "loss": 0.0026, "reward": 2.1360936164855957, "reward_std": 0.03541477397084236, "rewards/accuracy_reward": 0.9360937476158142, "rewards/format_reward": 1.0, "step": 1414 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.125, "epoch": 0.014285714285714285, "grad_norm": 1.8576631807669388, "kl": 0.059326171875, "learning_rate": 9.994965332706572e-07, "loss": 0.0024, "reward": 1.9086875915527344, "reward_std": 0.18016085028648376, "rewards/accuracy_reward": 0.7461874485015869, "rewards/format_reward": 1.0, "step": 1415 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.90625, "epoch": 0.014295810196870267, "grad_norm": 1.799990428282584, "kl": 0.052734375, "learning_rate": 9.994958215251116e-07, "loss": 0.0021, "reward": 1.8199374675750732, "reward_std": 0.10879360884428024, "rewards/accuracy_reward": 0.7074374556541443, "rewards/format_reward": 0.96875, "step": 1416 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 388.75, "epoch": 0.01430590610802625, "grad_norm": 2.101412231856996, "kl": 0.0517578125, "learning_rate": 9.99495109277082e-07, "loss": 0.0021, "reward": 1.8207812309265137, "reward_std": 0.03874092921614647, "rewards/accuracy_reward": 0.6832812428474426, "rewards/format_reward": 1.0, "step": 1417 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 0.014316002019182232, "grad_norm": 2.115048087015773, "kl": 0.06884765625, "learning_rate": 9.994943965265681e-07, "loss": 0.0027, "reward": 2.115499973297119, "reward_std": 0.0318077877163887, "rewards/accuracy_reward": 0.9217500686645508, "rewards/format_reward": 1.0, "step": 1418 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.59375, "epoch": 0.014326097930338212, "grad_norm": 2.223892325971185, "kl": 0.0634765625, "learning_rate": 9.994936832735713e-07, "loss": 0.0025, "reward": 1.7771563529968262, "reward_std": 0.01233220100402832, "rewards/accuracy_reward": 0.6271562576293945, "rewards/format_reward": 1.0, "step": 1419 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.46875, "epoch": 0.014336193841494194, "grad_norm": 1.340186052141532, "kl": 0.05615234375, "learning_rate": 9.99492969518092e-07, "loss": 0.0022, "reward": 1.8611563444137573, "reward_std": 0.09153912961483002, "rewards/accuracy_reward": 0.7111562490463257, "rewards/format_reward": 1.0, "step": 1420 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.59375, "epoch": 0.014346289752650176, "grad_norm": 5.925146153490348, "kl": 0.0537109375, "learning_rate": 9.994922552601315e-07, "loss": 0.0022, "reward": 1.875906229019165, "reward_std": 0.013414019718766212, "rewards/accuracy_reward": 0.725906252861023, "rewards/format_reward": 1.0, "step": 1421 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.75, "epoch": 0.014356385663806159, "grad_norm": 2.56421830374086, "kl": 0.054443359375, "learning_rate": 9.994915404996897e-07, "loss": 0.0022, "reward": 1.6241250038146973, "reward_std": 0.26332998275756836, "rewards/accuracy_reward": 0.5053750276565552, "rewards/format_reward": 1.0, "step": 1422 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 401.03125, "epoch": 0.01436648157496214, "grad_norm": 3.410977704116192, "kl": 0.06640625, "learning_rate": 9.99490825236768e-07, "loss": 0.0027, "reward": 1.8049376010894775, "reward_std": 0.03017750009894371, "rewards/accuracy_reward": 0.661187469959259, "rewards/format_reward": 1.0, "step": 1423 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.9375, "epoch": 0.014376577486118123, "grad_norm": 4.4368051682008405, "kl": 0.046875, "learning_rate": 9.994901094713668e-07, "loss": 0.0019, "reward": 1.791656255722046, "reward_std": 0.02129550278186798, "rewards/accuracy_reward": 0.6416562795639038, "rewards/format_reward": 1.0, "step": 1424 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.03125, "epoch": 0.014386673397274103, "grad_norm": 2.340453151359687, "kl": 0.064453125, "learning_rate": 9.994893932034868e-07, "loss": 0.0026, "reward": 2.044656276702881, "reward_std": 0.02062463015317917, "rewards/accuracy_reward": 0.8446562886238098, "rewards/format_reward": 1.0, "step": 1425 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.5625, "epoch": 0.014396769308430086, "grad_norm": 1.8484625143113131, "kl": 0.056640625, "learning_rate": 9.99488676433129e-07, "loss": 0.0023, "reward": 2.0775938034057617, "reward_std": 0.01772996038198471, "rewards/accuracy_reward": 0.8775937557220459, "rewards/format_reward": 1.0, "step": 1426 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.25, "epoch": 0.014406865219586068, "grad_norm": 2.154779848441779, "kl": 0.056640625, "learning_rate": 9.994879591602938e-07, "loss": 0.0023, "reward": 1.7130937576293945, "reward_std": 0.018558356910943985, "rewards/accuracy_reward": 0.5630937814712524, "rewards/format_reward": 1.0, "step": 1427 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 428.21875, "epoch": 0.01441696113074205, "grad_norm": 1.8086396908073465, "kl": 0.052978515625, "learning_rate": 9.994872413849819e-07, "loss": 0.0021, "reward": 1.5618749856948853, "reward_std": 0.007500604260712862, "rewards/accuracy_reward": 0.4618750214576721, "rewards/format_reward": 1.0, "step": 1428 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.014427057041898032, "grad_norm": 4.367344122624826, "kl": 0.06201171875, "learning_rate": 9.994865231071944e-07, "loss": 0.0025, "reward": 2.0490000247955322, "reward_std": 0.018298976123332977, "rewards/accuracy_reward": 0.8489999771118164, "rewards/format_reward": 1.0, "step": 1429 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 413.78125, "epoch": 0.014437152953054012, "grad_norm": 2.165943449950652, "kl": 0.0625, "learning_rate": 9.994858043269317e-07, "loss": 0.0025, "reward": 1.8498437404632568, "reward_std": 0.0100832749158144, "rewards/accuracy_reward": 0.6998437643051147, "rewards/format_reward": 1.0, "step": 1430 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.5625, "epoch": 0.014447248864209995, "grad_norm": 1.6410068001427205, "kl": 0.05029296875, "learning_rate": 9.994850850441945e-07, "loss": 0.002, "reward": 1.8736562728881836, "reward_std": 0.011438734829425812, "rewards/accuracy_reward": 0.7236562371253967, "rewards/format_reward": 1.0, "step": 1431 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.28125, "epoch": 0.014457344775365977, "grad_norm": 5.34174565110252, "kl": 0.051513671875, "learning_rate": 9.994843652589838e-07, "loss": 0.0021, "reward": 2.0415000915527344, "reward_std": 0.024158447980880737, "rewards/accuracy_reward": 0.8415000438690186, "rewards/format_reward": 1.0, "step": 1432 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.28125, "epoch": 0.014467440686521959, "grad_norm": 1.7113251896839055, "kl": 0.05615234375, "learning_rate": 9.994836449713e-07, "loss": 0.0022, "reward": 1.823562502861023, "reward_std": 0.00776481069624424, "rewards/accuracy_reward": 0.6735625267028809, "rewards/format_reward": 1.0, "step": 1433 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.1875, "epoch": 0.014477536597677941, "grad_norm": 2.1063898218513275, "kl": 0.062255859375, "learning_rate": 9.99482924181144e-07, "loss": 0.0025, "reward": 2.1596875190734863, "reward_std": 0.02037491276860237, "rewards/accuracy_reward": 0.9596875309944153, "rewards/format_reward": 1.0, "step": 1434 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.53125, "epoch": 0.014487632508833921, "grad_norm": 3.0698857870265464, "kl": 0.0654296875, "learning_rate": 9.994822028885165e-07, "loss": 0.0026, "reward": 2.0082406997680664, "reward_std": 0.02489057555794716, "rewards/accuracy_reward": 0.8082406520843506, "rewards/format_reward": 1.0, "step": 1435 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.75, "epoch": 0.014497728419989904, "grad_norm": 3.1219721521876562, "kl": 0.060791015625, "learning_rate": 9.994814810934182e-07, "loss": 0.0024, "reward": 2.1368749141693115, "reward_std": 0.009235390461981297, "rewards/accuracy_reward": 0.9368749856948853, "rewards/format_reward": 1.0, "step": 1436 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.014507824331145886, "grad_norm": 2.2805572713759905, "kl": 0.06396484375, "learning_rate": 9.994807587958501e-07, "loss": 0.0026, "reward": 1.9931249618530273, "reward_std": 0.015214603394269943, "rewards/accuracy_reward": 0.7931249141693115, "rewards/format_reward": 1.0, "step": 1437 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 407.0, "epoch": 0.014517920242301868, "grad_norm": 1.914902888559126, "kl": 0.0576171875, "learning_rate": 9.994800359958124e-07, "loss": 0.0023, "reward": 1.7394375801086426, "reward_std": 0.015144936740398407, "rewards/accuracy_reward": 0.5894374847412109, "rewards/format_reward": 1.0, "step": 1438 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 419.34375, "epoch": 0.01452801615345785, "grad_norm": 1.9793158898910321, "kl": 0.053955078125, "learning_rate": 9.994793126933063e-07, "loss": 0.0022, "reward": 1.8364063501358032, "reward_std": 0.00885006133466959, "rewards/accuracy_reward": 0.6864063143730164, "rewards/format_reward": 1.0, "step": 1439 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.40625, "epoch": 0.014538112064613832, "grad_norm": 3.4569239051753295, "kl": 0.051025390625, "learning_rate": 9.994785888883321e-07, "loss": 0.002, "reward": 1.8862500190734863, "reward_std": 0.02413082681596279, "rewards/accuracy_reward": 0.7425000071525574, "rewards/format_reward": 1.0, "step": 1440 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.25, "epoch": 0.014548207975769813, "grad_norm": 1.8120799326869197, "kl": 0.060302734375, "learning_rate": 9.99477864580891e-07, "loss": 0.0024, "reward": 2.1495938301086426, "reward_std": 0.012824494391679764, "rewards/accuracy_reward": 0.9495937824249268, "rewards/format_reward": 1.0, "step": 1441 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.0, "epoch": 0.014558303886925795, "grad_norm": 2.1845824610857063, "kl": 0.06640625, "learning_rate": 9.994771397709833e-07, "loss": 0.0026, "reward": 2.1044063568115234, "reward_std": 0.03092525154352188, "rewards/accuracy_reward": 0.910656213760376, "rewards/format_reward": 1.0, "step": 1442 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.25, "epoch": 0.014568399798081777, "grad_norm": 2.7767775493963236, "kl": 0.06103515625, "learning_rate": 9.9947641445861e-07, "loss": 0.0024, "reward": 2.1674375534057617, "reward_std": 0.012727483175694942, "rewards/accuracy_reward": 0.9674375057220459, "rewards/format_reward": 1.0, "step": 1443 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.625, "epoch": 0.01457849570923776, "grad_norm": 1.6816316247844278, "kl": 0.050048828125, "learning_rate": 9.994756886437718e-07, "loss": 0.002, "reward": 1.8730311393737793, "reward_std": 0.012953732162714005, "rewards/accuracy_reward": 0.7230312824249268, "rewards/format_reward": 1.0, "step": 1444 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.96875, "epoch": 0.014588591620393741, "grad_norm": 5.971901555970945, "kl": 0.06494140625, "learning_rate": 9.994749623264693e-07, "loss": 0.0026, "reward": 2.1446876525878906, "reward_std": 0.0371885672211647, "rewards/accuracy_reward": 0.9446874856948853, "rewards/format_reward": 1.0, "step": 1445 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.34375, "epoch": 0.014598687531549722, "grad_norm": 1.8168098377345616, "kl": 0.04833984375, "learning_rate": 9.994742355067033e-07, "loss": 0.0019, "reward": 2.134000062942505, "reward_std": 0.02735806629061699, "rewards/accuracy_reward": 0.940250039100647, "rewards/format_reward": 1.0, "step": 1446 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.84375, "epoch": 0.014608783442705704, "grad_norm": 1.583739909451974, "kl": 0.0576171875, "learning_rate": 9.994735081844748e-07, "loss": 0.0023, "reward": 1.853156328201294, "reward_std": 0.09342735260725021, "rewards/accuracy_reward": 0.7031562924385071, "rewards/format_reward": 1.0, "step": 1447 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.78125, "epoch": 0.014618879353861686, "grad_norm": 1.1554469003355263, "kl": 0.05419921875, "learning_rate": 9.99472780359784e-07, "loss": 0.0022, "reward": 2.1410000324249268, "reward_std": 0.01972128078341484, "rewards/accuracy_reward": 0.9472500085830688, "rewards/format_reward": 1.0, "step": 1448 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.875, "epoch": 0.014628975265017668, "grad_norm": 1.2756127010559326, "kl": 0.05126953125, "learning_rate": 9.994720520326321e-07, "loss": 0.002, "reward": 2.1979689598083496, "reward_std": 0.0017135771922767162, "rewards/accuracy_reward": 0.9979687929153442, "rewards/format_reward": 1.0, "step": 1449 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.40625, "epoch": 0.01463907117617365, "grad_norm": 3.660704232589731, "kl": 0.062255859375, "learning_rate": 9.994713232030194e-07, "loss": 0.0025, "reward": 1.7718749046325684, "reward_std": 0.022992128506302834, "rewards/accuracy_reward": 0.6218750476837158, "rewards/format_reward": 1.0, "step": 1450 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.0625, "epoch": 0.01464916708732963, "grad_norm": 2.5902437914288927, "kl": 0.05859375, "learning_rate": 9.99470593870947e-07, "loss": 0.0023, "reward": 2.1167500019073486, "reward_std": 0.040043048560619354, "rewards/accuracy_reward": 0.9292500615119934, "rewards/format_reward": 1.0, "step": 1451 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.09375, "epoch": 0.014659262998485613, "grad_norm": 1.5421392267194969, "kl": 0.0537109375, "learning_rate": 9.994698640364157e-07, "loss": 0.0022, "reward": 1.8889062404632568, "reward_std": 0.009711762890219688, "rewards/accuracy_reward": 0.7389062643051147, "rewards/format_reward": 1.0, "step": 1452 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.28125, "epoch": 0.014669358909641595, "grad_norm": 1.1175804101976874, "kl": 0.048828125, "learning_rate": 9.994691336994257e-07, "loss": 0.002, "reward": 2.14243745803833, "reward_std": 0.007768685929477215, "rewards/accuracy_reward": 0.942437469959259, "rewards/format_reward": 1.0, "step": 1453 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 414.03125, "epoch": 0.014679454820797577, "grad_norm": 1.9291041061935086, "kl": 0.04248046875, "learning_rate": 9.994684028599782e-07, "loss": 0.0017, "reward": 1.552125096321106, "reward_std": 0.01012414786964655, "rewards/accuracy_reward": 0.45212501287460327, "rewards/format_reward": 1.0, "step": 1454 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.01468955073195356, "grad_norm": 4.614598584496828, "kl": 0.06982421875, "learning_rate": 9.994676715180738e-07, "loss": 0.0028, "reward": 1.9294687509536743, "reward_std": 0.09750890731811523, "rewards/accuracy_reward": 0.7294687032699585, "rewards/format_reward": 1.0, "step": 1455 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.71875, "epoch": 0.01469964664310954, "grad_norm": 2.49366399504095, "kl": 0.06494140625, "learning_rate": 9.994669396737134e-07, "loss": 0.0026, "reward": 2.0622501373291016, "reward_std": 0.03464515507221222, "rewards/accuracy_reward": 0.8684999942779541, "rewards/format_reward": 1.0, "step": 1456 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.65625, "epoch": 0.014709742554265522, "grad_norm": 1.7404416295977059, "kl": 0.058837890625, "learning_rate": 9.994662073268974e-07, "loss": 0.0023, "reward": 1.8627499341964722, "reward_std": 0.03149328753352165, "rewards/accuracy_reward": 0.7377499938011169, "rewards/format_reward": 1.0, "step": 1457 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 438.375, "epoch": 0.014719838465421504, "grad_norm": 1.3599301917570683, "kl": 0.04736328125, "learning_rate": 9.994654744776269e-07, "loss": 0.0019, "reward": 1.3988125324249268, "reward_std": 0.15273453295230865, "rewards/accuracy_reward": 0.336312472820282, "rewards/format_reward": 1.0, "step": 1458 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 423.59375, "epoch": 0.014729934376577486, "grad_norm": 1.7608155528175449, "kl": 0.05859375, "learning_rate": 9.994647411259024e-07, "loss": 0.0023, "reward": 1.739937424659729, "reward_std": 0.1713673323392868, "rewards/accuracy_reward": 0.6211874485015869, "rewards/format_reward": 1.0, "step": 1459 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.71875, "epoch": 0.014740030287733468, "grad_norm": 1.7777295259942731, "kl": 0.05126953125, "learning_rate": 9.994640072717245e-07, "loss": 0.0021, "reward": 2.0250000953674316, "reward_std": 0.03380080685019493, "rewards/accuracy_reward": 0.8312499523162842, "rewards/format_reward": 1.0, "step": 1460 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.0, "epoch": 0.01475012619888945, "grad_norm": 5.29015109223982, "kl": 0.05712890625, "learning_rate": 9.994632729150943e-07, "loss": 0.0023, "reward": 2.1282498836517334, "reward_std": 0.029353372752666473, "rewards/accuracy_reward": 0.934499979019165, "rewards/format_reward": 1.0, "step": 1461 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.78125, "epoch": 0.014760222110045431, "grad_norm": 2.765645939047546, "kl": 0.06640625, "learning_rate": 9.994625380560124e-07, "loss": 0.0027, "reward": 2.160656452178955, "reward_std": 0.026388950645923615, "rewards/accuracy_reward": 0.9606562852859497, "rewards/format_reward": 1.0, "step": 1462 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 405.5625, "epoch": 0.014770318021201413, "grad_norm": 1.9079766133199383, "kl": 0.0595703125, "learning_rate": 9.994618026944794e-07, "loss": 0.0024, "reward": 1.7651562690734863, "reward_std": 0.03402358293533325, "rewards/accuracy_reward": 0.6214062571525574, "rewards/format_reward": 1.0, "step": 1463 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.4375, "epoch": 0.014780413932357395, "grad_norm": 2.365638819617325, "kl": 0.0673828125, "learning_rate": 9.994610668304963e-07, "loss": 0.0027, "reward": 2.0585312843322754, "reward_std": 0.02548539824783802, "rewards/accuracy_reward": 0.8585311770439148, "rewards/format_reward": 1.0, "step": 1464 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.5, "epoch": 0.014790509843513377, "grad_norm": 1.2517593269996934, "kl": 0.0556640625, "learning_rate": 9.994603304640635e-07, "loss": 0.0022, "reward": 1.8472187519073486, "reward_std": 0.006777629256248474, "rewards/accuracy_reward": 0.6972187757492065, "rewards/format_reward": 1.0, "step": 1465 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.15625, "epoch": 0.01480060575466936, "grad_norm": 3.76927392611303, "kl": 0.0654296875, "learning_rate": 9.99459593595182e-07, "loss": 0.0026, "reward": 2.139406204223633, "reward_std": 0.0371459499001503, "rewards/accuracy_reward": 0.9456562995910645, "rewards/format_reward": 1.0, "step": 1466 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.03125, "epoch": 0.01481070166582534, "grad_norm": 1.9015999552031795, "kl": 0.0625, "learning_rate": 9.994588562238526e-07, "loss": 0.0025, "reward": 1.8832812309265137, "reward_std": 0.09871089458465576, "rewards/accuracy_reward": 0.7332812547683716, "rewards/format_reward": 1.0, "step": 1467 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.78125, "epoch": 0.014820797576981322, "grad_norm": 3.390047538813667, "kl": 0.068359375, "learning_rate": 9.994581183500758e-07, "loss": 0.0027, "reward": 2.1229374408721924, "reward_std": 0.011635343544185162, "rewards/accuracy_reward": 0.9229373931884766, "rewards/format_reward": 1.0, "step": 1468 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 396.15625, "epoch": 0.014830893488137304, "grad_norm": 2.0212148387556117, "kl": 0.06005859375, "learning_rate": 9.994573799738524e-07, "loss": 0.0024, "reward": 1.793156385421753, "reward_std": 0.017164431512355804, "rewards/accuracy_reward": 0.6431562304496765, "rewards/format_reward": 1.0, "step": 1469 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.5, "epoch": 0.014840989399293287, "grad_norm": 1.8458136860248442, "kl": 0.06640625, "learning_rate": 9.994566410951833e-07, "loss": 0.0027, "reward": 2.167562484741211, "reward_std": 0.009972688741981983, "rewards/accuracy_reward": 0.9675624966621399, "rewards/format_reward": 1.0, "step": 1470 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.5, "epoch": 0.014851085310449269, "grad_norm": 2.6405425286575115, "kl": 0.0634765625, "learning_rate": 9.99455901714069e-07, "loss": 0.0025, "reward": 2.160750150680542, "reward_std": 0.020855072885751724, "rewards/accuracy_reward": 0.9607500433921814, "rewards/format_reward": 1.0, "step": 1471 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.34375, "epoch": 0.014861181221605249, "grad_norm": 2.302902394227226, "kl": 0.06103515625, "learning_rate": 9.994551618305107e-07, "loss": 0.0025, "reward": 1.9514999389648438, "reward_std": 0.19367387890815735, "rewards/accuracy_reward": 0.7889999747276306, "rewards/format_reward": 1.0, "step": 1472 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.8125, "epoch": 0.014871277132761231, "grad_norm": 5.510511439427771, "kl": 0.07373046875, "learning_rate": 9.994544214445086e-07, "loss": 0.003, "reward": 1.9929156303405762, "reward_std": 0.042401690036058426, "rewards/accuracy_reward": 0.8054155707359314, "rewards/format_reward": 1.0, "step": 1473 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.8125, "epoch": 0.014881373043917213, "grad_norm": 1.8249216300413307, "kl": 0.0537109375, "learning_rate": 9.994536805560638e-07, "loss": 0.0021, "reward": 1.7875001430511475, "reward_std": 0.14405354857444763, "rewards/accuracy_reward": 0.6437499523162842, "rewards/format_reward": 1.0, "step": 1474 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.5625, "epoch": 0.014891468955073196, "grad_norm": 6.97698448829634, "kl": 0.064453125, "learning_rate": 9.994529391651767e-07, "loss": 0.0026, "reward": 2.0729689598083496, "reward_std": 0.12004469335079193, "rewards/accuracy_reward": 0.8792187571525574, "rewards/format_reward": 1.0, "step": 1475 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.71875, "epoch": 0.014901564866229178, "grad_norm": 2.597092098602835, "kl": 0.06884765625, "learning_rate": 9.994521972718485e-07, "loss": 0.0028, "reward": 2.113781452178955, "reward_std": 0.013526367023587227, "rewards/accuracy_reward": 0.9137812256813049, "rewards/format_reward": 1.0, "step": 1476 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.014911660777385158, "grad_norm": 3.113521919360118, "kl": 0.0712890625, "learning_rate": 9.994514548760796e-07, "loss": 0.0029, "reward": 2.039468765258789, "reward_std": 0.04304702952504158, "rewards/accuracy_reward": 0.8457187414169312, "rewards/format_reward": 1.0, "step": 1477 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.71875, "epoch": 0.01492175668854114, "grad_norm": 1.7768287567504333, "kl": 0.0625, "learning_rate": 9.99450711977871e-07, "loss": 0.0025, "reward": 2.0947186946868896, "reward_std": 0.034409962594509125, "rewards/accuracy_reward": 0.9009687304496765, "rewards/format_reward": 1.0, "step": 1478 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.5625, "epoch": 0.014931852599697122, "grad_norm": 3.2658577079857416, "kl": 0.072265625, "learning_rate": 9.99449968577223e-07, "loss": 0.0029, "reward": 1.8767187595367432, "reward_std": 0.1868109256029129, "rewards/accuracy_reward": 0.701718807220459, "rewards/format_reward": 1.0, "step": 1479 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.25, "epoch": 0.014941948510853105, "grad_norm": 1.9021069557441612, "kl": 0.06591796875, "learning_rate": 9.994492246741368e-07, "loss": 0.0026, "reward": 2.145843982696533, "reward_std": 0.028748733922839165, "rewards/accuracy_reward": 0.9520937204360962, "rewards/format_reward": 1.0, "step": 1480 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 399.5, "epoch": 0.014952044422009087, "grad_norm": 2.122334497364833, "kl": 0.05810546875, "learning_rate": 9.994484802686132e-07, "loss": 0.0023, "reward": 1.534156322479248, "reward_std": 0.023593775928020477, "rewards/accuracy_reward": 0.43415623903274536, "rewards/format_reward": 1.0, "step": 1481 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.40625, "epoch": 0.014962140333165069, "grad_norm": 2.1041691358559063, "kl": 0.068359375, "learning_rate": 9.994477353606526e-07, "loss": 0.0027, "reward": 2.1310312747955322, "reward_std": 0.016085822135210037, "rewards/accuracy_reward": 0.9310312271118164, "rewards/format_reward": 1.0, "step": 1482 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.40625, "epoch": 0.01497223624432105, "grad_norm": 1.7689055824777622, "kl": 0.055419921875, "learning_rate": 9.994469899502558e-07, "loss": 0.0022, "reward": 2.189812660217285, "reward_std": 0.009342527016997337, "rewards/accuracy_reward": 0.9898124933242798, "rewards/format_reward": 1.0, "step": 1483 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.375, "epoch": 0.014982332155477032, "grad_norm": 3.403213155590955, "kl": 0.0673828125, "learning_rate": 9.994462440374238e-07, "loss": 0.0027, "reward": 2.0229063034057617, "reward_std": 0.011188726872205734, "rewards/accuracy_reward": 0.8229062557220459, "rewards/format_reward": 1.0, "step": 1484 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.0625, "epoch": 0.014992428066633014, "grad_norm": 1.7315946396051471, "kl": 0.0546875, "learning_rate": 9.994454976221572e-07, "loss": 0.0022, "reward": 2.029343843460083, "reward_std": 0.16208824515342712, "rewards/accuracy_reward": 0.8605937957763672, "rewards/format_reward": 1.0, "step": 1485 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 0.015002523977788996, "grad_norm": 2.209641639816817, "kl": 0.06640625, "learning_rate": 9.994447507044566e-07, "loss": 0.0027, "reward": 2.0487499237060547, "reward_std": 0.019954366609454155, "rewards/accuracy_reward": 0.8487499952316284, "rewards/format_reward": 1.0, "step": 1486 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.78125, "epoch": 0.015012619888944978, "grad_norm": 2.267963791356319, "kl": 0.058349609375, "learning_rate": 9.99444003284323e-07, "loss": 0.0023, "reward": 2.0253748893737793, "reward_std": 0.1343528926372528, "rewards/accuracy_reward": 0.8441250324249268, "rewards/format_reward": 1.0, "step": 1487 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.5625, "epoch": 0.015022715800100958, "grad_norm": 2.4369456112439307, "kl": 0.0751953125, "learning_rate": 9.994432553617572e-07, "loss": 0.003, "reward": 2.1479687690734863, "reward_std": 0.01981426030397415, "rewards/accuracy_reward": 0.9479687213897705, "rewards/format_reward": 1.0, "step": 1488 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.46875, "epoch": 0.01503281171125694, "grad_norm": 16.092811052044905, "kl": 0.0537109375, "learning_rate": 9.994425069367595e-07, "loss": 0.0022, "reward": 2.0812501907348633, "reward_std": 0.14036384224891663, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1489 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.59375, "epoch": 0.015042907622412923, "grad_norm": 1.8114729516248433, "kl": 0.045166015625, "learning_rate": 9.994417580093313e-07, "loss": 0.0018, "reward": 1.858625054359436, "reward_std": 0.10862252861261368, "rewards/accuracy_reward": 0.7148749828338623, "rewards/format_reward": 1.0, "step": 1490 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.34375, "epoch": 0.015053003533568905, "grad_norm": 2.4839889827034543, "kl": 0.0673828125, "learning_rate": 9.994410085794728e-07, "loss": 0.0027, "reward": 2.058000087738037, "reward_std": 0.035621047019958496, "rewards/accuracy_reward": 0.8580000400543213, "rewards/format_reward": 1.0, "step": 1491 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.21875, "epoch": 0.015063099444724887, "grad_norm": 2.384375342250049, "kl": 0.05615234375, "learning_rate": 9.99440258647185e-07, "loss": 0.0022, "reward": 2.104468822479248, "reward_std": 0.02448844164609909, "rewards/accuracy_reward": 0.9044687747955322, "rewards/format_reward": 1.0, "step": 1492 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 387.96875, "epoch": 0.015073195355880867, "grad_norm": 1.8443124849872894, "kl": 0.05859375, "learning_rate": 9.994395082124688e-07, "loss": 0.0023, "reward": 1.8472501039505005, "reward_std": 0.01360088400542736, "rewards/accuracy_reward": 0.6972500085830688, "rewards/format_reward": 1.0, "step": 1493 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.01508329126703685, "grad_norm": 2.6985317295056475, "kl": 0.0625, "learning_rate": 9.994387572753245e-07, "loss": 0.0025, "reward": 2.02915620803833, "reward_std": 0.009188695810735226, "rewards/accuracy_reward": 0.8291562795639038, "rewards/format_reward": 1.0, "step": 1494 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 411.5625, "epoch": 0.015093387178192832, "grad_norm": 2.2630570892776065, "kl": 0.05615234375, "learning_rate": 9.994380058357532e-07, "loss": 0.0022, "reward": 1.560390591621399, "reward_std": 0.010881512425839901, "rewards/accuracy_reward": 0.4603906273841858, "rewards/format_reward": 1.0, "step": 1495 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.78125, "epoch": 0.015103483089348814, "grad_norm": 1.9311387937773241, "kl": 0.06982421875, "learning_rate": 9.994372538937558e-07, "loss": 0.0028, "reward": 2.0896248817443848, "reward_std": 0.017381001263856888, "rewards/accuracy_reward": 0.889625072479248, "rewards/format_reward": 1.0, "step": 1496 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.90625, "epoch": 0.015113579000504796, "grad_norm": 2.620948089889624, "kl": 0.06396484375, "learning_rate": 9.994365014493326e-07, "loss": 0.0026, "reward": 2.132593870162964, "reward_std": 0.13476039469242096, "rewards/accuracy_reward": 0.9450937509536743, "rewards/format_reward": 1.0, "step": 1497 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.28125, "epoch": 0.015123674911660777, "grad_norm": 2.2519537066614563, "kl": 0.07421875, "learning_rate": 9.994357485024847e-07, "loss": 0.003, "reward": 2.145031213760376, "reward_std": 0.030863475054502487, "rewards/accuracy_reward": 0.9512811899185181, "rewards/format_reward": 1.0, "step": 1498 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.34375, "epoch": 0.015133770822816759, "grad_norm": 2.4384962225624376, "kl": 0.06787109375, "learning_rate": 9.994349950532128e-07, "loss": 0.0027, "reward": 2.154562473297119, "reward_std": 0.02205796167254448, "rewards/accuracy_reward": 0.9545624852180481, "rewards/format_reward": 1.0, "step": 1499 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 378.125, "epoch": 0.01514386673397274, "grad_norm": 2.4596597506629903, "kl": 0.064453125, "learning_rate": 9.994342411015177e-07, "loss": 0.0026, "reward": 2.1461782455444336, "reward_std": 0.039061300456523895, "rewards/accuracy_reward": 0.9649280905723572, "rewards/format_reward": 1.0, "step": 1500 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.0625, "epoch": 0.015153962645128723, "grad_norm": 1.6240702271188572, "kl": 0.0703125, "learning_rate": 9.994334866473998e-07, "loss": 0.0028, "reward": 2.165250062942505, "reward_std": 0.010158046148717403, "rewards/accuracy_reward": 0.9652498960494995, "rewards/format_reward": 1.0, "step": 1501 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.46875, "epoch": 0.015164058556284705, "grad_norm": 2.5612711351247923, "kl": 0.0654296875, "learning_rate": 9.994327316908604e-07, "loss": 0.0026, "reward": 2.110281229019165, "reward_std": 0.021865982562303543, "rewards/accuracy_reward": 0.9102813005447388, "rewards/format_reward": 1.0, "step": 1502 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.75, "epoch": 0.015174154467440687, "grad_norm": 2.803065318326512, "kl": 0.07177734375, "learning_rate": 9.994319762319e-07, "loss": 0.0029, "reward": 2.1263437271118164, "reward_std": 0.027065131813287735, "rewards/accuracy_reward": 0.9263437986373901, "rewards/format_reward": 1.0, "step": 1503 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.09375, "epoch": 0.015184250378596668, "grad_norm": 2.7674102166667494, "kl": 0.05859375, "learning_rate": 9.994312202705191e-07, "loss": 0.0024, "reward": 2.0380001068115234, "reward_std": 0.022434048354625702, "rewards/accuracy_reward": 0.8379999399185181, "rewards/format_reward": 1.0, "step": 1504 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 373.28125, "epoch": 0.01519434628975265, "grad_norm": 4.169335020094007, "kl": 0.06640625, "learning_rate": 9.994304638067188e-07, "loss": 0.0027, "reward": 2.0986251831054688, "reward_std": 0.03208031505346298, "rewards/accuracy_reward": 0.9048749208450317, "rewards/format_reward": 1.0, "step": 1505 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 387.34375, "epoch": 0.015204442200908632, "grad_norm": 2.5973517803301642, "kl": 0.06298828125, "learning_rate": 9.994297068405e-07, "loss": 0.0025, "reward": 1.7568750381469727, "reward_std": 0.026615561917424202, "rewards/accuracy_reward": 0.6131249666213989, "rewards/format_reward": 1.0, "step": 1506 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 380.90625, "epoch": 0.015214538112064614, "grad_norm": 3.2385565052244574, "kl": 0.0732421875, "learning_rate": 9.994289493718631e-07, "loss": 0.0029, "reward": 2.040750026702881, "reward_std": 0.06281974911689758, "rewards/accuracy_reward": 0.8532499670982361, "rewards/format_reward": 1.0, "step": 1507 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 374.34375, "epoch": 0.015224634023220596, "grad_norm": 3.4313904796537282, "kl": 0.083984375, "learning_rate": 9.99428191400809e-07, "loss": 0.0034, "reward": 2.1209373474121094, "reward_std": 0.025013674050569534, "rewards/accuracy_reward": 0.9209375381469727, "rewards/format_reward": 1.0, "step": 1508 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.65625, "epoch": 0.015234729934376577, "grad_norm": 2.5056638473002875, "kl": 0.07177734375, "learning_rate": 9.994274329273383e-07, "loss": 0.0029, "reward": 2.0720624923706055, "reward_std": 0.09376393258571625, "rewards/accuracy_reward": 0.8783124685287476, "rewards/format_reward": 1.0, "step": 1509 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 0.015244825845532559, "grad_norm": 2.826814689150976, "kl": 0.0732421875, "learning_rate": 9.99426673951452e-07, "loss": 0.0029, "reward": 2.138406276702881, "reward_std": 0.026191147044301033, "rewards/accuracy_reward": 0.9384062886238098, "rewards/format_reward": 1.0, "step": 1510 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.6875, "epoch": 0.015254921756688541, "grad_norm": 4.0662805346322815, "kl": 0.058349609375, "learning_rate": 9.99425914473151e-07, "loss": 0.0023, "reward": 1.7927812337875366, "reward_std": 0.01713290996849537, "rewards/accuracy_reward": 0.6427812576293945, "rewards/format_reward": 1.0, "step": 1511 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.21875, "epoch": 0.015265017667844523, "grad_norm": 1.3007647110544793, "kl": 0.059326171875, "learning_rate": 9.994251544924356e-07, "loss": 0.0024, "reward": 1.883500099182129, "reward_std": 0.221793532371521, "rewards/accuracy_reward": 0.7397499680519104, "rewards/format_reward": 1.0, "step": 1512 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.34375, "epoch": 0.015275113579000505, "grad_norm": 2.3352282275679204, "kl": 0.06884765625, "learning_rate": 9.99424394009307e-07, "loss": 0.0028, "reward": 2.1018126010894775, "reward_std": 0.018278736621141434, "rewards/accuracy_reward": 0.9018124938011169, "rewards/format_reward": 1.0, "step": 1513 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.75, "epoch": 0.015285209490156486, "grad_norm": 2.304451592813302, "kl": 0.0654296875, "learning_rate": 9.994236330237658e-07, "loss": 0.0026, "reward": 1.8486250638961792, "reward_std": 0.18426670134067535, "rewards/accuracy_reward": 0.6736249923706055, "rewards/format_reward": 1.0, "step": 1514 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.84375, "epoch": 0.015295305401312468, "grad_norm": 1.942097277131215, "kl": 0.07177734375, "learning_rate": 9.994228715358127e-07, "loss": 0.0029, "reward": 1.8287811279296875, "reward_std": 0.030726591125130653, "rewards/accuracy_reward": 0.6850312352180481, "rewards/format_reward": 1.0, "step": 1515 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.53125, "epoch": 0.01530540131246845, "grad_norm": 1.095165805170467, "kl": 0.0634765625, "learning_rate": 9.994221095454485e-07, "loss": 0.0025, "reward": 1.7279062271118164, "reward_std": 0.007748203352093697, "rewards/accuracy_reward": 0.5779062509536743, "rewards/format_reward": 1.0, "step": 1516 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 442.65625, "epoch": 0.015315497223624432, "grad_norm": 2.023712175895843, "kl": 0.06396484375, "learning_rate": 9.99421347052674e-07, "loss": 0.0026, "reward": 1.9560625553131104, "reward_std": 0.15461555123329163, "rewards/accuracy_reward": 0.7810624837875366, "rewards/format_reward": 1.0, "step": 1517 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.875, "epoch": 0.015325593134780414, "grad_norm": 2.161619982067387, "kl": 0.0595703125, "learning_rate": 9.994205840574899e-07, "loss": 0.0024, "reward": 1.994749903678894, "reward_std": 0.16527585685253143, "rewards/accuracy_reward": 0.8259999752044678, "rewards/format_reward": 1.0, "step": 1518 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.015335689045936397, "grad_norm": 3.472213317180419, "kl": 0.080078125, "learning_rate": 9.994198205598968e-07, "loss": 0.0032, "reward": 2.046250104904175, "reward_std": 0.03523227199912071, "rewards/accuracy_reward": 0.846250057220459, "rewards/format_reward": 1.0, "step": 1519 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 434.28125, "epoch": 0.015345784957092377, "grad_norm": 7.581725466046477, "kl": 0.0654296875, "learning_rate": 9.994190565598962e-07, "loss": 0.0026, "reward": 2.0166563987731934, "reward_std": 0.044376544654369354, "rewards/accuracy_reward": 0.8291562795639038, "rewards/format_reward": 1.0, "step": 1520 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 450.75, "epoch": 0.01535588086824836, "grad_norm": 1.883863362355415, "kl": 0.0576171875, "learning_rate": 9.99418292057488e-07, "loss": 0.0023, "reward": 1.5499999523162842, "reward_std": 0.11558602005243301, "rewards/accuracy_reward": 0.45625001192092896, "rewards/format_reward": 1.0, "step": 1521 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.34375, "epoch": 0.015365976779404341, "grad_norm": 6.705710485143173, "kl": 0.07080078125, "learning_rate": 9.994175270526732e-07, "loss": 0.0028, "reward": 2.0567500591278076, "reward_std": 0.017477434128522873, "rewards/accuracy_reward": 0.8567500710487366, "rewards/format_reward": 1.0, "step": 1522 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 450.21875, "epoch": 0.015376072690560324, "grad_norm": 1.888264089872661, "kl": 0.068359375, "learning_rate": 9.99416761545453e-07, "loss": 0.0027, "reward": 2.1631250381469727, "reward_std": 0.0771547332406044, "rewards/accuracy_reward": 0.9881250262260437, "rewards/format_reward": 1.0, "step": 1523 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 438.625, "epoch": 0.015386168601716306, "grad_norm": 2.757804431521899, "kl": 0.057373046875, "learning_rate": 9.994159955358278e-07, "loss": 0.0023, "reward": 2.0151562690734863, "reward_std": 0.17488455772399902, "rewards/accuracy_reward": 0.8339062929153442, "rewards/format_reward": 1.0, "step": 1524 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 452.25, "epoch": 0.015396264512872286, "grad_norm": 2.0989417733296656, "kl": 0.064453125, "learning_rate": 9.994152290237982e-07, "loss": 0.0026, "reward": 1.7600312232971191, "reward_std": 0.26203086972236633, "rewards/accuracy_reward": 0.6350312829017639, "rewards/format_reward": 1.0, "step": 1525 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 448.8125, "epoch": 0.015406360424028268, "grad_norm": 3.838729988588869, "kl": 0.056640625, "learning_rate": 9.994144620093655e-07, "loss": 0.0023, "reward": 1.8004374504089355, "reward_std": 0.1572166234254837, "rewards/accuracy_reward": 0.6754374504089355, "rewards/format_reward": 1.0, "step": 1526 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.8125, "epoch": 0.01541645633518425, "grad_norm": 1.9752301654556306, "kl": 0.06640625, "learning_rate": 9.9941369449253e-07, "loss": 0.0026, "reward": 1.8192187547683716, "reward_std": 0.016062095761299133, "rewards/accuracy_reward": 0.6692187786102295, "rewards/format_reward": 1.0, "step": 1527 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 427.8125, "epoch": 0.015426552246340233, "grad_norm": 1.6580340696154505, "kl": 0.0595703125, "learning_rate": 9.994129264732926e-07, "loss": 0.0024, "reward": 1.827375054359436, "reward_std": 0.006862184964120388, "rewards/accuracy_reward": 0.6773749589920044, "rewards/format_reward": 1.0, "step": 1528 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.09375, "epoch": 0.015436648157496215, "grad_norm": 5.294232792838385, "kl": 0.068359375, "learning_rate": 9.994121579516543e-07, "loss": 0.0027, "reward": 2.1484062671661377, "reward_std": 0.04165094718337059, "rewards/accuracy_reward": 0.9546562433242798, "rewards/format_reward": 1.0, "step": 1529 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 439.21875, "epoch": 0.015446744068652195, "grad_norm": 2.023297686859495, "kl": 0.059326171875, "learning_rate": 9.994113889276158e-07, "loss": 0.0024, "reward": 2.1370625495910645, "reward_std": 0.01609312929213047, "rewards/accuracy_reward": 0.9370625019073486, "rewards/format_reward": 1.0, "step": 1530 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 432.03125, "epoch": 0.015456839979808177, "grad_norm": 2.2850462116689285, "kl": 0.055908203125, "learning_rate": 9.994106194011772e-07, "loss": 0.0022, "reward": 1.8870000839233398, "reward_std": 0.3177204132080078, "rewards/accuracy_reward": 0.737000048160553, "rewards/format_reward": 1.0, "step": 1531 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.96875, "epoch": 0.01546693589096416, "grad_norm": 2.7665553321450997, "kl": 0.0673828125, "learning_rate": 9.994098493723404e-07, "loss": 0.0027, "reward": 2.0755624771118164, "reward_std": 0.025561025366187096, "rewards/accuracy_reward": 0.8755625486373901, "rewards/format_reward": 1.0, "step": 1532 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.375, "epoch": 0.015477031802120142, "grad_norm": 1.8371586209240123, "kl": 0.0634765625, "learning_rate": 9.994090788411054e-07, "loss": 0.0025, "reward": 2.1465001106262207, "reward_std": 0.010845663025975227, "rewards/accuracy_reward": 0.9465000629425049, "rewards/format_reward": 1.0, "step": 1533 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.84375, "epoch": 0.015487127713276124, "grad_norm": 2.946816233827139, "kl": 0.06005859375, "learning_rate": 9.994083078074734e-07, "loss": 0.0024, "reward": 2.1190314292907715, "reward_std": 0.016334168612957, "rewards/accuracy_reward": 0.9190312623977661, "rewards/format_reward": 1.0, "step": 1534 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 422.875, "epoch": 0.015497223624432104, "grad_norm": 1.8070718041732503, "kl": 0.0615234375, "learning_rate": 9.994075362714448e-07, "loss": 0.0025, "reward": 1.788312554359436, "reward_std": 0.03336987644433975, "rewards/accuracy_reward": 0.6445624828338623, "rewards/format_reward": 1.0, "step": 1535 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 435.5625, "epoch": 0.015507319535588086, "grad_norm": 3.9345356818774446, "kl": 0.0595703125, "learning_rate": 9.994067642330203e-07, "loss": 0.0024, "reward": 2.028437614440918, "reward_std": 0.10268658399581909, "rewards/accuracy_reward": 0.8471875190734863, "rewards/format_reward": 1.0, "step": 1536 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.34375, "epoch": 0.015517415446744069, "grad_norm": 5.4634413669013755, "kl": 0.0703125, "learning_rate": 9.994059916922014e-07, "loss": 0.0028, "reward": 2.000781297683716, "reward_std": 0.03346104174852371, "rewards/accuracy_reward": 0.8007813096046448, "rewards/format_reward": 1.0, "step": 1537 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.40625, "epoch": 0.01552751135790005, "grad_norm": 2.4561716770320983, "kl": 0.06396484375, "learning_rate": 9.99405218648988e-07, "loss": 0.0026, "reward": 2.1491875648498535, "reward_std": 0.026994237676262856, "rewards/accuracy_reward": 0.9554375410079956, "rewards/format_reward": 1.0, "step": 1538 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.1875, "epoch": 0.015537607269056033, "grad_norm": 1.9661358079864868, "kl": 0.06982421875, "learning_rate": 9.994044451033815e-07, "loss": 0.0028, "reward": 1.9832812547683716, "reward_std": 0.03882398456335068, "rewards/accuracy_reward": 0.7957811951637268, "rewards/format_reward": 1.0, "step": 1539 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.1875, "epoch": 0.015547703180212015, "grad_norm": 2.7198117156804655, "kl": 0.057373046875, "learning_rate": 9.994036710553822e-07, "loss": 0.0023, "reward": 2.0845627784729004, "reward_std": 0.02714432403445244, "rewards/accuracy_reward": 0.8908125162124634, "rewards/format_reward": 1.0, "step": 1540 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.78125, "epoch": 0.015557799091367995, "grad_norm": 2.0505563653741636, "kl": 0.053955078125, "learning_rate": 9.994028965049913e-07, "loss": 0.0022, "reward": 2.0851876735687256, "reward_std": 0.041111648082733154, "rewards/accuracy_reward": 0.8976874947547913, "rewards/format_reward": 1.0, "step": 1541 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.125, "epoch": 0.015567895002523978, "grad_norm": 2.110323513848292, "kl": 0.057373046875, "learning_rate": 9.994021214522094e-07, "loss": 0.0023, "reward": 2.0774688720703125, "reward_std": 0.01849416270852089, "rewards/accuracy_reward": 0.8774687051773071, "rewards/format_reward": 1.0, "step": 1542 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.96875, "epoch": 0.01557799091367996, "grad_norm": 2.493980452200041, "kl": 0.059814453125, "learning_rate": 9.994013458970373e-07, "loss": 0.0024, "reward": 2.153156280517578, "reward_std": 0.06325649470090866, "rewards/accuracy_reward": 0.9719062447547913, "rewards/format_reward": 1.0, "step": 1543 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.25, "epoch": 0.015588086824835942, "grad_norm": 1.2385366633785007, "kl": 0.054443359375, "learning_rate": 9.994005698394759e-07, "loss": 0.0022, "reward": 1.809000015258789, "reward_std": 0.020828809589147568, "rewards/accuracy_reward": 0.6652500033378601, "rewards/format_reward": 1.0, "step": 1544 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.15625, "epoch": 0.015598182735991924, "grad_norm": 3.363401420233733, "kl": 0.0576171875, "learning_rate": 9.993997932795256e-07, "loss": 0.0023, "reward": 2.1152188777923584, "reward_std": 0.03977964445948601, "rewards/accuracy_reward": 0.9214687347412109, "rewards/format_reward": 1.0, "step": 1545 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 393.9375, "epoch": 0.015608278647147904, "grad_norm": 5.301352351920704, "kl": 0.053955078125, "learning_rate": 9.993990162171877e-07, "loss": 0.0022, "reward": 1.8049376010894775, "reward_std": 0.03029829077422619, "rewards/accuracy_reward": 0.661187469959259, "rewards/format_reward": 1.0, "step": 1546 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 377.375, "epoch": 0.015618374558303887, "grad_norm": 2.5149954829718086, "kl": 0.0615234375, "learning_rate": 9.993982386524624e-07, "loss": 0.0025, "reward": 1.8996250629425049, "reward_std": 0.03380618244409561, "rewards/accuracy_reward": 0.705875039100647, "rewards/format_reward": 1.0, "step": 1547 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 376.71875, "epoch": 0.01562847046945987, "grad_norm": 2.0132045527364584, "kl": 0.060546875, "learning_rate": 9.99397460585351e-07, "loss": 0.0024, "reward": 1.9944063425064087, "reward_std": 0.03420616686344147, "rewards/accuracy_reward": 0.800656259059906, "rewards/format_reward": 1.0, "step": 1548 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.375, "epoch": 0.01563856638061585, "grad_norm": 2.7851259571782454, "kl": 0.061767578125, "learning_rate": 9.99396682015854e-07, "loss": 0.0025, "reward": 2.0658750534057617, "reward_std": 0.03308906406164169, "rewards/accuracy_reward": 0.8721250295639038, "rewards/format_reward": 1.0, "step": 1549 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 432.34375, "epoch": 0.015648662291771833, "grad_norm": 1.7396589861647895, "kl": 0.0419921875, "learning_rate": 9.993959029439723e-07, "loss": 0.0017, "reward": 1.5544999837875366, "reward_std": 0.23221111297607422, "rewards/accuracy_reward": 0.46700000762939453, "rewards/format_reward": 1.0, "step": 1550 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.125, "epoch": 0.015658758202927815, "grad_norm": 3.115198786324134, "kl": 0.05126953125, "learning_rate": 9.993951233697067e-07, "loss": 0.0021, "reward": 2.1370625495910645, "reward_std": 0.017109226435422897, "rewards/accuracy_reward": 0.9370625019073486, "rewards/format_reward": 1.0, "step": 1551 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 441.0625, "epoch": 0.015668854114083797, "grad_norm": 1.5763274684727755, "kl": 0.0400390625, "learning_rate": 9.993943432930578e-07, "loss": 0.0016, "reward": 2.1750001907348633, "reward_std": 0.05850087106227875, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1552 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.65625, "epoch": 0.01567895002523978, "grad_norm": 2.379151641187572, "kl": 0.05810546875, "learning_rate": 9.993935627140266e-07, "loss": 0.0023, "reward": 2.110281467437744, "reward_std": 0.03353164345026016, "rewards/accuracy_reward": 0.9165312647819519, "rewards/format_reward": 1.0, "step": 1553 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.0625, "epoch": 0.015689045936395758, "grad_norm": 3.9314269308325156, "kl": 0.057861328125, "learning_rate": 9.993927816326138e-07, "loss": 0.0023, "reward": 1.8724374771118164, "reward_std": 0.10516127943992615, "rewards/accuracy_reward": 0.7224375009536743, "rewards/format_reward": 1.0, "step": 1554 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.9375, "epoch": 0.01569914184755174, "grad_norm": 4.081508479560071, "kl": 0.06005859375, "learning_rate": 9.993920000488201e-07, "loss": 0.0024, "reward": 2.114656448364258, "reward_std": 0.01626952365040779, "rewards/accuracy_reward": 0.9146562218666077, "rewards/format_reward": 1.0, "step": 1555 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.78125, "epoch": 0.015709237758707723, "grad_norm": 2.4840009032268604, "kl": 0.06591796875, "learning_rate": 9.993912179626465e-07, "loss": 0.0026, "reward": 2.095250129699707, "reward_std": 0.035855066031217575, "rewards/accuracy_reward": 0.9014999866485596, "rewards/format_reward": 1.0, "step": 1556 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.0, "epoch": 0.015719333669863705, "grad_norm": 2.4429376760289685, "kl": 0.05419921875, "learning_rate": 9.993904353740936e-07, "loss": 0.0022, "reward": 2.0978751182556152, "reward_std": 0.043658822774887085, "rewards/accuracy_reward": 0.9041249752044678, "rewards/format_reward": 1.0, "step": 1557 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.1875, "epoch": 0.015729429581019687, "grad_norm": 1.839576094463747, "kl": 0.055419921875, "learning_rate": 9.993896522831621e-07, "loss": 0.0022, "reward": 1.944406270980835, "reward_std": 0.15984486043453217, "rewards/accuracy_reward": 0.781906247138977, "rewards/format_reward": 1.0, "step": 1558 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.71875, "epoch": 0.01573952549217567, "grad_norm": 2.167523511962165, "kl": 0.06689453125, "learning_rate": 9.993888686898533e-07, "loss": 0.0027, "reward": 2.1389689445495605, "reward_std": 0.015454985201358795, "rewards/accuracy_reward": 0.9389687180519104, "rewards/format_reward": 1.0, "step": 1559 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 428.71875, "epoch": 0.01574962140333165, "grad_norm": 2.4720111265902927, "kl": 0.056640625, "learning_rate": 9.993880845941674e-07, "loss": 0.0023, "reward": 1.6834686994552612, "reward_std": 0.1744450479745865, "rewards/accuracy_reward": 0.5584688186645508, "rewards/format_reward": 1.0, "step": 1560 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 458.0625, "epoch": 0.015759717314487633, "grad_norm": 1.2648774076770226, "kl": 0.038818359375, "learning_rate": 9.993872999961053e-07, "loss": 0.0016, "reward": 1.3625000715255737, "reward_std": 0.24493902921676636, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 1561 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.9375, "epoch": 0.015769813225643615, "grad_norm": 2.063300188252303, "kl": 0.056640625, "learning_rate": 9.99386514895668e-07, "loss": 0.0023, "reward": 2.092031478881836, "reward_std": 0.0309164896607399, "rewards/accuracy_reward": 0.9045311808586121, "rewards/format_reward": 1.0, "step": 1562 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 430.09375, "epoch": 0.015779909136799598, "grad_norm": 3.952506492104965, "kl": 0.05615234375, "learning_rate": 9.993857292928563e-07, "loss": 0.0023, "reward": 1.7801719903945923, "reward_std": 0.01924155279994011, "rewards/accuracy_reward": 0.6301718950271606, "rewards/format_reward": 1.0, "step": 1563 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 432.96875, "epoch": 0.015790005047955576, "grad_norm": 2.1528280275195772, "kl": 0.06298828125, "learning_rate": 9.993849431876706e-07, "loss": 0.0025, "reward": 1.8253436088562012, "reward_std": 0.014097852632403374, "rewards/accuracy_reward": 0.6753437519073486, "rewards/format_reward": 1.0, "step": 1564 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.28125, "epoch": 0.01580010095911156, "grad_norm": 2.242875554376563, "kl": 0.056884765625, "learning_rate": 9.993841565801121e-07, "loss": 0.0023, "reward": 2.0817501544952393, "reward_std": 0.0575135238468647, "rewards/accuracy_reward": 0.906749963760376, "rewards/format_reward": 1.0, "step": 1565 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 459.28125, "epoch": 0.01581019687026754, "grad_norm": 1.5284020196848433, "kl": 0.044189453125, "learning_rate": 9.993833694701817e-07, "loss": 0.0018, "reward": 1.868749976158142, "reward_std": 0.0444038063287735, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1566 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.53125, "epoch": 0.015820292781423523, "grad_norm": 2.5210438033320175, "kl": 0.06787109375, "learning_rate": 9.993825818578796e-07, "loss": 0.0027, "reward": 2.1061251163482666, "reward_std": 0.03735330328345299, "rewards/accuracy_reward": 0.9123749732971191, "rewards/format_reward": 1.0, "step": 1567 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.84375, "epoch": 0.015830388692579505, "grad_norm": 1.8509558947601488, "kl": 0.06494140625, "learning_rate": 9.993817937432073e-07, "loss": 0.0026, "reward": 2.1451563835144043, "reward_std": 0.06291639059782028, "rewards/accuracy_reward": 0.9639062881469727, "rewards/format_reward": 1.0, "step": 1568 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 457.125, "epoch": 0.015840484603735487, "grad_norm": 3.5114146285871337, "kl": 0.052734375, "learning_rate": 9.993810051261649e-07, "loss": 0.0021, "reward": 1.8743906021118164, "reward_std": 0.10314375907182693, "rewards/accuracy_reward": 0.7243906259536743, "rewards/format_reward": 1.0, "step": 1569 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.21875, "epoch": 0.01585058051489147, "grad_norm": 1.4021654044629417, "kl": 0.05712890625, "learning_rate": 9.993802160067537e-07, "loss": 0.0023, "reward": 2.0806031227111816, "reward_std": 0.023446839302778244, "rewards/accuracy_reward": 0.8868530988693237, "rewards/format_reward": 1.0, "step": 1570 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.53125, "epoch": 0.01586067642604745, "grad_norm": 1.4403409473132525, "kl": 0.0517578125, "learning_rate": 9.993794263849744e-07, "loss": 0.0021, "reward": 2.172593593597412, "reward_std": 0.00771385058760643, "rewards/accuracy_reward": 0.9725937843322754, "rewards/format_reward": 1.0, "step": 1571 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.875, "epoch": 0.015870772337203434, "grad_norm": 1.8351453582850978, "kl": 0.059814453125, "learning_rate": 9.993786362608276e-07, "loss": 0.0024, "reward": 2.1127188205718994, "reward_std": 0.02718566730618477, "rewards/accuracy_reward": 0.9127187728881836, "rewards/format_reward": 1.0, "step": 1572 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.03125, "epoch": 0.015880868248359416, "grad_norm": 1.8077469179739687, "kl": 0.0615234375, "learning_rate": 9.99377845634314e-07, "loss": 0.0025, "reward": 2.1058437824249268, "reward_std": 0.05023786053061485, "rewards/accuracy_reward": 0.9120936989784241, "rewards/format_reward": 1.0, "step": 1573 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 411.96875, "epoch": 0.015890964159515398, "grad_norm": 1.38844274044611, "kl": 0.061767578125, "learning_rate": 9.99377054505435e-07, "loss": 0.0025, "reward": 1.5598750114440918, "reward_std": 0.007468806579709053, "rewards/accuracy_reward": 0.4598750174045563, "rewards/format_reward": 1.0, "step": 1574 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.015901060070671377, "grad_norm": 2.5166919185481476, "kl": 0.0634765625, "learning_rate": 9.99376262874191e-07, "loss": 0.0025, "reward": 1.8284125328063965, "reward_std": 0.11523476988077164, "rewards/accuracy_reward": 0.6784124970436096, "rewards/format_reward": 1.0, "step": 1575 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 432.625, "epoch": 0.01591115598182736, "grad_norm": 23.747907813998985, "kl": 0.05078125, "learning_rate": 9.993754707405826e-07, "loss": 0.002, "reward": 1.6725624799728394, "reward_std": 0.16391463577747345, "rewards/accuracy_reward": 0.5725624561309814, "rewards/format_reward": 1.0, "step": 1576 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.21875, "epoch": 0.01592125189298334, "grad_norm": 1.6650077844568256, "kl": 0.058837890625, "learning_rate": 9.993746781046106e-07, "loss": 0.0023, "reward": 2.167062282562256, "reward_std": 0.02451390214264393, "rewards/accuracy_reward": 0.973312497138977, "rewards/format_reward": 1.0, "step": 1577 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.84375, "epoch": 0.015931347804139323, "grad_norm": 1.6065568481272674, "kl": 0.05859375, "learning_rate": 9.993738849662764e-07, "loss": 0.0023, "reward": 2.1715002059936523, "reward_std": 0.017188165336847305, "rewards/accuracy_reward": 0.971500039100647, "rewards/format_reward": 1.0, "step": 1578 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.015941443715295305, "grad_norm": 3.0757534199823833, "kl": 0.07861328125, "learning_rate": 9.993730913255802e-07, "loss": 0.0032, "reward": 2.0804686546325684, "reward_std": 0.028899336233735085, "rewards/accuracy_reward": 0.8804687261581421, "rewards/format_reward": 1.0, "step": 1579 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.015951539626451287, "grad_norm": 3.6079866722541065, "kl": 0.0703125, "learning_rate": 9.993722971825232e-07, "loss": 0.0028, "reward": 2.0670530796051025, "reward_std": 0.03639424592256546, "rewards/accuracy_reward": 0.8670531511306763, "rewards/format_reward": 1.0, "step": 1580 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.03125, "epoch": 0.01596163553760727, "grad_norm": 2.0216499670944414, "kl": 0.0673828125, "learning_rate": 9.993715025371056e-07, "loss": 0.0027, "reward": 2.094437599182129, "reward_std": 0.017050331458449364, "rewards/accuracy_reward": 0.8944375514984131, "rewards/format_reward": 1.0, "step": 1581 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.1875, "epoch": 0.01597173144876325, "grad_norm": 2.0615080471909804, "kl": 0.0634765625, "learning_rate": 9.99370707389329e-07, "loss": 0.0025, "reward": 2.066621780395508, "reward_std": 0.018746696412563324, "rewards/accuracy_reward": 0.8666219115257263, "rewards/format_reward": 1.0, "step": 1582 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.5625, "epoch": 0.015981827359919234, "grad_norm": 3.5696895563677478, "kl": 0.0625, "learning_rate": 9.993699117391935e-07, "loss": 0.0025, "reward": 2.0648438930511475, "reward_std": 0.03642412647604942, "rewards/accuracy_reward": 0.8710938096046448, "rewards/format_reward": 1.0, "step": 1583 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.59375, "epoch": 0.015991923271075216, "grad_norm": 2.525195909569904, "kl": 0.064453125, "learning_rate": 9.993691155867003e-07, "loss": 0.0026, "reward": 1.776750087738037, "reward_std": 0.022416772320866585, "rewards/accuracy_reward": 0.6267499923706055, "rewards/format_reward": 1.0, "step": 1584 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 394.28125, "epoch": 0.016002019182231195, "grad_norm": 2.2912316474284657, "kl": 0.06494140625, "learning_rate": 9.993683189318501e-07, "loss": 0.0026, "reward": 1.5481562614440918, "reward_std": 0.016658321022987366, "rewards/accuracy_reward": 0.4481562376022339, "rewards/format_reward": 1.0, "step": 1585 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.5625, "epoch": 0.016012115093387177, "grad_norm": 1.8486798535432867, "kl": 0.055419921875, "learning_rate": 9.993675217746438e-07, "loss": 0.0022, "reward": 2.0383126735687256, "reward_std": 0.03037569858133793, "rewards/accuracy_reward": 0.8445624709129333, "rewards/format_reward": 1.0, "step": 1586 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 386.40625, "epoch": 0.01602221100454316, "grad_norm": 2.2549603885974063, "kl": 0.0673828125, "learning_rate": 9.99366724115082e-07, "loss": 0.0027, "reward": 1.7818437814712524, "reward_std": 0.030613556504249573, "rewards/accuracy_reward": 0.6380937099456787, "rewards/format_reward": 1.0, "step": 1587 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.40625, "epoch": 0.01603230691569914, "grad_norm": 1.6461967011111076, "kl": 0.06884765625, "learning_rate": 9.993659259531654e-07, "loss": 0.0028, "reward": 2.045250177383423, "reward_std": 0.006834371481090784, "rewards/accuracy_reward": 0.8452500104904175, "rewards/format_reward": 1.0, "step": 1588 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.5625, "epoch": 0.016042402826855123, "grad_norm": 1.8476738441999505, "kl": 0.06396484375, "learning_rate": 9.993651272888951e-07, "loss": 0.0026, "reward": 2.130312442779541, "reward_std": 0.007906763814389706, "rewards/accuracy_reward": 0.9303125143051147, "rewards/format_reward": 1.0, "step": 1589 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.0625, "epoch": 0.016052498738011105, "grad_norm": 2.1242560998794566, "kl": 0.056884765625, "learning_rate": 9.99364328122272e-07, "loss": 0.0023, "reward": 2.0283000469207764, "reward_std": 0.17730531096458435, "rewards/accuracy_reward": 0.8470500111579895, "rewards/format_reward": 1.0, "step": 1590 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.53125, "epoch": 0.016062594649167088, "grad_norm": 1.645611003576552, "kl": 0.06640625, "learning_rate": 9.993635284532966e-07, "loss": 0.0027, "reward": 1.8347811698913574, "reward_std": 0.022188015282154083, "rewards/accuracy_reward": 0.691031277179718, "rewards/format_reward": 1.0, "step": 1591 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.5, "epoch": 0.01607269056032307, "grad_norm": 2.55465445139308, "kl": 0.07080078125, "learning_rate": 9.993627282819699e-07, "loss": 0.0028, "reward": 1.951812505722046, "reward_std": 0.0607643723487854, "rewards/accuracy_reward": 0.7518125176429749, "rewards/format_reward": 1.0, "step": 1592 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.25, "epoch": 0.016082786471479052, "grad_norm": 2.01925255844704, "kl": 0.06591796875, "learning_rate": 9.993619276082924e-07, "loss": 0.0026, "reward": 2.134256362915039, "reward_std": 0.035496484488248825, "rewards/accuracy_reward": 0.9405062198638916, "rewards/format_reward": 1.0, "step": 1593 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.25, "epoch": 0.016092882382635034, "grad_norm": 2.5345328436311987, "kl": 0.06201171875, "learning_rate": 9.993611264322654e-07, "loss": 0.0025, "reward": 2.0471625328063965, "reward_std": 0.04415571689605713, "rewards/accuracy_reward": 0.8471624851226807, "rewards/format_reward": 1.0, "step": 1594 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.3125, "epoch": 0.016102978293791016, "grad_norm": 1.9097834351015055, "kl": 0.05419921875, "learning_rate": 9.993603247538894e-07, "loss": 0.0022, "reward": 2.0826468467712402, "reward_std": 0.11744807660579681, "rewards/accuracy_reward": 0.8888968825340271, "rewards/format_reward": 1.0, "step": 1595 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.4375, "epoch": 0.016113074204946995, "grad_norm": 3.008998099678651, "kl": 0.06787109375, "learning_rate": 9.99359522573165e-07, "loss": 0.0027, "reward": 2.069624900817871, "reward_std": 0.03300727158784866, "rewards/accuracy_reward": 0.8758749961853027, "rewards/format_reward": 1.0, "step": 1596 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.53125, "epoch": 0.016123170116102977, "grad_norm": 4.042738553352234, "kl": 0.0771484375, "learning_rate": 9.993587198900933e-07, "loss": 0.0031, "reward": 2.113009452819824, "reward_std": 0.026713114231824875, "rewards/accuracy_reward": 0.9130094051361084, "rewards/format_reward": 1.0, "step": 1597 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.875, "epoch": 0.01613326602725896, "grad_norm": 4.019911459986341, "kl": 0.064453125, "learning_rate": 9.99357916704675e-07, "loss": 0.0026, "reward": 2.0960311889648438, "reward_std": 0.01957208663225174, "rewards/accuracy_reward": 0.8960313200950623, "rewards/format_reward": 1.0, "step": 1598 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.59375, "epoch": 0.01614336193841494, "grad_norm": 1.3438351635262988, "kl": 0.048828125, "learning_rate": 9.99357113016911e-07, "loss": 0.0019, "reward": 1.8249657154083252, "reward_std": 0.11175832897424698, "rewards/accuracy_reward": 0.6812155842781067, "rewards/format_reward": 1.0, "step": 1599 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.875, "epoch": 0.016153457849570924, "grad_norm": 5.5020104876083495, "kl": 0.07080078125, "learning_rate": 9.993563088268022e-07, "loss": 0.0028, "reward": 2.039781093597412, "reward_std": 0.029841627925634384, "rewards/accuracy_reward": 0.8397812843322754, "rewards/format_reward": 1.0, "step": 1600 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 0.016163553760726906, "grad_norm": 2.6059621245860436, "kl": 0.07177734375, "learning_rate": 9.993555041343492e-07, "loss": 0.0029, "reward": 2.0909688472747803, "reward_std": 0.027268730103969574, "rewards/accuracy_reward": 0.8909687399864197, "rewards/format_reward": 1.0, "step": 1601 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.0, "epoch": 0.016173649671882888, "grad_norm": 2.1330804552287144, "kl": 0.06640625, "learning_rate": 9.99354698939553e-07, "loss": 0.0026, "reward": 2.1492812633514404, "reward_std": 0.018289897590875626, "rewards/accuracy_reward": 0.9492812156677246, "rewards/format_reward": 1.0, "step": 1602 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 400.875, "epoch": 0.01618374558303887, "grad_norm": 1.788296348071005, "kl": 0.059814453125, "learning_rate": 9.993538932424143e-07, "loss": 0.0024, "reward": 1.5242187976837158, "reward_std": 0.11346618831157684, "rewards/accuracy_reward": 0.43046876788139343, "rewards/format_reward": 1.0, "step": 1603 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.34375, "epoch": 0.016193841494194852, "grad_norm": 1.6682076327201554, "kl": 0.0595703125, "learning_rate": 9.993530870429337e-07, "loss": 0.0024, "reward": 2.16015625, "reward_std": 0.009950734674930573, "rewards/accuracy_reward": 0.960156261920929, "rewards/format_reward": 1.0, "step": 1604 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.6875, "epoch": 0.016203937405350834, "grad_norm": 1.480196193476887, "kl": 0.052978515625, "learning_rate": 9.993522803411123e-07, "loss": 0.0021, "reward": 2.0108437538146973, "reward_std": 0.1670393943786621, "rewards/accuracy_reward": 0.8295937776565552, "rewards/format_reward": 1.0, "step": 1605 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.875, "epoch": 0.016214033316506813, "grad_norm": 3.4887517643677373, "kl": 0.06396484375, "learning_rate": 9.99351473136951e-07, "loss": 0.0026, "reward": 2.090684413909912, "reward_std": 0.020449530333280563, "rewards/accuracy_reward": 0.8906843662261963, "rewards/format_reward": 1.0, "step": 1606 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.6875, "epoch": 0.016224129227662795, "grad_norm": 2.338594779947078, "kl": 0.07177734375, "learning_rate": 9.993506654304503e-07, "loss": 0.0029, "reward": 2.0017812252044678, "reward_std": 0.03157518059015274, "rewards/accuracy_reward": 0.8017812967300415, "rewards/format_reward": 1.0, "step": 1607 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.90625, "epoch": 0.016234225138818777, "grad_norm": 1.3117127168116665, "kl": 0.0654296875, "learning_rate": 9.99349857221611e-07, "loss": 0.0026, "reward": 1.8192750215530396, "reward_std": 0.02963395230472088, "rewards/accuracy_reward": 0.6692750453948975, "rewards/format_reward": 1.0, "step": 1608 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.5625, "epoch": 0.01624432104997476, "grad_norm": 2.3024705682040643, "kl": 0.07080078125, "learning_rate": 9.993490485104343e-07, "loss": 0.0028, "reward": 2.1316564083099365, "reward_std": 0.02932732366025448, "rewards/accuracy_reward": 0.9379062652587891, "rewards/format_reward": 1.0, "step": 1609 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 384.40625, "epoch": 0.01625441696113074, "grad_norm": 5.905822841973195, "kl": 0.0703125, "learning_rate": 9.993482392969205e-07, "loss": 0.0028, "reward": 2.0562500953674316, "reward_std": 0.04900480806827545, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1610 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.46875, "epoch": 0.016264512872286724, "grad_norm": 2.3001252429079004, "kl": 0.058837890625, "learning_rate": 9.99347429581071e-07, "loss": 0.0024, "reward": 2.1130313873291016, "reward_std": 0.031190175563097, "rewards/accuracy_reward": 0.9192812442779541, "rewards/format_reward": 1.0, "step": 1611 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.625, "epoch": 0.016274608783442706, "grad_norm": 2.7180539327994033, "kl": 0.0625, "learning_rate": 9.993466193628862e-07, "loss": 0.0025, "reward": 2.120112419128418, "reward_std": 0.024371057748794556, "rewards/accuracy_reward": 0.9201124906539917, "rewards/format_reward": 1.0, "step": 1612 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.3125, "epoch": 0.016284704694598688, "grad_norm": 1.7365318699640495, "kl": 0.059326171875, "learning_rate": 9.99345808642367e-07, "loss": 0.0024, "reward": 2.1537814140319824, "reward_std": 0.011253939010202885, "rewards/accuracy_reward": 0.953781247138977, "rewards/format_reward": 1.0, "step": 1613 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.625, "epoch": 0.01629480060575467, "grad_norm": 1.7799114959392421, "kl": 0.049560546875, "learning_rate": 9.993449974195142e-07, "loss": 0.002, "reward": 1.6581562757492065, "reward_std": 0.14468084275722504, "rewards/accuracy_reward": 0.5456562638282776, "rewards/format_reward": 1.0, "step": 1614 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.25, "epoch": 0.016304896516910652, "grad_norm": 2.3059472226129323, "kl": 0.06201171875, "learning_rate": 9.993441856943288e-07, "loss": 0.0025, "reward": 2.1733126640319824, "reward_std": 0.008729016408324242, "rewards/accuracy_reward": 0.973312497138977, "rewards/format_reward": 1.0, "step": 1615 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.53125, "epoch": 0.016314992428066635, "grad_norm": 1.4524438066298533, "kl": 0.06640625, "learning_rate": 9.993433734668113e-07, "loss": 0.0027, "reward": 2.0755937099456787, "reward_std": 0.02301901765167713, "rewards/accuracy_reward": 0.8818438053131104, "rewards/format_reward": 1.0, "step": 1616 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.78125, "epoch": 0.016325088339222613, "grad_norm": 1.7185180360386818, "kl": 0.05908203125, "learning_rate": 9.993425607369628e-07, "loss": 0.0024, "reward": 1.8636562824249268, "reward_std": 0.09922127425670624, "rewards/accuracy_reward": 0.7136561870574951, "rewards/format_reward": 1.0, "step": 1617 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.21875, "epoch": 0.016335184250378595, "grad_norm": 4.251507842596929, "kl": 0.06640625, "learning_rate": 9.99341747504784e-07, "loss": 0.0027, "reward": 2.0398125648498535, "reward_std": 0.038464583456516266, "rewards/accuracy_reward": 0.8460625410079956, "rewards/format_reward": 1.0, "step": 1618 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.34375, "epoch": 0.016345280161534578, "grad_norm": 2.0933805844628965, "kl": 0.05859375, "learning_rate": 9.993409337702757e-07, "loss": 0.0023, "reward": 2.0154190063476562, "reward_std": 0.19427955150604248, "rewards/accuracy_reward": 0.8466687202453613, "rewards/format_reward": 1.0, "step": 1619 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.01635537607269056, "grad_norm": 2.370370645250371, "kl": 0.056884765625, "learning_rate": 9.993401195334388e-07, "loss": 0.0023, "reward": 1.8953750133514404, "reward_std": 0.1807456761598587, "rewards/accuracy_reward": 0.7453750371932983, "rewards/format_reward": 1.0, "step": 1620 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 378.4375, "epoch": 0.016365471983846542, "grad_norm": 3.004575997709921, "kl": 0.06982421875, "learning_rate": 9.993393047942738e-07, "loss": 0.0028, "reward": 2.0765938758850098, "reward_std": 0.05023641884326935, "rewards/accuracy_reward": 0.889093816280365, "rewards/format_reward": 1.0, "step": 1621 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.75, "epoch": 0.016375567895002524, "grad_norm": 5.6580980049940415, "kl": 0.0732421875, "learning_rate": 9.99338489552782e-07, "loss": 0.0029, "reward": 2.080843925476074, "reward_std": 0.018547050654888153, "rewards/accuracy_reward": 0.8808437585830688, "rewards/format_reward": 1.0, "step": 1622 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.375, "epoch": 0.016385663806158506, "grad_norm": 2.0373732523862813, "kl": 0.0634765625, "learning_rate": 9.99337673808964e-07, "loss": 0.0025, "reward": 2.057468891143799, "reward_std": 0.01931338757276535, "rewards/accuracy_reward": 0.8574687242507935, "rewards/format_reward": 1.0, "step": 1623 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.28125, "epoch": 0.01639575971731449, "grad_norm": 4.2495752666678905, "kl": 0.06396484375, "learning_rate": 9.993368575628207e-07, "loss": 0.0026, "reward": 2.1050000190734863, "reward_std": 0.03090023249387741, "rewards/accuracy_reward": 0.9050000905990601, "rewards/format_reward": 1.0, "step": 1624 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.01640585562847047, "grad_norm": 4.152675118224692, "kl": 0.052001953125, "learning_rate": 9.993360408143527e-07, "loss": 0.0021, "reward": 2.1118032932281494, "reward_std": 0.04087933525443077, "rewards/accuracy_reward": 0.9180530905723572, "rewards/format_reward": 1.0, "step": 1625 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.016415951539626453, "grad_norm": 1.941079716474522, "kl": 0.072265625, "learning_rate": 9.99335223563561e-07, "loss": 0.0029, "reward": 2.1656532287597656, "reward_std": 0.01278455276042223, "rewards/accuracy_reward": 0.965653121471405, "rewards/format_reward": 1.0, "step": 1626 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.5, "epoch": 0.01642604745078243, "grad_norm": 3.9126667084160673, "kl": 0.0654296875, "learning_rate": 9.993344058104465e-07, "loss": 0.0026, "reward": 1.9968312978744507, "reward_std": 0.033686257898807526, "rewards/accuracy_reward": 0.7968312501907349, "rewards/format_reward": 1.0, "step": 1627 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.0625, "epoch": 0.016436143361938414, "grad_norm": 1.0963433968045482, "kl": 0.0419921875, "learning_rate": 9.993335875550097e-07, "loss": 0.0017, "reward": 2.039781332015991, "reward_std": 0.006419045850634575, "rewards/accuracy_reward": 0.8397812843322754, "rewards/format_reward": 1.0, "step": 1628 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 424.65625, "epoch": 0.016446239273094396, "grad_norm": 2.142113667154446, "kl": 0.06591796875, "learning_rate": 9.993327687972517e-07, "loss": 0.0026, "reward": 1.8540624380111694, "reward_std": 0.00878607202321291, "rewards/accuracy_reward": 0.7040625214576721, "rewards/format_reward": 1.0, "step": 1629 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 427.03125, "epoch": 0.016456335184250378, "grad_norm": 2.156387700008111, "kl": 0.05615234375, "learning_rate": 9.993319495371733e-07, "loss": 0.0022, "reward": 1.7515312433242798, "reward_std": 0.03474476560950279, "rewards/accuracy_reward": 0.6015312671661377, "rewards/format_reward": 1.0, "step": 1630 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.90625, "epoch": 0.01646643109540636, "grad_norm": 1.0815844485771027, "kl": 0.056884765625, "learning_rate": 9.993311297747754e-07, "loss": 0.0023, "reward": 2.1719436645507812, "reward_std": 0.005440634209662676, "rewards/accuracy_reward": 0.971943736076355, "rewards/format_reward": 1.0, "step": 1631 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.125, "epoch": 0.016476527006562342, "grad_norm": 2.3597136998306714, "kl": 0.06298828125, "learning_rate": 9.993303095100585e-07, "loss": 0.0025, "reward": 2.0508780479431152, "reward_std": 0.043170422315597534, "rewards/accuracy_reward": 0.8571281433105469, "rewards/format_reward": 1.0, "step": 1632 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.71875, "epoch": 0.016486622917718324, "grad_norm": 2.6408754270680403, "kl": 0.057861328125, "learning_rate": 9.993294887430238e-07, "loss": 0.0023, "reward": 1.8181874752044678, "reward_std": 0.02675425261259079, "rewards/accuracy_reward": 0.6681874990463257, "rewards/format_reward": 1.0, "step": 1633 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.03125, "epoch": 0.016496718828874306, "grad_norm": 5.515681481825446, "kl": 0.05810546875, "learning_rate": 9.993286674736719e-07, "loss": 0.0023, "reward": 1.778656244277954, "reward_std": 0.013635846786201, "rewards/accuracy_reward": 0.628656268119812, "rewards/format_reward": 1.0, "step": 1634 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 440.0, "epoch": 0.01650681474003029, "grad_norm": 2.413388587252963, "kl": 0.053466796875, "learning_rate": 9.993278457020035e-07, "loss": 0.0021, "reward": 2.048093795776367, "reward_std": 0.04465322196483612, "rewards/accuracy_reward": 0.8668437004089355, "rewards/format_reward": 1.0, "step": 1635 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 439.84375, "epoch": 0.01651691065118627, "grad_norm": 2.063173787317935, "kl": 0.049072265625, "learning_rate": 9.993270234280197e-07, "loss": 0.002, "reward": 2.129584312438965, "reward_std": 0.057962723076343536, "rewards/accuracy_reward": 0.9483343362808228, "rewards/format_reward": 1.0, "step": 1636 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.875, "epoch": 0.016527006562342253, "grad_norm": 2.031475178467487, "kl": 0.06640625, "learning_rate": 9.993262006517214e-07, "loss": 0.0026, "reward": 2.053187370300293, "reward_std": 0.016413405537605286, "rewards/accuracy_reward": 0.8531874418258667, "rewards/format_reward": 1.0, "step": 1637 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.40625, "epoch": 0.01653710247349823, "grad_norm": 3.7949617287843767, "kl": 0.06298828125, "learning_rate": 9.993253773731093e-07, "loss": 0.0025, "reward": 2.0595312118530273, "reward_std": 0.047408316284418106, "rewards/accuracy_reward": 0.8657811880111694, "rewards/format_reward": 1.0, "step": 1638 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.84375, "epoch": 0.016547198384654214, "grad_norm": 2.3762467119285455, "kl": 0.076171875, "learning_rate": 9.99324553592184e-07, "loss": 0.003, "reward": 2.0590312480926514, "reward_std": 0.02035539411008358, "rewards/accuracy_reward": 0.8590312004089355, "rewards/format_reward": 1.0, "step": 1639 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.8125, "epoch": 0.016557294295810196, "grad_norm": 1.6395448195585414, "kl": 0.05615234375, "learning_rate": 9.993237293089467e-07, "loss": 0.0022, "reward": 1.8027812242507935, "reward_std": 0.030835555866360664, "rewards/accuracy_reward": 0.6590312719345093, "rewards/format_reward": 1.0, "step": 1640 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.75, "epoch": 0.016567390206966178, "grad_norm": 2.543446108075258, "kl": 0.052490234375, "learning_rate": 9.993229045233977e-07, "loss": 0.0021, "reward": 2.158656597137451, "reward_std": 0.041875600814819336, "rewards/accuracy_reward": 0.9711562395095825, "rewards/format_reward": 1.0, "step": 1641 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.1875, "epoch": 0.01657748611812216, "grad_norm": 7.836124477543491, "kl": 0.056884765625, "learning_rate": 9.993220792355385e-07, "loss": 0.0023, "reward": 1.9380625486373901, "reward_std": 0.16419672966003418, "rewards/accuracy_reward": 0.7755624651908875, "rewards/format_reward": 1.0, "step": 1642 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.75, "epoch": 0.016587582029278142, "grad_norm": 2.5373428286787982, "kl": 0.06494140625, "learning_rate": 9.993212534453695e-07, "loss": 0.0026, "reward": 1.9338750839233398, "reward_std": 0.17107801139354706, "rewards/accuracy_reward": 0.7588750123977661, "rewards/format_reward": 1.0, "step": 1643 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.016597677940434125, "grad_norm": 1.6659345137349935, "kl": 0.064453125, "learning_rate": 9.993204271528918e-07, "loss": 0.0026, "reward": 1.917531132698059, "reward_std": 0.10028359293937683, "rewards/accuracy_reward": 0.7675312161445618, "rewards/format_reward": 1.0, "step": 1644 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 408.125, "epoch": 0.016607773851590107, "grad_norm": 1.4416078227984226, "kl": 0.0615234375, "learning_rate": 9.99319600358106e-07, "loss": 0.0025, "reward": 1.8686875104904175, "reward_std": 0.01258583553135395, "rewards/accuracy_reward": 0.7186874747276306, "rewards/format_reward": 1.0, "step": 1645 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.625, "epoch": 0.01661786976274609, "grad_norm": 2.8998864052770705, "kl": 0.06298828125, "learning_rate": 9.993187730610128e-07, "loss": 0.0025, "reward": 2.122968912124634, "reward_std": 0.01583205722272396, "rewards/accuracy_reward": 0.9229687452316284, "rewards/format_reward": 1.0, "step": 1646 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.96875, "epoch": 0.01662796567390207, "grad_norm": 2.4839032460818378, "kl": 0.05419921875, "learning_rate": 9.993179452616134e-07, "loss": 0.0022, "reward": 1.5919374227523804, "reward_std": 0.09560102969408035, "rewards/accuracy_reward": 0.49193745851516724, "rewards/format_reward": 1.0, "step": 1647 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.375, "epoch": 0.016638061585058053, "grad_norm": 6.730385816380033, "kl": 0.06005859375, "learning_rate": 9.993171169599083e-07, "loss": 0.0024, "reward": 1.7159373760223389, "reward_std": 0.02346053533256054, "rewards/accuracy_reward": 0.5659374594688416, "rewards/format_reward": 1.0, "step": 1648 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.3125, "epoch": 0.016648157496214032, "grad_norm": 3.6099008679605062, "kl": 0.076171875, "learning_rate": 9.993162881558986e-07, "loss": 0.0031, "reward": 2.0690219402313232, "reward_std": 0.018736600875854492, "rewards/accuracy_reward": 0.8690218925476074, "rewards/format_reward": 1.0, "step": 1649 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.40625, "epoch": 0.016658253407370014, "grad_norm": 4.559421909327238, "kl": 0.06591796875, "learning_rate": 9.993154588495853e-07, "loss": 0.0026, "reward": 2.0828750133514404, "reward_std": 0.01961648464202881, "rewards/accuracy_reward": 0.8828749656677246, "rewards/format_reward": 1.0, "step": 1650 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 420.78125, "epoch": 0.016668349318525996, "grad_norm": 1.0922621747879382, "kl": 0.060302734375, "learning_rate": 9.993146290409686e-07, "loss": 0.0024, "reward": 1.5770938396453857, "reward_std": 0.005232013761997223, "rewards/accuracy_reward": 0.47709375619888306, "rewards/format_reward": 1.0, "step": 1651 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.75, "epoch": 0.01667844522968198, "grad_norm": 1.507199483505253, "kl": 0.06591796875, "learning_rate": 9.9931379873005e-07, "loss": 0.0026, "reward": 2.1848437786102295, "reward_std": 0.006968313828110695, "rewards/accuracy_reward": 0.9848437905311584, "rewards/format_reward": 1.0, "step": 1652 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.40625, "epoch": 0.01668854114083796, "grad_norm": 1.6079182822824671, "kl": 0.0595703125, "learning_rate": 9.993129679168298e-07, "loss": 0.0024, "reward": 1.8636562824249268, "reward_std": 0.011873016133904457, "rewards/accuracy_reward": 0.7136561870574951, "rewards/format_reward": 1.0, "step": 1653 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.0625, "epoch": 0.016698637051993943, "grad_norm": 2.0485321884016443, "kl": 0.05859375, "learning_rate": 9.993121366013092e-07, "loss": 0.0023, "reward": 2.0181875228881836, "reward_std": 0.01927453838288784, "rewards/accuracy_reward": 0.8181874752044678, "rewards/format_reward": 1.0, "step": 1654 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.15625, "epoch": 0.016708732963149925, "grad_norm": 2.103402096236392, "kl": 0.0703125, "learning_rate": 9.99311304783489e-07, "loss": 0.0028, "reward": 2.1079063415527344, "reward_std": 0.028000403195619583, "rewards/accuracy_reward": 0.9079062342643738, "rewards/format_reward": 1.0, "step": 1655 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 404.625, "epoch": 0.016718828874305907, "grad_norm": 2.5727611698800694, "kl": 0.05859375, "learning_rate": 9.993104724633698e-07, "loss": 0.0024, "reward": 1.8464688062667847, "reward_std": 0.022173305973410606, "rewards/accuracy_reward": 0.696468710899353, "rewards/format_reward": 1.0, "step": 1656 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.375, "epoch": 0.01672892478546189, "grad_norm": 2.3395379843230892, "kl": 0.07421875, "learning_rate": 9.993096396409526e-07, "loss": 0.003, "reward": 2.0390625, "reward_std": 0.02713361755013466, "rewards/accuracy_reward": 0.8390625715255737, "rewards/format_reward": 1.0, "step": 1657 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.65625, "epoch": 0.01673902069661787, "grad_norm": 1.9999889168246834, "kl": 0.061279296875, "learning_rate": 9.993088063162384e-07, "loss": 0.0025, "reward": 1.7817187309265137, "reward_std": 0.01707862690091133, "rewards/accuracy_reward": 0.6317187547683716, "rewards/format_reward": 1.0, "step": 1658 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 404.40625, "epoch": 0.01674911660777385, "grad_norm": 1.5072562190053695, "kl": 0.058349609375, "learning_rate": 9.993079724892274e-07, "loss": 0.0023, "reward": 1.5885937213897705, "reward_std": 0.007240685634315014, "rewards/accuracy_reward": 0.488593727350235, "rewards/format_reward": 1.0, "step": 1659 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 404.71875, "epoch": 0.016759212518929832, "grad_norm": 2.2393585555831446, "kl": 0.062255859375, "learning_rate": 9.993071381599215e-07, "loss": 0.0025, "reward": 1.671375036239624, "reward_std": 0.01283626165241003, "rewards/accuracy_reward": 0.5213750004768372, "rewards/format_reward": 1.0, "step": 1660 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.25, "epoch": 0.016769308430085814, "grad_norm": 2.8094231952526965, "kl": 0.072265625, "learning_rate": 9.993063033283203e-07, "loss": 0.0029, "reward": 2.111593723297119, "reward_std": 0.020293917506933212, "rewards/accuracy_reward": 0.9115936756134033, "rewards/format_reward": 1.0, "step": 1661 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 425.6875, "epoch": 0.016779404341241796, "grad_norm": 1.5817017866318261, "kl": 0.05712890625, "learning_rate": 9.99305467994426e-07, "loss": 0.0023, "reward": 1.8800313472747803, "reward_std": 0.004888439550995827, "rewards/accuracy_reward": 0.7300311923027039, "rewards/format_reward": 1.0, "step": 1662 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.01678950025239778, "grad_norm": 2.8891390879387107, "kl": 0.06884765625, "learning_rate": 9.993046321582382e-07, "loss": 0.0028, "reward": 2.1566874980926514, "reward_std": 0.038382790982723236, "rewards/accuracy_reward": 0.9629374742507935, "rewards/format_reward": 1.0, "step": 1663 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 400.53125, "epoch": 0.01679959616355376, "grad_norm": 1.6917699659399525, "kl": 0.06005859375, "learning_rate": 9.993037958197583e-07, "loss": 0.0024, "reward": 1.8384687900543213, "reward_std": 0.011049559339880943, "rewards/accuracy_reward": 0.6884688138961792, "rewards/format_reward": 1.0, "step": 1664 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.75, "epoch": 0.016809692074709743, "grad_norm": 4.003130110558203, "kl": 0.07275390625, "learning_rate": 9.993029589789874e-07, "loss": 0.0029, "reward": 2.0147812366485596, "reward_std": 0.03966920077800751, "rewards/accuracy_reward": 0.8147812485694885, "rewards/format_reward": 1.0, "step": 1665 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.016819787985865725, "grad_norm": 2.605221944181148, "kl": 0.0703125, "learning_rate": 9.993021216359257e-07, "loss": 0.0028, "reward": 2.0672500133514404, "reward_std": 0.012324647046625614, "rewards/accuracy_reward": 0.8672499656677246, "rewards/format_reward": 1.0, "step": 1666 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.8125, "epoch": 0.016829883897021707, "grad_norm": 2.535231872988375, "kl": 0.05615234375, "learning_rate": 9.993012837905746e-07, "loss": 0.0023, "reward": 1.8408749103546143, "reward_std": 0.012737829238176346, "rewards/accuracy_reward": 0.6908750534057617, "rewards/format_reward": 1.0, "step": 1667 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.78125, "epoch": 0.01683997980817769, "grad_norm": 2.1020104958196155, "kl": 0.05859375, "learning_rate": 9.993004454429346e-07, "loss": 0.0023, "reward": 1.723656415939331, "reward_std": 0.006675533950328827, "rewards/accuracy_reward": 0.5736563205718994, "rewards/format_reward": 1.0, "step": 1668 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.84375, "epoch": 0.01685007571933367, "grad_norm": 2.9140626124755142, "kl": 0.07080078125, "learning_rate": 9.992996065930069e-07, "loss": 0.0028, "reward": 2.119874954223633, "reward_std": 0.023570841178297997, "rewards/accuracy_reward": 0.9198750257492065, "rewards/format_reward": 1.0, "step": 1669 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.78125, "epoch": 0.01686017163048965, "grad_norm": 1.46072013811003, "kl": 0.05712890625, "learning_rate": 9.99298767240792e-07, "loss": 0.0023, "reward": 1.8819687366485596, "reward_std": 0.010592368431389332, "rewards/accuracy_reward": 0.7319687604904175, "rewards/format_reward": 1.0, "step": 1670 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.90625, "epoch": 0.016870267541645632, "grad_norm": 11.854154824279822, "kl": 0.05810546875, "learning_rate": 9.992979273862908e-07, "loss": 0.0023, "reward": 1.9728749990463257, "reward_std": 0.19001564383506775, "rewards/accuracy_reward": 0.7916250228881836, "rewards/format_reward": 1.0, "step": 1671 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.53125, "epoch": 0.016880363452801615, "grad_norm": 3.210440905104275, "kl": 0.08203125, "learning_rate": 9.992970870295043e-07, "loss": 0.0033, "reward": 2.118593692779541, "reward_std": 0.022627785801887512, "rewards/accuracy_reward": 0.9185937643051147, "rewards/format_reward": 1.0, "step": 1672 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 387.96875, "epoch": 0.016890459363957597, "grad_norm": 2.0001117920149314, "kl": 0.061279296875, "learning_rate": 9.99296246170433e-07, "loss": 0.0025, "reward": 1.578281283378601, "reward_std": 0.007933244109153748, "rewards/accuracy_reward": 0.4782812297344208, "rewards/format_reward": 1.0, "step": 1673 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.01690055527511358, "grad_norm": 9.962796551845905, "kl": 0.0576171875, "learning_rate": 9.992954048090782e-07, "loss": 0.0023, "reward": 1.942750096321106, "reward_std": 0.15299050509929657, "rewards/accuracy_reward": 0.7802500128746033, "rewards/format_reward": 1.0, "step": 1674 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.59375, "epoch": 0.01691065118626956, "grad_norm": 2.168360758004406, "kl": 0.0537109375, "learning_rate": 9.992945629454405e-07, "loss": 0.0022, "reward": 2.1412501335144043, "reward_std": 0.008627757430076599, "rewards/accuracy_reward": 0.9412499666213989, "rewards/format_reward": 1.0, "step": 1675 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.53125, "epoch": 0.016920747097425543, "grad_norm": 10.187785278286652, "kl": 0.0517578125, "learning_rate": 9.992937205795209e-07, "loss": 0.0021, "reward": 2.108875274658203, "reward_std": 0.017228666692972183, "rewards/accuracy_reward": 0.908875048160553, "rewards/format_reward": 1.0, "step": 1676 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.75, "epoch": 0.016930843008581525, "grad_norm": 2.5988095501745345, "kl": 0.06005859375, "learning_rate": 9.9929287771132e-07, "loss": 0.0024, "reward": 1.9243125915527344, "reward_std": 0.15735867619514465, "rewards/accuracy_reward": 0.7368125319480896, "rewards/format_reward": 1.0, "step": 1677 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.016940938919737508, "grad_norm": 4.451358451171593, "kl": 0.0712890625, "learning_rate": 9.992920343408389e-07, "loss": 0.0028, "reward": 2.097843647003174, "reward_std": 0.02831408381462097, "rewards/accuracy_reward": 0.8978437185287476, "rewards/format_reward": 1.0, "step": 1678 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.01695103483089349, "grad_norm": 2.0270766399644624, "kl": 0.06005859375, "learning_rate": 9.992911904680784e-07, "loss": 0.0024, "reward": 2.1186251640319824, "reward_std": 0.029537620022892952, "rewards/accuracy_reward": 0.924875020980835, "rewards/format_reward": 1.0, "step": 1679 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 413.53125, "epoch": 0.01696113074204947, "grad_norm": 1.707232358867, "kl": 0.052001953125, "learning_rate": 9.99290346093039e-07, "loss": 0.0021, "reward": 1.7425000667572021, "reward_std": 0.16060368716716766, "rewards/accuracy_reward": 0.6112500429153442, "rewards/format_reward": 1.0, "step": 1680 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.875, "epoch": 0.01697122665320545, "grad_norm": 0.11385690583251892, "kl": 0.05908203125, "learning_rate": 9.992895012157222e-07, "loss": 0.0024, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1681 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.0625, "epoch": 0.016981322564361433, "grad_norm": 3.725559857545036, "kl": 0.0576171875, "learning_rate": 9.992886558361282e-07, "loss": 0.0023, "reward": 2.1487812995910645, "reward_std": 0.024688052013516426, "rewards/accuracy_reward": 0.9550312757492065, "rewards/format_reward": 1.0, "step": 1682 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.5, "epoch": 0.016991418475517415, "grad_norm": 2.3367879515018695, "kl": 0.05859375, "learning_rate": 9.992878099542583e-07, "loss": 0.0023, "reward": 2.1079063415527344, "reward_std": 0.022734977304935455, "rewards/accuracy_reward": 0.9141562581062317, "rewards/format_reward": 1.0, "step": 1683 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.9375, "epoch": 0.017001514386673397, "grad_norm": 2.014434270811577, "kl": 0.0615234375, "learning_rate": 9.99286963570113e-07, "loss": 0.0025, "reward": 1.7871874570846558, "reward_std": 0.022269122302532196, "rewards/accuracy_reward": 0.6371874809265137, "rewards/format_reward": 1.0, "step": 1684 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.21875, "epoch": 0.01701161029782938, "grad_norm": 1.7039308929021086, "kl": 0.061279296875, "learning_rate": 9.992861166836934e-07, "loss": 0.0024, "reward": 1.8817813396453857, "reward_std": 0.09308516979217529, "rewards/accuracy_reward": 0.7317812442779541, "rewards/format_reward": 1.0, "step": 1685 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 404.84375, "epoch": 0.01702170620898536, "grad_norm": 2.2957623366136892, "kl": 0.07080078125, "learning_rate": 9.992852692950005e-07, "loss": 0.0028, "reward": 1.8313125371932983, "reward_std": 0.04633940011262894, "rewards/accuracy_reward": 0.6938124895095825, "rewards/format_reward": 1.0, "step": 1686 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.90625, "epoch": 0.017031802120141343, "grad_norm": 2.0567424771730236, "kl": 0.06494140625, "learning_rate": 9.992844214040348e-07, "loss": 0.0026, "reward": 2.1828436851501465, "reward_std": 0.02215898036956787, "rewards/accuracy_reward": 0.9890937209129333, "rewards/format_reward": 1.0, "step": 1687 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.9375, "epoch": 0.017041898031297326, "grad_norm": 5.173268275755808, "kl": 0.06591796875, "learning_rate": 9.99283573010797e-07, "loss": 0.0026, "reward": 2.0855624675750732, "reward_std": 0.014898855239152908, "rewards/accuracy_reward": 0.8855624794960022, "rewards/format_reward": 1.0, "step": 1688 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.03125, "epoch": 0.017051993942453308, "grad_norm": 3.9948736397695885, "kl": 0.0732421875, "learning_rate": 9.992827241152885e-07, "loss": 0.0029, "reward": 2.086343765258789, "reward_std": 0.03670951724052429, "rewards/accuracy_reward": 0.886343777179718, "rewards/format_reward": 1.0, "step": 1689 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 412.34375, "epoch": 0.01706208985360929, "grad_norm": 3.462932615168974, "kl": 0.0634765625, "learning_rate": 9.9928187471751e-07, "loss": 0.0025, "reward": 1.6992813348770142, "reward_std": 0.021697543561458588, "rewards/accuracy_reward": 0.5492812395095825, "rewards/format_reward": 1.0, "step": 1690 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.15625, "epoch": 0.01707218576476527, "grad_norm": 2.2130833253914317, "kl": 0.0703125, "learning_rate": 9.992810248174623e-07, "loss": 0.0028, "reward": 2.1114063262939453, "reward_std": 0.021566953510046005, "rewards/accuracy_reward": 0.9114062190055847, "rewards/format_reward": 1.0, "step": 1691 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.21875, "epoch": 0.01708228167592125, "grad_norm": 1.5890491530004778, "kl": 0.0546875, "learning_rate": 9.99280174415146e-07, "loss": 0.0022, "reward": 1.848343849182129, "reward_std": 0.009650986641645432, "rewards/accuracy_reward": 0.6983437538146973, "rewards/format_reward": 1.0, "step": 1692 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.96875, "epoch": 0.017092377587077233, "grad_norm": 1.986410584823102, "kl": 0.06689453125, "learning_rate": 9.992793235105624e-07, "loss": 0.0027, "reward": 2.1488752365112305, "reward_std": 0.013423864729702473, "rewards/accuracy_reward": 0.9488749504089355, "rewards/format_reward": 1.0, "step": 1693 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.0, "epoch": 0.017102473498233215, "grad_norm": 2.058660826843954, "kl": 0.0615234375, "learning_rate": 9.992784721037117e-07, "loss": 0.0025, "reward": 2.093937635421753, "reward_std": 0.012747121043503284, "rewards/accuracy_reward": 0.8939374685287476, "rewards/format_reward": 1.0, "step": 1694 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.125, "epoch": 0.017112569409389197, "grad_norm": 2.8795460485828315, "kl": 0.060546875, "learning_rate": 9.992776201945957e-07, "loss": 0.0024, "reward": 2.003812551498413, "reward_std": 0.021909039467573166, "rewards/accuracy_reward": 0.8038125038146973, "rewards/format_reward": 1.0, "step": 1695 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.84375, "epoch": 0.01712266532054518, "grad_norm": 1.7124275922351475, "kl": 0.0634765625, "learning_rate": 9.992767677832143e-07, "loss": 0.0025, "reward": 2.1766250133514404, "reward_std": 0.009887752123177052, "rewards/accuracy_reward": 0.9766249656677246, "rewards/format_reward": 1.0, "step": 1696 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 425.09375, "epoch": 0.01713276123170116, "grad_norm": 3.760995768652907, "kl": 0.07177734375, "learning_rate": 9.99275914869569e-07, "loss": 0.0029, "reward": 1.9949061870574951, "reward_std": 0.019002307206392288, "rewards/accuracy_reward": 0.7949062585830688, "rewards/format_reward": 1.0, "step": 1697 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.375, "epoch": 0.017142857142857144, "grad_norm": 3.760112913707929, "kl": 0.083984375, "learning_rate": 9.992750614536604e-07, "loss": 0.0033, "reward": 2.133625030517578, "reward_std": 0.01724923402070999, "rewards/accuracy_reward": 0.9336249828338623, "rewards/format_reward": 1.0, "step": 1698 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.40625, "epoch": 0.017152953054013126, "grad_norm": 1.8831937138993786, "kl": 0.0634765625, "learning_rate": 9.992742075354894e-07, "loss": 0.0025, "reward": 2.1789374351501465, "reward_std": 0.014230271801352501, "rewards/accuracy_reward": 0.9789374470710754, "rewards/format_reward": 1.0, "step": 1699 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 437.84375, "epoch": 0.017163048965169108, "grad_norm": 5.013657621081171, "kl": 0.06689453125, "learning_rate": 9.99273353115057e-07, "loss": 0.0027, "reward": 2.0132498741149902, "reward_std": 0.04625207185745239, "rewards/accuracy_reward": 0.8319998979568481, "rewards/format_reward": 1.0, "step": 1700 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 444.28125, "epoch": 0.017173144876325087, "grad_norm": 1.435643289724249, "kl": 0.056640625, "learning_rate": 9.992724981923638e-07, "loss": 0.0023, "reward": 2.116374969482422, "reward_std": 0.04236429184675217, "rewards/accuracy_reward": 0.9351249933242798, "rewards/format_reward": 1.0, "step": 1701 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 442.5, "epoch": 0.01718324078748107, "grad_norm": 5.354470597557365, "kl": 0.0751953125, "learning_rate": 9.992716427674107e-07, "loss": 0.003, "reward": 2.0725936889648438, "reward_std": 0.01975807175040245, "rewards/accuracy_reward": 0.8725938200950623, "rewards/format_reward": 1.0, "step": 1702 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.71875, "epoch": 0.01719333669863705, "grad_norm": 2.752416071671429, "kl": 0.0712890625, "learning_rate": 9.992707868401989e-07, "loss": 0.0029, "reward": 2.0549063682556152, "reward_std": 0.011011520400643349, "rewards/accuracy_reward": 0.8549062013626099, "rewards/format_reward": 1.0, "step": 1703 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.34375, "epoch": 0.017203432609793033, "grad_norm": 1.9064487874293092, "kl": 0.0869140625, "learning_rate": 9.99269930410729e-07, "loss": 0.0035, "reward": 2.1449687480926514, "reward_std": 0.011452035047113895, "rewards/accuracy_reward": 0.9449687004089355, "rewards/format_reward": 1.0, "step": 1704 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 436.8125, "epoch": 0.017213528520949015, "grad_norm": 2.904169912689426, "kl": 0.0654296875, "learning_rate": 9.992690734790016e-07, "loss": 0.0026, "reward": 1.7362812757492065, "reward_std": 0.02482963725924492, "rewards/accuracy_reward": 0.5925313234329224, "rewards/format_reward": 1.0, "step": 1705 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.375, "epoch": 0.017223624432104997, "grad_norm": 3.326048275713206, "kl": 0.06884765625, "learning_rate": 9.99268216045018e-07, "loss": 0.0028, "reward": 2.1397814750671387, "reward_std": 0.024728497490286827, "rewards/accuracy_reward": 0.9397812485694885, "rewards/format_reward": 1.0, "step": 1706 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.40625, "epoch": 0.01723372034326098, "grad_norm": 2.232216164047962, "kl": 0.06982421875, "learning_rate": 9.992673581087788e-07, "loss": 0.0028, "reward": 2.121000051498413, "reward_std": 0.0342751182615757, "rewards/accuracy_reward": 0.9272499680519104, "rewards/format_reward": 1.0, "step": 1707 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.5, "epoch": 0.017243816254416962, "grad_norm": 2.215967731036761, "kl": 0.072265625, "learning_rate": 9.992664996702852e-07, "loss": 0.0029, "reward": 2.077937602996826, "reward_std": 0.011143390089273453, "rewards/accuracy_reward": 0.8779374957084656, "rewards/format_reward": 1.0, "step": 1708 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 433.375, "epoch": 0.017253912165572944, "grad_norm": 1.0712019380322413, "kl": 0.07080078125, "learning_rate": 9.992656407295374e-07, "loss": 0.0028, "reward": 2.1753125190734863, "reward_std": 0.003529850160703063, "rewards/accuracy_reward": 0.9753124713897705, "rewards/format_reward": 1.0, "step": 1709 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 433.8125, "epoch": 0.017264008076728926, "grad_norm": 2.135559584040181, "kl": 0.06005859375, "learning_rate": 9.992647812865372e-07, "loss": 0.0024, "reward": 1.7043437957763672, "reward_std": 0.16961507499217987, "rewards/accuracy_reward": 0.5730937719345093, "rewards/format_reward": 1.0, "step": 1710 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 438.21875, "epoch": 0.01727410398788491, "grad_norm": 2.7220061651934038, "kl": 0.0556640625, "learning_rate": 9.992639213412846e-07, "loss": 0.0022, "reward": 1.994093894958496, "reward_std": 0.03136289492249489, "rewards/accuracy_reward": 0.8003438115119934, "rewards/format_reward": 1.0, "step": 1711 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.0, "epoch": 0.017284199899040887, "grad_norm": 3.155115669192789, "kl": 0.07177734375, "learning_rate": 9.992630608937809e-07, "loss": 0.0029, "reward": 2.122593879699707, "reward_std": 0.11900870501995087, "rewards/accuracy_reward": 0.9288437366485596, "rewards/format_reward": 1.0, "step": 1712 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.46875, "epoch": 0.01729429581019687, "grad_norm": 3.436080959816147, "kl": 0.076171875, "learning_rate": 9.99262199944027e-07, "loss": 0.0031, "reward": 2.0186874866485596, "reward_std": 0.024235941469669342, "rewards/accuracy_reward": 0.8249375224113464, "rewards/format_reward": 1.0, "step": 1713 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.5625, "epoch": 0.01730439172135285, "grad_norm": 1.088443322842083, "kl": 0.0673828125, "learning_rate": 9.992613384920235e-07, "loss": 0.0027, "reward": 2.0729687213897705, "reward_std": 0.003216691082343459, "rewards/accuracy_reward": 0.8729687333106995, "rewards/format_reward": 1.0, "step": 1714 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 426.875, "epoch": 0.017314487632508833, "grad_norm": 1.7769149441435461, "kl": 0.06396484375, "learning_rate": 9.992604765377714e-07, "loss": 0.0026, "reward": 1.8037500381469727, "reward_std": 0.026613356545567513, "rewards/accuracy_reward": 0.653749942779541, "rewards/format_reward": 1.0, "step": 1715 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 423.09375, "epoch": 0.017324583543664816, "grad_norm": 1.847237017400514, "kl": 0.051513671875, "learning_rate": 9.992596140812715e-07, "loss": 0.0021, "reward": 1.5677813291549683, "reward_std": 0.026062332093715668, "rewards/accuracy_reward": 0.4802812337875366, "rewards/format_reward": 1.0, "step": 1716 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.90625, "epoch": 0.017334679454820798, "grad_norm": 2.370340417658552, "kl": 0.07568359375, "learning_rate": 9.99258751122525e-07, "loss": 0.003, "reward": 2.082750082015991, "reward_std": 0.01484627090394497, "rewards/accuracy_reward": 0.8827500343322754, "rewards/format_reward": 1.0, "step": 1717 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.0, "epoch": 0.01734477536597678, "grad_norm": 2.198822537183441, "kl": 0.0615234375, "learning_rate": 9.992578876615322e-07, "loss": 0.0025, "reward": 1.8330626487731934, "reward_std": 0.053413115441799164, "rewards/accuracy_reward": 0.6830625534057617, "rewards/format_reward": 1.0, "step": 1718 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.6875, "epoch": 0.017354871277132762, "grad_norm": 2.70407216067506, "kl": 0.07421875, "learning_rate": 9.992570236982948e-07, "loss": 0.003, "reward": 2.035343885421753, "reward_std": 0.017100617289543152, "rewards/accuracy_reward": 0.8353437185287476, "rewards/format_reward": 1.0, "step": 1719 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.625, "epoch": 0.017364967188288744, "grad_norm": 9.228469851576047, "kl": 0.054931640625, "learning_rate": 9.992561592328127e-07, "loss": 0.0022, "reward": 1.9888124465942383, "reward_std": 0.1693473756313324, "rewards/accuracy_reward": 0.807562530040741, "rewards/format_reward": 1.0, "step": 1720 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.65625, "epoch": 0.017375063099444726, "grad_norm": 2.2737129994403715, "kl": 0.06884765625, "learning_rate": 9.992552942650874e-07, "loss": 0.0028, "reward": 1.8528437614440918, "reward_std": 0.010165160521864891, "rewards/accuracy_reward": 0.7028437256813049, "rewards/format_reward": 1.0, "step": 1721 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.15625, "epoch": 0.017385159010600705, "grad_norm": 14.027013283122454, "kl": 0.07275390625, "learning_rate": 9.992544287951195e-07, "loss": 0.0029, "reward": 2.1022186279296875, "reward_std": 0.01787385903298855, "rewards/accuracy_reward": 0.902218759059906, "rewards/format_reward": 1.0, "step": 1722 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.53125, "epoch": 0.017395254921756687, "grad_norm": 1.1857102379869182, "kl": 0.058837890625, "learning_rate": 9.9925356282291e-07, "loss": 0.0024, "reward": 2.1923751831054688, "reward_std": 0.005283051170408726, "rewards/accuracy_reward": 0.9923750162124634, "rewards/format_reward": 1.0, "step": 1723 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.0, "epoch": 0.01740535083291267, "grad_norm": 3.337183712749313, "kl": 0.07080078125, "learning_rate": 9.992526963484599e-07, "loss": 0.0028, "reward": 1.942500114440918, "reward_std": 0.1673094630241394, "rewards/accuracy_reward": 0.7737499475479126, "rewards/format_reward": 1.0, "step": 1724 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.78125, "epoch": 0.01741544674406865, "grad_norm": 1.867741671108934, "kl": 0.06982421875, "learning_rate": 9.992518293717697e-07, "loss": 0.0028, "reward": 2.130406379699707, "reward_std": 0.013370256870985031, "rewards/accuracy_reward": 0.9304062128067017, "rewards/format_reward": 1.0, "step": 1725 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 404.75, "epoch": 0.017425542655224634, "grad_norm": 3.0978216797858815, "kl": 0.055419921875, "learning_rate": 9.992509618928405e-07, "loss": 0.0022, "reward": 1.7754374742507935, "reward_std": 0.15112407505512238, "rewards/accuracy_reward": 0.6379374861717224, "rewards/format_reward": 1.0, "step": 1726 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.3125, "epoch": 0.017435638566380616, "grad_norm": 5.6096556686376475, "kl": 0.0712890625, "learning_rate": 9.992500939116732e-07, "loss": 0.0029, "reward": 1.8020000457763672, "reward_std": 0.023706182837486267, "rewards/accuracy_reward": 0.6520000696182251, "rewards/format_reward": 1.0, "step": 1727 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 400.25, "epoch": 0.017445734477536598, "grad_norm": 1.426960297675008, "kl": 0.0576171875, "learning_rate": 9.992492254282688e-07, "loss": 0.0023, "reward": 1.4434375762939453, "reward_std": 0.03010759875178337, "rewards/accuracy_reward": 0.35593751072883606, "rewards/format_reward": 1.0, "step": 1728 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.5625, "epoch": 0.01745583038869258, "grad_norm": 2.8104879830855505, "kl": 0.07080078125, "learning_rate": 9.992483564426276e-07, "loss": 0.0028, "reward": 2.160343647003174, "reward_std": 0.01273598987609148, "rewards/accuracy_reward": 0.9603437781333923, "rewards/format_reward": 1.0, "step": 1729 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.5625, "epoch": 0.017465926299848562, "grad_norm": 1.054128940277247, "kl": 0.056884765625, "learning_rate": 9.992474869547512e-07, "loss": 0.0023, "reward": 2.191218852996826, "reward_std": 0.004930661525577307, "rewards/accuracy_reward": 0.9912187457084656, "rewards/format_reward": 1.0, "step": 1730 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.1875, "epoch": 0.017476022211004544, "grad_norm": 2.0294979960141384, "kl": 0.06396484375, "learning_rate": 9.9924661696464e-07, "loss": 0.0025, "reward": 2.112687349319458, "reward_std": 0.02023140713572502, "rewards/accuracy_reward": 0.9126875400543213, "rewards/format_reward": 1.0, "step": 1731 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.96875, "epoch": 0.017486118122160527, "grad_norm": 2.1308321041668488, "kl": 0.060546875, "learning_rate": 9.992457464722952e-07, "loss": 0.0024, "reward": 2.1198437213897705, "reward_std": 0.017779693007469177, "rewards/accuracy_reward": 0.9198437929153442, "rewards/format_reward": 1.0, "step": 1732 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.09375, "epoch": 0.017496214033316505, "grad_norm": 13.370238903989996, "kl": 0.060546875, "learning_rate": 9.992448754777174e-07, "loss": 0.0024, "reward": 2.116281509399414, "reward_std": 0.12319228053092957, "rewards/accuracy_reward": 0.922531247138977, "rewards/format_reward": 1.0, "step": 1733 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.125, "epoch": 0.017506309944472487, "grad_norm": 5.436391458443802, "kl": 0.07421875, "learning_rate": 9.992440039809075e-07, "loss": 0.003, "reward": 2.067718744277954, "reward_std": 0.019320469349622726, "rewards/accuracy_reward": 0.8677188158035278, "rewards/format_reward": 1.0, "step": 1734 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.84375, "epoch": 0.01751640585562847, "grad_norm": 4.925788689969648, "kl": 0.0625, "learning_rate": 9.992431319818666e-07, "loss": 0.0025, "reward": 1.9899375438690186, "reward_std": 0.178352952003479, "rewards/accuracy_reward": 0.8086875081062317, "rewards/format_reward": 1.0, "step": 1735 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.46875, "epoch": 0.017526501766784452, "grad_norm": 2.474417976874498, "kl": 0.0712890625, "learning_rate": 9.992422594805953e-07, "loss": 0.0029, "reward": 1.991187334060669, "reward_std": 0.035648077726364136, "rewards/accuracy_reward": 0.7974375486373901, "rewards/format_reward": 1.0, "step": 1736 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.017536597677940434, "grad_norm": 2.03606993660793, "kl": 0.0634765625, "learning_rate": 9.992413864770946e-07, "loss": 0.0025, "reward": 2.158468723297119, "reward_std": 0.012161046266555786, "rewards/accuracy_reward": 0.9584687352180481, "rewards/format_reward": 1.0, "step": 1737 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.3125, "epoch": 0.017546693589096416, "grad_norm": 3.2156086424639096, "kl": 0.0732421875, "learning_rate": 9.992405129713654e-07, "loss": 0.0029, "reward": 2.0839688777923584, "reward_std": 0.016970116645097733, "rewards/accuracy_reward": 0.8839687705039978, "rewards/format_reward": 1.0, "step": 1738 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.71875, "epoch": 0.017556789500252398, "grad_norm": 1.9763407128419035, "kl": 0.07421875, "learning_rate": 9.992396389634086e-07, "loss": 0.003, "reward": 1.807687520980835, "reward_std": 0.012752389535307884, "rewards/accuracy_reward": 0.6576875448226929, "rewards/format_reward": 1.0, "step": 1739 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.09375, "epoch": 0.01756688541140838, "grad_norm": 2.7039137812991445, "kl": 0.0615234375, "learning_rate": 9.99238764453225e-07, "loss": 0.0025, "reward": 1.9864375591278076, "reward_std": 0.17148743569850922, "rewards/accuracy_reward": 0.805187463760376, "rewards/format_reward": 1.0, "step": 1740 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.84375, "epoch": 0.017576981322564363, "grad_norm": 1.4980125649666243, "kl": 0.06396484375, "learning_rate": 9.992378894408157e-07, "loss": 0.0026, "reward": 1.8237812519073486, "reward_std": 0.00946985837072134, "rewards/accuracy_reward": 0.6737812757492065, "rewards/format_reward": 1.0, "step": 1741 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.09375, "epoch": 0.017587077233720345, "grad_norm": 2.861011637742052, "kl": 0.06640625, "learning_rate": 9.992370139261812e-07, "loss": 0.0027, "reward": 2.0860939025878906, "reward_std": 0.12036143988370895, "rewards/accuracy_reward": 0.8923437595367432, "rewards/format_reward": 1.0, "step": 1742 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.28125, "epoch": 0.017597173144876323, "grad_norm": 3.003840765857226, "kl": 0.0712890625, "learning_rate": 9.992361379093227e-07, "loss": 0.0029, "reward": 2.122781276702881, "reward_std": 0.019754163920879364, "rewards/accuracy_reward": 0.922781229019165, "rewards/format_reward": 1.0, "step": 1743 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.25, "epoch": 0.017607269056032306, "grad_norm": 2.333969392317511, "kl": 0.057373046875, "learning_rate": 9.992352613902408e-07, "loss": 0.0023, "reward": 2.0615625381469727, "reward_std": 0.1659175604581833, "rewards/accuracy_reward": 0.880312442779541, "rewards/format_reward": 1.0, "step": 1744 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.125, "epoch": 0.017617364967188288, "grad_norm": 2.489255805183759, "kl": 0.0703125, "learning_rate": 9.992343843689367e-07, "loss": 0.0028, "reward": 2.070812463760376, "reward_std": 0.015326504595577717, "rewards/accuracy_reward": 0.8708125352859497, "rewards/format_reward": 1.0, "step": 1745 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.75, "epoch": 0.01762746087834427, "grad_norm": 1.5598953799876334, "kl": 0.06298828125, "learning_rate": 9.992335068454112e-07, "loss": 0.0025, "reward": 2.0033750534057617, "reward_std": 0.006108523346483707, "rewards/accuracy_reward": 0.8033749461174011, "rewards/format_reward": 1.0, "step": 1746 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.017637556789500252, "grad_norm": 3.248976679569328, "kl": 0.060791015625, "learning_rate": 9.992326288196648e-07, "loss": 0.0024, "reward": 2.0215625762939453, "reward_std": 0.17421936988830566, "rewards/accuracy_reward": 0.8465625047683716, "rewards/format_reward": 1.0, "step": 1747 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.53125, "epoch": 0.017647652700656234, "grad_norm": 1.2615753830774734, "kl": 0.0458984375, "learning_rate": 9.99231750291699e-07, "loss": 0.0018, "reward": 2.1624999046325684, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1748 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.59375, "epoch": 0.017657748611812216, "grad_norm": 2.7056349848697985, "kl": 0.072265625, "learning_rate": 9.992308712615142e-07, "loss": 0.0029, "reward": 2.069218635559082, "reward_std": 0.01869972050189972, "rewards/accuracy_reward": 0.8692187070846558, "rewards/format_reward": 1.0, "step": 1749 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.34375, "epoch": 0.0176678445229682, "grad_norm": 2.796163483307322, "kl": 0.0810546875, "learning_rate": 9.992299917291115e-07, "loss": 0.0033, "reward": 2.0602188110351562, "reward_std": 0.028442231938242912, "rewards/accuracy_reward": 0.8664687871932983, "rewards/format_reward": 1.0, "step": 1750 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.15625, "epoch": 0.01767794043412418, "grad_norm": 1.3611251119187335, "kl": 0.04541015625, "learning_rate": 9.99229111694492e-07, "loss": 0.0018, "reward": 2.1796875, "reward_std": 0.005150848533958197, "rewards/accuracy_reward": 0.979687511920929, "rewards/format_reward": 1.0, "step": 1751 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.09375, "epoch": 0.017688036345280163, "grad_norm": 2.9498687875730023, "kl": 0.076171875, "learning_rate": 9.99228231157656e-07, "loss": 0.003, "reward": 2.0394062995910645, "reward_std": 0.017041802406311035, "rewards/accuracy_reward": 0.8394062519073486, "rewards/format_reward": 1.0, "step": 1752 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.4375, "epoch": 0.017698132256436145, "grad_norm": 2.0999036133080358, "kl": 0.068359375, "learning_rate": 9.99227350118605e-07, "loss": 0.0027, "reward": 2.131593704223633, "reward_std": 0.009935097768902779, "rewards/accuracy_reward": 0.9315937757492065, "rewards/format_reward": 1.0, "step": 1753 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.1875, "epoch": 0.017708228167592124, "grad_norm": 3.2315738902633027, "kl": 0.076171875, "learning_rate": 9.992264685773393e-07, "loss": 0.0031, "reward": 2.0274062156677246, "reward_std": 0.02386646158993244, "rewards/accuracy_reward": 0.8274062275886536, "rewards/format_reward": 1.0, "step": 1754 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 0.017718324078748106, "grad_norm": 0.0971658677832561, "kl": 0.04931640625, "learning_rate": 9.992255865338603e-07, "loss": 0.002, "reward": 2.185999870300293, "reward_std": 0.0, "rewards/accuracy_reward": 0.9860000014305115, "rewards/format_reward": 1.0, "step": 1755 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.96875, "epoch": 0.017728419989904088, "grad_norm": 2.287999526804664, "kl": 0.064453125, "learning_rate": 9.992247039881688e-07, "loss": 0.0026, "reward": 2.117781162261963, "reward_std": 0.012342432513833046, "rewards/accuracy_reward": 0.9177812337875366, "rewards/format_reward": 1.0, "step": 1756 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.01773851590106007, "grad_norm": 2.1729309353730395, "kl": 0.068359375, "learning_rate": 9.992238209402653e-07, "loss": 0.0027, "reward": 2.087625026702881, "reward_std": 0.012761006131768227, "rewards/accuracy_reward": 0.887624979019165, "rewards/format_reward": 1.0, "step": 1757 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 422.03125, "epoch": 0.017748611812216052, "grad_norm": 3.003274884464096, "kl": 0.05224609375, "learning_rate": 9.99222937390151e-07, "loss": 0.0021, "reward": 1.4850938320159912, "reward_std": 0.021802503615617752, "rewards/accuracy_reward": 0.3850937485694885, "rewards/format_reward": 1.0, "step": 1758 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 422.9375, "epoch": 0.017758707723372034, "grad_norm": 1.7860666696435255, "kl": 0.061767578125, "learning_rate": 9.992220533378268e-07, "loss": 0.0025, "reward": 1.7760000228881836, "reward_std": 0.025576133280992508, "rewards/accuracy_reward": 0.6322499513626099, "rewards/format_reward": 1.0, "step": 1759 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.625, "epoch": 0.017768803634528017, "grad_norm": 1.8848180992969528, "kl": 0.05615234375, "learning_rate": 9.992211687832937e-07, "loss": 0.0022, "reward": 2.1564688682556152, "reward_std": 0.023278025910258293, "rewards/accuracy_reward": 0.9627187252044678, "rewards/format_reward": 1.0, "step": 1760 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.34375, "epoch": 0.017778899545684, "grad_norm": 6.032631553484728, "kl": 0.07080078125, "learning_rate": 9.992202837265521e-07, "loss": 0.0028, "reward": 2.082531452178955, "reward_std": 0.0180988647043705, "rewards/accuracy_reward": 0.8825312852859497, "rewards/format_reward": 1.0, "step": 1761 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.8125, "epoch": 0.01778899545683998, "grad_norm": 5.347698765326524, "kl": 0.0673828125, "learning_rate": 9.992193981676034e-07, "loss": 0.0027, "reward": 2.0778439044952393, "reward_std": 0.01546696200966835, "rewards/accuracy_reward": 0.8778437376022339, "rewards/format_reward": 1.0, "step": 1762 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.59375, "epoch": 0.017799091367995963, "grad_norm": 2.420953713374499, "kl": 0.0703125, "learning_rate": 9.992185121064482e-07, "loss": 0.0028, "reward": 2.1048123836517334, "reward_std": 0.008570596575737, "rewards/accuracy_reward": 0.9048124551773071, "rewards/format_reward": 1.0, "step": 1763 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.125, "epoch": 0.017809187279151942, "grad_norm": 6.4944041307307945, "kl": 0.0732421875, "learning_rate": 9.992176255430877e-07, "loss": 0.0029, "reward": 2.1266250610351562, "reward_std": 0.009740049950778484, "rewards/accuracy_reward": 0.9266249537467957, "rewards/format_reward": 1.0, "step": 1764 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.25, "epoch": 0.017819283190307924, "grad_norm": 1.6894754767109603, "kl": 0.06640625, "learning_rate": 9.992167384775225e-07, "loss": 0.0027, "reward": 2.00571870803833, "reward_std": 0.027160944417119026, "rewards/accuracy_reward": 0.8119687438011169, "rewards/format_reward": 1.0, "step": 1765 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.96875, "epoch": 0.017829379101463906, "grad_norm": 1.6260163452360339, "kl": 0.061767578125, "learning_rate": 9.992158509097534e-07, "loss": 0.0025, "reward": 2.1065001487731934, "reward_std": 0.02572716400027275, "rewards/accuracy_reward": 0.9127500057220459, "rewards/format_reward": 1.0, "step": 1766 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.53125, "epoch": 0.017839475012619888, "grad_norm": 1.8396877788550963, "kl": 0.062255859375, "learning_rate": 9.992149628397817e-07, "loss": 0.0025, "reward": 2.1352500915527344, "reward_std": 0.006572412792593241, "rewards/accuracy_reward": 0.9352499842643738, "rewards/format_reward": 1.0, "step": 1767 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.15625, "epoch": 0.01784957092377587, "grad_norm": 2.1075086971685963, "kl": 0.072265625, "learning_rate": 9.992140742676081e-07, "loss": 0.0029, "reward": 2.0841562747955322, "reward_std": 0.0170899648219347, "rewards/accuracy_reward": 0.8841562867164612, "rewards/format_reward": 1.0, "step": 1768 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.65625, "epoch": 0.017859666834931853, "grad_norm": 1.8942140049709209, "kl": 0.060546875, "learning_rate": 9.992131851932332e-07, "loss": 0.0024, "reward": 1.79171884059906, "reward_std": 0.12581849098205566, "rewards/accuracy_reward": 0.6604686975479126, "rewards/format_reward": 1.0, "step": 1769 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 405.21875, "epoch": 0.017869762746087835, "grad_norm": 1.3065031188690968, "kl": 0.051025390625, "learning_rate": 9.992122956166583e-07, "loss": 0.002, "reward": 1.8949062824249268, "reward_std": 0.002705609193071723, "rewards/accuracy_reward": 0.7449063062667847, "rewards/format_reward": 1.0, "step": 1770 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.84375, "epoch": 0.017879858657243817, "grad_norm": 1.8763373093882607, "kl": 0.047607421875, "learning_rate": 9.992114055378842e-07, "loss": 0.0019, "reward": 2.050750255584717, "reward_std": 0.1438855081796646, "rewards/accuracy_reward": 0.8632500171661377, "rewards/format_reward": 1.0, "step": 1771 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 406.0, "epoch": 0.0178899545683998, "grad_norm": 1.9136642534411301, "kl": 0.058349609375, "learning_rate": 9.992105149569117e-07, "loss": 0.0023, "reward": 1.832937479019165, "reward_std": 0.01318342611193657, "rewards/accuracy_reward": 0.6829375624656677, "rewards/format_reward": 1.0, "step": 1772 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.65625, "epoch": 0.01790005047955578, "grad_norm": 2.215528274561603, "kl": 0.0712890625, "learning_rate": 9.992096238737418e-07, "loss": 0.0028, "reward": 2.172281265258789, "reward_std": 0.010178178548812866, "rewards/accuracy_reward": 0.972281277179718, "rewards/format_reward": 1.0, "step": 1773 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.78125, "epoch": 0.017910146390711763, "grad_norm": 10.339018315445918, "kl": 0.0732421875, "learning_rate": 9.992087322883753e-07, "loss": 0.0029, "reward": 2.1055002212524414, "reward_std": 0.01585843786597252, "rewards/accuracy_reward": 0.905500054359436, "rewards/format_reward": 1.0, "step": 1774 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.125, "epoch": 0.017920242301867742, "grad_norm": 3.235385820410048, "kl": 0.07470703125, "learning_rate": 9.99207840200813e-07, "loss": 0.003, "reward": 2.016343593597412, "reward_std": 0.023031413555145264, "rewards/accuracy_reward": 0.8163437247276306, "rewards/format_reward": 1.0, "step": 1775 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.0, "epoch": 0.017930338213023724, "grad_norm": 1.6648881986961557, "kl": 0.052978515625, "learning_rate": 9.992069476110561e-07, "loss": 0.0021, "reward": 1.8728750944137573, "reward_std": 0.023143362253904343, "rewards/accuracy_reward": 0.7291249632835388, "rewards/format_reward": 1.0, "step": 1776 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.017940434124179706, "grad_norm": 2.754178713651875, "kl": 0.06201171875, "learning_rate": 9.992060545191052e-07, "loss": 0.0025, "reward": 2.167678117752075, "reward_std": 0.010647089220583439, "rewards/accuracy_reward": 0.9676781296730042, "rewards/format_reward": 1.0, "step": 1777 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.01795053003533569, "grad_norm": 2.798574639844896, "kl": 0.059814453125, "learning_rate": 9.992051609249614e-07, "loss": 0.0024, "reward": 2.135812520980835, "reward_std": 0.0088956318795681, "rewards/accuracy_reward": 0.9358124732971191, "rewards/format_reward": 1.0, "step": 1778 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.53125, "epoch": 0.01796062594649167, "grad_norm": 2.1214203054968954, "kl": 0.06982421875, "learning_rate": 9.992042668286258e-07, "loss": 0.0028, "reward": 1.8305938243865967, "reward_std": 0.008857069537043571, "rewards/accuracy_reward": 0.680593729019165, "rewards/format_reward": 1.0, "step": 1779 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.46875, "epoch": 0.017970721857647653, "grad_norm": 6.587791129937811, "kl": 0.06787109375, "learning_rate": 9.992033722300987e-07, "loss": 0.0027, "reward": 2.1233437061309814, "reward_std": 0.013804776594042778, "rewards/accuracy_reward": 0.9233437776565552, "rewards/format_reward": 1.0, "step": 1780 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.3125, "epoch": 0.017980817768803635, "grad_norm": 5.270611725077755, "kl": 0.06689453125, "learning_rate": 9.992024771293814e-07, "loss": 0.0027, "reward": 2.090656280517578, "reward_std": 0.01535507757216692, "rewards/accuracy_reward": 0.8906562924385071, "rewards/format_reward": 1.0, "step": 1781 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.6875, "epoch": 0.017990913679959617, "grad_norm": 2.566614855218254, "kl": 0.07373046875, "learning_rate": 9.992015815264746e-07, "loss": 0.0029, "reward": 2.184093713760376, "reward_std": 0.009748351760208607, "rewards/accuracy_reward": 0.9840937852859497, "rewards/format_reward": 1.0, "step": 1782 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.0625, "epoch": 0.0180010095911156, "grad_norm": 2.4638263941946095, "kl": 0.068359375, "learning_rate": 9.992006854213796e-07, "loss": 0.0027, "reward": 1.8822500705718994, "reward_std": 0.011536471545696259, "rewards/accuracy_reward": 0.7322499752044678, "rewards/format_reward": 1.0, "step": 1783 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.90625, "epoch": 0.01801110550227158, "grad_norm": 1.399853281529761, "kl": 0.049072265625, "learning_rate": 9.991997888140969e-07, "loss": 0.002, "reward": 2.1601247787475586, "reward_std": 0.0074232034385204315, "rewards/accuracy_reward": 0.9601249694824219, "rewards/format_reward": 1.0, "step": 1784 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.65625, "epoch": 0.01802120141342756, "grad_norm": 7.153243541677023, "kl": 0.0673828125, "learning_rate": 9.991988917046274e-07, "loss": 0.0027, "reward": 2.121062755584717, "reward_std": 0.02447371743619442, "rewards/accuracy_reward": 0.9273124933242798, "rewards/format_reward": 1.0, "step": 1785 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.625, "epoch": 0.018031297324583542, "grad_norm": 2.3093815576185728, "kl": 0.06396484375, "learning_rate": 9.991979940929723e-07, "loss": 0.0026, "reward": 2.0962812900543213, "reward_std": 0.02681816928088665, "rewards/accuracy_reward": 0.9025312662124634, "rewards/format_reward": 1.0, "step": 1786 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 426.1875, "epoch": 0.018041393235739524, "grad_norm": 3.5075788134679042, "kl": 0.07861328125, "learning_rate": 9.991970959791322e-07, "loss": 0.0032, "reward": 2.1543126106262207, "reward_std": 0.010858705267310143, "rewards/accuracy_reward": 0.9543125033378601, "rewards/format_reward": 1.0, "step": 1787 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.96875, "epoch": 0.018051489146895507, "grad_norm": 1.0114497811273426, "kl": 0.06396484375, "learning_rate": 9.991961973631082e-07, "loss": 0.0026, "reward": 1.9707499742507935, "reward_std": 0.15526476502418518, "rewards/accuracy_reward": 0.8019999861717224, "rewards/format_reward": 1.0, "step": 1788 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.40625, "epoch": 0.01806158505805149, "grad_norm": 2.6936940928942263, "kl": 0.059814453125, "learning_rate": 9.991952982449011e-07, "loss": 0.0024, "reward": 2.1446876525878906, "reward_std": 0.01165645569562912, "rewards/accuracy_reward": 0.9446874856948853, "rewards/format_reward": 1.0, "step": 1789 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 420.96875, "epoch": 0.01807168096920747, "grad_norm": 1.5752439068969664, "kl": 0.0537109375, "learning_rate": 9.991943986245122e-07, "loss": 0.0022, "reward": 1.5175625085830688, "reward_std": 0.019814031198620796, "rewards/accuracy_reward": 0.4175625145435333, "rewards/format_reward": 1.0, "step": 1790 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.71875, "epoch": 0.018081776880363453, "grad_norm": 1.9135995282295009, "kl": 0.068359375, "learning_rate": 9.991934985019416e-07, "loss": 0.0027, "reward": 2.125906229019165, "reward_std": 0.009862982667982578, "rewards/accuracy_reward": 0.9259063005447388, "rewards/format_reward": 1.0, "step": 1791 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.71875, "epoch": 0.018091872791519435, "grad_norm": 11.10329051748649, "kl": 0.07080078125, "learning_rate": 9.99192597877191e-07, "loss": 0.0028, "reward": 2.102562427520752, "reward_std": 0.018177464604377747, "rewards/accuracy_reward": 0.9025625586509705, "rewards/format_reward": 1.0, "step": 1792 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.8125, "epoch": 0.018101968702675417, "grad_norm": 1.953654253418898, "kl": 0.06591796875, "learning_rate": 9.991916967502608e-07, "loss": 0.0026, "reward": 2.1408748626708984, "reward_std": 0.010517846792936325, "rewards/accuracy_reward": 0.9408749938011169, "rewards/format_reward": 1.0, "step": 1793 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 429.15625, "epoch": 0.0181120646138314, "grad_norm": 1.693846668109413, "kl": 0.064453125, "learning_rate": 9.99190795121152e-07, "loss": 0.0026, "reward": 1.793125033378601, "reward_std": 0.005003123078495264, "rewards/accuracy_reward": 0.6431249976158142, "rewards/format_reward": 1.0, "step": 1794 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 422.46875, "epoch": 0.01812216052498738, "grad_norm": 1.6020205603035955, "kl": 0.06640625, "learning_rate": 9.991898929898656e-07, "loss": 0.0027, "reward": 1.746187448501587, "reward_std": 0.007179850246757269, "rewards/accuracy_reward": 0.5961874723434448, "rewards/format_reward": 1.0, "step": 1795 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 438.40625, "epoch": 0.01813225643614336, "grad_norm": 2.3453554774212124, "kl": 0.056640625, "learning_rate": 9.991889903564026e-07, "loss": 0.0023, "reward": 1.812406301498413, "reward_std": 0.026449428871273994, "rewards/accuracy_reward": 0.6686562299728394, "rewards/format_reward": 1.0, "step": 1796 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.018142352347299343, "grad_norm": 2.2941883110950605, "kl": 0.072265625, "learning_rate": 9.991880872207637e-07, "loss": 0.0029, "reward": 2.1702189445495605, "reward_std": 0.02536485716700554, "rewards/accuracy_reward": 0.9764687418937683, "rewards/format_reward": 1.0, "step": 1797 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.3125, "epoch": 0.018152448258455325, "grad_norm": 8.501091039882963, "kl": 0.060546875, "learning_rate": 9.9918718358295e-07, "loss": 0.0024, "reward": 1.9567186832427979, "reward_std": 0.19634002447128296, "rewards/accuracy_reward": 0.7879687547683716, "rewards/format_reward": 1.0, "step": 1798 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.625, "epoch": 0.018162544169611307, "grad_norm": 3.494203906719584, "kl": 0.072265625, "learning_rate": 9.991862794429622e-07, "loss": 0.0029, "reward": 1.993687629699707, "reward_std": 0.012501655146479607, "rewards/accuracy_reward": 0.7936874628067017, "rewards/format_reward": 1.0, "step": 1799 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.01817264008076729, "grad_norm": 2.5987816791487925, "kl": 0.06494140625, "learning_rate": 9.991853748008015e-07, "loss": 0.0026, "reward": 2.124906063079834, "reward_std": 0.02897581458091736, "rewards/accuracy_reward": 0.9311562776565552, "rewards/format_reward": 1.0, "step": 1800 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.01818273599192327, "grad_norm": 1.862479782886569, "kl": 0.06494140625, "learning_rate": 9.991844696564683e-07, "loss": 0.0026, "reward": 2.1281561851501465, "reward_std": 0.02375960163772106, "rewards/accuracy_reward": 0.9344062209129333, "rewards/format_reward": 1.0, "step": 1801 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.15625, "epoch": 0.018192831903079253, "grad_norm": 2.255224813800656, "kl": 0.06103515625, "learning_rate": 9.991835640099643e-07, "loss": 0.0024, "reward": 2.0832815170288086, "reward_std": 0.007892662659287453, "rewards/accuracy_reward": 0.8832812309265137, "rewards/format_reward": 1.0, "step": 1802 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.8125, "epoch": 0.018202927814235235, "grad_norm": 1.6512878617384579, "kl": 0.058349609375, "learning_rate": 9.991826578612896e-07, "loss": 0.0023, "reward": 1.8438750505447388, "reward_std": 0.0439378097653389, "rewards/accuracy_reward": 0.7126250267028809, "rewards/format_reward": 1.0, "step": 1803 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.09375, "epoch": 0.018213023725391218, "grad_norm": 7.143489956322423, "kl": 0.055908203125, "learning_rate": 9.991817512104456e-07, "loss": 0.0022, "reward": 1.849656343460083, "reward_std": 0.011605796404182911, "rewards/accuracy_reward": 0.6996563076972961, "rewards/format_reward": 1.0, "step": 1804 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 407.40625, "epoch": 0.0182231196365472, "grad_norm": 1.0427195197343313, "kl": 0.0537109375, "learning_rate": 9.991808440574331e-07, "loss": 0.0021, "reward": 1.548187494277954, "reward_std": 0.006839683745056391, "rewards/accuracy_reward": 0.4481875002384186, "rewards/format_reward": 1.0, "step": 1805 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.01823321554770318, "grad_norm": 3.4272933920184196, "kl": 0.056640625, "learning_rate": 9.99179936402253e-07, "loss": 0.0023, "reward": 1.8932501077651978, "reward_std": 0.16532297432422638, "rewards/accuracy_reward": 0.7369999885559082, "rewards/format_reward": 1.0, "step": 1806 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.21875, "epoch": 0.01824331145885916, "grad_norm": 5.034173068348392, "kl": 0.064453125, "learning_rate": 9.99179028244906e-07, "loss": 0.0026, "reward": 1.8268749713897705, "reward_std": 0.013947131112217903, "rewards/accuracy_reward": 0.6768749952316284, "rewards/format_reward": 1.0, "step": 1807 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.53125, "epoch": 0.018253407370015143, "grad_norm": 1.9337118192730058, "kl": 0.0654296875, "learning_rate": 9.991781195853934e-07, "loss": 0.0026, "reward": 2.0432188510894775, "reward_std": 0.024637371301651, "rewards/accuracy_reward": 0.8494687676429749, "rewards/format_reward": 1.0, "step": 1808 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.75, "epoch": 0.018263503281171125, "grad_norm": 1.5943256566366995, "kl": 0.054931640625, "learning_rate": 9.99177210423716e-07, "loss": 0.0022, "reward": 2.1279687881469727, "reward_std": 0.006973888725042343, "rewards/accuracy_reward": 0.9279687404632568, "rewards/format_reward": 1.0, "step": 1809 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.46875, "epoch": 0.018273599192327107, "grad_norm": 2.150481349644566, "kl": 0.06494140625, "learning_rate": 9.991763007598744e-07, "loss": 0.0026, "reward": 2.1595311164855957, "reward_std": 0.0100231459364295, "rewards/accuracy_reward": 0.9595311880111694, "rewards/format_reward": 1.0, "step": 1810 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.0, "epoch": 0.01828369510348309, "grad_norm": 2.1142441996715826, "kl": 0.0517578125, "learning_rate": 9.9917539059387e-07, "loss": 0.0021, "reward": 1.8977186679840088, "reward_std": 0.14410588145256042, "rewards/accuracy_reward": 0.7352188229560852, "rewards/format_reward": 1.0, "step": 1811 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.03125, "epoch": 0.01829379101463907, "grad_norm": 1.3647448687733066, "kl": 0.04638671875, "learning_rate": 9.991744799257034e-07, "loss": 0.0019, "reward": 2.012500047683716, "reward_std": 0.2664227783679962, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1812 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.5625, "epoch": 0.018303886925795054, "grad_norm": 1.6834120149327318, "kl": 0.0654296875, "learning_rate": 9.991735687553755e-07, "loss": 0.0026, "reward": 2.131500244140625, "reward_std": 0.006188587285578251, "rewards/accuracy_reward": 0.9314999580383301, "rewards/format_reward": 1.0, "step": 1813 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.21875, "epoch": 0.018313982836951036, "grad_norm": 2.6319439794064143, "kl": 0.06640625, "learning_rate": 9.991726570828874e-07, "loss": 0.0027, "reward": 1.9784061908721924, "reward_std": 0.030963057652115822, "rewards/accuracy_reward": 0.7909062504768372, "rewards/format_reward": 1.0, "step": 1814 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 358.84375, "epoch": 0.018324078748107018, "grad_norm": 2.899632638601739, "kl": 0.072265625, "learning_rate": 9.9917174490824e-07, "loss": 0.0029, "reward": 2.0998125076293945, "reward_std": 0.03744509071111679, "rewards/accuracy_reward": 0.9123125076293945, "rewards/format_reward": 1.0, "step": 1815 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.875, "epoch": 0.018334174659263, "grad_norm": 4.20166109069013, "kl": 0.07275390625, "learning_rate": 9.991708322314341e-07, "loss": 0.0029, "reward": 2.093843936920166, "reward_std": 0.04731232672929764, "rewards/accuracy_reward": 0.9063437581062317, "rewards/format_reward": 1.0, "step": 1816 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 387.25, "epoch": 0.01834427057041898, "grad_norm": 3.2841248504635474, "kl": 0.05908203125, "learning_rate": 9.991699190524705e-07, "loss": 0.0024, "reward": 1.8596563339233398, "reward_std": 0.041405633091926575, "rewards/accuracy_reward": 0.722156286239624, "rewards/format_reward": 1.0, "step": 1817 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 371.6875, "epoch": 0.01835436648157496, "grad_norm": 2.470163396381619, "kl": 0.0712890625, "learning_rate": 9.991690053713505e-07, "loss": 0.0029, "reward": 2.0977187156677246, "reward_std": 0.006674167234450579, "rewards/accuracy_reward": 0.8977187871932983, "rewards/format_reward": 1.0, "step": 1818 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.78125, "epoch": 0.018364462392730943, "grad_norm": 2.470752725165236, "kl": 0.060791015625, "learning_rate": 9.991680911880746e-07, "loss": 0.0024, "reward": 2.1582813262939453, "reward_std": 0.006014433689415455, "rewards/accuracy_reward": 0.9582812786102295, "rewards/format_reward": 1.0, "step": 1819 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.25, "epoch": 0.018374558303886925, "grad_norm": 23.15470268444882, "kl": 0.0654296875, "learning_rate": 9.99167176502644e-07, "loss": 0.0026, "reward": 2.02524995803833, "reward_std": 0.02543935552239418, "rewards/accuracy_reward": 0.8314999341964722, "rewards/format_reward": 1.0, "step": 1820 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.1875, "epoch": 0.018384654215042907, "grad_norm": 4.088799896609849, "kl": 0.0830078125, "learning_rate": 9.991662613150596e-07, "loss": 0.0033, "reward": 2.1440625190734863, "reward_std": 0.011687969788908958, "rewards/accuracy_reward": 0.9440624713897705, "rewards/format_reward": 1.0, "step": 1821 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.96875, "epoch": 0.01839475012619889, "grad_norm": 5.567889150975188, "kl": 0.07568359375, "learning_rate": 9.991653456253221e-07, "loss": 0.003, "reward": 2.130812644958496, "reward_std": 0.015438884496688843, "rewards/accuracy_reward": 0.9308125376701355, "rewards/format_reward": 1.0, "step": 1822 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.3125, "epoch": 0.01840484603735487, "grad_norm": 7.270416719810151, "kl": 0.0576171875, "learning_rate": 9.991644294334328e-07, "loss": 0.0023, "reward": 1.8352186679840088, "reward_std": 0.1405390352010727, "rewards/accuracy_reward": 0.6727187633514404, "rewards/format_reward": 1.0, "step": 1823 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.5, "epoch": 0.018414941948510854, "grad_norm": 2.0841023203496096, "kl": 0.064453125, "learning_rate": 9.991635127393923e-07, "loss": 0.0026, "reward": 2.1751561164855957, "reward_std": 0.02587086334824562, "rewards/accuracy_reward": 0.9876562356948853, "rewards/format_reward": 1.0, "step": 1824 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.46875, "epoch": 0.018425037859666836, "grad_norm": 4.965729929496323, "kl": 0.07568359375, "learning_rate": 9.991625955432016e-07, "loss": 0.003, "reward": 2.1008124351501465, "reward_std": 0.03407168388366699, "rewards/accuracy_reward": 0.9070624709129333, "rewards/format_reward": 1.0, "step": 1825 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.6875, "epoch": 0.018435133770822818, "grad_norm": 1.9219948322104012, "kl": 0.0625, "learning_rate": 9.991616778448617e-07, "loss": 0.0025, "reward": 2.1228437423706055, "reward_std": 0.12926426529884338, "rewards/accuracy_reward": 0.9415937066078186, "rewards/format_reward": 1.0, "step": 1826 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.84375, "epoch": 0.0184452296819788, "grad_norm": 3.3727731253131243, "kl": 0.06494140625, "learning_rate": 9.991607596443735e-07, "loss": 0.0026, "reward": 2.1542186737060547, "reward_std": 0.007565150503069162, "rewards/accuracy_reward": 0.9542187452316284, "rewards/format_reward": 1.0, "step": 1827 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.25, "epoch": 0.01845532559313478, "grad_norm": 2.4626259713277308, "kl": 0.060302734375, "learning_rate": 9.99159840941738e-07, "loss": 0.0024, "reward": 2.141218662261963, "reward_std": 0.04810859635472298, "rewards/accuracy_reward": 0.9537187218666077, "rewards/format_reward": 1.0, "step": 1828 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.90625, "epoch": 0.01846542150429076, "grad_norm": 4.948695515912779, "kl": 0.0732421875, "learning_rate": 9.991589217369558e-07, "loss": 0.0029, "reward": 2.1214375495910645, "reward_std": 0.011327249929308891, "rewards/accuracy_reward": 0.9214375019073486, "rewards/format_reward": 1.0, "step": 1829 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 380.84375, "epoch": 0.018475517415446743, "grad_norm": 2.291365347348519, "kl": 0.0791015625, "learning_rate": 9.99158002030028e-07, "loss": 0.0032, "reward": 1.824406385421753, "reward_std": 0.022966723889112473, "rewards/accuracy_reward": 0.6806561946868896, "rewards/format_reward": 1.0, "step": 1830 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.53125, "epoch": 0.018485613326602725, "grad_norm": 2.86036347477506, "kl": 0.07080078125, "learning_rate": 9.991570818209555e-07, "loss": 0.0028, "reward": 2.1470000743865967, "reward_std": 0.029667232185602188, "rewards/accuracy_reward": 0.953249990940094, "rewards/format_reward": 1.0, "step": 1831 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 417.375, "epoch": 0.018495709237758708, "grad_norm": 0.9757376942020436, "kl": 0.05712890625, "learning_rate": 9.991561611097397e-07, "loss": 0.0023, "reward": 1.8640625476837158, "reward_std": 0.0025310886558145285, "rewards/accuracy_reward": 0.714062511920929, "rewards/format_reward": 1.0, "step": 1832 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 393.3125, "epoch": 0.01850580514891469, "grad_norm": 1.9500909008326954, "kl": 0.06494140625, "learning_rate": 9.99155239896381e-07, "loss": 0.0026, "reward": 1.8564999103546143, "reward_std": 0.008985213935375214, "rewards/accuracy_reward": 0.7065000534057617, "rewards/format_reward": 1.0, "step": 1833 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.9375, "epoch": 0.018515901060070672, "grad_norm": 2.2262175772904644, "kl": 0.05859375, "learning_rate": 9.9915431818088e-07, "loss": 0.0023, "reward": 1.9288749694824219, "reward_std": 0.14810319244861603, "rewards/accuracy_reward": 0.7663750648498535, "rewards/format_reward": 1.0, "step": 1834 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.40625, "epoch": 0.018525996971226654, "grad_norm": 1.5061079655469805, "kl": 0.0693359375, "learning_rate": 9.991533959632386e-07, "loss": 0.0028, "reward": 2.1469686031341553, "reward_std": 0.00954373087733984, "rewards/accuracy_reward": 0.946968674659729, "rewards/format_reward": 1.0, "step": 1835 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.96875, "epoch": 0.018536092882382636, "grad_norm": 2.664846170066828, "kl": 0.07568359375, "learning_rate": 9.991524732434571e-07, "loss": 0.003, "reward": 2.0917186737060547, "reward_std": 0.010963203385472298, "rewards/accuracy_reward": 0.8917187452316284, "rewards/format_reward": 1.0, "step": 1836 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.375, "epoch": 0.01854618879353862, "grad_norm": 2.8105657293015116, "kl": 0.0810546875, "learning_rate": 9.991515500215363e-07, "loss": 0.0033, "reward": 2.0655312538146973, "reward_std": 0.013168446719646454, "rewards/accuracy_reward": 0.8655312657356262, "rewards/format_reward": 1.0, "step": 1837 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.90625, "epoch": 0.018556284704694597, "grad_norm": 3.643238146538038, "kl": 0.0908203125, "learning_rate": 9.991506262974777e-07, "loss": 0.0036, "reward": 1.983875036239624, "reward_std": 0.02061600238084793, "rewards/accuracy_reward": 0.783875048160553, "rewards/format_reward": 1.0, "step": 1838 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.78125, "epoch": 0.01856638061585058, "grad_norm": 2.19078761110415, "kl": 0.07568359375, "learning_rate": 9.991497020712816e-07, "loss": 0.003, "reward": 1.7038438320159912, "reward_std": 0.013387054204940796, "rewards/accuracy_reward": 0.5538437366485596, "rewards/format_reward": 1.0, "step": 1839 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.0625, "epoch": 0.01857647652700656, "grad_norm": 1.9502185902129812, "kl": 0.0830078125, "learning_rate": 9.991487773429491e-07, "loss": 0.0033, "reward": 2.0003750324249268, "reward_std": 0.031002826988697052, "rewards/accuracy_reward": 0.8066250085830688, "rewards/format_reward": 1.0, "step": 1840 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.71875, "epoch": 0.018586572438162544, "grad_norm": 4.16046787839232, "kl": 0.08203125, "learning_rate": 9.991478521124816e-07, "loss": 0.0033, "reward": 2.091249942779541, "reward_std": 0.017016347497701645, "rewards/accuracy_reward": 0.8912500739097595, "rewards/format_reward": 1.0, "step": 1841 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.6875, "epoch": 0.018596668349318526, "grad_norm": 6.556711993552666, "kl": 0.06591796875, "learning_rate": 9.991469263798797e-07, "loss": 0.0026, "reward": 2.1226251125335693, "reward_std": 0.00678679533302784, "rewards/accuracy_reward": 0.922624945640564, "rewards/format_reward": 1.0, "step": 1842 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.40625, "epoch": 0.018606764260474508, "grad_norm": 4.16356150224774, "kl": 0.07421875, "learning_rate": 9.99146000145144e-07, "loss": 0.003, "reward": 2.1351561546325684, "reward_std": 0.02908484824001789, "rewards/accuracy_reward": 0.94140625, "rewards/format_reward": 1.0, "step": 1843 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.40625, "epoch": 0.01861686017163049, "grad_norm": 15.817651108557296, "kl": 0.059814453125, "learning_rate": 9.99145073408276e-07, "loss": 0.0024, "reward": 2.1424062252044678, "reward_std": 0.01015926618129015, "rewards/accuracy_reward": 0.9424062967300415, "rewards/format_reward": 1.0, "step": 1844 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.6875, "epoch": 0.018626956082786472, "grad_norm": 1.4996124430511122, "kl": 0.061279296875, "learning_rate": 9.991441461692763e-07, "loss": 0.0025, "reward": 2.15262508392334, "reward_std": 0.0061193183064460754, "rewards/accuracy_reward": 0.9526249766349792, "rewards/format_reward": 1.0, "step": 1845 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 396.15625, "epoch": 0.018637051993942454, "grad_norm": 2.2122273558935244, "kl": 0.06982421875, "learning_rate": 9.991432184281459e-07, "loss": 0.0028, "reward": 1.8459374904632568, "reward_std": 0.012027600780129433, "rewards/accuracy_reward": 0.6959375143051147, "rewards/format_reward": 1.0, "step": 1846 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 403.5, "epoch": 0.018647147905098437, "grad_norm": 1.4624042983442538, "kl": 0.05712890625, "learning_rate": 9.991422901848857e-07, "loss": 0.0023, "reward": 1.6915000677108765, "reward_std": 0.019752172753214836, "rewards/accuracy_reward": 0.5477499961853027, "rewards/format_reward": 1.0, "step": 1847 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.375, "epoch": 0.01865724381625442, "grad_norm": 2.9265255808682236, "kl": 0.06494140625, "learning_rate": 9.991413614394967e-07, "loss": 0.0026, "reward": 2.153625011444092, "reward_std": 0.009620215743780136, "rewards/accuracy_reward": 0.953624963760376, "rewards/format_reward": 1.0, "step": 1848 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.0, "epoch": 0.018667339727410397, "grad_norm": 2.981960988737329, "kl": 0.0693359375, "learning_rate": 9.991404321919798e-07, "loss": 0.0028, "reward": 2.0896248817443848, "reward_std": 0.01494111493229866, "rewards/accuracy_reward": 0.8896250128746033, "rewards/format_reward": 1.0, "step": 1849 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.5625, "epoch": 0.01867743563856638, "grad_norm": 3.159952804374904, "kl": 0.05859375, "learning_rate": 9.991395024423359e-07, "loss": 0.0023, "reward": 2.0619375705718994, "reward_std": 0.150079607963562, "rewards/accuracy_reward": 0.8744375705718994, "rewards/format_reward": 1.0, "step": 1850 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.40625, "epoch": 0.01868753154972236, "grad_norm": 2.021400659815226, "kl": 0.059814453125, "learning_rate": 9.991385721905662e-07, "loss": 0.0024, "reward": 1.9127812385559082, "reward_std": 0.18456165492534637, "rewards/accuracy_reward": 0.7502812743186951, "rewards/format_reward": 1.0, "step": 1851 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.15625, "epoch": 0.018697627460878344, "grad_norm": 1.9530308698128847, "kl": 0.072265625, "learning_rate": 9.99137641436671e-07, "loss": 0.0029, "reward": 2.0190625190734863, "reward_std": 0.021437423303723335, "rewards/accuracy_reward": 0.8190625309944153, "rewards/format_reward": 1.0, "step": 1852 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.90625, "epoch": 0.018707723372034326, "grad_norm": 2.436577886193157, "kl": 0.08056640625, "learning_rate": 9.99136710180652e-07, "loss": 0.0032, "reward": 2.1177186965942383, "reward_std": 0.021778922528028488, "rewards/accuracy_reward": 0.917718768119812, "rewards/format_reward": 1.0, "step": 1853 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.018717819283190308, "grad_norm": 5.424459135943244, "kl": 0.07275390625, "learning_rate": 9.991357784225096e-07, "loss": 0.0029, "reward": 2.0714375972747803, "reward_std": 0.014402811415493488, "rewards/accuracy_reward": 0.8714375495910645, "rewards/format_reward": 1.0, "step": 1854 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.5625, "epoch": 0.01872791519434629, "grad_norm": 2.036208945866061, "kl": 0.06201171875, "learning_rate": 9.99134846162245e-07, "loss": 0.0025, "reward": 1.8560311794281006, "reward_std": 0.006468633189797401, "rewards/accuracy_reward": 0.7060312032699585, "rewards/format_reward": 1.0, "step": 1855 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.625, "epoch": 0.018738011105502272, "grad_norm": 1.8025154815741924, "kl": 0.057373046875, "learning_rate": 9.991339133998592e-07, "loss": 0.0023, "reward": 2.1092188358306885, "reward_std": 0.03080226480960846, "rewards/accuracy_reward": 0.9154687523841858, "rewards/format_reward": 1.0, "step": 1856 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.0625, "epoch": 0.018748107016658255, "grad_norm": 4.105900114226547, "kl": 0.064453125, "learning_rate": 9.99132980135353e-07, "loss": 0.0026, "reward": 2.111156463623047, "reward_std": 0.02039366215467453, "rewards/accuracy_reward": 0.9111562371253967, "rewards/format_reward": 1.0, "step": 1857 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.53125, "epoch": 0.018758202927814237, "grad_norm": 2.0038463117902467, "kl": 0.06298828125, "learning_rate": 9.99132046368727e-07, "loss": 0.0025, "reward": 1.7917187213897705, "reward_std": 0.009220210835337639, "rewards/accuracy_reward": 0.6417187452316284, "rewards/format_reward": 1.0, "step": 1858 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.96875, "epoch": 0.018768298838970215, "grad_norm": 3.4542979771697135, "kl": 0.06396484375, "learning_rate": 9.991311120999828e-07, "loss": 0.0026, "reward": 2.088562488555908, "reward_std": 0.019190946593880653, "rewards/accuracy_reward": 0.8885625004768372, "rewards/format_reward": 1.0, "step": 1859 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 418.53125, "epoch": 0.018778394750126198, "grad_norm": 1.6045381503003153, "kl": 0.06591796875, "learning_rate": 9.99130177329121e-07, "loss": 0.0026, "reward": 1.8506875038146973, "reward_std": 0.007015874609351158, "rewards/accuracy_reward": 0.7006875276565552, "rewards/format_reward": 1.0, "step": 1860 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.59375, "epoch": 0.01878849066128218, "grad_norm": 3.5291148835647386, "kl": 0.06396484375, "learning_rate": 9.991292420561423e-07, "loss": 0.0025, "reward": 2.12681245803833, "reward_std": 0.03363748639822006, "rewards/accuracy_reward": 0.9330624938011169, "rewards/format_reward": 1.0, "step": 1861 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.46875, "epoch": 0.018798586572438162, "grad_norm": 2.341730804077736, "kl": 0.0703125, "learning_rate": 9.991283062810484e-07, "loss": 0.0028, "reward": 1.8665001392364502, "reward_std": 0.09743289649486542, "rewards/accuracy_reward": 0.7164999842643738, "rewards/format_reward": 1.0, "step": 1862 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.0, "epoch": 0.018808682483594144, "grad_norm": 2.1647901078359246, "kl": 0.0615234375, "learning_rate": 9.991273700038392e-07, "loss": 0.0025, "reward": 2.072718620300293, "reward_std": 0.1550944745540619, "rewards/accuracy_reward": 0.8977186679840088, "rewards/format_reward": 1.0, "step": 1863 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 417.4375, "epoch": 0.018818778394750126, "grad_norm": 1.9383656504846818, "kl": 0.05908203125, "learning_rate": 9.991264332245164e-07, "loss": 0.0024, "reward": 1.8704376220703125, "reward_std": 0.008567352779209614, "rewards/accuracy_reward": 0.7204375267028809, "rewards/format_reward": 1.0, "step": 1864 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.21875, "epoch": 0.01882887430590611, "grad_norm": 1.934018482879718, "kl": 0.0576171875, "learning_rate": 9.99125495943081e-07, "loss": 0.0023, "reward": 1.86328125, "reward_std": 0.14707814157009125, "rewards/accuracy_reward": 0.7007812261581421, "rewards/format_reward": 1.0, "step": 1865 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.59375, "epoch": 0.01883897021706209, "grad_norm": 2.5753934432997263, "kl": 0.06982421875, "learning_rate": 9.991245581595332e-07, "loss": 0.0028, "reward": 2.086937427520752, "reward_std": 0.015337581746280193, "rewards/accuracy_reward": 0.8869374394416809, "rewards/format_reward": 1.0, "step": 1866 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.03125, "epoch": 0.018849066128218073, "grad_norm": 2.006240883550906, "kl": 0.07080078125, "learning_rate": 9.991236198738747e-07, "loss": 0.0028, "reward": 2.144312620162964, "reward_std": 0.013731461018323898, "rewards/accuracy_reward": 0.944312572479248, "rewards/format_reward": 1.0, "step": 1867 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 426.9375, "epoch": 0.018859162039374055, "grad_norm": 1.5195405465510714, "kl": 0.060302734375, "learning_rate": 9.991226810861059e-07, "loss": 0.0024, "reward": 1.8633124828338623, "reward_std": 0.01866181567311287, "rewards/accuracy_reward": 0.7195625305175781, "rewards/format_reward": 1.0, "step": 1868 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.375, "epoch": 0.018869257950530037, "grad_norm": 2.970156705924826, "kl": 0.06689453125, "learning_rate": 9.991217417962284e-07, "loss": 0.0027, "reward": 1.8146250247955322, "reward_std": 0.2908630967140198, "rewards/accuracy_reward": 0.6646249890327454, "rewards/format_reward": 1.0, "step": 1869 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.9375, "epoch": 0.018879353861686016, "grad_norm": 4.638985633527675, "kl": 0.06982421875, "learning_rate": 9.991208020042423e-07, "loss": 0.0028, "reward": 2.0074689388275146, "reward_std": 0.02310880646109581, "rewards/accuracy_reward": 0.8074687719345093, "rewards/format_reward": 1.0, "step": 1870 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.59375, "epoch": 0.018889449772841998, "grad_norm": 2.1347660776998314, "kl": 0.07958984375, "learning_rate": 9.991198617101493e-07, "loss": 0.0032, "reward": 1.9193124771118164, "reward_std": 0.17146196961402893, "rewards/accuracy_reward": 0.7443124651908875, "rewards/format_reward": 1.0, "step": 1871 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.8125, "epoch": 0.01889954568399798, "grad_norm": 2.884452032492732, "kl": 0.0830078125, "learning_rate": 9.9911892091395e-07, "loss": 0.0033, "reward": 2.072187662124634, "reward_std": 0.014267303049564362, "rewards/accuracy_reward": 0.8721874952316284, "rewards/format_reward": 1.0, "step": 1872 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.59375, "epoch": 0.018909641595153962, "grad_norm": 2.512053739185126, "kl": 0.0751953125, "learning_rate": 9.991179796156453e-07, "loss": 0.003, "reward": 2.032437562942505, "reward_std": 0.012473939917981625, "rewards/accuracy_reward": 0.8324375748634338, "rewards/format_reward": 1.0, "step": 1873 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.0625, "epoch": 0.018919737506309944, "grad_norm": 3.1396323320285275, "kl": 0.06201171875, "learning_rate": 9.991170378152364e-07, "loss": 0.0025, "reward": 2.159968852996826, "reward_std": 0.01154524739831686, "rewards/accuracy_reward": 0.9599686861038208, "rewards/format_reward": 1.0, "step": 1874 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.28125, "epoch": 0.018929833417465926, "grad_norm": 6.77366659007131, "kl": 0.07373046875, "learning_rate": 9.991160955127238e-07, "loss": 0.0029, "reward": 2.067906379699707, "reward_std": 0.04136895388364792, "rewards/accuracy_reward": 0.8679062724113464, "rewards/format_reward": 1.0, "step": 1875 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 412.40625, "epoch": 0.01893992932862191, "grad_norm": 1.6504471726108183, "kl": 0.062255859375, "learning_rate": 9.99115152708109e-07, "loss": 0.0025, "reward": 1.5596562623977661, "reward_std": 0.0074893818236887455, "rewards/accuracy_reward": 0.4596562683582306, "rewards/format_reward": 1.0, "step": 1876 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.25, "epoch": 0.01895002523977789, "grad_norm": 12.196490052260797, "kl": 0.0791015625, "learning_rate": 9.991142094013925e-07, "loss": 0.0032, "reward": 2.0610625743865967, "reward_std": 0.026150286197662354, "rewards/accuracy_reward": 0.8610624074935913, "rewards/format_reward": 1.0, "step": 1877 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.40625, "epoch": 0.018960121150933873, "grad_norm": 2.0825866659955667, "kl": 0.06005859375, "learning_rate": 9.991132655925754e-07, "loss": 0.0024, "reward": 2.179374933242798, "reward_std": 0.023218173533678055, "rewards/accuracy_reward": 0.9856250286102295, "rewards/format_reward": 1.0, "step": 1878 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.125, "epoch": 0.018970217062089855, "grad_norm": 3.777206741515817, "kl": 0.07958984375, "learning_rate": 9.991123212816588e-07, "loss": 0.0032, "reward": 2.0209686756134033, "reward_std": 0.026075663045048714, "rewards/accuracy_reward": 0.820968747138977, "rewards/format_reward": 1.0, "step": 1879 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.96875, "epoch": 0.018980312973245834, "grad_norm": 11.580278541666049, "kl": 0.0703125, "learning_rate": 9.991113764686435e-07, "loss": 0.0028, "reward": 2.136219024658203, "reward_std": 0.018004346638917923, "rewards/accuracy_reward": 0.9362187385559082, "rewards/format_reward": 1.0, "step": 1880 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.4375, "epoch": 0.018990408884401816, "grad_norm": 2.4493627892390153, "kl": 0.0791015625, "learning_rate": 9.991104311535304e-07, "loss": 0.0032, "reward": 2.1058437824249268, "reward_std": 0.013614067807793617, "rewards/accuracy_reward": 0.9058436751365662, "rewards/format_reward": 1.0, "step": 1881 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.46875, "epoch": 0.019000504795557798, "grad_norm": 1.864432852453935, "kl": 0.072265625, "learning_rate": 9.991094853363205e-07, "loss": 0.0029, "reward": 2.028156280517578, "reward_std": 0.029986074194312096, "rewards/accuracy_reward": 0.8281562328338623, "rewards/format_reward": 1.0, "step": 1882 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.21875, "epoch": 0.01901060070671378, "grad_norm": 2.103923702344209, "kl": 0.06103515625, "learning_rate": 9.991085390170148e-07, "loss": 0.0024, "reward": 1.5552186965942383, "reward_std": 0.1504894345998764, "rewards/accuracy_reward": 0.4427187442779541, "rewards/format_reward": 1.0, "step": 1883 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.53125, "epoch": 0.019020696617869762, "grad_norm": 3.249605908105449, "kl": 0.0673828125, "learning_rate": 9.991075921956145e-07, "loss": 0.0027, "reward": 1.8299063444137573, "reward_std": 0.15145349502563477, "rewards/accuracy_reward": 0.6674062609672546, "rewards/format_reward": 1.0, "step": 1884 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.65625, "epoch": 0.019030792529025745, "grad_norm": 2.958137413758885, "kl": 0.08203125, "learning_rate": 9.9910664487212e-07, "loss": 0.0033, "reward": 2.092531204223633, "reward_std": 0.015126490034162998, "rewards/accuracy_reward": 0.892531156539917, "rewards/format_reward": 1.0, "step": 1885 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.71875, "epoch": 0.019040888440181727, "grad_norm": 2.6282058399269013, "kl": 0.08349609375, "learning_rate": 9.991056970465325e-07, "loss": 0.0033, "reward": 2.1747498512268066, "reward_std": 0.015023592859506607, "rewards/accuracy_reward": 0.9747499823570251, "rewards/format_reward": 1.0, "step": 1886 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 396.25, "epoch": 0.01905098435133771, "grad_norm": 3.3673134474303783, "kl": 0.08203125, "learning_rate": 9.99104748718853e-07, "loss": 0.0033, "reward": 1.7839999198913574, "reward_std": 0.023341966792941093, "rewards/accuracy_reward": 0.6402499675750732, "rewards/format_reward": 1.0, "step": 1887 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.75, "epoch": 0.01906108026249369, "grad_norm": 2.5615783537930685, "kl": 0.064453125, "learning_rate": 9.991037998890826e-07, "loss": 0.0026, "reward": 1.821874976158142, "reward_std": 0.011127891018986702, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1888 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.019071176173649673, "grad_norm": 2.2070746519808178, "kl": 0.0654296875, "learning_rate": 9.991028505572222e-07, "loss": 0.0026, "reward": 1.860937476158142, "reward_std": 0.32664722204208374, "rewards/accuracy_reward": 0.7109375, "rewards/format_reward": 1.0, "step": 1889 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.34375, "epoch": 0.019081272084805655, "grad_norm": 2.1185417084392593, "kl": 0.05810546875, "learning_rate": 9.991019007232723e-07, "loss": 0.0023, "reward": 2.086750030517578, "reward_std": 0.011466018855571747, "rewards/accuracy_reward": 0.8867499828338623, "rewards/format_reward": 1.0, "step": 1890 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.0625, "epoch": 0.019091367995961634, "grad_norm": 1.9822549089204333, "kl": 0.06591796875, "learning_rate": 9.991009503872344e-07, "loss": 0.0026, "reward": 2.1564064025878906, "reward_std": 0.0059936754405498505, "rewards/accuracy_reward": 0.9564062356948853, "rewards/format_reward": 1.0, "step": 1891 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.25, "epoch": 0.019101463907117616, "grad_norm": 2.1452242540064432, "kl": 0.0712890625, "learning_rate": 9.990999995491092e-07, "loss": 0.0029, "reward": 1.9722501039505005, "reward_std": 0.16976863145828247, "rewards/accuracy_reward": 0.797249972820282, "rewards/format_reward": 1.0, "step": 1892 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.4375, "epoch": 0.0191115598182736, "grad_norm": 2.214910271091742, "kl": 0.10107421875, "learning_rate": 9.990990482088978e-07, "loss": 0.004, "reward": 2.081031322479248, "reward_std": 0.015673616901040077, "rewards/accuracy_reward": 0.8810312747955322, "rewards/format_reward": 1.0, "step": 1893 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.34375, "epoch": 0.01912165572942958, "grad_norm": 2.6087704496599726, "kl": 0.0703125, "learning_rate": 9.99098096366601e-07, "loss": 0.0028, "reward": 1.8711562156677246, "reward_std": 0.09646978974342346, "rewards/accuracy_reward": 0.7211562395095825, "rewards/format_reward": 1.0, "step": 1894 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 385.9375, "epoch": 0.019131751640585563, "grad_norm": 4.892553063361088, "kl": 0.072265625, "learning_rate": 9.990971440222197e-07, "loss": 0.0029, "reward": 1.8292499780654907, "reward_std": 0.025346623733639717, "rewards/accuracy_reward": 0.6855000257492065, "rewards/format_reward": 1.0, "step": 1895 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 378.9375, "epoch": 0.019141847551741545, "grad_norm": 4.262282106213346, "kl": 0.0888671875, "learning_rate": 9.990961911757555e-07, "loss": 0.0035, "reward": 2.1480000019073486, "reward_std": 0.014494506642222404, "rewards/accuracy_reward": 0.9479999542236328, "rewards/format_reward": 1.0, "step": 1896 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.03125, "epoch": 0.019151943462897527, "grad_norm": 4.056068516068446, "kl": 0.08544921875, "learning_rate": 9.990952378272082e-07, "loss": 0.0034, "reward": 2.081031322479248, "reward_std": 0.014977836981415749, "rewards/accuracy_reward": 0.8810312747955322, "rewards/format_reward": 1.0, "step": 1897 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 370.34375, "epoch": 0.01916203937405351, "grad_norm": 1.840147791789812, "kl": 0.0673828125, "learning_rate": 9.990942839765799e-07, "loss": 0.0027, "reward": 1.8183748722076416, "reward_std": 0.012712161988019943, "rewards/accuracy_reward": 0.6683750152587891, "rewards/format_reward": 1.0, "step": 1898 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.46875, "epoch": 0.01917213528520949, "grad_norm": 3.5034061901643594, "kl": 0.07763671875, "learning_rate": 9.990933296238708e-07, "loss": 0.0031, "reward": 2.0024688243865967, "reward_std": 0.036277398467063904, "rewards/accuracy_reward": 0.8149687051773071, "rewards/format_reward": 1.0, "step": 1899 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.875, "epoch": 0.019182231196365473, "grad_norm": 2.4098330103313867, "kl": 0.06982421875, "learning_rate": 9.990923747690822e-07, "loss": 0.0028, "reward": 2.148624897003174, "reward_std": 0.033987924456596375, "rewards/accuracy_reward": 0.9611249566078186, "rewards/format_reward": 1.0, "step": 1900 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.375, "epoch": 0.019192327107521452, "grad_norm": 2.187259502202632, "kl": 0.068359375, "learning_rate": 9.990914194122151e-07, "loss": 0.0027, "reward": 1.9454689025878906, "reward_std": 0.18403849005699158, "rewards/accuracy_reward": 0.77671879529953, "rewards/format_reward": 1.0, "step": 1901 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 365.375, "epoch": 0.019202423018677434, "grad_norm": 3.6349910944477566, "kl": 0.08349609375, "learning_rate": 9.990904635532702e-07, "loss": 0.0033, "reward": 2.0145936012268066, "reward_std": 0.04308151826262474, "rewards/accuracy_reward": 0.827093780040741, "rewards/format_reward": 1.0, "step": 1902 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.019212518929833416, "grad_norm": 2.3472251675270006, "kl": 0.0751953125, "learning_rate": 9.990895071922486e-07, "loss": 0.003, "reward": 2.1053125858306885, "reward_std": 0.02574066072702408, "rewards/accuracy_reward": 0.9115625619888306, "rewards/format_reward": 1.0, "step": 1903 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.96875, "epoch": 0.0192226148409894, "grad_norm": 3.265484714319161, "kl": 0.078125, "learning_rate": 9.990885503291514e-07, "loss": 0.0031, "reward": 2.1029999256134033, "reward_std": 0.03947175294160843, "rewards/accuracy_reward": 0.9154999852180481, "rewards/format_reward": 1.0, "step": 1904 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.78125, "epoch": 0.01923271075214538, "grad_norm": 3.5777689504118837, "kl": 0.07421875, "learning_rate": 9.990875929639792e-07, "loss": 0.003, "reward": 2.031968593597412, "reward_std": 0.017578106373548508, "rewards/accuracy_reward": 0.8319686651229858, "rewards/format_reward": 1.0, "step": 1905 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.90625, "epoch": 0.019242806663301363, "grad_norm": 5.093840363399622, "kl": 0.0712890625, "learning_rate": 9.990866350967335e-07, "loss": 0.0028, "reward": 2.1175310611724854, "reward_std": 0.018222851678729057, "rewards/accuracy_reward": 0.9175313115119934, "rewards/format_reward": 1.0, "step": 1906 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.15625, "epoch": 0.019252902574457345, "grad_norm": 2.1707536217346703, "kl": 0.076171875, "learning_rate": 9.990856767274148e-07, "loss": 0.003, "reward": 1.9188437461853027, "reward_std": 0.03591049462556839, "rewards/accuracy_reward": 0.7250937223434448, "rewards/format_reward": 1.0, "step": 1907 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.90625, "epoch": 0.019262998485613327, "grad_norm": 2.446517438416342, "kl": 0.08056640625, "learning_rate": 9.990847178560242e-07, "loss": 0.0032, "reward": 2.0743751525878906, "reward_std": 0.008065243251621723, "rewards/accuracy_reward": 0.87437504529953, "rewards/format_reward": 1.0, "step": 1908 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.59375, "epoch": 0.01927309439676931, "grad_norm": 2.3552057466566403, "kl": 0.0849609375, "learning_rate": 9.99083758482563e-07, "loss": 0.0034, "reward": 2.076812267303467, "reward_std": 0.01506664790213108, "rewards/accuracy_reward": 0.8768125176429749, "rewards/format_reward": 1.0, "step": 1909 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.375, "epoch": 0.01928319030792529, "grad_norm": 2.683948148520874, "kl": 0.06982421875, "learning_rate": 9.990827986070314e-07, "loss": 0.0028, "reward": 1.8235000371932983, "reward_std": 0.007316797971725464, "rewards/accuracy_reward": 0.6735000014305115, "rewards/format_reward": 1.0, "step": 1910 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.90625, "epoch": 0.019293286219081274, "grad_norm": 3.4023511148359704, "kl": 0.06640625, "learning_rate": 9.99081838229431e-07, "loss": 0.0026, "reward": 1.8427188396453857, "reward_std": 0.10262349247932434, "rewards/accuracy_reward": 0.6927187442779541, "rewards/format_reward": 1.0, "step": 1911 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.0, "epoch": 0.019303382130237252, "grad_norm": 17.50603451110715, "kl": 0.06884765625, "learning_rate": 9.990808773497629e-07, "loss": 0.0028, "reward": 1.9265937805175781, "reward_std": 0.19313524663448334, "rewards/accuracy_reward": 0.7578436732292175, "rewards/format_reward": 1.0, "step": 1912 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.9375, "epoch": 0.019313478041393235, "grad_norm": 2.0836041185145477, "kl": 0.06396484375, "learning_rate": 9.990799159680271e-07, "loss": 0.0026, "reward": 2.0399062633514404, "reward_std": 0.025812046602368355, "rewards/accuracy_reward": 0.8461562395095825, "rewards/format_reward": 1.0, "step": 1913 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.9375, "epoch": 0.019323573952549217, "grad_norm": 1.904425160793415, "kl": 0.0732421875, "learning_rate": 9.990789540842258e-07, "loss": 0.0029, "reward": 2.1698436737060547, "reward_std": 0.010347455739974976, "rewards/accuracy_reward": 0.9698436856269836, "rewards/format_reward": 1.0, "step": 1914 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.40625, "epoch": 0.0193336698637052, "grad_norm": 2.1429536694015416, "kl": 0.06982421875, "learning_rate": 9.990779916983592e-07, "loss": 0.0028, "reward": 2.1391563415527344, "reward_std": 0.013617593795061111, "rewards/accuracy_reward": 0.9391562938690186, "rewards/format_reward": 1.0, "step": 1915 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.28125, "epoch": 0.01934376577486118, "grad_norm": 2.96716757939773, "kl": 0.076171875, "learning_rate": 9.990770288104284e-07, "loss": 0.003, "reward": 2.044187545776367, "reward_std": 0.02008627913892269, "rewards/accuracy_reward": 0.8441874980926514, "rewards/format_reward": 1.0, "step": 1916 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.019353861686017163, "grad_norm": 1.8016330534669882, "kl": 0.0751953125, "learning_rate": 9.990760654204345e-07, "loss": 0.003, "reward": 1.98256254196167, "reward_std": 0.16366326808929443, "rewards/accuracy_reward": 0.8138124942779541, "rewards/format_reward": 1.0, "step": 1917 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 426.96875, "epoch": 0.019363957597173145, "grad_norm": 1.7177158269153945, "kl": 0.0732421875, "learning_rate": 9.990751015283782e-07, "loss": 0.0029, "reward": 2.0057501792907715, "reward_std": 0.1437106430530548, "rewards/accuracy_reward": 0.8182500004768372, "rewards/format_reward": 1.0, "step": 1918 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.59375, "epoch": 0.019374053508329128, "grad_norm": 2.0194452427478695, "kl": 0.0732421875, "learning_rate": 9.99074137134261e-07, "loss": 0.0029, "reward": 1.7701563835144043, "reward_std": 0.014718519523739815, "rewards/accuracy_reward": 0.6201562285423279, "rewards/format_reward": 1.0, "step": 1919 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.03125, "epoch": 0.01938414941948511, "grad_norm": 2.834665856549103, "kl": 0.08251953125, "learning_rate": 9.990731722380834e-07, "loss": 0.0033, "reward": 2.055593729019165, "reward_std": 0.028351103886961937, "rewards/accuracy_reward": 0.8618437051773071, "rewards/format_reward": 1.0, "step": 1920 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.019394245330641092, "grad_norm": 4.728539596728679, "kl": 0.07177734375, "learning_rate": 9.990722068398466e-07, "loss": 0.0029, "reward": 2.08803129196167, "reward_std": 0.013509277254343033, "rewards/accuracy_reward": 0.8880312442779541, "rewards/format_reward": 1.0, "step": 1921 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.09375, "epoch": 0.01940434124179707, "grad_norm": 1.643570871618473, "kl": 0.0625, "learning_rate": 9.990712409395511e-07, "loss": 0.0025, "reward": 2.099250078201294, "reward_std": 0.21169719099998474, "rewards/accuracy_reward": 0.9242500066757202, "rewards/format_reward": 1.0, "step": 1922 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.09375, "epoch": 0.019414437152953053, "grad_norm": 1.0513722503494092, "kl": 0.0673828125, "learning_rate": 9.990702745371985e-07, "loss": 0.0027, "reward": 1.8622498512268066, "reward_std": 0.002492846455425024, "rewards/accuracy_reward": 0.7122500538825989, "rewards/format_reward": 1.0, "step": 1923 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.90625, "epoch": 0.019424533064109035, "grad_norm": 84.07405992732569, "kl": 0.06591796875, "learning_rate": 9.990693076327896e-07, "loss": 0.0026, "reward": 2.1280312538146973, "reward_std": 0.03841710835695267, "rewards/accuracy_reward": 0.940531313419342, "rewards/format_reward": 1.0, "step": 1924 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.09375, "epoch": 0.019434628975265017, "grad_norm": 2.651812510341064, "kl": 0.07177734375, "learning_rate": 9.990683402263253e-07, "loss": 0.0029, "reward": 2.0926876068115234, "reward_std": 0.014064518734812737, "rewards/accuracy_reward": 0.8926874399185181, "rewards/format_reward": 1.0, "step": 1925 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.59375, "epoch": 0.019444724886421, "grad_norm": 3.588863769971299, "kl": 0.0732421875, "learning_rate": 9.990673723178065e-07, "loss": 0.0029, "reward": 1.808500051498413, "reward_std": 0.007439322303980589, "rewards/accuracy_reward": 0.6585000157356262, "rewards/format_reward": 1.0, "step": 1926 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.28125, "epoch": 0.01945482079757698, "grad_norm": 3.435941095781619, "kl": 0.0732421875, "learning_rate": 9.990664039072342e-07, "loss": 0.0029, "reward": 1.8360313177108765, "reward_std": 0.01832827739417553, "rewards/accuracy_reward": 0.6860312819480896, "rewards/format_reward": 1.0, "step": 1927 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.78125, "epoch": 0.019464916708732963, "grad_norm": 2.9011554617561397, "kl": 0.07421875, "learning_rate": 9.990654349946095e-07, "loss": 0.003, "reward": 2.1259377002716064, "reward_std": 0.03177288919687271, "rewards/accuracy_reward": 0.9321874380111694, "rewards/format_reward": 1.0, "step": 1928 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 0.019475012619888946, "grad_norm": 2.222638666710367, "kl": 0.06689453125, "learning_rate": 9.990644655799332e-07, "loss": 0.0026, "reward": 2.0855093002319336, "reward_std": 0.02831733226776123, "rewards/accuracy_reward": 0.8917593955993652, "rewards/format_reward": 1.0, "step": 1929 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.40625, "epoch": 0.019485108531044928, "grad_norm": 3.075151482916567, "kl": 0.0673828125, "learning_rate": 9.990634956632064e-07, "loss": 0.0027, "reward": 2.0030624866485596, "reward_std": 0.17613890767097473, "rewards/accuracy_reward": 0.8218125104904175, "rewards/format_reward": 1.0, "step": 1930 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.15625, "epoch": 0.01949520444220091, "grad_norm": 2.8614629910777474, "kl": 0.0830078125, "learning_rate": 9.9906252524443e-07, "loss": 0.0033, "reward": 2.051874876022339, "reward_std": 0.025833984836935997, "rewards/accuracy_reward": 0.8581249713897705, "rewards/format_reward": 1.0, "step": 1931 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.84375, "epoch": 0.019505300353356892, "grad_norm": 2.740086012849363, "kl": 0.0732421875, "learning_rate": 9.99061554323605e-07, "loss": 0.0029, "reward": 2.0779685974121094, "reward_std": 0.02894885279238224, "rewards/accuracy_reward": 0.8842187523841858, "rewards/format_reward": 1.0, "step": 1932 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.90625, "epoch": 0.01951539626451287, "grad_norm": 2.417147568097482, "kl": 0.07666015625, "learning_rate": 9.990605829007326e-07, "loss": 0.0031, "reward": 2.1085939407348633, "reward_std": 0.03128103166818619, "rewards/accuracy_reward": 0.9148436784744263, "rewards/format_reward": 1.0, "step": 1933 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.78125, "epoch": 0.019525492175668853, "grad_norm": 4.155976930921375, "kl": 0.08251953125, "learning_rate": 9.990596109758134e-07, "loss": 0.0033, "reward": 2.1230626106262207, "reward_std": 0.01582619920372963, "rewards/accuracy_reward": 0.9230625629425049, "rewards/format_reward": 1.0, "step": 1934 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.28125, "epoch": 0.019535588086824835, "grad_norm": 3.1113028881785003, "kl": 0.06591796875, "learning_rate": 9.990586385488483e-07, "loss": 0.0026, "reward": 2.071625232696533, "reward_std": 0.1358143389225006, "rewards/accuracy_reward": 0.8778749704360962, "rewards/format_reward": 1.0, "step": 1935 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.90625, "epoch": 0.019545683997980817, "grad_norm": 3.4133614794603675, "kl": 0.07763671875, "learning_rate": 9.99057665619839e-07, "loss": 0.0031, "reward": 2.1542186737060547, "reward_std": 0.0279111098498106, "rewards/accuracy_reward": 0.9604687690734863, "rewards/format_reward": 1.0, "step": 1936 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.84375, "epoch": 0.0195557799091368, "grad_norm": 5.21161913325279, "kl": 0.07275390625, "learning_rate": 9.990566921887858e-07, "loss": 0.0029, "reward": 2.063687324523926, "reward_std": 0.02324383333325386, "rewards/accuracy_reward": 0.8636875152587891, "rewards/format_reward": 1.0, "step": 1937 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.65625, "epoch": 0.01956587582029278, "grad_norm": 2.486302473575274, "kl": 0.08203125, "learning_rate": 9.9905571825569e-07, "loss": 0.0033, "reward": 1.9851250648498535, "reward_std": 0.04201429709792137, "rewards/accuracy_reward": 0.7976250052452087, "rewards/format_reward": 1.0, "step": 1938 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.625, "epoch": 0.019575971731448764, "grad_norm": 2.19722803049317, "kl": 0.05859375, "learning_rate": 9.990547438205523e-07, "loss": 0.0023, "reward": 1.8119688034057617, "reward_std": 0.24811212718486786, "rewards/accuracy_reward": 0.680718719959259, "rewards/format_reward": 1.0, "step": 1939 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.28125, "epoch": 0.019586067642604746, "grad_norm": 3.932645759182044, "kl": 0.08154296875, "learning_rate": 9.990537688833737e-07, "loss": 0.0033, "reward": 2.1154065132141113, "reward_std": 0.019938521087169647, "rewards/accuracy_reward": 0.9154062271118164, "rewards/format_reward": 1.0, "step": 1940 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 400.84375, "epoch": 0.019596163553760728, "grad_norm": 4.793270319525176, "kl": 0.07470703125, "learning_rate": 9.990527934441558e-07, "loss": 0.003, "reward": 1.8304376602172852, "reward_std": 0.008281940594315529, "rewards/accuracy_reward": 0.6804375052452087, "rewards/format_reward": 1.0, "step": 1941 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.625, "epoch": 0.01960625946491671, "grad_norm": 3.3975668735217144, "kl": 0.07373046875, "learning_rate": 9.990518175028986e-07, "loss": 0.003, "reward": 2.1148126125335693, "reward_std": 0.02699383720755577, "rewards/accuracy_reward": 0.9210624694824219, "rewards/format_reward": 1.0, "step": 1942 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.25, "epoch": 0.01961635537607269, "grad_norm": 2.099617748969788, "kl": 0.0712890625, "learning_rate": 9.99050841059604e-07, "loss": 0.0029, "reward": 2.159656524658203, "reward_std": 0.010155325755476952, "rewards/accuracy_reward": 0.9596562385559082, "rewards/format_reward": 1.0, "step": 1943 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.90625, "epoch": 0.01962645128722867, "grad_norm": 3.7878754250870124, "kl": 0.07421875, "learning_rate": 9.990498641142723e-07, "loss": 0.003, "reward": 2.0624375343322754, "reward_std": 0.03829731419682503, "rewards/accuracy_reward": 0.8811875581741333, "rewards/format_reward": 1.0, "step": 1944 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.125, "epoch": 0.019636547198384653, "grad_norm": 3.8770520114819056, "kl": 0.0791015625, "learning_rate": 9.990488866669048e-07, "loss": 0.0032, "reward": 2.0575623512268066, "reward_std": 0.00418261019513011, "rewards/accuracy_reward": 0.8575625419616699, "rewards/format_reward": 1.0, "step": 1945 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.875, "epoch": 0.019646643109540635, "grad_norm": 3.668017484852554, "kl": 0.06494140625, "learning_rate": 9.990479087175026e-07, "loss": 0.0026, "reward": 1.7331875562667847, "reward_std": 0.016445158049464226, "rewards/accuracy_reward": 0.583187460899353, "rewards/format_reward": 1.0, "step": 1946 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.25, "epoch": 0.019656739020696617, "grad_norm": 0.9886122293422731, "kl": 0.05224609375, "learning_rate": 9.990469302660665e-07, "loss": 0.0021, "reward": 1.8876030445098877, "reward_std": 0.009006709791719913, "rewards/accuracy_reward": 0.7376031279563904, "rewards/format_reward": 1.0, "step": 1947 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.71875, "epoch": 0.0196668349318526, "grad_norm": 6.457743142888691, "kl": 0.07421875, "learning_rate": 9.990459513125973e-07, "loss": 0.003, "reward": 1.8156561851501465, "reward_std": 0.16639156639575958, "rewards/accuracy_reward": 0.6531562805175781, "rewards/format_reward": 1.0, "step": 1948 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.9375, "epoch": 0.019676930843008582, "grad_norm": 2.657345338649127, "kl": 0.0908203125, "learning_rate": 9.990449718570964e-07, "loss": 0.0036, "reward": 2.085812568664551, "reward_std": 0.01647813990712166, "rewards/accuracy_reward": 0.885812520980835, "rewards/format_reward": 1.0, "step": 1949 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.625, "epoch": 0.019687026754164564, "grad_norm": 1.8997157275840897, "kl": 0.0732421875, "learning_rate": 9.990439918995643e-07, "loss": 0.0029, "reward": 2.135812520980835, "reward_std": 0.009969262406229973, "rewards/accuracy_reward": 0.9358124732971191, "rewards/format_reward": 1.0, "step": 1950 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.0, "epoch": 0.019697122665320546, "grad_norm": 3.0785220159544666, "kl": 0.0751953125, "learning_rate": 9.990430114400026e-07, "loss": 0.003, "reward": 2.125718832015991, "reward_std": 0.017356164753437042, "rewards/accuracy_reward": 0.9257187247276306, "rewards/format_reward": 1.0, "step": 1951 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.1875, "epoch": 0.01970721857647653, "grad_norm": 3.03037533017752, "kl": 0.06689453125, "learning_rate": 9.990420304784117e-07, "loss": 0.0027, "reward": 2.1393749713897705, "reward_std": 0.02038019336760044, "rewards/accuracy_reward": 0.9456250071525574, "rewards/format_reward": 1.0, "step": 1952 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.96875, "epoch": 0.01971731448763251, "grad_norm": 2.2704964045933806, "kl": 0.07275390625, "learning_rate": 9.990410490147928e-07, "loss": 0.0029, "reward": 2.114874839782715, "reward_std": 0.008627266623079777, "rewards/accuracy_reward": 0.9148750305175781, "rewards/format_reward": 1.0, "step": 1953 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.5625, "epoch": 0.01972741039878849, "grad_norm": 2.410234711585241, "kl": 0.09521484375, "learning_rate": 9.990400670491472e-07, "loss": 0.0038, "reward": 2.06640625, "reward_std": 0.01763783022761345, "rewards/accuracy_reward": 0.8664062023162842, "rewards/format_reward": 1.0, "step": 1954 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.46875, "epoch": 0.01973750630994447, "grad_norm": 2.0650113392619693, "kl": 0.080078125, "learning_rate": 9.990390845814754e-07, "loss": 0.0032, "reward": 2.1399998664855957, "reward_std": 0.023833170533180237, "rewards/accuracy_reward": 0.9462499618530273, "rewards/format_reward": 1.0, "step": 1955 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.34375, "epoch": 0.019747602221100453, "grad_norm": 3.556180129269004, "kl": 0.1005859375, "learning_rate": 9.990381016117788e-07, "loss": 0.004, "reward": 1.9521563053131104, "reward_std": 0.01057572290301323, "rewards/accuracy_reward": 0.7521562576293945, "rewards/format_reward": 1.0, "step": 1956 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.6875, "epoch": 0.019757698132256436, "grad_norm": 2.477695781724278, "kl": 0.05908203125, "learning_rate": 9.990371181400578e-07, "loss": 0.0024, "reward": 1.460843801498413, "reward_std": 0.301347553730011, "rewards/accuracy_reward": 0.37959375977516174, "rewards/format_reward": 1.0, "step": 1957 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.9375, "epoch": 0.019767794043412418, "grad_norm": 1.6733895670667234, "kl": 0.072265625, "learning_rate": 9.990361341663142e-07, "loss": 0.0029, "reward": 2.1572813987731934, "reward_std": 0.007571767084300518, "rewards/accuracy_reward": 0.957281231880188, "rewards/format_reward": 1.0, "step": 1958 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.875, "epoch": 0.0197778899545684, "grad_norm": 3.349558550722768, "kl": 0.0908203125, "learning_rate": 9.990351496905483e-07, "loss": 0.0036, "reward": 2.1299376487731934, "reward_std": 0.013743752613663673, "rewards/accuracy_reward": 0.929937481880188, "rewards/format_reward": 1.0, "step": 1959 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.019787985865724382, "grad_norm": 1.8170188157890572, "kl": 0.080078125, "learning_rate": 9.990341647127614e-07, "loss": 0.0032, "reward": 2.145312547683716, "reward_std": 0.01191069558262825, "rewards/accuracy_reward": 0.9453125596046448, "rewards/format_reward": 1.0, "step": 1960 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.46875, "epoch": 0.019798081776880364, "grad_norm": 3.663196614235526, "kl": 0.0908203125, "learning_rate": 9.990331792329545e-07, "loss": 0.0036, "reward": 1.9941563606262207, "reward_std": 0.015028061345219612, "rewards/accuracy_reward": 0.7941562533378601, "rewards/format_reward": 1.0, "step": 1961 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.5, "epoch": 0.019808177688036346, "grad_norm": 3.0295840403416685, "kl": 0.080078125, "learning_rate": 9.990321932511286e-07, "loss": 0.0032, "reward": 2.1194686889648438, "reward_std": 0.013717175461351871, "rewards/accuracy_reward": 0.9194687604904175, "rewards/format_reward": 1.0, "step": 1962 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.09375, "epoch": 0.01981827359919233, "grad_norm": 4.731954624278885, "kl": 0.07275390625, "learning_rate": 9.990312067672846e-07, "loss": 0.0029, "reward": 1.8314688205718994, "reward_std": 0.01354968547821045, "rewards/accuracy_reward": 0.6814687848091125, "rewards/format_reward": 1.0, "step": 1963 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.1875, "epoch": 0.019828369510348307, "grad_norm": 1.6400972957426594, "kl": 0.061279296875, "learning_rate": 9.990302197814234e-07, "loss": 0.0024, "reward": 1.8306562900543213, "reward_std": 0.0073362430557608604, "rewards/accuracy_reward": 0.6806562542915344, "rewards/format_reward": 1.0, "step": 1964 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 398.3125, "epoch": 0.01983846542150429, "grad_norm": 1.1644843929026476, "kl": 0.064453125, "learning_rate": 9.990292322935463e-07, "loss": 0.0026, "reward": 1.57421875, "reward_std": 0.0023506793659180403, "rewards/accuracy_reward": 0.4742187559604645, "rewards/format_reward": 1.0, "step": 1965 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.625, "epoch": 0.01984856133266027, "grad_norm": 5.608809952954283, "kl": 0.068359375, "learning_rate": 9.99028244303654e-07, "loss": 0.0027, "reward": 2.1649062633514404, "reward_std": 0.0072278911247849464, "rewards/accuracy_reward": 0.9649062156677246, "rewards/format_reward": 1.0, "step": 1966 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.8125, "epoch": 0.019858657243816254, "grad_norm": 31.060663132288585, "kl": 0.057861328125, "learning_rate": 9.990272558117476e-07, "loss": 0.0023, "reward": 2.1048126220703125, "reward_std": 0.15561699867248535, "rewards/accuracy_reward": 0.9298125505447388, "rewards/format_reward": 1.0, "step": 1967 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 376.8125, "epoch": 0.019868753154972236, "grad_norm": 1.7942660035319868, "kl": 0.0791015625, "learning_rate": 9.99026266817828e-07, "loss": 0.0032, "reward": 1.8805937767028809, "reward_std": 0.009519658982753754, "rewards/accuracy_reward": 0.730593740940094, "rewards/format_reward": 1.0, "step": 1968 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.53125, "epoch": 0.019878849066128218, "grad_norm": 6.76261645122694, "kl": 0.09619140625, "learning_rate": 9.990252773218964e-07, "loss": 0.0038, "reward": 2.1544690132141113, "reward_std": 0.046090178191661835, "rewards/accuracy_reward": 0.9669687151908875, "rewards/format_reward": 1.0, "step": 1969 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.03125, "epoch": 0.0198889449772842, "grad_norm": 2.895054554890502, "kl": 0.07763671875, "learning_rate": 9.990242873239539e-07, "loss": 0.0031, "reward": 1.7319375276565552, "reward_std": 0.02251330018043518, "rewards/accuracy_reward": 0.5819374918937683, "rewards/format_reward": 1.0, "step": 1970 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 382.75, "epoch": 0.019899040888440182, "grad_norm": 1.6491535308214087, "kl": 0.07958984375, "learning_rate": 9.99023296824001e-07, "loss": 0.0032, "reward": 1.7587813138961792, "reward_std": 0.02108052931725979, "rewards/accuracy_reward": 0.6150312423706055, "rewards/format_reward": 1.0, "step": 1971 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 384.21875, "epoch": 0.019909136799596164, "grad_norm": 1.4479340456822087, "kl": 0.07470703125, "learning_rate": 9.99022305822039e-07, "loss": 0.003, "reward": 1.8570001125335693, "reward_std": 0.02131773717701435, "rewards/accuracy_reward": 0.7132500410079956, "rewards/format_reward": 1.0, "step": 1972 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.75, "epoch": 0.019919232710752147, "grad_norm": 2.698210231100933, "kl": 0.0771484375, "learning_rate": 9.990213143180688e-07, "loss": 0.0031, "reward": 2.129687786102295, "reward_std": 0.03354129195213318, "rewards/accuracy_reward": 0.9359375238418579, "rewards/format_reward": 1.0, "step": 1973 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 360.78125, "epoch": 0.01992932862190813, "grad_norm": 3.6461304582893685, "kl": 0.0771484375, "learning_rate": 9.990203223120917e-07, "loss": 0.0031, "reward": 1.859874963760376, "reward_std": 0.03247683867812157, "rewards/accuracy_reward": 0.7223750352859497, "rewards/format_reward": 1.0, "step": 1974 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.90625, "epoch": 0.019939424533064107, "grad_norm": 31.653251290485976, "kl": 0.08740234375, "learning_rate": 9.990193298041085e-07, "loss": 0.0035, "reward": 2.1700313091278076, "reward_std": 0.010135479271411896, "rewards/accuracy_reward": 0.9700312614440918, "rewards/format_reward": 1.0, "step": 1975 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.375, "epoch": 0.01994952044422009, "grad_norm": 2.6877036797056273, "kl": 0.083984375, "learning_rate": 9.9901833679412e-07, "loss": 0.0034, "reward": 2.1264376640319824, "reward_std": 0.012021507136523724, "rewards/accuracy_reward": 0.926437497138977, "rewards/format_reward": 1.0, "step": 1976 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.0, "epoch": 0.019959616355376072, "grad_norm": 2.876537768657146, "kl": 0.0810546875, "learning_rate": 9.990173432821274e-07, "loss": 0.0032, "reward": 2.0618438720703125, "reward_std": 0.017928607761859894, "rewards/accuracy_reward": 0.8618437647819519, "rewards/format_reward": 1.0, "step": 1977 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.0, "epoch": 0.019969712266532054, "grad_norm": 2.148836119000721, "kl": 0.08447265625, "learning_rate": 9.990163492681317e-07, "loss": 0.0034, "reward": 2.020531177520752, "reward_std": 0.022939840331673622, "rewards/accuracy_reward": 0.8267812132835388, "rewards/format_reward": 1.0, "step": 1978 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 390.09375, "epoch": 0.019979808177688036, "grad_norm": 3.130777981736471, "kl": 0.076171875, "learning_rate": 9.990153547521338e-07, "loss": 0.003, "reward": 1.7369375228881836, "reward_std": 0.011657549068331718, "rewards/accuracy_reward": 0.5869375467300415, "rewards/format_reward": 1.0, "step": 1979 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.8125, "epoch": 0.019989904088844018, "grad_norm": 1.8783984734515187, "kl": 0.0751953125, "learning_rate": 9.990143597341349e-07, "loss": 0.003, "reward": 2.122468948364258, "reward_std": 0.02513699233531952, "rewards/accuracy_reward": 0.9287187457084656, "rewards/format_reward": 1.0, "step": 1980 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.125, "epoch": 0.02, "grad_norm": 2.9544868169136054, "kl": 0.08984375, "learning_rate": 9.990133642141357e-07, "loss": 0.0036, "reward": 2.0527501106262207, "reward_std": 0.020475327968597412, "rewards/accuracy_reward": 0.8527500629425049, "rewards/format_reward": 1.0, "step": 1981 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.21875, "epoch": 0.020010095911155983, "grad_norm": 2.2798072649183934, "kl": 0.07275390625, "learning_rate": 9.990123681921374e-07, "loss": 0.0029, "reward": 1.9568750858306885, "reward_std": 0.12250073254108429, "rewards/accuracy_reward": 0.7631250023841858, "rewards/format_reward": 1.0, "step": 1982 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.020020191822311965, "grad_norm": 2.782035607867346, "kl": 0.09619140625, "learning_rate": 9.990113716681412e-07, "loss": 0.0039, "reward": 2.112687587738037, "reward_std": 0.015724461525678635, "rewards/accuracy_reward": 0.9126875400543213, "rewards/format_reward": 1.0, "step": 1983 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.78125, "epoch": 0.020030287733467947, "grad_norm": 2.8775511792682495, "kl": 0.07421875, "learning_rate": 9.990103746421476e-07, "loss": 0.003, "reward": 2.159437656402588, "reward_std": 0.016079191118478775, "rewards/accuracy_reward": 0.9594374895095825, "rewards/format_reward": 1.0, "step": 1984 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.1875, "epoch": 0.02004038364462393, "grad_norm": 2.395465996454835, "kl": 0.0791015625, "learning_rate": 9.990093771141581e-07, "loss": 0.0032, "reward": 2.1215624809265137, "reward_std": 0.014879407361149788, "rewards/accuracy_reward": 0.9215625524520874, "rewards/format_reward": 1.0, "step": 1985 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.75, "epoch": 0.020050479555779908, "grad_norm": 2.608488305179106, "kl": 0.08349609375, "learning_rate": 9.990083790841734e-07, "loss": 0.0033, "reward": 2.020156145095825, "reward_std": 0.013298998586833477, "rewards/accuracy_reward": 0.8201562762260437, "rewards/format_reward": 1.0, "step": 1986 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.59375, "epoch": 0.02006057546693589, "grad_norm": 3.1180617934625405, "kl": 0.08984375, "learning_rate": 9.990073805521945e-07, "loss": 0.0036, "reward": 2.140500068664551, "reward_std": 0.016370464116334915, "rewards/accuracy_reward": 0.940500020980835, "rewards/format_reward": 1.0, "step": 1987 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.0625, "epoch": 0.020070671378091872, "grad_norm": 1.8554099702217937, "kl": 0.07373046875, "learning_rate": 9.990063815182226e-07, "loss": 0.003, "reward": 2.150750160217285, "reward_std": 0.010465890169143677, "rewards/accuracy_reward": 0.9507499933242798, "rewards/format_reward": 1.0, "step": 1988 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.5625, "epoch": 0.020080767289247854, "grad_norm": 4.724014926918523, "kl": 0.09423828125, "learning_rate": 9.990053819822586e-07, "loss": 0.0038, "reward": 2.087531328201294, "reward_std": 0.036988452076911926, "rewards/accuracy_reward": 0.8875312209129333, "rewards/format_reward": 1.0, "step": 1989 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.020090863200403836, "grad_norm": 20.7165464493838, "kl": 0.087890625, "learning_rate": 9.990043819443035e-07, "loss": 0.0035, "reward": 2.0470001697540283, "reward_std": 0.028242077678442, "rewards/accuracy_reward": 0.8470000624656677, "rewards/format_reward": 1.0, "step": 1990 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 422.375, "epoch": 0.02010095911155982, "grad_norm": 1.0620197685327681, "kl": 0.0625, "learning_rate": 9.99003381404358e-07, "loss": 0.0025, "reward": 1.5715625286102295, "reward_std": 0.006469364743679762, "rewards/accuracy_reward": 0.4715625047683716, "rewards/format_reward": 1.0, "step": 1991 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.0201110550227158, "grad_norm": 2.3015020814617206, "kl": 0.072265625, "learning_rate": 9.990023803624238e-07, "loss": 0.0029, "reward": 2.1129062175750732, "reward_std": 0.007519492879509926, "rewards/accuracy_reward": 0.9129061698913574, "rewards/format_reward": 1.0, "step": 1992 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 0.020121150933871783, "grad_norm": 3.3270520118772775, "kl": 0.07958984375, "learning_rate": 9.990013788185015e-07, "loss": 0.0032, "reward": 1.8775625228881836, "reward_std": 0.01735202595591545, "rewards/accuracy_reward": 0.6775624752044678, "rewards/format_reward": 1.0, "step": 1993 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.6875, "epoch": 0.020131246845027765, "grad_norm": 1.8979045510612749, "kl": 0.08203125, "learning_rate": 9.99000376772592e-07, "loss": 0.0033, "reward": 2.1799373626708984, "reward_std": 0.00884217582643032, "rewards/accuracy_reward": 0.9799374938011169, "rewards/format_reward": 1.0, "step": 1994 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 422.3125, "epoch": 0.020141342756183747, "grad_norm": 4.705610721332827, "kl": 0.0634765625, "learning_rate": 9.989993742246964e-07, "loss": 0.0025, "reward": 1.7422500848770142, "reward_std": 0.1326545923948288, "rewards/accuracy_reward": 0.6047499775886536, "rewards/format_reward": 1.0, "step": 1995 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.75, "epoch": 0.020151438667339726, "grad_norm": 14.035011361346339, "kl": 0.08056640625, "learning_rate": 9.98998371174816e-07, "loss": 0.0032, "reward": 1.9358437061309814, "reward_std": 0.02146998420357704, "rewards/accuracy_reward": 0.7358438372612, "rewards/format_reward": 1.0, "step": 1996 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.78125, "epoch": 0.020161534578495708, "grad_norm": 2.923481112516065, "kl": 0.07666015625, "learning_rate": 9.989973676229512e-07, "loss": 0.0031, "reward": 1.8441874980926514, "reward_std": 0.025737863034009933, "rewards/accuracy_reward": 0.7004375457763672, "rewards/format_reward": 1.0, "step": 1997 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 0.02017163048965169, "grad_norm": 3.845422881215062, "kl": 0.08935546875, "learning_rate": 9.989963635691036e-07, "loss": 0.0036, "reward": 2.0377187728881836, "reward_std": 0.026152949780225754, "rewards/accuracy_reward": 0.8377187252044678, "rewards/format_reward": 1.0, "step": 1998 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.53125, "epoch": 0.020181726400807672, "grad_norm": 3.7916670721903816, "kl": 0.0791015625, "learning_rate": 9.98995359013274e-07, "loss": 0.0032, "reward": 1.8158749341964722, "reward_std": 0.016739649698138237, "rewards/accuracy_reward": 0.6658750176429749, "rewards/format_reward": 1.0, "step": 1999 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.625, "epoch": 0.020191822311963654, "grad_norm": 2.8617840788765947, "kl": 0.07666015625, "learning_rate": 9.989943539554634e-07, "loss": 0.0031, "reward": 1.9051563739776611, "reward_std": 0.17394836246967316, "rewards/accuracy_reward": 0.7489062547683716, "rewards/format_reward": 1.0, "step": 2000 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.5, "epoch": 0.020201918223119637, "grad_norm": 5.335973704284112, "kl": 0.0888671875, "learning_rate": 9.989933483956727e-07, "loss": 0.0036, "reward": 2.101687431335449, "reward_std": 0.018748462200164795, "rewards/accuracy_reward": 0.901687502861023, "rewards/format_reward": 1.0, "step": 2001 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.02021201413427562, "grad_norm": 4.984281433101182, "kl": 0.07373046875, "learning_rate": 9.98992342333903e-07, "loss": 0.003, "reward": 2.111375093460083, "reward_std": 0.017180925235152245, "rewards/accuracy_reward": 0.9113749861717224, "rewards/format_reward": 1.0, "step": 2002 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.90625, "epoch": 0.0202221100454316, "grad_norm": 2.350561431157563, "kl": 0.08203125, "learning_rate": 9.989913357701553e-07, "loss": 0.0033, "reward": 2.0142812728881836, "reward_std": 0.017096608877182007, "rewards/accuracy_reward": 0.8142812252044678, "rewards/format_reward": 1.0, "step": 2003 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.5, "epoch": 0.020232205956587583, "grad_norm": 3.594596700835705, "kl": 0.08056640625, "learning_rate": 9.989903287044307e-07, "loss": 0.0032, "reward": 2.025343894958496, "reward_std": 0.03221556544303894, "rewards/accuracy_reward": 0.8253437280654907, "rewards/format_reward": 1.0, "step": 2004 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.03125, "epoch": 0.020242301867743565, "grad_norm": 2.466363489927528, "kl": 0.0927734375, "learning_rate": 9.9898932113673e-07, "loss": 0.0037, "reward": 2.143249988555908, "reward_std": 0.013729519210755825, "rewards/accuracy_reward": 0.9432500004768372, "rewards/format_reward": 1.0, "step": 2005 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.5625, "epoch": 0.020252397778899547, "grad_norm": 3.0039385732867276, "kl": 0.0703125, "learning_rate": 9.989883130670545e-07, "loss": 0.0028, "reward": 1.9971874952316284, "reward_std": 0.2679932117462158, "rewards/accuracy_reward": 0.8221874833106995, "rewards/format_reward": 1.0, "step": 2006 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 395.65625, "epoch": 0.020262493690055526, "grad_norm": 2.680937935477676, "kl": 0.0703125, "learning_rate": 9.98987304495405e-07, "loss": 0.0028, "reward": 1.879906415939331, "reward_std": 0.017996082082390785, "rewards/accuracy_reward": 0.7299063205718994, "rewards/format_reward": 1.0, "step": 2007 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.03125, "epoch": 0.020272589601211508, "grad_norm": 1.3549582383253052, "kl": 0.06298828125, "learning_rate": 9.989862954217826e-07, "loss": 0.0025, "reward": 2.105781316757202, "reward_std": 0.03905763477087021, "rewards/accuracy_reward": 0.9182812571525574, "rewards/format_reward": 1.0, "step": 2008 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.0, "epoch": 0.02028268551236749, "grad_norm": 2.85491750124472, "kl": 0.07861328125, "learning_rate": 9.989852858461884e-07, "loss": 0.0032, "reward": 1.9254062175750732, "reward_std": 0.16987478733062744, "rewards/accuracy_reward": 0.7566562294960022, "rewards/format_reward": 1.0, "step": 2009 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.53125, "epoch": 0.020292781423523473, "grad_norm": 2.3014377973200233, "kl": 0.0693359375, "learning_rate": 9.989842757686232e-07, "loss": 0.0028, "reward": 1.8290624618530273, "reward_std": 0.02272520773112774, "rewards/accuracy_reward": 0.6853125095367432, "rewards/format_reward": 1.0, "step": 2010 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.0625, "epoch": 0.020302877334679455, "grad_norm": 2.5517174731956174, "kl": 0.083984375, "learning_rate": 9.98983265189088e-07, "loss": 0.0034, "reward": 2.1495625972747803, "reward_std": 0.034690842032432556, "rewards/accuracy_reward": 0.9558125138282776, "rewards/format_reward": 1.0, "step": 2011 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 387.125, "epoch": 0.020312973245835437, "grad_norm": 5.580126380886107, "kl": 0.07958984375, "learning_rate": 9.989822541075843e-07, "loss": 0.0032, "reward": 1.7134063243865967, "reward_std": 0.034709200263023376, "rewards/accuracy_reward": 0.5696563124656677, "rewards/format_reward": 1.0, "step": 2012 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.125, "epoch": 0.02032306915699142, "grad_norm": 2.413630677528888, "kl": 0.06640625, "learning_rate": 9.989812425241127e-07, "loss": 0.0027, "reward": 1.7567499876022339, "reward_std": 0.11037341505289078, "rewards/accuracy_reward": 0.6129999756813049, "rewards/format_reward": 1.0, "step": 2013 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.09375, "epoch": 0.0203331650681474, "grad_norm": 2.560614893765111, "kl": 0.07177734375, "learning_rate": 9.989802304386743e-07, "loss": 0.0029, "reward": 1.7615625858306885, "reward_std": 0.16454461216926575, "rewards/accuracy_reward": 0.6303125023841858, "rewards/format_reward": 1.0, "step": 2014 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 386.90625, "epoch": 0.020343260979303383, "grad_norm": 35.576775512761046, "kl": 0.0634765625, "learning_rate": 9.9897921785127e-07, "loss": 0.0025, "reward": 1.870437502861023, "reward_std": 0.03536522388458252, "rewards/accuracy_reward": 0.7329374551773071, "rewards/format_reward": 1.0, "step": 2015 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 381.25, "epoch": 0.020353356890459365, "grad_norm": 2.975208082730544, "kl": 0.06982421875, "learning_rate": 9.989782047619009e-07, "loss": 0.0028, "reward": 2.0138750076293945, "reward_std": 0.03372287377715111, "rewards/accuracy_reward": 0.8138750195503235, "rewards/format_reward": 1.0, "step": 2016 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.59375, "epoch": 0.020363452801615344, "grad_norm": 2.2949837119536043, "kl": 0.0703125, "learning_rate": 9.98977191170568e-07, "loss": 0.0028, "reward": 1.8415937423706055, "reward_std": 0.025358060374855995, "rewards/accuracy_reward": 0.6915937662124634, "rewards/format_reward": 1.0, "step": 2017 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 382.4375, "epoch": 0.020373548712771326, "grad_norm": 2.4760016321030216, "kl": 0.06689453125, "learning_rate": 9.989761770772725e-07, "loss": 0.0027, "reward": 1.7726249694824219, "reward_std": 0.005401181988418102, "rewards/accuracy_reward": 0.6226250529289246, "rewards/format_reward": 1.0, "step": 2018 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.125, "epoch": 0.02038364462392731, "grad_norm": 2.765811284688597, "kl": 0.07763671875, "learning_rate": 9.98975162482015e-07, "loss": 0.0031, "reward": 2.1103436946868896, "reward_std": 0.02155698463320732, "rewards/accuracy_reward": 0.9103437066078186, "rewards/format_reward": 1.0, "step": 2019 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.84375, "epoch": 0.02039374053508329, "grad_norm": 4.096403832712707, "kl": 0.0849609375, "learning_rate": 9.98974147384797e-07, "loss": 0.0034, "reward": 2.0, "reward_std": 0.03194475919008255, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 2020 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.625, "epoch": 0.020403836446239273, "grad_norm": 3.6303282516935202, "kl": 0.083984375, "learning_rate": 9.989731317856194e-07, "loss": 0.0034, "reward": 2.113499879837036, "reward_std": 0.04804852232336998, "rewards/accuracy_reward": 0.9259999990463257, "rewards/format_reward": 1.0, "step": 2021 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.8125, "epoch": 0.020413932357395255, "grad_norm": 4.157528209951982, "kl": 0.08203125, "learning_rate": 9.98972115684483e-07, "loss": 0.0033, "reward": 2.022125005722046, "reward_std": 0.01702837646007538, "rewards/accuracy_reward": 0.8221250176429749, "rewards/format_reward": 1.0, "step": 2022 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.28125, "epoch": 0.020424028268551237, "grad_norm": 0.9794081328554318, "kl": 0.0625, "learning_rate": 9.98971099081389e-07, "loss": 0.0025, "reward": 1.9105000495910645, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.7605000138282776, "rewards/format_reward": 1.0, "step": 2023 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.28125, "epoch": 0.02043412417970722, "grad_norm": 2.6673061441296078, "kl": 0.09326171875, "learning_rate": 9.989700819763385e-07, "loss": 0.0037, "reward": 2.0116875171661377, "reward_std": 0.01581251621246338, "rewards/accuracy_reward": 0.8116875290870667, "rewards/format_reward": 1.0, "step": 2024 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.0625, "epoch": 0.0204442200908632, "grad_norm": 1.7791652220738357, "kl": 0.07763671875, "learning_rate": 9.989690643693325e-07, "loss": 0.0031, "reward": 2.071812629699707, "reward_std": 0.007987920194864273, "rewards/accuracy_reward": 0.8718124628067017, "rewards/format_reward": 1.0, "step": 2025 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.375, "epoch": 0.020454316002019184, "grad_norm": 1.600280927993615, "kl": 0.06884765625, "learning_rate": 9.989680462603718e-07, "loss": 0.0028, "reward": 2.1464688777923584, "reward_std": 0.10809893161058426, "rewards/accuracy_reward": 0.9527187347412109, "rewards/format_reward": 1.0, "step": 2026 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.78125, "epoch": 0.020464411913175166, "grad_norm": 1.672946876202943, "kl": 0.06982421875, "learning_rate": 9.989670276494574e-07, "loss": 0.0028, "reward": 1.8575936555862427, "reward_std": 0.00510430708527565, "rewards/accuracy_reward": 0.7075937390327454, "rewards/format_reward": 1.0, "step": 2027 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 423.6875, "epoch": 0.020474507824331144, "grad_norm": 2.0418889804630327, "kl": 0.057861328125, "learning_rate": 9.989660085365907e-07, "loss": 0.0023, "reward": 1.8713124990463257, "reward_std": 0.005480065941810608, "rewards/accuracy_reward": 0.7213124632835388, "rewards/format_reward": 1.0, "step": 2028 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.020484603735487127, "grad_norm": 3.711937205688598, "kl": 0.0830078125, "learning_rate": 9.989649889217723e-07, "loss": 0.0033, "reward": 2.118875026702881, "reward_std": 0.02627306431531906, "rewards/accuracy_reward": 0.9188750386238098, "rewards/format_reward": 1.0, "step": 2029 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.9375, "epoch": 0.02049469964664311, "grad_norm": 2.4947963912158415, "kl": 0.0703125, "learning_rate": 9.989639688050038e-07, "loss": 0.0028, "reward": 1.7455625534057617, "reward_std": 0.026316137984395027, "rewards/accuracy_reward": 0.6018125414848328, "rewards/format_reward": 1.0, "step": 2030 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.03125, "epoch": 0.02050479555779909, "grad_norm": 1.5897044889889278, "kl": 0.05615234375, "learning_rate": 9.989629481862856e-07, "loss": 0.0022, "reward": 1.7229688167572021, "reward_std": 0.1690257340669632, "rewards/accuracy_reward": 0.5979687571525574, "rewards/format_reward": 1.0, "step": 2031 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 394.90625, "epoch": 0.020514891468955073, "grad_norm": 6.216933860221115, "kl": 0.0751953125, "learning_rate": 9.989619270656192e-07, "loss": 0.003, "reward": 1.7331249713897705, "reward_std": 0.01739508844912052, "rewards/accuracy_reward": 0.5831249952316284, "rewards/format_reward": 1.0, "step": 2032 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.84375, "epoch": 0.020524987380111055, "grad_norm": 2.5684441870301935, "kl": 0.078125, "learning_rate": 9.98960905443005e-07, "loss": 0.0031, "reward": 2.1019999980926514, "reward_std": 0.0087264534085989, "rewards/accuracy_reward": 0.9019999504089355, "rewards/format_reward": 1.0, "step": 2033 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.78125, "epoch": 0.020535083291267037, "grad_norm": 2.2158662458440768, "kl": 0.07275390625, "learning_rate": 9.989598833184447e-07, "loss": 0.0029, "reward": 1.8128125667572021, "reward_std": 0.025998611003160477, "rewards/accuracy_reward": 0.6690624952316284, "rewards/format_reward": 1.0, "step": 2034 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.21875, "epoch": 0.02054517920242302, "grad_norm": 1.6230066761529145, "kl": 0.076171875, "learning_rate": 9.989588606919392e-07, "loss": 0.0031, "reward": 2.105562686920166, "reward_std": 0.026437396183609962, "rewards/accuracy_reward": 0.9118124842643738, "rewards/format_reward": 1.0, "step": 2035 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.34375, "epoch": 0.020555275113579, "grad_norm": 3.7717000247091774, "kl": 0.068359375, "learning_rate": 9.989578375634892e-07, "loss": 0.0027, "reward": 1.7811250686645508, "reward_std": 0.01557660661637783, "rewards/accuracy_reward": 0.6311249732971191, "rewards/format_reward": 1.0, "step": 2036 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.1875, "epoch": 0.020565371024734984, "grad_norm": 4.619600924894214, "kl": 0.07373046875, "learning_rate": 9.98956813933096e-07, "loss": 0.003, "reward": 1.8454062938690186, "reward_std": 0.01431332528591156, "rewards/accuracy_reward": 0.6954061985015869, "rewards/format_reward": 1.0, "step": 2037 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.34375, "epoch": 0.020575466935890963, "grad_norm": 2.089981209046951, "kl": 0.076171875, "learning_rate": 9.989557898007605e-07, "loss": 0.003, "reward": 2.135499954223633, "reward_std": 0.008041337132453918, "rewards/accuracy_reward": 0.9355000257492065, "rewards/format_reward": 1.0, "step": 2038 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.46875, "epoch": 0.020585562847046945, "grad_norm": 1.13041547118946, "kl": 0.07568359375, "learning_rate": 9.989547651664841e-07, "loss": 0.003, "reward": 2.091625213623047, "reward_std": 0.021023370325565338, "rewards/accuracy_reward": 0.8978750109672546, "rewards/format_reward": 1.0, "step": 2039 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.21875, "epoch": 0.020595658758202927, "grad_norm": 2.8566836719693005, "kl": 0.0771484375, "learning_rate": 9.989537400302672e-07, "loss": 0.0031, "reward": 2.1383438110351562, "reward_std": 0.03432636708021164, "rewards/accuracy_reward": 0.9445937871932983, "rewards/format_reward": 1.0, "step": 2040 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 406.65625, "epoch": 0.02060575466935891, "grad_norm": 1.5389206010312009, "kl": 0.08056640625, "learning_rate": 9.989527143921113e-07, "loss": 0.0032, "reward": 1.8553438186645508, "reward_std": 0.007489971816539764, "rewards/accuracy_reward": 0.7053437232971191, "rewards/format_reward": 1.0, "step": 2041 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.03125, "epoch": 0.02061585058051489, "grad_norm": 4.347189935526206, "kl": 0.08349609375, "learning_rate": 9.98951688252017e-07, "loss": 0.0033, "reward": 2.0725936889648438, "reward_std": 0.02830960787832737, "rewards/accuracy_reward": 0.8788437843322754, "rewards/format_reward": 1.0, "step": 2042 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.5, "epoch": 0.020625946491670873, "grad_norm": 2.9616547702405227, "kl": 0.07470703125, "learning_rate": 9.98950661609986e-07, "loss": 0.003, "reward": 2.156125068664551, "reward_std": 0.010667011141777039, "rewards/accuracy_reward": 0.956125020980835, "rewards/format_reward": 1.0, "step": 2043 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 427.0625, "epoch": 0.020636042402826855, "grad_norm": 1.6280297135718762, "kl": 0.060302734375, "learning_rate": 9.989496344660188e-07, "loss": 0.0024, "reward": 1.8606873750686646, "reward_std": 0.03328666090965271, "rewards/accuracy_reward": 0.7231874465942383, "rewards/format_reward": 1.0, "step": 2044 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 406.375, "epoch": 0.020646138313982838, "grad_norm": 14.301918153198692, "kl": 0.07373046875, "learning_rate": 9.989486068201166e-07, "loss": 0.003, "reward": 1.8754687309265137, "reward_std": 0.0074773747473955154, "rewards/accuracy_reward": 0.7254687547683716, "rewards/format_reward": 1.0, "step": 2045 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.02065623422513882, "grad_norm": 3.891805934778106, "kl": 0.07470703125, "learning_rate": 9.989475786722804e-07, "loss": 0.003, "reward": 1.8551251888275146, "reward_std": 0.10117879509925842, "rewards/accuracy_reward": 0.7051250338554382, "rewards/format_reward": 1.0, "step": 2046 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.65625, "epoch": 0.020666330136294802, "grad_norm": 0.813048363449424, "kl": 0.05419921875, "learning_rate": 9.989465500225111e-07, "loss": 0.0022, "reward": 2.1202502250671387, "reward_std": 0.023145517334342003, "rewards/accuracy_reward": 0.9327499866485596, "rewards/format_reward": 1.0, "step": 2047 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.020676426047450784, "grad_norm": 2.1376477676724033, "kl": 0.06640625, "learning_rate": 9.9894552087081e-07, "loss": 0.0027, "reward": 1.9060626029968262, "reward_std": 0.16958832740783691, "rewards/accuracy_reward": 0.7373125553131104, "rewards/format_reward": 1.0, "step": 2048 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.09375, "epoch": 0.020686521958606763, "grad_norm": 6.669976121358341, "kl": 0.078125, "learning_rate": 9.989444912171782e-07, "loss": 0.0031, "reward": 2.142125129699707, "reward_std": 0.022655898705124855, "rewards/accuracy_reward": 0.9421249628067017, "rewards/format_reward": 1.0, "step": 2049 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 408.53125, "epoch": 0.020696617869762745, "grad_norm": 4.250867989619068, "kl": 0.07080078125, "learning_rate": 9.989434610616162e-07, "loss": 0.0028, "reward": 1.8206250667572021, "reward_std": 0.035100750625133514, "rewards/accuracy_reward": 0.6768749952316284, "rewards/format_reward": 1.0, "step": 2050 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.5, "epoch": 0.020706713780918727, "grad_norm": 2.912842215712259, "kl": 0.08447265625, "learning_rate": 9.989424304041258e-07, "loss": 0.0034, "reward": 2.026531219482422, "reward_std": 0.02076500840485096, "rewards/accuracy_reward": 0.8265312314033508, "rewards/format_reward": 1.0, "step": 2051 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.75, "epoch": 0.02071680969207471, "grad_norm": 8.876562002446502, "kl": 0.07861328125, "learning_rate": 9.989413992447074e-07, "loss": 0.0031, "reward": 2.0366873741149902, "reward_std": 0.028219230473041534, "rewards/accuracy_reward": 0.8366875052452087, "rewards/format_reward": 1.0, "step": 2052 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.4375, "epoch": 0.02072690560323069, "grad_norm": 4.88537929311398, "kl": 0.07275390625, "learning_rate": 9.989403675833623e-07, "loss": 0.0029, "reward": 2.1265313625335693, "reward_std": 0.029935281723737717, "rewards/accuracy_reward": 0.9265312552452087, "rewards/format_reward": 1.0, "step": 2053 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.28125, "epoch": 0.020737001514386674, "grad_norm": 3.4271438607623406, "kl": 0.08056640625, "learning_rate": 9.989393354200914e-07, "loss": 0.0032, "reward": 2.1102187633514404, "reward_std": 0.016028478741645813, "rewards/accuracy_reward": 0.9102187156677246, "rewards/format_reward": 1.0, "step": 2054 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 397.875, "epoch": 0.020747097425542656, "grad_norm": 3.222323826109745, "kl": 0.055419921875, "learning_rate": 9.989383027548962e-07, "loss": 0.0022, "reward": 1.5875000953674316, "reward_std": 0.01336308941245079, "rewards/accuracy_reward": 0.48750001192092896, "rewards/format_reward": 1.0, "step": 2055 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.40625, "epoch": 0.020757193336698638, "grad_norm": 2.7764305921837638, "kl": 0.06591796875, "learning_rate": 9.98937269587777e-07, "loss": 0.0026, "reward": 1.8301249742507935, "reward_std": 0.02171158790588379, "rewards/accuracy_reward": 0.6801249980926514, "rewards/format_reward": 1.0, "step": 2056 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.09375, "epoch": 0.02076728924785462, "grad_norm": 2.171388606878805, "kl": 0.08056640625, "learning_rate": 9.989362359187353e-07, "loss": 0.0032, "reward": 2.047374963760376, "reward_std": 0.024511966854333878, "rewards/accuracy_reward": 0.8536249399185181, "rewards/format_reward": 1.0, "step": 2057 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 415.8125, "epoch": 0.020777385159010602, "grad_norm": 1.837152742103255, "kl": 0.05859375, "learning_rate": 9.98935201747772e-07, "loss": 0.0023, "reward": 1.8686561584472656, "reward_std": 0.007262636441737413, "rewards/accuracy_reward": 0.7186563014984131, "rewards/format_reward": 1.0, "step": 2058 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.02078748107016658, "grad_norm": 3.2992790723400995, "kl": 0.06689453125, "learning_rate": 9.989341670748886e-07, "loss": 0.0027, "reward": 2.1124062538146973, "reward_std": 0.04271622747182846, "rewards/accuracy_reward": 0.9311562776565552, "rewards/format_reward": 1.0, "step": 2059 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.03125, "epoch": 0.020797576981322563, "grad_norm": 2.311988533300744, "kl": 0.0693359375, "learning_rate": 9.989331319000853e-07, "loss": 0.0028, "reward": 1.747843861579895, "reward_std": 0.02190137654542923, "rewards/accuracy_reward": 0.5978437662124634, "rewards/format_reward": 1.0, "step": 2060 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 418.65625, "epoch": 0.020807672892478545, "grad_norm": 1.5904847933614157, "kl": 0.058349609375, "learning_rate": 9.98932096223364e-07, "loss": 0.0023, "reward": 1.543562412261963, "reward_std": 0.010834148153662682, "rewards/accuracy_reward": 0.44356250762939453, "rewards/format_reward": 1.0, "step": 2061 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.020817768803634527, "grad_norm": 2.247681667616463, "kl": 0.0693359375, "learning_rate": 9.98931060044725e-07, "loss": 0.0028, "reward": 2.1743438243865967, "reward_std": 0.02551322430372238, "rewards/accuracy_reward": 0.980593740940094, "rewards/format_reward": 1.0, "step": 2062 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.625, "epoch": 0.02082786471479051, "grad_norm": 4.229500966095026, "kl": 0.064453125, "learning_rate": 9.989300233641696e-07, "loss": 0.0026, "reward": 2.0429062843322754, "reward_std": 0.01803898625075817, "rewards/accuracy_reward": 0.8429062366485596, "rewards/format_reward": 1.0, "step": 2063 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.28125, "epoch": 0.02083796062594649, "grad_norm": 6.431388488806165, "kl": 0.0771484375, "learning_rate": 9.989289861816991e-07, "loss": 0.0031, "reward": 2.1327500343322754, "reward_std": 0.01492675393819809, "rewards/accuracy_reward": 0.9327499866485596, "rewards/format_reward": 1.0, "step": 2064 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.020848056537102474, "grad_norm": 2.0527117650295286, "kl": 0.06982421875, "learning_rate": 9.989279484973143e-07, "loss": 0.0028, "reward": 2.1299686431884766, "reward_std": 0.03393946960568428, "rewards/accuracy_reward": 0.9424687623977661, "rewards/format_reward": 1.0, "step": 2065 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.8125, "epoch": 0.020858152448258456, "grad_norm": 14.905595530555686, "kl": 0.07568359375, "learning_rate": 9.989269103110162e-07, "loss": 0.003, "reward": 2.1065001487731934, "reward_std": 0.02708832547068596, "rewards/accuracy_reward": 0.9065001010894775, "rewards/format_reward": 1.0, "step": 2066 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.5625, "epoch": 0.020868248359414438, "grad_norm": 2.4335514730301377, "kl": 0.076171875, "learning_rate": 9.989258716228062e-07, "loss": 0.0031, "reward": 2.141031265258789, "reward_std": 0.016965975984930992, "rewards/accuracy_reward": 0.9410312175750732, "rewards/format_reward": 1.0, "step": 2067 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.8125, "epoch": 0.02087834427057042, "grad_norm": 2.855526055743474, "kl": 0.06396484375, "learning_rate": 9.98924832432685e-07, "loss": 0.0026, "reward": 2.099562644958496, "reward_std": 0.15087297558784485, "rewards/accuracy_reward": 0.9120625257492065, "rewards/format_reward": 1.0, "step": 2068 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.9375, "epoch": 0.020888440181726402, "grad_norm": 1.1187532537743878, "kl": 0.0576171875, "learning_rate": 9.989237927406534e-07, "loss": 0.0023, "reward": 2.1919374465942383, "reward_std": 0.002499109134078026, "rewards/accuracy_reward": 0.9919375777244568, "rewards/format_reward": 1.0, "step": 2069 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.21875, "epoch": 0.02089853609288238, "grad_norm": 4.358501545022062, "kl": 0.0849609375, "learning_rate": 9.989227525467132e-07, "loss": 0.0034, "reward": 2.10575008392334, "reward_std": 0.03513549268245697, "rewards/accuracy_reward": 0.9120000600814819, "rewards/format_reward": 1.0, "step": 2070 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.6875, "epoch": 0.020908632004038363, "grad_norm": 2.837021086901192, "kl": 0.06591796875, "learning_rate": 9.989217118508649e-07, "loss": 0.0026, "reward": 2.0188751220703125, "reward_std": 0.11270163208246231, "rewards/accuracy_reward": 0.8376250267028809, "rewards/format_reward": 1.0, "step": 2071 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.25, "epoch": 0.020918727915194345, "grad_norm": 1.965708437099337, "kl": 0.06982421875, "learning_rate": 9.989206706531097e-07, "loss": 0.0028, "reward": 1.8208438158035278, "reward_std": 0.03004186972975731, "rewards/accuracy_reward": 0.683343768119812, "rewards/format_reward": 1.0, "step": 2072 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.25, "epoch": 0.020928823826350328, "grad_norm": 1.8974600651480356, "kl": 0.051025390625, "learning_rate": 9.989196289534486e-07, "loss": 0.002, "reward": 2.1395626068115234, "reward_std": 0.034630194306373596, "rewards/accuracy_reward": 0.9520624876022339, "rewards/format_reward": 1.0, "step": 2073 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.5, "epoch": 0.02093891973750631, "grad_norm": 4.458389203177858, "kl": 0.06396484375, "learning_rate": 9.989185867518825e-07, "loss": 0.0025, "reward": 1.8099374771118164, "reward_std": 0.012992609292268753, "rewards/accuracy_reward": 0.6599375009536743, "rewards/format_reward": 1.0, "step": 2074 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.020949015648662292, "grad_norm": 3.0650856756269844, "kl": 0.078125, "learning_rate": 9.989175440484128e-07, "loss": 0.0031, "reward": 2.0659687519073486, "reward_std": 0.11804667115211487, "rewards/accuracy_reward": 0.8784687519073486, "rewards/format_reward": 1.0, "step": 2075 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 0.020959111559818274, "grad_norm": 5.819923913226857, "kl": 0.068359375, "learning_rate": 9.989165008430403e-07, "loss": 0.0027, "reward": 2.0927813053131104, "reward_std": 0.0062811728566884995, "rewards/accuracy_reward": 0.8927812576293945, "rewards/format_reward": 1.0, "step": 2076 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.0625, "epoch": 0.020969207470974256, "grad_norm": 4.57582724031969, "kl": 0.08447265625, "learning_rate": 9.989154571357662e-07, "loss": 0.0034, "reward": 2.0947811603546143, "reward_std": 0.019105494022369385, "rewards/accuracy_reward": 0.894781231880188, "rewards/format_reward": 1.0, "step": 2077 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.875, "epoch": 0.02097930338213024, "grad_norm": 3.489707152376873, "kl": 0.06689453125, "learning_rate": 9.989144129265914e-07, "loss": 0.0027, "reward": 2.092656373977661, "reward_std": 0.020435474812984467, "rewards/accuracy_reward": 0.8926562070846558, "rewards/format_reward": 1.0, "step": 2078 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 433.4375, "epoch": 0.02098939929328622, "grad_norm": 1.5132839309909765, "kl": 0.0556640625, "learning_rate": 9.98913368215517e-07, "loss": 0.0022, "reward": 1.6073437929153442, "reward_std": 0.09149058908224106, "rewards/accuracy_reward": 0.5073437690734863, "rewards/format_reward": 1.0, "step": 2079 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.75, "epoch": 0.0209994952044422, "grad_norm": 4.669791653790019, "kl": 0.080078125, "learning_rate": 9.98912323002544e-07, "loss": 0.0032, "reward": 1.8407187461853027, "reward_std": 0.011993227526545525, "rewards/accuracy_reward": 0.6907187700271606, "rewards/format_reward": 1.0, "step": 2080 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.0, "epoch": 0.02100959111559818, "grad_norm": 4.087548804725039, "kl": 0.06982421875, "learning_rate": 9.989112772876738e-07, "loss": 0.0028, "reward": 1.6419687271118164, "reward_std": 0.16933177411556244, "rewards/accuracy_reward": 0.5169687271118164, "rewards/format_reward": 1.0, "step": 2081 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 406.8125, "epoch": 0.021019687026754164, "grad_norm": 1.069252671351915, "kl": 0.05712890625, "learning_rate": 9.98910231070907e-07, "loss": 0.0023, "reward": 1.59375, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2082 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 419.125, "epoch": 0.021029782937910146, "grad_norm": 3.8528222507580976, "kl": 0.0595703125, "learning_rate": 9.989091843522447e-07, "loss": 0.0024, "reward": 1.5104687213897705, "reward_std": 0.02042592316865921, "rewards/accuracy_reward": 0.4104687571525574, "rewards/format_reward": 1.0, "step": 2083 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.375, "epoch": 0.021039878849066128, "grad_norm": 1.9742754422505555, "kl": 0.05712890625, "learning_rate": 9.989081371316883e-07, "loss": 0.0023, "reward": 2.1054375171661377, "reward_std": 0.01041790097951889, "rewards/accuracy_reward": 0.9054374694824219, "rewards/format_reward": 1.0, "step": 2084 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.125, "epoch": 0.02104997476022211, "grad_norm": 3.5108104733717544, "kl": 0.08154296875, "learning_rate": 9.989070894092383e-07, "loss": 0.0033, "reward": 2.1302499771118164, "reward_std": 0.0343012735247612, "rewards/accuracy_reward": 0.9365000128746033, "rewards/format_reward": 1.0, "step": 2085 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.3125, "epoch": 0.021060070671378092, "grad_norm": 1.9776694489363975, "kl": 0.0791015625, "learning_rate": 9.989060411848964e-07, "loss": 0.0032, "reward": 1.8154374361038208, "reward_std": 0.016085587441921234, "rewards/accuracy_reward": 0.6654375195503235, "rewards/format_reward": 1.0, "step": 2086 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 436.375, "epoch": 0.021070166582534074, "grad_norm": 2.5387351591933536, "kl": 0.0537109375, "learning_rate": 9.989049924586632e-07, "loss": 0.0021, "reward": 1.84765625, "reward_std": 0.011491648852825165, "rewards/accuracy_reward": 0.6976562142372131, "rewards/format_reward": 1.0, "step": 2087 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.021080262493690057, "grad_norm": 2.4392712443130744, "kl": 0.08056640625, "learning_rate": 9.9890394323054e-07, "loss": 0.0032, "reward": 2.085625171661377, "reward_std": 0.037642426788806915, "rewards/accuracy_reward": 0.8918749094009399, "rewards/format_reward": 1.0, "step": 2088 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.5, "epoch": 0.02109035840484604, "grad_norm": 2.180741373286503, "kl": 0.078125, "learning_rate": 9.989028935005277e-07, "loss": 0.0031, "reward": 2.1030311584472656, "reward_std": 0.017089948058128357, "rewards/accuracy_reward": 0.9030312299728394, "rewards/format_reward": 1.0, "step": 2089 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.40625, "epoch": 0.02110045431600202, "grad_norm": 1.9999704840695107, "kl": 0.0732421875, "learning_rate": 9.989018432686273e-07, "loss": 0.0029, "reward": 2.0831563472747803, "reward_std": 0.006969203241169453, "rewards/accuracy_reward": 0.8831562995910645, "rewards/format_reward": 1.0, "step": 2090 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 437.0, "epoch": 0.021110550227158, "grad_norm": 2.426630165338106, "kl": 0.06298828125, "learning_rate": 9.9890079253484e-07, "loss": 0.0025, "reward": 1.8276875019073486, "reward_std": 0.03426753729581833, "rewards/accuracy_reward": 0.6776874661445618, "rewards/format_reward": 1.0, "step": 2091 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.375, "epoch": 0.02112064613831398, "grad_norm": 3.968141512790259, "kl": 0.07373046875, "learning_rate": 9.988997412991668e-07, "loss": 0.0029, "reward": 2.0345001220703125, "reward_std": 0.012878724373877048, "rewards/accuracy_reward": 0.8345000147819519, "rewards/format_reward": 1.0, "step": 2092 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 439.625, "epoch": 0.021130742049469964, "grad_norm": 1.9152645073790773, "kl": 0.0673828125, "learning_rate": 9.98898689561609e-07, "loss": 0.0027, "reward": 1.7746562957763672, "reward_std": 0.012056681327521801, "rewards/accuracy_reward": 0.6246562600135803, "rewards/format_reward": 1.0, "step": 2093 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 432.34375, "epoch": 0.021140837960625946, "grad_norm": 3.7510227276225656, "kl": 0.07666015625, "learning_rate": 9.988976373221672e-07, "loss": 0.0031, "reward": 2.148312568664551, "reward_std": 0.029411911964416504, "rewards/accuracy_reward": 0.9545624852180481, "rewards/format_reward": 1.0, "step": 2094 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 434.21875, "epoch": 0.021150933871781928, "grad_norm": 16.388694102333297, "kl": 0.07763671875, "learning_rate": 9.988965845808429e-07, "loss": 0.0031, "reward": 2.1542811393737793, "reward_std": 0.04150727391242981, "rewards/accuracy_reward": 0.9667812585830688, "rewards/format_reward": 1.0, "step": 2095 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.40625, "epoch": 0.02116102978293791, "grad_norm": 1.9788027001040966, "kl": 0.0771484375, "learning_rate": 9.988955313376365e-07, "loss": 0.0031, "reward": 2.1680312156677246, "reward_std": 0.008054003119468689, "rewards/accuracy_reward": 0.9680312871932983, "rewards/format_reward": 1.0, "step": 2096 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.34375, "epoch": 0.021171125694093892, "grad_norm": 1.642275906277221, "kl": 0.07568359375, "learning_rate": 9.9889447759255e-07, "loss": 0.003, "reward": 2.1015312671661377, "reward_std": 0.02692592889070511, "rewards/accuracy_reward": 0.9077812433242798, "rewards/format_reward": 1.0, "step": 2097 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 448.1875, "epoch": 0.021181221605249875, "grad_norm": 3.5091790974622454, "kl": 0.0810546875, "learning_rate": 9.988934233455838e-07, "loss": 0.0032, "reward": 2.0226874351501465, "reward_std": 0.022537050768733025, "rewards/accuracy_reward": 0.8226875066757202, "rewards/format_reward": 1.0, "step": 2098 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 448.78125, "epoch": 0.021191317516405857, "grad_norm": 1.853497507469735, "kl": 0.072265625, "learning_rate": 9.988923685967389e-07, "loss": 0.0029, "reward": 2.161343574523926, "reward_std": 0.01991986110806465, "rewards/accuracy_reward": 0.9675938487052917, "rewards/format_reward": 1.0, "step": 2099 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 413.75, "epoch": 0.02120141342756184, "grad_norm": 0.11377606756317131, "kl": 0.054931640625, "learning_rate": 9.98891313346017e-07, "loss": 0.0022, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2100 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 453.15625, "epoch": 0.021211509338717818, "grad_norm": 2.333861695595406, "kl": 0.0732421875, "learning_rate": 9.988902575934186e-07, "loss": 0.0029, "reward": 2.1006875038146973, "reward_std": 0.05220644176006317, "rewards/accuracy_reward": 0.9194374680519104, "rewards/format_reward": 1.0, "step": 2101 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 447.46875, "epoch": 0.0212216052498738, "grad_norm": 0.9715564586526837, "kl": 0.0654296875, "learning_rate": 9.988892013389449e-07, "loss": 0.0026, "reward": 2.190093994140625, "reward_std": 0.003245690604671836, "rewards/accuracy_reward": 0.9900937080383301, "rewards/format_reward": 1.0, "step": 2102 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 432.875, "epoch": 0.021231701161029782, "grad_norm": 5.6314071395340815, "kl": 0.0703125, "learning_rate": 9.98888144582597e-07, "loss": 0.0028, "reward": 1.4880938529968262, "reward_std": 0.009956080466508865, "rewards/accuracy_reward": 0.3880937397480011, "rewards/format_reward": 1.0, "step": 2103 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 432.125, "epoch": 0.021241797072185764, "grad_norm": 3.135126392364352, "kl": 0.080078125, "learning_rate": 9.988870873243759e-07, "loss": 0.0032, "reward": 2.112281322479248, "reward_std": 0.017666341736912727, "rewards/accuracy_reward": 0.9122812151908875, "rewards/format_reward": 1.0, "step": 2104 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.125, "epoch": 0.021251892983341746, "grad_norm": 1.6836273029145497, "kl": 0.05078125, "learning_rate": 9.988860295642827e-07, "loss": 0.002, "reward": 2.090031147003174, "reward_std": 0.16384725272655487, "rewards/accuracy_reward": 0.9087812304496765, "rewards/format_reward": 1.0, "step": 2105 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 435.40625, "epoch": 0.02126198889449773, "grad_norm": 2.7052267924047073, "kl": 0.08544921875, "learning_rate": 9.988849713023182e-07, "loss": 0.0034, "reward": 1.9726563692092896, "reward_std": 0.027498995885252953, "rewards/accuracy_reward": 0.7726563215255737, "rewards/format_reward": 1.0, "step": 2106 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.1875, "epoch": 0.02127208480565371, "grad_norm": 4.009605971698855, "kl": 0.054931640625, "learning_rate": 9.988839125384842e-07, "loss": 0.0022, "reward": 2.1410937309265137, "reward_std": 0.0127406045794487, "rewards/accuracy_reward": 0.9410938024520874, "rewards/format_reward": 1.0, "step": 2107 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.59375, "epoch": 0.021282180716809693, "grad_norm": 2.8610928792724475, "kl": 0.08251953125, "learning_rate": 9.98882853272781e-07, "loss": 0.0033, "reward": 2.059999942779541, "reward_std": 0.023370906710624695, "rewards/accuracy_reward": 0.8600000143051147, "rewards/format_reward": 1.0, "step": 2108 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 435.875, "epoch": 0.021292276627965675, "grad_norm": 5.759060701777079, "kl": 0.06884765625, "learning_rate": 9.9888179350521e-07, "loss": 0.0028, "reward": 1.8133126497268677, "reward_std": 0.01686481013894081, "rewards/accuracy_reward": 0.6633124947547913, "rewards/format_reward": 1.0, "step": 2109 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.5, "epoch": 0.021302372539121657, "grad_norm": 2.908891572479038, "kl": 0.07568359375, "learning_rate": 9.988807332357724e-07, "loss": 0.003, "reward": 2.157531261444092, "reward_std": 0.04218391329050064, "rewards/accuracy_reward": 0.9700312614440918, "rewards/format_reward": 1.0, "step": 2110 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.59375, "epoch": 0.02131246845027764, "grad_norm": 1.6667879045118803, "kl": 0.07568359375, "learning_rate": 9.98879672464469e-07, "loss": 0.003, "reward": 2.044281244277954, "reward_std": 0.008205441758036613, "rewards/accuracy_reward": 0.8442812561988831, "rewards/format_reward": 1.0, "step": 2111 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.28125, "epoch": 0.021322564361433618, "grad_norm": 2.2005361367760914, "kl": 0.083984375, "learning_rate": 9.98878611191301e-07, "loss": 0.0033, "reward": 2.1679372787475586, "reward_std": 0.00866367295384407, "rewards/accuracy_reward": 0.9679375886917114, "rewards/format_reward": 1.0, "step": 2112 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.53125, "epoch": 0.0213326602725896, "grad_norm": 2.3831087738377508, "kl": 0.07080078125, "learning_rate": 9.988775494162693e-07, "loss": 0.0028, "reward": 2.110156297683716, "reward_std": 0.011370650492608547, "rewards/accuracy_reward": 0.91015625, "rewards/format_reward": 1.0, "step": 2113 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.21875, "epoch": 0.021342756183745582, "grad_norm": 2.104315265725207, "kl": 0.057373046875, "learning_rate": 9.988764871393753e-07, "loss": 0.0023, "reward": 2.170781135559082, "reward_std": 0.013326562009751797, "rewards/accuracy_reward": 0.9707812666893005, "rewards/format_reward": 1.0, "step": 2114 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.65625, "epoch": 0.021352852094901564, "grad_norm": 2.3655170036408806, "kl": 0.06640625, "learning_rate": 9.988754243606197e-07, "loss": 0.0026, "reward": 1.9393750429153442, "reward_std": 0.16900770366191864, "rewards/accuracy_reward": 0.7643750309944153, "rewards/format_reward": 1.0, "step": 2115 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.8125, "epoch": 0.021362948006057546, "grad_norm": 2.9671452305593653, "kl": 0.08203125, "learning_rate": 9.988743610800037e-07, "loss": 0.0033, "reward": 2.0247812271118164, "reward_std": 0.014850197359919548, "rewards/accuracy_reward": 0.8247812986373901, "rewards/format_reward": 1.0, "step": 2116 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 409.21875, "epoch": 0.02137304391721353, "grad_norm": 7.5399986713432785, "kl": 0.08203125, "learning_rate": 9.988732972975287e-07, "loss": 0.0033, "reward": 1.855718731880188, "reward_std": 0.008290504105389118, "rewards/accuracy_reward": 0.7057187557220459, "rewards/format_reward": 1.0, "step": 2117 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 395.25, "epoch": 0.02138313982836951, "grad_norm": 1.6322826035591929, "kl": 0.051513671875, "learning_rate": 9.988722330131951e-07, "loss": 0.0021, "reward": 1.891031265258789, "reward_std": 0.00864211656153202, "rewards/accuracy_reward": 0.7410312294960022, "rewards/format_reward": 1.0, "step": 2118 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.34375, "epoch": 0.021393235739525493, "grad_norm": 29.053591972326224, "kl": 0.07763671875, "learning_rate": 9.988711682270045e-07, "loss": 0.0031, "reward": 2.1471874713897705, "reward_std": 0.015050126239657402, "rewards/accuracy_reward": 0.9471874237060547, "rewards/format_reward": 1.0, "step": 2119 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 388.5, "epoch": 0.021403331650681475, "grad_norm": 0.9486239658736145, "kl": 0.06103515625, "learning_rate": 9.988701029389579e-07, "loss": 0.0024, "reward": 1.59375, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2120 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 385.5, "epoch": 0.021413427561837457, "grad_norm": 3.2771543191848878, "kl": 0.068359375, "learning_rate": 9.988690371490563e-07, "loss": 0.0027, "reward": 1.875156283378601, "reward_std": 0.01685749739408493, "rewards/accuracy_reward": 0.725156307220459, "rewards/format_reward": 1.0, "step": 2121 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 401.34375, "epoch": 0.021423523472993436, "grad_norm": 1.1125699606981128, "kl": 0.05615234375, "learning_rate": 9.988679708573006e-07, "loss": 0.0022, "reward": 1.8916873931884766, "reward_std": 0.0035775911528617144, "rewards/accuracy_reward": 0.7416874766349792, "rewards/format_reward": 1.0, "step": 2122 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.71875, "epoch": 0.021433619384149418, "grad_norm": 2.483742345456736, "kl": 0.07763671875, "learning_rate": 9.988669040636923e-07, "loss": 0.0031, "reward": 1.954062581062317, "reward_std": 0.016719717532396317, "rewards/accuracy_reward": 0.7540625333786011, "rewards/format_reward": 1.0, "step": 2123 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 402.84375, "epoch": 0.0214437152953054, "grad_norm": 2.771835545406739, "kl": 0.07080078125, "learning_rate": 9.98865836768232e-07, "loss": 0.0028, "reward": 1.5624375343322754, "reward_std": 0.022642547264695168, "rewards/accuracy_reward": 0.4686874747276306, "rewards/format_reward": 1.0, "step": 2124 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.3125, "epoch": 0.021453811206461382, "grad_norm": 2.147583858383804, "kl": 0.0673828125, "learning_rate": 9.98864768970921e-07, "loss": 0.0027, "reward": 2.089718818664551, "reward_std": 0.008346646092832088, "rewards/accuracy_reward": 0.889718770980835, "rewards/format_reward": 1.0, "step": 2125 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.28125, "epoch": 0.021463907117617365, "grad_norm": 4.348707750847445, "kl": 0.07080078125, "learning_rate": 9.988637006717604e-07, "loss": 0.0028, "reward": 2.1588125228881836, "reward_std": 0.02511058747768402, "rewards/accuracy_reward": 0.9588125348091125, "rewards/format_reward": 1.0, "step": 2126 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 399.5625, "epoch": 0.021474003028773347, "grad_norm": 1.99270454499038, "kl": 0.06005859375, "learning_rate": 9.98862631870751e-07, "loss": 0.0024, "reward": 1.6132500171661377, "reward_std": 0.15655067563056946, "rewards/accuracy_reward": 0.5069999694824219, "rewards/format_reward": 1.0, "step": 2127 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 404.40625, "epoch": 0.02148409893992933, "grad_norm": 0.10300060462534463, "kl": 0.059814453125, "learning_rate": 9.988615625678945e-07, "loss": 0.0024, "reward": 1.8375000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2128 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.75, "epoch": 0.02149419485108531, "grad_norm": 2.41924130787229, "kl": 0.0634765625, "learning_rate": 9.988604927631914e-07, "loss": 0.0025, "reward": 1.6874374151229858, "reward_std": 0.1686633825302124, "rewards/accuracy_reward": 0.5624375343322754, "rewards/format_reward": 1.0, "step": 2129 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.78125, "epoch": 0.021504290762241293, "grad_norm": 1.1078438455411173, "kl": 0.05615234375, "learning_rate": 9.98859422456643e-07, "loss": 0.0022, "reward": 2.174062490463257, "reward_std": 0.0033852325286716223, "rewards/accuracy_reward": 0.9740625023841858, "rewards/format_reward": 1.0, "step": 2130 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.25, "epoch": 0.021514386673397275, "grad_norm": 0.8332801320214501, "kl": 0.049072265625, "learning_rate": 9.988583516482503e-07, "loss": 0.002, "reward": 1.6749999523162842, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 2131 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 400.71875, "epoch": 0.021524482584553258, "grad_norm": 1.9403719151199281, "kl": 0.0771484375, "learning_rate": 9.988572803380143e-07, "loss": 0.0031, "reward": 1.8549375534057617, "reward_std": 0.009670395404100418, "rewards/accuracy_reward": 0.7049375176429749, "rewards/format_reward": 1.0, "step": 2132 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.46875, "epoch": 0.021534578495709236, "grad_norm": 3.332997321074731, "kl": 0.07080078125, "learning_rate": 9.988562085259364e-07, "loss": 0.0028, "reward": 2.133625030517578, "reward_std": 0.01564745232462883, "rewards/accuracy_reward": 0.9336250424385071, "rewards/format_reward": 1.0, "step": 2133 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.71875, "epoch": 0.02154467440686522, "grad_norm": 2.486009157033397, "kl": 0.07080078125, "learning_rate": 9.988551362120174e-07, "loss": 0.0028, "reward": 2.110062599182129, "reward_std": 0.014010213315486908, "rewards/accuracy_reward": 0.9100624918937683, "rewards/format_reward": 1.0, "step": 2134 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.0215547703180212, "grad_norm": 28.128043227153583, "kl": 0.08447265625, "learning_rate": 9.988540633962585e-07, "loss": 0.0034, "reward": 2.0480000972747803, "reward_std": 0.015089062973856926, "rewards/accuracy_reward": 0.8479999899864197, "rewards/format_reward": 1.0, "step": 2135 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.625, "epoch": 0.021564866229177183, "grad_norm": 1.4499688807041557, "kl": 0.049560546875, "learning_rate": 9.988529900786606e-07, "loss": 0.002, "reward": 1.9375, "reward_std": 0.3156214952468872, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2136 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.34375, "epoch": 0.021574962140333165, "grad_norm": 4.558782520335593, "kl": 0.0634765625, "learning_rate": 9.988519162592248e-07, "loss": 0.0025, "reward": 2.1172499656677246, "reward_std": 0.010924087837338448, "rewards/accuracy_reward": 0.9172500371932983, "rewards/format_reward": 1.0, "step": 2137 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 403.96875, "epoch": 0.021585058051489147, "grad_norm": 5.190046219797782, "kl": 0.06982421875, "learning_rate": 9.988508419379526e-07, "loss": 0.0028, "reward": 1.4736875295639038, "reward_std": 0.008256610482931137, "rewards/accuracy_reward": 0.3736875057220459, "rewards/format_reward": 1.0, "step": 2138 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.02159515396264513, "grad_norm": 7.297880398875295, "kl": 0.054443359375, "learning_rate": 9.988497671148446e-07, "loss": 0.0022, "reward": 2.0839061737060547, "reward_std": 0.00911066122353077, "rewards/accuracy_reward": 0.8839062452316284, "rewards/format_reward": 1.0, "step": 2139 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.46875, "epoch": 0.02160524987380111, "grad_norm": 2.0557346563093497, "kl": 0.07666015625, "learning_rate": 9.98848691789902e-07, "loss": 0.0031, "reward": 2.1261560916900635, "reward_std": 0.009490891359746456, "rewards/accuracy_reward": 0.9261562824249268, "rewards/format_reward": 1.0, "step": 2140 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.021615345784957093, "grad_norm": 1.9990573928461846, "kl": 0.064453125, "learning_rate": 9.988476159631261e-07, "loss": 0.0026, "reward": 2.1519689559936523, "reward_std": 0.011072540655732155, "rewards/accuracy_reward": 0.951968789100647, "rewards/format_reward": 1.0, "step": 2141 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.5625, "epoch": 0.021625441696113076, "grad_norm": 4.472461200131896, "kl": 0.0791015625, "learning_rate": 9.988465396345177e-07, "loss": 0.0032, "reward": 2.0740625858306885, "reward_std": 0.013767162337899208, "rewards/accuracy_reward": 0.8740625381469727, "rewards/format_reward": 1.0, "step": 2142 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.65625, "epoch": 0.021635537607269054, "grad_norm": 2.4974259174875257, "kl": 0.08251953125, "learning_rate": 9.988454628040778e-07, "loss": 0.0033, "reward": 2.122812509536743, "reward_std": 0.01530616357922554, "rewards/accuracy_reward": 0.9228124618530273, "rewards/format_reward": 1.0, "step": 2143 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.65625, "epoch": 0.021645633518425036, "grad_norm": 2.237427368610979, "kl": 0.07080078125, "learning_rate": 9.98844385471808e-07, "loss": 0.0028, "reward": 2.181062698364258, "reward_std": 0.022221136838197708, "rewards/accuracy_reward": 0.9873124957084656, "rewards/format_reward": 1.0, "step": 2144 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.71875, "epoch": 0.02165572942958102, "grad_norm": 1.7413056922834314, "kl": 0.062255859375, "learning_rate": 9.98843307637709e-07, "loss": 0.0025, "reward": 1.987375020980835, "reward_std": 0.1446191370487213, "rewards/accuracy_reward": 0.7998749613761902, "rewards/format_reward": 1.0, "step": 2145 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.15625, "epoch": 0.021665825340737, "grad_norm": 2.531948971840856, "kl": 0.06640625, "learning_rate": 9.988422293017818e-07, "loss": 0.0027, "reward": 1.7703437805175781, "reward_std": 0.01889433152973652, "rewards/accuracy_reward": 0.6203436851501465, "rewards/format_reward": 1.0, "step": 2146 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.90625, "epoch": 0.021675921251892983, "grad_norm": 1.8282085896591733, "kl": 0.080078125, "learning_rate": 9.988411504640278e-07, "loss": 0.0032, "reward": 2.1238436698913574, "reward_std": 0.023443862795829773, "rewards/accuracy_reward": 0.9300937652587891, "rewards/format_reward": 1.0, "step": 2147 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.15625, "epoch": 0.021686017163048965, "grad_norm": 3.647680108493076, "kl": 0.07763671875, "learning_rate": 9.988400711244478e-07, "loss": 0.0031, "reward": 2.0760936737060547, "reward_std": 0.02159261144697666, "rewards/accuracy_reward": 0.8760937452316284, "rewards/format_reward": 1.0, "step": 2148 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.25, "epoch": 0.021696113074204947, "grad_norm": 1.766255633476896, "kl": 0.06982421875, "learning_rate": 9.98838991283043e-07, "loss": 0.0028, "reward": 1.8493125438690186, "reward_std": 0.009341899305582047, "rewards/accuracy_reward": 0.6993124485015869, "rewards/format_reward": 1.0, "step": 2149 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.0, "epoch": 0.02170620898536093, "grad_norm": 6.4927622714293625, "kl": 0.06689453125, "learning_rate": 9.988379109398143e-07, "loss": 0.0027, "reward": 2.1150312423706055, "reward_std": 0.017403369769454002, "rewards/accuracy_reward": 0.9150312542915344, "rewards/format_reward": 1.0, "step": 2150 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.84375, "epoch": 0.02171630489651691, "grad_norm": 2.3238810704844948, "kl": 0.080078125, "learning_rate": 9.988368300947634e-07, "loss": 0.0032, "reward": 2.044468879699707, "reward_std": 0.042601875960826874, "rewards/accuracy_reward": 0.8569687604904175, "rewards/format_reward": 1.0, "step": 2151 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.46875, "epoch": 0.021726400807672894, "grad_norm": 1.7474438218028991, "kl": 0.07080078125, "learning_rate": 9.988357487478906e-07, "loss": 0.0028, "reward": 2.0985312461853027, "reward_std": 0.029783353209495544, "rewards/accuracy_reward": 0.9047812223434448, "rewards/format_reward": 1.0, "step": 2152 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 401.59375, "epoch": 0.021736496718828876, "grad_norm": 1.9547498323978691, "kl": 0.072265625, "learning_rate": 9.988346668991975e-07, "loss": 0.0029, "reward": 1.8737812042236328, "reward_std": 0.005318574607372284, "rewards/accuracy_reward": 0.7237812280654907, "rewards/format_reward": 1.0, "step": 2153 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.34375, "epoch": 0.021746592629984855, "grad_norm": 3.2132520613489874, "kl": 0.07373046875, "learning_rate": 9.988335845486852e-07, "loss": 0.0029, "reward": 2.115781307220459, "reward_std": 0.015234776772558689, "rewards/accuracy_reward": 0.9157812595367432, "rewards/format_reward": 1.0, "step": 2154 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 399.21875, "epoch": 0.021756688541140837, "grad_norm": 1.6292554567278157, "kl": 0.06640625, "learning_rate": 9.988325016963544e-07, "loss": 0.0027, "reward": 1.5623126029968262, "reward_std": 0.004449658561497927, "rewards/accuracy_reward": 0.4623124897480011, "rewards/format_reward": 1.0, "step": 2155 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.40625, "epoch": 0.02176678445229682, "grad_norm": 2.005810008413493, "kl": 0.07958984375, "learning_rate": 9.988314183422065e-07, "loss": 0.0032, "reward": 2.129593849182129, "reward_std": 0.006572524085640907, "rewards/accuracy_reward": 0.9295938014984131, "rewards/format_reward": 1.0, "step": 2156 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.875, "epoch": 0.0217768803634528, "grad_norm": 3.827655690988921, "kl": 0.07421875, "learning_rate": 9.988303344862424e-07, "loss": 0.003, "reward": 2.1635937690734863, "reward_std": 0.010148046538233757, "rewards/accuracy_reward": 0.9635937213897705, "rewards/format_reward": 1.0, "step": 2157 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.5625, "epoch": 0.021786976274608783, "grad_norm": 6.348496443525671, "kl": 0.07666015625, "learning_rate": 9.988292501284634e-07, "loss": 0.0031, "reward": 2.0956249237060547, "reward_std": 0.02131861262023449, "rewards/accuracy_reward": 0.8956249356269836, "rewards/format_reward": 1.0, "step": 2158 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.1875, "epoch": 0.021797072185764765, "grad_norm": 4.58026120100488, "kl": 0.076171875, "learning_rate": 9.988281652688706e-07, "loss": 0.003, "reward": 2.132031202316284, "reward_std": 0.01608177274465561, "rewards/accuracy_reward": 0.9320312738418579, "rewards/format_reward": 1.0, "step": 2159 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.59375, "epoch": 0.021807168096920748, "grad_norm": 2.69332107855197, "kl": 0.07275390625, "learning_rate": 9.988270799074649e-07, "loss": 0.0029, "reward": 2.063093662261963, "reward_std": 0.061694007366895676, "rewards/accuracy_reward": 0.8630937337875366, "rewards/format_reward": 1.0, "step": 2160 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.40625, "epoch": 0.02181726400807673, "grad_norm": 2.738952160297268, "kl": 0.07470703125, "learning_rate": 9.988259940442474e-07, "loss": 0.003, "reward": 2.0508437156677246, "reward_std": 0.02821531519293785, "rewards/accuracy_reward": 0.8508437275886536, "rewards/format_reward": 1.0, "step": 2161 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 431.5625, "epoch": 0.021827359919232712, "grad_norm": 9.27299960074919, "kl": 0.07421875, "learning_rate": 9.988249076792193e-07, "loss": 0.003, "reward": 2.11118745803833, "reward_std": 0.005204933695495129, "rewards/accuracy_reward": 0.9111875295639038, "rewards/format_reward": 1.0, "step": 2162 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.1875, "epoch": 0.021837455830388694, "grad_norm": 5.14475396067284, "kl": 0.080078125, "learning_rate": 9.988238208123818e-07, "loss": 0.0032, "reward": 2.12681245803833, "reward_std": 0.014727634377777576, "rewards/accuracy_reward": 0.926812469959259, "rewards/format_reward": 1.0, "step": 2163 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.53125, "epoch": 0.021847551741544676, "grad_norm": 3.638785321538025, "kl": 0.0634765625, "learning_rate": 9.988227334437356e-07, "loss": 0.0026, "reward": 2.1603751182556152, "reward_std": 0.010242786258459091, "rewards/accuracy_reward": 0.9603750109672546, "rewards/format_reward": 1.0, "step": 2164 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 431.34375, "epoch": 0.021857647652700655, "grad_norm": 2.4669846829451165, "kl": 0.07275390625, "learning_rate": 9.98821645573282e-07, "loss": 0.0029, "reward": 1.8816876411437988, "reward_std": 0.00552311260253191, "rewards/accuracy_reward": 0.7316874861717224, "rewards/format_reward": 1.0, "step": 2165 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 432.65625, "epoch": 0.021867743563856637, "grad_norm": 4.511437025842707, "kl": 0.060546875, "learning_rate": 9.988205572010225e-07, "loss": 0.0024, "reward": 1.654843807220459, "reward_std": 0.15823765099048615, "rewards/accuracy_reward": 0.5360937714576721, "rewards/format_reward": 1.0, "step": 2166 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 430.4375, "epoch": 0.02187783947501262, "grad_norm": 2.2160642352282713, "kl": 0.072265625, "learning_rate": 9.988194683269574e-07, "loss": 0.0029, "reward": 1.8565000295639038, "reward_std": 0.022554529830813408, "rewards/accuracy_reward": 0.7065000534057617, "rewards/format_reward": 1.0, "step": 2167 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 429.5625, "epoch": 0.0218879353861686, "grad_norm": 2.046142097732815, "kl": 0.06591796875, "learning_rate": 9.988183789510887e-07, "loss": 0.0026, "reward": 1.590499997138977, "reward_std": 0.005844710860401392, "rewards/accuracy_reward": 0.49049997329711914, "rewards/format_reward": 1.0, "step": 2168 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 439.375, "epoch": 0.021898031297324583, "grad_norm": 3.187560768930927, "kl": 0.06640625, "learning_rate": 9.988172890734166e-07, "loss": 0.0027, "reward": 2.131500005722046, "reward_std": 0.036621224135160446, "rewards/accuracy_reward": 0.937749981880188, "rewards/format_reward": 1.0, "step": 2169 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 452.25, "epoch": 0.021908127208480566, "grad_norm": 3.1979916164849973, "kl": 0.0712890625, "learning_rate": 9.98816198693943e-07, "loss": 0.0028, "reward": 1.9465000629425049, "reward_std": 0.1672196388244629, "rewards/accuracy_reward": 0.7777500152587891, "rewards/format_reward": 1.0, "step": 2170 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 439.21875, "epoch": 0.021918223119636548, "grad_norm": 4.6577953654072966, "kl": 0.07275390625, "learning_rate": 9.988151078126683e-07, "loss": 0.0029, "reward": 2.0727500915527344, "reward_std": 0.03876354172825813, "rewards/accuracy_reward": 0.8852499723434448, "rewards/format_reward": 1.0, "step": 2171 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.34375, "epoch": 0.02192831903079253, "grad_norm": 3.2492945965765814, "kl": 0.0791015625, "learning_rate": 9.98814016429594e-07, "loss": 0.0032, "reward": 2.0818123817443848, "reward_std": 0.009811054915189743, "rewards/accuracy_reward": 0.8818125128746033, "rewards/format_reward": 1.0, "step": 2172 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 447.9375, "epoch": 0.021938414941948512, "grad_norm": 22.10669027695238, "kl": 0.087890625, "learning_rate": 9.988129245447211e-07, "loss": 0.0035, "reward": 2.135312557220459, "reward_std": 0.009441627189517021, "rewards/accuracy_reward": 0.9353125095367432, "rewards/format_reward": 1.0, "step": 2173 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 431.78125, "epoch": 0.021948510853104494, "grad_norm": 2.53022937735552, "kl": 0.06396484375, "learning_rate": 9.988118321580509e-07, "loss": 0.0026, "reward": 2.087343692779541, "reward_std": 0.00872923992574215, "rewards/accuracy_reward": 0.88734370470047, "rewards/format_reward": 1.0, "step": 2174 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.0, "epoch": 0.021958606764260473, "grad_norm": 3.0494867757071025, "kl": 0.072265625, "learning_rate": 9.98810739269584e-07, "loss": 0.0029, "reward": 2.100062608718872, "reward_std": 0.020467426627874374, "rewards/accuracy_reward": 0.9000625014305115, "rewards/format_reward": 1.0, "step": 2175 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 455.65625, "epoch": 0.021968702675416455, "grad_norm": 2.400768873857394, "kl": 0.06982421875, "learning_rate": 9.98809645879322e-07, "loss": 0.0028, "reward": 1.9798123836517334, "reward_std": 0.16145059466362, "rewards/accuracy_reward": 0.817312479019165, "rewards/format_reward": 1.0, "step": 2176 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 445.78125, "epoch": 0.021978798586572437, "grad_norm": 2.5955066258918658, "kl": 0.076171875, "learning_rate": 9.988085519872657e-07, "loss": 0.0031, "reward": 2.1164376735687256, "reward_std": 0.0314079150557518, "rewards/accuracy_reward": 0.9226875305175781, "rewards/format_reward": 1.0, "step": 2177 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 440.34375, "epoch": 0.02198889449772842, "grad_norm": 3.374550864951992, "kl": 0.07421875, "learning_rate": 9.988074575934164e-07, "loss": 0.003, "reward": 2.0110623836517334, "reward_std": 0.0328667089343071, "rewards/accuracy_reward": 0.8173125386238098, "rewards/format_reward": 1.0, "step": 2178 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 448.5625, "epoch": 0.0219989904088844, "grad_norm": 2.3556129449297236, "kl": 0.068359375, "learning_rate": 9.988063626977752e-07, "loss": 0.0027, "reward": 2.1362812519073486, "reward_std": 0.04480232298374176, "rewards/accuracy_reward": 0.9425312280654907, "rewards/format_reward": 1.0, "step": 2179 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.1875, "epoch": 0.022009086320040384, "grad_norm": 2.8691254986783368, "kl": 0.0732421875, "learning_rate": 9.98805267300343e-07, "loss": 0.0029, "reward": 1.9997187852859497, "reward_std": 0.025641759857535362, "rewards/accuracy_reward": 0.8059687614440918, "rewards/format_reward": 1.0, "step": 2180 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 426.0, "epoch": 0.022019182231196366, "grad_norm": 3.632203223972468, "kl": 0.07763671875, "learning_rate": 9.98804171401121e-07, "loss": 0.0031, "reward": 1.7715938091278076, "reward_std": 0.011093231849372387, "rewards/accuracy_reward": 0.621593713760376, "rewards/format_reward": 1.0, "step": 2181 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.21875, "epoch": 0.022029278142352348, "grad_norm": 4.2157382803361845, "kl": 0.06787109375, "learning_rate": 9.988030750001104e-07, "loss": 0.0027, "reward": 1.886125087738037, "reward_std": 0.008126430213451385, "rewards/accuracy_reward": 0.7361249923706055, "rewards/format_reward": 1.0, "step": 2182 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.5625, "epoch": 0.02203937405350833, "grad_norm": 1.0788206980682553, "kl": 0.06591796875, "learning_rate": 9.98801978097312e-07, "loss": 0.0026, "reward": 2.1962499618530273, "reward_std": 0.0026254388503730297, "rewards/accuracy_reward": 0.9962499737739563, "rewards/format_reward": 1.0, "step": 2183 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 429.5625, "epoch": 0.022049469964664312, "grad_norm": 1.78264383189779, "kl": 0.060302734375, "learning_rate": 9.988008806927276e-07, "loss": 0.0024, "reward": 1.5567500591278076, "reward_std": 0.007771361619234085, "rewards/accuracy_reward": 0.4567500352859497, "rewards/format_reward": 1.0, "step": 2184 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 419.75, "epoch": 0.022059565875820294, "grad_norm": 1.4633696108626075, "kl": 0.0625, "learning_rate": 9.987997827863574e-07, "loss": 0.0025, "reward": 1.8242812156677246, "reward_std": 0.002804443007335067, "rewards/accuracy_reward": 0.6742812395095825, "rewards/format_reward": 1.0, "step": 2185 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 414.59375, "epoch": 0.022069661786976273, "grad_norm": 1.313590045035277, "kl": 0.061279296875, "learning_rate": 9.98798684378203e-07, "loss": 0.0025, "reward": 1.8744375705718994, "reward_std": 0.02027786336839199, "rewards/accuracy_reward": 0.7306874990463257, "rewards/format_reward": 1.0, "step": 2186 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 416.125, "epoch": 0.022079757698132255, "grad_norm": 1.6675284212145454, "kl": 0.059326171875, "learning_rate": 9.987975854682655e-07, "loss": 0.0024, "reward": 1.5722813606262207, "reward_std": 0.014250579290091991, "rewards/accuracy_reward": 0.47228124737739563, "rewards/format_reward": 1.0, "step": 2187 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.0625, "epoch": 0.022089853609288237, "grad_norm": 2.776668176031033, "kl": 0.07080078125, "learning_rate": 9.987964860565458e-07, "loss": 0.0028, "reward": 2.0034375190734863, "reward_std": 0.014480662532150745, "rewards/accuracy_reward": 0.8034374713897705, "rewards/format_reward": 1.0, "step": 2188 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.625, "epoch": 0.02209994952044422, "grad_norm": 2.4578290945336247, "kl": 0.080078125, "learning_rate": 9.987953861430453e-07, "loss": 0.0032, "reward": 2.088343858718872, "reward_std": 0.01753653772175312, "rewards/accuracy_reward": 0.8883437514305115, "rewards/format_reward": 1.0, "step": 2189 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.15625, "epoch": 0.022110045431600202, "grad_norm": 4.270506468030174, "kl": 0.07958984375, "learning_rate": 9.98794285727765e-07, "loss": 0.0032, "reward": 2.034437656402588, "reward_std": 0.019465642049908638, "rewards/accuracy_reward": 0.8344374895095825, "rewards/format_reward": 1.0, "step": 2190 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.022120141342756184, "grad_norm": 2.731756051491235, "kl": 0.07373046875, "learning_rate": 9.98793184810706e-07, "loss": 0.0029, "reward": 2.120406150817871, "reward_std": 0.0061632608994841576, "rewards/accuracy_reward": 0.9204062223434448, "rewards/format_reward": 1.0, "step": 2191 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 411.84375, "epoch": 0.022130237253912166, "grad_norm": 0.08458827207864278, "kl": 0.0537109375, "learning_rate": 9.987920833918693e-07, "loss": 0.0021, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2192 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 403.0, "epoch": 0.02214033316506815, "grad_norm": 2.5638390954763723, "kl": 0.056396484375, "learning_rate": 9.98790981471256e-07, "loss": 0.0023, "reward": 1.557281255722046, "reward_std": 0.005273640621453524, "rewards/accuracy_reward": 0.4572812616825104, "rewards/format_reward": 1.0, "step": 2193 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.84375, "epoch": 0.02215042907622413, "grad_norm": 2.8410723001485105, "kl": 0.0634765625, "learning_rate": 9.987898790488673e-07, "loss": 0.0025, "reward": 1.8385626077651978, "reward_std": 0.10919512063264847, "rewards/accuracy_reward": 0.6885625123977661, "rewards/format_reward": 1.0, "step": 2194 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.96875, "epoch": 0.022160524987380113, "grad_norm": 0.08855456952729121, "kl": 0.042236328125, "learning_rate": 9.987887761247042e-07, "loss": 0.0017, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2195 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.59375, "epoch": 0.02217062089853609, "grad_norm": 2.270834300723107, "kl": 0.0771484375, "learning_rate": 9.987876726987681e-07, "loss": 0.0031, "reward": 2.070125102996826, "reward_std": 0.01320735178887844, "rewards/accuracy_reward": 0.8701249957084656, "rewards/format_reward": 1.0, "step": 2196 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.03125, "epoch": 0.022180716809692073, "grad_norm": 2.4853309632028364, "kl": 0.080078125, "learning_rate": 9.987865687710599e-07, "loss": 0.0032, "reward": 2.139625072479248, "reward_std": 0.012007972225546837, "rewards/accuracy_reward": 0.9396250247955322, "rewards/format_reward": 1.0, "step": 2197 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.0625, "epoch": 0.022190812720848056, "grad_norm": 3.188520304885863, "kl": 0.07568359375, "learning_rate": 9.987854643415807e-07, "loss": 0.003, "reward": 2.0054376125335693, "reward_std": 0.1713368445634842, "rewards/accuracy_reward": 0.8304374814033508, "rewards/format_reward": 1.0, "step": 2198 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.25, "epoch": 0.022200908632004038, "grad_norm": 2.065161248385204, "kl": 0.07080078125, "learning_rate": 9.987843594103315e-07, "loss": 0.0028, "reward": 2.023843765258789, "reward_std": 0.012694879435002804, "rewards/accuracy_reward": 0.8238437175750732, "rewards/format_reward": 1.0, "step": 2199 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.25, "epoch": 0.02221100454316002, "grad_norm": 2.091325852592107, "kl": 0.07080078125, "learning_rate": 9.987832539773136e-07, "loss": 0.0028, "reward": 2.1718435287475586, "reward_std": 0.007069193758070469, "rewards/accuracy_reward": 0.9718437194824219, "rewards/format_reward": 1.0, "step": 2200 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.875, "epoch": 0.022221100454316002, "grad_norm": 3.2691457594290143, "kl": 0.0771484375, "learning_rate": 9.987821480425282e-07, "loss": 0.0031, "reward": 2.0018749237060547, "reward_std": 0.028958937153220177, "rewards/accuracy_reward": 0.8018749952316284, "rewards/format_reward": 1.0, "step": 2201 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 408.875, "epoch": 0.022231196365471984, "grad_norm": 4.783766943951561, "kl": 0.0654296875, "learning_rate": 9.98781041605976e-07, "loss": 0.0026, "reward": 1.5588750839233398, "reward_std": 0.007295968942344189, "rewards/accuracy_reward": 0.45887500047683716, "rewards/format_reward": 1.0, "step": 2202 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.09375, "epoch": 0.022241292276627966, "grad_norm": 3.0039836786145817, "kl": 0.0751953125, "learning_rate": 9.987799346676586e-07, "loss": 0.003, "reward": 1.8600624799728394, "reward_std": 0.012003691866993904, "rewards/accuracy_reward": 0.7100625038146973, "rewards/format_reward": 1.0, "step": 2203 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.0, "epoch": 0.02225138818778395, "grad_norm": 3.173664830916918, "kl": 0.0673828125, "learning_rate": 9.98778827227577e-07, "loss": 0.0027, "reward": 2.1276564598083496, "reward_std": 0.01210703607648611, "rewards/accuracy_reward": 0.9276561737060547, "rewards/format_reward": 1.0, "step": 2204 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.21875, "epoch": 0.02226148409893993, "grad_norm": 6.319168248589324, "kl": 0.060546875, "learning_rate": 9.98777719285732e-07, "loss": 0.0024, "reward": 1.8468437194824219, "reward_std": 0.006118020974099636, "rewards/accuracy_reward": 0.6968437433242798, "rewards/format_reward": 1.0, "step": 2205 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.96875, "epoch": 0.022271580010095913, "grad_norm": 2.278899395353239, "kl": 0.0771484375, "learning_rate": 9.98776610842125e-07, "loss": 0.0031, "reward": 1.978968858718872, "reward_std": 0.01314245443791151, "rewards/accuracy_reward": 0.7789687514305115, "rewards/format_reward": 1.0, "step": 2206 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.25, "epoch": 0.02228167592125189, "grad_norm": 2.4827697915887157, "kl": 0.06884765625, "learning_rate": 9.987755018967572e-07, "loss": 0.0027, "reward": 1.7983437776565552, "reward_std": 0.008584888651967049, "rewards/accuracy_reward": 0.6483436822891235, "rewards/format_reward": 1.0, "step": 2207 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 420.125, "epoch": 0.022291771832407874, "grad_norm": 0.08660224396937352, "kl": 0.054931640625, "learning_rate": 9.987743924496292e-07, "loss": 0.0022, "reward": 1.7687499523162842, "reward_std": 0.0, "rewards/accuracy_reward": 0.6187499761581421, "rewards/format_reward": 1.0, "step": 2208 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 404.0625, "epoch": 0.022301867743563856, "grad_norm": 2.0131706816436816, "kl": 0.07421875, "learning_rate": 9.987732825007427e-07, "loss": 0.003, "reward": 1.8761248588562012, "reward_std": 0.008659726940095425, "rewards/accuracy_reward": 0.7261250019073486, "rewards/format_reward": 1.0, "step": 2209 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.022311963654719838, "grad_norm": 1.9405050550371385, "kl": 0.0673828125, "learning_rate": 9.987721720500987e-07, "loss": 0.0027, "reward": 2.140437602996826, "reward_std": 0.026527315378189087, "rewards/accuracy_reward": 0.9466875195503235, "rewards/format_reward": 1.0, "step": 2210 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.09375, "epoch": 0.02232205956587582, "grad_norm": 2.4578311403797506, "kl": 0.0810546875, "learning_rate": 9.98771061097698e-07, "loss": 0.0032, "reward": 2.110593795776367, "reward_std": 0.02427861839532852, "rewards/accuracy_reward": 0.9105937480926514, "rewards/format_reward": 1.0, "step": 2211 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.96875, "epoch": 0.022332155477031802, "grad_norm": 2.2372217399780188, "kl": 0.06689453125, "learning_rate": 9.98769949643542e-07, "loss": 0.0027, "reward": 2.0985000133514404, "reward_std": 0.023585520684719086, "rewards/accuracy_reward": 0.8985000252723694, "rewards/format_reward": 1.0, "step": 2212 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.8125, "epoch": 0.022342251388187784, "grad_norm": 1.6951340786486528, "kl": 0.0703125, "learning_rate": 9.987688376876317e-07, "loss": 0.0028, "reward": 2.1505937576293945, "reward_std": 0.01290079578757286, "rewards/accuracy_reward": 0.9505937695503235, "rewards/format_reward": 1.0, "step": 2213 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.34375, "epoch": 0.022352347299343767, "grad_norm": 5.399737848963443, "kl": 0.08642578125, "learning_rate": 9.987677252299684e-07, "loss": 0.0035, "reward": 2.111999988555908, "reward_std": 0.01928216591477394, "rewards/accuracy_reward": 0.9120000004768372, "rewards/format_reward": 1.0, "step": 2214 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.15625, "epoch": 0.02236244321049975, "grad_norm": 2.839775845963471, "kl": 0.0869140625, "learning_rate": 9.98766612270553e-07, "loss": 0.0035, "reward": 2.161468744277954, "reward_std": 0.013088706880807877, "rewards/accuracy_reward": 0.9614686965942383, "rewards/format_reward": 1.0, "step": 2215 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 407.03125, "epoch": 0.02237253912165573, "grad_norm": 0.1124597238068284, "kl": 0.06201171875, "learning_rate": 9.987654988093866e-07, "loss": 0.0025, "reward": 1.4542500972747803, "reward_std": 0.0, "rewards/accuracy_reward": 0.3542500138282776, "rewards/format_reward": 1.0, "step": 2216 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.375, "epoch": 0.02238263503281171, "grad_norm": 2.1193857390265616, "kl": 0.06884765625, "learning_rate": 9.987643848464706e-07, "loss": 0.0028, "reward": 2.152562379837036, "reward_std": 0.008452216163277626, "rewards/accuracy_reward": 0.9525624513626099, "rewards/format_reward": 1.0, "step": 2217 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.022392730943967692, "grad_norm": 4.228129945902247, "kl": 0.0634765625, "learning_rate": 9.987632703818059e-07, "loss": 0.0025, "reward": 2.0928125381469727, "reward_std": 0.028495293110609055, "rewards/accuracy_reward": 0.8990625143051147, "rewards/format_reward": 1.0, "step": 2218 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.28125, "epoch": 0.022402826855123674, "grad_norm": 2.4824234111913013, "kl": 0.080078125, "learning_rate": 9.987621554153935e-07, "loss": 0.0032, "reward": 2.141937494277954, "reward_std": 0.014792711474001408, "rewards/accuracy_reward": 0.9419375061988831, "rewards/format_reward": 1.0, "step": 2219 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.0, "epoch": 0.022412922766279656, "grad_norm": 1.3695989841361718, "kl": 0.07666015625, "learning_rate": 9.98761039947235e-07, "loss": 0.0031, "reward": 2.100156307220459, "reward_std": 0.00594309763982892, "rewards/accuracy_reward": 0.9001562595367432, "rewards/format_reward": 1.0, "step": 2220 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.3125, "epoch": 0.022423018677435638, "grad_norm": 1.903247719526134, "kl": 0.07373046875, "learning_rate": 9.987599239773308e-07, "loss": 0.0029, "reward": 2.032531261444092, "reward_std": 0.0065007940866053104, "rewards/accuracy_reward": 0.8325312733650208, "rewards/format_reward": 1.0, "step": 2221 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.15625, "epoch": 0.02243311458859162, "grad_norm": 2.2442981322575735, "kl": 0.09228515625, "learning_rate": 9.987588075056827e-07, "loss": 0.0037, "reward": 1.9604687690734863, "reward_std": 0.01871672458946705, "rewards/accuracy_reward": 0.7604687809944153, "rewards/format_reward": 1.0, "step": 2222 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.5, "epoch": 0.022443210499747603, "grad_norm": 1.2088032302140617, "kl": 0.0625, "learning_rate": 9.987576905322915e-07, "loss": 0.0025, "reward": 2.121687412261963, "reward_std": 0.008646589703857899, "rewards/accuracy_reward": 0.9216874837875366, "rewards/format_reward": 1.0, "step": 2223 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.21875, "epoch": 0.022453306410903585, "grad_norm": 1.3184628517229373, "kl": 0.07470703125, "learning_rate": 9.987565730571584e-07, "loss": 0.003, "reward": 1.9721875190734863, "reward_std": 0.004364515654742718, "rewards/accuracy_reward": 0.7721874713897705, "rewards/format_reward": 1.0, "step": 2224 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.022463402322059567, "grad_norm": 1.8760381448444468, "kl": 0.06640625, "learning_rate": 9.987554550802843e-07, "loss": 0.0027, "reward": 2.0113439559936523, "reward_std": 0.014547910541296005, "rewards/accuracy_reward": 0.811343789100647, "rewards/format_reward": 1.0, "step": 2225 }, { "all_correct": 0.0, "all_wrong": 0.75, "completion_length": 401.09375, "epoch": 0.02247349823321555, "grad_norm": 1.616132814248277, "kl": 0.05859375, "learning_rate": 9.987543366016708e-07, "loss": 0.0023, "reward": 1.24790620803833, "reward_std": 0.003977470565587282, "rewards/accuracy_reward": 0.1979062557220459, "rewards/format_reward": 1.0, "step": 2226 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.5625, "epoch": 0.02248359414437153, "grad_norm": 4.427014447355131, "kl": 0.07373046875, "learning_rate": 9.987532176213187e-07, "loss": 0.003, "reward": 2.1142501831054688, "reward_std": 0.014358892105519772, "rewards/accuracy_reward": 0.9142499566078186, "rewards/format_reward": 1.0, "step": 2227 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.21875, "epoch": 0.02249369005552751, "grad_norm": 4.903195597711286, "kl": 0.07861328125, "learning_rate": 9.98752098139229e-07, "loss": 0.0031, "reward": 2.0439062118530273, "reward_std": 0.017560172826051712, "rewards/accuracy_reward": 0.8439062237739563, "rewards/format_reward": 1.0, "step": 2228 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.9375, "epoch": 0.022503785966683492, "grad_norm": 3.449460219034468, "kl": 0.06005859375, "learning_rate": 9.987509781554031e-07, "loss": 0.0024, "reward": 1.8659688234329224, "reward_std": 0.011066717095673084, "rewards/accuracy_reward": 0.7159687280654907, "rewards/format_reward": 1.0, "step": 2229 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.022513881877839474, "grad_norm": 3.6057369836622803, "kl": 0.0791015625, "learning_rate": 9.98749857669842e-07, "loss": 0.0032, "reward": 2.155656337738037, "reward_std": 0.018383536487817764, "rewards/accuracy_reward": 0.9556562304496765, "rewards/format_reward": 1.0, "step": 2230 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.15625, "epoch": 0.022523977788995456, "grad_norm": 6.105072865701828, "kl": 0.0634765625, "learning_rate": 9.98748736682547e-07, "loss": 0.0025, "reward": 2.1689376831054688, "reward_std": 0.008702751249074936, "rewards/accuracy_reward": 0.9689375162124634, "rewards/format_reward": 1.0, "step": 2231 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.34375, "epoch": 0.02253407370015144, "grad_norm": 2.0670272188110865, "kl": 0.06689453125, "learning_rate": 9.98747615193519e-07, "loss": 0.0027, "reward": 1.8425625562667847, "reward_std": 0.006719027645885944, "rewards/accuracy_reward": 0.6925625205039978, "rewards/format_reward": 1.0, "step": 2232 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.625, "epoch": 0.02254416961130742, "grad_norm": 3.4507614138737255, "kl": 0.06689453125, "learning_rate": 9.987464932027592e-07, "loss": 0.0027, "reward": 1.7712187767028809, "reward_std": 0.016139749437570572, "rewards/accuracy_reward": 0.6212188005447388, "rewards/format_reward": 1.0, "step": 2233 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.15625, "epoch": 0.022554265522463403, "grad_norm": 2.6793595446445972, "kl": 0.0732421875, "learning_rate": 9.987453707102686e-07, "loss": 0.0029, "reward": 2.1714375019073486, "reward_std": 0.00943441316485405, "rewards/accuracy_reward": 0.9714374542236328, "rewards/format_reward": 1.0, "step": 2234 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.40625, "epoch": 0.022564361433619385, "grad_norm": 2.059572435172671, "kl": 0.0625, "learning_rate": 9.987442477160488e-07, "loss": 0.0025, "reward": 2.174187660217285, "reward_std": 0.022205453366041183, "rewards/accuracy_reward": 0.9804375171661377, "rewards/format_reward": 1.0, "step": 2235 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.84375, "epoch": 0.022574457344775367, "grad_norm": 3.0981234020955997, "kl": 0.07470703125, "learning_rate": 9.987431242201003e-07, "loss": 0.003, "reward": 2.027031183242798, "reward_std": 0.03599461540579796, "rewards/accuracy_reward": 0.8332812786102295, "rewards/format_reward": 1.0, "step": 2236 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.59375, "epoch": 0.02258455325593135, "grad_norm": 18.006608441073126, "kl": 0.080078125, "learning_rate": 9.987420002224247e-07, "loss": 0.0032, "reward": 2.1642813682556152, "reward_std": 0.009860566817224026, "rewards/accuracy_reward": 0.9642812013626099, "rewards/format_reward": 1.0, "step": 2237 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.5625, "epoch": 0.022594649167087328, "grad_norm": 2.059430943216797, "kl": 0.06640625, "learning_rate": 9.987408757230229e-07, "loss": 0.0027, "reward": 2.106281280517578, "reward_std": 0.010531269945204258, "rewards/accuracy_reward": 0.9062812328338623, "rewards/format_reward": 1.0, "step": 2238 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.5625, "epoch": 0.02260474507824331, "grad_norm": 1.3157624370000185, "kl": 0.056640625, "learning_rate": 9.98739750721896e-07, "loss": 0.0023, "reward": 2.195812702178955, "reward_std": 0.0031474167481064796, "rewards/accuracy_reward": 0.9958125352859497, "rewards/format_reward": 1.0, "step": 2239 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.125, "epoch": 0.022614840989399292, "grad_norm": 3.358643456227692, "kl": 0.076171875, "learning_rate": 9.987386252190454e-07, "loss": 0.003, "reward": 2.101343870162964, "reward_std": 0.014716953039169312, "rewards/accuracy_reward": 0.901343822479248, "rewards/format_reward": 1.0, "step": 2240 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.0625, "epoch": 0.022624936900555274, "grad_norm": 3.8653306951511404, "kl": 0.0859375, "learning_rate": 9.98737499214472e-07, "loss": 0.0034, "reward": 2.0106563568115234, "reward_std": 0.014069722034037113, "rewards/accuracy_reward": 0.8106563091278076, "rewards/format_reward": 1.0, "step": 2241 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.96875, "epoch": 0.022635032811711257, "grad_norm": 3.6814790697944826, "kl": 0.07421875, "learning_rate": 9.987363727081768e-07, "loss": 0.003, "reward": 2.0328125953674316, "reward_std": 0.016941003501415253, "rewards/accuracy_reward": 0.8328125476837158, "rewards/format_reward": 1.0, "step": 2242 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.3125, "epoch": 0.02264512872286724, "grad_norm": 4.052032287775143, "kl": 0.080078125, "learning_rate": 9.987352457001614e-07, "loss": 0.0032, "reward": 2.0963125228881836, "reward_std": 0.00915389321744442, "rewards/accuracy_reward": 0.8963125348091125, "rewards/format_reward": 1.0, "step": 2243 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.5, "epoch": 0.02265522463402322, "grad_norm": 1.925337999769864, "kl": 0.06787109375, "learning_rate": 9.987341181904265e-07, "loss": 0.0027, "reward": 1.5699999332427979, "reward_std": 0.14066657423973083, "rewards/accuracy_reward": 0.45750001072883606, "rewards/format_reward": 1.0, "step": 2244 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 0.022665320545179203, "grad_norm": 3.2080059864680543, "kl": 0.06591796875, "learning_rate": 9.987329901789733e-07, "loss": 0.0026, "reward": 2.0741562843322754, "reward_std": 0.05289522185921669, "rewards/accuracy_reward": 0.8741562366485596, "rewards/format_reward": 1.0, "step": 2245 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.6875, "epoch": 0.022675416456335185, "grad_norm": 1.6799743986538522, "kl": 0.068359375, "learning_rate": 9.987318616658032e-07, "loss": 0.0027, "reward": 2.1854686737060547, "reward_std": 0.004432301037013531, "rewards/accuracy_reward": 0.9854687452316284, "rewards/format_reward": 1.0, "step": 2246 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.5625, "epoch": 0.022685512367491167, "grad_norm": 4.381348388438729, "kl": 0.08203125, "learning_rate": 9.98730732650917e-07, "loss": 0.0033, "reward": 2.120374917984009, "reward_std": 0.025893760845065117, "rewards/accuracy_reward": 0.9203749895095825, "rewards/format_reward": 1.0, "step": 2247 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.53125, "epoch": 0.02269560827864715, "grad_norm": 2.127687778730984, "kl": 0.07275390625, "learning_rate": 9.987296031343161e-07, "loss": 0.0029, "reward": 2.1235313415527344, "reward_std": 0.015123441815376282, "rewards/accuracy_reward": 0.923531174659729, "rewards/format_reward": 1.0, "step": 2248 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.59375, "epoch": 0.022705704189803128, "grad_norm": 25.069727846372672, "kl": 0.061279296875, "learning_rate": 9.987284731160017e-07, "loss": 0.0024, "reward": 1.854781150817871, "reward_std": 0.004488951992243528, "rewards/accuracy_reward": 0.7047812342643738, "rewards/format_reward": 1.0, "step": 2249 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.09375, "epoch": 0.02271580010095911, "grad_norm": 2.80332372236734, "kl": 0.08056640625, "learning_rate": 9.987273425959746e-07, "loss": 0.0032, "reward": 2.0935001373291016, "reward_std": 0.012167445383965969, "rewards/accuracy_reward": 0.8934999704360962, "rewards/format_reward": 1.0, "step": 2250 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.625, "epoch": 0.022725896012115093, "grad_norm": 1.8887268160945185, "kl": 0.07421875, "learning_rate": 9.98726211574236e-07, "loss": 0.003, "reward": 2.1360626220703125, "reward_std": 0.007838339544832706, "rewards/accuracy_reward": 0.9360625147819519, "rewards/format_reward": 1.0, "step": 2251 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 384.21875, "epoch": 0.022735991923271075, "grad_norm": 5.87964673871163, "kl": 0.061279296875, "learning_rate": 9.987250800507872e-07, "loss": 0.0025, "reward": 1.8280000686645508, "reward_std": 0.022463126108050346, "rewards/accuracy_reward": 0.6842499375343323, "rewards/format_reward": 1.0, "step": 2252 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 370.34375, "epoch": 0.022746087834427057, "grad_norm": 10.617508057088612, "kl": 0.109375, "learning_rate": 9.987239480256292e-07, "loss": 0.0044, "reward": 1.9329376220703125, "reward_std": 0.23360909521579742, "rewards/accuracy_reward": 0.8266875147819519, "rewards/format_reward": 0.9375, "step": 2253 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.84375, "epoch": 0.02275618374558304, "grad_norm": 2.3315321939036804, "kl": 0.048583984375, "learning_rate": 9.987228154987634e-07, "loss": 0.0019, "reward": 1.6940624713897705, "reward_std": 0.3873511552810669, "rewards/accuracy_reward": 0.5753124952316284, "rewards/format_reward": 1.0, "step": 2254 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.75, "epoch": 0.02276627965673902, "grad_norm": 1.595362717838441, "kl": 0.06494140625, "learning_rate": 9.98721682470191e-07, "loss": 0.0026, "reward": 1.7759063243865967, "reward_std": 0.00938338041305542, "rewards/accuracy_reward": 0.625906229019165, "rewards/format_reward": 1.0, "step": 2255 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.84375, "epoch": 0.022776375567895003, "grad_norm": 2.0900556296972583, "kl": 0.06689453125, "learning_rate": 9.987205489399125e-07, "loss": 0.0027, "reward": 2.15946888923645, "reward_std": 0.021605923771858215, "rewards/accuracy_reward": 0.9657187461853027, "rewards/format_reward": 1.0, "step": 2256 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.8125, "epoch": 0.022786471479050985, "grad_norm": 3.2153009195715563, "kl": 0.08984375, "learning_rate": 9.987194149079295e-07, "loss": 0.0036, "reward": 2.0354063510894775, "reward_std": 0.01755228079855442, "rewards/accuracy_reward": 0.8354062438011169, "rewards/format_reward": 1.0, "step": 2257 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.65625, "epoch": 0.022796567390206968, "grad_norm": 2.6191300712170333, "kl": 0.08056640625, "learning_rate": 9.987182803742432e-07, "loss": 0.0032, "reward": 2.1276562213897705, "reward_std": 0.014916185289621353, "rewards/accuracy_reward": 0.9276562333106995, "rewards/format_reward": 1.0, "step": 2258 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.96875, "epoch": 0.022806663301362946, "grad_norm": 2.59213213724701, "kl": 0.07373046875, "learning_rate": 9.987171453388544e-07, "loss": 0.003, "reward": 2.101468801498413, "reward_std": 0.025969093665480614, "rewards/accuracy_reward": 0.9077187776565552, "rewards/format_reward": 1.0, "step": 2259 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.71875, "epoch": 0.02281675921251893, "grad_norm": 4.344201461546082, "kl": 0.07275390625, "learning_rate": 9.987160098017646e-07, "loss": 0.0029, "reward": 2.134124994277954, "reward_std": 0.02758822962641716, "rewards/accuracy_reward": 0.940375030040741, "rewards/format_reward": 1.0, "step": 2260 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.59375, "epoch": 0.02282685512367491, "grad_norm": 2.356772132019964, "kl": 0.09423828125, "learning_rate": 9.98714873762975e-07, "loss": 0.0038, "reward": 2.144843816757202, "reward_std": 0.028317609801888466, "rewards/accuracy_reward": 0.9510937333106995, "rewards/format_reward": 1.0, "step": 2261 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.25, "epoch": 0.022836951034830893, "grad_norm": 6.310973723790409, "kl": 0.0771484375, "learning_rate": 9.987137372224862e-07, "loss": 0.0031, "reward": 1.8160312175750732, "reward_std": 0.01700183004140854, "rewards/accuracy_reward": 0.6660312414169312, "rewards/format_reward": 1.0, "step": 2262 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 399.46875, "epoch": 0.022847046945986875, "grad_norm": 2.3460800424944046, "kl": 0.0732421875, "learning_rate": 9.987126001802999e-07, "loss": 0.0029, "reward": 1.5232501029968262, "reward_std": 0.09853677451610565, "rewards/accuracy_reward": 0.4545000195503235, "rewards/format_reward": 0.96875, "step": 2263 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.71875, "epoch": 0.022857142857142857, "grad_norm": 8.306530619989536, "kl": 0.0849609375, "learning_rate": 9.98711462636417e-07, "loss": 0.0034, "reward": 2.0966875553131104, "reward_std": 0.01641889289021492, "rewards/accuracy_reward": 0.8966875076293945, "rewards/format_reward": 1.0, "step": 2264 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.875, "epoch": 0.02286723876829884, "grad_norm": 2.1349169658334572, "kl": 0.0703125, "learning_rate": 9.987103245908386e-07, "loss": 0.0028, "reward": 1.7857813835144043, "reward_std": 0.011606559157371521, "rewards/accuracy_reward": 0.6357812285423279, "rewards/format_reward": 1.0, "step": 2265 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.02287733467945482, "grad_norm": 4.245939612747638, "kl": 0.0771484375, "learning_rate": 9.98709186043566e-07, "loss": 0.0031, "reward": 2.1141562461853027, "reward_std": 0.014863035641610622, "rewards/accuracy_reward": 0.9141563177108765, "rewards/format_reward": 1.0, "step": 2266 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.71875, "epoch": 0.022887430590610804, "grad_norm": 4.046846256895957, "kl": 0.0810546875, "learning_rate": 9.987080469946003e-07, "loss": 0.0032, "reward": 2.038875102996826, "reward_std": 0.014451762661337852, "rewards/accuracy_reward": 0.8388750553131104, "rewards/format_reward": 1.0, "step": 2267 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 391.1875, "epoch": 0.022897526501766786, "grad_norm": 1.6326008928277886, "kl": 0.06689453125, "learning_rate": 9.987069074439425e-07, "loss": 0.0027, "reward": 1.8731563091278076, "reward_std": 0.008102456107735634, "rewards/accuracy_reward": 0.7231562733650208, "rewards/format_reward": 1.0, "step": 2268 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.25, "epoch": 0.022907622412922768, "grad_norm": 2.0676821720635346, "kl": 0.064453125, "learning_rate": 9.98705767391594e-07, "loss": 0.0026, "reward": 2.1156249046325684, "reward_std": 0.10869427770376205, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.96875, "step": 2269 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.8125, "epoch": 0.022917718324078747, "grad_norm": 2.45676684768292, "kl": 0.0869140625, "learning_rate": 9.987046268375558e-07, "loss": 0.0035, "reward": 2.133718967437744, "reward_std": 0.028049584478139877, "rewards/accuracy_reward": 0.9399687647819519, "rewards/format_reward": 1.0, "step": 2270 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.34375, "epoch": 0.02292781423523473, "grad_norm": 1.8782009549840615, "kl": 0.06689453125, "learning_rate": 9.98703485781829e-07, "loss": 0.0027, "reward": 2.1777501106262207, "reward_std": 0.04140572249889374, "rewards/accuracy_reward": 0.9902499914169312, "rewards/format_reward": 1.0, "step": 2271 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.8125, "epoch": 0.02293791014639071, "grad_norm": 3.4006986139265885, "kl": 0.08447265625, "learning_rate": 9.98702344224415e-07, "loss": 0.0034, "reward": 1.8430625200271606, "reward_std": 0.01270347647368908, "rewards/accuracy_reward": 0.6930625438690186, "rewards/format_reward": 1.0, "step": 2272 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.5625, "epoch": 0.022948006057546693, "grad_norm": 2.6294942242608816, "kl": 0.0791015625, "learning_rate": 9.987012021653143e-07, "loss": 0.0032, "reward": 1.7682812213897705, "reward_std": 0.1184542328119278, "rewards/accuracy_reward": 0.6557812690734863, "rewards/format_reward": 0.96875, "step": 2273 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.75, "epoch": 0.022958101968702675, "grad_norm": 3.4285938914558067, "kl": 0.076171875, "learning_rate": 9.987000596045289e-07, "loss": 0.0031, "reward": 2.0455000400543213, "reward_std": 0.026259079575538635, "rewards/accuracy_reward": 0.8454999923706055, "rewards/format_reward": 1.0, "step": 2274 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.9375, "epoch": 0.022968197879858657, "grad_norm": 3.404488073607502, "kl": 0.0849609375, "learning_rate": 9.986989165420593e-07, "loss": 0.0034, "reward": 2.111562490463257, "reward_std": 0.020189646631479263, "rewards/accuracy_reward": 0.9115625619888306, "rewards/format_reward": 1.0, "step": 2275 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.625, "epoch": 0.02297829379101464, "grad_norm": 6.381275087654588, "kl": 0.07958984375, "learning_rate": 9.986977729779072e-07, "loss": 0.0032, "reward": 2.10184383392334, "reward_std": 0.022484123706817627, "rewards/accuracy_reward": 0.901843786239624, "rewards/format_reward": 1.0, "step": 2276 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 411.75, "epoch": 0.02298838970217062, "grad_norm": 1.3499019636544694, "kl": 0.06982421875, "learning_rate": 9.986966289120733e-07, "loss": 0.0028, "reward": 1.5836563110351562, "reward_std": 0.0049460288137197495, "rewards/accuracy_reward": 0.48365625739097595, "rewards/format_reward": 1.0, "step": 2277 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.71875, "epoch": 0.022998485613326604, "grad_norm": 1.8287341315920398, "kl": 0.0771484375, "learning_rate": 9.986954843445588e-07, "loss": 0.0031, "reward": 1.8492188453674316, "reward_std": 0.022176265716552734, "rewards/accuracy_reward": 0.7054687142372131, "rewards/format_reward": 1.0, "step": 2278 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.28125, "epoch": 0.023008581524482586, "grad_norm": 1.7364349857374675, "kl": 0.0751953125, "learning_rate": 9.98694339275365e-07, "loss": 0.003, "reward": 2.1793437004089355, "reward_std": 0.008122281171381474, "rewards/accuracy_reward": 0.9793437719345093, "rewards/format_reward": 1.0, "step": 2279 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.46875, "epoch": 0.023018677435638565, "grad_norm": 2.3191726986971646, "kl": 0.078125, "learning_rate": 9.98693193704493e-07, "loss": 0.0031, "reward": 2.087062358856201, "reward_std": 0.01706843636929989, "rewards/accuracy_reward": 0.8870625495910645, "rewards/format_reward": 1.0, "step": 2280 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.5, "epoch": 0.023028773346794547, "grad_norm": 1.7641677780738134, "kl": 0.08056640625, "learning_rate": 9.98692047631944e-07, "loss": 0.0032, "reward": 2.1486873626708984, "reward_std": 0.008127192035317421, "rewards/accuracy_reward": 0.9486874938011169, "rewards/format_reward": 1.0, "step": 2281 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.96875, "epoch": 0.02303886925795053, "grad_norm": 1.766817479433999, "kl": 0.08154296875, "learning_rate": 9.986909010577191e-07, "loss": 0.0033, "reward": 1.8211250305175781, "reward_std": 0.02384505420923233, "rewards/accuracy_reward": 0.6773750185966492, "rewards/format_reward": 1.0, "step": 2282 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 411.46875, "epoch": 0.02304896516910651, "grad_norm": 3.519998536519585, "kl": 0.09619140625, "learning_rate": 9.986897539818195e-07, "loss": 0.0038, "reward": 1.8589688539505005, "reward_std": 0.0149960583075881, "rewards/accuracy_reward": 0.7089687585830688, "rewards/format_reward": 1.0, "step": 2283 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.375, "epoch": 0.023059061080262493, "grad_norm": 2.4985601757923956, "kl": 0.091796875, "learning_rate": 9.986886064042463e-07, "loss": 0.0037, "reward": 2.014906406402588, "reward_std": 0.024645773693919182, "rewards/accuracy_reward": 0.8149062395095825, "rewards/format_reward": 1.0, "step": 2284 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 419.09375, "epoch": 0.023069156991418475, "grad_norm": 2.2487892773337035, "kl": 0.08984375, "learning_rate": 9.986874583250007e-07, "loss": 0.0036, "reward": 1.7956874370574951, "reward_std": 0.014143511652946472, "rewards/accuracy_reward": 0.6456875205039978, "rewards/format_reward": 1.0, "step": 2285 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.0625, "epoch": 0.023079252902574458, "grad_norm": 2.951171046950231, "kl": 0.0732421875, "learning_rate": 9.986863097440838e-07, "loss": 0.0029, "reward": 1.8742811679840088, "reward_std": 0.02528602071106434, "rewards/accuracy_reward": 0.7305312156677246, "rewards/format_reward": 1.0, "step": 2286 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.02308934881373044, "grad_norm": 3.110503174386479, "kl": 0.08544921875, "learning_rate": 9.986851606614968e-07, "loss": 0.0034, "reward": 2.0487687587738037, "reward_std": 0.12046118080615997, "rewards/accuracy_reward": 0.8612686991691589, "rewards/format_reward": 1.0, "step": 2287 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 401.8125, "epoch": 0.023099444724886422, "grad_norm": 10.632246880370714, "kl": 0.0771484375, "learning_rate": 9.98684011077241e-07, "loss": 0.0031, "reward": 1.8006560802459717, "reward_std": 0.016783414408564568, "rewards/accuracy_reward": 0.6506562232971191, "rewards/format_reward": 1.0, "step": 2288 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.90625, "epoch": 0.023109540636042404, "grad_norm": 10.105210473916063, "kl": 0.08447265625, "learning_rate": 9.986828609913171e-07, "loss": 0.0034, "reward": 1.9970312118530273, "reward_std": 0.027555881068110466, "rewards/accuracy_reward": 0.7970312833786011, "rewards/format_reward": 1.0, "step": 2289 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.40625, "epoch": 0.023119636547198386, "grad_norm": 2.9385370052077344, "kl": 0.08203125, "learning_rate": 9.986817104037266e-07, "loss": 0.0033, "reward": 2.1258749961853027, "reward_std": 0.02581120654940605, "rewards/accuracy_reward": 0.9258750081062317, "rewards/format_reward": 1.0, "step": 2290 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.0625, "epoch": 0.023129732458354365, "grad_norm": 2.9728615317462, "kl": 0.072265625, "learning_rate": 9.986805593144707e-07, "loss": 0.0029, "reward": 2.0185000896453857, "reward_std": 0.16611063480377197, "rewards/accuracy_reward": 0.843500018119812, "rewards/format_reward": 1.0, "step": 2291 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.78125, "epoch": 0.023139828369510347, "grad_norm": 2.3681070033525335, "kl": 0.07275390625, "learning_rate": 9.986794077235506e-07, "loss": 0.0029, "reward": 2.095937490463257, "reward_std": 0.013532626442611217, "rewards/accuracy_reward": 0.895937442779541, "rewards/format_reward": 1.0, "step": 2292 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.71875, "epoch": 0.02314992428066633, "grad_norm": 1.8157489704523004, "kl": 0.08251953125, "learning_rate": 9.98678255630967e-07, "loss": 0.0033, "reward": 1.8051562309265137, "reward_std": 0.013143888674676418, "rewards/accuracy_reward": 0.6551562547683716, "rewards/format_reward": 1.0, "step": 2293 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 422.15625, "epoch": 0.02316002019182231, "grad_norm": 2.179051062292796, "kl": 0.0830078125, "learning_rate": 9.986771030367215e-07, "loss": 0.0033, "reward": 1.8370624780654907, "reward_std": 0.008357472717761993, "rewards/accuracy_reward": 0.6870624423027039, "rewards/format_reward": 1.0, "step": 2294 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.03125, "epoch": 0.023170116102978294, "grad_norm": 2.2013806067658805, "kl": 0.0810546875, "learning_rate": 9.986759499408153e-07, "loss": 0.0032, "reward": 2.1525938510894775, "reward_std": 0.012225695885717869, "rewards/accuracy_reward": 0.9525937438011169, "rewards/format_reward": 1.0, "step": 2295 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.023180212014134276, "grad_norm": 3.5608621643790204, "kl": 0.08935546875, "learning_rate": 9.986747963432494e-07, "loss": 0.0036, "reward": 2.0681562423706055, "reward_std": 0.013632578775286674, "rewards/accuracy_reward": 0.8681562542915344, "rewards/format_reward": 1.0, "step": 2296 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 428.65625, "epoch": 0.023190307925290258, "grad_norm": 1.9699588248614184, "kl": 0.07373046875, "learning_rate": 9.986736422440249e-07, "loss": 0.003, "reward": 1.8612812757492065, "reward_std": 0.015125991776585579, "rewards/accuracy_reward": 0.7112812399864197, "rewards/format_reward": 1.0, "step": 2297 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.09375, "epoch": 0.02320040383644624, "grad_norm": 1.0042707823947667, "kl": 0.0771484375, "learning_rate": 9.98672487643143e-07, "loss": 0.0031, "reward": 1.8866562843322754, "reward_std": 0.001841657911427319, "rewards/accuracy_reward": 0.7366562485694885, "rewards/format_reward": 1.0, "step": 2298 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.0625, "epoch": 0.023210499747602222, "grad_norm": 11.203408122940772, "kl": 0.09375, "learning_rate": 9.986713325406049e-07, "loss": 0.0037, "reward": 2.0623435974121094, "reward_std": 0.01728416234254837, "rewards/accuracy_reward": 0.8623437285423279, "rewards/format_reward": 1.0, "step": 2299 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 431.71875, "epoch": 0.023220595658758204, "grad_norm": 1.4021592107155607, "kl": 0.0703125, "learning_rate": 9.986701769364117e-07, "loss": 0.0028, "reward": 1.8209376335144043, "reward_std": 0.004376772791147232, "rewards/accuracy_reward": 0.6709375381469727, "rewards/format_reward": 1.0, "step": 2300 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.71875, "epoch": 0.023230691569914183, "grad_norm": 2.8040261098495884, "kl": 0.0810546875, "learning_rate": 9.986690208305646e-07, "loss": 0.0033, "reward": 1.835687518119812, "reward_std": 0.025085948407649994, "rewards/accuracy_reward": 0.6856874823570251, "rewards/format_reward": 1.0, "step": 2301 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.34375, "epoch": 0.023240787481070165, "grad_norm": 30.07822131758145, "kl": 0.06494140625, "learning_rate": 9.98667864223065e-07, "loss": 0.0026, "reward": 2.1396560668945312, "reward_std": 0.0315043106675148, "rewards/accuracy_reward": 0.9459062814712524, "rewards/format_reward": 1.0, "step": 2302 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.5, "epoch": 0.023250883392226147, "grad_norm": 2.0975660130730027, "kl": 0.07177734375, "learning_rate": 9.986667071139138e-07, "loss": 0.0029, "reward": 2.0411877632141113, "reward_std": 0.015061859972774982, "rewards/accuracy_reward": 0.8411874771118164, "rewards/format_reward": 1.0, "step": 2303 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 423.28125, "epoch": 0.02326097930338213, "grad_norm": 1.3653838030344512, "kl": 0.06640625, "learning_rate": 9.986655495031119e-07, "loss": 0.0027, "reward": 1.8899061679840088, "reward_std": 0.005962582305073738, "rewards/accuracy_reward": 0.7399061918258667, "rewards/format_reward": 1.0, "step": 2304 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 439.15625, "epoch": 0.02327107521453811, "grad_norm": 1.654325141046534, "kl": 0.06201171875, "learning_rate": 9.986643913906609e-07, "loss": 0.0025, "reward": 1.3113281726837158, "reward_std": 0.09106174111366272, "rewards/accuracy_reward": 0.2613281011581421, "rewards/format_reward": 1.0, "step": 2305 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.53125, "epoch": 0.023281171125694094, "grad_norm": 3.305933526963756, "kl": 0.06884765625, "learning_rate": 9.98663232776562e-07, "loss": 0.0027, "reward": 2.149531364440918, "reward_std": 0.041137050837278366, "rewards/accuracy_reward": 0.9620311856269836, "rewards/format_reward": 1.0, "step": 2306 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.53125, "epoch": 0.023291267036850076, "grad_norm": 2.9937744220876943, "kl": 0.0654296875, "learning_rate": 9.98662073660816e-07, "loss": 0.0026, "reward": 2.117000102996826, "reward_std": 0.028959622606635094, "rewards/accuracy_reward": 0.9294999241828918, "rewards/format_reward": 1.0, "step": 2307 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 454.96875, "epoch": 0.023301362948006058, "grad_norm": 3.0082455067453218, "kl": 0.051513671875, "learning_rate": 9.986609140434244e-07, "loss": 0.0021, "reward": 1.8164688348770142, "reward_std": 0.23799102008342743, "rewards/accuracy_reward": 0.6789687871932983, "rewards/format_reward": 1.0, "step": 2308 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.6875, "epoch": 0.02331145885916204, "grad_norm": 5.225218047873011, "kl": 0.0908203125, "learning_rate": 9.98659753924388e-07, "loss": 0.0036, "reward": 1.9710625410079956, "reward_std": 0.014780629426240921, "rewards/accuracy_reward": 0.7710624933242798, "rewards/format_reward": 1.0, "step": 2309 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.28125, "epoch": 0.023321554770318022, "grad_norm": 2.603939945359323, "kl": 0.0830078125, "learning_rate": 9.986585933037086e-07, "loss": 0.0033, "reward": 2.0655627250671387, "reward_std": 0.014592474326491356, "rewards/accuracy_reward": 0.8655624985694885, "rewards/format_reward": 1.0, "step": 2310 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.03125, "epoch": 0.023331650681474005, "grad_norm": 1.9219849115240355, "kl": 0.06201171875, "learning_rate": 9.986574321813866e-07, "loss": 0.0025, "reward": 1.840250015258789, "reward_std": 0.006239281501621008, "rewards/accuracy_reward": 0.6902499794960022, "rewards/format_reward": 1.0, "step": 2311 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 428.9375, "epoch": 0.023341746592629983, "grad_norm": 5.250716660306808, "kl": 0.07373046875, "learning_rate": 9.986562705574237e-07, "loss": 0.003, "reward": 1.8286250829696655, "reward_std": 0.016401775181293488, "rewards/accuracy_reward": 0.6786250472068787, "rewards/format_reward": 1.0, "step": 2312 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.75, "epoch": 0.023351842503785965, "grad_norm": 1.807678706923267, "kl": 0.07470703125, "learning_rate": 9.986551084318209e-07, "loss": 0.003, "reward": 1.8400311470031738, "reward_std": 0.020544515922665596, "rewards/accuracy_reward": 0.6900312304496765, "rewards/format_reward": 1.0, "step": 2313 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 448.75, "epoch": 0.023361938414941948, "grad_norm": 1.7640766791784686, "kl": 0.06103515625, "learning_rate": 9.986539458045794e-07, "loss": 0.0024, "reward": 2.015500068664551, "reward_std": 0.19339314103126526, "rewards/accuracy_reward": 0.846750020980835, "rewards/format_reward": 1.0, "step": 2314 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.90625, "epoch": 0.02337203432609793, "grad_norm": 2.6511725436788462, "kl": 0.0830078125, "learning_rate": 9.986527826757003e-07, "loss": 0.0033, "reward": 2.0037500858306885, "reward_std": 0.023673273622989655, "rewards/accuracy_reward": 0.8100000023841858, "rewards/format_reward": 1.0, "step": 2315 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.9375, "epoch": 0.023382130237253912, "grad_norm": 1.8165959386236021, "kl": 0.0771484375, "learning_rate": 9.986516190451848e-07, "loss": 0.0031, "reward": 2.1347498893737793, "reward_std": 0.007207803428173065, "rewards/accuracy_reward": 0.9347500205039978, "rewards/format_reward": 1.0, "step": 2316 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.25, "epoch": 0.023392226148409894, "grad_norm": 5.426456014856194, "kl": 0.07080078125, "learning_rate": 9.98650454913034e-07, "loss": 0.0028, "reward": 2.1326563358306885, "reward_std": 0.012994170188903809, "rewards/accuracy_reward": 0.9326562285423279, "rewards/format_reward": 1.0, "step": 2317 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.3125, "epoch": 0.023402322059565876, "grad_norm": 2.7696541905199816, "kl": 0.07568359375, "learning_rate": 9.986492902792493e-07, "loss": 0.003, "reward": 1.9018124341964722, "reward_std": 0.14769302308559418, "rewards/accuracy_reward": 0.7393125295639038, "rewards/format_reward": 1.0, "step": 2318 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 419.90625, "epoch": 0.02341241797072186, "grad_norm": 3.42579451207885, "kl": 0.06982421875, "learning_rate": 9.986481251438317e-07, "loss": 0.0028, "reward": 1.8091249465942383, "reward_std": 0.025267181918025017, "rewards/accuracy_reward": 0.6653749942779541, "rewards/format_reward": 1.0, "step": 2319 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.875, "epoch": 0.02342251388187784, "grad_norm": 3.6239841232368435, "kl": 0.06982421875, "learning_rate": 9.986469595067824e-07, "loss": 0.0028, "reward": 2.0404999256134033, "reward_std": 0.1624850481748581, "rewards/accuracy_reward": 0.8592499494552612, "rewards/format_reward": 1.0, "step": 2320 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.53125, "epoch": 0.023432609793033823, "grad_norm": 4.564852022882375, "kl": 0.0810546875, "learning_rate": 9.986457933681026e-07, "loss": 0.0033, "reward": 2.0741562843322754, "reward_std": 0.03800766542553902, "rewards/accuracy_reward": 0.8804062604904175, "rewards/format_reward": 1.0, "step": 2321 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.71875, "epoch": 0.0234427057041898, "grad_norm": 1.7058552674998142, "kl": 0.054931640625, "learning_rate": 9.986446267277933e-07, "loss": 0.0022, "reward": 2.0873124599456787, "reward_std": 0.019274283200502396, "rewards/accuracy_reward": 0.8935624957084656, "rewards/format_reward": 1.0, "step": 2322 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.28125, "epoch": 0.023452801615345784, "grad_norm": 1.9946913344447506, "kl": 0.0615234375, "learning_rate": 9.98643459585856e-07, "loss": 0.0025, "reward": 2.185999870300293, "reward_std": 0.008889716118574142, "rewards/accuracy_reward": 0.9860000610351562, "rewards/format_reward": 1.0, "step": 2323 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.5625, "epoch": 0.023462897526501766, "grad_norm": 2.75446380169225, "kl": 0.0791015625, "learning_rate": 9.986422919422914e-07, "loss": 0.0032, "reward": 2.104687452316284, "reward_std": 0.01790405623614788, "rewards/accuracy_reward": 0.9046875238418579, "rewards/format_reward": 1.0, "step": 2324 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 422.78125, "epoch": 0.023472993437657748, "grad_norm": 1.6412988444620888, "kl": 0.060546875, "learning_rate": 9.986411237971013e-07, "loss": 0.0024, "reward": 1.528031349182129, "reward_std": 0.023275984451174736, "rewards/accuracy_reward": 0.43428122997283936, "rewards/format_reward": 1.0, "step": 2325 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.53125, "epoch": 0.02348308934881373, "grad_norm": 2.6499472726513136, "kl": 0.07373046875, "learning_rate": 9.986399551502861e-07, "loss": 0.0029, "reward": 2.0460312366485596, "reward_std": 0.03155561536550522, "rewards/accuracy_reward": 0.8522812128067017, "rewards/format_reward": 1.0, "step": 2326 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.0, "epoch": 0.023493185259969712, "grad_norm": 2.618307605781817, "kl": 0.06787109375, "learning_rate": 9.98638786001848e-07, "loss": 0.0027, "reward": 2.161562442779541, "reward_std": 0.042381130158901215, "rewards/accuracy_reward": 0.974062442779541, "rewards/format_reward": 1.0, "step": 2327 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.375, "epoch": 0.023503281171125694, "grad_norm": 4.1903883643323345, "kl": 0.0869140625, "learning_rate": 9.986376163517871e-07, "loss": 0.0035, "reward": 2.072781562805176, "reward_std": 0.028068162500858307, "rewards/accuracy_reward": 0.879031240940094, "rewards/format_reward": 1.0, "step": 2328 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 453.5, "epoch": 0.023513377082281677, "grad_norm": 1.9696483406227114, "kl": 0.060302734375, "learning_rate": 9.986364462001053e-07, "loss": 0.0024, "reward": 1.5597811937332153, "reward_std": 0.03072594292461872, "rewards/accuracy_reward": 0.47853127121925354, "rewards/format_reward": 1.0, "step": 2329 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.6875, "epoch": 0.02352347299343766, "grad_norm": 2.170371527722383, "kl": 0.0673828125, "learning_rate": 9.986352755468034e-07, "loss": 0.0027, "reward": 1.9178435802459717, "reward_std": 0.14828629791736603, "rewards/accuracy_reward": 0.7553437948226929, "rewards/format_reward": 1.0, "step": 2330 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 409.78125, "epoch": 0.02353356890459364, "grad_norm": 1.6064526269980208, "kl": 0.07177734375, "learning_rate": 9.986341043918829e-07, "loss": 0.0029, "reward": 1.8443750143051147, "reward_std": 0.006134305614978075, "rewards/accuracy_reward": 0.6943749189376831, "rewards/format_reward": 1.0, "step": 2331 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.03125, "epoch": 0.023543664815749623, "grad_norm": 2.1485078130968134, "kl": 0.07275390625, "learning_rate": 9.98632932735345e-07, "loss": 0.0029, "reward": 2.160749912261963, "reward_std": 0.008301623165607452, "rewards/accuracy_reward": 0.9607500433921814, "rewards/format_reward": 1.0, "step": 2332 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.21875, "epoch": 0.0235537607269056, "grad_norm": 7.4883205907531165, "kl": 0.057861328125, "learning_rate": 9.986317605771901e-07, "loss": 0.0023, "reward": 1.8109064102172852, "reward_std": 0.024520207196474075, "rewards/accuracy_reward": 0.6671562194824219, "rewards/format_reward": 1.0, "step": 2333 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.09375, "epoch": 0.023563856638061584, "grad_norm": 3.334970553394541, "kl": 0.068359375, "learning_rate": 9.986305879174205e-07, "loss": 0.0027, "reward": 2.1845312118530273, "reward_std": 0.02505059540271759, "rewards/accuracy_reward": 0.990781307220459, "rewards/format_reward": 1.0, "step": 2334 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.5, "epoch": 0.023573952549217566, "grad_norm": 2.793230350651501, "kl": 0.06689453125, "learning_rate": 9.986294147560367e-07, "loss": 0.0027, "reward": 2.102656364440918, "reward_std": 0.010022778064012527, "rewards/accuracy_reward": 0.9026561975479126, "rewards/format_reward": 1.0, "step": 2335 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.625, "epoch": 0.023584048460373548, "grad_norm": 12.942004395752068, "kl": 0.0751953125, "learning_rate": 9.986282410930397e-07, "loss": 0.003, "reward": 2.1022188663482666, "reward_std": 0.010006004013121128, "rewards/accuracy_reward": 0.9022188186645508, "rewards/format_reward": 1.0, "step": 2336 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.21875, "epoch": 0.02359414437152953, "grad_norm": 1.9961511603376454, "kl": 0.06494140625, "learning_rate": 9.986270669284313e-07, "loss": 0.0026, "reward": 2.177000045776367, "reward_std": 0.00808964017778635, "rewards/accuracy_reward": 0.9769999980926514, "rewards/format_reward": 1.0, "step": 2337 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 413.6875, "epoch": 0.023604240282685512, "grad_norm": 10.666731352176235, "kl": 0.06640625, "learning_rate": 9.986258922622124e-07, "loss": 0.0027, "reward": 1.683500051498413, "reward_std": 0.1632971465587616, "rewards/accuracy_reward": 0.5647500157356262, "rewards/format_reward": 1.0, "step": 2338 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.78125, "epoch": 0.023614336193841495, "grad_norm": 3.337494781528239, "kl": 0.07080078125, "learning_rate": 9.98624717094384e-07, "loss": 0.0028, "reward": 1.9008126258850098, "reward_std": 0.15273158252239227, "rewards/accuracy_reward": 0.7133125066757202, "rewards/format_reward": 1.0, "step": 2339 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.3125, "epoch": 0.023624432104997477, "grad_norm": 1.5574488404847406, "kl": 0.052978515625, "learning_rate": 9.986235414249475e-07, "loss": 0.0021, "reward": 2.1165313720703125, "reward_std": 0.021109243854880333, "rewards/accuracy_reward": 0.922781229019165, "rewards/format_reward": 1.0, "step": 2340 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.1875, "epoch": 0.02363452801615346, "grad_norm": 1.9245301176070384, "kl": 0.061279296875, "learning_rate": 9.98622365253904e-07, "loss": 0.0025, "reward": 2.084437370300293, "reward_std": 0.1116669625043869, "rewards/accuracy_reward": 0.8906874656677246, "rewards/format_reward": 1.0, "step": 2341 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.40625, "epoch": 0.02364462392730944, "grad_norm": 3.5164559018658297, "kl": 0.06494140625, "learning_rate": 9.986211885812548e-07, "loss": 0.0026, "reward": 2.1606154441833496, "reward_std": 0.004191938787698746, "rewards/accuracy_reward": 0.9606156349182129, "rewards/format_reward": 1.0, "step": 2342 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.84375, "epoch": 0.023654719838465423, "grad_norm": 2.608900554249474, "kl": 0.068359375, "learning_rate": 9.98620011407001e-07, "loss": 0.0027, "reward": 2.0384063720703125, "reward_std": 0.028688892722129822, "rewards/accuracy_reward": 0.844656229019165, "rewards/format_reward": 1.0, "step": 2343 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.0625, "epoch": 0.023664815749621402, "grad_norm": 1.2726410035736286, "kl": 0.06396484375, "learning_rate": 9.986188337311437e-07, "loss": 0.0026, "reward": 2.1227498054504395, "reward_std": 0.01847946271300316, "rewards/accuracy_reward": 0.9290000200271606, "rewards/format_reward": 1.0, "step": 2344 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 383.53125, "epoch": 0.023674911660777384, "grad_norm": 2.219522039009062, "kl": 0.0712890625, "learning_rate": 9.98617655553684e-07, "loss": 0.0028, "reward": 1.8245000839233398, "reward_std": 0.05441012978553772, "rewards/accuracy_reward": 0.6932500600814819, "rewards/format_reward": 1.0, "step": 2345 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.125, "epoch": 0.023685007571933366, "grad_norm": 3.0614007012138225, "kl": 0.06689453125, "learning_rate": 9.986164768746237e-07, "loss": 0.0027, "reward": 1.7577500343322754, "reward_std": 0.010414744727313519, "rewards/accuracy_reward": 0.6077499389648438, "rewards/format_reward": 1.0, "step": 2346 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 386.46875, "epoch": 0.02369510348308935, "grad_norm": 3.6664300731553596, "kl": 0.076171875, "learning_rate": 9.986152976939633e-07, "loss": 0.0031, "reward": 1.8109219074249268, "reward_std": 0.016024714335799217, "rewards/accuracy_reward": 0.6609219312667847, "rewards/format_reward": 1.0, "step": 2347 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.0625, "epoch": 0.02370519939424533, "grad_norm": 2.8512390618181223, "kl": 0.0849609375, "learning_rate": 9.986141180117041e-07, "loss": 0.0034, "reward": 2.0912184715270996, "reward_std": 0.029811210930347443, "rewards/accuracy_reward": 0.8974686861038208, "rewards/format_reward": 1.0, "step": 2348 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.0625, "epoch": 0.023715295305401313, "grad_norm": 3.1323952233645334, "kl": 0.0751953125, "learning_rate": 9.986129378278475e-07, "loss": 0.003, "reward": 2.004812479019165, "reward_std": 0.008656734600663185, "rewards/accuracy_reward": 0.8048125505447388, "rewards/format_reward": 1.0, "step": 2349 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.023725391216557295, "grad_norm": 1.837535312031801, "kl": 0.06396484375, "learning_rate": 9.986117571423947e-07, "loss": 0.0026, "reward": 2.144218921661377, "reward_std": 0.027378009632229805, "rewards/accuracy_reward": 0.9504687190055847, "rewards/format_reward": 1.0, "step": 2350 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.40625, "epoch": 0.023735487127713277, "grad_norm": 1.8862595621846998, "kl": 0.05322265625, "learning_rate": 9.986105759553466e-07, "loss": 0.0021, "reward": 1.5311250686645508, "reward_std": 0.0933704599738121, "rewards/accuracy_reward": 0.4311249852180481, "rewards/format_reward": 1.0, "step": 2351 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.875, "epoch": 0.02374558303886926, "grad_norm": 1.945108273729523, "kl": 0.06884765625, "learning_rate": 9.986093942667047e-07, "loss": 0.0028, "reward": 2.1457812786102295, "reward_std": 0.018916379660367966, "rewards/accuracy_reward": 0.9520312547683716, "rewards/format_reward": 1.0, "step": 2352 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.34375, "epoch": 0.02375567895002524, "grad_norm": 3.6545055930583805, "kl": 0.06591796875, "learning_rate": 9.9860821207647e-07, "loss": 0.0026, "reward": 1.8688437938690186, "reward_std": 0.008122086524963379, "rewards/accuracy_reward": 0.7188436985015869, "rewards/format_reward": 1.0, "step": 2353 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 388.40625, "epoch": 0.02376577486118122, "grad_norm": 2.0518620192656902, "kl": 0.076171875, "learning_rate": 9.986070293846436e-07, "loss": 0.003, "reward": 1.8748750686645508, "reward_std": 0.0058408272452652454, "rewards/accuracy_reward": 0.7248749732971191, "rewards/format_reward": 1.0, "step": 2354 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.84375, "epoch": 0.023775870772337202, "grad_norm": 2.2198096930708373, "kl": 0.08251953125, "learning_rate": 9.98605846191227e-07, "loss": 0.0033, "reward": 2.1066250801086426, "reward_std": 0.010849324986338615, "rewards/accuracy_reward": 0.9066250324249268, "rewards/format_reward": 1.0, "step": 2355 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.0625, "epoch": 0.023785966683493184, "grad_norm": 3.1970795940473677, "kl": 0.072265625, "learning_rate": 9.986046624962214e-07, "loss": 0.0029, "reward": 2.01771879196167, "reward_std": 0.026943467557430267, "rewards/accuracy_reward": 0.823968768119812, "rewards/format_reward": 1.0, "step": 2356 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.625, "epoch": 0.023796062594649166, "grad_norm": 1.4782821097272165, "kl": 0.05322265625, "learning_rate": 9.986034782996275e-07, "loss": 0.0021, "reward": 1.78125, "reward_std": 0.17294242978096008, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2357 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.78125, "epoch": 0.02380615850580515, "grad_norm": 2.7637174717691453, "kl": 0.083984375, "learning_rate": 9.986022936014472e-07, "loss": 0.0034, "reward": 2.127218723297119, "reward_std": 0.015450092032551765, "rewards/accuracy_reward": 0.9272187352180481, "rewards/format_reward": 1.0, "step": 2358 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.875, "epoch": 0.02381625441696113, "grad_norm": 1.446992433735482, "kl": 0.0654296875, "learning_rate": 9.98601108401681e-07, "loss": 0.0026, "reward": 2.0405001640319824, "reward_std": 0.010586300864815712, "rewards/accuracy_reward": 0.840499997138977, "rewards/format_reward": 1.0, "step": 2359 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.03125, "epoch": 0.023826350328117113, "grad_norm": 2.6463360732928973, "kl": 0.07470703125, "learning_rate": 9.985999227003304e-07, "loss": 0.003, "reward": 2.086750030517578, "reward_std": 0.0382281169295311, "rewards/accuracy_reward": 0.8992500305175781, "rewards/format_reward": 1.0, "step": 2360 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.5, "epoch": 0.023836446239273095, "grad_norm": 2.943112171176473, "kl": 0.08203125, "learning_rate": 9.985987364973967e-07, "loss": 0.0033, "reward": 2.0552186965942383, "reward_std": 0.018421979621052742, "rewards/accuracy_reward": 0.855218768119812, "rewards/format_reward": 1.0, "step": 2361 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 384.40625, "epoch": 0.023846542150429077, "grad_norm": 1.9186319995854753, "kl": 0.076171875, "learning_rate": 9.985975497928808e-07, "loss": 0.003, "reward": 1.833031177520752, "reward_std": 0.02051207795739174, "rewards/accuracy_reward": 0.6892812252044678, "rewards/format_reward": 1.0, "step": 2362 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.15625, "epoch": 0.02385663806158506, "grad_norm": 2.198562500918523, "kl": 0.0771484375, "learning_rate": 9.985963625867842e-07, "loss": 0.0031, "reward": 2.055468797683716, "reward_std": 0.006248275749385357, "rewards/accuracy_reward": 0.85546875, "rewards/format_reward": 1.0, "step": 2363 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.96875, "epoch": 0.02386673397274104, "grad_norm": 2.4661132265052985, "kl": 0.061767578125, "learning_rate": 9.98595174879108e-07, "loss": 0.0025, "reward": 1.806999921798706, "reward_std": 0.00288778031244874, "rewards/accuracy_reward": 0.656999945640564, "rewards/format_reward": 1.0, "step": 2364 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.75, "epoch": 0.02387682988389702, "grad_norm": 2.068523340994724, "kl": 0.061767578125, "learning_rate": 9.985939866698534e-07, "loss": 0.0025, "reward": 1.613234281539917, "reward_std": 0.10420605540275574, "rewards/accuracy_reward": 0.5132343769073486, "rewards/format_reward": 1.0, "step": 2365 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.90625, "epoch": 0.023886925795053002, "grad_norm": 1.4958880452421053, "kl": 0.0673828125, "learning_rate": 9.985927979590214e-07, "loss": 0.0027, "reward": 1.8177499771118164, "reward_std": 0.018753353506326675, "rewards/accuracy_reward": 0.6677500009536743, "rewards/format_reward": 1.0, "step": 2366 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.8125, "epoch": 0.023897021706208985, "grad_norm": 2.355031903266981, "kl": 0.07958984375, "learning_rate": 9.985916087466135e-07, "loss": 0.0032, "reward": 2.158843755722046, "reward_std": 0.012581443414092064, "rewards/accuracy_reward": 0.9588437676429749, "rewards/format_reward": 1.0, "step": 2367 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.25, "epoch": 0.023907117617364967, "grad_norm": 3.1831983250778673, "kl": 0.087890625, "learning_rate": 9.985904190326308e-07, "loss": 0.0035, "reward": 2.041468620300293, "reward_std": 0.015428250655531883, "rewards/accuracy_reward": 0.8414688110351562, "rewards/format_reward": 1.0, "step": 2368 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.90625, "epoch": 0.02391721352852095, "grad_norm": 2.239485202370729, "kl": 0.0732421875, "learning_rate": 9.985892288170744e-07, "loss": 0.0029, "reward": 1.844156265258789, "reward_std": 0.009350541979074478, "rewards/accuracy_reward": 0.694156289100647, "rewards/format_reward": 1.0, "step": 2369 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.15625, "epoch": 0.02392730943967693, "grad_norm": 1.7050996892289227, "kl": 0.06884765625, "learning_rate": 9.985880380999453e-07, "loss": 0.0028, "reward": 2.1774845123291016, "reward_std": 0.021290941163897514, "rewards/accuracy_reward": 0.9837344288825989, "rewards/format_reward": 1.0, "step": 2370 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.40625, "epoch": 0.023937405350832913, "grad_norm": 3.507972834918963, "kl": 0.0849609375, "learning_rate": 9.985868468812452e-07, "loss": 0.0034, "reward": 1.967343807220459, "reward_std": 0.016236331313848495, "rewards/accuracy_reward": 0.7673437595367432, "rewards/format_reward": 1.0, "step": 2371 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.6875, "epoch": 0.023947501261988895, "grad_norm": 3.200202887569994, "kl": 0.0732421875, "learning_rate": 9.98585655160975e-07, "loss": 0.0029, "reward": 1.85546875, "reward_std": 0.010031454265117645, "rewards/accuracy_reward": 0.7054687142372131, "rewards/format_reward": 1.0, "step": 2372 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.6875, "epoch": 0.023957597173144878, "grad_norm": 11.153098916786261, "kl": 0.0791015625, "learning_rate": 9.98584462939136e-07, "loss": 0.0032, "reward": 2.1288437843322754, "reward_std": 0.011433403007686138, "rewards/accuracy_reward": 0.9288437366485596, "rewards/format_reward": 1.0, "step": 2373 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 386.84375, "epoch": 0.02396769308430086, "grad_norm": 2.21770242387048, "kl": 0.059814453125, "learning_rate": 9.985832702157293e-07, "loss": 0.0024, "reward": 1.8409688472747803, "reward_std": 0.035748548805713654, "rewards/accuracy_reward": 0.6972187161445618, "rewards/format_reward": 1.0, "step": 2374 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.21875, "epoch": 0.02397778899545684, "grad_norm": 1.9953705764854632, "kl": 0.06591796875, "learning_rate": 9.98582076990756e-07, "loss": 0.0026, "reward": 1.9005937576293945, "reward_std": 0.0962805300951004, "rewards/accuracy_reward": 0.7505937814712524, "rewards/format_reward": 1.0, "step": 2375 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.03125, "epoch": 0.02398788490661282, "grad_norm": 5.604029422345426, "kl": 0.08154296875, "learning_rate": 9.985808832642176e-07, "loss": 0.0033, "reward": 2.0010626316070557, "reward_std": 0.009754039347171783, "rewards/accuracy_reward": 0.8010624647140503, "rewards/format_reward": 1.0, "step": 2376 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 404.65625, "epoch": 0.023997980817768803, "grad_norm": 2.572851551639012, "kl": 0.0654296875, "learning_rate": 9.98579689036115e-07, "loss": 0.0026, "reward": 1.8709686994552612, "reward_std": 0.015169557183980942, "rewards/accuracy_reward": 0.7209687232971191, "rewards/format_reward": 1.0, "step": 2377 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.024008076728924785, "grad_norm": 2.543079320308924, "kl": 0.0869140625, "learning_rate": 9.985784943064497e-07, "loss": 0.0035, "reward": 2.073531150817871, "reward_std": 0.019056491553783417, "rewards/accuracy_reward": 0.8735312223434448, "rewards/format_reward": 1.0, "step": 2378 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.8125, "epoch": 0.024018172640080767, "grad_norm": 1.9518503728351602, "kl": 0.0908203125, "learning_rate": 9.985772990752226e-07, "loss": 0.0036, "reward": 1.9756250381469727, "reward_std": 0.011730052530765533, "rewards/accuracy_reward": 0.7756249308586121, "rewards/format_reward": 1.0, "step": 2379 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.9375, "epoch": 0.02402826855123675, "grad_norm": 24.845733096743626, "kl": 0.07861328125, "learning_rate": 9.985761033424352e-07, "loss": 0.0031, "reward": 2.087031364440918, "reward_std": 0.12102025002241135, "rewards/accuracy_reward": 0.8932812213897705, "rewards/format_reward": 1.0, "step": 2380 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.78125, "epoch": 0.02403836446239273, "grad_norm": 5.653322099396271, "kl": 0.07666015625, "learning_rate": 9.985749071080884e-07, "loss": 0.0031, "reward": 2.102843761444092, "reward_std": 0.010818383656442165, "rewards/accuracy_reward": 0.902843713760376, "rewards/format_reward": 1.0, "step": 2381 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.8125, "epoch": 0.024048460373548713, "grad_norm": 4.374270720792365, "kl": 0.0751953125, "learning_rate": 9.985737103721837e-07, "loss": 0.003, "reward": 2.014719009399414, "reward_std": 0.0278609711676836, "rewards/accuracy_reward": 0.820968747138977, "rewards/format_reward": 1.0, "step": 2382 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.4375, "epoch": 0.024058556284704696, "grad_norm": 8.427606191246845, "kl": 0.083984375, "learning_rate": 9.98572513134722e-07, "loss": 0.0034, "reward": 2.011531352996826, "reward_std": 0.017078954726457596, "rewards/accuracy_reward": 0.8115311861038208, "rewards/format_reward": 1.0, "step": 2383 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.875, "epoch": 0.024068652195860678, "grad_norm": 1.8251019728681552, "kl": 0.08203125, "learning_rate": 9.985713153957047e-07, "loss": 0.0033, "reward": 2.1654062271118164, "reward_std": 0.011467852629721165, "rewards/accuracy_reward": 0.9654062986373901, "rewards/format_reward": 1.0, "step": 2384 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 391.90625, "epoch": 0.02407874810701666, "grad_norm": 2.0168808975296253, "kl": 0.083984375, "learning_rate": 9.98570117155133e-07, "loss": 0.0034, "reward": 1.7382187843322754, "reward_std": 0.02205590158700943, "rewards/accuracy_reward": 0.5944687128067017, "rewards/format_reward": 1.0, "step": 2385 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.8125, "epoch": 0.02408884401817264, "grad_norm": 1.7582075284109984, "kl": 0.06298828125, "learning_rate": 9.985689184130081e-07, "loss": 0.0025, "reward": 1.7257500886917114, "reward_std": 0.17823928594589233, "rewards/accuracy_reward": 0.6069999933242798, "rewards/format_reward": 1.0, "step": 2386 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.9375, "epoch": 0.02409893992932862, "grad_norm": 2.574381456430458, "kl": 0.0751953125, "learning_rate": 9.985677191693312e-07, "loss": 0.003, "reward": 1.8133749961853027, "reward_std": 0.03167978674173355, "rewards/accuracy_reward": 0.6696250438690186, "rewards/format_reward": 1.0, "step": 2387 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.28125, "epoch": 0.024109035840484603, "grad_norm": 3.456463776733543, "kl": 0.0791015625, "learning_rate": 9.985665194241035e-07, "loss": 0.0032, "reward": 2.178562641143799, "reward_std": 0.012616187334060669, "rewards/accuracy_reward": 0.9785624742507935, "rewards/format_reward": 1.0, "step": 2388 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.0, "epoch": 0.024119131751640585, "grad_norm": 2.781044104384751, "kl": 0.08544921875, "learning_rate": 9.98565319177326e-07, "loss": 0.0034, "reward": 2.1570310592651367, "reward_std": 0.010575290769338608, "rewards/accuracy_reward": 0.95703125, "rewards/format_reward": 1.0, "step": 2389 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 384.8125, "epoch": 0.024129227662796567, "grad_norm": 5.260233607421754, "kl": 0.07958984375, "learning_rate": 9.985641184290003e-07, "loss": 0.0032, "reward": 1.8561251163482666, "reward_std": 0.031076157465577126, "rewards/accuracy_reward": 0.7123749852180481, "rewards/format_reward": 1.0, "step": 2390 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 396.75, "epoch": 0.02413932357395255, "grad_norm": 3.378835547854174, "kl": 0.08837890625, "learning_rate": 9.985629171791273e-07, "loss": 0.0035, "reward": 1.8197813034057617, "reward_std": 0.0169757679104805, "rewards/accuracy_reward": 0.6697812676429749, "rewards/format_reward": 1.0, "step": 2391 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 426.34375, "epoch": 0.02414941948510853, "grad_norm": 2.286757249222581, "kl": 0.0712890625, "learning_rate": 9.985617154277084e-07, "loss": 0.0028, "reward": 1.8452500104904175, "reward_std": 0.026116710156202316, "rewards/accuracy_reward": 0.7015000581741333, "rewards/format_reward": 1.0, "step": 2392 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 406.09375, "epoch": 0.024159515396264514, "grad_norm": 0.9765744483326675, "kl": 0.052978515625, "learning_rate": 9.985605131747445e-07, "loss": 0.0021, "reward": 1.59375, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2393 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.125, "epoch": 0.024169611307420496, "grad_norm": 1.6396947148424588, "kl": 0.0751953125, "learning_rate": 9.985593104202372e-07, "loss": 0.003, "reward": 2.1489062309265137, "reward_std": 0.005456455051898956, "rewards/accuracy_reward": 0.9489062428474426, "rewards/format_reward": 1.0, "step": 2394 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.46875, "epoch": 0.024179707218576478, "grad_norm": 4.331461755553234, "kl": 0.07373046875, "learning_rate": 9.985581071641876e-07, "loss": 0.003, "reward": 1.8454062938690186, "reward_std": 0.024533476680517197, "rewards/accuracy_reward": 0.7016562223434448, "rewards/format_reward": 1.0, "step": 2395 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.59375, "epoch": 0.024189803129732457, "grad_norm": 9.058015580144893, "kl": 0.0693359375, "learning_rate": 9.985569034065969e-07, "loss": 0.0028, "reward": 2.1246562004089355, "reward_std": 0.11433114856481552, "rewards/accuracy_reward": 0.9309062957763672, "rewards/format_reward": 1.0, "step": 2396 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.71875, "epoch": 0.02419989904088844, "grad_norm": 14.055345716157005, "kl": 0.0791015625, "learning_rate": 9.98555699147466e-07, "loss": 0.0032, "reward": 1.8459374904632568, "reward_std": 0.017221612855792046, "rewards/accuracy_reward": 0.6959375143051147, "rewards/format_reward": 1.0, "step": 2397 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.0625, "epoch": 0.02420999495204442, "grad_norm": 2.9404433148976463, "kl": 0.07861328125, "learning_rate": 9.985544943867967e-07, "loss": 0.0031, "reward": 2.132781505584717, "reward_std": 0.01958390697836876, "rewards/accuracy_reward": 0.9327812194824219, "rewards/format_reward": 1.0, "step": 2398 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.46875, "epoch": 0.024220090863200403, "grad_norm": 2.2994942223288177, "kl": 0.07763671875, "learning_rate": 9.985532891245898e-07, "loss": 0.0031, "reward": 2.0980312824249268, "reward_std": 0.012400808744132519, "rewards/accuracy_reward": 0.8980312347412109, "rewards/format_reward": 1.0, "step": 2399 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.78125, "epoch": 0.024230186774356385, "grad_norm": 1.7229190191640253, "kl": 0.07373046875, "learning_rate": 9.985520833608464e-07, "loss": 0.003, "reward": 2.047281265258789, "reward_std": 0.0068480512127280235, "rewards/accuracy_reward": 0.8472812175750732, "rewards/format_reward": 1.0, "step": 2400 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.625, "epoch": 0.024240282685512368, "grad_norm": 0.9569233558401367, "kl": 0.056640625, "learning_rate": 9.98550877095568e-07, "loss": 0.0023, "reward": 1.912500023841858, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2401 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.02425037859666835, "grad_norm": 2.0343810839304264, "kl": 0.08837890625, "learning_rate": 9.985496703287557e-07, "loss": 0.0035, "reward": 2.044468879699707, "reward_std": 0.013289064168930054, "rewards/accuracy_reward": 0.8444687724113464, "rewards/format_reward": 1.0, "step": 2402 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.125, "epoch": 0.024260474507824332, "grad_norm": 3.754451734802867, "kl": 0.083984375, "learning_rate": 9.985484630604108e-07, "loss": 0.0034, "reward": 2.1535937786102295, "reward_std": 0.01924367994070053, "rewards/accuracy_reward": 0.9535937309265137, "rewards/format_reward": 1.0, "step": 2403 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.25, "epoch": 0.024270570418980314, "grad_norm": 2.1207461442175024, "kl": 0.0810546875, "learning_rate": 9.985472552905345e-07, "loss": 0.0032, "reward": 2.1888437271118164, "reward_std": 0.004834786057472229, "rewards/accuracy_reward": 0.9888437986373901, "rewards/format_reward": 1.0, "step": 2404 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.8125, "epoch": 0.024280666330136296, "grad_norm": 5.85246685774252, "kl": 0.08056640625, "learning_rate": 9.98546047019128e-07, "loss": 0.0032, "reward": 2.0502188205718994, "reward_std": 0.022842247039079666, "rewards/accuracy_reward": 0.8502187728881836, "rewards/format_reward": 1.0, "step": 2405 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 400.78125, "epoch": 0.02429076224129228, "grad_norm": 1.8992244599679546, "kl": 0.05859375, "learning_rate": 9.985448382461925e-07, "loss": 0.0023, "reward": 1.8802499771118164, "reward_std": 0.003427827265113592, "rewards/accuracy_reward": 0.7302500009536743, "rewards/format_reward": 1.0, "step": 2406 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.40625, "epoch": 0.024300858152448257, "grad_norm": 1.5792682159377855, "kl": 0.07861328125, "learning_rate": 9.98543628971729e-07, "loss": 0.0031, "reward": 2.0472500324249268, "reward_std": 0.006553805898874998, "rewards/accuracy_reward": 0.8472499847412109, "rewards/format_reward": 1.0, "step": 2407 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.4375, "epoch": 0.02431095406360424, "grad_norm": 2.4071574870750805, "kl": 0.07080078125, "learning_rate": 9.98542419195739e-07, "loss": 0.0028, "reward": 2.1060001850128174, "reward_std": 0.025352023541927338, "rewards/accuracy_reward": 0.9122500419616699, "rewards/format_reward": 1.0, "step": 2408 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.28125, "epoch": 0.02432104997476022, "grad_norm": 1.5208079985711485, "kl": 0.06982421875, "learning_rate": 9.985412089182238e-07, "loss": 0.0028, "reward": 2.1354689598083496, "reward_std": 0.004742510616779327, "rewards/accuracy_reward": 0.9354687333106995, "rewards/format_reward": 1.0, "step": 2409 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.6875, "epoch": 0.024331145885916203, "grad_norm": 2.5683196272684103, "kl": 0.0888671875, "learning_rate": 9.985399981391843e-07, "loss": 0.0036, "reward": 2.165156364440918, "reward_std": 0.005992860998958349, "rewards/accuracy_reward": 0.9651562571525574, "rewards/format_reward": 1.0, "step": 2410 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 423.5, "epoch": 0.024341241797072186, "grad_norm": 1.0485588313584338, "kl": 0.052001953125, "learning_rate": 9.98538786858622e-07, "loss": 0.0021, "reward": 1.5661561489105225, "reward_std": 0.004724449478089809, "rewards/accuracy_reward": 0.4661562442779541, "rewards/format_reward": 1.0, "step": 2411 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.125, "epoch": 0.024351337708228168, "grad_norm": 3.7163432601681317, "kl": 0.0751953125, "learning_rate": 9.985375750765378e-07, "loss": 0.003, "reward": 1.8035311698913574, "reward_std": 0.009626949205994606, "rewards/accuracy_reward": 0.6535313129425049, "rewards/format_reward": 1.0, "step": 2412 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.625, "epoch": 0.02436143361938415, "grad_norm": 3.0640080278014423, "kl": 0.0791015625, "learning_rate": 9.985363627929332e-07, "loss": 0.0031, "reward": 1.796468734741211, "reward_std": 0.013229557313024998, "rewards/accuracy_reward": 0.6464687585830688, "rewards/format_reward": 1.0, "step": 2413 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.46875, "epoch": 0.024371529530540132, "grad_norm": 2.156737291780229, "kl": 0.06298828125, "learning_rate": 9.985351500078095e-07, "loss": 0.0025, "reward": 1.8857500553131104, "reward_std": 0.007371518760919571, "rewards/accuracy_reward": 0.7357499599456787, "rewards/format_reward": 1.0, "step": 2414 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.4375, "epoch": 0.024381625441696114, "grad_norm": 1.678095251871546, "kl": 0.07373046875, "learning_rate": 9.985339367211674e-07, "loss": 0.0029, "reward": 2.0310311317443848, "reward_std": 0.014616591855883598, "rewards/accuracy_reward": 0.8310312032699585, "rewards/format_reward": 1.0, "step": 2415 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 377.53125, "epoch": 0.024391721352852096, "grad_norm": 1.032868724070519, "kl": 0.0673828125, "learning_rate": 9.985327229330087e-07, "loss": 0.0027, "reward": 1.841499924659729, "reward_std": 0.012526415288448334, "rewards/accuracy_reward": 0.6915000081062317, "rewards/format_reward": 1.0, "step": 2416 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.0, "epoch": 0.024401817264008075, "grad_norm": 3.088628595231343, "kl": 0.05810546875, "learning_rate": 9.985315086433344e-07, "loss": 0.0023, "reward": 1.9197187423706055, "reward_std": 0.13416767120361328, "rewards/accuracy_reward": 0.7634687423706055, "rewards/format_reward": 1.0, "step": 2417 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.96875, "epoch": 0.024411913175164057, "grad_norm": 1.9954886787752726, "kl": 0.07080078125, "learning_rate": 9.985302938521456e-07, "loss": 0.0028, "reward": 2.145250082015991, "reward_std": 0.029583118855953217, "rewards/accuracy_reward": 0.9514999985694885, "rewards/format_reward": 1.0, "step": 2418 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.02442200908632004, "grad_norm": 2.8590149299035668, "kl": 0.07666015625, "learning_rate": 9.985290785594435e-07, "loss": 0.0031, "reward": 2.127687454223633, "reward_std": 0.01606416143476963, "rewards/accuracy_reward": 0.9276875257492065, "rewards/format_reward": 1.0, "step": 2419 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.1875, "epoch": 0.02443210499747602, "grad_norm": 2.009923659432395, "kl": 0.06982421875, "learning_rate": 9.985278627652298e-07, "loss": 0.0028, "reward": 2.1481873989105225, "reward_std": 0.008374281227588654, "rewards/accuracy_reward": 0.948187530040741, "rewards/format_reward": 1.0, "step": 2420 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.25, "epoch": 0.024442200908632004, "grad_norm": 3.0973090907407506, "kl": 0.08203125, "learning_rate": 9.985266464695052e-07, "loss": 0.0033, "reward": 2.0690627098083496, "reward_std": 0.01978858932852745, "rewards/accuracy_reward": 0.8690624833106995, "rewards/format_reward": 1.0, "step": 2421 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.0625, "epoch": 0.024452296819787986, "grad_norm": 2.133906661526919, "kl": 0.078125, "learning_rate": 9.98525429672271e-07, "loss": 0.0031, "reward": 2.142531394958496, "reward_std": 0.008267208002507687, "rewards/accuracy_reward": 0.942531168460846, "rewards/format_reward": 1.0, "step": 2422 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.90625, "epoch": 0.024462392730943968, "grad_norm": 2.8546063377993582, "kl": 0.09375, "learning_rate": 9.985242123735286e-07, "loss": 0.0037, "reward": 2.1297812461853027, "reward_std": 0.015170993283390999, "rewards/accuracy_reward": 0.9297811985015869, "rewards/format_reward": 1.0, "step": 2423 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.96875, "epoch": 0.02447248864209995, "grad_norm": 3.176734005367824, "kl": 0.0830078125, "learning_rate": 9.985229945732793e-07, "loss": 0.0033, "reward": 1.9999688863754272, "reward_std": 0.0209501925855875, "rewards/accuracy_reward": 0.7999687194824219, "rewards/format_reward": 1.0, "step": 2424 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.8125, "epoch": 0.024482584553255932, "grad_norm": 0.09328482909516901, "kl": 0.0556640625, "learning_rate": 9.98521776271524e-07, "loss": 0.0022, "reward": 1.8859999179840088, "reward_std": 0.0, "rewards/accuracy_reward": 0.7360000014305115, "rewards/format_reward": 1.0, "step": 2425 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.9375, "epoch": 0.024492680464411914, "grad_norm": 2.708182208632286, "kl": 0.08349609375, "learning_rate": 9.985205574682641e-07, "loss": 0.0033, "reward": 2.0813751220703125, "reward_std": 0.017291422933340073, "rewards/accuracy_reward": 0.8813750147819519, "rewards/format_reward": 1.0, "step": 2426 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 420.875, "epoch": 0.024502776375567897, "grad_norm": 2.2782223613955983, "kl": 0.08203125, "learning_rate": 9.985193381635012e-07, "loss": 0.0033, "reward": 1.8165156841278076, "reward_std": 0.02052798680961132, "rewards/accuracy_reward": 0.666515588760376, "rewards/format_reward": 1.0, "step": 2427 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.46875, "epoch": 0.024512872286723875, "grad_norm": 2.312285554832468, "kl": 0.091796875, "learning_rate": 9.985181183572356e-07, "loss": 0.0037, "reward": 2.104062557220459, "reward_std": 0.012639984488487244, "rewards/accuracy_reward": 0.9040625095367432, "rewards/format_reward": 1.0, "step": 2428 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.375, "epoch": 0.024522968197879857, "grad_norm": 1.049249617651878, "kl": 0.06884765625, "learning_rate": 9.985168980494694e-07, "loss": 0.0028, "reward": 1.8595937490463257, "reward_std": 0.004551171790808439, "rewards/accuracy_reward": 0.7095937728881836, "rewards/format_reward": 1.0, "step": 2429 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.09375, "epoch": 0.02453306410903584, "grad_norm": 5.7032547864481895, "kl": 0.07861328125, "learning_rate": 9.985156772402036e-07, "loss": 0.0031, "reward": 2.107250213623047, "reward_std": 0.019266974180936813, "rewards/accuracy_reward": 0.9072499871253967, "rewards/format_reward": 1.0, "step": 2430 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.3125, "epoch": 0.024543160020191822, "grad_norm": 4.7248070965554625, "kl": 0.06640625, "learning_rate": 9.98514455929439e-07, "loss": 0.0027, "reward": 2.111687660217285, "reward_std": 0.010295436717569828, "rewards/accuracy_reward": 0.9116874933242798, "rewards/format_reward": 1.0, "step": 2431 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 0.024553255931347804, "grad_norm": 3.921757489680559, "kl": 0.08203125, "learning_rate": 9.985132341171773e-07, "loss": 0.0033, "reward": 2.1502811908721924, "reward_std": 0.02396799996495247, "rewards/accuracy_reward": 0.9565312266349792, "rewards/format_reward": 1.0, "step": 2432 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 413.625, "epoch": 0.024563351842503786, "grad_norm": 4.11679033649671, "kl": 0.07470703125, "learning_rate": 9.985120118034198e-07, "loss": 0.003, "reward": 1.7322187423706055, "reward_std": 0.017786262556910515, "rewards/accuracy_reward": 0.5822187662124634, "rewards/format_reward": 1.0, "step": 2433 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 0.02457344775365977, "grad_norm": 1.6541300887275292, "kl": 0.083984375, "learning_rate": 9.985107889881673e-07, "loss": 0.0034, "reward": 1.9510936737060547, "reward_std": 0.1619134247303009, "rewards/accuracy_reward": 0.7698436975479126, "rewards/format_reward": 1.0, "step": 2434 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.28125, "epoch": 0.02458354366481575, "grad_norm": 1.7550505735056399, "kl": 0.08056640625, "learning_rate": 9.985095656714213e-07, "loss": 0.0032, "reward": 2.1257500648498535, "reward_std": 0.006332590244710445, "rewards/accuracy_reward": 0.9257500171661377, "rewards/format_reward": 1.0, "step": 2435 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.0, "epoch": 0.024593639575971733, "grad_norm": 2.817133194143248, "kl": 0.0751953125, "learning_rate": 9.98508341853183e-07, "loss": 0.003, "reward": 2.044250011444092, "reward_std": 0.033377595245838165, "rewards/accuracy_reward": 0.8504999876022339, "rewards/format_reward": 1.0, "step": 2436 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.84375, "epoch": 0.024603735487127715, "grad_norm": 2.904017187496809, "kl": 0.0791015625, "learning_rate": 9.985071175334537e-07, "loss": 0.0032, "reward": 2.0045313835144043, "reward_std": 0.06426775455474854, "rewards/accuracy_reward": 0.8045312762260437, "rewards/format_reward": 1.0, "step": 2437 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.024613831398283693, "grad_norm": 2.1368634181579576, "kl": 0.0693359375, "learning_rate": 9.985058927122345e-07, "loss": 0.0028, "reward": 1.9199374914169312, "reward_std": 0.09628628194332123, "rewards/accuracy_reward": 0.7699375152587891, "rewards/format_reward": 1.0, "step": 2438 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.84375, "epoch": 0.024623927309439676, "grad_norm": 2.5747665593549494, "kl": 0.064453125, "learning_rate": 9.985046673895265e-07, "loss": 0.0026, "reward": 2.1875627040863037, "reward_std": 0.02121061086654663, "rewards/accuracy_reward": 0.9875624775886536, "rewards/format_reward": 1.0, "step": 2439 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.53125, "epoch": 0.024634023220595658, "grad_norm": 2.878494654330415, "kl": 0.076171875, "learning_rate": 9.985034415653313e-07, "loss": 0.0031, "reward": 2.0733749866485596, "reward_std": 0.007875164970755577, "rewards/accuracy_reward": 0.8733749389648438, "rewards/format_reward": 1.0, "step": 2440 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.59375, "epoch": 0.02464411913175164, "grad_norm": 2.179097582078124, "kl": 0.09228515625, "learning_rate": 9.9850221523965e-07, "loss": 0.0037, "reward": 2.086718797683716, "reward_std": 0.014139652252197266, "rewards/accuracy_reward": 0.88671875, "rewards/format_reward": 1.0, "step": 2441 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.024654215042907622, "grad_norm": 1.6142494768979718, "kl": 0.07275390625, "learning_rate": 9.985009884124835e-07, "loss": 0.0029, "reward": 2.0768749713897705, "reward_std": 0.0032446179538965225, "rewards/accuracy_reward": 0.8768750429153442, "rewards/format_reward": 1.0, "step": 2442 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.625, "epoch": 0.024664310954063604, "grad_norm": 13.691687935753636, "kl": 0.06640625, "learning_rate": 9.984997610838335e-07, "loss": 0.0027, "reward": 2.12904691696167, "reward_std": 0.016125818714499474, "rewards/accuracy_reward": 0.9290468692779541, "rewards/format_reward": 1.0, "step": 2443 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.125, "epoch": 0.024674406865219586, "grad_norm": 1.8334910469968928, "kl": 0.06396484375, "learning_rate": 9.984985332537011e-07, "loss": 0.0026, "reward": 2.0984063148498535, "reward_std": 0.11535340547561646, "rewards/accuracy_reward": 0.904656171798706, "rewards/format_reward": 1.0, "step": 2444 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.875, "epoch": 0.02468450277637557, "grad_norm": 3.040729805877467, "kl": 0.06689453125, "learning_rate": 9.984973049220875e-07, "loss": 0.0027, "reward": 2.1284687519073486, "reward_std": 0.003631322644650936, "rewards/accuracy_reward": 0.9284688234329224, "rewards/format_reward": 1.0, "step": 2445 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 395.0, "epoch": 0.02469459868753155, "grad_norm": 2.126181950561639, "kl": 0.07568359375, "learning_rate": 9.984960760889937e-07, "loss": 0.003, "reward": 1.8562500476837158, "reward_std": 0.022769644856452942, "rewards/accuracy_reward": 0.7124999761581421, "rewards/format_reward": 1.0, "step": 2446 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.84375, "epoch": 0.024704694598687533, "grad_norm": 2.910009593582469, "kl": 0.0703125, "learning_rate": 9.984948467544213e-07, "loss": 0.0028, "reward": 2.1379687786102295, "reward_std": 0.008193500339984894, "rewards/accuracy_reward": 0.9379687309265137, "rewards/format_reward": 1.0, "step": 2447 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.71875, "epoch": 0.024714790509843515, "grad_norm": 1.9626375103175944, "kl": 0.0712890625, "learning_rate": 9.984936169183715e-07, "loss": 0.0028, "reward": 1.7706249952316284, "reward_std": 0.11230596899986267, "rewards/accuracy_reward": 0.6268749833106995, "rewards/format_reward": 1.0, "step": 2448 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 413.40625, "epoch": 0.024724886420999494, "grad_norm": 2.145267529778595, "kl": 0.0771484375, "learning_rate": 9.984923865808454e-07, "loss": 0.0031, "reward": 1.8381562232971191, "reward_std": 0.01036495715379715, "rewards/accuracy_reward": 0.688156247138977, "rewards/format_reward": 1.0, "step": 2449 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.125, "epoch": 0.024734982332155476, "grad_norm": 1.7790148067627396, "kl": 0.0654296875, "learning_rate": 9.984911557418442e-07, "loss": 0.0026, "reward": 1.7782968282699585, "reward_std": 0.010064392350614071, "rewards/accuracy_reward": 0.6282968521118164, "rewards/format_reward": 1.0, "step": 2450 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.78125, "epoch": 0.024745078243311458, "grad_norm": 2.235864532427717, "kl": 0.0771484375, "learning_rate": 9.984899244013692e-07, "loss": 0.0031, "reward": 1.9427812099456787, "reward_std": 0.009505638852715492, "rewards/accuracy_reward": 0.7427812814712524, "rewards/format_reward": 1.0, "step": 2451 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.21875, "epoch": 0.02475517415446744, "grad_norm": 1.759223200039052, "kl": 0.06689453125, "learning_rate": 9.984886925594215e-07, "loss": 0.0027, "reward": 2.150156259536743, "reward_std": 0.01387363113462925, "rewards/accuracy_reward": 0.9501562118530273, "rewards/format_reward": 1.0, "step": 2452 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.28125, "epoch": 0.024765270065623422, "grad_norm": 7.743323368498358, "kl": 0.0830078125, "learning_rate": 9.984874602160028e-07, "loss": 0.0033, "reward": 2.13603138923645, "reward_std": 0.028597569093108177, "rewards/accuracy_reward": 0.9422812461853027, "rewards/format_reward": 1.0, "step": 2453 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.6875, "epoch": 0.024775365976779404, "grad_norm": 1.5644593977630963, "kl": 0.0703125, "learning_rate": 9.98486227371114e-07, "loss": 0.0028, "reward": 2.1688125133514404, "reward_std": 0.005336448084563017, "rewards/accuracy_reward": 0.9688124656677246, "rewards/format_reward": 1.0, "step": 2454 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.024785461887935387, "grad_norm": 2.554228758843675, "kl": 0.0849609375, "learning_rate": 9.98484994024756e-07, "loss": 0.0034, "reward": 2.063406229019165, "reward_std": 0.010934852994978428, "rewards/accuracy_reward": 0.863406240940094, "rewards/format_reward": 1.0, "step": 2455 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.5, "epoch": 0.02479555779909137, "grad_norm": 2.295262665566723, "kl": 0.068359375, "learning_rate": 9.984837601769308e-07, "loss": 0.0027, "reward": 1.8493437767028809, "reward_std": 0.010667329654097557, "rewards/accuracy_reward": 0.6993438005447388, "rewards/format_reward": 1.0, "step": 2456 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.59375, "epoch": 0.02480565371024735, "grad_norm": 2.6932570031525196, "kl": 0.0771484375, "learning_rate": 9.98482525827639e-07, "loss": 0.0031, "reward": 2.1453750133514404, "reward_std": 0.026649579405784607, "rewards/accuracy_reward": 0.9516249895095825, "rewards/format_reward": 1.0, "step": 2457 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.75, "epoch": 0.024815749621403333, "grad_norm": 5.854222184585719, "kl": 0.0771484375, "learning_rate": 9.98481290976882e-07, "loss": 0.0031, "reward": 2.0536563396453857, "reward_std": 0.014903544448316097, "rewards/accuracy_reward": 0.8536561727523804, "rewards/format_reward": 1.0, "step": 2458 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.21875, "epoch": 0.024825845532559312, "grad_norm": 2.663519568763795, "kl": 0.08642578125, "learning_rate": 9.984800556246614e-07, "loss": 0.0035, "reward": 2.0450000762939453, "reward_std": 0.01504178624600172, "rewards/accuracy_reward": 0.8449999690055847, "rewards/format_reward": 1.0, "step": 2459 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.40625, "epoch": 0.024835941443715294, "grad_norm": 2.324172575813754, "kl": 0.0537109375, "learning_rate": 9.98478819770978e-07, "loss": 0.0021, "reward": 1.8331875801086426, "reward_std": 0.0064320070669054985, "rewards/accuracy_reward": 0.6831874847412109, "rewards/format_reward": 1.0, "step": 2460 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.875, "epoch": 0.024846037354871276, "grad_norm": 2.618688768243423, "kl": 0.06787109375, "learning_rate": 9.984775834158334e-07, "loss": 0.0027, "reward": 2.071312427520752, "reward_std": 0.16711212694644928, "rewards/accuracy_reward": 0.8900625109672546, "rewards/format_reward": 1.0, "step": 2461 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.0625, "epoch": 0.024856133266027258, "grad_norm": 10.047677932821175, "kl": 0.08056640625, "learning_rate": 9.984763465592285e-07, "loss": 0.0032, "reward": 2.0529065132141113, "reward_std": 0.13252437114715576, "rewards/accuracy_reward": 0.8716562390327454, "rewards/format_reward": 1.0, "step": 2462 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 405.03125, "epoch": 0.02486622917718324, "grad_norm": 18.23380891151765, "kl": 0.06494140625, "learning_rate": 9.984751092011647e-07, "loss": 0.0026, "reward": 1.87890625, "reward_std": 0.010133386589586735, "rewards/accuracy_reward": 0.7289062142372131, "rewards/format_reward": 1.0, "step": 2463 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.9375, "epoch": 0.024876325088339223, "grad_norm": 1.9896271096639926, "kl": 0.060791015625, "learning_rate": 9.984738713416434e-07, "loss": 0.0024, "reward": 2.0734686851501465, "reward_std": 0.1403588354587555, "rewards/accuracy_reward": 0.8922187685966492, "rewards/format_reward": 1.0, "step": 2464 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.8125, "epoch": 0.024886420999495205, "grad_norm": 3.0913987055094343, "kl": 0.08251953125, "learning_rate": 9.984726329806654e-07, "loss": 0.0033, "reward": 2.1303749084472656, "reward_std": 0.014981052838265896, "rewards/accuracy_reward": 0.9303749799728394, "rewards/format_reward": 1.0, "step": 2465 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 400.125, "epoch": 0.024896516910651187, "grad_norm": 1.762100607555575, "kl": 0.07373046875, "learning_rate": 9.984713941182325e-07, "loss": 0.0029, "reward": 1.8659999370574951, "reward_std": 0.005375794135034084, "rewards/accuracy_reward": 0.7160000205039978, "rewards/format_reward": 1.0, "step": 2466 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 377.8125, "epoch": 0.02490661282180717, "grad_norm": 2.1061584163898845, "kl": 0.0625, "learning_rate": 9.984701547543455e-07, "loss": 0.0025, "reward": 1.8718125820159912, "reward_std": 0.028069481253623962, "rewards/accuracy_reward": 0.7343124151229858, "rewards/format_reward": 1.0, "step": 2467 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.9375, "epoch": 0.02491670873296315, "grad_norm": 2.8623971747557206, "kl": 0.08642578125, "learning_rate": 9.98468914889006e-07, "loss": 0.0035, "reward": 2.1186563968658447, "reward_std": 0.01567704789340496, "rewards/accuracy_reward": 0.9186562895774841, "rewards/format_reward": 1.0, "step": 2468 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.59375, "epoch": 0.024926804644119133, "grad_norm": 1.665653671574234, "kl": 0.07666015625, "learning_rate": 9.98467674522215e-07, "loss": 0.0031, "reward": 2.14131236076355, "reward_std": 0.010259904898703098, "rewards/accuracy_reward": 0.9413125514984131, "rewards/format_reward": 1.0, "step": 2469 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.4375, "epoch": 0.024936900555275112, "grad_norm": 2.4497848229515893, "kl": 0.07373046875, "learning_rate": 9.98466433653974e-07, "loss": 0.003, "reward": 1.8029999732971191, "reward_std": 0.11720188707113266, "rewards/accuracy_reward": 0.659250020980835, "rewards/format_reward": 1.0, "step": 2470 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 389.46875, "epoch": 0.024946996466431094, "grad_norm": 1.5625670247584762, "kl": 0.060546875, "learning_rate": 9.984651922842837e-07, "loss": 0.0024, "reward": 1.581937551498413, "reward_std": 0.0034789072815328836, "rewards/accuracy_reward": 0.4819375276565552, "rewards/format_reward": 1.0, "step": 2471 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.0625, "epoch": 0.024957092377587076, "grad_norm": 1.6798343307790997, "kl": 0.0615234375, "learning_rate": 9.98463950413146e-07, "loss": 0.0025, "reward": 1.8910000324249268, "reward_std": 0.005774018354713917, "rewards/accuracy_reward": 0.7409999966621399, "rewards/format_reward": 1.0, "step": 2472 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.90625, "epoch": 0.02496718828874306, "grad_norm": 2.312809379753002, "kl": 0.0810546875, "learning_rate": 9.98462708040562e-07, "loss": 0.0032, "reward": 2.0939061641693115, "reward_std": 0.03223859891295433, "rewards/accuracy_reward": 0.9001562595367432, "rewards/format_reward": 1.0, "step": 2473 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.375, "epoch": 0.02497728419989904, "grad_norm": 1.5219461676126271, "kl": 0.064453125, "learning_rate": 9.984614651665323e-07, "loss": 0.0026, "reward": 2.0225937366485596, "reward_std": 0.023833302780985832, "rewards/accuracy_reward": 0.8288437128067017, "rewards/format_reward": 1.0, "step": 2474 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.59375, "epoch": 0.024987380111055023, "grad_norm": 17.47572863294277, "kl": 0.0751953125, "learning_rate": 9.984602217910592e-07, "loss": 0.003, "reward": 2.1876561641693115, "reward_std": 0.008743786253035069, "rewards/accuracy_reward": 0.9876562356948853, "rewards/format_reward": 1.0, "step": 2475 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 391.59375, "epoch": 0.024997476022211005, "grad_norm": 2.930446762207635, "kl": 0.07275390625, "learning_rate": 9.984589779141432e-07, "loss": 0.0029, "reward": 1.8103125095367432, "reward_std": 0.025095287710428238, "rewards/accuracy_reward": 0.6665624976158142, "rewards/format_reward": 1.0, "step": 2476 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.28125, "epoch": 0.025007571933366987, "grad_norm": 2.9605920575013425, "kl": 0.08984375, "learning_rate": 9.984577335357858e-07, "loss": 0.0036, "reward": 2.0661873817443848, "reward_std": 0.010553011670708656, "rewards/accuracy_reward": 0.8661875128746033, "rewards/format_reward": 1.0, "step": 2477 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.09375, "epoch": 0.02501766784452297, "grad_norm": 2.145378790780412, "kl": 0.0830078125, "learning_rate": 9.984564886559882e-07, "loss": 0.0033, "reward": 2.084796905517578, "reward_std": 0.0285276398062706, "rewards/accuracy_reward": 0.8910468816757202, "rewards/format_reward": 1.0, "step": 2478 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 394.34375, "epoch": 0.02502776375567895, "grad_norm": 2.8593264975976638, "kl": 0.0693359375, "learning_rate": 9.984552432747517e-07, "loss": 0.0028, "reward": 1.5233125686645508, "reward_std": 0.009186807088553905, "rewards/accuracy_reward": 0.4233125150203705, "rewards/format_reward": 1.0, "step": 2479 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.03125, "epoch": 0.02503785966683493, "grad_norm": 8.864062038529111, "kl": 0.08056640625, "learning_rate": 9.984539973920775e-07, "loss": 0.0032, "reward": 2.107874870300293, "reward_std": 0.032994046807289124, "rewards/accuracy_reward": 0.9141249656677246, "rewards/format_reward": 1.0, "step": 2480 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.65625, "epoch": 0.025047955577990912, "grad_norm": 0.8442076210043672, "kl": 0.05322265625, "learning_rate": 9.98452751007967e-07, "loss": 0.0021, "reward": 2.1937499046325684, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2481 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.96875, "epoch": 0.025058051489146894, "grad_norm": 2.003600465125513, "kl": 0.0732421875, "learning_rate": 9.984515041224214e-07, "loss": 0.0029, "reward": 2.174781322479248, "reward_std": 0.03950440138578415, "rewards/accuracy_reward": 0.9872812628746033, "rewards/format_reward": 1.0, "step": 2482 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.09375, "epoch": 0.025068147400302877, "grad_norm": 2.0497847256053934, "kl": 0.0693359375, "learning_rate": 9.984502567354417e-07, "loss": 0.0028, "reward": 2.1494686603546143, "reward_std": 0.013553818687796593, "rewards/accuracy_reward": 0.9494687914848328, "rewards/format_reward": 1.0, "step": 2483 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.3125, "epoch": 0.02507824331145886, "grad_norm": 2.727652936916719, "kl": 0.07080078125, "learning_rate": 9.984490088470295e-07, "loss": 0.0028, "reward": 2.1150312423706055, "reward_std": 0.13666832447052002, "rewards/accuracy_reward": 0.9275312423706055, "rewards/format_reward": 1.0, "step": 2484 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.02508833922261484, "grad_norm": 15.013740046128971, "kl": 0.05615234375, "learning_rate": 9.984477604571857e-07, "loss": 0.0023, "reward": 1.9954376220703125, "reward_std": 0.18592619895935059, "rewards/accuracy_reward": 0.8329375386238098, "rewards/format_reward": 1.0, "step": 2485 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.53125, "epoch": 0.025098435133770823, "grad_norm": 3.1174466208352962, "kl": 0.072265625, "learning_rate": 9.98446511565912e-07, "loss": 0.0029, "reward": 1.9498436450958252, "reward_std": 0.14746007323265076, "rewards/accuracy_reward": 0.7873437404632568, "rewards/format_reward": 1.0, "step": 2486 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.6875, "epoch": 0.025108531044926805, "grad_norm": 2.002635423070096, "kl": 0.0712890625, "learning_rate": 9.984452621732093e-07, "loss": 0.0029, "reward": 1.7901250123977661, "reward_std": 0.030163493007421494, "rewards/accuracy_reward": 0.6526250243186951, "rewards/format_reward": 1.0, "step": 2487 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.9375, "epoch": 0.025118626956082787, "grad_norm": 2.8256597027074175, "kl": 0.07275390625, "learning_rate": 9.98444012279079e-07, "loss": 0.0029, "reward": 1.8689374923706055, "reward_std": 0.024714991450309753, "rewards/accuracy_reward": 0.7251875400543213, "rewards/format_reward": 1.0, "step": 2488 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.02512872286723877, "grad_norm": 10.154599406338013, "kl": 0.06591796875, "learning_rate": 9.984427618835223e-07, "loss": 0.0026, "reward": 2.0166375637054443, "reward_std": 0.011367165483534336, "rewards/accuracy_reward": 0.8166375160217285, "rewards/format_reward": 1.0, "step": 2489 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.40625, "epoch": 0.02513881877839475, "grad_norm": 4.9336585371101345, "kl": 0.060546875, "learning_rate": 9.984415109865406e-07, "loss": 0.0024, "reward": 1.8045625686645508, "reward_std": 0.006319353356957436, "rewards/accuracy_reward": 0.6545625329017639, "rewards/format_reward": 1.0, "step": 2490 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.75, "epoch": 0.02514891468955073, "grad_norm": 1.5913096331908798, "kl": 0.0771484375, "learning_rate": 9.984402595881347e-07, "loss": 0.0031, "reward": 2.1521875858306885, "reward_std": 0.008414295502007008, "rewards/accuracy_reward": 0.9521874785423279, "rewards/format_reward": 1.0, "step": 2491 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.15625, "epoch": 0.025159010600706713, "grad_norm": 2.0253452439877337, "kl": 0.0634765625, "learning_rate": 9.984390076883064e-07, "loss": 0.0026, "reward": 1.844437599182129, "reward_std": 0.028846852481365204, "rewards/accuracy_reward": 0.7006875276565552, "rewards/format_reward": 1.0, "step": 2492 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.59375, "epoch": 0.025169106511862695, "grad_norm": 1.8558722187248038, "kl": 0.06640625, "learning_rate": 9.984377552870568e-07, "loss": 0.0027, "reward": 2.1042189598083496, "reward_std": 0.1162465438246727, "rewards/accuracy_reward": 0.9104687571525574, "rewards/format_reward": 1.0, "step": 2493 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.03125, "epoch": 0.025179202423018677, "grad_norm": 2.062402726817567, "kl": 0.07958984375, "learning_rate": 9.984365023843873e-07, "loss": 0.0032, "reward": 2.109781265258789, "reward_std": 0.012056143023073673, "rewards/accuracy_reward": 0.9097812175750732, "rewards/format_reward": 1.0, "step": 2494 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 393.3125, "epoch": 0.02518929833417466, "grad_norm": 1.4330275167748387, "kl": 0.0615234375, "learning_rate": 9.984352489802988e-07, "loss": 0.0025, "reward": 1.8847811222076416, "reward_std": 0.01226932741701603, "rewards/accuracy_reward": 0.7347812652587891, "rewards/format_reward": 1.0, "step": 2495 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.02519939424533064, "grad_norm": 1.8200401422583206, "kl": 0.07080078125, "learning_rate": 9.984339950747928e-07, "loss": 0.0028, "reward": 2.0616250038146973, "reward_std": 0.011455351486802101, "rewards/accuracy_reward": 0.8616249561309814, "rewards/format_reward": 1.0, "step": 2496 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.025209490156486623, "grad_norm": 2.6478777210145017, "kl": 0.07568359375, "learning_rate": 9.984327406678705e-07, "loss": 0.003, "reward": 1.9301249980926514, "reward_std": 0.010455639101564884, "rewards/accuracy_reward": 0.7301250100135803, "rewards/format_reward": 1.0, "step": 2497 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.84375, "epoch": 0.025219586067642605, "grad_norm": 3.5080227338101144, "kl": 0.07421875, "learning_rate": 9.98431485759533e-07, "loss": 0.003, "reward": 2.1667187213897705, "reward_std": 0.009972352534532547, "rewards/accuracy_reward": 0.9667187333106995, "rewards/format_reward": 1.0, "step": 2498 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.59375, "epoch": 0.025229681978798588, "grad_norm": 3.5600145020846976, "kl": 0.07080078125, "learning_rate": 9.98430230349782e-07, "loss": 0.0028, "reward": 1.895218849182129, "reward_std": 0.1594342142343521, "rewards/accuracy_reward": 0.7327187061309814, "rewards/format_reward": 1.0, "step": 2499 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.28125, "epoch": 0.02523977788995457, "grad_norm": 2.784524810605146, "kl": 0.06884765625, "learning_rate": 9.984289744386183e-07, "loss": 0.0028, "reward": 2.1494061946868896, "reward_std": 0.007309379521757364, "rewards/accuracy_reward": 0.9494062662124634, "rewards/format_reward": 1.0, "step": 2500 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.96875, "epoch": 0.02524987380111055, "grad_norm": 2.856087466338432, "kl": 0.07373046875, "learning_rate": 9.984277180260435e-07, "loss": 0.0029, "reward": 1.87709379196167, "reward_std": 0.09933438152074814, "rewards/accuracy_reward": 0.7270937561988831, "rewards/format_reward": 1.0, "step": 2501 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.78125, "epoch": 0.02525996971226653, "grad_norm": 2.1469666009736663, "kl": 0.06103515625, "learning_rate": 9.984264611120586e-07, "loss": 0.0024, "reward": 1.850218653678894, "reward_std": 0.021737046539783478, "rewards/accuracy_reward": 0.7002187371253967, "rewards/format_reward": 1.0, "step": 2502 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 411.125, "epoch": 0.025270065623422513, "grad_norm": 2.185974994046535, "kl": 0.07080078125, "learning_rate": 9.984252036966652e-07, "loss": 0.0028, "reward": 1.4390000104904175, "reward_std": 0.020545557141304016, "rewards/accuracy_reward": 0.3452499806880951, "rewards/format_reward": 1.0, "step": 2503 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.5, "epoch": 0.025280161534578495, "grad_norm": 1.8549137651090282, "kl": 0.0791015625, "learning_rate": 9.98423945779864e-07, "loss": 0.0032, "reward": 2.094874858856201, "reward_std": 0.012657815590500832, "rewards/accuracy_reward": 0.8948750495910645, "rewards/format_reward": 1.0, "step": 2504 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 414.59375, "epoch": 0.025290257445734477, "grad_norm": 4.122562437812167, "kl": 0.07177734375, "learning_rate": 9.984226873616569e-07, "loss": 0.0029, "reward": 1.8615000247955322, "reward_std": 0.008168615400791168, "rewards/accuracy_reward": 0.7114999890327454, "rewards/format_reward": 1.0, "step": 2505 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 418.59375, "epoch": 0.02530035335689046, "grad_norm": 2.206835174036078, "kl": 0.048095703125, "learning_rate": 9.98421428442045e-07, "loss": 0.0019, "reward": 1.670281171798706, "reward_std": 0.16042889654636383, "rewards/accuracy_reward": 0.5515313148498535, "rewards/format_reward": 1.0, "step": 2506 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.4375, "epoch": 0.02531044926804644, "grad_norm": 3.4361029257186213, "kl": 0.07177734375, "learning_rate": 9.984201690210291e-07, "loss": 0.0029, "reward": 1.815406322479248, "reward_std": 0.01191718876361847, "rewards/accuracy_reward": 0.6654062271118164, "rewards/format_reward": 1.0, "step": 2507 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.46875, "epoch": 0.025320545179202424, "grad_norm": 3.339898286067771, "kl": 0.0830078125, "learning_rate": 9.98418909098611e-07, "loss": 0.0033, "reward": 2.1337814331054688, "reward_std": 0.016749169677495956, "rewards/accuracy_reward": 0.9337813258171082, "rewards/format_reward": 1.0, "step": 2508 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.025330641090358406, "grad_norm": 3.5531931909042234, "kl": 0.05908203125, "learning_rate": 9.984176486747917e-07, "loss": 0.0024, "reward": 2.170187473297119, "reward_std": 0.0315595418214798, "rewards/accuracy_reward": 0.9764375686645508, "rewards/format_reward": 1.0, "step": 2509 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.71875, "epoch": 0.025340737001514388, "grad_norm": 4.13021482391945, "kl": 0.0732421875, "learning_rate": 9.984163877495726e-07, "loss": 0.0029, "reward": 1.807937502861023, "reward_std": 0.013282015919685364, "rewards/accuracy_reward": 0.6579374670982361, "rewards/format_reward": 1.0, "step": 2510 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.78125, "epoch": 0.02535083291267037, "grad_norm": 1.8193841841931369, "kl": 0.064453125, "learning_rate": 9.984151263229549e-07, "loss": 0.0026, "reward": 2.1480000019073486, "reward_std": 0.007115539629012346, "rewards/accuracy_reward": 0.9480000138282776, "rewards/format_reward": 1.0, "step": 2511 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.15625, "epoch": 0.02536092882382635, "grad_norm": 6.007096639202397, "kl": 0.07861328125, "learning_rate": 9.9841386439494e-07, "loss": 0.0032, "reward": 2.1239376068115234, "reward_std": 0.013791965320706367, "rewards/accuracy_reward": 0.9239374399185181, "rewards/format_reward": 1.0, "step": 2512 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.15625, "epoch": 0.02537102473498233, "grad_norm": 3.2143479307281146, "kl": 0.0830078125, "learning_rate": 9.98412601965529e-07, "loss": 0.0033, "reward": 2.095031261444092, "reward_std": 0.014593683183193207, "rewards/accuracy_reward": 0.8950312733650208, "rewards/format_reward": 1.0, "step": 2513 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.15625, "epoch": 0.025381120646138313, "grad_norm": 0.8815647054098685, "kl": 0.060791015625, "learning_rate": 9.98411339034723e-07, "loss": 0.0024, "reward": 2.075000047683716, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.8812500238418579, "rewards/format_reward": 1.0, "step": 2514 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.71875, "epoch": 0.025391216557294295, "grad_norm": 2.39169287344188, "kl": 0.07666015625, "learning_rate": 9.984100756025238e-07, "loss": 0.0031, "reward": 1.8745312690734863, "reward_std": 0.10265316069126129, "rewards/accuracy_reward": 0.7245312333106995, "rewards/format_reward": 1.0, "step": 2515 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.875, "epoch": 0.025401312468450277, "grad_norm": 2.5908575815624513, "kl": 0.0654296875, "learning_rate": 9.984088116689323e-07, "loss": 0.0026, "reward": 1.8748438358306885, "reward_std": 0.005508926697075367, "rewards/accuracy_reward": 0.7248437404632568, "rewards/format_reward": 1.0, "step": 2516 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.0, "epoch": 0.02541140837960626, "grad_norm": 1.8684219714991694, "kl": 0.07958984375, "learning_rate": 9.984075472339499e-07, "loss": 0.0032, "reward": 2.17900013923645, "reward_std": 0.02696920558810234, "rewards/accuracy_reward": 0.9852499961853027, "rewards/format_reward": 1.0, "step": 2517 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.5625, "epoch": 0.02542150429076224, "grad_norm": 3.4957914791021345, "kl": 0.07958984375, "learning_rate": 9.984062822975777e-07, "loss": 0.0032, "reward": 2.119968891143799, "reward_std": 0.018107611685991287, "rewards/accuracy_reward": 0.9199687242507935, "rewards/format_reward": 1.0, "step": 2518 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.75, "epoch": 0.025431600201918224, "grad_norm": 2.718098214845122, "kl": 0.0615234375, "learning_rate": 9.98405016859817e-07, "loss": 0.0025, "reward": 1.917375087738037, "reward_std": 0.1628478467464447, "rewards/accuracy_reward": 0.7486249804496765, "rewards/format_reward": 1.0, "step": 2519 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.03125, "epoch": 0.025441696113074206, "grad_norm": 1.0271281035536721, "kl": 0.0546875, "learning_rate": 9.984037509206692e-07, "loss": 0.0022, "reward": 1.8815937042236328, "reward_std": 0.0021168356761336327, "rewards/accuracy_reward": 0.7315937280654907, "rewards/format_reward": 1.0, "step": 2520 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.025451792024230188, "grad_norm": 4.18877742546915, "kl": 0.0673828125, "learning_rate": 9.984024844801357e-07, "loss": 0.0027, "reward": 1.9721250534057617, "reward_std": 0.16799503564834595, "rewards/accuracy_reward": 0.8033750057220459, "rewards/format_reward": 1.0, "step": 2521 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 386.25, "epoch": 0.02546188793538617, "grad_norm": 1.7962670073016918, "kl": 0.06591796875, "learning_rate": 9.984012175382176e-07, "loss": 0.0026, "reward": 1.8589999675750732, "reward_std": 0.007951208390295506, "rewards/accuracy_reward": 0.7090000510215759, "rewards/format_reward": 1.0, "step": 2522 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.0625, "epoch": 0.02547198384654215, "grad_norm": 1.8664694409278157, "kl": 0.0712890625, "learning_rate": 9.98399950094916e-07, "loss": 0.0028, "reward": 1.8062188625335693, "reward_std": 0.010420400649309158, "rewards/accuracy_reward": 0.6562187671661377, "rewards/format_reward": 1.0, "step": 2523 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.4375, "epoch": 0.02548207975769813, "grad_norm": 2.7861897336518586, "kl": 0.062255859375, "learning_rate": 9.983986821502327e-07, "loss": 0.0025, "reward": 2.154937744140625, "reward_std": 0.010990886017680168, "rewards/accuracy_reward": 0.9549375176429749, "rewards/format_reward": 1.0, "step": 2524 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.375, "epoch": 0.025492175668854113, "grad_norm": 3.201790831544837, "kl": 0.0791015625, "learning_rate": 9.983974137041682e-07, "loss": 0.0032, "reward": 2.121875047683716, "reward_std": 0.055538348853588104, "rewards/accuracy_reward": 0.9468750357627869, "rewards/format_reward": 1.0, "step": 2525 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.53125, "epoch": 0.025502271580010095, "grad_norm": 2.0972903725305083, "kl": 0.048583984375, "learning_rate": 9.983961447567244e-07, "loss": 0.0019, "reward": 1.8893437385559082, "reward_std": 0.0074461535550653934, "rewards/accuracy_reward": 0.7393437623977661, "rewards/format_reward": 1.0, "step": 2526 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.025512367491166078, "grad_norm": 1.969700825048235, "kl": 0.0654296875, "learning_rate": 9.983948753079024e-07, "loss": 0.0026, "reward": 1.7857500314712524, "reward_std": 0.14625297486782074, "rewards/accuracy_reward": 0.6232500076293945, "rewards/format_reward": 1.0, "step": 2527 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.02552246340232206, "grad_norm": 2.4873753031221986, "kl": 0.0654296875, "learning_rate": 9.983936053577035e-07, "loss": 0.0026, "reward": 2.0890626907348633, "reward_std": 0.015581520274281502, "rewards/accuracy_reward": 0.8890625238418579, "rewards/format_reward": 1.0, "step": 2528 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.46875, "epoch": 0.025532559313478042, "grad_norm": 3.655067765422229, "kl": 0.060546875, "learning_rate": 9.98392334906129e-07, "loss": 0.0024, "reward": 2.1746249198913574, "reward_std": 0.02041420340538025, "rewards/accuracy_reward": 0.9808750152587891, "rewards/format_reward": 1.0, "step": 2529 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.8125, "epoch": 0.025542655224634024, "grad_norm": 3.2402909643223685, "kl": 0.07275390625, "learning_rate": 9.983910639531799e-07, "loss": 0.0029, "reward": 2.1178436279296875, "reward_std": 0.012211569584906101, "rewards/accuracy_reward": 0.917843759059906, "rewards/format_reward": 1.0, "step": 2530 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 387.6875, "epoch": 0.025552751135790006, "grad_norm": 3.37788693109781, "kl": 0.068359375, "learning_rate": 9.983897924988578e-07, "loss": 0.0027, "reward": 1.7717812061309814, "reward_std": 0.0173207875341177, "rewards/accuracy_reward": 0.6217812299728394, "rewards/format_reward": 1.0, "step": 2531 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 381.0625, "epoch": 0.02556284704694599, "grad_norm": 4.503881212431049, "kl": 0.06005859375, "learning_rate": 9.983885205431638e-07, "loss": 0.0024, "reward": 1.8787500858306885, "reward_std": 0.026939302682876587, "rewards/accuracy_reward": 0.7350000143051147, "rewards/format_reward": 1.0, "step": 2532 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.9375, "epoch": 0.025572942958101967, "grad_norm": 3.2810073568590314, "kl": 0.078125, "learning_rate": 9.983872480860994e-07, "loss": 0.0031, "reward": 2.0427498817443848, "reward_std": 0.03117670677602291, "rewards/accuracy_reward": 0.8490000367164612, "rewards/format_reward": 1.0, "step": 2533 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.125, "epoch": 0.02558303886925795, "grad_norm": 1.9994496553499306, "kl": 0.06640625, "learning_rate": 9.983859751276658e-07, "loss": 0.0027, "reward": 1.9778125286102295, "reward_std": 0.16978952288627625, "rewards/accuracy_reward": 0.8028125166893005, "rewards/format_reward": 1.0, "step": 2534 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 395.59375, "epoch": 0.02559313478041393, "grad_norm": 2.115829244047261, "kl": 0.06787109375, "learning_rate": 9.98384701667864e-07, "loss": 0.0027, "reward": 1.8847813606262207, "reward_std": 0.01006772369146347, "rewards/accuracy_reward": 0.7347812652587891, "rewards/format_reward": 1.0, "step": 2535 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.1875, "epoch": 0.025603230691569914, "grad_norm": 3.07442480981166, "kl": 0.0732421875, "learning_rate": 9.983834277066954e-07, "loss": 0.0029, "reward": 2.150437355041504, "reward_std": 0.01852233335375786, "rewards/accuracy_reward": 0.9504374265670776, "rewards/format_reward": 1.0, "step": 2536 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.5625, "epoch": 0.025613326602725896, "grad_norm": 4.1571652797518635, "kl": 0.07177734375, "learning_rate": 9.983821532441617e-07, "loss": 0.0029, "reward": 1.8972814083099365, "reward_std": 0.15859472751617432, "rewards/accuracy_reward": 0.7347812056541443, "rewards/format_reward": 1.0, "step": 2537 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.21875, "epoch": 0.025623422513881878, "grad_norm": 3.9988974068456873, "kl": 0.0908203125, "learning_rate": 9.983808782802636e-07, "loss": 0.0036, "reward": 2.0789685249328613, "reward_std": 0.02461520954966545, "rewards/accuracy_reward": 0.8852187395095825, "rewards/format_reward": 1.0, "step": 2538 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.96875, "epoch": 0.02563351842503786, "grad_norm": 2.702230268099079, "kl": 0.078125, "learning_rate": 9.983796028150028e-07, "loss": 0.0031, "reward": 2.13100004196167, "reward_std": 0.040956445038318634, "rewards/accuracy_reward": 0.9435000419616699, "rewards/format_reward": 1.0, "step": 2539 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.90625, "epoch": 0.025643614336193842, "grad_norm": 2.1660655304368195, "kl": 0.072265625, "learning_rate": 9.983783268483804e-07, "loss": 0.0029, "reward": 2.0631561279296875, "reward_std": 0.01149714458733797, "rewards/accuracy_reward": 0.8631561994552612, "rewards/format_reward": 1.0, "step": 2540 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.53125, "epoch": 0.025653710247349824, "grad_norm": 2.2478402327926847, "kl": 0.08349609375, "learning_rate": 9.983770503803976e-07, "loss": 0.0034, "reward": 2.1457815170288086, "reward_std": 0.027521274983882904, "rewards/accuracy_reward": 0.9520312547683716, "rewards/format_reward": 1.0, "step": 2541 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 399.53125, "epoch": 0.025663806158505807, "grad_norm": 2.533990723786623, "kl": 0.068359375, "learning_rate": 9.98375773411056e-07, "loss": 0.0027, "reward": 1.79812490940094, "reward_std": 0.019827863201498985, "rewards/accuracy_reward": 0.6481250524520874, "rewards/format_reward": 1.0, "step": 2542 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.5625, "epoch": 0.02567390206966179, "grad_norm": 3.1252494541969846, "kl": 0.076171875, "learning_rate": 9.983744959403565e-07, "loss": 0.0031, "reward": 2.1318750381469727, "reward_std": 0.010839754715561867, "rewards/accuracy_reward": 0.9318749904632568, "rewards/format_reward": 1.0, "step": 2543 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.53125, "epoch": 0.025683997980817767, "grad_norm": 2.0814634239951046, "kl": 0.0634765625, "learning_rate": 9.983732179683007e-07, "loss": 0.0025, "reward": 1.9298126697540283, "reward_std": 0.1480197310447693, "rewards/accuracy_reward": 0.7673124670982361, "rewards/format_reward": 1.0, "step": 2544 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.02569409389197375, "grad_norm": 2.959945169460058, "kl": 0.06005859375, "learning_rate": 9.983719394948896e-07, "loss": 0.0024, "reward": 2.082937717437744, "reward_std": 0.014087652787566185, "rewards/accuracy_reward": 0.8829375505447388, "rewards/format_reward": 1.0, "step": 2545 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.21875, "epoch": 0.02570418980312973, "grad_norm": 2.8982578493972815, "kl": 0.0712890625, "learning_rate": 9.983706605201247e-07, "loss": 0.0029, "reward": 2.151062488555908, "reward_std": 0.03386708348989487, "rewards/accuracy_reward": 0.9573125243186951, "rewards/format_reward": 1.0, "step": 2546 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.0625, "epoch": 0.025714285714285714, "grad_norm": 4.812734451990627, "kl": 0.07373046875, "learning_rate": 9.983693810440072e-07, "loss": 0.0029, "reward": 2.0629687309265137, "reward_std": 0.10236360132694244, "rewards/accuracy_reward": 0.8942188024520874, "rewards/format_reward": 0.96875, "step": 2547 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.0, "epoch": 0.025724381625441696, "grad_norm": 3.2693122462775417, "kl": 0.08935546875, "learning_rate": 9.983681010665385e-07, "loss": 0.0036, "reward": 2.0620312690734863, "reward_std": 0.013278924860060215, "rewards/accuracy_reward": 0.8620311617851257, "rewards/format_reward": 1.0, "step": 2548 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.59375, "epoch": 0.025734477536597678, "grad_norm": 1.5534213853351375, "kl": 0.06640625, "learning_rate": 9.9836682058772e-07, "loss": 0.0027, "reward": 2.1100311279296875, "reward_std": 0.009150708094239235, "rewards/accuracy_reward": 0.910031259059906, "rewards/format_reward": 1.0, "step": 2549 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.6875, "epoch": 0.02574457344775366, "grad_norm": 2.9061618366712616, "kl": 0.083984375, "learning_rate": 9.983655396075523e-07, "loss": 0.0034, "reward": 2.0801875591278076, "reward_std": 0.04018842801451683, "rewards/accuracy_reward": 0.8926874399185181, "rewards/format_reward": 1.0, "step": 2550 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.78125, "epoch": 0.025754669358909642, "grad_norm": 6.157505191520766, "kl": 0.07861328125, "learning_rate": 9.983642581260374e-07, "loss": 0.0031, "reward": 2.181062698364258, "reward_std": 0.008288410492241383, "rewards/accuracy_reward": 0.9810625314712524, "rewards/format_reward": 1.0, "step": 2551 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.40625, "epoch": 0.025764765270065625, "grad_norm": 0.10176468590989637, "kl": 0.0537109375, "learning_rate": 9.983629761431763e-07, "loss": 0.0022, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2552 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.75, "epoch": 0.025774861181221607, "grad_norm": 2.6454683987182093, "kl": 0.0859375, "learning_rate": 9.983616936589706e-07, "loss": 0.0034, "reward": 2.1151251792907715, "reward_std": 0.01737046241760254, "rewards/accuracy_reward": 0.9151250123977661, "rewards/format_reward": 1.0, "step": 2553 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.6875, "epoch": 0.025784957092377585, "grad_norm": 3.0198213950595574, "kl": 0.08837890625, "learning_rate": 9.983604106734208e-07, "loss": 0.0035, "reward": 2.064218759536743, "reward_std": 0.10040829330682755, "rewards/accuracy_reward": 0.8954687118530273, "rewards/format_reward": 0.96875, "step": 2554 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.15625, "epoch": 0.025795053003533568, "grad_norm": 3.9419760951304603, "kl": 0.0830078125, "learning_rate": 9.983591271865293e-07, "loss": 0.0033, "reward": 1.9798437356948853, "reward_std": 0.09362438321113586, "rewards/accuracy_reward": 0.811093807220459, "rewards/format_reward": 0.96875, "step": 2555 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.5, "epoch": 0.02580514891468955, "grad_norm": 2.5744736047344965, "kl": 0.07958984375, "learning_rate": 9.983578431982967e-07, "loss": 0.0032, "reward": 2.0930938720703125, "reward_std": 0.01860295608639717, "rewards/accuracy_reward": 0.8930937647819519, "rewards/format_reward": 1.0, "step": 2556 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.90625, "epoch": 0.025815244825845532, "grad_norm": 2.7218524185755872, "kl": 0.0830078125, "learning_rate": 9.983565587087241e-07, "loss": 0.0033, "reward": 2.010906219482422, "reward_std": 0.018679045140743256, "rewards/accuracy_reward": 0.8109062910079956, "rewards/format_reward": 1.0, "step": 2557 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 429.84375, "epoch": 0.025825340737001514, "grad_norm": 3.419244860380065, "kl": 0.06884765625, "learning_rate": 9.983552737178133e-07, "loss": 0.0027, "reward": 1.8734686374664307, "reward_std": 0.0077234357595443726, "rewards/accuracy_reward": 0.7234687209129333, "rewards/format_reward": 1.0, "step": 2558 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 422.5, "epoch": 0.025835436648157496, "grad_norm": 1.4837598182150684, "kl": 0.06201171875, "learning_rate": 9.983539882255655e-07, "loss": 0.0025, "reward": 1.4728749990463257, "reward_std": 0.16094209253787994, "rewards/accuracy_reward": 0.39162498712539673, "rewards/format_reward": 1.0, "step": 2559 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.3125, "epoch": 0.02584553255931348, "grad_norm": 2.779786520061497, "kl": 0.0771484375, "learning_rate": 9.98352702231982e-07, "loss": 0.0031, "reward": 2.119687557220459, "reward_std": 0.02083572745323181, "rewards/accuracy_reward": 0.9196875095367432, "rewards/format_reward": 1.0, "step": 2560 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 434.15625, "epoch": 0.02585562847046946, "grad_norm": 7.803520826242877, "kl": 0.08203125, "learning_rate": 9.983514157370638e-07, "loss": 0.0033, "reward": 2.1358749866485596, "reward_std": 0.01722002774477005, "rewards/accuracy_reward": 0.9358749389648438, "rewards/format_reward": 1.0, "step": 2561 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 445.28125, "epoch": 0.025865724381625443, "grad_norm": 2.30297947534852, "kl": 0.08544921875, "learning_rate": 9.983501287408122e-07, "loss": 0.0034, "reward": 1.7751561403274536, "reward_std": 0.014646792784333229, "rewards/accuracy_reward": 0.6251562833786011, "rewards/format_reward": 1.0, "step": 2562 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.0, "epoch": 0.025875820292781425, "grad_norm": 4.158182491314714, "kl": 0.078125, "learning_rate": 9.98348841243229e-07, "loss": 0.0031, "reward": 2.0758752822875977, "reward_std": 0.02763022482395172, "rewards/accuracy_reward": 0.8821250200271606, "rewards/format_reward": 1.0, "step": 2563 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 433.09375, "epoch": 0.025885916203937407, "grad_norm": 2.660391624021197, "kl": 0.0927734375, "learning_rate": 9.98347553244315e-07, "loss": 0.0037, "reward": 1.755218744277954, "reward_std": 0.013412482105195522, "rewards/accuracy_reward": 0.605218768119812, "rewards/format_reward": 1.0, "step": 2564 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 432.0625, "epoch": 0.025896012115093386, "grad_norm": 2.3055785477610926, "kl": 0.0830078125, "learning_rate": 9.983462647440716e-07, "loss": 0.0033, "reward": 1.8371875286102295, "reward_std": 0.10331671684980392, "rewards/accuracy_reward": 0.7184374928474426, "rewards/format_reward": 0.96875, "step": 2565 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.9375, "epoch": 0.025906108026249368, "grad_norm": 3.1632321987731387, "kl": 0.087890625, "learning_rate": 9.983449757425004e-07, "loss": 0.0035, "reward": 2.1296563148498535, "reward_std": 0.039418432861566544, "rewards/accuracy_reward": 0.9359062910079956, "rewards/format_reward": 1.0, "step": 2566 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.15625, "epoch": 0.02591620393740535, "grad_norm": 2.133570869367182, "kl": 0.083984375, "learning_rate": 9.983436862396024e-07, "loss": 0.0034, "reward": 1.8686250448226929, "reward_std": 0.013239181600511074, "rewards/accuracy_reward": 0.7186249494552612, "rewards/format_reward": 1.0, "step": 2567 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.375, "epoch": 0.025926299848561332, "grad_norm": 2.826055523998162, "kl": 0.078125, "learning_rate": 9.98342396235379e-07, "loss": 0.0031, "reward": 2.008812427520752, "reward_std": 0.01610919088125229, "rewards/accuracy_reward": 0.8088124990463257, "rewards/format_reward": 1.0, "step": 2568 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.9375, "epoch": 0.025936395759717314, "grad_norm": 3.234302240540234, "kl": 0.08154296875, "learning_rate": 9.983411057298313e-07, "loss": 0.0033, "reward": 2.0687813758850098, "reward_std": 0.02294952981173992, "rewards/accuracy_reward": 0.8687812089920044, "rewards/format_reward": 1.0, "step": 2569 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.21875, "epoch": 0.025946491670873296, "grad_norm": 2.8911814208380697, "kl": 0.0810546875, "learning_rate": 9.983398147229607e-07, "loss": 0.0032, "reward": 2.11356258392334, "reward_std": 0.014849908649921417, "rewards/accuracy_reward": 0.9135624170303345, "rewards/format_reward": 1.0, "step": 2570 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 428.84375, "epoch": 0.02595658758202928, "grad_norm": 3.6016535848854128, "kl": 0.0712890625, "learning_rate": 9.983385232147687e-07, "loss": 0.0028, "reward": 1.8822813034057617, "reward_std": 0.006378378719091415, "rewards/accuracy_reward": 0.7322812080383301, "rewards/format_reward": 1.0, "step": 2571 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.6875, "epoch": 0.02596668349318526, "grad_norm": 2.850448591531716, "kl": 0.07421875, "learning_rate": 9.983372312052563e-07, "loss": 0.003, "reward": 2.109687328338623, "reward_std": 0.013005789369344711, "rewards/accuracy_reward": 0.9096875190734863, "rewards/format_reward": 1.0, "step": 2572 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.5625, "epoch": 0.025976779404341243, "grad_norm": 7.728111573917399, "kl": 0.08349609375, "learning_rate": 9.98335938694425e-07, "loss": 0.0033, "reward": 2.043781280517578, "reward_std": 0.033531688153743744, "rewards/accuracy_reward": 0.8500312566757202, "rewards/format_reward": 1.0, "step": 2573 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 431.0625, "epoch": 0.025986875315497225, "grad_norm": 1.8690477725009977, "kl": 0.07421875, "learning_rate": 9.983346456822762e-07, "loss": 0.003, "reward": 1.8340312242507935, "reward_std": 0.018242530524730682, "rewards/accuracy_reward": 0.6840312480926514, "rewards/format_reward": 1.0, "step": 2574 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.78125, "epoch": 0.025996971226653204, "grad_norm": 2.7138369822484156, "kl": 0.07958984375, "learning_rate": 9.98333352168811e-07, "loss": 0.0032, "reward": 2.1320314407348633, "reward_std": 0.019235841929912567, "rewards/accuracy_reward": 0.9320312738418579, "rewards/format_reward": 1.0, "step": 2575 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.25, "epoch": 0.026007067137809186, "grad_norm": 3.823852201328248, "kl": 0.0869140625, "learning_rate": 9.983320581540307e-07, "loss": 0.0035, "reward": 2.0469374656677246, "reward_std": 0.03632677346467972, "rewards/accuracy_reward": 0.8531875014305115, "rewards/format_reward": 1.0, "step": 2576 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.5, "epoch": 0.026017163048965168, "grad_norm": 2.661192866490424, "kl": 0.0830078125, "learning_rate": 9.983307636379367e-07, "loss": 0.0033, "reward": 2.1203126907348633, "reward_std": 0.02945638634264469, "rewards/accuracy_reward": 0.9265625476837158, "rewards/format_reward": 1.0, "step": 2577 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 436.4375, "epoch": 0.02602725896012115, "grad_norm": 3.0166348528157125, "kl": 0.07080078125, "learning_rate": 9.983294686205302e-07, "loss": 0.0028, "reward": 1.7586874961853027, "reward_std": 0.019876571372151375, "rewards/accuracy_reward": 0.6086875200271606, "rewards/format_reward": 1.0, "step": 2578 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 424.46875, "epoch": 0.026037354871277132, "grad_norm": 2.627262542917659, "kl": 0.07373046875, "learning_rate": 9.983281731018126e-07, "loss": 0.0029, "reward": 1.8756248950958252, "reward_std": 0.00658432999625802, "rewards/accuracy_reward": 0.7256249785423279, "rewards/format_reward": 1.0, "step": 2579 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.65625, "epoch": 0.026047450782433115, "grad_norm": 1.8136856158954011, "kl": 0.05859375, "learning_rate": 9.98326877081785e-07, "loss": 0.0023, "reward": 2.154562473297119, "reward_std": 0.04018190875649452, "rewards/accuracy_reward": 0.9670624732971191, "rewards/format_reward": 1.0, "step": 2580 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.65625, "epoch": 0.026057546693589097, "grad_norm": 1.400311234505205, "kl": 0.0654296875, "learning_rate": 9.983255805604492e-07, "loss": 0.0026, "reward": 2.162062644958496, "reward_std": 0.005177877377718687, "rewards/accuracy_reward": 0.9620624780654907, "rewards/format_reward": 1.0, "step": 2581 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.375, "epoch": 0.02606764260474508, "grad_norm": 3.0878800297163034, "kl": 0.076171875, "learning_rate": 9.98324283537806e-07, "loss": 0.003, "reward": 2.1352500915527344, "reward_std": 0.011136200278997421, "rewards/accuracy_reward": 0.9352500438690186, "rewards/format_reward": 1.0, "step": 2582 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.6875, "epoch": 0.02607773851590106, "grad_norm": 2.1785636682295735, "kl": 0.0595703125, "learning_rate": 9.983229860138568e-07, "loss": 0.0024, "reward": 2.132499933242798, "reward_std": 0.12093903869390488, "rewards/accuracy_reward": 0.9450000524520874, "rewards/format_reward": 1.0, "step": 2583 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.03125, "epoch": 0.026087834427057043, "grad_norm": 4.084177461517524, "kl": 0.07470703125, "learning_rate": 9.98321687988603e-07, "loss": 0.003, "reward": 1.8250312805175781, "reward_std": 0.10857272148132324, "rewards/accuracy_reward": 0.6750312447547913, "rewards/format_reward": 1.0, "step": 2584 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.59375, "epoch": 0.026097930338213025, "grad_norm": 1.563422257975254, "kl": 0.072265625, "learning_rate": 9.98320389462046e-07, "loss": 0.0029, "reward": 2.123781204223633, "reward_std": 0.007406529039144516, "rewards/accuracy_reward": 0.9237812161445618, "rewards/format_reward": 1.0, "step": 2585 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.5625, "epoch": 0.026108026249369004, "grad_norm": 1.7850563398341406, "kl": 0.0693359375, "learning_rate": 9.983190904341868e-07, "loss": 0.0028, "reward": 2.1514062881469727, "reward_std": 0.00814976915717125, "rewards/accuracy_reward": 0.9514062404632568, "rewards/format_reward": 1.0, "step": 2586 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.03125, "epoch": 0.026118122160524986, "grad_norm": 2.600661439363874, "kl": 0.05712890625, "learning_rate": 9.983177909050272e-07, "loss": 0.0023, "reward": 1.9634063243865967, "reward_std": 0.1589168906211853, "rewards/accuracy_reward": 0.7946562170982361, "rewards/format_reward": 1.0, "step": 2587 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 408.5, "epoch": 0.02612821807168097, "grad_norm": 2.1290491159304206, "kl": 0.056884765625, "learning_rate": 9.98316490874568e-07, "loss": 0.0023, "reward": 1.3729686737060547, "reward_std": 0.14055506885051727, "rewards/accuracy_reward": 0.31046876311302185, "rewards/format_reward": 1.0, "step": 2588 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.90625, "epoch": 0.02613831398283695, "grad_norm": 2.399284458696052, "kl": 0.07568359375, "learning_rate": 9.983151903428106e-07, "loss": 0.003, "reward": 2.134187698364258, "reward_std": 0.021133990958333015, "rewards/accuracy_reward": 0.9341875314712524, "rewards/format_reward": 1.0, "step": 2589 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.46875, "epoch": 0.026148409893992933, "grad_norm": 1.9391956169836653, "kl": 0.064453125, "learning_rate": 9.983138893097567e-07, "loss": 0.0026, "reward": 1.9081251621246338, "reward_std": 0.16229557991027832, "rewards/accuracy_reward": 0.7393750548362732, "rewards/format_reward": 1.0, "step": 2590 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.8125, "epoch": 0.026158505805148915, "grad_norm": 2.411637652346899, "kl": 0.08642578125, "learning_rate": 9.98312587775407e-07, "loss": 0.0034, "reward": 2.1467812061309814, "reward_std": 0.015337648801505566, "rewards/accuracy_reward": 0.9467812180519104, "rewards/format_reward": 1.0, "step": 2591 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.8125, "epoch": 0.026168601716304897, "grad_norm": 3.3387135006695483, "kl": 0.06689453125, "learning_rate": 9.983112857397636e-07, "loss": 0.0027, "reward": 2.1655311584472656, "reward_std": 0.00766886118799448, "rewards/accuracy_reward": 0.9655312299728394, "rewards/format_reward": 1.0, "step": 2592 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 426.53125, "epoch": 0.02617869762746088, "grad_norm": 1.7358435710533227, "kl": 0.05517578125, "learning_rate": 9.98309983202827e-07, "loss": 0.0022, "reward": 1.6051561832427979, "reward_std": 0.11665721982717514, "rewards/accuracy_reward": 0.5176562666893005, "rewards/format_reward": 1.0, "step": 2593 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 396.90625, "epoch": 0.02618879353861686, "grad_norm": 2.189950624327208, "kl": 0.068359375, "learning_rate": 9.983086801645988e-07, "loss": 0.0027, "reward": 1.7079999446868896, "reward_std": 0.014634953811764717, "rewards/accuracy_reward": 0.5579999685287476, "rewards/format_reward": 1.0, "step": 2594 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.375, "epoch": 0.026198889449772843, "grad_norm": 1.3410159524396488, "kl": 0.056884765625, "learning_rate": 9.983073766250808e-07, "loss": 0.0023, "reward": 1.8734687566757202, "reward_std": 0.011986909434199333, "rewards/accuracy_reward": 0.7234687805175781, "rewards/format_reward": 1.0, "step": 2595 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.1875, "epoch": 0.026208985360928822, "grad_norm": 3.252619825446878, "kl": 0.07958984375, "learning_rate": 9.983060725842734e-07, "loss": 0.0032, "reward": 2.169656276702881, "reward_std": 0.00867268443107605, "rewards/accuracy_reward": 0.9696562886238098, "rewards/format_reward": 1.0, "step": 2596 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.71875, "epoch": 0.026219081272084804, "grad_norm": 9.55480003035635, "kl": 0.06201171875, "learning_rate": 9.983047680421786e-07, "loss": 0.0025, "reward": 2.1324377059936523, "reward_std": 0.1320730745792389, "rewards/accuracy_reward": 0.9449374675750732, "rewards/format_reward": 1.0, "step": 2597 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 393.09375, "epoch": 0.026229177183240786, "grad_norm": 2.028536962691514, "kl": 0.06591796875, "learning_rate": 9.983034629987976e-07, "loss": 0.0026, "reward": 1.858374834060669, "reward_std": 0.026641901582479477, "rewards/accuracy_reward": 0.7146250009536743, "rewards/format_reward": 1.0, "step": 2598 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.46875, "epoch": 0.02623927309439677, "grad_norm": 2.166054157058365, "kl": 0.078125, "learning_rate": 9.983021574541316e-07, "loss": 0.0031, "reward": 2.004718780517578, "reward_std": 0.014473705552518368, "rewards/accuracy_reward": 0.8047187328338623, "rewards/format_reward": 1.0, "step": 2599 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.125, "epoch": 0.02624936900555275, "grad_norm": 1.3885089203567602, "kl": 0.06982421875, "learning_rate": 9.983008514081818e-07, "loss": 0.0028, "reward": 2.067812442779541, "reward_std": 0.004248274955898523, "rewards/accuracy_reward": 0.86781245470047, "rewards/format_reward": 1.0, "step": 2600 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.96875, "epoch": 0.026259464916708733, "grad_norm": 2.269787321718859, "kl": 0.06787109375, "learning_rate": 9.982995448609498e-07, "loss": 0.0027, "reward": 1.7764687538146973, "reward_std": 0.011489463970065117, "rewards/accuracy_reward": 0.6264687776565552, "rewards/format_reward": 1.0, "step": 2601 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.03125, "epoch": 0.026269560827864715, "grad_norm": 8.654265601930483, "kl": 0.0732421875, "learning_rate": 9.982982378124366e-07, "loss": 0.0029, "reward": 2.14634370803833, "reward_std": 0.031415000557899475, "rewards/accuracy_reward": 0.9525936841964722, "rewards/format_reward": 1.0, "step": 2602 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 415.71875, "epoch": 0.026279656739020697, "grad_norm": 1.1639853099979218, "kl": 0.07177734375, "learning_rate": 9.982969302626439e-07, "loss": 0.0029, "reward": 1.8814375400543213, "reward_std": 0.014580884948372841, "rewards/accuracy_reward": 0.7376874685287476, "rewards/format_reward": 1.0, "step": 2603 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.6875, "epoch": 0.02628975265017668, "grad_norm": 2.633425595232667, "kl": 0.0654296875, "learning_rate": 9.982956222115726e-07, "loss": 0.0026, "reward": 2.073906421661377, "reward_std": 0.02568369358778, "rewards/accuracy_reward": 0.8739062547683716, "rewards/format_reward": 1.0, "step": 2604 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.0625, "epoch": 0.02629984856133266, "grad_norm": 9.848450961289291, "kl": 0.06884765625, "learning_rate": 9.982943136592242e-07, "loss": 0.0027, "reward": 1.8531562089920044, "reward_std": 0.007544497027993202, "rewards/accuracy_reward": 0.7031562328338623, "rewards/format_reward": 1.0, "step": 2605 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.65625, "epoch": 0.026309944472488644, "grad_norm": 1.3444685873226965, "kl": 0.039306640625, "learning_rate": 9.982930046056003e-07, "loss": 0.0016, "reward": 1.9375001192092896, "reward_std": 0.29413777589797974, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2606 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.875, "epoch": 0.026320040383644622, "grad_norm": 2.6498986296709703, "kl": 0.0703125, "learning_rate": 9.982916950507016e-07, "loss": 0.0028, "reward": 2.061000108718872, "reward_std": 0.01734635978937149, "rewards/accuracy_reward": 0.8610000014305115, "rewards/format_reward": 1.0, "step": 2607 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.78125, "epoch": 0.026330136294800605, "grad_norm": 1.995852142098849, "kl": 0.06689453125, "learning_rate": 9.982903849945299e-07, "loss": 0.0027, "reward": 2.155031204223633, "reward_std": 0.055645015090703964, "rewards/accuracy_reward": 0.9737812280654907, "rewards/format_reward": 1.0, "step": 2608 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.09375, "epoch": 0.026340232205956587, "grad_norm": 4.057046273932881, "kl": 0.06494140625, "learning_rate": 9.982890744370864e-07, "loss": 0.0026, "reward": 1.5954687595367432, "reward_std": 0.1611555516719818, "rewards/accuracy_reward": 0.4767187237739563, "rewards/format_reward": 1.0, "step": 2609 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.28125, "epoch": 0.02635032811711257, "grad_norm": 1.82876894304322, "kl": 0.0673828125, "learning_rate": 9.982877633783724e-07, "loss": 0.0027, "reward": 2.1450939178466797, "reward_std": 0.03002346307039261, "rewards/accuracy_reward": 0.9575937390327454, "rewards/format_reward": 1.0, "step": 2610 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.75, "epoch": 0.02636042402826855, "grad_norm": 2.828827804135065, "kl": 0.0751953125, "learning_rate": 9.982864518183892e-07, "loss": 0.003, "reward": 2.108750104904175, "reward_std": 0.017514454200863838, "rewards/accuracy_reward": 0.9087499976158142, "rewards/format_reward": 1.0, "step": 2611 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 416.3125, "epoch": 0.026370519939424533, "grad_norm": 1.9493402091016132, "kl": 0.0771484375, "learning_rate": 9.98285139757138e-07, "loss": 0.0031, "reward": 1.7894062995910645, "reward_std": 0.03496653959155083, "rewards/accuracy_reward": 0.6456562280654907, "rewards/format_reward": 1.0, "step": 2612 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.026380615850580515, "grad_norm": 2.100448128745761, "kl": 0.0791015625, "learning_rate": 9.982838271946204e-07, "loss": 0.0032, "reward": 2.131312370300293, "reward_std": 0.013075239956378937, "rewards/accuracy_reward": 0.9313125014305115, "rewards/format_reward": 1.0, "step": 2613 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 409.4375, "epoch": 0.026390711761736498, "grad_norm": 1.06249533778645, "kl": 0.046875, "learning_rate": 9.982825141308375e-07, "loss": 0.0019, "reward": 1.8557500839233398, "reward_std": 0.006306183058768511, "rewards/accuracy_reward": 0.7057499885559082, "rewards/format_reward": 1.0, "step": 2614 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.02640080767289248, "grad_norm": 1.8613845015397004, "kl": 0.0693359375, "learning_rate": 9.982812005657907e-07, "loss": 0.0028, "reward": 2.1826250553131104, "reward_std": 0.0076194340363144875, "rewards/accuracy_reward": 0.9826250076293945, "rewards/format_reward": 1.0, "step": 2615 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.25, "epoch": 0.026410903584048462, "grad_norm": 1.6381431448936463, "kl": 0.06982421875, "learning_rate": 9.982798864994815e-07, "loss": 0.0028, "reward": 1.8915624618530273, "reward_std": 0.008866478689014912, "rewards/accuracy_reward": 0.74156254529953, "rewards/format_reward": 1.0, "step": 2616 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.6875, "epoch": 0.02642099949520444, "grad_norm": 7.5756805173707145, "kl": 0.07861328125, "learning_rate": 9.982785719319109e-07, "loss": 0.0031, "reward": 2.080124855041504, "reward_std": 0.029950063675642014, "rewards/accuracy_reward": 0.8863750100135803, "rewards/format_reward": 1.0, "step": 2617 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 419.25, "epoch": 0.026431095406360423, "grad_norm": 0.9266761698797037, "kl": 0.055419921875, "learning_rate": 9.982772568630803e-07, "loss": 0.0022, "reward": 1.8928437232971191, "reward_std": 0.0025175896007567644, "rewards/accuracy_reward": 0.742843747138977, "rewards/format_reward": 1.0, "step": 2618 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.026441191317516405, "grad_norm": 3.02069778602089, "kl": 0.07568359375, "learning_rate": 9.982759412929911e-07, "loss": 0.003, "reward": 2.179874897003174, "reward_std": 0.013154065236449242, "rewards/accuracy_reward": 0.9798750281333923, "rewards/format_reward": 1.0, "step": 2619 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.625, "epoch": 0.026451287228672387, "grad_norm": 1.7742685678132126, "kl": 0.06884765625, "learning_rate": 9.982746252216448e-07, "loss": 0.0028, "reward": 1.8408124446868896, "reward_std": 0.02114546298980713, "rewards/accuracy_reward": 0.6908125281333923, "rewards/format_reward": 1.0, "step": 2620 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 433.5, "epoch": 0.02646138313982837, "grad_norm": 0.8124906260271607, "kl": 0.056640625, "learning_rate": 9.982733086490422e-07, "loss": 0.0023, "reward": 2.15625, "reward_std": 0.023145517334342003, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2621 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.25, "epoch": 0.02647147905098435, "grad_norm": 2.624172703217949, "kl": 0.08056640625, "learning_rate": 9.982719915751851e-07, "loss": 0.0032, "reward": 2.0663437843322754, "reward_std": 0.028974052518606186, "rewards/accuracy_reward": 0.8663437366485596, "rewards/format_reward": 1.0, "step": 2622 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.78125, "epoch": 0.026481574962140333, "grad_norm": 3.019400261076141, "kl": 0.06201171875, "learning_rate": 9.982706740000747e-07, "loss": 0.0025, "reward": 1.8151562213897705, "reward_std": 0.009670400060713291, "rewards/accuracy_reward": 0.6651562452316284, "rewards/format_reward": 1.0, "step": 2623 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 422.5625, "epoch": 0.026491670873296316, "grad_norm": 4.310121353681803, "kl": 0.08984375, "learning_rate": 9.982693559237125e-07, "loss": 0.0036, "reward": 1.9386250972747803, "reward_std": 0.02926054038107395, "rewards/accuracy_reward": 0.7386249899864197, "rewards/format_reward": 1.0, "step": 2624 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.28125, "epoch": 0.026501766784452298, "grad_norm": 1.9458554428294048, "kl": 0.051513671875, "learning_rate": 9.982680373460994e-07, "loss": 0.0021, "reward": 2.012500047683716, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2625 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.3125, "epoch": 0.02651186269560828, "grad_norm": 2.778295779117946, "kl": 0.06396484375, "learning_rate": 9.98266718267237e-07, "loss": 0.0026, "reward": 2.1678123474121094, "reward_std": 0.03051576018333435, "rewards/accuracy_reward": 0.9740625023841858, "rewards/format_reward": 1.0, "step": 2626 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 414.25, "epoch": 0.026521958606764262, "grad_norm": 2.7329452720824823, "kl": 0.0673828125, "learning_rate": 9.982653986871266e-07, "loss": 0.0027, "reward": 1.8619686365127563, "reward_std": 0.021444939076900482, "rewards/accuracy_reward": 0.7119687795639038, "rewards/format_reward": 1.0, "step": 2627 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.28125, "epoch": 0.02653205451792024, "grad_norm": 2.405470704221892, "kl": 0.07373046875, "learning_rate": 9.982640786057693e-07, "loss": 0.003, "reward": 2.0849690437316895, "reward_std": 0.01472485065460205, "rewards/accuracy_reward": 0.8849687576293945, "rewards/format_reward": 1.0, "step": 2628 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 434.09375, "epoch": 0.026542150429076223, "grad_norm": 1.5742310276267637, "kl": 0.052734375, "learning_rate": 9.98262758023167e-07, "loss": 0.0021, "reward": 1.6060625314712524, "reward_std": 0.11077115684747696, "rewards/accuracy_reward": 0.5123125314712524, "rewards/format_reward": 1.0, "step": 2629 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 413.125, "epoch": 0.026552246340232205, "grad_norm": 2.0182837258651385, "kl": 0.06396484375, "learning_rate": 9.982614369393207e-07, "loss": 0.0026, "reward": 1.8904376029968262, "reward_std": 0.007109799422323704, "rewards/accuracy_reward": 0.7404375076293945, "rewards/format_reward": 1.0, "step": 2630 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.46875, "epoch": 0.026562342251388187, "grad_norm": 0.08625990713473157, "kl": 0.046875, "learning_rate": 9.982601153542314e-07, "loss": 0.0019, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2631 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.53125, "epoch": 0.02657243816254417, "grad_norm": 1.1585295385960543, "kl": 0.0673828125, "learning_rate": 9.98258793267901e-07, "loss": 0.0027, "reward": 1.768531322479248, "reward_std": 0.0014357513282448053, "rewards/accuracy_reward": 0.6185312271118164, "rewards/format_reward": 1.0, "step": 2632 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 422.4375, "epoch": 0.02658253407370015, "grad_norm": 1.7519634552508347, "kl": 0.050048828125, "learning_rate": 9.982574706803302e-07, "loss": 0.002, "reward": 1.8491876125335693, "reward_std": 0.10931669920682907, "rewards/accuracy_reward": 0.7054375410079956, "rewards/format_reward": 1.0, "step": 2633 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 418.3125, "epoch": 0.026592629984856134, "grad_norm": 1.5626650215277331, "kl": 0.058837890625, "learning_rate": 9.98256147591521e-07, "loss": 0.0024, "reward": 1.5636563301086426, "reward_std": 0.005455884151160717, "rewards/accuracy_reward": 0.4636562764644623, "rewards/format_reward": 1.0, "step": 2634 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.09375, "epoch": 0.026602725896012116, "grad_norm": 1.7066250169996153, "kl": 0.06640625, "learning_rate": 9.982548240014743e-07, "loss": 0.0027, "reward": 2.1282811164855957, "reward_std": 0.008311113342642784, "rewards/accuracy_reward": 0.9282812476158142, "rewards/format_reward": 1.0, "step": 2635 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.84375, "epoch": 0.026612821807168098, "grad_norm": 3.4051178923336654, "kl": 0.07373046875, "learning_rate": 9.982534999101916e-07, "loss": 0.003, "reward": 2.0618906021118164, "reward_std": 0.03198734670877457, "rewards/accuracy_reward": 0.8618905544281006, "rewards/format_reward": 1.0, "step": 2636 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.3125, "epoch": 0.02662291771832408, "grad_norm": 2.176035360260561, "kl": 0.08935546875, "learning_rate": 9.98252175317674e-07, "loss": 0.0036, "reward": 1.9844844341278076, "reward_std": 0.012023771181702614, "rewards/accuracy_reward": 0.784484326839447, "rewards/format_reward": 1.0, "step": 2637 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.02663301362948006, "grad_norm": 2.170212620710001, "kl": 0.0849609375, "learning_rate": 9.982508502239234e-07, "loss": 0.0034, "reward": 2.090400218963623, "reward_std": 0.01569283939898014, "rewards/accuracy_reward": 0.8903999924659729, "rewards/format_reward": 1.0, "step": 2638 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.53125, "epoch": 0.02664310954063604, "grad_norm": 1.3645317479497812, "kl": 0.06103515625, "learning_rate": 9.982495246289405e-07, "loss": 0.0024, "reward": 2.1546249389648438, "reward_std": 0.11534155905246735, "rewards/accuracy_reward": 0.9608750343322754, "rewards/format_reward": 1.0, "step": 2639 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.3125, "epoch": 0.026653205451792023, "grad_norm": 1.5299600487726643, "kl": 0.06396484375, "learning_rate": 9.98248198532727e-07, "loss": 0.0026, "reward": 1.866281270980835, "reward_std": 0.006213200278580189, "rewards/accuracy_reward": 0.7162812948226929, "rewards/format_reward": 1.0, "step": 2640 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.026663301362948005, "grad_norm": 3.0482710992557625, "kl": 0.08203125, "learning_rate": 9.98246871935284e-07, "loss": 0.0033, "reward": 2.090031147003174, "reward_std": 0.016149641945958138, "rewards/accuracy_reward": 0.8900312185287476, "rewards/format_reward": 1.0, "step": 2641 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.78125, "epoch": 0.026673397274103988, "grad_norm": 1.9306560355339197, "kl": 0.050048828125, "learning_rate": 9.982455448366128e-07, "loss": 0.002, "reward": 2.124687671661377, "reward_std": 0.02253429964184761, "rewards/accuracy_reward": 0.9309374690055847, "rewards/format_reward": 1.0, "step": 2642 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.0625, "epoch": 0.02668349318525997, "grad_norm": 3.43164248306123, "kl": 0.076171875, "learning_rate": 9.982442172367152e-07, "loss": 0.003, "reward": 2.0210938453674316, "reward_std": 0.031315527856349945, "rewards/accuracy_reward": 0.8210937976837158, "rewards/format_reward": 1.0, "step": 2643 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.3125, "epoch": 0.026693589096415952, "grad_norm": 1.7054849572699626, "kl": 0.0732421875, "learning_rate": 9.98242889135592e-07, "loss": 0.0029, "reward": 2.1694061756134033, "reward_std": 0.0074572074227035046, "rewards/accuracy_reward": 0.969406247138977, "rewards/format_reward": 1.0, "step": 2644 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.0, "epoch": 0.026703685007571934, "grad_norm": 4.521070632790399, "kl": 0.080078125, "learning_rate": 9.982415605332447e-07, "loss": 0.0032, "reward": 2.038937568664551, "reward_std": 0.036777567118406296, "rewards/accuracy_reward": 0.8451875448226929, "rewards/format_reward": 1.0, "step": 2645 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 395.25, "epoch": 0.026713780918727916, "grad_norm": 0.963657108260461, "kl": 0.064453125, "learning_rate": 9.98240231429675e-07, "loss": 0.0026, "reward": 1.5406875610351562, "reward_std": 0.007486596237868071, "rewards/accuracy_reward": 0.44068747758865356, "rewards/format_reward": 1.0, "step": 2646 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.125, "epoch": 0.0267238768298839, "grad_norm": 2.6910027555107225, "kl": 0.07568359375, "learning_rate": 9.982389018248837e-07, "loss": 0.003, "reward": 2.126187562942505, "reward_std": 0.02751275524497032, "rewards/accuracy_reward": 0.9261875152587891, "rewards/format_reward": 1.0, "step": 2647 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.0, "epoch": 0.02673397274103988, "grad_norm": 3.1554019862963147, "kl": 0.06591796875, "learning_rate": 9.982375717188725e-07, "loss": 0.0026, "reward": 1.8530311584472656, "reward_std": 0.09919992834329605, "rewards/accuracy_reward": 0.7030311822891235, "rewards/format_reward": 1.0, "step": 2648 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.96875, "epoch": 0.02674406865219586, "grad_norm": 1.4814982013070175, "kl": 0.0498046875, "learning_rate": 9.982362411116424e-07, "loss": 0.002, "reward": 2.1850624084472656, "reward_std": 0.008648659102618694, "rewards/accuracy_reward": 0.9850624799728394, "rewards/format_reward": 1.0, "step": 2649 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.46875, "epoch": 0.02675416456335184, "grad_norm": 1.006096959228932, "kl": 0.06982421875, "learning_rate": 9.982349100031953e-07, "loss": 0.0028, "reward": 1.9526562690734863, "reward_std": 0.003430282697081566, "rewards/accuracy_reward": 0.7526562213897705, "rewards/format_reward": 1.0, "step": 2650 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.125, "epoch": 0.026764260474507823, "grad_norm": 2.8013743168085585, "kl": 0.0927734375, "learning_rate": 9.982335783935318e-07, "loss": 0.0037, "reward": 2.133490562438965, "reward_std": 0.02447477914392948, "rewards/accuracy_reward": 0.9334906339645386, "rewards/format_reward": 1.0, "step": 2651 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.34375, "epoch": 0.026774356385663806, "grad_norm": 2.0490498844505898, "kl": 0.06591796875, "learning_rate": 9.982322462826537e-07, "loss": 0.0026, "reward": 2.1530001163482666, "reward_std": 0.01280923094600439, "rewards/accuracy_reward": 0.9529999494552612, "rewards/format_reward": 1.0, "step": 2652 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.28125, "epoch": 0.026784452296819788, "grad_norm": 2.1124880519842, "kl": 0.06005859375, "learning_rate": 9.982309136705624e-07, "loss": 0.0024, "reward": 2.1060938835144043, "reward_std": 0.0069242725148797035, "rewards/accuracy_reward": 0.9060937166213989, "rewards/format_reward": 1.0, "step": 2653 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.09375, "epoch": 0.02679454820797577, "grad_norm": 3.4084371248117984, "kl": 0.0703125, "learning_rate": 9.982295805572591e-07, "loss": 0.0028, "reward": 2.1079063415527344, "reward_std": 0.01705128699541092, "rewards/accuracy_reward": 0.9079062938690186, "rewards/format_reward": 1.0, "step": 2654 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.5625, "epoch": 0.026804644119131752, "grad_norm": 1.631299287718525, "kl": 0.06298828125, "learning_rate": 9.982282469427451e-07, "loss": 0.0025, "reward": 1.8773125410079956, "reward_std": 0.09452182799577713, "rewards/accuracy_reward": 0.727312445640564, "rewards/format_reward": 1.0, "step": 2655 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 416.9375, "epoch": 0.026814740030287734, "grad_norm": 1.6001038654651556, "kl": 0.06396484375, "learning_rate": 9.982269128270218e-07, "loss": 0.0026, "reward": 1.50209379196167, "reward_std": 0.025093957781791687, "rewards/accuracy_reward": 0.40834376215934753, "rewards/format_reward": 1.0, "step": 2656 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.875, "epoch": 0.026824835941443716, "grad_norm": 1.3079364918272034, "kl": 0.05322265625, "learning_rate": 9.982255782100905e-07, "loss": 0.0021, "reward": 2.1861562728881836, "reward_std": 0.02258927933871746, "rewards/accuracy_reward": 0.9924062490463257, "rewards/format_reward": 1.0, "step": 2657 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 415.3125, "epoch": 0.0268349318525997, "grad_norm": 1.7392778086899818, "kl": 0.05712890625, "learning_rate": 9.982242430919526e-07, "loss": 0.0023, "reward": 1.5749688148498535, "reward_std": 0.004184963647276163, "rewards/accuracy_reward": 0.4749687612056732, "rewards/format_reward": 1.0, "step": 2658 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 401.96875, "epoch": 0.026845027763755677, "grad_norm": 5.054592445721362, "kl": 0.0693359375, "learning_rate": 9.982229074726095e-07, "loss": 0.0028, "reward": 1.7355313301086426, "reward_std": 0.019196301698684692, "rewards/accuracy_reward": 0.5855311751365662, "rewards/format_reward": 1.0, "step": 2659 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.0625, "epoch": 0.02685512367491166, "grad_norm": 2.46569247370836, "kl": 0.062255859375, "learning_rate": 9.982215713520623e-07, "loss": 0.0025, "reward": 1.9497499465942383, "reward_std": 0.17476683855056763, "rewards/accuracy_reward": 0.7747500538825989, "rewards/format_reward": 1.0, "step": 2660 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.875, "epoch": 0.02686521958606764, "grad_norm": 2.0060989845647605, "kl": 0.06591796875, "learning_rate": 9.982202347303126e-07, "loss": 0.0026, "reward": 1.865187644958496, "reward_std": 0.01017092913389206, "rewards/accuracy_reward": 0.7151874899864197, "rewards/format_reward": 1.0, "step": 2661 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.4375, "epoch": 0.026875315497223624, "grad_norm": 3.2185096251817655, "kl": 0.0810546875, "learning_rate": 9.982188976073615e-07, "loss": 0.0032, "reward": 2.1334688663482666, "reward_std": 0.018196426331996918, "rewards/accuracy_reward": 0.9334686994552612, "rewards/format_reward": 1.0, "step": 2662 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.46875, "epoch": 0.026885411408379606, "grad_norm": 2.7787276726378325, "kl": 0.06640625, "learning_rate": 9.982175599832107e-07, "loss": 0.0027, "reward": 2.0010311603546143, "reward_std": 0.04195598140358925, "rewards/accuracy_reward": 0.8072812557220459, "rewards/format_reward": 1.0, "step": 2663 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.53125, "epoch": 0.026895507319535588, "grad_norm": 2.8413283605716093, "kl": 0.0634765625, "learning_rate": 9.982162218578613e-07, "loss": 0.0025, "reward": 1.8721561431884766, "reward_std": 0.009112885221838951, "rewards/accuracy_reward": 0.722156286239624, "rewards/format_reward": 1.0, "step": 2664 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.0625, "epoch": 0.02690560323069157, "grad_norm": 2.269167770113478, "kl": 0.05859375, "learning_rate": 9.982148832313147e-07, "loss": 0.0023, "reward": 2.166250228881836, "reward_std": 0.008160953409969807, "rewards/accuracy_reward": 0.966249942779541, "rewards/format_reward": 1.0, "step": 2665 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.40625, "epoch": 0.026915699141847552, "grad_norm": 2.817286469910832, "kl": 0.0830078125, "learning_rate": 9.98213544103572e-07, "loss": 0.0033, "reward": 2.066531181335449, "reward_std": 0.02165684476494789, "rewards/accuracy_reward": 0.8665311932563782, "rewards/format_reward": 1.0, "step": 2666 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.84375, "epoch": 0.026925795053003534, "grad_norm": 2.724777334984441, "kl": 0.0693359375, "learning_rate": 9.98212204474635e-07, "loss": 0.0028, "reward": 2.041874885559082, "reward_std": 0.03212722763419151, "rewards/accuracy_reward": 0.8481249809265137, "rewards/format_reward": 1.0, "step": 2667 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.34375, "epoch": 0.026935890964159517, "grad_norm": 2.094009419112791, "kl": 0.058349609375, "learning_rate": 9.982108643445047e-07, "loss": 0.0023, "reward": 2.0404374599456787, "reward_std": 0.027485080063343048, "rewards/accuracy_reward": 0.8404375314712524, "rewards/format_reward": 1.0, "step": 2668 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 396.0625, "epoch": 0.0269459868753155, "grad_norm": 1.2862963882996379, "kl": 0.059814453125, "learning_rate": 9.982095237131826e-07, "loss": 0.0024, "reward": 1.4964687824249268, "reward_std": 0.010166861116886139, "rewards/accuracy_reward": 0.39646875858306885, "rewards/format_reward": 1.0, "step": 2669 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.9375, "epoch": 0.026956082786471477, "grad_norm": 1.415146063464998, "kl": 0.060791015625, "learning_rate": 9.982081825806702e-07, "loss": 0.0024, "reward": 2.034468650817871, "reward_std": 0.009184835478663445, "rewards/accuracy_reward": 0.8344687819480896, "rewards/format_reward": 1.0, "step": 2670 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 408.625, "epoch": 0.02696617869762746, "grad_norm": 0.9618510963755875, "kl": 0.0458984375, "learning_rate": 9.982068409469685e-07, "loss": 0.0018, "reward": 1.8962187767028809, "reward_std": 0.003697105683386326, "rewards/accuracy_reward": 0.746218740940094, "rewards/format_reward": 1.0, "step": 2671 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.09375, "epoch": 0.026976274608783442, "grad_norm": 2.119241460818996, "kl": 0.0673828125, "learning_rate": 9.982054988120791e-07, "loss": 0.0027, "reward": 1.958531141281128, "reward_std": 0.1672874242067337, "rewards/accuracy_reward": 0.7835312485694885, "rewards/format_reward": 1.0, "step": 2672 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.34375, "epoch": 0.026986370519939424, "grad_norm": 2.812490275065061, "kl": 0.0615234375, "learning_rate": 9.982041561760031e-07, "loss": 0.0025, "reward": 1.7384063005447388, "reward_std": 0.026597891002893448, "rewards/accuracy_reward": 0.594656229019165, "rewards/format_reward": 1.0, "step": 2673 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.6875, "epoch": 0.026996466431095406, "grad_norm": 1.9457386899962634, "kl": 0.07080078125, "learning_rate": 9.982028130387422e-07, "loss": 0.0028, "reward": 2.0875937938690186, "reward_std": 0.013576706871390343, "rewards/accuracy_reward": 0.8875937461853027, "rewards/format_reward": 1.0, "step": 2674 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.34375, "epoch": 0.02700656234225139, "grad_norm": 2.6361407638556327, "kl": 0.06640625, "learning_rate": 9.982014694002976e-07, "loss": 0.0027, "reward": 2.1755623817443848, "reward_std": 0.037334755063056946, "rewards/accuracy_reward": 0.9818124771118164, "rewards/format_reward": 1.0, "step": 2675 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.6875, "epoch": 0.02701665825340737, "grad_norm": 1.7124443247220287, "kl": 0.061279296875, "learning_rate": 9.982001252606705e-07, "loss": 0.0025, "reward": 1.5748437643051147, "reward_std": 0.09461972862482071, "rewards/accuracy_reward": 0.47484374046325684, "rewards/format_reward": 1.0, "step": 2676 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 390.46875, "epoch": 0.027026754164563353, "grad_norm": 3.2001781279704806, "kl": 0.0634765625, "learning_rate": 9.981987806198624e-07, "loss": 0.0025, "reward": 1.831375002861023, "reward_std": 0.014221429824829102, "rewards/accuracy_reward": 0.6813750267028809, "rewards/format_reward": 1.0, "step": 2677 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.8125, "epoch": 0.027036850075719335, "grad_norm": 1.635223545452289, "kl": 0.056884765625, "learning_rate": 9.981974354778745e-07, "loss": 0.0023, "reward": 2.1784374713897705, "reward_std": 0.0064778863452374935, "rewards/accuracy_reward": 0.9784374833106995, "rewards/format_reward": 1.0, "step": 2678 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.96875, "epoch": 0.027046945986875317, "grad_norm": 2.262734616314554, "kl": 0.0634765625, "learning_rate": 9.981960898347085e-07, "loss": 0.0025, "reward": 1.886906385421753, "reward_std": 0.12051891535520554, "rewards/accuracy_reward": 0.7494062185287476, "rewards/format_reward": 1.0, "step": 2679 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.03125, "epoch": 0.0270570418980313, "grad_norm": 4.50451702489144, "kl": 0.06591796875, "learning_rate": 9.981947436903656e-07, "loss": 0.0026, "reward": 2.101781129837036, "reward_std": 0.01593118906021118, "rewards/accuracy_reward": 0.9017812609672546, "rewards/format_reward": 1.0, "step": 2680 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.875, "epoch": 0.027067137809187278, "grad_norm": 1.662901236230492, "kl": 0.07177734375, "learning_rate": 9.98193397044847e-07, "loss": 0.0029, "reward": 2.042781352996826, "reward_std": 0.012335184030234814, "rewards/accuracy_reward": 0.8427812457084656, "rewards/format_reward": 1.0, "step": 2681 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.625, "epoch": 0.02707723372034326, "grad_norm": 2.6367117003450846, "kl": 0.0703125, "learning_rate": 9.98192049898154e-07, "loss": 0.0028, "reward": 1.8015625476837158, "reward_std": 0.012438907288014889, "rewards/accuracy_reward": 0.6515624523162842, "rewards/format_reward": 1.0, "step": 2682 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.9375, "epoch": 0.027087329631499242, "grad_norm": 2.8877209452590424, "kl": 0.0771484375, "learning_rate": 9.981907022502883e-07, "loss": 0.0031, "reward": 2.063687324523926, "reward_std": 0.030666008591651917, "rewards/accuracy_reward": 0.869937539100647, "rewards/format_reward": 1.0, "step": 2683 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 395.71875, "epoch": 0.027097425542655224, "grad_norm": 0.09136264511980699, "kl": 0.0478515625, "learning_rate": 9.981893541012509e-07, "loss": 0.0019, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2684 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.34375, "epoch": 0.027107521453811206, "grad_norm": 2.9877508995294844, "kl": 0.06787109375, "learning_rate": 9.981880054510434e-07, "loss": 0.0027, "reward": 2.1750311851501465, "reward_std": 0.025722766295075417, "rewards/accuracy_reward": 0.9812812209129333, "rewards/format_reward": 1.0, "step": 2685 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.3125, "epoch": 0.02711761736496719, "grad_norm": 7.45378138132877, "kl": 0.059814453125, "learning_rate": 9.98186656299667e-07, "loss": 0.0024, "reward": 1.8917813301086426, "reward_std": 0.1653580516576767, "rewards/accuracy_reward": 0.7230312824249268, "rewards/format_reward": 1.0, "step": 2686 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.5625, "epoch": 0.02712771327612317, "grad_norm": 2.1864995645672556, "kl": 0.0771484375, "learning_rate": 9.981853066471233e-07, "loss": 0.0031, "reward": 2.1049375534057617, "reward_std": 0.01687164045870304, "rewards/accuracy_reward": 0.9049374461174011, "rewards/format_reward": 1.0, "step": 2687 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.03125, "epoch": 0.027137809187279153, "grad_norm": 2.1925277028153216, "kl": 0.07177734375, "learning_rate": 9.981839564934133e-07, "loss": 0.0029, "reward": 1.7370624542236328, "reward_std": 0.013007921166718006, "rewards/accuracy_reward": 0.5870624780654907, "rewards/format_reward": 1.0, "step": 2688 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 412.65625, "epoch": 0.027147905098435135, "grad_norm": 1.131586362023955, "kl": 0.06396484375, "learning_rate": 9.981826058385386e-07, "loss": 0.0025, "reward": 1.5332187414169312, "reward_std": 0.0002815134357661009, "rewards/accuracy_reward": 0.433218777179718, "rewards/format_reward": 1.0, "step": 2689 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.027158001009591117, "grad_norm": 2.5330150189331673, "kl": 0.08154296875, "learning_rate": 9.981812546825006e-07, "loss": 0.0033, "reward": 2.0806875228881836, "reward_std": 0.024743063375353813, "rewards/accuracy_reward": 0.8869374990463257, "rewards/format_reward": 1.0, "step": 2690 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.027168096920747096, "grad_norm": 3.083272073007579, "kl": 0.0771484375, "learning_rate": 9.981799030253003e-07, "loss": 0.0031, "reward": 2.069499969482422, "reward_std": 0.014571579173207283, "rewards/accuracy_reward": 0.8695000410079956, "rewards/format_reward": 1.0, "step": 2691 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.5, "epoch": 0.027178192831903078, "grad_norm": 1.981653550237217, "kl": 0.06201171875, "learning_rate": 9.981785508669396e-07, "loss": 0.0025, "reward": 2.1578125953674316, "reward_std": 0.009761599823832512, "rewards/accuracy_reward": 0.9578125476837158, "rewards/format_reward": 1.0, "step": 2692 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.53125, "epoch": 0.02718828874305906, "grad_norm": 3.745862661760847, "kl": 0.0849609375, "learning_rate": 9.981771982074194e-07, "loss": 0.0034, "reward": 2.0893750190734863, "reward_std": 0.012738585472106934, "rewards/accuracy_reward": 0.8893749713897705, "rewards/format_reward": 1.0, "step": 2693 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.4375, "epoch": 0.027198384654215042, "grad_norm": 2.648838232803915, "kl": 0.0712890625, "learning_rate": 9.981758450467414e-07, "loss": 0.0029, "reward": 1.729312539100647, "reward_std": 0.007278076838701963, "rewards/accuracy_reward": 0.5793124437332153, "rewards/format_reward": 1.0, "step": 2694 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.9375, "epoch": 0.027208480565371024, "grad_norm": 2.5127000461160924, "kl": 0.0830078125, "learning_rate": 9.981744913849067e-07, "loss": 0.0033, "reward": 1.8943126201629639, "reward_std": 0.025361692532896996, "rewards/accuracy_reward": 0.7005624771118164, "rewards/format_reward": 1.0, "step": 2695 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.5, "epoch": 0.027218576476527007, "grad_norm": 2.550857494925397, "kl": 0.07421875, "learning_rate": 9.981731372219168e-07, "loss": 0.003, "reward": 2.1592812538146973, "reward_std": 0.022906364873051643, "rewards/accuracy_reward": 0.9655312299728394, "rewards/format_reward": 1.0, "step": 2696 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.625, "epoch": 0.02722867238768299, "grad_norm": 4.58616711017665, "kl": 0.0791015625, "learning_rate": 9.98171782557773e-07, "loss": 0.0031, "reward": 2.1255626678466797, "reward_std": 0.0072512077167630196, "rewards/accuracy_reward": 0.9255625009536743, "rewards/format_reward": 1.0, "step": 2697 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.15625, "epoch": 0.02723876829883897, "grad_norm": 4.08751413911989, "kl": 0.07568359375, "learning_rate": 9.981704273924768e-07, "loss": 0.003, "reward": 2.0295000076293945, "reward_std": 0.02621440216898918, "rewards/accuracy_reward": 0.8357499837875366, "rewards/format_reward": 1.0, "step": 2698 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.78125, "epoch": 0.027248864209994953, "grad_norm": 3.8941295433634644, "kl": 0.0859375, "learning_rate": 9.981690717260292e-07, "loss": 0.0034, "reward": 2.080812454223633, "reward_std": 0.01579369232058525, "rewards/accuracy_reward": 0.8808125257492065, "rewards/format_reward": 1.0, "step": 2699 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.53125, "epoch": 0.027258960121150935, "grad_norm": 2.4284465457640625, "kl": 0.06982421875, "learning_rate": 9.98167715558432e-07, "loss": 0.0028, "reward": 1.88853120803833, "reward_std": 0.15905968844890594, "rewards/accuracy_reward": 0.7260312438011169, "rewards/format_reward": 1.0, "step": 2700 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 410.28125, "epoch": 0.027269056032306917, "grad_norm": 2.1093735779858545, "kl": 0.076171875, "learning_rate": 9.981663588896863e-07, "loss": 0.003, "reward": 1.571906328201294, "reward_std": 0.014163952320814133, "rewards/accuracy_reward": 0.47190624475479126, "rewards/format_reward": 1.0, "step": 2701 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 427.3125, "epoch": 0.027279151943462896, "grad_norm": 2.028995620534056, "kl": 0.056640625, "learning_rate": 9.981650017197936e-07, "loss": 0.0023, "reward": 1.8055626153945923, "reward_std": 0.01403803750872612, "rewards/accuracy_reward": 0.6555625200271606, "rewards/format_reward": 1.0, "step": 2702 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.0625, "epoch": 0.027289247854618878, "grad_norm": 2.017563682758754, "kl": 0.06494140625, "learning_rate": 9.981636440487553e-07, "loss": 0.0026, "reward": 1.8488751649856567, "reward_std": 0.007856779731810093, "rewards/accuracy_reward": 0.6988749504089355, "rewards/format_reward": 1.0, "step": 2703 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.03125, "epoch": 0.02729934376577486, "grad_norm": 1.9249833529451124, "kl": 0.0625, "learning_rate": 9.981622858765725e-07, "loss": 0.0025, "reward": 2.1770312786102295, "reward_std": 0.022464429959654808, "rewards/accuracy_reward": 0.9770312309265137, "rewards/format_reward": 1.0, "step": 2704 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.71875, "epoch": 0.027309439676930843, "grad_norm": 2.890109476720054, "kl": 0.08642578125, "learning_rate": 9.981609272032469e-07, "loss": 0.0034, "reward": 1.972312569618225, "reward_std": 0.019668057560920715, "rewards/accuracy_reward": 0.7723125219345093, "rewards/format_reward": 1.0, "step": 2705 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.625, "epoch": 0.027319535588086825, "grad_norm": 8.619204936809039, "kl": 0.07763671875, "learning_rate": 9.981595680287795e-07, "loss": 0.0031, "reward": 2.137500047683716, "reward_std": 0.01917753741145134, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 2706 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.027329631499242807, "grad_norm": 0.8124034670559629, "kl": 0.06787109375, "learning_rate": 9.981582083531721e-07, "loss": 0.0027, "reward": 2.1987814903259277, "reward_std": 0.0034471547696739435, "rewards/accuracy_reward": 0.9987812638282776, "rewards/format_reward": 1.0, "step": 2707 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.1875, "epoch": 0.02733972741039879, "grad_norm": 1.7515650424484859, "kl": 0.06884765625, "learning_rate": 9.981568481764257e-07, "loss": 0.0027, "reward": 2.106750011444092, "reward_std": 0.010943202301859856, "rewards/accuracy_reward": 0.9067500233650208, "rewards/format_reward": 1.0, "step": 2708 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.875, "epoch": 0.02734982332155477, "grad_norm": 1.9692636647110984, "kl": 0.06298828125, "learning_rate": 9.98155487498542e-07, "loss": 0.0025, "reward": 2.1825625896453857, "reward_std": 0.006572823040187359, "rewards/accuracy_reward": 0.9825624823570251, "rewards/format_reward": 1.0, "step": 2709 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.5625, "epoch": 0.027359919232710753, "grad_norm": 1.7956391769036606, "kl": 0.0625, "learning_rate": 9.98154126319522e-07, "loss": 0.0025, "reward": 1.705625057220459, "reward_std": 0.16012531518936157, "rewards/accuracy_reward": 0.5868750214576721, "rewards/format_reward": 1.0, "step": 2710 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.6875, "epoch": 0.027370015143866736, "grad_norm": 1.350379760109285, "kl": 0.06884765625, "learning_rate": 9.981527646393674e-07, "loss": 0.0028, "reward": 1.7990000247955322, "reward_std": 0.02197260409593582, "rewards/accuracy_reward": 0.6552499532699585, "rewards/format_reward": 1.0, "step": 2711 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 415.34375, "epoch": 0.027380111055022714, "grad_norm": 1.2803863020520543, "kl": 0.0673828125, "learning_rate": 9.981514024580794e-07, "loss": 0.0027, "reward": 1.8856875896453857, "reward_std": 0.003943243995308876, "rewards/accuracy_reward": 0.7356874942779541, "rewards/format_reward": 1.0, "step": 2712 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 418.0625, "epoch": 0.027390206966178696, "grad_norm": 1.1059111366938688, "kl": 0.055908203125, "learning_rate": 9.981500397756592e-07, "loss": 0.0022, "reward": 1.5347187519073486, "reward_std": 0.007689792662858963, "rewards/accuracy_reward": 0.4347187876701355, "rewards/format_reward": 1.0, "step": 2713 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.84375, "epoch": 0.02740030287733468, "grad_norm": 2.6316688949494154, "kl": 0.08251953125, "learning_rate": 9.981486765921084e-07, "loss": 0.0033, "reward": 2.0952811241149902, "reward_std": 0.030849367380142212, "rewards/accuracy_reward": 0.9015312194824219, "rewards/format_reward": 1.0, "step": 2714 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.8125, "epoch": 0.02741039878849066, "grad_norm": 4.6499314275154715, "kl": 0.0869140625, "learning_rate": 9.981473129074284e-07, "loss": 0.0035, "reward": 2.045687437057495, "reward_std": 0.02988586388528347, "rewards/accuracy_reward": 0.8519375324249268, "rewards/format_reward": 1.0, "step": 2715 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.25, "epoch": 0.027420494699646643, "grad_norm": 4.265771818482657, "kl": 0.076171875, "learning_rate": 9.981459487216207e-07, "loss": 0.0031, "reward": 2.095968723297119, "reward_std": 0.012652475386857986, "rewards/accuracy_reward": 0.8959687352180481, "rewards/format_reward": 1.0, "step": 2716 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.71875, "epoch": 0.027430590610802625, "grad_norm": 1.8721393480233097, "kl": 0.0712890625, "learning_rate": 9.981445840346862e-07, "loss": 0.0029, "reward": 2.102375030517578, "reward_std": 0.03951933979988098, "rewards/accuracy_reward": 0.9148750305175781, "rewards/format_reward": 1.0, "step": 2717 }, { "all_correct": 0.25, "all_wrong": 0.75, "completion_length": 390.84375, "epoch": 0.027440686521958607, "grad_norm": 1.0955464494608282, "kl": 0.0517578125, "learning_rate": 9.981432188466265e-07, "loss": 0.0021, "reward": 1.2937500476837158, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 2718 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.8125, "epoch": 0.02745078243311459, "grad_norm": 2.8997209499124548, "kl": 0.0693359375, "learning_rate": 9.981418531574432e-07, "loss": 0.0028, "reward": 1.7793124914169312, "reward_std": 0.022216398268938065, "rewards/accuracy_reward": 0.6355624198913574, "rewards/format_reward": 1.0, "step": 2719 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.21875, "epoch": 0.02746087834427057, "grad_norm": 7.4995271303446245, "kl": 0.07421875, "learning_rate": 9.981404869671375e-07, "loss": 0.003, "reward": 2.142843723297119, "reward_std": 0.01470143347978592, "rewards/accuracy_reward": 0.9428437352180481, "rewards/format_reward": 1.0, "step": 2720 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.027470974255426554, "grad_norm": 1.825453101713549, "kl": 0.06396484375, "learning_rate": 9.981391202757108e-07, "loss": 0.0026, "reward": 2.1401875019073486, "reward_std": 0.011360252276062965, "rewards/accuracy_reward": 0.9401875138282776, "rewards/format_reward": 1.0, "step": 2721 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.0625, "epoch": 0.027481070166582536, "grad_norm": 1.4139331003346165, "kl": 0.0654296875, "learning_rate": 9.981377530831643e-07, "loss": 0.0026, "reward": 2.1511876583099365, "reward_std": 0.006683865562081337, "rewards/accuracy_reward": 0.9511875510215759, "rewards/format_reward": 1.0, "step": 2722 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.28125, "epoch": 0.027491166077738514, "grad_norm": 2.2393031745431475, "kl": 0.080078125, "learning_rate": 9.981363853894998e-07, "loss": 0.0032, "reward": 2.099843978881836, "reward_std": 0.010560243390500546, "rewards/accuracy_reward": 0.8998437523841858, "rewards/format_reward": 1.0, "step": 2723 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.8125, "epoch": 0.027501261988894497, "grad_norm": 1.9264506076909442, "kl": 0.0771484375, "learning_rate": 9.98135017194718e-07, "loss": 0.0031, "reward": 1.8791248798370361, "reward_std": 0.0060431682504713535, "rewards/accuracy_reward": 0.7291250228881836, "rewards/format_reward": 1.0, "step": 2724 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.71875, "epoch": 0.02751135790005048, "grad_norm": 1.7247522403144475, "kl": 0.06640625, "learning_rate": 9.98133648498821e-07, "loss": 0.0027, "reward": 1.8205937147140503, "reward_std": 0.010048173367977142, "rewards/accuracy_reward": 0.6705937385559082, "rewards/format_reward": 1.0, "step": 2725 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 399.90625, "epoch": 0.02752145381120646, "grad_norm": 3.136344101772427, "kl": 0.07275390625, "learning_rate": 9.981322793018096e-07, "loss": 0.0029, "reward": 1.753156304359436, "reward_std": 0.01547764427959919, "rewards/accuracy_reward": 0.6031562685966492, "rewards/format_reward": 1.0, "step": 2726 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.28125, "epoch": 0.027531549722362443, "grad_norm": 1.7688481670800156, "kl": 0.06689453125, "learning_rate": 9.981309096036857e-07, "loss": 0.0027, "reward": 2.1572813987731934, "reward_std": 0.007666308432817459, "rewards/accuracy_reward": 0.9572812914848328, "rewards/format_reward": 1.0, "step": 2727 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.09375, "epoch": 0.027541645633518425, "grad_norm": 3.178340359555563, "kl": 0.08154296875, "learning_rate": 9.981295394044503e-07, "loss": 0.0033, "reward": 1.7455625534057617, "reward_std": 0.013181490823626518, "rewards/accuracy_reward": 0.5955624580383301, "rewards/format_reward": 1.0, "step": 2728 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.5625, "epoch": 0.027551741544674407, "grad_norm": 3.197518551422863, "kl": 0.07861328125, "learning_rate": 9.981281687041047e-07, "loss": 0.0031, "reward": 1.7919687032699585, "reward_std": 0.013510811142623425, "rewards/accuracy_reward": 0.6419687271118164, "rewards/format_reward": 1.0, "step": 2729 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.375, "epoch": 0.02756183745583039, "grad_norm": 1.9927273321550119, "kl": 0.078125, "learning_rate": 9.981267975026509e-07, "loss": 0.0031, "reward": 2.1443748474121094, "reward_std": 0.008234651759266853, "rewards/accuracy_reward": 0.9443750381469727, "rewards/format_reward": 1.0, "step": 2730 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.03125, "epoch": 0.02757193336698637, "grad_norm": 4.318002108563918, "kl": 0.06982421875, "learning_rate": 9.981254258000894e-07, "loss": 0.0028, "reward": 2.1113438606262207, "reward_std": 0.007890940643846989, "rewards/accuracy_reward": 0.9113437533378601, "rewards/format_reward": 1.0, "step": 2731 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.027582029278142354, "grad_norm": 3.6416088657862296, "kl": 0.0703125, "learning_rate": 9.98124053596422e-07, "loss": 0.0028, "reward": 2.107156276702881, "reward_std": 0.14755654335021973, "rewards/accuracy_reward": 0.9196562170982361, "rewards/format_reward": 1.0, "step": 2732 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.5625, "epoch": 0.027592125189298333, "grad_norm": 4.568506625025532, "kl": 0.06787109375, "learning_rate": 9.981226808916505e-07, "loss": 0.0027, "reward": 1.825874924659729, "reward_std": 0.012183181941509247, "rewards/accuracy_reward": 0.6758749485015869, "rewards/format_reward": 1.0, "step": 2733 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.53125, "epoch": 0.027602221100454315, "grad_norm": 1.5404877514985884, "kl": 0.07177734375, "learning_rate": 9.981213076857758e-07, "loss": 0.0029, "reward": 2.1037187576293945, "reward_std": 0.004475605208426714, "rewards/accuracy_reward": 0.9037188291549683, "rewards/format_reward": 1.0, "step": 2734 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 407.53125, "epoch": 0.027612317011610297, "grad_norm": 3.2634056164683134, "kl": 0.0634765625, "learning_rate": 9.98119933978799e-07, "loss": 0.0025, "reward": 1.7723437547683716, "reward_std": 0.009451000019907951, "rewards/accuracy_reward": 0.6223437786102295, "rewards/format_reward": 1.0, "step": 2735 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.03125, "epoch": 0.02762241292276628, "grad_norm": 0.08927550152221157, "kl": 0.05126953125, "learning_rate": 9.981185597707222e-07, "loss": 0.0021, "reward": 1.8375000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2736 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.02763250883392226, "grad_norm": 6.569765583881573, "kl": 0.08056640625, "learning_rate": 9.981171850615465e-07, "loss": 0.0032, "reward": 2.077937602996826, "reward_std": 0.05152323842048645, "rewards/accuracy_reward": 0.8841875195503235, "rewards/format_reward": 1.0, "step": 2737 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.625, "epoch": 0.027642604745078243, "grad_norm": 2.9983581856714654, "kl": 0.0673828125, "learning_rate": 9.98115809851273e-07, "loss": 0.0027, "reward": 2.1118125915527344, "reward_std": 0.01587536558508873, "rewards/accuracy_reward": 0.9118125438690186, "rewards/format_reward": 1.0, "step": 2738 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.59375, "epoch": 0.027652700656234225, "grad_norm": 2.66613122742627, "kl": 0.08544921875, "learning_rate": 9.981144341399032e-07, "loss": 0.0034, "reward": 2.1229686737060547, "reward_std": 0.010299880057573318, "rewards/accuracy_reward": 0.9229687452316284, "rewards/format_reward": 1.0, "step": 2739 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.71875, "epoch": 0.027662796567390208, "grad_norm": 2.0292576571152487, "kl": 0.06787109375, "learning_rate": 9.981130579274389e-07, "loss": 0.0027, "reward": 1.978562593460083, "reward_std": 0.15892702341079712, "rewards/accuracy_reward": 0.7973124980926514, "rewards/format_reward": 1.0, "step": 2740 }, { "all_correct": 0.25, "all_wrong": 0.75, "completion_length": 408.46875, "epoch": 0.02767289247854619, "grad_norm": 0.09929062910851902, "kl": 0.05712890625, "learning_rate": 9.981116812138808e-07, "loss": 0.0023, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 2741 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.027682988389702172, "grad_norm": 2.3092723572165443, "kl": 0.072265625, "learning_rate": 9.981103039992308e-07, "loss": 0.0029, "reward": 2.086531162261963, "reward_std": 0.01970745623111725, "rewards/accuracy_reward": 0.8865312337875366, "rewards/format_reward": 1.0, "step": 2742 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.09375, "epoch": 0.027693084300858154, "grad_norm": 1.7066378409641678, "kl": 0.068359375, "learning_rate": 9.981089262834903e-07, "loss": 0.0027, "reward": 2.1853749752044678, "reward_std": 0.007609303575009108, "rewards/accuracy_reward": 0.985374927520752, "rewards/format_reward": 1.0, "step": 2743 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.53125, "epoch": 0.027703180212014133, "grad_norm": 3.3821316366792087, "kl": 0.0810546875, "learning_rate": 9.981075480666603e-07, "loss": 0.0032, "reward": 2.0781874656677246, "reward_std": 0.028825394809246063, "rewards/accuracy_reward": 0.8844374418258667, "rewards/format_reward": 1.0, "step": 2744 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.65625, "epoch": 0.027713276123170115, "grad_norm": 0.9699971897906564, "kl": 0.0576171875, "learning_rate": 9.981061693487424e-07, "loss": 0.0023, "reward": 2.18331241607666, "reward_std": 0.0013144169934093952, "rewards/accuracy_reward": 0.9833124876022339, "rewards/format_reward": 1.0, "step": 2745 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.6875, "epoch": 0.027723372034326097, "grad_norm": 4.543375322349543, "kl": 0.07958984375, "learning_rate": 9.981047901297382e-07, "loss": 0.0032, "reward": 2.0215001106262207, "reward_std": 0.0385952927172184, "rewards/accuracy_reward": 0.8277499675750732, "rewards/format_reward": 1.0, "step": 2746 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.75, "epoch": 0.02773346794548208, "grad_norm": 1.743498380208137, "kl": 0.06494140625, "learning_rate": 9.981034104096487e-07, "loss": 0.0026, "reward": 2.1655936241149902, "reward_std": 0.007506330497562885, "rewards/accuracy_reward": 0.9655937552452087, "rewards/format_reward": 1.0, "step": 2747 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.75, "epoch": 0.02774356385663806, "grad_norm": 1.4884761410842313, "kl": 0.058837890625, "learning_rate": 9.981020301884758e-07, "loss": 0.0024, "reward": 1.8075313568115234, "reward_std": 0.008366508409380913, "rewards/accuracy_reward": 0.6575312614440918, "rewards/format_reward": 1.0, "step": 2748 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.65625, "epoch": 0.027753659767794044, "grad_norm": 3.1760129971151585, "kl": 0.076171875, "learning_rate": 9.981006494662202e-07, "loss": 0.0031, "reward": 2.1169373989105225, "reward_std": 0.009852477349340916, "rewards/accuracy_reward": 0.9169374704360962, "rewards/format_reward": 1.0, "step": 2749 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.15625, "epoch": 0.027763755678950026, "grad_norm": 1.7511181045666995, "kl": 0.056884765625, "learning_rate": 9.980992682428836e-07, "loss": 0.0023, "reward": 1.8141250610351562, "reward_std": 0.16152024269104004, "rewards/accuracy_reward": 0.6828749179840088, "rewards/format_reward": 1.0, "step": 2750 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.1875, "epoch": 0.027773851590106008, "grad_norm": 1.5175133825841447, "kl": 0.0673828125, "learning_rate": 9.980978865184678e-07, "loss": 0.0027, "reward": 2.138906240463257, "reward_std": 0.02105693146586418, "rewards/accuracy_reward": 0.9451562166213989, "rewards/format_reward": 1.0, "step": 2751 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.3125, "epoch": 0.02778394750126199, "grad_norm": 9.759900732893058, "kl": 0.068359375, "learning_rate": 9.980965042929735e-07, "loss": 0.0027, "reward": 2.031343698501587, "reward_std": 0.011506367474794388, "rewards/accuracy_reward": 0.8313437104225159, "rewards/format_reward": 1.0, "step": 2752 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.71875, "epoch": 0.027794043412417972, "grad_norm": 2.1967119003970508, "kl": 0.08349609375, "learning_rate": 9.980951215664027e-07, "loss": 0.0033, "reward": 2.1621875762939453, "reward_std": 0.008014960214495659, "rewards/accuracy_reward": 0.9621875286102295, "rewards/format_reward": 1.0, "step": 2753 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.02780413932357395, "grad_norm": 3.321183671365667, "kl": 0.07958984375, "learning_rate": 9.980937383387562e-07, "loss": 0.0032, "reward": 2.0903749465942383, "reward_std": 0.02797507494688034, "rewards/accuracy_reward": 0.890375018119812, "rewards/format_reward": 1.0, "step": 2754 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.0, "epoch": 0.027814235234729933, "grad_norm": 1.8976356618549097, "kl": 0.07177734375, "learning_rate": 9.98092354610036e-07, "loss": 0.0029, "reward": 1.8502187728881836, "reward_std": 0.009504428133368492, "rewards/accuracy_reward": 0.7002187967300415, "rewards/format_reward": 1.0, "step": 2755 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.59375, "epoch": 0.027824331145885915, "grad_norm": 3.5805168405201315, "kl": 0.08740234375, "learning_rate": 9.98090970380243e-07, "loss": 0.0035, "reward": 2.1007189750671387, "reward_std": 0.035604313015937805, "rewards/accuracy_reward": 0.9069687724113464, "rewards/format_reward": 1.0, "step": 2756 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.3125, "epoch": 0.027834427057041897, "grad_norm": 1.6903299142385353, "kl": 0.0732421875, "learning_rate": 9.980895856493787e-07, "loss": 0.0029, "reward": 2.1278748512268066, "reward_std": 0.007902707904577255, "rewards/accuracy_reward": 0.9278749823570251, "rewards/format_reward": 1.0, "step": 2757 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.46875, "epoch": 0.02784452296819788, "grad_norm": 1.6641781347191527, "kl": 0.07275390625, "learning_rate": 9.980882004174447e-07, "loss": 0.0029, "reward": 1.8703124523162842, "reward_std": 0.00485941581428051, "rewards/accuracy_reward": 0.7203124761581421, "rewards/format_reward": 1.0, "step": 2758 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.96875, "epoch": 0.02785461887935386, "grad_norm": 2.8883184900750125, "kl": 0.0673828125, "learning_rate": 9.980868146844423e-07, "loss": 0.0027, "reward": 2.0845000743865967, "reward_std": 0.014640956185758114, "rewards/accuracy_reward": 0.8845000267028809, "rewards/format_reward": 1.0, "step": 2759 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.09375, "epoch": 0.027864714790509844, "grad_norm": 2.9465388451851884, "kl": 0.072265625, "learning_rate": 9.980854284503728e-07, "loss": 0.0029, "reward": 2.186906337738037, "reward_std": 0.008313160389661789, "rewards/accuracy_reward": 0.9869062900543213, "rewards/format_reward": 1.0, "step": 2760 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.09375, "epoch": 0.027874810701665826, "grad_norm": 1.5016269454285498, "kl": 0.0693359375, "learning_rate": 9.980840417152378e-07, "loss": 0.0028, "reward": 1.8503124713897705, "reward_std": 0.006762629374861717, "rewards/accuracy_reward": 0.7003125548362732, "rewards/format_reward": 1.0, "step": 2761 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.28125, "epoch": 0.027884906612821808, "grad_norm": 3.4418687601432105, "kl": 0.0673828125, "learning_rate": 9.980826544790384e-07, "loss": 0.0027, "reward": 1.8602187633514404, "reward_std": 0.01302350964397192, "rewards/accuracy_reward": 0.7102187871932983, "rewards/format_reward": 1.0, "step": 2762 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.875, "epoch": 0.02789500252397779, "grad_norm": 2.6343135959481994, "kl": 0.07861328125, "learning_rate": 9.980812667417762e-07, "loss": 0.0031, "reward": 2.078843593597412, "reward_std": 0.02945401705801487, "rewards/accuracy_reward": 0.8850937485694885, "rewards/format_reward": 1.0, "step": 2763 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.027905098435133772, "grad_norm": 4.402938007546072, "kl": 0.07470703125, "learning_rate": 9.980798785034528e-07, "loss": 0.003, "reward": 2.165343761444092, "reward_std": 0.015871554613113403, "rewards/accuracy_reward": 0.9653437733650208, "rewards/format_reward": 1.0, "step": 2764 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.96875, "epoch": 0.02791519434628975, "grad_norm": 2.837959757646275, "kl": 0.07080078125, "learning_rate": 9.980784897640688e-07, "loss": 0.0028, "reward": 1.8456876277923584, "reward_std": 0.01197061687707901, "rewards/accuracy_reward": 0.695687472820282, "rewards/format_reward": 1.0, "step": 2765 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.4375, "epoch": 0.027925290257445733, "grad_norm": 2.1572271835567234, "kl": 0.07373046875, "learning_rate": 9.980771005236266e-07, "loss": 0.0029, "reward": 1.85728120803833, "reward_std": 0.01083050761371851, "rewards/accuracy_reward": 0.707281231880188, "rewards/format_reward": 1.0, "step": 2766 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.03125, "epoch": 0.027935386168601715, "grad_norm": 2.411714136647814, "kl": 0.08544921875, "learning_rate": 9.98075710782127e-07, "loss": 0.0034, "reward": 2.0433437824249268, "reward_std": 0.016480199992656708, "rewards/accuracy_reward": 0.8433437943458557, "rewards/format_reward": 1.0, "step": 2767 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.5625, "epoch": 0.027945482079757698, "grad_norm": 2.098138854372304, "kl": 0.076171875, "learning_rate": 9.980743205395715e-07, "loss": 0.0031, "reward": 1.8699064254760742, "reward_std": 0.010003980249166489, "rewards/accuracy_reward": 0.719906210899353, "rewards/format_reward": 1.0, "step": 2768 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.5625, "epoch": 0.02795557799091368, "grad_norm": 1.4913272181815158, "kl": 0.06640625, "learning_rate": 9.980729297959615e-07, "loss": 0.0027, "reward": 2.187000036239624, "reward_std": 0.006692499853670597, "rewards/accuracy_reward": 0.9869999885559082, "rewards/format_reward": 1.0, "step": 2769 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.5, "epoch": 0.027965673902069662, "grad_norm": 2.666401275383691, "kl": 0.0712890625, "learning_rate": 9.980715385512985e-07, "loss": 0.0028, "reward": 2.151656150817871, "reward_std": 0.008817662484943867, "rewards/accuracy_reward": 0.9516562819480896, "rewards/format_reward": 1.0, "step": 2770 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.6875, "epoch": 0.027975769813225644, "grad_norm": 4.581297573419974, "kl": 0.083984375, "learning_rate": 9.98070146805584e-07, "loss": 0.0034, "reward": 2.0328750610351562, "reward_std": 0.020174000412225723, "rewards/accuracy_reward": 0.8328750133514404, "rewards/format_reward": 1.0, "step": 2771 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 428.1875, "epoch": 0.027985865724381626, "grad_norm": 2.56343706962078, "kl": 0.06640625, "learning_rate": 9.98068754558819e-07, "loss": 0.0027, "reward": 1.81040620803833, "reward_std": 0.31607770919799805, "rewards/accuracy_reward": 0.6666562557220459, "rewards/format_reward": 1.0, "step": 2772 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.15625, "epoch": 0.02799596163553761, "grad_norm": 1.6827304537173762, "kl": 0.0712890625, "learning_rate": 9.980673618110051e-07, "loss": 0.0028, "reward": 1.8360000848770142, "reward_std": 0.011294619180262089, "rewards/accuracy_reward": 0.6859999895095825, "rewards/format_reward": 1.0, "step": 2773 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 424.84375, "epoch": 0.02800605754669359, "grad_norm": 1.7471130722946662, "kl": 0.05810546875, "learning_rate": 9.980659685621438e-07, "loss": 0.0023, "reward": 1.6554687023162842, "reward_std": 0.16010792553424835, "rewards/accuracy_reward": 0.5367187261581421, "rewards/format_reward": 1.0, "step": 2774 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.125, "epoch": 0.02801615345784957, "grad_norm": 2.668216849058807, "kl": 0.07177734375, "learning_rate": 9.980645748122365e-07, "loss": 0.0029, "reward": 2.0996251106262207, "reward_std": 0.012008780613541603, "rewards/accuracy_reward": 0.8996250033378601, "rewards/format_reward": 1.0, "step": 2775 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.0625, "epoch": 0.02802624936900555, "grad_norm": 1.6959327461452884, "kl": 0.076171875, "learning_rate": 9.980631805612846e-07, "loss": 0.0031, "reward": 2.182000160217285, "reward_std": 0.005276885814964771, "rewards/accuracy_reward": 0.9819999933242798, "rewards/format_reward": 1.0, "step": 2776 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 389.21875, "epoch": 0.028036345280161534, "grad_norm": 2.6407828088932934, "kl": 0.0771484375, "learning_rate": 9.980617858092892e-07, "loss": 0.0031, "reward": 1.8242499828338623, "reward_std": 0.0070762624964118, "rewards/accuracy_reward": 0.6742500066757202, "rewards/format_reward": 1.0, "step": 2777 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 416.40625, "epoch": 0.028046441191317516, "grad_norm": 0.09091373707713497, "kl": 0.0478515625, "learning_rate": 9.980603905562522e-07, "loss": 0.0019, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2778 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.875, "epoch": 0.028056537102473498, "grad_norm": 2.934730655473817, "kl": 0.0712890625, "learning_rate": 9.980589948021744e-07, "loss": 0.0028, "reward": 1.8756874799728394, "reward_std": 0.029126394540071487, "rewards/accuracy_reward": 0.7319375276565552, "rewards/format_reward": 1.0, "step": 2779 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.71875, "epoch": 0.02806663301362948, "grad_norm": 2.3675207760312813, "kl": 0.0859375, "learning_rate": 9.980575985470579e-07, "loss": 0.0034, "reward": 2.0244061946868896, "reward_std": 0.019126510247588158, "rewards/accuracy_reward": 0.8244062662124634, "rewards/format_reward": 1.0, "step": 2780 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.5625, "epoch": 0.028076728924785462, "grad_norm": 1.9177498014794903, "kl": 0.0751953125, "learning_rate": 9.980562017909037e-07, "loss": 0.003, "reward": 2.1324377059936523, "reward_std": 0.014098579995334148, "rewards/accuracy_reward": 0.9324374794960022, "rewards/format_reward": 1.0, "step": 2781 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.9375, "epoch": 0.028086824835941444, "grad_norm": 2.768519301748491, "kl": 0.07373046875, "learning_rate": 9.980548045337131e-07, "loss": 0.0029, "reward": 2.1216249465942383, "reward_std": 0.013626769185066223, "rewards/accuracy_reward": 0.921625018119812, "rewards/format_reward": 1.0, "step": 2782 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.5625, "epoch": 0.028096920747097427, "grad_norm": 1.7951930973265704, "kl": 0.06884765625, "learning_rate": 9.980534067754879e-07, "loss": 0.0028, "reward": 2.1299376487731934, "reward_std": 0.005836993455886841, "rewards/accuracy_reward": 0.929937481880188, "rewards/format_reward": 1.0, "step": 2783 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.02810701665825341, "grad_norm": 2.82868011409519, "kl": 0.0771484375, "learning_rate": 9.98052008516229e-07, "loss": 0.0031, "reward": 1.9763438701629639, "reward_std": 0.020175911486148834, "rewards/accuracy_reward": 0.7763437628746033, "rewards/format_reward": 1.0, "step": 2784 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.09375, "epoch": 0.02811711256940939, "grad_norm": 1.616771584393594, "kl": 0.06640625, "learning_rate": 9.980506097559383e-07, "loss": 0.0027, "reward": 1.7455625534057617, "reward_std": 0.021482640877366066, "rewards/accuracy_reward": 0.6018125414848328, "rewards/format_reward": 1.0, "step": 2785 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.375, "epoch": 0.02812720848056537, "grad_norm": 1.752507882871046, "kl": 0.068359375, "learning_rate": 9.980492104946169e-07, "loss": 0.0027, "reward": 1.782562494277954, "reward_std": 0.008636190555989742, "rewards/accuracy_reward": 0.6325624585151672, "rewards/format_reward": 1.0, "step": 2786 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.8125, "epoch": 0.02813730439172135, "grad_norm": 3.0417056181190025, "kl": 0.0751953125, "learning_rate": 9.980478107322664e-07, "loss": 0.003, "reward": 2.1343750953674316, "reward_std": 0.009782080538570881, "rewards/accuracy_reward": 0.934374988079071, "rewards/format_reward": 1.0, "step": 2787 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.4375, "epoch": 0.028147400302877334, "grad_norm": 2.119615176733641, "kl": 0.06201171875, "learning_rate": 9.98046410468888e-07, "loss": 0.0025, "reward": 1.8867499828338623, "reward_std": 0.19816726446151733, "rewards/accuracy_reward": 0.7430000305175781, "rewards/format_reward": 1.0, "step": 2788 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.71875, "epoch": 0.028157496214033316, "grad_norm": 4.934463361806203, "kl": 0.07861328125, "learning_rate": 9.980450097044831e-07, "loss": 0.0031, "reward": 2.1063437461853027, "reward_std": 0.028440235182642937, "rewards/accuracy_reward": 0.9125937819480896, "rewards/format_reward": 1.0, "step": 2789 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.3125, "epoch": 0.028167592125189298, "grad_norm": 1.5725124277033025, "kl": 0.068359375, "learning_rate": 9.980436084390534e-07, "loss": 0.0027, "reward": 2.1732187271118164, "reward_std": 0.02402641251683235, "rewards/accuracy_reward": 0.979468822479248, "rewards/format_reward": 1.0, "step": 2790 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.02817768803634528, "grad_norm": 2.547768898280095, "kl": 0.0810546875, "learning_rate": 9.980422066725999e-07, "loss": 0.0032, "reward": 2.159031391143799, "reward_std": 0.010813332162797451, "rewards/accuracy_reward": 0.9590312242507935, "rewards/format_reward": 1.0, "step": 2791 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.028187783947501262, "grad_norm": 4.559579735436742, "kl": 0.080078125, "learning_rate": 9.980408044051244e-07, "loss": 0.0032, "reward": 2.15443754196167, "reward_std": 0.009896733798086643, "rewards/accuracy_reward": 0.9544374942779541, "rewards/format_reward": 1.0, "step": 2792 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 411.0, "epoch": 0.028197879858657245, "grad_norm": 3.1132417072453658, "kl": 0.060791015625, "learning_rate": 9.980394016366282e-07, "loss": 0.0024, "reward": 1.8734374046325684, "reward_std": 0.006429553031921387, "rewards/accuracy_reward": 0.7234375476837158, "rewards/format_reward": 1.0, "step": 2793 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 415.21875, "epoch": 0.028207975769813227, "grad_norm": 3.095448684721463, "kl": 0.0595703125, "learning_rate": 9.980379983671128e-07, "loss": 0.0024, "reward": 1.5754687786102295, "reward_std": 0.010610232129693031, "rewards/accuracy_reward": 0.4754687547683716, "rewards/format_reward": 1.0, "step": 2794 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.5, "epoch": 0.02821807168096921, "grad_norm": 4.329085490181945, "kl": 0.06591796875, "learning_rate": 9.980365945965793e-07, "loss": 0.0026, "reward": 1.9811874628067017, "reward_std": 0.16802561283111572, "rewards/accuracy_reward": 0.8061875104904175, "rewards/format_reward": 1.0, "step": 2795 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.0625, "epoch": 0.028228167592125188, "grad_norm": 2.4662014654165714, "kl": 0.07470703125, "learning_rate": 9.980351903250292e-07, "loss": 0.003, "reward": 1.8608750104904175, "reward_std": 0.006214376538991928, "rewards/accuracy_reward": 0.7108749747276306, "rewards/format_reward": 1.0, "step": 2796 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.02823826350328117, "grad_norm": 2.5473084275945768, "kl": 0.06494140625, "learning_rate": 9.980337855524642e-07, "loss": 0.0026, "reward": 1.9968751668930054, "reward_std": 0.27423590421676636, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2797 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 405.59375, "epoch": 0.028248359414437152, "grad_norm": 1.1839765594699798, "kl": 0.0576171875, "learning_rate": 9.980323802788853e-07, "loss": 0.0023, "reward": 1.5938124656677246, "reward_std": 0.003913687542080879, "rewards/accuracy_reward": 0.4938125014305115, "rewards/format_reward": 1.0, "step": 2798 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.0625, "epoch": 0.028258455325593134, "grad_norm": 3.081365621431814, "kl": 0.05859375, "learning_rate": 9.980309745042943e-07, "loss": 0.0023, "reward": 1.8912813663482666, "reward_std": 0.00497572124004364, "rewards/accuracy_reward": 0.741281270980835, "rewards/format_reward": 1.0, "step": 2799 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.25, "epoch": 0.028268551236749116, "grad_norm": 2.0554210985413, "kl": 0.06689453125, "learning_rate": 9.980295682286922e-07, "loss": 0.0027, "reward": 1.815500020980835, "reward_std": 0.04625547304749489, "rewards/accuracy_reward": 0.684249997138977, "rewards/format_reward": 1.0, "step": 2800 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.625, "epoch": 0.0282786471479051, "grad_norm": 2.853477989049777, "kl": 0.068359375, "learning_rate": 9.98028161452081e-07, "loss": 0.0027, "reward": 2.0804688930511475, "reward_std": 0.01916251890361309, "rewards/accuracy_reward": 0.8804687857627869, "rewards/format_reward": 1.0, "step": 2801 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.59375, "epoch": 0.02828874305906108, "grad_norm": 1.9101274834456006, "kl": 0.07275390625, "learning_rate": 9.980267541744616e-07, "loss": 0.0029, "reward": 2.1199374198913574, "reward_std": 0.011257938109338284, "rewards/accuracy_reward": 0.9199374914169312, "rewards/format_reward": 1.0, "step": 2802 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.0625, "epoch": 0.028298838970217063, "grad_norm": 1.72293475390895, "kl": 0.068359375, "learning_rate": 9.980253463958355e-07, "loss": 0.0027, "reward": 2.003218650817871, "reward_std": 0.006108684930950403, "rewards/accuracy_reward": 0.8032188415527344, "rewards/format_reward": 1.0, "step": 2803 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.53125, "epoch": 0.028308934881373045, "grad_norm": 4.063744073194105, "kl": 0.0771484375, "learning_rate": 9.980239381162043e-07, "loss": 0.0031, "reward": 2.072312355041504, "reward_std": 0.008381776511669159, "rewards/accuracy_reward": 0.8723124861717224, "rewards/format_reward": 1.0, "step": 2804 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.028319030792529027, "grad_norm": 2.3974113829627033, "kl": 0.072265625, "learning_rate": 9.980225293355694e-07, "loss": 0.0029, "reward": 2.051687717437744, "reward_std": 0.03202720358967781, "rewards/accuracy_reward": 0.8579375147819519, "rewards/format_reward": 1.0, "step": 2805 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.28125, "epoch": 0.02832912670368501, "grad_norm": 5.470800540581013, "kl": 0.0703125, "learning_rate": 9.980211200539318e-07, "loss": 0.0028, "reward": 1.882718801498413, "reward_std": 0.16591531038284302, "rewards/accuracy_reward": 0.7139687538146973, "rewards/format_reward": 1.0, "step": 2806 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.84375, "epoch": 0.028339222614840988, "grad_norm": 2.5423684977169008, "kl": 0.0634765625, "learning_rate": 9.980197102712936e-07, "loss": 0.0026, "reward": 1.89243745803833, "reward_std": 0.09628788381814957, "rewards/accuracy_reward": 0.742437481880188, "rewards/format_reward": 1.0, "step": 2807 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.1875, "epoch": 0.02834931852599697, "grad_norm": 2.109585805271959, "kl": 0.06787109375, "learning_rate": 9.980182999876559e-07, "loss": 0.0027, "reward": 2.1217498779296875, "reward_std": 0.03313405066728592, "rewards/accuracy_reward": 0.934249997138977, "rewards/format_reward": 1.0, "step": 2808 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.4375, "epoch": 0.028359414437152952, "grad_norm": 2.4179037847047433, "kl": 0.07080078125, "learning_rate": 9.980168892030198e-07, "loss": 0.0028, "reward": 1.8717812299728394, "reward_std": 0.012904549948871136, "rewards/accuracy_reward": 0.7217812538146973, "rewards/format_reward": 1.0, "step": 2809 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 443.15625, "epoch": 0.028369510348308934, "grad_norm": 0.920218563476072, "kl": 0.048828125, "learning_rate": 9.980154779173872e-07, "loss": 0.002, "reward": 1.8919062614440918, "reward_std": 0.0016793846152722836, "rewards/accuracy_reward": 0.7419062852859497, "rewards/format_reward": 1.0, "step": 2810 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.25, "epoch": 0.028379606259464916, "grad_norm": 2.144204165900385, "kl": 0.060546875, "learning_rate": 9.980140661307592e-07, "loss": 0.0024, "reward": 2.1410937309265137, "reward_std": 0.047375112771987915, "rewards/accuracy_reward": 0.9598437547683716, "rewards/format_reward": 1.0, "step": 2811 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 388.0, "epoch": 0.0283897021706209, "grad_norm": 2.44352178894331, "kl": 0.078125, "learning_rate": 9.980126538431375e-07, "loss": 0.0031, "reward": 1.8522499799728394, "reward_std": 0.010929964482784271, "rewards/accuracy_reward": 0.7022500038146973, "rewards/format_reward": 1.0, "step": 2812 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.78125, "epoch": 0.02839979808177688, "grad_norm": 2.8462380967820593, "kl": 0.0732421875, "learning_rate": 9.980112410545233e-07, "loss": 0.0029, "reward": 1.7886874675750732, "reward_std": 0.003011136082932353, "rewards/accuracy_reward": 0.6386874914169312, "rewards/format_reward": 1.0, "step": 2813 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.09375, "epoch": 0.028409893992932863, "grad_norm": 2.081268000613865, "kl": 0.072265625, "learning_rate": 9.98009827764918e-07, "loss": 0.0029, "reward": 1.8730623722076416, "reward_std": 0.1497718244791031, "rewards/accuracy_reward": 0.710562527179718, "rewards/format_reward": 1.0, "step": 2814 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.34375, "epoch": 0.028419989904088845, "grad_norm": 7.870734125158966, "kl": 0.07080078125, "learning_rate": 9.980084139743233e-07, "loss": 0.0028, "reward": 1.9456562995910645, "reward_std": 0.17107246816158295, "rewards/accuracy_reward": 0.7769061923027039, "rewards/format_reward": 1.0, "step": 2815 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.5625, "epoch": 0.028430085815244827, "grad_norm": 1.619326933097037, "kl": 0.06005859375, "learning_rate": 9.980069996827404e-07, "loss": 0.0024, "reward": 2.171281337738037, "reward_std": 0.005581995472311974, "rewards/accuracy_reward": 0.9712811708450317, "rewards/format_reward": 1.0, "step": 2816 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.028440181726400806, "grad_norm": 3.0075287218465054, "kl": 0.0712890625, "learning_rate": 9.980055848901708e-07, "loss": 0.0029, "reward": 2.134812355041504, "reward_std": 0.007323271594941616, "rewards/accuracy_reward": 0.9348124861717224, "rewards/format_reward": 1.0, "step": 2817 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.9375, "epoch": 0.028450277637556788, "grad_norm": 2.751435777743705, "kl": 0.0712890625, "learning_rate": 9.980041695966157e-07, "loss": 0.0028, "reward": 2.034656286239624, "reward_std": 0.008261539041996002, "rewards/accuracy_reward": 0.8346562385559082, "rewards/format_reward": 1.0, "step": 2818 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 395.28125, "epoch": 0.02846037354871277, "grad_norm": 2.7954955594442064, "kl": 0.0673828125, "learning_rate": 9.980027538020768e-07, "loss": 0.0027, "reward": 1.5074374675750732, "reward_std": 0.011924735270440578, "rewards/accuracy_reward": 0.4074375331401825, "rewards/format_reward": 1.0, "step": 2819 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.375, "epoch": 0.028470469459868752, "grad_norm": 3.2973850376329907, "kl": 0.0712890625, "learning_rate": 9.980013375065555e-07, "loss": 0.0029, "reward": 2.0314064025878906, "reward_std": 0.016378069296479225, "rewards/accuracy_reward": 0.8314062356948853, "rewards/format_reward": 1.0, "step": 2820 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.0, "epoch": 0.028480565371024735, "grad_norm": 6.426718740545425, "kl": 0.091796875, "learning_rate": 9.979999207100528e-07, "loss": 0.0037, "reward": 2.1395936012268066, "reward_std": 0.011794757097959518, "rewards/accuracy_reward": 0.9395937323570251, "rewards/format_reward": 1.0, "step": 2821 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.5625, "epoch": 0.028490661282180717, "grad_norm": 5.922614407790392, "kl": 0.0615234375, "learning_rate": 9.979985034125709e-07, "loss": 0.0025, "reward": 1.6079063415527344, "reward_std": 0.095892533659935, "rewards/accuracy_reward": 0.5079062581062317, "rewards/format_reward": 1.0, "step": 2822 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.625, "epoch": 0.0285007571933367, "grad_norm": 2.9747871369068095, "kl": 0.0751953125, "learning_rate": 9.979970856141107e-07, "loss": 0.003, "reward": 2.1741249561309814, "reward_std": 0.0078092170879244804, "rewards/accuracy_reward": 0.9741249680519104, "rewards/format_reward": 1.0, "step": 2823 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.0, "epoch": 0.02851085310449268, "grad_norm": 2.047132086154724, "kl": 0.0771484375, "learning_rate": 9.979956673146737e-07, "loss": 0.0031, "reward": 1.8695937395095825, "reward_std": 0.008124979212880135, "rewards/accuracy_reward": 0.7195937633514404, "rewards/format_reward": 1.0, "step": 2824 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.59375, "epoch": 0.028520949015648663, "grad_norm": 2.996634368448036, "kl": 0.0830078125, "learning_rate": 9.979942485142613e-07, "loss": 0.0033, "reward": 2.1173439025878906, "reward_std": 0.01652662083506584, "rewards/accuracy_reward": 0.91734379529953, "rewards/format_reward": 1.0, "step": 2825 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.028531044926804645, "grad_norm": 2.132169690273214, "kl": 0.0849609375, "learning_rate": 9.97992829212875e-07, "loss": 0.0034, "reward": 2.0817811489105225, "reward_std": 0.020348533987998962, "rewards/accuracy_reward": 0.8880312442779541, "rewards/format_reward": 1.0, "step": 2826 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.09375, "epoch": 0.028541140837960628, "grad_norm": 3.002341890352319, "kl": 0.072265625, "learning_rate": 9.979914094105165e-07, "loss": 0.0029, "reward": 2.1713123321533203, "reward_std": 0.011068829335272312, "rewards/accuracy_reward": 0.9713124632835388, "rewards/format_reward": 1.0, "step": 2827 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.75, "epoch": 0.028551236749116606, "grad_norm": 2.540892403662879, "kl": 0.0771484375, "learning_rate": 9.979899891071865e-07, "loss": 0.0031, "reward": 2.0880000591278076, "reward_std": 0.013502497225999832, "rewards/accuracy_reward": 0.8880000114440918, "rewards/format_reward": 1.0, "step": 2828 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 423.5, "epoch": 0.02856133266027259, "grad_norm": 2.4963616977565444, "kl": 0.061767578125, "learning_rate": 9.97988568302887e-07, "loss": 0.0025, "reward": 1.8842968940734863, "reward_std": 0.025288688018918037, "rewards/accuracy_reward": 0.7405468821525574, "rewards/format_reward": 1.0, "step": 2829 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.75, "epoch": 0.02857142857142857, "grad_norm": 2.509440097742122, "kl": 0.05615234375, "learning_rate": 9.979871469976195e-07, "loss": 0.0023, "reward": 2.149250030517578, "reward_std": 0.003212692216038704, "rewards/accuracy_reward": 0.9492499232292175, "rewards/format_reward": 1.0, "step": 2830 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.84375, "epoch": 0.028581524482584553, "grad_norm": 4.24326593623297, "kl": 0.07080078125, "learning_rate": 9.97985725191385e-07, "loss": 0.0028, "reward": 2.1119375228881836, "reward_std": 0.11478140205144882, "rewards/accuracy_reward": 0.9181874990463257, "rewards/format_reward": 1.0, "step": 2831 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.625, "epoch": 0.028591620393740535, "grad_norm": 1.532039977833056, "kl": 0.06396484375, "learning_rate": 9.979843028841853e-07, "loss": 0.0026, "reward": 1.7550625801086426, "reward_std": 0.14095932245254517, "rewards/accuracy_reward": 0.6175625324249268, "rewards/format_reward": 1.0, "step": 2832 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.40625, "epoch": 0.028601716304896517, "grad_norm": 4.0560077985036225, "kl": 0.0693359375, "learning_rate": 9.979828800760218e-07, "loss": 0.0028, "reward": 2.1264686584472656, "reward_std": 0.01647695153951645, "rewards/accuracy_reward": 0.9264687299728394, "rewards/format_reward": 1.0, "step": 2833 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.46875, "epoch": 0.0286118122160525, "grad_norm": 7.849561453163596, "kl": 0.083984375, "learning_rate": 9.979814567668957e-07, "loss": 0.0034, "reward": 2.0825624465942383, "reward_std": 0.014104998670518398, "rewards/accuracy_reward": 0.882562518119812, "rewards/format_reward": 1.0, "step": 2834 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.9375, "epoch": 0.02862190812720848, "grad_norm": 5.154731052078133, "kl": 0.0703125, "learning_rate": 9.979800329568084e-07, "loss": 0.0028, "reward": 1.9925938844680786, "reward_std": 0.020185347646474838, "rewards/accuracy_reward": 0.7925937175750732, "rewards/format_reward": 1.0, "step": 2835 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.25, "epoch": 0.028632004038364463, "grad_norm": 7.3402955839897706, "kl": 0.0830078125, "learning_rate": 9.979786086457615e-07, "loss": 0.0033, "reward": 2.083031177520752, "reward_std": 0.024217788130044937, "rewards/accuracy_reward": 0.8830312490463257, "rewards/format_reward": 1.0, "step": 2836 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.9375, "epoch": 0.028642099949520446, "grad_norm": 1.8273629692927094, "kl": 0.078125, "learning_rate": 9.979771838337567e-07, "loss": 0.0031, "reward": 2.0334062576293945, "reward_std": 0.011539513245224953, "rewards/accuracy_reward": 0.8334062695503235, "rewards/format_reward": 1.0, "step": 2837 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.4375, "epoch": 0.028652195860676424, "grad_norm": 2.403016256876455, "kl": 0.078125, "learning_rate": 9.979757585207948e-07, "loss": 0.0031, "reward": 2.1004061698913574, "reward_std": 0.011778783053159714, "rewards/accuracy_reward": 0.9004062414169312, "rewards/format_reward": 1.0, "step": 2838 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.28125, "epoch": 0.028662291771832406, "grad_norm": 1.3586859691761504, "kl": 0.060791015625, "learning_rate": 9.979743327068779e-07, "loss": 0.0024, "reward": 1.809093713760376, "reward_std": 0.008112435229122639, "rewards/accuracy_reward": 0.6590937376022339, "rewards/format_reward": 1.0, "step": 2839 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 415.5, "epoch": 0.02867238768298839, "grad_norm": 4.1230332376257595, "kl": 0.06787109375, "learning_rate": 9.979729063920068e-07, "loss": 0.0027, "reward": 1.8326562643051147, "reward_std": 0.02954881638288498, "rewards/accuracy_reward": 0.6889062523841858, "rewards/format_reward": 1.0, "step": 2840 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.15625, "epoch": 0.02868248359414437, "grad_norm": 1.8205229324562653, "kl": 0.07421875, "learning_rate": 9.979714795761836e-07, "loss": 0.003, "reward": 2.163875102996826, "reward_std": 0.004839453846216202, "rewards/accuracy_reward": 0.9638749957084656, "rewards/format_reward": 1.0, "step": 2841 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.028692579505300353, "grad_norm": 4.661073620440992, "kl": 0.08154296875, "learning_rate": 9.979700522594091e-07, "loss": 0.0033, "reward": 2.0240938663482666, "reward_std": 0.16932791471481323, "rewards/accuracy_reward": 0.842843770980835, "rewards/format_reward": 1.0, "step": 2842 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.875, "epoch": 0.028702675416456335, "grad_norm": 2.3658679283056316, "kl": 0.06787109375, "learning_rate": 9.979686244416852e-07, "loss": 0.0027, "reward": 1.9376875162124634, "reward_std": 0.14757120609283447, "rewards/accuracy_reward": 0.7564375400543213, "rewards/format_reward": 1.0, "step": 2843 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 426.28125, "epoch": 0.028712771327612317, "grad_norm": 1.540222603061596, "kl": 0.05908203125, "learning_rate": 9.979671961230131e-07, "loss": 0.0024, "reward": 1.8555312156677246, "reward_std": 0.006264732219278812, "rewards/accuracy_reward": 0.7055312395095825, "rewards/format_reward": 1.0, "step": 2844 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.4375, "epoch": 0.0287228672387683, "grad_norm": 3.9345433495283753, "kl": 0.08544921875, "learning_rate": 9.979657673033945e-07, "loss": 0.0034, "reward": 2.1166248321533203, "reward_std": 0.012466519139707088, "rewards/accuracy_reward": 0.9166250228881836, "rewards/format_reward": 1.0, "step": 2845 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.5, "epoch": 0.02873296314992428, "grad_norm": 1.9415033443626042, "kl": 0.06884765625, "learning_rate": 9.979643379828305e-07, "loss": 0.0027, "reward": 2.181593894958496, "reward_std": 0.006982602644711733, "rewards/accuracy_reward": 0.9815937280654907, "rewards/format_reward": 1.0, "step": 2846 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.028743059061080264, "grad_norm": 1.901455698027265, "kl": 0.0771484375, "learning_rate": 9.979629081613227e-07, "loss": 0.0031, "reward": 2.1818127632141113, "reward_std": 0.007138554006814957, "rewards/accuracy_reward": 0.9818124771118164, "rewards/format_reward": 1.0, "step": 2847 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.46875, "epoch": 0.028753154972236246, "grad_norm": 3.799537465363258, "kl": 0.068359375, "learning_rate": 9.979614778388723e-07, "loss": 0.0027, "reward": 2.1070313453674316, "reward_std": 0.009389156475663185, "rewards/accuracy_reward": 0.9070312976837158, "rewards/format_reward": 1.0, "step": 2848 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.0625, "epoch": 0.028763250883392225, "grad_norm": 2.995757648789038, "kl": 0.0673828125, "learning_rate": 9.979600470154813e-07, "loss": 0.0027, "reward": 1.8432499170303345, "reward_std": 0.01295832172036171, "rewards/accuracy_reward": 0.6932500004768372, "rewards/format_reward": 1.0, "step": 2849 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.0, "epoch": 0.028773346794548207, "grad_norm": 23.276613796123645, "kl": 0.07861328125, "learning_rate": 9.979586156911504e-07, "loss": 0.0031, "reward": 2.0278749465942383, "reward_std": 0.02107147127389908, "rewards/accuracy_reward": 0.8341250419616699, "rewards/format_reward": 1.0, "step": 2850 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.34375, "epoch": 0.02878344270570419, "grad_norm": 1.2790936782604645, "kl": 0.062255859375, "learning_rate": 9.979571838658816e-07, "loss": 0.0025, "reward": 1.8086249828338623, "reward_std": 0.0024567374493926764, "rewards/accuracy_reward": 0.6586250066757202, "rewards/format_reward": 1.0, "step": 2851 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.0, "epoch": 0.02879353861686017, "grad_norm": 1.5432092998382956, "kl": 0.06689453125, "learning_rate": 9.979557515396764e-07, "loss": 0.0027, "reward": 1.8663438558578491, "reward_std": 0.00678937416523695, "rewards/accuracy_reward": 0.7163437604904175, "rewards/format_reward": 1.0, "step": 2852 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.46875, "epoch": 0.028803634528016153, "grad_norm": 2.3355837662145533, "kl": 0.060546875, "learning_rate": 9.979543187125357e-07, "loss": 0.0024, "reward": 2.0146563053131104, "reward_std": 0.1670863926410675, "rewards/accuracy_reward": 0.8396562337875366, "rewards/format_reward": 1.0, "step": 2853 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.21875, "epoch": 0.028813730439172135, "grad_norm": 3.263042690577163, "kl": 0.07861328125, "learning_rate": 9.979528853844614e-07, "loss": 0.0031, "reward": 2.0931248664855957, "reward_std": 0.016204973682761192, "rewards/accuracy_reward": 0.8931249380111694, "rewards/format_reward": 1.0, "step": 2854 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.4375, "epoch": 0.028823826350328118, "grad_norm": 2.625647689616431, "kl": 0.07275390625, "learning_rate": 9.979514515554548e-07, "loss": 0.0029, "reward": 1.8034688234329224, "reward_std": 0.012645936571061611, "rewards/accuracy_reward": 0.6534687876701355, "rewards/format_reward": 1.0, "step": 2855 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.6875, "epoch": 0.0288339222614841, "grad_norm": 7.182471854324334, "kl": 0.06201171875, "learning_rate": 9.97950017225517e-07, "loss": 0.0024, "reward": 1.831770896911621, "reward_std": 0.0033175237476825714, "rewards/accuracy_reward": 0.6817708611488342, "rewards/format_reward": 1.0, "step": 2856 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.96875, "epoch": 0.028844018172640082, "grad_norm": 3.9857968809719813, "kl": 0.07763671875, "learning_rate": 9.979485823946503e-07, "loss": 0.0031, "reward": 2.1378750801086426, "reward_std": 0.019790375605225563, "rewards/accuracy_reward": 0.9378750324249268, "rewards/format_reward": 1.0, "step": 2857 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.028854114083796064, "grad_norm": 3.6794787416608097, "kl": 0.0751953125, "learning_rate": 9.979471470628554e-07, "loss": 0.003, "reward": 2.088656425476074, "reward_std": 0.024348298087716103, "rewards/accuracy_reward": 0.894906222820282, "rewards/format_reward": 1.0, "step": 2858 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.028864209994952046, "grad_norm": 2.9394131476786423, "kl": 0.0771484375, "learning_rate": 9.979457112301339e-07, "loss": 0.0031, "reward": 2.0981249809265137, "reward_std": 0.004765167832374573, "rewards/accuracy_reward": 0.8981249332427979, "rewards/format_reward": 1.0, "step": 2859 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.875, "epoch": 0.028874305906108025, "grad_norm": 2.1277314739388795, "kl": 0.07861328125, "learning_rate": 9.979442748964875e-07, "loss": 0.0031, "reward": 2.1495938301086426, "reward_std": 0.02047867700457573, "rewards/accuracy_reward": 0.9558438062667847, "rewards/format_reward": 1.0, "step": 2860 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.65625, "epoch": 0.028884401817264007, "grad_norm": 3.52199448383404, "kl": 0.07861328125, "learning_rate": 9.979428380619174e-07, "loss": 0.0031, "reward": 2.0303125381469727, "reward_std": 0.014857962727546692, "rewards/accuracy_reward": 0.8303124904632568, "rewards/format_reward": 1.0, "step": 2861 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.4375, "epoch": 0.02889449772841999, "grad_norm": 3.106737620050333, "kl": 0.080078125, "learning_rate": 9.97941400726425e-07, "loss": 0.0032, "reward": 2.155125141143799, "reward_std": 0.028782447800040245, "rewards/accuracy_reward": 0.9613749980926514, "rewards/format_reward": 1.0, "step": 2862 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 416.21875, "epoch": 0.02890459363957597, "grad_norm": 1.8293338665203547, "kl": 0.0732421875, "learning_rate": 9.97939962890012e-07, "loss": 0.0029, "reward": 1.8435938358306885, "reward_std": 0.023468222469091415, "rewards/accuracy_reward": 0.69984370470047, "rewards/format_reward": 1.0, "step": 2863 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.90625, "epoch": 0.028914689550731953, "grad_norm": 2.420994675282304, "kl": 0.0693359375, "learning_rate": 9.979385245526795e-07, "loss": 0.0028, "reward": 1.7980625629425049, "reward_std": 0.1509498804807663, "rewards/accuracy_reward": 0.6355624794960022, "rewards/format_reward": 1.0, "step": 2864 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.5, "epoch": 0.028924785461887936, "grad_norm": 2.060054913521598, "kl": 0.064453125, "learning_rate": 9.979370857144293e-07, "loss": 0.0026, "reward": 2.0714375972747803, "reward_std": 0.008797630667686462, "rewards/accuracy_reward": 0.8714374303817749, "rewards/format_reward": 1.0, "step": 2865 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 406.40625, "epoch": 0.028934881373043918, "grad_norm": 8.86818276510925, "kl": 0.0712890625, "learning_rate": 9.979356463752626e-07, "loss": 0.0029, "reward": 1.8478436470031738, "reward_std": 0.0076448447071015835, "rewards/accuracy_reward": 0.6978436708450317, "rewards/format_reward": 1.0, "step": 2866 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.0289449772841999, "grad_norm": 1.6047490155042015, "kl": 0.064453125, "learning_rate": 9.97934206535181e-07, "loss": 0.0026, "reward": 2.169562339782715, "reward_std": 0.012051100842654705, "rewards/accuracy_reward": 0.9695625305175781, "rewards/format_reward": 1.0, "step": 2867 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.375, "epoch": 0.028955073195355882, "grad_norm": 1.8004818137981178, "kl": 0.0712890625, "learning_rate": 9.97932766194186e-07, "loss": 0.0029, "reward": 2.1511874198913574, "reward_std": 0.009337393566966057, "rewards/accuracy_reward": 0.9511874914169312, "rewards/format_reward": 1.0, "step": 2868 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.0, "epoch": 0.028965169106511864, "grad_norm": 4.4769085190227855, "kl": 0.08203125, "learning_rate": 9.979313253522787e-07, "loss": 0.0033, "reward": 2.0199062824249268, "reward_std": 0.011184696108102798, "rewards/accuracy_reward": 0.8199062347412109, "rewards/format_reward": 1.0, "step": 2869 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.4375, "epoch": 0.028975265017667843, "grad_norm": 2.8627245175855016, "kl": 0.07861328125, "learning_rate": 9.979298840094607e-07, "loss": 0.0031, "reward": 1.9733750820159912, "reward_std": 0.012904210016131401, "rewards/accuracy_reward": 0.7733749151229858, "rewards/format_reward": 1.0, "step": 2870 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.3125, "epoch": 0.028985360928823825, "grad_norm": 2.1920202486924443, "kl": 0.06396484375, "learning_rate": 9.97928442165734e-07, "loss": 0.0026, "reward": 2.1766092777252197, "reward_std": 0.02211136557161808, "rewards/accuracy_reward": 0.9828593730926514, "rewards/format_reward": 1.0, "step": 2871 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 400.0, "epoch": 0.028995456839979807, "grad_norm": 1.6646336998183668, "kl": 0.06201171875, "learning_rate": 9.979269998210991e-07, "loss": 0.0025, "reward": 1.8235313892364502, "reward_std": 0.006993941031396389, "rewards/accuracy_reward": 0.6735312342643738, "rewards/format_reward": 1.0, "step": 2872 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.25, "epoch": 0.02900555275113579, "grad_norm": 3.2444087890284288, "kl": 0.0693359375, "learning_rate": 9.979255569755583e-07, "loss": 0.0028, "reward": 1.895937442779541, "reward_std": 0.17528940737247467, "rewards/accuracy_reward": 0.7209374904632568, "rewards/format_reward": 1.0, "step": 2873 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.625, "epoch": 0.02901564866229177, "grad_norm": 3.1112639296748563, "kl": 0.06591796875, "learning_rate": 9.979241136291125e-07, "loss": 0.0026, "reward": 2.1528124809265137, "reward_std": 0.03266218304634094, "rewards/accuracy_reward": 0.9653124809265137, "rewards/format_reward": 1.0, "step": 2874 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.029025744573447754, "grad_norm": 2.764432942342741, "kl": 0.08251953125, "learning_rate": 9.979226697817633e-07, "loss": 0.0033, "reward": 2.0309689044952393, "reward_std": 0.16848129034042358, "rewards/accuracy_reward": 0.855968713760376, "rewards/format_reward": 1.0, "step": 2875 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 404.4375, "epoch": 0.029035840484603736, "grad_norm": 1.3245050854790268, "kl": 0.0654296875, "learning_rate": 9.979212254335123e-07, "loss": 0.0026, "reward": 1.8561875820159912, "reward_std": 0.005243611056357622, "rewards/accuracy_reward": 0.7061874866485596, "rewards/format_reward": 1.0, "step": 2876 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.9375, "epoch": 0.029045936395759718, "grad_norm": 2.562473924995991, "kl": 0.07275390625, "learning_rate": 9.979197805843608e-07, "loss": 0.0029, "reward": 2.0943126678466797, "reward_std": 0.06442128121852875, "rewards/accuracy_reward": 0.8943125605583191, "rewards/format_reward": 1.0, "step": 2877 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.375, "epoch": 0.0290560323069157, "grad_norm": 2.25340939545287, "kl": 0.07666015625, "learning_rate": 9.979183352343104e-07, "loss": 0.0031, "reward": 2.0688438415527344, "reward_std": 0.026105955243110657, "rewards/accuracy_reward": 0.8750937581062317, "rewards/format_reward": 1.0, "step": 2878 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.125, "epoch": 0.029066128218071682, "grad_norm": 2.1346226398185015, "kl": 0.057861328125, "learning_rate": 9.979168893833622e-07, "loss": 0.0023, "reward": 2.0830001831054688, "reward_std": 0.026965666562318802, "rewards/accuracy_reward": 0.8892500400543213, "rewards/format_reward": 1.0, "step": 2879 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.3125, "epoch": 0.029076224129227664, "grad_norm": 2.9645530200733865, "kl": 0.06689453125, "learning_rate": 9.979154430315181e-07, "loss": 0.0027, "reward": 2.0536656379699707, "reward_std": 0.036689162254333496, "rewards/accuracy_reward": 0.8661655783653259, "rewards/format_reward": 1.0, "step": 2880 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.75, "epoch": 0.029086320040383643, "grad_norm": 1.6759815167876764, "kl": 0.06787109375, "learning_rate": 9.979139961787794e-07, "loss": 0.0027, "reward": 2.1559064388275146, "reward_std": 0.006456772796809673, "rewards/accuracy_reward": 0.9559062719345093, "rewards/format_reward": 1.0, "step": 2881 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 405.125, "epoch": 0.029096415951539625, "grad_norm": 3.0699722952828266, "kl": 0.059814453125, "learning_rate": 9.979125488251475e-07, "loss": 0.0024, "reward": 1.8157501220703125, "reward_std": 0.111826092004776, "rewards/accuracy_reward": 0.671999990940094, "rewards/format_reward": 1.0, "step": 2882 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 402.1875, "epoch": 0.029106511862695608, "grad_norm": 2.166774109083921, "kl": 0.07275390625, "learning_rate": 9.979111009706238e-07, "loss": 0.0029, "reward": 1.8000937700271606, "reward_std": 0.02989295870065689, "rewards/accuracy_reward": 0.6563437581062317, "rewards/format_reward": 1.0, "step": 2883 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.84375, "epoch": 0.02911660777385159, "grad_norm": 4.363503055488595, "kl": 0.0771484375, "learning_rate": 9.979096526152099e-07, "loss": 0.0031, "reward": 2.098468780517578, "reward_std": 0.031684476882219315, "rewards/accuracy_reward": 0.9047187566757202, "rewards/format_reward": 1.0, "step": 2884 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.75, "epoch": 0.029126703685007572, "grad_norm": 2.222803651237222, "kl": 0.0751953125, "learning_rate": 9.97908203758907e-07, "loss": 0.003, "reward": 2.1021251678466797, "reward_std": 0.012338602915406227, "rewards/accuracy_reward": 0.9021250009536743, "rewards/format_reward": 1.0, "step": 2885 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 414.90625, "epoch": 0.029136799596163554, "grad_norm": 2.461407973540318, "kl": 0.07275390625, "learning_rate": 9.97906754401717e-07, "loss": 0.0029, "reward": 1.8604687452316284, "reward_std": 0.005287719424813986, "rewards/accuracy_reward": 0.7104687690734863, "rewards/format_reward": 1.0, "step": 2886 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 408.40625, "epoch": 0.029146895507319536, "grad_norm": 1.5472110503073462, "kl": 0.06787109375, "learning_rate": 9.97905304543641e-07, "loss": 0.0027, "reward": 1.855218768119812, "reward_std": 0.006114935036748648, "rewards/accuracy_reward": 0.7052187323570251, "rewards/format_reward": 1.0, "step": 2887 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.75, "epoch": 0.02915699141847552, "grad_norm": 2.933845083275693, "kl": 0.06787109375, "learning_rate": 9.979038541846805e-07, "loss": 0.0027, "reward": 2.104781150817871, "reward_std": 0.01159657258540392, "rewards/accuracy_reward": 0.9047812819480896, "rewards/format_reward": 1.0, "step": 2888 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.75, "epoch": 0.0291670873296315, "grad_norm": 3.177977077712963, "kl": 0.07666015625, "learning_rate": 9.97902403324837e-07, "loss": 0.0031, "reward": 2.123781204223633, "reward_std": 0.0132709676399827, "rewards/accuracy_reward": 0.9237812757492065, "rewards/format_reward": 1.0, "step": 2889 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.1875, "epoch": 0.029177183240787483, "grad_norm": 2.553640771598235, "kl": 0.076171875, "learning_rate": 9.979009519641121e-07, "loss": 0.003, "reward": 2.0315937995910645, "reward_std": 0.02083861455321312, "rewards/accuracy_reward": 0.8315937519073486, "rewards/format_reward": 1.0, "step": 2890 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 393.28125, "epoch": 0.02918727915194346, "grad_norm": 2.6547225913776953, "kl": 0.05908203125, "learning_rate": 9.978995001025072e-07, "loss": 0.0024, "reward": 1.8819999694824219, "reward_std": 0.007483305875211954, "rewards/accuracy_reward": 0.7319999933242798, "rewards/format_reward": 1.0, "step": 2891 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.3125, "epoch": 0.029197375063099443, "grad_norm": 3.190148984655211, "kl": 0.05078125, "learning_rate": 9.978980477400237e-07, "loss": 0.002, "reward": 1.640125036239624, "reward_std": 0.14484168589115143, "rewards/accuracy_reward": 0.5276249647140503, "rewards/format_reward": 1.0, "step": 2892 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.96875, "epoch": 0.029207470974255426, "grad_norm": 1.4674101289761432, "kl": 0.0556640625, "learning_rate": 9.97896594876663e-07, "loss": 0.0022, "reward": 2.0708749294281006, "reward_std": 0.15827959775924683, "rewards/accuracy_reward": 0.8896250128746033, "rewards/format_reward": 1.0, "step": 2893 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.84375, "epoch": 0.029217566885411408, "grad_norm": 1.5458360908008706, "kl": 0.05810546875, "learning_rate": 9.978951415124264e-07, "loss": 0.0023, "reward": 2.1772501468658447, "reward_std": 0.020975163206458092, "rewards/accuracy_reward": 0.9835000038146973, "rewards/format_reward": 1.0, "step": 2894 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.1875, "epoch": 0.02922766279656739, "grad_norm": 2.7534394486086757, "kl": 0.07421875, "learning_rate": 9.978936876473158e-07, "loss": 0.003, "reward": 2.179093837738037, "reward_std": 0.009757397696375847, "rewards/accuracy_reward": 0.9790937304496765, "rewards/format_reward": 1.0, "step": 2895 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.029237758707723372, "grad_norm": 2.4197659378166367, "kl": 0.099609375, "learning_rate": 9.978922332813324e-07, "loss": 0.004, "reward": 2.093125104904175, "reward_std": 0.014550823718309402, "rewards/accuracy_reward": 0.8931249380111694, "rewards/format_reward": 1.0, "step": 2896 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.21875, "epoch": 0.029247854618879354, "grad_norm": 2.5103398105047896, "kl": 0.06640625, "learning_rate": 9.978907784144776e-07, "loss": 0.0027, "reward": 2.0307188034057617, "reward_std": 0.17234471440315247, "rewards/accuracy_reward": 0.855718731880188, "rewards/format_reward": 1.0, "step": 2897 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.46875, "epoch": 0.029257950530035336, "grad_norm": 1.796438838920744, "kl": 0.08203125, "learning_rate": 9.978893230467534e-07, "loss": 0.0033, "reward": 2.148312568664551, "reward_std": 0.003936573397368193, "rewards/accuracy_reward": 0.948312520980835, "rewards/format_reward": 1.0, "step": 2898 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.09375, "epoch": 0.02926804644119132, "grad_norm": 1.531692991297515, "kl": 0.056396484375, "learning_rate": 9.978878671781604e-07, "loss": 0.0023, "reward": 1.8013125658035278, "reward_std": 0.09165547043085098, "rewards/accuracy_reward": 0.651312530040741, "rewards/format_reward": 1.0, "step": 2899 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 411.0, "epoch": 0.0292781423523473, "grad_norm": 2.1315164770481165, "kl": 0.076171875, "learning_rate": 9.978864108087005e-07, "loss": 0.0031, "reward": 1.8305001258850098, "reward_std": 0.008191049098968506, "rewards/accuracy_reward": 0.6805000305175781, "rewards/format_reward": 1.0, "step": 2900 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.78125, "epoch": 0.029288238263503283, "grad_norm": 4.912034429046427, "kl": 0.0751953125, "learning_rate": 9.978849539383755e-07, "loss": 0.003, "reward": 2.115999937057495, "reward_std": 0.008006655611097813, "rewards/accuracy_reward": 0.9160000085830688, "rewards/format_reward": 1.0, "step": 2901 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.625, "epoch": 0.02929833417465926, "grad_norm": 2.167841911159625, "kl": 0.080078125, "learning_rate": 9.978834965671862e-07, "loss": 0.0032, "reward": 2.1355624198913574, "reward_std": 0.011934183537960052, "rewards/accuracy_reward": 0.9355624914169312, "rewards/format_reward": 1.0, "step": 2902 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.5625, "epoch": 0.029308430085815244, "grad_norm": 2.2161788593353187, "kl": 0.07421875, "learning_rate": 9.978820386951346e-07, "loss": 0.003, "reward": 2.129584312438965, "reward_std": 0.022112973034381866, "rewards/accuracy_reward": 0.9358344078063965, "rewards/format_reward": 1.0, "step": 2903 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.78125, "epoch": 0.029318525996971226, "grad_norm": 7.471124480308667, "kl": 0.0634765625, "learning_rate": 9.97880580322222e-07, "loss": 0.0025, "reward": 2.198312759399414, "reward_std": 0.0018261463847011328, "rewards/accuracy_reward": 0.9983124732971191, "rewards/format_reward": 1.0, "step": 2904 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.6875, "epoch": 0.029328621908127208, "grad_norm": 1.3551093326183257, "kl": 0.05615234375, "learning_rate": 9.978791214484497e-07, "loss": 0.0022, "reward": 1.9096250534057617, "reward_std": 0.09209886193275452, "rewards/accuracy_reward": 0.7596250176429749, "rewards/format_reward": 1.0, "step": 2905 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.34375, "epoch": 0.02933871781928319, "grad_norm": 3.463788369358868, "kl": 0.0673828125, "learning_rate": 9.978776620738194e-07, "loss": 0.0027, "reward": 1.873687505722046, "reward_std": 0.1253819763660431, "rewards/accuracy_reward": 0.7299374938011169, "rewards/format_reward": 1.0, "step": 2906 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.1875, "epoch": 0.029348813730439172, "grad_norm": 1.7737148604887856, "kl": 0.068359375, "learning_rate": 9.978762021983324e-07, "loss": 0.0027, "reward": 1.8118749856948853, "reward_std": 0.020356960594654083, "rewards/accuracy_reward": 0.6681250333786011, "rewards/format_reward": 1.0, "step": 2907 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.59375, "epoch": 0.029358909641595154, "grad_norm": 4.459875790129103, "kl": 0.07275390625, "learning_rate": 9.978747418219904e-07, "loss": 0.0029, "reward": 2.1227188110351562, "reward_std": 0.018511038273572922, "rewards/accuracy_reward": 0.9227187037467957, "rewards/format_reward": 1.0, "step": 2908 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.09375, "epoch": 0.029369005552751137, "grad_norm": 2.0671713885624285, "kl": 0.08203125, "learning_rate": 9.978732809447946e-07, "loss": 0.0033, "reward": 2.121875286102295, "reward_std": 0.006895151920616627, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2909 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.03125, "epoch": 0.02937910146390712, "grad_norm": 2.8357346635787324, "kl": 0.0634765625, "learning_rate": 9.978718195667463e-07, "loss": 0.0025, "reward": 2.071812629699707, "reward_std": 0.027233021333813667, "rewards/accuracy_reward": 0.8718125224113464, "rewards/format_reward": 1.0, "step": 2910 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.59375, "epoch": 0.0293891973750631, "grad_norm": 1.664935685842084, "kl": 0.0576171875, "learning_rate": 9.978703576878477e-07, "loss": 0.0023, "reward": 1.6920312643051147, "reward_std": 0.2513754367828369, "rewards/accuracy_reward": 0.5670312643051147, "rewards/format_reward": 1.0, "step": 2911 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 405.03125, "epoch": 0.02939929328621908, "grad_norm": 4.240787896732806, "kl": 0.07373046875, "learning_rate": 9.978688953080995e-07, "loss": 0.003, "reward": 1.835531234741211, "reward_std": 0.013405855745077133, "rewards/accuracy_reward": 0.6855312585830688, "rewards/format_reward": 1.0, "step": 2912 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.40625, "epoch": 0.029409389197375062, "grad_norm": 1.5252873668897828, "kl": 0.07470703125, "learning_rate": 9.978674324275037e-07, "loss": 0.003, "reward": 2.143343925476074, "reward_std": 0.004699156619608402, "rewards/accuracy_reward": 0.9433437585830688, "rewards/format_reward": 1.0, "step": 2913 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 399.0625, "epoch": 0.029419485108531044, "grad_norm": 1.4727022993563488, "kl": 0.05029296875, "learning_rate": 9.978659690460614e-07, "loss": 0.002, "reward": 1.465531349182129, "reward_std": 0.15053525567054749, "rewards/accuracy_reward": 0.37803125381469727, "rewards/format_reward": 1.0, "step": 2914 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.53125, "epoch": 0.029429581019687026, "grad_norm": 2.951236716629015, "kl": 0.08251953125, "learning_rate": 9.978645051637743e-07, "loss": 0.0033, "reward": 2.1059062480926514, "reward_std": 0.00816391408443451, "rewards/accuracy_reward": 0.9059062004089355, "rewards/format_reward": 1.0, "step": 2915 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.875, "epoch": 0.02943967693084301, "grad_norm": 3.937009570332875, "kl": 0.0712890625, "learning_rate": 9.97863040780644e-07, "loss": 0.0028, "reward": 2.1603126525878906, "reward_std": 0.01573421061038971, "rewards/accuracy_reward": 0.96031254529953, "rewards/format_reward": 1.0, "step": 2916 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.8125, "epoch": 0.02944977284199899, "grad_norm": 1.8352518855594528, "kl": 0.05322265625, "learning_rate": 9.978615758966715e-07, "loss": 0.0021, "reward": 1.925374984741211, "reward_std": 0.1477118730545044, "rewards/accuracy_reward": 0.7628750205039978, "rewards/format_reward": 1.0, "step": 2917 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.25, "epoch": 0.029459868753154973, "grad_norm": 2.0829666913662175, "kl": 0.08447265625, "learning_rate": 9.978601105118586e-07, "loss": 0.0034, "reward": 2.1057186126708984, "reward_std": 0.02939305081963539, "rewards/accuracy_reward": 0.9119687080383301, "rewards/format_reward": 1.0, "step": 2918 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 381.5625, "epoch": 0.029469964664310955, "grad_norm": 2.457731496771381, "kl": 0.0595703125, "learning_rate": 9.978586446262067e-07, "loss": 0.0024, "reward": 1.54478120803833, "reward_std": 0.008842423558235168, "rewards/accuracy_reward": 0.44478124380111694, "rewards/format_reward": 1.0, "step": 2919 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.15625, "epoch": 0.029480060575466937, "grad_norm": 1.2411129056404069, "kl": 0.0654296875, "learning_rate": 9.978571782397174e-07, "loss": 0.0026, "reward": 2.1941561698913574, "reward_std": 0.0027088942006230354, "rewards/accuracy_reward": 0.9941563010215759, "rewards/format_reward": 1.0, "step": 2920 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.34375, "epoch": 0.02949015648662292, "grad_norm": 2.607632024144283, "kl": 0.0703125, "learning_rate": 9.97855711352392e-07, "loss": 0.0028, "reward": 2.069531202316284, "reward_std": 0.016022849828004837, "rewards/accuracy_reward": 0.8695312738418579, "rewards/format_reward": 1.0, "step": 2921 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.40625, "epoch": 0.0295002523977789, "grad_norm": 2.339358623414213, "kl": 0.07421875, "learning_rate": 9.978542439642322e-07, "loss": 0.003, "reward": 2.0945000648498535, "reward_std": 0.016463693231344223, "rewards/accuracy_reward": 0.8945000171661377, "rewards/format_reward": 1.0, "step": 2922 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.84375, "epoch": 0.02951034830893488, "grad_norm": 3.776956820368833, "kl": 0.06396484375, "learning_rate": 9.978527760752392e-07, "loss": 0.0026, "reward": 1.9313750267028809, "reward_std": 0.15284529328346252, "rewards/accuracy_reward": 0.768875002861023, "rewards/format_reward": 1.0, "step": 2923 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 378.3125, "epoch": 0.029520444220090862, "grad_norm": 1.3835856576633814, "kl": 0.050048828125, "learning_rate": 9.978513076854145e-07, "loss": 0.002, "reward": 1.5500000715255737, "reward_std": 0.12921153008937836, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 2924 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 392.46875, "epoch": 0.029530540131246844, "grad_norm": 1.8298067021891755, "kl": 0.05615234375, "learning_rate": 9.978498387947597e-07, "loss": 0.0022, "reward": 1.3937499523162842, "reward_std": 0.2504068613052368, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 2925 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 386.21875, "epoch": 0.029540636042402826, "grad_norm": 3.4180392323214126, "kl": 0.06494140625, "learning_rate": 9.978483694032766e-07, "loss": 0.0026, "reward": 1.8669687509536743, "reward_std": 0.00658503919839859, "rewards/accuracy_reward": 0.7169687151908875, "rewards/format_reward": 1.0, "step": 2926 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.25, "epoch": 0.02955073195355881, "grad_norm": 2.442021445912558, "kl": 0.0712890625, "learning_rate": 9.97846899510966e-07, "loss": 0.0029, "reward": 2.159343719482422, "reward_std": 0.027934450656175613, "rewards/accuracy_reward": 0.9655938148498535, "rewards/format_reward": 1.0, "step": 2927 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.90625, "epoch": 0.02956082786471479, "grad_norm": 4.56694557663215, "kl": 0.07666015625, "learning_rate": 9.978454291178298e-07, "loss": 0.003, "reward": 2.0711874961853027, "reward_std": 0.025464531034231186, "rewards/accuracy_reward": 0.8711875081062317, "rewards/format_reward": 1.0, "step": 2928 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.4375, "epoch": 0.029570923775870773, "grad_norm": 2.3514965297070036, "kl": 0.08251953125, "learning_rate": 9.978439582238695e-07, "loss": 0.0033, "reward": 2.110093593597412, "reward_std": 0.027694731950759888, "rewards/accuracy_reward": 0.9163437485694885, "rewards/format_reward": 1.0, "step": 2929 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.90625, "epoch": 0.029581019687026755, "grad_norm": 3.7269905464591635, "kl": 0.064453125, "learning_rate": 9.978424868290864e-07, "loss": 0.0026, "reward": 2.005312442779541, "reward_std": 0.18361788988113403, "rewards/accuracy_reward": 0.8365625143051147, "rewards/format_reward": 1.0, "step": 2930 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.4375, "epoch": 0.029591115598182737, "grad_norm": 2.000104263222601, "kl": 0.076171875, "learning_rate": 9.978410149334819e-07, "loss": 0.0031, "reward": 1.7551875114440918, "reward_std": 0.010073678568005562, "rewards/accuracy_reward": 0.6051874756813049, "rewards/format_reward": 1.0, "step": 2931 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.6875, "epoch": 0.02960121150933872, "grad_norm": 1.5151290918832245, "kl": 0.07421875, "learning_rate": 9.978395425370577e-07, "loss": 0.003, "reward": 1.8723124265670776, "reward_std": 0.0040388829074800014, "rewards/accuracy_reward": 0.7223125100135803, "rewards/format_reward": 1.0, "step": 2932 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 392.9375, "epoch": 0.029611307420494698, "grad_norm": 1.9461199170366497, "kl": 0.06201171875, "learning_rate": 9.978380696398154e-07, "loss": 0.0025, "reward": 1.8814687728881836, "reward_std": 0.008156394585967064, "rewards/accuracy_reward": 0.7314687371253967, "rewards/format_reward": 1.0, "step": 2933 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.53125, "epoch": 0.02962140333165068, "grad_norm": 2.7734408189118183, "kl": 0.0869140625, "learning_rate": 9.978365962417563e-07, "loss": 0.0035, "reward": 2.096656322479248, "reward_std": 0.0118191447108984, "rewards/accuracy_reward": 0.8966562747955322, "rewards/format_reward": 1.0, "step": 2934 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 408.5, "epoch": 0.029631499242806662, "grad_norm": 1.6540101795799742, "kl": 0.068359375, "learning_rate": 9.978351223428818e-07, "loss": 0.0027, "reward": 1.7360312938690186, "reward_std": 0.021031420677900314, "rewards/accuracy_reward": 0.5860312581062317, "rewards/format_reward": 1.0, "step": 2935 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.0, "epoch": 0.029641595153962644, "grad_norm": 2.471115493557179, "kl": 0.052734375, "learning_rate": 9.978336479431934e-07, "loss": 0.0021, "reward": 2.157343864440918, "reward_std": 0.0066533321514725685, "rewards/accuracy_reward": 0.9573436975479126, "rewards/format_reward": 1.0, "step": 2936 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.25, "epoch": 0.029651691065118627, "grad_norm": 2.7503136516801145, "kl": 0.0830078125, "learning_rate": 9.978321730426927e-07, "loss": 0.0033, "reward": 1.8075625896453857, "reward_std": 0.016225391998887062, "rewards/accuracy_reward": 0.6575624942779541, "rewards/format_reward": 1.0, "step": 2937 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.09375, "epoch": 0.02966178697627461, "grad_norm": 2.326669317143653, "kl": 0.0673828125, "learning_rate": 9.978306976413813e-07, "loss": 0.0027, "reward": 1.88881254196167, "reward_std": 0.14543141424655914, "rewards/accuracy_reward": 0.726312518119812, "rewards/format_reward": 1.0, "step": 2938 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.3125, "epoch": 0.02967188288743059, "grad_norm": 1.782063376811099, "kl": 0.0703125, "learning_rate": 9.978292217392603e-07, "loss": 0.0028, "reward": 2.0266876220703125, "reward_std": 0.15744031965732574, "rewards/accuracy_reward": 0.851687490940094, "rewards/format_reward": 1.0, "step": 2939 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.8125, "epoch": 0.029681978798586573, "grad_norm": 5.761326877748872, "kl": 0.08056640625, "learning_rate": 9.978277453363313e-07, "loss": 0.0032, "reward": 2.110187530517578, "reward_std": 0.025168098509311676, "rewards/accuracy_reward": 0.9101875424385071, "rewards/format_reward": 1.0, "step": 2940 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 393.09375, "epoch": 0.029692074709742555, "grad_norm": 1.329934140072755, "kl": 0.0625, "learning_rate": 9.97826268432596e-07, "loss": 0.0025, "reward": 1.776031255722046, "reward_std": 0.002916814759373665, "rewards/accuracy_reward": 0.6260312795639038, "rewards/format_reward": 1.0, "step": 2941 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 422.0, "epoch": 0.029702170620898537, "grad_norm": 1.3140293858855845, "kl": 0.06396484375, "learning_rate": 9.978247910280559e-07, "loss": 0.0026, "reward": 1.6274688243865967, "reward_std": 0.09256621450185776, "rewards/accuracy_reward": 0.527468740940094, "rewards/format_reward": 1.0, "step": 2942 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 417.59375, "epoch": 0.02971226653205452, "grad_norm": 2.0969976581413143, "kl": 0.060791015625, "learning_rate": 9.978233131227124e-07, "loss": 0.0024, "reward": 1.861281394958496, "reward_std": 0.009505515918135643, "rewards/accuracy_reward": 0.7112811803817749, "rewards/format_reward": 1.0, "step": 2943 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.84375, "epoch": 0.029722362443210498, "grad_norm": 2.6862596713767775, "kl": 0.08154296875, "learning_rate": 9.978218347165666e-07, "loss": 0.0033, "reward": 2.0504062175750732, "reward_std": 0.01336317416280508, "rewards/accuracy_reward": 0.850406289100647, "rewards/format_reward": 1.0, "step": 2944 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.53125, "epoch": 0.02973245835436648, "grad_norm": 2.8464891241218426, "kl": 0.08447265625, "learning_rate": 9.978203558096206e-07, "loss": 0.0034, "reward": 2.096156120300293, "reward_std": 0.011002570390701294, "rewards/accuracy_reward": 0.8961561918258667, "rewards/format_reward": 1.0, "step": 2945 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.21875, "epoch": 0.029742554265522463, "grad_norm": 1.8324183908591338, "kl": 0.06640625, "learning_rate": 9.978188764018755e-07, "loss": 0.0026, "reward": 1.9614686965942383, "reward_std": 0.1441819816827774, "rewards/accuracy_reward": 0.7989687919616699, "rewards/format_reward": 1.0, "step": 2946 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.375, "epoch": 0.029752650176678445, "grad_norm": 2.03563430685562, "kl": 0.06689453125, "learning_rate": 9.97817396493333e-07, "loss": 0.0027, "reward": 1.8835935592651367, "reward_std": 0.007384384050965309, "rewards/accuracy_reward": 0.733593761920929, "rewards/format_reward": 1.0, "step": 2947 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.59375, "epoch": 0.029762746087834427, "grad_norm": 5.130791047351235, "kl": 0.050537109375, "learning_rate": 9.978159160839945e-07, "loss": 0.002, "reward": 1.8871874809265137, "reward_std": 0.018549028784036636, "rewards/accuracy_reward": 0.7434374690055847, "rewards/format_reward": 1.0, "step": 2948 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.9375, "epoch": 0.02977284199899041, "grad_norm": 1.4441612052955188, "kl": 0.06298828125, "learning_rate": 9.978144351738613e-07, "loss": 0.0025, "reward": 1.8164687156677246, "reward_std": 0.003431569552049041, "rewards/accuracy_reward": 0.6664687395095825, "rewards/format_reward": 1.0, "step": 2949 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.3125, "epoch": 0.02978293791014639, "grad_norm": 1.651720211096927, "kl": 0.08251953125, "learning_rate": 9.978129537629352e-07, "loss": 0.0033, "reward": 2.102843761444092, "reward_std": 0.006265058182179928, "rewards/accuracy_reward": 0.902843713760376, "rewards/format_reward": 1.0, "step": 2950 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.21875, "epoch": 0.029793033821302373, "grad_norm": 2.834464820400824, "kl": 0.08935546875, "learning_rate": 9.978114718512177e-07, "loss": 0.0036, "reward": 2.007093906402588, "reward_std": 0.012953625991940498, "rewards/accuracy_reward": 0.8070937395095825, "rewards/format_reward": 1.0, "step": 2951 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 405.78125, "epoch": 0.029803129732458356, "grad_norm": 2.3615419973010723, "kl": 0.0712890625, "learning_rate": 9.9780998943871e-07, "loss": 0.0029, "reward": 1.5286250114440918, "reward_std": 0.09548913687467575, "rewards/accuracy_reward": 0.4286249577999115, "rewards/format_reward": 1.0, "step": 2952 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.029813225643614338, "grad_norm": 2.384755556651491, "kl": 0.068359375, "learning_rate": 9.978085065254138e-07, "loss": 0.0027, "reward": 1.9903749227523804, "reward_std": 0.164528951048851, "rewards/accuracy_reward": 0.8216249942779541, "rewards/format_reward": 1.0, "step": 2953 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 384.875, "epoch": 0.029823321554770316, "grad_norm": 15.386247193108984, "kl": 0.0751953125, "learning_rate": 9.978070231113304e-07, "loss": 0.003, "reward": 1.8692500591278076, "reward_std": 0.007690517231822014, "rewards/accuracy_reward": 0.7192500233650208, "rewards/format_reward": 1.0, "step": 2954 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.03125, "epoch": 0.0298334174659263, "grad_norm": 2.8434520522753064, "kl": 0.06494140625, "learning_rate": 9.978055391964615e-07, "loss": 0.0026, "reward": 1.6568437814712524, "reward_std": 0.14631998538970947, "rewards/accuracy_reward": 0.5443437099456787, "rewards/format_reward": 1.0, "step": 2955 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.15625, "epoch": 0.02984351337708228, "grad_norm": 1.8537654258569543, "kl": 0.06396484375, "learning_rate": 9.978040547808086e-07, "loss": 0.0026, "reward": 2.093625068664551, "reward_std": 0.027376871556043625, "rewards/accuracy_reward": 0.8998750448226929, "rewards/format_reward": 1.0, "step": 2956 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.9375, "epoch": 0.029853609288238263, "grad_norm": 2.4982726660785137, "kl": 0.07568359375, "learning_rate": 9.97802569864373e-07, "loss": 0.003, "reward": 2.132093906402588, "reward_std": 0.014696966856718063, "rewards/accuracy_reward": 0.9320937395095825, "rewards/format_reward": 1.0, "step": 2957 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.09375, "epoch": 0.029863705199394245, "grad_norm": 2.9350353239655984, "kl": 0.09423828125, "learning_rate": 9.978010844471566e-07, "loss": 0.0038, "reward": 1.9595625400543213, "reward_std": 0.013816125690937042, "rewards/accuracy_reward": 0.7595624923706055, "rewards/format_reward": 1.0, "step": 2958 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.25, "epoch": 0.029873801110550227, "grad_norm": 1.8841263412173812, "kl": 0.0712890625, "learning_rate": 9.977995985291604e-07, "loss": 0.0028, "reward": 2.179281234741211, "reward_std": 0.009654843248426914, "rewards/accuracy_reward": 0.9792813062667847, "rewards/format_reward": 1.0, "step": 2959 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 373.09375, "epoch": 0.02988389702170621, "grad_norm": 5.603419596256897, "kl": 0.076171875, "learning_rate": 9.977981121103861e-07, "loss": 0.0031, "reward": 2.1396875381469727, "reward_std": 0.025471381843090057, "rewards/accuracy_reward": 0.9459375143051147, "rewards/format_reward": 1.0, "step": 2960 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.65625, "epoch": 0.02989399293286219, "grad_norm": 4.022092497684827, "kl": 0.07958984375, "learning_rate": 9.977966251908352e-07, "loss": 0.0032, "reward": 2.1352813243865967, "reward_std": 0.017574645578861237, "rewards/accuracy_reward": 0.9352812767028809, "rewards/format_reward": 1.0, "step": 2961 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.65625, "epoch": 0.029904088844018174, "grad_norm": 3.2267719759226354, "kl": 0.0791015625, "learning_rate": 9.97795137770509e-07, "loss": 0.0032, "reward": 2.086656093597412, "reward_std": 0.007993210107088089, "rewards/accuracy_reward": 0.8866562843322754, "rewards/format_reward": 1.0, "step": 2962 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 386.3125, "epoch": 0.029914184755174156, "grad_norm": 1.3956551622926876, "kl": 0.058349609375, "learning_rate": 9.977936498494097e-07, "loss": 0.0023, "reward": 1.5016875267028809, "reward_std": 0.004074634984135628, "rewards/accuracy_reward": 0.40168750286102295, "rewards/format_reward": 1.0, "step": 2963 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.5625, "epoch": 0.029924280666330138, "grad_norm": 3.279220232714513, "kl": 0.07763671875, "learning_rate": 9.977921614275378e-07, "loss": 0.0031, "reward": 2.088531255722046, "reward_std": 0.01811472326517105, "rewards/accuracy_reward": 0.8885312676429749, "rewards/format_reward": 1.0, "step": 2964 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 370.78125, "epoch": 0.029934376577486117, "grad_norm": 1.8763882000010537, "kl": 0.06494140625, "learning_rate": 9.977906725048954e-07, "loss": 0.0026, "reward": 1.7484686374664307, "reward_std": 0.26889199018478394, "rewards/accuracy_reward": 0.6297187805175781, "rewards/format_reward": 1.0, "step": 2965 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 387.65625, "epoch": 0.0299444724886421, "grad_norm": 3.833714186848673, "kl": 0.047119140625, "learning_rate": 9.97789183081484e-07, "loss": 0.0019, "reward": 1.8589999675750732, "reward_std": 0.025160981342196465, "rewards/accuracy_reward": 0.7152500152587891, "rewards/format_reward": 1.0, "step": 2966 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.15625, "epoch": 0.02995456839979808, "grad_norm": 2.2942830068591618, "kl": 0.07177734375, "learning_rate": 9.97787693157305e-07, "loss": 0.0029, "reward": 1.9537501335144043, "reward_std": 0.17404016852378845, "rewards/accuracy_reward": 0.7724999785423279, "rewards/format_reward": 1.0, "step": 2967 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.029964664310954063, "grad_norm": 1.4696630168054527, "kl": 0.0634765625, "learning_rate": 9.977862027323599e-07, "loss": 0.0025, "reward": 2.103968858718872, "reward_std": 0.0041845254600048065, "rewards/accuracy_reward": 0.9039688110351562, "rewards/format_reward": 1.0, "step": 2968 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.75, "epoch": 0.029974760222110045, "grad_norm": 2.846726718971459, "kl": 0.07958984375, "learning_rate": 9.9778471180665e-07, "loss": 0.0032, "reward": 1.9862501621246338, "reward_std": 0.01461106538772583, "rewards/accuracy_reward": 0.7862499952316284, "rewards/format_reward": 1.0, "step": 2969 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.4375, "epoch": 0.029984856133266027, "grad_norm": 1.9374718353905969, "kl": 0.076171875, "learning_rate": 9.97783220380177e-07, "loss": 0.003, "reward": 2.0725936889648438, "reward_std": 0.0090194595977664, "rewards/accuracy_reward": 0.8725937604904175, "rewards/format_reward": 1.0, "step": 2970 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 375.5625, "epoch": 0.02999495204442201, "grad_norm": 1.5879466091308174, "kl": 0.06787109375, "learning_rate": 9.977817284529424e-07, "loss": 0.0027, "reward": 1.558187484741211, "reward_std": 0.02186949923634529, "rewards/accuracy_reward": 0.46443748474121094, "rewards/format_reward": 1.0, "step": 2971 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 374.53125, "epoch": 0.03000504795557799, "grad_norm": 1.9264757256462364, "kl": 0.06298828125, "learning_rate": 9.977802360249478e-07, "loss": 0.0025, "reward": 2.131218910217285, "reward_std": 0.029802370816469193, "rewards/accuracy_reward": 0.943718671798706, "rewards/format_reward": 1.0, "step": 2972 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 380.5625, "epoch": 0.030015143866733974, "grad_norm": 2.0510069119798002, "kl": 0.0703125, "learning_rate": 9.977787430961944e-07, "loss": 0.0028, "reward": 1.847812533378601, "reward_std": 0.044723354279994965, "rewards/accuracy_reward": 0.71031254529953, "rewards/format_reward": 1.0, "step": 2973 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 373.46875, "epoch": 0.030025239777889956, "grad_norm": 1.6922288427183363, "kl": 0.07421875, "learning_rate": 9.97777249666684e-07, "loss": 0.003, "reward": 1.817093849182129, "reward_std": 0.02167305164039135, "rewards/accuracy_reward": 0.6733437776565552, "rewards/format_reward": 1.0, "step": 2974 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 387.0, "epoch": 0.030035335689045935, "grad_norm": 3.1056230724559675, "kl": 0.07470703125, "learning_rate": 9.977757557364179e-07, "loss": 0.003, "reward": 1.828812599182129, "reward_std": 0.011358743533492088, "rewards/accuracy_reward": 0.6788125038146973, "rewards/format_reward": 1.0, "step": 2975 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 387.53125, "epoch": 0.030045431600201917, "grad_norm": 1.3314468664268975, "kl": 0.06201171875, "learning_rate": 9.977742613053976e-07, "loss": 0.0025, "reward": 1.8840312957763672, "reward_std": 0.004624266177415848, "rewards/accuracy_reward": 0.7340312004089355, "rewards/format_reward": 1.0, "step": 2976 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 387.78125, "epoch": 0.0300555275113579, "grad_norm": 3.893058464631234, "kl": 0.0703125, "learning_rate": 9.977727663736249e-07, "loss": 0.0028, "reward": 1.7032188177108765, "reward_std": 0.030578572303056717, "rewards/accuracy_reward": 0.5657187700271606, "rewards/format_reward": 1.0, "step": 2977 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.46875, "epoch": 0.03006562342251388, "grad_norm": 2.6175800687653608, "kl": 0.068359375, "learning_rate": 9.97771270941101e-07, "loss": 0.0027, "reward": 1.6761250495910645, "reward_std": 0.25860583782196045, "rewards/accuracy_reward": 0.5511250495910645, "rewards/format_reward": 1.0, "step": 2978 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.71875, "epoch": 0.030075719333669863, "grad_norm": 2.2045500333125485, "kl": 0.0703125, "learning_rate": 9.977697750078276e-07, "loss": 0.0028, "reward": 2.036562442779541, "reward_std": 0.014937922358512878, "rewards/accuracy_reward": 0.8365625143051147, "rewards/format_reward": 1.0, "step": 2979 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.96875, "epoch": 0.030085815244825845, "grad_norm": 1.8208236831619589, "kl": 0.0712890625, "learning_rate": 9.977682785738059e-07, "loss": 0.0029, "reward": 1.821812629699707, "reward_std": 0.02724960818886757, "rewards/accuracy_reward": 0.6780624985694885, "rewards/format_reward": 1.0, "step": 2980 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 385.625, "epoch": 0.030095911155981828, "grad_norm": 2.9119174675715955, "kl": 0.07421875, "learning_rate": 9.977667816390378e-07, "loss": 0.003, "reward": 1.8582186698913574, "reward_std": 0.003961857873946428, "rewards/accuracy_reward": 0.7082187533378601, "rewards/format_reward": 1.0, "step": 2981 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.4375, "epoch": 0.03010600706713781, "grad_norm": 1.3779365744544882, "kl": 0.0693359375, "learning_rate": 9.977652842035245e-07, "loss": 0.0028, "reward": 2.10128116607666, "reward_std": 0.02329721674323082, "rewards/accuracy_reward": 0.9075312614440918, "rewards/format_reward": 1.0, "step": 2982 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.4375, "epoch": 0.030116102978293792, "grad_norm": 2.786964380000689, "kl": 0.06640625, "learning_rate": 9.977637862672676e-07, "loss": 0.0027, "reward": 1.9548437595367432, "reward_std": 0.16123725473880768, "rewards/accuracy_reward": 0.7798436880111694, "rewards/format_reward": 1.0, "step": 2983 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.78125, "epoch": 0.030126198889449774, "grad_norm": 2.0045727308223387, "kl": 0.0673828125, "learning_rate": 9.977622878302688e-07, "loss": 0.0027, "reward": 2.105968952178955, "reward_std": 0.11622175574302673, "rewards/accuracy_reward": 0.9122187495231628, "rewards/format_reward": 1.0, "step": 2984 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.875, "epoch": 0.030136294800605756, "grad_norm": 2.046462307402914, "kl": 0.06640625, "learning_rate": 9.977607888925293e-07, "loss": 0.0027, "reward": 2.093937397003174, "reward_std": 0.010258457623422146, "rewards/accuracy_reward": 0.8939374685287476, "rewards/format_reward": 1.0, "step": 2985 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 431.1875, "epoch": 0.030146390711761735, "grad_norm": 3.3452398515752995, "kl": 0.052001953125, "learning_rate": 9.977592894540506e-07, "loss": 0.0021, "reward": 1.855062484741211, "reward_std": 0.004863768350332975, "rewards/accuracy_reward": 0.7050624489784241, "rewards/format_reward": 1.0, "step": 2986 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.6875, "epoch": 0.030156486622917717, "grad_norm": 2.1347876535641968, "kl": 0.07080078125, "learning_rate": 9.977577895148345e-07, "loss": 0.0028, "reward": 2.080000162124634, "reward_std": 0.012911571189761162, "rewards/accuracy_reward": 0.8799999952316284, "rewards/format_reward": 1.0, "step": 2987 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.34375, "epoch": 0.0301665825340737, "grad_norm": 2.3540856433445065, "kl": 0.068359375, "learning_rate": 9.977562890748823e-07, "loss": 0.0027, "reward": 1.7775624990463257, "reward_std": 0.014079246670007706, "rewards/accuracy_reward": 0.6275625228881836, "rewards/format_reward": 1.0, "step": 2988 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 407.0, "epoch": 0.03017667844522968, "grad_norm": 1.776337926445302, "kl": 0.0693359375, "learning_rate": 9.977547881341957e-07, "loss": 0.0028, "reward": 1.8593125343322754, "reward_std": 0.00737964129075408, "rewards/accuracy_reward": 0.7093124985694885, "rewards/format_reward": 1.0, "step": 2989 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.375, "epoch": 0.030186774356385664, "grad_norm": 2.7393618085953744, "kl": 0.07177734375, "learning_rate": 9.977532866927761e-07, "loss": 0.0029, "reward": 1.9174062013626099, "reward_std": 0.1868608146905899, "rewards/accuracy_reward": 0.7424062490463257, "rewards/format_reward": 1.0, "step": 2990 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.71875, "epoch": 0.030196870267541646, "grad_norm": 3.2000072974041833, "kl": 0.0751953125, "learning_rate": 9.97751784750625e-07, "loss": 0.003, "reward": 2.09346866607666, "reward_std": 0.03061774931848049, "rewards/accuracy_reward": 0.8997187614440918, "rewards/format_reward": 1.0, "step": 2991 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.03125, "epoch": 0.030206966178697628, "grad_norm": 2.5657118644488888, "kl": 0.07470703125, "learning_rate": 9.977502823077437e-07, "loss": 0.003, "reward": 2.1020002365112305, "reward_std": 0.027980590239167213, "rewards/accuracy_reward": 0.9082500338554382, "rewards/format_reward": 1.0, "step": 2992 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 401.125, "epoch": 0.03021706208985361, "grad_norm": 2.1745202409146227, "kl": 0.06689453125, "learning_rate": 9.97748779364134e-07, "loss": 0.0027, "reward": 1.8785624504089355, "reward_std": 0.0077952067367732525, "rewards/accuracy_reward": 0.7285624742507935, "rewards/format_reward": 1.0, "step": 2993 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.125, "epoch": 0.030227158001009592, "grad_norm": 1.7219529356264445, "kl": 0.0673828125, "learning_rate": 9.977472759197974e-07, "loss": 0.0027, "reward": 2.154968738555908, "reward_std": 0.022295048460364342, "rewards/accuracy_reward": 0.9612188339233398, "rewards/format_reward": 1.0, "step": 2994 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.5625, "epoch": 0.030237253912165574, "grad_norm": 2.1390678061193387, "kl": 0.06640625, "learning_rate": 9.977457719747351e-07, "loss": 0.0026, "reward": 2.096374988555908, "reward_std": 0.02685847505927086, "rewards/accuracy_reward": 0.9026249647140503, "rewards/format_reward": 1.0, "step": 2995 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.625, "epoch": 0.030247349823321553, "grad_norm": 5.144394959282201, "kl": 0.06201171875, "learning_rate": 9.977442675289492e-07, "loss": 0.0025, "reward": 1.8235937356948853, "reward_std": 0.007500200532376766, "rewards/accuracy_reward": 0.6735937595367432, "rewards/format_reward": 1.0, "step": 2996 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.030257445734477535, "grad_norm": 2.2588750402159166, "kl": 0.08154296875, "learning_rate": 9.977427625824407e-07, "loss": 0.0033, "reward": 2.143843650817871, "reward_std": 0.01755443774163723, "rewards/accuracy_reward": 0.9438437223434448, "rewards/format_reward": 1.0, "step": 2997 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.8125, "epoch": 0.030267541645633517, "grad_norm": 2.6324092344821217, "kl": 0.076171875, "learning_rate": 9.977412571352113e-07, "loss": 0.003, "reward": 2.112375020980835, "reward_std": 0.01040356233716011, "rewards/accuracy_reward": 0.9123749732971191, "rewards/format_reward": 1.0, "step": 2998 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.0302776375567895, "grad_norm": 4.55903405273369, "kl": 0.060302734375, "learning_rate": 9.977397511872623e-07, "loss": 0.0024, "reward": 2.158156394958496, "reward_std": 0.005091247148811817, "rewards/accuracy_reward": 0.9581562280654907, "rewards/format_reward": 1.0, "step": 2999 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.4375, "epoch": 0.03028773346794548, "grad_norm": 4.79650578525187, "kl": 0.076171875, "learning_rate": 9.977382447385956e-07, "loss": 0.0031, "reward": 2.120375156402588, "reward_std": 0.01339969877153635, "rewards/accuracy_reward": 0.9203749895095825, "rewards/format_reward": 1.0, "step": 3000 } ], "logging_steps": 1.0, "max_steps": 99050, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }