{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.996510067114094, "eval_steps": 400, "global_step": 116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008590604026845637, "grad_norm": 86.01569524610598, "learning_rate": 4.166666666666666e-08, "logits/chosen": -2.431039571762085, "logits/rejected": -2.618009090423584, "logps/chosen": -1197.8489990234375, "logps/rejected": -7907.7099609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.042953020134228186, "grad_norm": 81.41508200934528, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.8407392501831055, "logits/rejected": -3.0651891231536865, "logps/chosen": -1897.8328857421875, "logps/rejected": -9360.8955078125, "loss": 0.692, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.00021657101751770824, "rewards/margins": 0.003072525840252638, "rewards/rejected": -0.003289096988737583, "step": 5 }, { "epoch": 0.08590604026845637, "grad_norm": 77.79677990779399, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6457934379577637, "logits/rejected": -2.990572690963745, "logps/chosen": -1608.490478515625, "logps/rejected": -9246.4970703125, "loss": 0.6499, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.021037336438894272, "rewards/margins": 0.08746902644634247, "rewards/rejected": -0.10850635915994644, "step": 10 }, { "epoch": 0.12885906040268458, "grad_norm": 34.39608246130055, "learning_rate": 4.989741394042727e-07, "logits/chosen": -2.5478570461273193, "logits/rejected": -2.9816832542419434, "logps/chosen": -1612.6597900390625, "logps/rejected": -9213.7060546875, "loss": 0.4677, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.11297205835580826, "rewards/margins": 0.6206797361373901, "rewards/rejected": -0.7336517572402954, "step": 15 }, { "epoch": 0.17181208053691274, "grad_norm": 5.898677172155064, "learning_rate": 4.92735454356513e-07, "logits/chosen": -2.497690200805664, "logits/rejected": -3.0749311447143555, "logps/chosen": -1422.3905029296875, "logps/rejected": -9622.453125, "loss": 0.2772, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.46565741300582886, "rewards/margins": 2.972280979156494, "rewards/rejected": -3.437938690185547, "step": 20 }, { "epoch": 0.21476510067114093, "grad_norm": 4.142364189438871, "learning_rate": 4.809698831278217e-07, "logits/chosen": -2.579451322555542, "logits/rejected": -3.227189540863037, "logps/chosen": -1883.857421875, "logps/rejected": -10082.71875, "loss": 0.255, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9519465565681458, "rewards/margins": 5.0396575927734375, "rewards/rejected": -5.991604328155518, "step": 25 }, { "epoch": 0.25771812080536916, "grad_norm": 7.398342740617321, "learning_rate": 4.639453180753619e-07, "logits/chosen": -2.527676820755005, "logits/rejected": -3.185889959335327, "logps/chosen": -2057.02001953125, "logps/rejected": -9853.166015625, "loss": 0.2471, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.8792552947998047, "rewards/margins": 8.077213287353516, "rewards/rejected": -9.956467628479004, "step": 30 }, { "epoch": 0.3006711409395973, "grad_norm": 12.059892215323622, "learning_rate": 4.420493945100701e-07, "logits/chosen": -2.484814167022705, "logits/rejected": -3.121709108352661, "logps/chosen": -2086.67919921875, "logps/rejected": -9674.890625, "loss": 0.274, "rewards/accuracies": 0.875, "rewards/chosen": -2.0851800441741943, "rewards/margins": 7.848902225494385, "rewards/rejected": -9.934083938598633, "step": 35 }, { "epoch": 0.3436241610738255, "grad_norm": 28.99159189374227, "learning_rate": 4.157806645601988e-07, "logits/chosen": -2.427899122238159, "logits/rejected": -2.911158800125122, "logps/chosen": -1157.116455078125, "logps/rejected": -10012.34765625, "loss": 0.1935, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.1424468755722046, "rewards/margins": 10.81905460357666, "rewards/rejected": -11.961501121520996, "step": 40 }, { "epoch": 0.3865771812080537, "grad_norm": 27.796451654387887, "learning_rate": 3.857372455503697e-07, "logits/chosen": -2.5545668601989746, "logits/rejected": -2.8794655799865723, "logps/chosen": -1950.16796875, "logps/rejected": -10788.267578125, "loss": 0.1834, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.45682954788208, "rewards/margins": 11.35061264038086, "rewards/rejected": -13.807443618774414, "step": 45 }, { "epoch": 0.42953020134228187, "grad_norm": 26.159018172068677, "learning_rate": 3.5260320136318924e-07, "logits/chosen": -2.4899744987487793, "logits/rejected": -2.9161746501922607, "logps/chosen": -1632.9305419921875, "logps/rejected": -10670.7177734375, "loss": 0.1654, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.4171953201293945, "rewards/margins": 11.587867736816406, "rewards/rejected": -14.0050630569458, "step": 50 }, { "epoch": 0.47248322147651006, "grad_norm": 27.198586027054176, "learning_rate": 3.171329668685942e-07, "logits/chosen": -2.460887908935547, "logits/rejected": -2.9514319896698, "logps/chosen": -1985.7174072265625, "logps/rejected": -10099.3125, "loss": 0.1791, "rewards/accuracies": 0.9375, "rewards/chosen": -2.890523672103882, "rewards/margins": 9.628759384155273, "rewards/rejected": -12.519282341003418, "step": 55 }, { "epoch": 0.5154362416107383, "grad_norm": 39.49613447619216, "learning_rate": 2.801341700638307e-07, "logits/chosen": -2.5868403911590576, "logits/rejected": -3.0467333793640137, "logps/chosen": -1847.5205078125, "logps/rejected": -10758.123046875, "loss": 0.1843, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.3636813163757324, "rewards/margins": 10.008157730102539, "rewards/rejected": -12.37183952331543, "step": 60 }, { "epoch": 0.5583892617449664, "grad_norm": 12.8001844827942, "learning_rate": 2.424492430497778e-07, "logits/chosen": -2.506343126296997, "logits/rejected": -2.965503215789795, "logps/chosen": -2238.29443359375, "logps/rejected": -10792.2021484375, "loss": 0.2211, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6938767433166504, "rewards/margins": 9.952713012695312, "rewards/rejected": -12.646589279174805, "step": 65 }, { "epoch": 0.6013422818791946, "grad_norm": 15.245673155295346, "learning_rate": 2.0493624054652355e-07, "logits/chosen": -2.58244252204895, "logits/rejected": -2.951399326324463, "logps/chosen": -2069.97998046875, "logps/rejected": -10962.5087890625, "loss": 0.246, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.3197269439697266, "rewards/margins": 10.182031631469727, "rewards/rejected": -12.50175952911377, "step": 70 }, { "epoch": 0.6442953020134228, "grad_norm": 24.590124308811014, "learning_rate": 1.6844930269478273e-07, "logits/chosen": -2.5273938179016113, "logits/rejected": -2.789759397506714, "logps/chosen": -2302.49169921875, "logps/rejected": -10204.7763671875, "loss": 0.2857, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.474081039428711, "rewards/margins": 8.876008987426758, "rewards/rejected": -11.350090980529785, "step": 75 }, { "epoch": 0.687248322147651, "grad_norm": 5.243275488519254, "learning_rate": 1.3381920698905784e-07, "logits/chosen": -2.599067211151123, "logits/rejected": -2.9476146697998047, "logps/chosen": -2229.91162109375, "logps/rejected": -10514.13671875, "loss": 0.1797, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.218345880508423, "rewards/margins": 8.61392593383789, "rewards/rejected": -10.832271575927734, "step": 80 }, { "epoch": 0.7302013422818792, "grad_norm": 8.334219171923868, "learning_rate": 1.0183445215899584e-07, "logits/chosen": -2.6111998558044434, "logits/rejected": -2.9625191688537598, "logps/chosen": -1786.7320556640625, "logps/rejected": -10765.2060546875, "loss": 0.1725, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8339601755142212, "rewards/margins": 8.938019752502441, "rewards/rejected": -10.771979331970215, "step": 85 }, { "epoch": 0.7731543624161074, "grad_norm": 14.219523417845217, "learning_rate": 7.322330470336313e-08, "logits/chosen": -2.2908596992492676, "logits/rejected": -2.7106270790100098, "logps/chosen": -1873.132568359375, "logps/rejected": -9457.634765625, "loss": 0.1766, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8528430461883545, "rewards/margins": 7.991453647613525, "rewards/rejected": -9.844297409057617, "step": 90 }, { "epoch": 0.8161073825503355, "grad_norm": 15.487241466447763, "learning_rate": 4.863721686226349e-08, "logits/chosen": -2.6290388107299805, "logits/rejected": -2.9791619777679443, "logps/chosen": -1920.321044921875, "logps/rejected": -10810.255859375, "loss": 0.203, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.1639926433563232, "rewards/margins": 9.248581886291504, "rewards/rejected": -11.412572860717773, "step": 95 }, { "epoch": 0.8590604026845637, "grad_norm": 10.617279968480949, "learning_rate": 2.863599358669755e-08, "logits/chosen": -2.513326644897461, "logits/rejected": -2.773226499557495, "logps/chosen": -2062.977294921875, "logps/rejected": -10156.541015625, "loss": 0.158, "rewards/accuracies": 0.90625, "rewards/chosen": -2.4399707317352295, "rewards/margins": 8.490180969238281, "rewards/rejected": -10.930150985717773, "step": 100 }, { "epoch": 0.9020134228187919, "grad_norm": 15.766621732547646, "learning_rate": 1.3675046241339916e-08, "logits/chosen": -2.458155870437622, "logits/rejected": -2.8766350746154785, "logps/chosen": -1824.9993896484375, "logps/rejected": -10561.1455078125, "loss": 0.2026, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.1738946437835693, "rewards/margins": 8.930900573730469, "rewards/rejected": -11.104796409606934, "step": 105 }, { "epoch": 0.9449664429530201, "grad_norm": 5.046777954270063, "learning_rate": 4.0950232632141205e-09, "logits/chosen": -2.5404601097106934, "logits/rejected": -2.9703125953674316, "logps/chosen": -1576.762939453125, "logps/rejected": -11024.712890625, "loss": 0.2015, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8676944971084595, "rewards/margins": 9.766222953796387, "rewards/rejected": -11.633917808532715, "step": 110 }, { "epoch": 0.9879194630872483, "grad_norm": 13.782488804833061, "learning_rate": 1.1405387761664887e-10, "logits/chosen": -2.4785690307617188, "logits/rejected": -2.7061634063720703, "logps/chosen": -2462.81689453125, "logps/rejected": -9758.763671875, "loss": 0.2002, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0159003734588623, "rewards/margins": 7.688973426818848, "rewards/rejected": -10.704873085021973, "step": 115 }, { "epoch": 0.996510067114094, "step": 116, "total_flos": 0.0, "train_loss": 0.2600768296497649, "train_runtime": 7822.1359, "train_samples_per_second": 1.904, "train_steps_per_second": 0.015 } ], "logging_steps": 5, "max_steps": 116, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }