{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06, "eval_steps": 500, "global_step": 30, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 60.206947326660156, "learning_rate": 4.8333333333333334e-05, "logits/chosen": -1.116684913635254, "logits/rejected": -1.2412071228027344, "logps/chosen": -80.99737548828125, "logps/rejected": -208.84976196289062, "loss": 0.8252, "rewards/accuracies": 0.75, "rewards/chosen": 19.664356231689453, "rewards/margins": 11.318536758422852, "rewards/rejected": 8.345821380615234, "step": 1 }, { "epoch": 0.004, "grad_norm": 18.464937210083008, "learning_rate": 4.666666666666667e-05, "logits/chosen": -1.2304766178131104, "logits/rejected": -1.1085100173950195, "logps/chosen": -44.50049591064453, "logps/rejected": -95.98702239990234, "loss": 1.0253, "rewards/accuracies": 0.75, "rewards/chosen": 9.29911994934082, "rewards/margins": 5.345921516418457, "rewards/rejected": 3.9531972408294678, "step": 2 }, { "epoch": 0.006, "grad_norm": 21.883440017700195, "learning_rate": 4.5e-05, "logits/chosen": -1.190551519393921, "logits/rejected": -0.9011133313179016, "logps/chosen": -102.20394897460938, "logps/rejected": -98.943603515625, "loss": 0.3271, "rewards/accuracies": 0.75, "rewards/chosen": 14.595328330993652, "rewards/margins": 10.861374855041504, "rewards/rejected": 3.7339534759521484, "step": 3 }, { "epoch": 0.008, "grad_norm": 6.222319643711671e-05, "learning_rate": 4.3333333333333334e-05, "logits/chosen": -0.4864046573638916, "logits/rejected": -0.9188252091407776, "logps/chosen": -127.2591781616211, "logps/rejected": -402.8990783691406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.617298126220703, "rewards/margins": 22.434566497802734, "rewards/rejected": -3.817267656326294, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.3858901858329773, "learning_rate": 4.166666666666667e-05, "logits/chosen": -1.0422379970550537, "logits/rejected": -0.8618497848510742, "logps/chosen": -73.0203857421875, "logps/rejected": -301.347412109375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 25.020734786987305, "rewards/margins": 27.691850662231445, "rewards/rejected": -2.6711173057556152, "step": 5 }, { "epoch": 0.012, "grad_norm": 0.0033887920435518026, "learning_rate": 4e-05, "logits/chosen": -1.2303388118743896, "logits/rejected": -0.7831270694732666, "logps/chosen": -85.19882202148438, "logps/rejected": -286.6585693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.92217254638672, "rewards/margins": 28.057987213134766, "rewards/rejected": -5.135812282562256, "step": 6 }, { "epoch": 0.014, "grad_norm": 0.01970483362674713, "learning_rate": 3.8333333333333334e-05, "logits/chosen": -1.266550064086914, "logits/rejected": -0.7790937423706055, "logps/chosen": -104.19335174560547, "logps/rejected": -176.42501831054688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 12.132671356201172, "rewards/margins": 16.39078140258789, "rewards/rejected": -4.2581095695495605, "step": 7 }, { "epoch": 0.016, "grad_norm": 207.87570190429688, "learning_rate": 3.6666666666666666e-05, "logits/chosen": -1.1942288875579834, "logits/rejected": -0.5387299060821533, "logps/chosen": -330.1429748535156, "logps/rejected": -239.27011108398438, "loss": 3.0167, "rewards/accuracies": 0.75, "rewards/chosen": 9.409097671508789, "rewards/margins": 14.1237154006958, "rewards/rejected": -4.714616298675537, "step": 8 }, { "epoch": 0.018, "grad_norm": 0.0003472985699772835, "learning_rate": 3.5e-05, "logits/chosen": -1.2635735273361206, "logits/rejected": -0.413591206073761, "logps/chosen": -185.6806640625, "logps/rejected": -166.2919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 24.54703712463379, "rewards/margins": 25.826885223388672, "rewards/rejected": -1.279848337173462, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.05313471704721451, "learning_rate": 3.3333333333333335e-05, "logits/chosen": -1.0146455764770508, "logits/rejected": -0.8916444182395935, "logps/chosen": -209.36700439453125, "logps/rejected": -468.6343994140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 22.948640823364258, "rewards/margins": 28.48465919494629, "rewards/rejected": -5.536019325256348, "step": 10 }, { "epoch": 0.022, "grad_norm": 142.11721801757812, "learning_rate": 3.1666666666666666e-05, "logits/chosen": -0.9070144891738892, "logits/rejected": -0.6460937261581421, "logps/chosen": -159.49267578125, "logps/rejected": -283.4989929199219, "loss": 1.5215, "rewards/accuracies": 0.75, "rewards/chosen": 18.536209106445312, "rewards/margins": 17.37179183959961, "rewards/rejected": 1.1644160747528076, "step": 11 }, { "epoch": 0.024, "grad_norm": 2.082591663565836e-07, "learning_rate": 3e-05, "logits/chosen": -1.2319185733795166, "logits/rejected": -0.69745934009552, "logps/chosen": -118.86239624023438, "logps/rejected": -291.8245544433594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.314111709594727, "rewards/margins": 24.709243774414062, "rewards/rejected": -9.395132064819336, "step": 12 }, { "epoch": 0.026, "grad_norm": 0.2945432960987091, "learning_rate": 2.8333333333333335e-05, "logits/chosen": -1.1761776208877563, "logits/rejected": -0.7650622725486755, "logps/chosen": -149.94891357421875, "logps/rejected": -175.96791076660156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 13.455711364746094, "rewards/margins": 17.48712158203125, "rewards/rejected": -4.031410217285156, "step": 13 }, { "epoch": 0.028, "grad_norm": 1.873824954032898, "learning_rate": 2.6666666666666667e-05, "logits/chosen": -0.8233789801597595, "logits/rejected": -1.152618646621704, "logps/chosen": -146.26565551757812, "logps/rejected": -507.0345458984375, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 20.955230712890625, "rewards/margins": 25.935028076171875, "rewards/rejected": -4.979795932769775, "step": 14 }, { "epoch": 0.03, "grad_norm": 28.382314682006836, "learning_rate": 2.5e-05, "logits/chosen": -1.1895701885223389, "logits/rejected": -0.8884562849998474, "logps/chosen": -93.35548400878906, "logps/rejected": -189.5703125, "loss": 0.1604, "rewards/accuracies": 1.0, "rewards/chosen": 11.567254066467285, "rewards/margins": 15.98021411895752, "rewards/rejected": -4.412960052490234, "step": 15 }, { "epoch": 0.032, "grad_norm": 0.0003264884580858052, "learning_rate": 2.3333333333333336e-05, "logits/chosen": -1.0162668228149414, "logits/rejected": -0.9473219513893127, "logps/chosen": -269.1289367675781, "logps/rejected": -664.7567138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.435200691223145, "rewards/margins": 40.45425796508789, "rewards/rejected": -25.01905632019043, "step": 16 }, { "epoch": 0.034, "grad_norm": 39.93281936645508, "learning_rate": 2.1666666666666667e-05, "logits/chosen": -1.261150598526001, "logits/rejected": -0.5414608120918274, "logps/chosen": -169.63702392578125, "logps/rejected": -218.22845458984375, "loss": 2.1803, "rewards/accuracies": 0.75, "rewards/chosen": 19.81353759765625, "rewards/margins": 21.26726722717285, "rewards/rejected": -1.4537286758422852, "step": 17 }, { "epoch": 0.036, "grad_norm": 0.3639410138130188, "learning_rate": 2e-05, "logits/chosen": -1.1032836437225342, "logits/rejected": -0.42093324661254883, "logps/chosen": -250.34255981445312, "logps/rejected": -339.02777099609375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 29.49982452392578, "rewards/margins": 29.380714416503906, "rewards/rejected": 0.11910903453826904, "step": 18 }, { "epoch": 0.038, "grad_norm": 91.46900939941406, "learning_rate": 1.8333333333333333e-05, "logits/chosen": -1.382272720336914, "logits/rejected": -1.3159658908843994, "logps/chosen": -157.1051788330078, "logps/rejected": -93.73989868164062, "loss": 4.247, "rewards/accuracies": 0.25, "rewards/chosen": 1.4054617881774902, "rewards/margins": -1.071337103843689, "rewards/rejected": 2.4767990112304688, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.07625256478786469, "learning_rate": 1.6666666666666667e-05, "logits/chosen": -1.2797192335128784, "logits/rejected": -0.41436293721199036, "logps/chosen": -233.0648956298828, "logps/rejected": -280.9295959472656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 14.561626434326172, "rewards/margins": 24.783857345581055, "rewards/rejected": -10.222232818603516, "step": 20 }, { "epoch": 0.042, "grad_norm": 0.0006307783187367022, "learning_rate": 1.5e-05, "logits/chosen": -1.200671672821045, "logits/rejected": -0.8377301692962646, "logps/chosen": -181.38812255859375, "logps/rejected": -400.5192565917969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.238813400268555, "rewards/margins": 29.048757553100586, "rewards/rejected": -8.809943199157715, "step": 21 }, { "epoch": 0.044, "grad_norm": 3.4681459510466084e-05, "learning_rate": 1.3333333333333333e-05, "logits/chosen": -1.0809839963912964, "logits/rejected": -1.0748283863067627, "logps/chosen": -210.24159240722656, "logps/rejected": -478.85113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.9466552734375, "rewards/margins": 30.102947235107422, "rewards/rejected": -15.156291007995605, "step": 22 }, { "epoch": 0.046, "grad_norm": 1.9936389435315505e-05, "learning_rate": 1.1666666666666668e-05, "logits/chosen": -1.1689856052398682, "logits/rejected": -0.40747758746147156, "logps/chosen": -278.3692932128906, "logps/rejected": -260.0756530761719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.959720611572266, "rewards/margins": 28.647663116455078, "rewards/rejected": -5.687943458557129, "step": 23 }, { "epoch": 0.048, "grad_norm": 2.3284033886739053e-05, "learning_rate": 1e-05, "logits/chosen": -0.9488776922225952, "logits/rejected": -0.9771822094917297, "logps/chosen": -107.5537109375, "logps/rejected": -420.6499328613281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.997261047363281, "rewards/margins": 30.08607292175293, "rewards/rejected": -16.08881187438965, "step": 24 }, { "epoch": 0.05, "grad_norm": 0.00019201346731279045, "learning_rate": 8.333333333333334e-06, "logits/chosen": -1.0477547645568848, "logits/rejected": -0.8248160481452942, "logps/chosen": -200.435546875, "logps/rejected": -410.39874267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.744895935058594, "rewards/margins": 28.151798248291016, "rewards/rejected": -15.406902313232422, "step": 25 }, { "epoch": 0.052, "grad_norm": 9.264203981729224e-06, "learning_rate": 6.666666666666667e-06, "logits/chosen": -0.795575737953186, "logits/rejected": -0.8969402313232422, "logps/chosen": -187.31686401367188, "logps/rejected": -461.4927673339844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.49878692626953, "rewards/margins": 30.602787017822266, "rewards/rejected": -13.10400104522705, "step": 26 }, { "epoch": 0.054, "grad_norm": 0.0012664368841797113, "learning_rate": 5e-06, "logits/chosen": -1.146821141242981, "logits/rejected": -1.307145118713379, "logps/chosen": -82.47093963623047, "logps/rejected": -296.2177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.118415832519531, "rewards/margins": 22.095069885253906, "rewards/rejected": -11.976654052734375, "step": 27 }, { "epoch": 0.056, "grad_norm": 24.4777889251709, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.1492491960525513, "logits/rejected": -0.9397568702697754, "logps/chosen": -207.41566467285156, "logps/rejected": -408.0826110839844, "loss": 0.219, "rewards/accuracies": 0.75, "rewards/chosen": 14.095235824584961, "rewards/margins": 25.648344039916992, "rewards/rejected": -11.553108215332031, "step": 28 }, { "epoch": 0.058, "grad_norm": 0.1801503300666809, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -1.2135852575302124, "logits/rejected": -0.22595801949501038, "logps/chosen": -190.64984130859375, "logps/rejected": -219.69593811035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 9.875904083251953, "rewards/margins": 18.048542022705078, "rewards/rejected": -8.172636985778809, "step": 29 }, { "epoch": 0.06, "grad_norm": 0.11778837442398071, "learning_rate": 0.0, "logits/chosen": -0.8404010534286499, "logits/rejected": -0.7674828171730042, "logps/chosen": -144.8819580078125, "logps/rejected": -553.426513671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 18.967082977294922, "rewards/margins": 35.870948791503906, "rewards/rejected": -16.903865814208984, "step": 30 } ], "logging_steps": 1, "max_steps": 30, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }