{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 50.84480396591844, "learning_rate": 8.333333333333333e-08, "logits/chosen": -2.5962367057800293, "logits/rejected": -2.5291247367858887, "logps/chosen": -313.06549072265625, "logps/pi_response": -192.275634765625, "logps/ref_response": -192.275634765625, "logps/rejected": -467.38446044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 42.41677392718424, "learning_rate": 4.930057285201027e-07, "logits/chosen": -2.5569324493408203, "logits/rejected": -2.5175046920776367, "logps/chosen": -302.5186767578125, "logps/pi_response": -169.27621459960938, "logps/ref_response": -160.4980010986328, "logps/rejected": -473.1149597167969, "loss": 0.6544, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": -0.23846334218978882, "rewards/margins": 0.1443914771080017, "rewards/rejected": -0.3828548491001129, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 39.98829098029278, "learning_rate": 4.187457503795526e-07, "logits/chosen": -2.4396791458129883, "logits/rejected": -2.3597466945648193, "logps/chosen": -386.8421325683594, "logps/pi_response": -202.8572998046875, "logps/ref_response": -167.9676055908203, "logps/rejected": -638.3150024414062, "loss": 0.5895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0396512746810913, "rewards/margins": 0.9388242959976196, "rewards/rejected": -1.978475570678711, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 33.31734090524923, "learning_rate": 2.8691164100062034e-07, "logits/chosen": -2.3278262615203857, "logits/rejected": -2.257200241088867, "logps/chosen": -412.71881103515625, "logps/pi_response": -218.89254760742188, "logps/ref_response": -161.32485961914062, "logps/rejected": -613.4661865234375, "loss": 0.5116, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1704436540603638, "rewards/margins": 0.7398843169212341, "rewards/rejected": -1.9103281497955322, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 29.897018751189872, "learning_rate": 1.4248369943086995e-07, "logits/chosen": -2.344947338104248, "logits/rejected": -2.2430291175842285, "logps/chosen": -405.5809020996094, "logps/pi_response": -244.9563751220703, "logps/ref_response": -167.4927520751953, "logps/rejected": -667.6639404296875, "loss": 0.4908, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -1.212226390838623, "rewards/margins": 0.8576709032058716, "rewards/rejected": -2.069897413253784, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 33.345127493561584, "learning_rate": 3.473909705816111e-08, "logits/chosen": -2.3016886711120605, "logits/rejected": -2.236023187637329, "logps/chosen": -455.349365234375, "logps/pi_response": -263.25799560546875, "logps/ref_response": -165.34706115722656, "logps/rejected": -681.8700561523438, "loss": 0.4903, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.555412769317627, "rewards/margins": 0.8201801180839539, "rewards/rejected": -2.3755927085876465, "step": 50 }, { "epoch": 0.9874476987447699, "step": 59, "total_flos": 0.0, "train_loss": 0.5340633715613413, "train_runtime": 2647.515, "train_samples_per_second": 5.773, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }