{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.746268656716418, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007462686567164179, "grad_norm": 36.40082931518555, "learning_rate": 5e-06, "loss": 1.6703, "step": 1 }, { "epoch": 0.014925373134328358, "grad_norm": 34.74425506591797, "learning_rate": 1e-05, "loss": 1.6634, "step": 2 }, { "epoch": 0.022388059701492536, "grad_norm": 27.06759262084961, "learning_rate": 9.89795918367347e-06, "loss": 1.5168, "step": 3 }, { "epoch": 0.029850746268656716, "grad_norm": 19.73917579650879, "learning_rate": 9.795918367346939e-06, "loss": 1.5485, "step": 4 }, { "epoch": 0.03731343283582089, "grad_norm": 16.78091049194336, "learning_rate": 9.693877551020408e-06, "loss": 1.3194, "step": 5 }, { "epoch": 0.04477611940298507, "grad_norm": 17.270545959472656, "learning_rate": 9.591836734693878e-06, "loss": 1.6497, "step": 6 }, { "epoch": 0.05223880597014925, "grad_norm": 28.16309356689453, "learning_rate": 9.489795918367348e-06, "loss": 1.3361, "step": 7 }, { "epoch": 0.05970149253731343, "grad_norm": 16.819095611572266, "learning_rate": 9.387755102040818e-06, "loss": 1.3992, "step": 8 }, { "epoch": 0.06716417910447761, "grad_norm": 16.588680267333984, "learning_rate": 9.285714285714288e-06, "loss": 1.295, "step": 9 }, { "epoch": 0.07462686567164178, "grad_norm": 18.916818618774414, "learning_rate": 9.183673469387756e-06, "loss": 1.5424, "step": 10 }, { "epoch": 0.08208955223880597, "grad_norm": 20.738298416137695, "learning_rate": 9.081632653061225e-06, "loss": 1.7722, "step": 11 }, { "epoch": 0.08955223880597014, "grad_norm": 17.011619567871094, "learning_rate": 8.979591836734695e-06, "loss": 1.3152, "step": 12 }, { "epoch": 0.09701492537313433, "grad_norm": 17.751367568969727, "learning_rate": 8.877551020408163e-06, "loss": 1.5504, "step": 13 }, { "epoch": 0.1044776119402985, "grad_norm": 16.80768394470215, "learning_rate": 8.775510204081633e-06, "loss": 1.3405, "step": 14 }, { "epoch": 0.11194029850746269, "grad_norm": 16.026403427124023, "learning_rate": 8.673469387755103e-06, "loss": 1.5062, "step": 15 }, { "epoch": 0.11940298507462686, "grad_norm": 23.68254852294922, "learning_rate": 8.571428571428571e-06, "loss": 1.5282, "step": 16 }, { "epoch": 0.12686567164179105, "grad_norm": 14.981740951538086, "learning_rate": 8.469387755102042e-06, "loss": 0.9734, "step": 17 }, { "epoch": 0.13432835820895522, "grad_norm": 15.60690975189209, "learning_rate": 8.36734693877551e-06, "loss": 1.2847, "step": 18 }, { "epoch": 0.1417910447761194, "grad_norm": 14.372577667236328, "learning_rate": 8.26530612244898e-06, "loss": 1.3, "step": 19 }, { "epoch": 0.14925373134328357, "grad_norm": 15.565217971801758, "learning_rate": 8.16326530612245e-06, "loss": 1.344, "step": 20 }, { "epoch": 0.15671641791044777, "grad_norm": 16.995765686035156, "learning_rate": 8.06122448979592e-06, "loss": 1.1579, "step": 21 }, { "epoch": 0.16417910447761194, "grad_norm": 19.22214698791504, "learning_rate": 7.959183673469388e-06, "loss": 1.3757, "step": 22 }, { "epoch": 0.17164179104477612, "grad_norm": 15.240199089050293, "learning_rate": 7.857142857142858e-06, "loss": 1.3662, "step": 23 }, { "epoch": 0.1791044776119403, "grad_norm": 13.559714317321777, "learning_rate": 7.755102040816327e-06, "loss": 1.4103, "step": 24 }, { "epoch": 0.1865671641791045, "grad_norm": 11.714433670043945, "learning_rate": 7.653061224489796e-06, "loss": 0.4849, "step": 25 }, { "epoch": 0.19402985074626866, "grad_norm": 9.63026237487793, "learning_rate": 7.551020408163265e-06, "loss": 0.8331, "step": 26 }, { "epoch": 0.20149253731343283, "grad_norm": 14.981061935424805, "learning_rate": 7.448979591836736e-06, "loss": 1.1483, "step": 27 }, { "epoch": 0.208955223880597, "grad_norm": 9.80345630645752, "learning_rate": 7.346938775510205e-06, "loss": 0.3934, "step": 28 }, { "epoch": 0.21641791044776118, "grad_norm": 15.892987251281738, "learning_rate": 7.244897959183675e-06, "loss": 1.3524, "step": 29 }, { "epoch": 0.22388059701492538, "grad_norm": 15.154711723327637, "learning_rate": 7.1428571428571436e-06, "loss": 1.2404, "step": 30 }, { "epoch": 0.23134328358208955, "grad_norm": 21.161544799804688, "learning_rate": 7.0408163265306125e-06, "loss": 1.166, "step": 31 }, { "epoch": 0.23880597014925373, "grad_norm": 16.721675872802734, "learning_rate": 6.938775510204082e-06, "loss": 1.3453, "step": 32 }, { "epoch": 0.2462686567164179, "grad_norm": 15.240865707397461, "learning_rate": 6.836734693877551e-06, "loss": 1.017, "step": 33 }, { "epoch": 0.2537313432835821, "grad_norm": 13.743293762207031, "learning_rate": 6.734693877551021e-06, "loss": 1.2646, "step": 34 }, { "epoch": 0.26119402985074625, "grad_norm": 21.402868270874023, "learning_rate": 6.63265306122449e-06, "loss": 1.2539, "step": 35 }, { "epoch": 0.26865671641791045, "grad_norm": 12.299694061279297, "learning_rate": 6.530612244897959e-06, "loss": 0.5733, "step": 36 }, { "epoch": 0.27611940298507465, "grad_norm": 13.034706115722656, "learning_rate": 6.4285714285714295e-06, "loss": 1.3188, "step": 37 }, { "epoch": 0.2835820895522388, "grad_norm": 13.541306495666504, "learning_rate": 6.326530612244899e-06, "loss": 0.8876, "step": 38 }, { "epoch": 0.291044776119403, "grad_norm": 14.614362716674805, "learning_rate": 6.224489795918368e-06, "loss": 1.4944, "step": 39 }, { "epoch": 0.29850746268656714, "grad_norm": 18.72150230407715, "learning_rate": 6.122448979591837e-06, "loss": 1.4622, "step": 40 }, { "epoch": 0.30597014925373134, "grad_norm": 12.321627616882324, "learning_rate": 6.020408163265307e-06, "loss": 1.2637, "step": 41 }, { "epoch": 0.31343283582089554, "grad_norm": 11.823638916015625, "learning_rate": 5.918367346938776e-06, "loss": 1.1925, "step": 42 }, { "epoch": 0.3208955223880597, "grad_norm": 10.207959175109863, "learning_rate": 5.816326530612246e-06, "loss": 1.1677, "step": 43 }, { "epoch": 0.3283582089552239, "grad_norm": 14.41779613494873, "learning_rate": 5.7142857142857145e-06, "loss": 1.3201, "step": 44 }, { "epoch": 0.3358208955223881, "grad_norm": 10.349492073059082, "learning_rate": 5.6122448979591834e-06, "loss": 1.0491, "step": 45 }, { "epoch": 0.34328358208955223, "grad_norm": 9.687318801879883, "learning_rate": 5.510204081632653e-06, "loss": 0.5153, "step": 46 }, { "epoch": 0.35074626865671643, "grad_norm": 12.616000175476074, "learning_rate": 5.408163265306123e-06, "loss": 1.3589, "step": 47 }, { "epoch": 0.3582089552238806, "grad_norm": 15.669512748718262, "learning_rate": 5.306122448979593e-06, "loss": 0.7591, "step": 48 }, { "epoch": 0.3656716417910448, "grad_norm": 11.46850299835205, "learning_rate": 5.204081632653062e-06, "loss": 1.0643, "step": 49 }, { "epoch": 0.373134328358209, "grad_norm": 13.470056533813477, "learning_rate": 5.1020408163265315e-06, "loss": 1.1636, "step": 50 }, { "epoch": 0.3805970149253731, "grad_norm": 12.210711479187012, "learning_rate": 5e-06, "loss": 0.5832, "step": 51 }, { "epoch": 0.3880597014925373, "grad_norm": 12.775903701782227, "learning_rate": 4.897959183673469e-06, "loss": 0.8638, "step": 52 }, { "epoch": 0.39552238805970147, "grad_norm": 12.06881046295166, "learning_rate": 4.795918367346939e-06, "loss": 1.234, "step": 53 }, { "epoch": 0.40298507462686567, "grad_norm": 11.675975799560547, "learning_rate": 4.693877551020409e-06, "loss": 1.1502, "step": 54 }, { "epoch": 0.41044776119402987, "grad_norm": 10.595233917236328, "learning_rate": 4.591836734693878e-06, "loss": 0.5959, "step": 55 }, { "epoch": 0.417910447761194, "grad_norm": 13.459734916687012, "learning_rate": 4.489795918367348e-06, "loss": 1.4252, "step": 56 }, { "epoch": 0.4253731343283582, "grad_norm": 10.65233325958252, "learning_rate": 4.3877551020408165e-06, "loss": 1.0071, "step": 57 }, { "epoch": 0.43283582089552236, "grad_norm": 10.991082191467285, "learning_rate": 4.2857142857142855e-06, "loss": 0.9786, "step": 58 }, { "epoch": 0.44029850746268656, "grad_norm": 12.973753929138184, "learning_rate": 4.183673469387755e-06, "loss": 1.3687, "step": 59 }, { "epoch": 0.44776119402985076, "grad_norm": 8.963390350341797, "learning_rate": 4.081632653061225e-06, "loss": 0.4663, "step": 60 }, { "epoch": 0.4552238805970149, "grad_norm": 15.190298080444336, "learning_rate": 3.979591836734694e-06, "loss": 1.5676, "step": 61 }, { "epoch": 0.4626865671641791, "grad_norm": 12.159441947937012, "learning_rate": 3.877551020408164e-06, "loss": 0.9444, "step": 62 }, { "epoch": 0.4701492537313433, "grad_norm": 12.203204154968262, "learning_rate": 3.7755102040816327e-06, "loss": 1.0785, "step": 63 }, { "epoch": 0.47761194029850745, "grad_norm": 13.535058975219727, "learning_rate": 3.6734693877551024e-06, "loss": 1.3801, "step": 64 }, { "epoch": 0.48507462686567165, "grad_norm": 15.462494850158691, "learning_rate": 3.5714285714285718e-06, "loss": 1.3773, "step": 65 }, { "epoch": 0.4925373134328358, "grad_norm": 10.449372291564941, "learning_rate": 3.469387755102041e-06, "loss": 0.8747, "step": 66 }, { "epoch": 0.5, "grad_norm": 15.766761779785156, "learning_rate": 3.3673469387755105e-06, "loss": 1.0549, "step": 67 }, { "epoch": 0.5074626865671642, "grad_norm": 7.633336067199707, "learning_rate": 3.2653061224489794e-06, "loss": 0.344, "step": 68 }, { "epoch": 0.5149253731343284, "grad_norm": 13.762042999267578, "learning_rate": 3.1632653061224496e-06, "loss": 1.2072, "step": 69 }, { "epoch": 0.5223880597014925, "grad_norm": 11.871623992919922, "learning_rate": 3.0612244897959185e-06, "loss": 1.1146, "step": 70 }, { "epoch": 0.5298507462686567, "grad_norm": 12.153115272521973, "learning_rate": 2.959183673469388e-06, "loss": 1.0648, "step": 71 }, { "epoch": 0.5373134328358209, "grad_norm": 15.02953052520752, "learning_rate": 2.8571428571428573e-06, "loss": 1.0374, "step": 72 }, { "epoch": 0.5447761194029851, "grad_norm": 13.172088623046875, "learning_rate": 2.7551020408163266e-06, "loss": 1.2356, "step": 73 }, { "epoch": 0.5522388059701493, "grad_norm": 11.575133323669434, "learning_rate": 2.6530612244897964e-06, "loss": 1.0695, "step": 74 }, { "epoch": 0.5597014925373134, "grad_norm": 12.820709228515625, "learning_rate": 2.5510204081632657e-06, "loss": 1.1283, "step": 75 }, { "epoch": 0.5671641791044776, "grad_norm": 12.87095832824707, "learning_rate": 2.4489795918367347e-06, "loss": 1.1721, "step": 76 }, { "epoch": 0.5746268656716418, "grad_norm": 13.630508422851562, "learning_rate": 2.3469387755102044e-06, "loss": 1.2478, "step": 77 }, { "epoch": 0.582089552238806, "grad_norm": 17.44233512878418, "learning_rate": 2.244897959183674e-06, "loss": 1.3778, "step": 78 }, { "epoch": 0.5895522388059702, "grad_norm": 12.048669815063477, "learning_rate": 2.1428571428571427e-06, "loss": 0.8454, "step": 79 }, { "epoch": 0.5970149253731343, "grad_norm": 10.956369400024414, "learning_rate": 2.0408163265306125e-06, "loss": 0.8798, "step": 80 }, { "epoch": 0.6044776119402985, "grad_norm": 12.508173942565918, "learning_rate": 1.938775510204082e-06, "loss": 1.2105, "step": 81 }, { "epoch": 0.6119402985074627, "grad_norm": 10.739660263061523, "learning_rate": 1.8367346938775512e-06, "loss": 1.0513, "step": 82 }, { "epoch": 0.6194029850746269, "grad_norm": 14.237381935119629, "learning_rate": 1.7346938775510206e-06, "loss": 1.3152, "step": 83 }, { "epoch": 0.6268656716417911, "grad_norm": 10.444908142089844, "learning_rate": 1.6326530612244897e-06, "loss": 0.7026, "step": 84 }, { "epoch": 0.6343283582089553, "grad_norm": 12.019598960876465, "learning_rate": 1.5306122448979593e-06, "loss": 1.1211, "step": 85 }, { "epoch": 0.6417910447761194, "grad_norm": 12.660603523254395, "learning_rate": 1.4285714285714286e-06, "loss": 0.9115, "step": 86 }, { "epoch": 0.6492537313432836, "grad_norm": 10.193121910095215, "learning_rate": 1.3265306122448982e-06, "loss": 0.9419, "step": 87 }, { "epoch": 0.6567164179104478, "grad_norm": 14.029544830322266, "learning_rate": 1.2244897959183673e-06, "loss": 1.0446, "step": 88 }, { "epoch": 0.664179104477612, "grad_norm": 10.461629867553711, "learning_rate": 1.122448979591837e-06, "loss": 0.9771, "step": 89 }, { "epoch": 0.6716417910447762, "grad_norm": 7.607001781463623, "learning_rate": 1.0204081632653063e-06, "loss": 0.5959, "step": 90 }, { "epoch": 0.6791044776119403, "grad_norm": 10.331579208374023, "learning_rate": 9.183673469387756e-07, "loss": 0.9838, "step": 91 }, { "epoch": 0.6865671641791045, "grad_norm": 10.43489933013916, "learning_rate": 8.163265306122449e-07, "loss": 0.9485, "step": 92 }, { "epoch": 0.6940298507462687, "grad_norm": 11.645877838134766, "learning_rate": 7.142857142857143e-07, "loss": 1.1304, "step": 93 }, { "epoch": 0.7014925373134329, "grad_norm": 11.262922286987305, "learning_rate": 6.122448979591837e-07, "loss": 0.7431, "step": 94 }, { "epoch": 0.7089552238805971, "grad_norm": 12.494215965270996, "learning_rate": 5.102040816326531e-07, "loss": 1.1966, "step": 95 }, { "epoch": 0.7164179104477612, "grad_norm": 11.587281227111816, "learning_rate": 4.0816326530612243e-07, "loss": 0.6951, "step": 96 }, { "epoch": 0.7238805970149254, "grad_norm": 10.64450454711914, "learning_rate": 3.0612244897959183e-07, "loss": 0.6916, "step": 97 }, { "epoch": 0.7313432835820896, "grad_norm": 11.353170394897461, "learning_rate": 2.0408163265306121e-07, "loss": 0.7753, "step": 98 }, { "epoch": 0.7388059701492538, "grad_norm": 10.214649200439453, "learning_rate": 1.0204081632653061e-07, "loss": 0.6923, "step": 99 }, { "epoch": 0.746268656716418, "grad_norm": 11.73704719543457, "learning_rate": 0.0, "loss": 1.1282, "step": 100 }, { "epoch": 0.746268656716418, "step": 100, "total_flos": 2305515375820800.0, "train_loss": 1.1278739917278289, "train_runtime": 528.5108, "train_samples_per_second": 0.189, "train_steps_per_second": 0.189 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2305515375820800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }