{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9036144578313253, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 4.9995000000000005e-05, "loss": 1.7049, "step": 1 }, { "epoch": 0.01, "learning_rate": 4.999e-05, "loss": 2.9872, "step": 2 }, { "epoch": 0.02, "learning_rate": 4.9985e-05, "loss": 2.1065, "step": 3 }, { "epoch": 0.02, "learning_rate": 4.9980000000000006e-05, "loss": 2.2636, "step": 4 }, { "epoch": 0.03, "learning_rate": 4.9975e-05, "loss": 1.8175, "step": 5 }, { "epoch": 0.04, "learning_rate": 4.997e-05, "loss": 1.5265, "step": 6 }, { "epoch": 0.04, "learning_rate": 4.9965e-05, "loss": 1.3999, "step": 7 }, { "epoch": 0.05, "learning_rate": 4.996e-05, "loss": 1.3867, "step": 8 }, { "epoch": 0.05, "learning_rate": 4.9955e-05, "loss": 1.3329, "step": 9 }, { "epoch": 0.06, "learning_rate": 4.995e-05, "loss": 1.3145, "step": 10 }, { "epoch": 0.07, "learning_rate": 4.9945000000000004e-05, "loss": 1.2775, "step": 11 }, { "epoch": 0.07, "learning_rate": 4.9940000000000006e-05, "loss": 1.2551, "step": 12 }, { "epoch": 0.08, "learning_rate": 4.9935e-05, "loss": 1.2279, "step": 13 }, { "epoch": 0.08, "learning_rate": 4.9930000000000005e-05, "loss": 1.1668, "step": 14 }, { "epoch": 0.09, "learning_rate": 4.992500000000001e-05, "loss": 1.1261, "step": 15 }, { "epoch": 0.1, "learning_rate": 4.992e-05, "loss": 1.1738, "step": 16 }, { "epoch": 0.1, "learning_rate": 4.9915e-05, "loss": 1.1098, "step": 17 }, { "epoch": 0.11, "learning_rate": 4.991e-05, "loss": 1.1268, "step": 18 }, { "epoch": 0.11, "learning_rate": 4.9905000000000004e-05, "loss": 1.1261, "step": 19 }, { "epoch": 0.12, "learning_rate": 4.99e-05, "loss": 1.1152, "step": 20 }, { "epoch": 0.13, "learning_rate": 4.9895e-05, "loss": 1.1345, "step": 21 }, { "epoch": 0.13, "learning_rate": 4.9890000000000005e-05, "loss": 1.1341, "step": 22 }, { "epoch": 0.14, "learning_rate": 4.9885e-05, "loss": 1.1288, "step": 23 }, { "epoch": 0.14, "learning_rate": 4.9880000000000004e-05, "loss": 1.1264, "step": 24 }, { "epoch": 0.15, "learning_rate": 4.9875000000000006e-05, "loss": 1.0576, "step": 25 }, { "epoch": 0.16, "learning_rate": 4.987e-05, "loss": 1.0905, "step": 26 }, { "epoch": 0.16, "learning_rate": 4.9865e-05, "loss": 1.0967, "step": 27 }, { "epoch": 0.17, "learning_rate": 4.986e-05, "loss": 1.0937, "step": 28 }, { "epoch": 0.17, "learning_rate": 4.9855e-05, "loss": 1.0597, "step": 29 }, { "epoch": 0.18, "learning_rate": 4.9850000000000006e-05, "loss": 1.0706, "step": 30 }, { "epoch": 0.19, "learning_rate": 4.9845e-05, "loss": 1.0608, "step": 31 }, { "epoch": 0.19, "learning_rate": 4.9840000000000004e-05, "loss": 1.0817, "step": 32 }, { "epoch": 0.2, "learning_rate": 4.9835000000000007e-05, "loss": 1.0739, "step": 33 }, { "epoch": 0.2, "learning_rate": 4.983e-05, "loss": 1.0434, "step": 34 }, { "epoch": 0.21, "learning_rate": 4.9825000000000005e-05, "loss": 1.0168, "step": 35 }, { "epoch": 0.22, "learning_rate": 4.982e-05, "loss": 1.0351, "step": 36 }, { "epoch": 0.22, "learning_rate": 4.9815e-05, "loss": 1.0435, "step": 37 }, { "epoch": 0.23, "learning_rate": 4.981e-05, "loss": 1.0247, "step": 38 }, { "epoch": 0.23, "learning_rate": 4.9805e-05, "loss": 1.0065, "step": 39 }, { "epoch": 0.24, "learning_rate": 4.9800000000000004e-05, "loss": 1.0036, "step": 40 }, { "epoch": 0.25, "learning_rate": 4.9795e-05, "loss": 1.0483, "step": 41 }, { "epoch": 0.25, "learning_rate": 4.979e-05, "loss": 1.0252, "step": 42 }, { "epoch": 0.26, "learning_rate": 4.9785000000000005e-05, "loss": 1.0344, "step": 43 }, { "epoch": 0.27, "learning_rate": 4.978e-05, "loss": 1.0353, "step": 44 }, { "epoch": 0.27, "learning_rate": 4.9775000000000004e-05, "loss": 1.0381, "step": 45 }, { "epoch": 0.28, "learning_rate": 4.977e-05, "loss": 0.9899, "step": 46 }, { "epoch": 0.28, "learning_rate": 4.9765e-05, "loss": 0.9786, "step": 47 }, { "epoch": 0.29, "learning_rate": 4.976e-05, "loss": 1.0434, "step": 48 }, { "epoch": 0.3, "learning_rate": 4.9755e-05, "loss": 1.0069, "step": 49 }, { "epoch": 0.3, "learning_rate": 4.975e-05, "loss": 0.9665, "step": 50 }, { "epoch": 0.31, "learning_rate": 4.9745000000000006e-05, "loss": 0.9754, "step": 51 }, { "epoch": 0.31, "learning_rate": 4.974e-05, "loss": 0.9718, "step": 52 }, { "epoch": 0.32, "learning_rate": 4.9735000000000004e-05, "loss": 0.9828, "step": 53 }, { "epoch": 0.33, "learning_rate": 4.973000000000001e-05, "loss": 0.996, "step": 54 }, { "epoch": 0.33, "learning_rate": 4.9725e-05, "loss": 1.026, "step": 55 }, { "epoch": 0.34, "learning_rate": 4.972e-05, "loss": 0.9832, "step": 56 }, { "epoch": 0.34, "learning_rate": 4.9715e-05, "loss": 0.9985, "step": 57 }, { "epoch": 0.35, "learning_rate": 4.9710000000000003e-05, "loss": 1.0145, "step": 58 }, { "epoch": 0.36, "learning_rate": 4.9705e-05, "loss": 0.9708, "step": 59 }, { "epoch": 0.36, "learning_rate": 4.97e-05, "loss": 0.9411, "step": 60 }, { "epoch": 0.37, "learning_rate": 4.9695000000000004e-05, "loss": 0.9774, "step": 61 }, { "epoch": 0.37, "learning_rate": 4.969e-05, "loss": 1.0044, "step": 62 }, { "epoch": 0.38, "learning_rate": 4.9685e-05, "loss": 0.9581, "step": 63 }, { "epoch": 0.39, "learning_rate": 4.9680000000000005e-05, "loss": 1.0007, "step": 64 }, { "epoch": 0.39, "learning_rate": 4.967500000000001e-05, "loss": 0.9589, "step": 65 }, { "epoch": 0.4, "learning_rate": 4.967e-05, "loss": 0.9704, "step": 66 }, { "epoch": 0.4, "learning_rate": 4.9665e-05, "loss": 0.978, "step": 67 }, { "epoch": 0.41, "learning_rate": 4.966e-05, "loss": 0.9554, "step": 68 }, { "epoch": 0.42, "learning_rate": 4.9655000000000005e-05, "loss": 0.9433, "step": 69 }, { "epoch": 0.42, "learning_rate": 4.965e-05, "loss": 0.9905, "step": 70 }, { "epoch": 0.43, "learning_rate": 4.9645e-05, "loss": 0.9578, "step": 71 }, { "epoch": 0.43, "learning_rate": 4.9640000000000006e-05, "loss": 0.9069, "step": 72 }, { "epoch": 0.44, "learning_rate": 4.9635e-05, "loss": 0.9647, "step": 73 }, { "epoch": 0.45, "learning_rate": 4.9630000000000004e-05, "loss": 0.9377, "step": 74 }, { "epoch": 0.45, "learning_rate": 4.962500000000001e-05, "loss": 0.9343, "step": 75 }, { "epoch": 0.46, "learning_rate": 4.962e-05, "loss": 0.9644, "step": 76 }, { "epoch": 0.46, "learning_rate": 4.9615e-05, "loss": 0.9737, "step": 77 }, { "epoch": 0.47, "learning_rate": 4.961e-05, "loss": 0.9552, "step": 78 }, { "epoch": 0.48, "learning_rate": 4.9605000000000004e-05, "loss": 0.9158, "step": 79 }, { "epoch": 0.48, "learning_rate": 4.96e-05, "loss": 0.9278, "step": 80 }, { "epoch": 0.49, "learning_rate": 4.9595e-05, "loss": 0.9464, "step": 81 }, { "epoch": 0.49, "learning_rate": 4.9590000000000005e-05, "loss": 0.9672, "step": 82 }, { "epoch": 0.5, "learning_rate": 4.9585e-05, "loss": 0.9487, "step": 83 }, { "epoch": 0.51, "learning_rate": 4.958e-05, "loss": 0.9248, "step": 84 }, { "epoch": 0.51, "learning_rate": 4.9575000000000006e-05, "loss": 0.9443, "step": 85 }, { "epoch": 0.52, "learning_rate": 4.957e-05, "loss": 0.9565, "step": 86 }, { "epoch": 0.52, "learning_rate": 4.9565e-05, "loss": 0.934, "step": 87 }, { "epoch": 0.53, "learning_rate": 4.956e-05, "loss": 0.9198, "step": 88 }, { "epoch": 0.54, "learning_rate": 4.9555e-05, "loss": 0.8948, "step": 89 }, { "epoch": 0.54, "learning_rate": 4.9550000000000005e-05, "loss": 0.9336, "step": 90 }, { "epoch": 0.55, "learning_rate": 4.9545e-05, "loss": 0.9117, "step": 91 }, { "epoch": 0.55, "learning_rate": 4.9540000000000003e-05, "loss": 0.876, "step": 92 }, { "epoch": 0.56, "learning_rate": 4.9535000000000006e-05, "loss": 0.9297, "step": 93 }, { "epoch": 0.57, "learning_rate": 4.953e-05, "loss": 0.9241, "step": 94 }, { "epoch": 0.57, "learning_rate": 4.9525000000000004e-05, "loss": 0.9305, "step": 95 }, { "epoch": 0.58, "learning_rate": 4.952e-05, "loss": 0.9395, "step": 96 }, { "epoch": 0.58, "learning_rate": 4.9515e-05, "loss": 0.9208, "step": 97 }, { "epoch": 0.59, "learning_rate": 4.951e-05, "loss": 0.9112, "step": 98 }, { "epoch": 0.6, "learning_rate": 4.9505e-05, "loss": 0.9159, "step": 99 }, { "epoch": 0.6, "learning_rate": 4.9500000000000004e-05, "loss": 0.902, "step": 100 }, { "epoch": 0.61, "learning_rate": 4.9495e-05, "loss": 0.9412, "step": 101 }, { "epoch": 0.61, "learning_rate": 4.949e-05, "loss": 0.8916, "step": 102 }, { "epoch": 0.62, "learning_rate": 4.9485000000000005e-05, "loss": 0.8846, "step": 103 }, { "epoch": 0.63, "learning_rate": 4.948000000000001e-05, "loss": 0.8974, "step": 104 }, { "epoch": 0.63, "learning_rate": 4.9475e-05, "loss": 0.9332, "step": 105 }, { "epoch": 0.64, "learning_rate": 4.947e-05, "loss": 0.9, "step": 106 }, { "epoch": 0.64, "learning_rate": 4.9465e-05, "loss": 0.92, "step": 107 }, { "epoch": 0.65, "learning_rate": 4.946e-05, "loss": 0.8802, "step": 108 }, { "epoch": 0.66, "learning_rate": 4.9455e-05, "loss": 0.9199, "step": 109 }, { "epoch": 0.66, "learning_rate": 4.945e-05, "loss": 0.8872, "step": 110 }, { "epoch": 0.67, "learning_rate": 4.9445000000000005e-05, "loss": 0.8689, "step": 111 }, { "epoch": 0.67, "learning_rate": 4.944e-05, "loss": 0.906, "step": 112 }, { "epoch": 0.68, "learning_rate": 4.9435000000000004e-05, "loss": 0.8725, "step": 113 }, { "epoch": 0.69, "learning_rate": 4.9430000000000006e-05, "loss": 0.8708, "step": 114 }, { "epoch": 0.69, "learning_rate": 4.9425e-05, "loss": 0.8726, "step": 115 }, { "epoch": 0.7, "learning_rate": 4.942e-05, "loss": 0.8584, "step": 116 }, { "epoch": 0.7, "learning_rate": 4.9415e-05, "loss": 0.9218, "step": 117 }, { "epoch": 0.71, "learning_rate": 4.941e-05, "loss": 0.8279, "step": 118 }, { "epoch": 0.72, "learning_rate": 4.9405e-05, "loss": 0.9098, "step": 119 }, { "epoch": 0.72, "learning_rate": 4.94e-05, "loss": 0.8924, "step": 120 }, { "epoch": 0.73, "learning_rate": 4.9395000000000004e-05, "loss": 0.8897, "step": 121 }, { "epoch": 0.73, "learning_rate": 4.939e-05, "loss": 0.876, "step": 122 }, { "epoch": 0.74, "learning_rate": 4.9385e-05, "loss": 0.912, "step": 123 }, { "epoch": 0.75, "learning_rate": 4.9380000000000005e-05, "loss": 0.878, "step": 124 }, { "epoch": 0.75, "learning_rate": 4.937500000000001e-05, "loss": 0.8596, "step": 125 }, { "epoch": 0.76, "learning_rate": 4.937e-05, "loss": 0.8849, "step": 126 }, { "epoch": 0.77, "learning_rate": 4.9365e-05, "loss": 0.8471, "step": 127 }, { "epoch": 0.77, "learning_rate": 4.936e-05, "loss": 0.894, "step": 128 }, { "epoch": 0.78, "learning_rate": 4.9355000000000004e-05, "loss": 0.9051, "step": 129 }, { "epoch": 0.78, "learning_rate": 4.935e-05, "loss": 0.8714, "step": 130 }, { "epoch": 0.79, "learning_rate": 4.9345e-05, "loss": 0.8685, "step": 131 }, { "epoch": 0.8, "learning_rate": 4.9340000000000005e-05, "loss": 0.8667, "step": 132 }, { "epoch": 0.8, "learning_rate": 4.9335e-05, "loss": 0.8521, "step": 133 }, { "epoch": 0.81, "learning_rate": 4.9330000000000004e-05, "loss": 0.8964, "step": 134 }, { "epoch": 0.81, "learning_rate": 4.9325000000000006e-05, "loss": 0.8463, "step": 135 }, { "epoch": 0.82, "learning_rate": 4.932e-05, "loss": 0.8626, "step": 136 }, { "epoch": 0.83, "learning_rate": 4.9315e-05, "loss": 0.84, "step": 137 }, { "epoch": 0.83, "learning_rate": 4.931e-05, "loss": 0.8535, "step": 138 }, { "epoch": 0.84, "learning_rate": 4.9305e-05, "loss": 0.8539, "step": 139 }, { "epoch": 0.84, "learning_rate": 4.93e-05, "loss": 0.849, "step": 140 }, { "epoch": 0.85, "learning_rate": 4.9295e-05, "loss": 0.8713, "step": 141 }, { "epoch": 0.86, "learning_rate": 4.9290000000000004e-05, "loss": 0.8493, "step": 142 }, { "epoch": 0.86, "learning_rate": 4.928500000000001e-05, "loss": 0.8424, "step": 143 }, { "epoch": 0.87, "learning_rate": 4.928e-05, "loss": 0.8723, "step": 144 }, { "epoch": 0.87, "learning_rate": 4.9275000000000005e-05, "loss": 0.8732, "step": 145 }, { "epoch": 0.88, "learning_rate": 4.927000000000001e-05, "loss": 0.8698, "step": 146 }, { "epoch": 0.89, "learning_rate": 4.9265e-05, "loss": 0.8615, "step": 147 }, { "epoch": 0.89, "learning_rate": 4.926e-05, "loss": 0.8728, "step": 148 }, { "epoch": 0.9, "learning_rate": 4.9255e-05, "loss": 0.9018, "step": 149 }, { "epoch": 0.9, "learning_rate": 4.9250000000000004e-05, "loss": 0.8704, "step": 150 } ], "logging_steps": 1, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 61, "save_steps": 10, "total_flos": 1.916845826789376e+16, "train_batch_size": 3584, "trial_name": null, "trial_params": null }