{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 739, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013531799729364006, "grad_norm": 22.625, "learning_rate": 1.3513513513513515e-07, "loss": 2.1989, "step": 1 }, { "epoch": 0.013531799729364006, "grad_norm": 22.5, "learning_rate": 1.3513513513513515e-06, "loss": 2.2213, "step": 10 }, { "epoch": 0.02706359945872801, "grad_norm": 8.6875, "learning_rate": 2.702702702702703e-06, "loss": 2.2125, "step": 20 }, { "epoch": 0.04059539918809202, "grad_norm": 5.15625, "learning_rate": 4.0540540540540545e-06, "loss": 2.1298, "step": 30 }, { "epoch": 0.05412719891745602, "grad_norm": 3.734375, "learning_rate": 5.405405405405406e-06, "loss": 2.0609, "step": 40 }, { "epoch": 0.06765899864682003, "grad_norm": 3.65625, "learning_rate": 6.7567567567567575e-06, "loss": 2.0108, "step": 50 }, { "epoch": 0.08119079837618404, "grad_norm": 3.546875, "learning_rate": 8.108108108108109e-06, "loss": 1.918, "step": 60 }, { "epoch": 0.09472259810554803, "grad_norm": 3.03125, "learning_rate": 9.45945945945946e-06, "loss": 1.8292, "step": 70 }, { "epoch": 0.10825439783491204, "grad_norm": 2.90625, "learning_rate": 9.99799150901764e-06, "loss": 1.8037, "step": 80 }, { "epoch": 0.12178619756427606, "grad_norm": 2.796875, "learning_rate": 9.985723240513795e-06, "loss": 1.7704, "step": 90 }, { "epoch": 0.13531799729364005, "grad_norm": 2.578125, "learning_rate": 9.962329873928743e-06, "loss": 1.7527, "step": 100 }, { "epoch": 0.14884979702300405, "grad_norm": 2.609375, "learning_rate": 9.92786360901067e-06, "loss": 1.7479, "step": 110 }, { "epoch": 0.16238159675236807, "grad_norm": 2.640625, "learning_rate": 9.882401353473711e-06, "loss": 1.7169, "step": 120 }, { "epoch": 0.17591339648173207, "grad_norm": 2.625, "learning_rate": 9.826044551386743e-06, "loss": 1.7196, "step": 130 }, { "epoch": 0.18944519621109607, "grad_norm": 2.703125, "learning_rate": 9.758918956812024e-06, "loss": 1.7238, "step": 140 }, { "epoch": 0.2029769959404601, "grad_norm": 2.65625, "learning_rate": 9.681174353198687e-06, "loss": 1.7291, "step": 150 }, { "epoch": 0.2165087956698241, "grad_norm": 2.625, "learning_rate": 9.59298421915731e-06, "loss": 1.7071, "step": 160 }, { "epoch": 0.23004059539918809, "grad_norm": 2.546875, "learning_rate": 9.494545341361291e-06, "loss": 1.715, "step": 170 }, { "epoch": 0.2435723951285521, "grad_norm": 2.734375, "learning_rate": 9.386077375438848e-06, "loss": 1.7119, "step": 180 }, { "epoch": 0.2571041948579161, "grad_norm": 2.5625, "learning_rate": 9.267822355835402e-06, "loss": 1.7079, "step": 190 }, { "epoch": 0.2706359945872801, "grad_norm": 2.6875, "learning_rate": 9.140044155740102e-06, "loss": 1.6912, "step": 200 }, { "epoch": 0.28416779431664413, "grad_norm": 2.6875, "learning_rate": 9.003027898281551e-06, "loss": 1.7097, "step": 210 }, { "epoch": 0.2976995940460081, "grad_norm": 2.59375, "learning_rate": 8.85707932030663e-06, "loss": 1.6883, "step": 220 }, { "epoch": 0.3112313937753721, "grad_norm": 2.484375, "learning_rate": 8.702524090162023e-06, "loss": 1.6917, "step": 230 }, { "epoch": 0.32476319350473615, "grad_norm": 2.65625, "learning_rate": 8.539707081000808e-06, "loss": 1.6951, "step": 240 }, { "epoch": 0.3382949932341001, "grad_norm": 2.46875, "learning_rate": 8.36899160123559e-06, "loss": 1.7043, "step": 250 }, { "epoch": 0.35182679296346414, "grad_norm": 2.5, "learning_rate": 8.190758583855379e-06, "loss": 1.6851, 
"step": 260 }, { "epoch": 0.36535859269282817, "grad_norm": 2.53125, "learning_rate": 8.005405736415127e-06, "loss": 1.6912, "step": 270 }, { "epoch": 0.37889039242219213, "grad_norm": 2.578125, "learning_rate": 7.813346653594667e-06, "loss": 1.7011, "step": 280 }, { "epoch": 0.39242219215155616, "grad_norm": 2.484375, "learning_rate": 7.615009894307263e-06, "loss": 1.6955, "step": 290 }, { "epoch": 0.4059539918809202, "grad_norm": 2.640625, "learning_rate": 7.410838025417083e-06, "loss": 1.6848, "step": 300 }, { "epoch": 0.41948579161028415, "grad_norm": 2.6875, "learning_rate": 7.201286634199484e-06, "loss": 1.6749, "step": 310 }, { "epoch": 0.4330175913396482, "grad_norm": 2.546875, "learning_rate": 6.986823311747652e-06, "loss": 1.6797, "step": 320 }, { "epoch": 0.4465493910690122, "grad_norm": 2.53125, "learning_rate": 6.767926609594032e-06, "loss": 1.6744, "step": 330 }, { "epoch": 0.46008119079837617, "grad_norm": 2.453125, "learning_rate": 6.545084971874738e-06, "loss": 1.6636, "step": 340 }, { "epoch": 0.4736129905277402, "grad_norm": 2.640625, "learning_rate": 6.3187956454196885e-06, "loss": 1.6949, "step": 350 }, { "epoch": 0.4871447902571042, "grad_norm": 2.46875, "learning_rate": 6.0895635702004985e-06, "loss": 1.6903, "step": 360 }, { "epoch": 0.5006765899864682, "grad_norm": 2.53125, "learning_rate": 5.857900252611959e-06, "loss": 1.6602, "step": 370 }, { "epoch": 0.5142083897158322, "grad_norm": 2.515625, "learning_rate": 5.624322624101255e-06, "loss": 1.6888, "step": 380 }, { "epoch": 0.5277401894451962, "grad_norm": 2.765625, "learning_rate": 5.3893518876917795e-06, "loss": 1.6876, "step": 390 }, { "epoch": 0.5412719891745602, "grad_norm": 2.546875, "learning_rate": 5.153512354975388e-06, "loss": 1.656, "step": 400 }, { "epoch": 0.5548037889039242, "grad_norm": 2.71875, "learning_rate": 4.917330276168208e-06, "loss": 1.6737, "step": 410 }, { "epoch": 0.5683355886332883, "grad_norm": 2.59375, "learning_rate": 4.681332665840647e-06, "loss": 1.6856, "step": 420 }, { "epoch": 0.5818673883626523, "grad_norm": 2.5625, "learning_rate": 4.446046126941801e-06, "loss": 1.6917, "step": 430 }, { "epoch": 0.5953991880920162, "grad_norm": 2.71875, "learning_rate": 4.211995675742358e-06, "loss": 1.6728, "step": 440 }, { "epoch": 0.6089309878213802, "grad_norm": 2.59375, "learning_rate": 3.979703570318017e-06, "loss": 1.6919, "step": 450 }, { "epoch": 0.6224627875507442, "grad_norm": 2.609375, "learning_rate": 3.749688145187497e-06, "loss": 1.6789, "step": 460 }, { "epoch": 0.6359945872801083, "grad_norm": 2.578125, "learning_rate": 3.5224626547055463e-06, "loss": 1.687, "step": 470 }, { "epoch": 0.6495263870094723, "grad_norm": 2.703125, "learning_rate": 3.298534127791785e-06, "loss": 1.678, "step": 480 }, { "epoch": 0.6630581867388363, "grad_norm": 2.453125, "learning_rate": 3.078402236550926e-06, "loss": 1.6756, "step": 490 }, { "epoch": 0.6765899864682002, "grad_norm": 2.546875, "learning_rate": 2.86255818130892e-06, "loss": 1.6895, "step": 500 }, { "epoch": 0.6901217861975643, "grad_norm": 2.5, "learning_rate": 2.6514835945529706e-06, "loss": 1.685, "step": 510 }, { "epoch": 0.7036535859269283, "grad_norm": 2.609375, "learning_rate": 2.4456494662211082e-06, "loss": 1.6879, "step": 520 }, { "epoch": 0.7171853856562923, "grad_norm": 2.59375, "learning_rate": 2.245515092739488e-06, "loss": 1.6812, "step": 530 }, { "epoch": 0.7307171853856563, "grad_norm": 2.671875, "learning_rate": 2.0515270521524562e-06, "loss": 1.6727, "step": 540 }, { "epoch": 0.7442489851150202, "grad_norm": 
2.5625, "learning_rate": 1.864118207632315e-06, "loss": 1.6839, "step": 550 }, { "epoch": 0.7577807848443843, "grad_norm": 2.78125, "learning_rate": 1.683706741592327e-06, "loss": 1.6745, "step": 560 }, { "epoch": 0.7713125845737483, "grad_norm": 2.796875, "learning_rate": 1.5106952225582312e-06, "loss": 1.677, "step": 570 }, { "epoch": 0.7848443843031123, "grad_norm": 2.53125, "learning_rate": 1.3454697068804434e-06, "loss": 1.6677, "step": 580 }, { "epoch": 0.7983761840324763, "grad_norm": 2.515625, "learning_rate": 1.1883988772913924e-06, "loss": 1.6725, "step": 590 }, { "epoch": 0.8119079837618404, "grad_norm": 2.5625, "learning_rate": 1.0398332202301708e-06, "loss": 1.669, "step": 600 }, { "epoch": 0.8254397834912043, "grad_norm": 2.515625, "learning_rate": 9.001042437702468e-07, "loss": 1.6775, "step": 610 }, { "epoch": 0.8389715832205683, "grad_norm": 2.578125, "learning_rate": 7.695237378953224e-07, "loss": 1.6914, "step": 620 }, { "epoch": 0.8525033829499323, "grad_norm": 2.5, "learning_rate": 6.483830787739659e-07, "loss": 1.6743, "step": 630 }, { "epoch": 0.8660351826792964, "grad_norm": 2.453125, "learning_rate": 5.369525785854368e-07, "loss": 1.6784, "step": 640 }, { "epoch": 0.8795669824086604, "grad_norm": 2.546875, "learning_rate": 4.3548088234752814e-07, "loss": 1.687, "step": 650 }, { "epoch": 0.8930987821380244, "grad_norm": 2.59375, "learning_rate": 3.4419441309229587e-07, "loss": 1.6729, "step": 660 }, { "epoch": 0.9066305818673883, "grad_norm": 2.5625, "learning_rate": 2.6329686662774247e-07, "loss": 1.6775, "step": 670 }, { "epoch": 0.9201623815967523, "grad_norm": 2.390625, "learning_rate": 1.9296875701281858e-07, "loss": 1.6813, "step": 680 }, { "epoch": 0.9336941813261164, "grad_norm": 2.46875, "learning_rate": 1.333670137599713e-07, "loss": 1.6944, "step": 690 }, { "epoch": 0.9472259810554804, "grad_norm": 2.75, "learning_rate": 8.462463166403978e-08, "loss": 1.6899, "step": 700 }, { "epoch": 0.9607577807848444, "grad_norm": 2.4375, "learning_rate": 4.685037403886483e-08, "loss": 1.6669, "step": 710 }, { "epoch": 0.9742895805142084, "grad_norm": 2.640625, "learning_rate": 2.012853002380466e-08, "loss": 1.6881, "step": 720 }, { "epoch": 0.9878213802435724, "grad_norm": 2.59375, "learning_rate": 4.51872650170937e-09, "loss": 1.6671, "step": 730 }, { "epoch": 1.0, "eval_loss": 1.7291793823242188, "eval_runtime": 27.3164, "eval_samples_per_second": 15.924, "eval_steps_per_second": 2.013, "step": 739 }, { "epoch": 1.0, "step": 739, "total_flos": 7.51411531916247e+16, "train_loss": 1.726003464891075, "train_runtime": 1754.8001, "train_samples_per_second": 3.368, "train_steps_per_second": 0.421 } ], "logging_steps": 10, "max_steps": 739, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.51411531916247e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }