|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4370868163689013, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008741736327378026, |
|
"grad_norm": 19.75, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 4.7931, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0017483472654756052, |
|
"grad_norm": 21.75, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 4.937, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0026225208982134074, |
|
"grad_norm": 20.625, |
|
"learning_rate": 2.586206896551724e-06, |
|
"loss": 5.0176, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0034966945309512104, |
|
"grad_norm": 19.875, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 4.8236, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.004370868163689013, |
|
"grad_norm": 16.875, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 4.761, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005245041796426815, |
|
"grad_norm": 16.125, |
|
"learning_rate": 5.172413793103448e-06, |
|
"loss": 4.9055, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.006119215429164618, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 4.6787, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006993389061902421, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 4.6797, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.007867562694640224, |
|
"grad_norm": 19.0, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 4.6406, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.008741736327378026, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 4.5986, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009615909960115828, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 4.5762, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01049008359285363, |
|
"grad_norm": 6.875, |
|
"learning_rate": 1.0344827586206897e-05, |
|
"loss": 4.5882, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.011364257225591434, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.1206896551724138e-05, |
|
"loss": 4.3547, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.012238430858329236, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.206896551724138e-05, |
|
"loss": 4.2879, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.013112604491067038, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.2931034482758622e-05, |
|
"loss": 4.3241, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013986778123804841, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 4.386, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.014860951756542643, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 1.4655172413793103e-05, |
|
"loss": 4.266, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.015735125389280447, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.5517241379310346e-05, |
|
"loss": 4.2994, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01660929902201825, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.6379310344827585e-05, |
|
"loss": 4.1198, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.01748347265475605, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 4.1751, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.018357646287493853, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.810344827586207e-05, |
|
"loss": 3.9965, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.019231819920231655, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.896551724137931e-05, |
|
"loss": 3.9684, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.020105993552969458, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9827586206896554e-05, |
|
"loss": 3.9812, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02098016718570726, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.0689655172413793e-05, |
|
"loss": 3.9997, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.021854340818445065, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.1551724137931033e-05, |
|
"loss": 3.9327, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.022728514451182867, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 2.2413793103448276e-05, |
|
"loss": 3.937, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.02360268808392067, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.327586206896552e-05, |
|
"loss": 3.8923, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02447686171665847, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.413793103448276e-05, |
|
"loss": 3.8121, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.025351035349396273, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 3.8329, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.026225208982134075, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 3.7719, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.027099382614871877, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.672413793103448e-05, |
|
"loss": 3.8168, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.027973556247609683, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 3.6792, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.028847729880347485, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.844827586206897e-05, |
|
"loss": 3.6723, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.029721903513085287, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 2.9310344827586206e-05, |
|
"loss": 3.6362, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.03059607714582309, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.017241379310345e-05, |
|
"loss": 3.6452, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.031470250778560895, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.103448275862069e-05, |
|
"loss": 3.5118, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03234442441129869, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.1896551724137935e-05, |
|
"loss": 3.4852, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0332185980440365, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 3.275862068965517e-05, |
|
"loss": 3.3851, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0340927716767743, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.3620689655172414e-05, |
|
"loss": 3.3676, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0349669453095121, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 3.4482758620689657e-05, |
|
"loss": 3.4513, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0358411189422499, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.53448275862069e-05, |
|
"loss": 3.3572, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03671529257498771, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.620689655172414e-05, |
|
"loss": 3.229, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03758946620772551, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.7068965517241385e-05, |
|
"loss": 3.2683, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03846363984046331, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.793103448275862e-05, |
|
"loss": 3.2209, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.039337813473201116, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.8793103448275865e-05, |
|
"loss": 3.3169, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.040211987105938915, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.965517241379311e-05, |
|
"loss": 3.2609, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.04108616073867672, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.0517241379310344e-05, |
|
"loss": 3.2348, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.04196033437141452, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 3.2157, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.042834508004152325, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.224137931034483e-05, |
|
"loss": 3.118, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.04370868163689013, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.3103448275862066e-05, |
|
"loss": 3.1812, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04458285526962793, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.396551724137931e-05, |
|
"loss": 3.1164, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.045457028902365734, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.482758620689655e-05, |
|
"loss": 3.1016, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.04633120253510353, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.5689655172413794e-05, |
|
"loss": 3.0119, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.04720537616784134, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.655172413793104e-05, |
|
"loss": 3.0376, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.04807954980057914, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.741379310344828e-05, |
|
"loss": 3.0525, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04895372343331694, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.827586206896552e-05, |
|
"loss": 3.0417, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.04982789706605475, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.913793103448276e-05, |
|
"loss": 2.9921, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.050702070698792547, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 5e-05, |
|
"loss": 2.9874, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.05157624433153035, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.9999895202727756e-05, |
|
"loss": 2.9822, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.05245041796426815, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.9999580811789614e-05, |
|
"loss": 2.9278, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.053324591597005956, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.999905682982135e-05, |
|
"loss": 2.9424, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.054198765229743755, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.999832326121594e-05, |
|
"loss": 2.8771, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.05507293886248156, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.999738011212344e-05, |
|
"loss": 2.9132, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.055947112495219366, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.999622739045101e-05, |
|
"loss": 2.9479, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.056821286127957164, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.999486510586282e-05, |
|
"loss": 2.948, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05769545976069497, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.9993293269779975e-05, |
|
"loss": 2.948, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.05856963339343277, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.9991511895380396e-05, |
|
"loss": 2.9111, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.059443807026170574, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.998952099759874e-05, |
|
"loss": 2.9496, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.06031798065890837, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.998732059312625e-05, |
|
"loss": 2.8007, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.06119215429164618, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.998491070041066e-05, |
|
"loss": 2.8642, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.062066327924383984, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.998229133965596e-05, |
|
"loss": 2.8762, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.06294050155712179, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.997946253282231e-05, |
|
"loss": 2.8961, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.06381467518985959, |
|
"grad_norm": 1.25, |
|
"learning_rate": 4.9976424303625815e-05, |
|
"loss": 2.825, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.06468884882259739, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.997317667753831e-05, |
|
"loss": 2.8532, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.06556302245533518, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.9969719681787196e-05, |
|
"loss": 2.9245, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.066437196088073, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.9966053345355174e-05, |
|
"loss": 2.7549, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0673113697208108, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.9962177698979995e-05, |
|
"loss": 2.8295, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0681855433535486, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.995809277515424e-05, |
|
"loss": 2.7792, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0690597169862864, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.9953798608125025e-05, |
|
"loss": 2.7635, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0699338906190242, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 4.99492952338937e-05, |
|
"loss": 2.8317, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.070808064251762, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.994458269021557e-05, |
|
"loss": 2.7627, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0716822378844998, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.993966101659958e-05, |
|
"loss": 2.8273, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.07255641151723762, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.993453025430797e-05, |
|
"loss": 2.8587, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.07343058514997541, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.992919044635592e-05, |
|
"loss": 2.8023, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.07430475878271321, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.9923641637511226e-05, |
|
"loss": 2.6944, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07517893241545102, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.991788387429388e-05, |
|
"loss": 2.7955, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.07605310604818882, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.9911917204975724e-05, |
|
"loss": 2.8184, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.07692727968092662, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.9905741679580007e-05, |
|
"loss": 2.8002, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.07780145331366442, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.989935734988098e-05, |
|
"loss": 2.7749, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.07867562694640223, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.989276426940348e-05, |
|
"loss": 2.8351, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07954980057914003, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.988596249342244e-05, |
|
"loss": 2.7638, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.08042397421187783, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.987895207896248e-05, |
|
"loss": 2.7492, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.08129814784461564, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.987173308479738e-05, |
|
"loss": 2.7668, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.08217232147735344, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.9864305571449616e-05, |
|
"loss": 2.7527, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.08304649511009124, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.985666960118983e-05, |
|
"loss": 2.7963, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08392066874282904, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.984882523803634e-05, |
|
"loss": 2.7924, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.08479484237556685, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.9840772547754566e-05, |
|
"loss": 2.763, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.08566901600830465, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.983251159785651e-05, |
|
"loss": 2.7398, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.08654318964104245, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.982404245760018e-05, |
|
"loss": 2.7528, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.08741736327378026, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.9815365197988986e-05, |
|
"loss": 2.8205, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08829153690651806, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9806479891771195e-05, |
|
"loss": 2.7228, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.08916571053925586, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.9797386613439265e-05, |
|
"loss": 2.7599, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.09003988417199366, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.978808543922925e-05, |
|
"loss": 2.7388, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.09091405780473147, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.9778576447120184e-05, |
|
"loss": 2.7801, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.09178823143746927, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 4.976885971683337e-05, |
|
"loss": 2.656, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09266240507020707, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.9758935329831754e-05, |
|
"loss": 2.763, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.09353657870294488, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.974880336931923e-05, |
|
"loss": 2.7975, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.09441075233568268, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.9738463920239955e-05, |
|
"loss": 2.7029, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.09528492596842048, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.972791706927759e-05, |
|
"loss": 2.689, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.09615909960115827, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9717162904854664e-05, |
|
"loss": 2.7322, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09703327323389609, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9706201517131725e-05, |
|
"loss": 2.778, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.09790744686663388, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.9695032998006655e-05, |
|
"loss": 2.8284, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.09878162049937168, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9683657441113884e-05, |
|
"loss": 2.71, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0996557941321095, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.967207494182361e-05, |
|
"loss": 2.6782, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.1005299677648473, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.966028559724096e-05, |
|
"loss": 2.706, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.10140414139758509, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.964828950620524e-05, |
|
"loss": 2.7667, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.10227831503032289, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.963608676928905e-05, |
|
"loss": 2.685, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.1031524886630607, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.962367748879748e-05, |
|
"loss": 2.6407, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1040266622957985, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.961106176876723e-05, |
|
"loss": 2.662, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.1049008359285363, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.959823971496574e-05, |
|
"loss": 2.7101, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.10577500956127411, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.958521143489032e-05, |
|
"loss": 2.7607, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.10664918319401191, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.9571977037767217e-05, |
|
"loss": 2.6531, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.10752335682674971, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.955853663455072e-05, |
|
"loss": 2.6706, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.10839753045948751, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.954489033792227e-05, |
|
"loss": 2.6516, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.10927170409222532, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.95310382622894e-05, |
|
"loss": 2.6962, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.11014587772496312, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.951698052378492e-05, |
|
"loss": 2.702, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.11102005135770092, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.950271724026582e-05, |
|
"loss": 2.6833, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.11189422499043873, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.948824853131236e-05, |
|
"loss": 2.691, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.11276839862317653, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.947357451822706e-05, |
|
"loss": 2.64, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.11364257225591433, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.945869532403362e-05, |
|
"loss": 2.6507, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11451674588865213, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.944361107347597e-05, |
|
"loss": 2.7446, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.11539091952138994, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.942832189301716e-05, |
|
"loss": 2.6651, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.11626509315412774, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.941282791083836e-05, |
|
"loss": 2.6495, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.11713926678686554, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9397129256837724e-05, |
|
"loss": 2.6474, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.11801344041960335, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.938122606262936e-05, |
|
"loss": 2.6893, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11888761405234115, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.936511846154215e-05, |
|
"loss": 2.6667, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.11976178768507895, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.934880658861872e-05, |
|
"loss": 2.7114, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.12063596131781675, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.933229058061425e-05, |
|
"loss": 2.6641, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.12151013495055456, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9315570575995364e-05, |
|
"loss": 2.7359, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.12238430858329236, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.92986467149389e-05, |
|
"loss": 2.6406, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12325848221603015, |
|
"grad_norm": 1.0, |
|
"learning_rate": 4.9281519139330846e-05, |
|
"loss": 2.6395, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.12413265584876797, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.926418799276504e-05, |
|
"loss": 2.664, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.12500682948150577, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.924665342054204e-05, |
|
"loss": 2.6725, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.12588100311424358, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.922891556966788e-05, |
|
"loss": 2.6244, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.12675517674698136, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.921097458885282e-05, |
|
"loss": 2.6786, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12762935037971918, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.9192830628510126e-05, |
|
"loss": 2.7084, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.128503524012457, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.9174483840754815e-05, |
|
"loss": 2.688, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.12937769764519477, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.9155934379402335e-05, |
|
"loss": 2.6582, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.13025187127793258, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.9137182399967343e-05, |
|
"loss": 2.6099, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.13112604491067037, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.911822805966232e-05, |
|
"loss": 2.6315, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13200021854340818, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.909907151739633e-05, |
|
"loss": 2.6418, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.132874392176146, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.907971293377365e-05, |
|
"loss": 2.6344, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.13374856580888378, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.9060152471092414e-05, |
|
"loss": 2.6904, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.1346227394416216, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.904039029334326e-05, |
|
"loss": 2.6464, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.1354969130743594, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.9020426566207997e-05, |
|
"loss": 2.6811, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1363710867070972, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.900026145705815e-05, |
|
"loss": 2.6346, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.137245260339835, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.897989513495358e-05, |
|
"loss": 2.6762, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.1381194339725728, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.89593277706411e-05, |
|
"loss": 2.6383, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.1389936076053106, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.8938559536552994e-05, |
|
"loss": 2.634, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.1398677812380484, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.891759060680562e-05, |
|
"loss": 2.6626, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14074195487078622, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.8896421157197896e-05, |
|
"loss": 2.664, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.141616128503524, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.887505136520987e-05, |
|
"loss": 2.6787, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.14249030213626182, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.885348141000122e-05, |
|
"loss": 2.6107, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.1433644757689996, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 4.883171147240975e-05, |
|
"loss": 2.6128, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.14423864940173742, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.880974173494984e-05, |
|
"loss": 2.6087, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14511282303447523, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.8787572381811e-05, |
|
"loss": 2.6377, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.14598699666721301, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.876520359885624e-05, |
|
"loss": 2.6326, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.14686117029995083, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.874263557362056e-05, |
|
"loss": 2.6361, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.14773534393268864, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.871986849530934e-05, |
|
"loss": 2.7243, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.14860951756542642, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.869690255479682e-05, |
|
"loss": 2.6845, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.14948369119816424, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.867373794462442e-05, |
|
"loss": 2.6677, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.15035786483090205, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.8650374858999185e-05, |
|
"loss": 2.659, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.15123203846363983, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.862681349379212e-05, |
|
"loss": 2.6327, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.15210621209637765, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.860305404653657e-05, |
|
"loss": 2.6229, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.15298038572911546, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.857909671642656e-05, |
|
"loss": 2.618, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.15385455936185324, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.8554941704315116e-05, |
|
"loss": 2.5778, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.15472873299459106, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.853058921271259e-05, |
|
"loss": 2.5795, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.15560290662732884, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.850603944578494e-05, |
|
"loss": 2.6069, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.15647708026006665, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.848129260935208e-05, |
|
"loss": 2.6211, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.15735125389280447, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.845634891088608e-05, |
|
"loss": 2.601, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15822542752554225, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.8431208559509456e-05, |
|
"loss": 2.6104, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.15909960115828006, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.8405871765993433e-05, |
|
"loss": 2.6695, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.15997377479101788, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.8380338742756157e-05, |
|
"loss": 2.6339, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.16084794842375566, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.835460970386093e-05, |
|
"loss": 2.6176, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.16172212205649347, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.8328684865014386e-05, |
|
"loss": 2.6188, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.16259629568923128, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.830256444356473e-05, |
|
"loss": 2.5651, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.16347046932196907, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.827624865849987e-05, |
|
"loss": 2.6513, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.16434464295470688, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.82497377304456e-05, |
|
"loss": 2.6408, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.1652188165874447, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.822303188166377e-05, |
|
"loss": 2.6039, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.16609299022018248, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.819613133605036e-05, |
|
"loss": 2.6749, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1669671638529203, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.816903631913372e-05, |
|
"loss": 2.602, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.16784133748565808, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.814174705807252e-05, |
|
"loss": 2.5986, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.1687155111183959, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.811426378165398e-05, |
|
"loss": 2.5921, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.1695896847511337, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.808658672029189e-05, |
|
"loss": 2.5958, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.17046385838387149, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.8058716106024705e-05, |
|
"loss": 2.5892, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1713380320166093, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.803065217251357e-05, |
|
"loss": 2.5633, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.1722122056493471, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.800239515504036e-05, |
|
"loss": 2.6577, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.1730863792820849, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.7973945290505766e-05, |
|
"loss": 2.6721, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.1739605529148227, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.794530281742724e-05, |
|
"loss": 2.6837, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.17483472654756052, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.791646797593702e-05, |
|
"loss": 2.5801, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1757089001802983, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.7887441007780123e-05, |
|
"loss": 2.5675, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.17658307381303612, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.7858222156312316e-05, |
|
"loss": 2.6157, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.17745724744577393, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.782881166649808e-05, |
|
"loss": 2.6109, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.17833142107851171, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.779920978490854e-05, |
|
"loss": 2.5524, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.17920559471124953, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.776941675971941e-05, |
|
"loss": 2.6292, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1800797683439873, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.773943284070892e-05, |
|
"loss": 2.5868, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.18095394197672512, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.7709258279255696e-05, |
|
"loss": 2.5811, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.18182811560946294, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.767889332833667e-05, |
|
"loss": 2.6033, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.18270228924220072, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.764833824252498e-05, |
|
"loss": 2.5816, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.18357646287493853, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.7617593277987794e-05, |
|
"loss": 2.6657, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18445063650767635, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.758665869248417e-05, |
|
"loss": 2.5748, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.18532481014041413, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.755553474536294e-05, |
|
"loss": 2.6091, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.18619898377315194, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 2.5747, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.18707315740588976, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.749271981159855e-05, |
|
"loss": 2.6302, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.18794733103862754, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.7461029351582076e-05, |
|
"loss": 2.6072, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.18882150467136535, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.7429150583196976e-05, |
|
"loss": 2.6458, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.18969567830410317, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.739708377370789e-05, |
|
"loss": 2.5746, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.19056985193684095, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.736482919195593e-05, |
|
"loss": 2.5883, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.19144402556957876, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.733238710835648e-05, |
|
"loss": 2.657, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.19231819920231655, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.729975779489689e-05, |
|
"loss": 2.6394, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.19319237283505436, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.7266941525134215e-05, |
|
"loss": 2.6204, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.19406654646779217, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.7233938574192894e-05, |
|
"loss": 2.5254, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.19494072010052996, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.720074921876245e-05, |
|
"loss": 2.5567, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.19581489373326777, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.716737373709521e-05, |
|
"loss": 2.6215, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.19668906736600558, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.713381240900394e-05, |
|
"loss": 2.5763, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.19756324099874337, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.710006551585946e-05, |
|
"loss": 2.6087, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.19843741463148118, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.7066133340588394e-05, |
|
"loss": 2.5327, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.199311588264219, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.703201616767067e-05, |
|
"loss": 2.5569, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.20018576189695678, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.699771428313722e-05, |
|
"loss": 2.5719, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.2010599355296946, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.696322797456757e-05, |
|
"loss": 2.5906, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2019341091624324, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.69285575310874e-05, |
|
"loss": 2.5452, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.20280828279517019, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.689370324336615e-05, |
|
"loss": 2.6078, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.203682456427908, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.685866540361456e-05, |
|
"loss": 2.561, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.20455663006064578, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.682344430558222e-05, |
|
"loss": 2.6126, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.2054308036933836, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.6788040244555145e-05, |
|
"loss": 2.6181, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2063049773261214, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.6752453517353245e-05, |
|
"loss": 2.5554, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2071791509588592, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.6716684422327886e-05, |
|
"loss": 2.5949, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.208053324591597, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.6680733259359346e-05, |
|
"loss": 2.5931, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.20892749822433482, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.6644600329854325e-05, |
|
"loss": 2.5865, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.2098016718570726, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.6608285936743445e-05, |
|
"loss": 2.5658, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.21067584548981041, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.657179038447862e-05, |
|
"loss": 2.5902, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.21155001912254823, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.653511397903063e-05, |
|
"loss": 2.5303, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.212424192755286, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.649825702788643e-05, |
|
"loss": 2.6264, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.21329836638802382, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.6461219840046654e-05, |
|
"loss": 2.5831, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.21417254002076164, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.642400272602302e-05, |
|
"loss": 2.6215, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.21504671365349942, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.638660599783567e-05, |
|
"loss": 2.5877, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.21592088728623723, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.6349029969010644e-05, |
|
"loss": 2.5607, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.21679506091897502, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.631127495457713e-05, |
|
"loss": 2.5615, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.21766923455171283, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.6273341271064965e-05, |
|
"loss": 2.6131, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.21854340818445064, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.6235229236501845e-05, |
|
"loss": 2.6152, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21941758181718843, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.619693917041076e-05, |
|
"loss": 2.5947, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.22029175544992624, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.615847139380728e-05, |
|
"loss": 2.6395, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.22116592908266405, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.611982622919683e-05, |
|
"loss": 2.5855, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.22204010271540184, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.608100400057206e-05, |
|
"loss": 2.5098, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.22291427634813965, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.604200503341004e-05, |
|
"loss": 2.6061, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.22378844998087746, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.6002829654669616e-05, |
|
"loss": 2.5075, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.22466262361361525, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.596347819278861e-05, |
|
"loss": 2.5869, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.22553679724635306, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.5923950977681084e-05, |
|
"loss": 2.586, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.22641097087909087, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.58842483407346e-05, |
|
"loss": 2.5124, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.22728514451182866, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.584437061480739e-05, |
|
"loss": 2.5364, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.22815931814456647, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.58043181342256e-05, |
|
"loss": 2.5939, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.22903349177730425, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.5764091234780504e-05, |
|
"loss": 2.5893, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.22990766541004207, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.572369025372564e-05, |
|
"loss": 2.5496, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.23078183904277988, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.568311552977401e-05, |
|
"loss": 2.6138, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.23165601267551766, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.564236740309525e-05, |
|
"loss": 2.5724, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.23253018630825548, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.560144621531278e-05, |
|
"loss": 2.5762, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.2334043599409933, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.5560352309500886e-05, |
|
"loss": 2.5781, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.23427853357373107, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.551908603018191e-05, |
|
"loss": 2.606, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.2351527072064689, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.547764772332333e-05, |
|
"loss": 2.589, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.2360268808392067, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.5436037736334894e-05, |
|
"loss": 2.6229, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.23690105447194448, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.539425641806562e-05, |
|
"loss": 2.5875, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.2377752281046823, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.535230411880098e-05, |
|
"loss": 2.6023, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.2386494017374201, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.531018119025989e-05, |
|
"loss": 2.5965, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.2395235753701579, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.5267887985591795e-05, |
|
"loss": 2.5359, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.2403977490028957, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 2.5603, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2412719226356335, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.5182792167607155e-05, |
|
"loss": 2.6296, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.2421460962683713, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.513999026771539e-05, |
|
"loss": 2.5896, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.24302026990110911, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.509701951854017e-05, |
|
"loss": 2.5494, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.2438944435338469, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.505388028033888e-05, |
|
"loss": 2.6256, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.2447686171665847, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.501057291478149e-05, |
|
"loss": 2.6245, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.24564279079932252, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.496709778494749e-05, |
|
"loss": 2.5308, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.2465169644320603, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.492345525532288e-05, |
|
"loss": 2.6629, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.24739113806479812, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.487964569179711e-05, |
|
"loss": 2.4932, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.24826531169753593, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.4835669461660004e-05, |
|
"loss": 2.5798, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.24913948533027372, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.479152693359868e-05, |
|
"loss": 2.6232, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.25001365896301153, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.474721847769445e-05, |
|
"loss": 2.5524, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.2508878325957493, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.4702744465419744e-05, |
|
"loss": 2.6093, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.25176200622848716, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.465810526963499e-05, |
|
"loss": 2.5971, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.25263617986122494, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.461330126458544e-05, |
|
"loss": 2.529, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.2535103534939627, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.4568332825898105e-05, |
|
"loss": 2.5475, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25438452712670057, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.452320033057856e-05, |
|
"loss": 2.5431, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.25525870075943835, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.447790415700781e-05, |
|
"loss": 2.5771, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.25613287439217614, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.4432444684939077e-05, |
|
"loss": 2.6166, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.257007048024914, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.438682229549466e-05, |
|
"loss": 2.5507, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.25788122165765176, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.434103737116272e-05, |
|
"loss": 2.5351, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.25875539529038954, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.429509029579405e-05, |
|
"loss": 2.6678, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.25962956892312733, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.4248981454598935e-05, |
|
"loss": 2.5859, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.26050374255586517, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.420271123414381e-05, |
|
"loss": 2.5215, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.26137791618860295, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.415628002234812e-05, |
|
"loss": 2.5394, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.26225208982134074, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.4109688208481015e-05, |
|
"loss": 2.6149, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2631262634540786, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.406293618315809e-05, |
|
"loss": 2.5216, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.26400043708681636, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.4016024338338114e-05, |
|
"loss": 2.5536, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.26487461071955415, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.3968953067319777e-05, |
|
"loss": 2.5415, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.265748784352292, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.3921722764738326e-05, |
|
"loss": 2.5575, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.2666229579850298, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.387433382656232e-05, |
|
"loss": 2.4776, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.26749713161776756, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 2.5806, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.2683713052505054, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.377908163394734e-05, |
|
"loss": 2.5854, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.2692454788832432, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.373121917808196e-05, |
|
"loss": 2.5241, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.27011965251598097, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.368319968376253e-05, |
|
"loss": 2.4803, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.2709938261487188, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.363502355357399e-05, |
|
"loss": 2.5509, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2718679997814566, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.358669119141453e-05, |
|
"loss": 2.5421, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.2727421734141944, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.3538203002492104e-05, |
|
"loss": 2.5374, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.2736163470469322, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.348955939332111e-05, |
|
"loss": 2.5822, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.27449052067967, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.344076077171897e-05, |
|
"loss": 2.5644, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.2753646943124078, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.339180754680267e-05, |
|
"loss": 2.6278, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2762388679451456, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.3342700128985345e-05, |
|
"loss": 2.577, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.2771130415778834, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.3293438929972894e-05, |
|
"loss": 2.5167, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.2779872152106212, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.324402436276046e-05, |
|
"loss": 2.5297, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.27886138884335904, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.319445684162897e-05, |
|
"loss": 2.58, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.2797355624760968, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.3144736782141725e-05, |
|
"loss": 2.5503, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2806097361088346, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.309486460114085e-05, |
|
"loss": 2.4978, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.28148390974157245, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.3044840716743824e-05, |
|
"loss": 2.5319, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.28235808337431023, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.299466554833997e-05, |
|
"loss": 2.5353, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.283232257007048, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.294433951658697e-05, |
|
"loss": 2.6071, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.2841064306397858, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.289386304340727e-05, |
|
"loss": 2.6526, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.28498060427252364, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.284323655198462e-05, |
|
"loss": 2.553, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.2858547779052614, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.2792460466760485e-05, |
|
"loss": 2.5924, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.2867289515379992, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.274153521343046e-05, |
|
"loss": 2.5093, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.28760312517073705, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.269046121894077e-05, |
|
"loss": 2.5962, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.28847729880347484, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.2639238911484633e-05, |
|
"loss": 2.5287, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2893514724362126, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.2587868720498705e-05, |
|
"loss": 2.5151, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.29022564606895046, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.253635107665945e-05, |
|
"loss": 2.5844, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.29109981970168824, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.2484686411879554e-05, |
|
"loss": 2.5545, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.29197399333442603, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.2432875159304295e-05, |
|
"loss": 2.5029, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.29284816696716387, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.2380917753307904e-05, |
|
"loss": 2.5439, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.29372234059990165, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.232881462948994e-05, |
|
"loss": 2.5714, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.29459651423263944, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.227656622467162e-05, |
|
"loss": 2.5515, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.2954706878653773, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.222417297689217e-05, |
|
"loss": 2.5615, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.29634486149811506, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.217163532540514e-05, |
|
"loss": 2.57, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.29721903513085285, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.211895371067474e-05, |
|
"loss": 2.5805, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2980932087635907, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.206612857437213e-05, |
|
"loss": 2.6419, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.2989673823963285, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.2013160359371736e-05, |
|
"loss": 2.5025, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.29984155602906626, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.19600495097475e-05, |
|
"loss": 2.4513, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.3007157296618041, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.1906796470769195e-05, |
|
"loss": 2.6036, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.3015899032945419, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.185340168889868e-05, |
|
"loss": 2.5366, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.30246407692727967, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.179986561178617e-05, |
|
"loss": 2.539, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.3033382505600175, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.1746188688266444e-05, |
|
"loss": 2.5152, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.3042124241927553, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.16923713683551e-05, |
|
"loss": 2.6098, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.3050865978254931, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.163841410324482e-05, |
|
"loss": 2.5229, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.3059607714582309, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.158431734530154e-05, |
|
"loss": 2.5009, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3068349450909687, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.153008154806067e-05, |
|
"loss": 2.4947, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.3077091187237065, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.1475707166223296e-05, |
|
"loss": 2.5652, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.30858329235644427, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.142119465565238e-05, |
|
"loss": 2.5643, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.3094574659891821, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.13665444733689e-05, |
|
"loss": 2.5575, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.3103316396219199, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.131175707754807e-05, |
|
"loss": 2.4748, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.3112058132546577, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.125683292751546e-05, |
|
"loss": 2.53, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.3120799868873955, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.120177248374315e-05, |
|
"loss": 2.5582, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.3129541605201333, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.114657620784589e-05, |
|
"loss": 2.5842, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.3138283341528711, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.109124456257721e-05, |
|
"loss": 2.5279, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.31470250778560893, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.103577801182557e-05, |
|
"loss": 2.5657, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3155766814183467, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.098017702061039e-05, |
|
"loss": 2.5622, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.3164508550510845, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.0924442055078276e-05, |
|
"loss": 2.5328, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.31732502868382234, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.0868573582499004e-05, |
|
"loss": 2.5514, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.3181992023165601, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.0812572071261654e-05, |
|
"loss": 2.5575, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.3190733759492979, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.07564379908707e-05, |
|
"loss": 2.5688, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.31994754958203575, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.070017181194199e-05, |
|
"loss": 2.5032, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.32082172321477354, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.0643774006198907e-05, |
|
"loss": 2.5319, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.3216958968475113, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 2.5558, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.32257007048024916, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.053058540667676e-05, |
|
"loss": 2.5876, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.32344424411298694, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.0473795561846215e-05, |
|
"loss": 2.5354, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32431841774572473, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.0416875988090375e-05, |
|
"loss": 2.531, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.32519259137846257, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.035982716261053e-05, |
|
"loss": 2.5584, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.32606676501120035, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.030264956369157e-05, |
|
"loss": 2.4785, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.32694093864393814, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.0245343670698025e-05, |
|
"loss": 2.549, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.327815112276676, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.018790996406998e-05, |
|
"loss": 2.4917, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.32868928590941376, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.01303489253191e-05, |
|
"loss": 2.4882, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.32956345954215155, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.0072661037024596e-05, |
|
"loss": 2.5832, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.3304376331748894, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.0014846782829104e-05, |
|
"loss": 2.5667, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.3313118068076272, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.9956906647434736e-05, |
|
"loss": 2.511, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.33218598044036496, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.989884111659893e-05, |
|
"loss": 2.5146, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33306015407310274, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.984065067713043e-05, |
|
"loss": 2.4662, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.3339343277058406, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.978233581688518e-05, |
|
"loss": 2.5807, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.33480850133857837, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.9723897024762255e-05, |
|
"loss": 2.5095, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.33568267497131615, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.9665334790699714e-05, |
|
"loss": 2.5084, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.336556848604054, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.960664960567057e-05, |
|
"loss": 2.5447, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3374310222367918, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.95478419616786e-05, |
|
"loss": 2.5544, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.33830519586952956, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.948891235175425e-05, |
|
"loss": 2.5338, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.3391793695022674, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.942986126995052e-05, |
|
"loss": 2.5239, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.3400535431350052, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.937068921133879e-05, |
|
"loss": 2.5493, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.34092771676774297, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.931139667200469e-05, |
|
"loss": 2.4874, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3418018904004808, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.9251984149043917e-05, |
|
"loss": 2.5066, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.3426760640332186, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.919245214055812e-05, |
|
"loss": 2.5081, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.3435502376659564, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.913280114565066e-05, |
|
"loss": 2.5536, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.3444244112986942, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.9073031664422444e-05, |
|
"loss": 2.5335, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.345298584931432, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.901314419796778e-05, |
|
"loss": 2.4885, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3461727585641698, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.8953139248370116e-05, |
|
"loss": 2.5373, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.34704693219690763, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.889301731869784e-05, |
|
"loss": 2.563, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.3479211058296454, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.883277891300011e-05, |
|
"loss": 2.5089, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.3487952794623832, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.8772424536302564e-05, |
|
"loss": 2.5444, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.34966945309512104, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.8711954694603126e-05, |
|
"loss": 2.4677, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3505436267278588, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.865136989486776e-05, |
|
"loss": 2.4907, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.3514178003605966, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.8590670645026195e-05, |
|
"loss": 2.4889, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.35229197399333445, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 3.85298574539677e-05, |
|
"loss": 2.5175, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.35316614762607224, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.84689308315368e-05, |
|
"loss": 2.555, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.35404032125881, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 3.8407891288529004e-05, |
|
"loss": 2.4927, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.35491449489154786, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.834673933668651e-05, |
|
"loss": 2.4928, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.35578866852428565, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.828547548869396e-05, |
|
"loss": 2.5426, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.35666284215702343, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.822410025817406e-05, |
|
"loss": 2.5477, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.3575370157897612, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.8162614159683374e-05, |
|
"loss": 2.5466, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.35841118942249905, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.8101017708707906e-05, |
|
"loss": 2.5304, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.35928536305523684, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 3.8039311421658887e-05, |
|
"loss": 2.556, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.3601595366879746, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.797749581586835e-05, |
|
"loss": 2.5913, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.36103371032071246, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.7915571409584836e-05, |
|
"loss": 2.5172, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.36190788395345025, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.7853538721969064e-05, |
|
"loss": 2.4756, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.36278205758618803, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.779139827308956e-05, |
|
"loss": 2.5278, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3636562312189259, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.7729150583918264e-05, |
|
"loss": 2.4925, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.36453040485166366, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.766679617632624e-05, |
|
"loss": 2.5038, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.36540457848440144, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.760433557307922e-05, |
|
"loss": 2.518, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.3662787521171393, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.754176929783327e-05, |
|
"loss": 2.554, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.36715292574987707, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.74790978751304e-05, |
|
"loss": 2.5062, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.36802709938261485, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.7416321830394144e-05, |
|
"loss": 2.5755, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.3689012730153527, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.735344168992515e-05, |
|
"loss": 2.5203, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.3697754466480905, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.7290457980896795e-05, |
|
"loss": 2.4996, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.37064962028082826, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.722737123135075e-05, |
|
"loss": 2.5625, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.3715237939135661, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.716418197019257e-05, |
|
"loss": 2.5665, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3723979675463039, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.710089072718722e-05, |
|
"loss": 2.5188, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.37327214117904167, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.7037498032954664e-05, |
|
"loss": 2.5166, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.3741463148117795, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.697400441896543e-05, |
|
"loss": 2.5166, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3750204884445173, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.691041041753613e-05, |
|
"loss": 2.5436, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.3758946620772551, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.6846716561824965e-05, |
|
"loss": 2.5019, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3767688357099929, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.678292338582735e-05, |
|
"loss": 2.5575, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.3776430093427307, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.671903142437134e-05, |
|
"loss": 2.5161, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.3785171829754685, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.6655041213113184e-05, |
|
"loss": 2.5285, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.37939135660820633, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.659095328853288e-05, |
|
"loss": 2.4936, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.3802655302409441, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.652676818792958e-05, |
|
"loss": 2.5238, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3811397038736819, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.646248644941716e-05, |
|
"loss": 2.4821, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.3820138775064197, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.6398108611919696e-05, |
|
"loss": 2.5309, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.3828880511391575, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.633363521516693e-05, |
|
"loss": 2.508, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.3837622247718953, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.626906679968974e-05, |
|
"loss": 2.5292, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.3846363984046331, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.6204403906815655e-05, |
|
"loss": 2.5175, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.38551057203737094, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.613964707866424e-05, |
|
"loss": 2.5478, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.3863847456701087, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.607479685814261e-05, |
|
"loss": 2.5442, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.3872589193028465, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.600985378894086e-05, |
|
"loss": 2.5198, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.38813309293558435, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 3.594481841552753e-05, |
|
"loss": 2.5001, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.38900726656832213, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.5879691283144964e-05, |
|
"loss": 2.53, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3898814402010599, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.5814472937804865e-05, |
|
"loss": 2.5589, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.39075561383379775, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.574916392628359e-05, |
|
"loss": 2.5402, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.39162978746653554, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.5683764796117634e-05, |
|
"loss": 2.48, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.3925039610992733, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.561827609559905e-05, |
|
"loss": 2.5504, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.39337813473201116, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.55526983737708e-05, |
|
"loss": 2.5011, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.39425230836474895, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.54870321804222e-05, |
|
"loss": 2.4815, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.39512648199748673, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.5421278066084276e-05, |
|
"loss": 2.537, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.3960006556302246, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.535543658202518e-05, |
|
"loss": 2.5111, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.39687482926296236, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.528950828024555e-05, |
|
"loss": 2.4883, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.39774900289570014, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.522349371347387e-05, |
|
"loss": 2.4712, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.398623176528438, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.515739343516188e-05, |
|
"loss": 2.4872, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.39949735016117577, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.509120799947987e-05, |
|
"loss": 2.5711, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.40037152379391355, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.50249379613121e-05, |
|
"loss": 2.5285, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.4012456974266514, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.49585838762521e-05, |
|
"loss": 2.5139, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.4021198710593892, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.489214630059806e-05, |
|
"loss": 2.5236, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.40299404469212696, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.4825625791348096e-05, |
|
"loss": 2.5336, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.4038682183248648, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.475902290619565e-05, |
|
"loss": 2.4917, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.4047423919576026, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.469233820352477e-05, |
|
"loss": 2.5423, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.40561656559034037, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.462557224240545e-05, |
|
"loss": 2.4924, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.40649073922307816, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.455872558258895e-05, |
|
"loss": 2.5107, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.407364912855816, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.449179878450308e-05, |
|
"loss": 2.5197, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.4082390864885538, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.442479240924749e-05, |
|
"loss": 2.4901, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.40911326012129157, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.4357707018589036e-05, |
|
"loss": 2.4912, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.4099874337540294, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.429054317495697e-05, |
|
"loss": 2.4534, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.4108616073867672, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.4223301441438306e-05, |
|
"loss": 2.4801, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.411735781019505, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.415598238177307e-05, |
|
"loss": 2.4984, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.4126099546522428, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.408858656034957e-05, |
|
"loss": 2.5402, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4134841282849806, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.4021114542199664e-05, |
|
"loss": 2.5232, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.4143583019177184, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.395356689299401e-05, |
|
"loss": 2.5168, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.4152324755504562, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.3885944179037395e-05, |
|
"loss": 2.5563, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.416106649183194, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.381824696726386e-05, |
|
"loss": 2.5104, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.4169808228159318, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.3750475825232074e-05, |
|
"loss": 2.5002, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.41785499644866964, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.3682631321120504e-05, |
|
"loss": 2.5262, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.4187291700814074, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.361471402372267e-05, |
|
"loss": 2.5159, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.4196033437141452, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.3546724502442354e-05, |
|
"loss": 2.455, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.42047751734688305, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.347866332728889e-05, |
|
"loss": 2.4299, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.42135169097962083, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.341053106887229e-05, |
|
"loss": 2.5159, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.4222258646123586, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.3342328298398565e-05, |
|
"loss": 2.4763, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.42310003824509645, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.3274055587664856e-05, |
|
"loss": 2.4768, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.42397421187783424, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.320571350905466e-05, |
|
"loss": 2.5295, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.424848385510572, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.313730263553306e-05, |
|
"loss": 2.4913, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.42572255914330986, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.3068823540641886e-05, |
|
"loss": 2.5096, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.42659673277604765, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.300027679849492e-05, |
|
"loss": 2.5255, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.42747090640878543, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.2931662983773106e-05, |
|
"loss": 2.4564, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.4283450800415233, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.286298267171969e-05, |
|
"loss": 2.5294, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.42921925367426106, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.2794236438135405e-05, |
|
"loss": 2.5117, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.43009342730699884, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 2.4564, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.43096760093973663, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.265654851233579e-05, |
|
"loss": 2.4361, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.43184177457247447, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.258760797446598e-05, |
|
"loss": 2.5215, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.43271594820521225, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.251860382374668e-05, |
|
"loss": 2.4979, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.43359012183795004, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.244953663869365e-05, |
|
"loss": 2.5005, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.4344642954706879, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.238040699835106e-05, |
|
"loss": 2.5365, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.43533846910342566, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.231121548228676e-05, |
|
"loss": 2.5102, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.43621264273616345, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.2241962670587314e-05, |
|
"loss": 2.4999, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.4370868163689013, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.2172649143853176e-05, |
|
"loss": 2.4631, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1143, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 4.571566411087872e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
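The object above is a Hugging Face Trainer checkpoint state (trainer_state.json) covering the first 500 of 1143 steps. A minimal sketch of how such a file can be inspected, assuming an installed matplotlib and a hypothetical path checkpoint-500/trainer_state.json; this is illustrative reading code, not part of the checkpoint itself:

import json
import matplotlib.pyplot as plt

# Hypothetical location; point this at wherever the checkpoint was saved.
with open("checkpoint-500/trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss; eval entries in
# log_history may use other keys such as "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"{state['global_step']} of {state['max_steps']} steps")
plt.show()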