|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8741736327378026, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008741736327378026, |
|
"grad_norm": 19.75, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 4.7931, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0017483472654756052, |
|
"grad_norm": 21.75, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 4.937, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0026225208982134074, |
|
"grad_norm": 20.625, |
|
"learning_rate": 2.586206896551724e-06, |
|
"loss": 5.0176, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0034966945309512104, |
|
"grad_norm": 19.875, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 4.8236, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.004370868163689013, |
|
"grad_norm": 16.875, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 4.761, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005245041796426815, |
|
"grad_norm": 16.125, |
|
"learning_rate": 5.172413793103448e-06, |
|
"loss": 4.9055, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.006119215429164618, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 4.6787, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006993389061902421, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 4.6797, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.007867562694640224, |
|
"grad_norm": 19.0, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 4.6406, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.008741736327378026, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 4.5986, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009615909960115828, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 4.5762, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01049008359285363, |
|
"grad_norm": 6.875, |
|
"learning_rate": 1.0344827586206897e-05, |
|
"loss": 4.5882, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.011364257225591434, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.1206896551724138e-05, |
|
"loss": 4.3547, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.012238430858329236, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.206896551724138e-05, |
|
"loss": 4.2879, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.013112604491067038, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.2931034482758622e-05, |
|
"loss": 4.3241, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013986778123804841, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 4.386, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.014860951756542643, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 1.4655172413793103e-05, |
|
"loss": 4.266, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.015735125389280447, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.5517241379310346e-05, |
|
"loss": 4.2994, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01660929902201825, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.6379310344827585e-05, |
|
"loss": 4.1198, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.01748347265475605, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 4.1751, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.018357646287493853, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.810344827586207e-05, |
|
"loss": 3.9965, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.019231819920231655, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.896551724137931e-05, |
|
"loss": 3.9684, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.020105993552969458, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9827586206896554e-05, |
|
"loss": 3.9812, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02098016718570726, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.0689655172413793e-05, |
|
"loss": 3.9997, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.021854340818445065, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.1551724137931033e-05, |
|
"loss": 3.9327, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.022728514451182867, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 2.2413793103448276e-05, |
|
"loss": 3.937, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.02360268808392067, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.327586206896552e-05, |
|
"loss": 3.8923, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02447686171665847, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.413793103448276e-05, |
|
"loss": 3.8121, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.025351035349396273, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 3.8329, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.026225208982134075, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 3.7719, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.027099382614871877, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 2.672413793103448e-05, |
|
"loss": 3.8168, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.027973556247609683, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 3.6792, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.028847729880347485, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.844827586206897e-05, |
|
"loss": 3.6723, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.029721903513085287, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 2.9310344827586206e-05, |
|
"loss": 3.6362, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.03059607714582309, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.017241379310345e-05, |
|
"loss": 3.6452, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.031470250778560895, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 3.103448275862069e-05, |
|
"loss": 3.5118, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03234442441129869, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 3.1896551724137935e-05, |
|
"loss": 3.4852, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0332185980440365, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 3.275862068965517e-05, |
|
"loss": 3.3851, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0340927716767743, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.3620689655172414e-05, |
|
"loss": 3.3676, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0349669453095121, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 3.4482758620689657e-05, |
|
"loss": 3.4513, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0358411189422499, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.53448275862069e-05, |
|
"loss": 3.3572, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03671529257498771, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.620689655172414e-05, |
|
"loss": 3.229, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03758946620772551, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.7068965517241385e-05, |
|
"loss": 3.2683, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03846363984046331, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.793103448275862e-05, |
|
"loss": 3.2209, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.039337813473201116, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.8793103448275865e-05, |
|
"loss": 3.3169, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.040211987105938915, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.965517241379311e-05, |
|
"loss": 3.2609, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.04108616073867672, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.0517241379310344e-05, |
|
"loss": 3.2348, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.04196033437141452, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 3.2157, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.042834508004152325, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.224137931034483e-05, |
|
"loss": 3.118, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.04370868163689013, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.3103448275862066e-05, |
|
"loss": 3.1812, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04458285526962793, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.396551724137931e-05, |
|
"loss": 3.1164, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.045457028902365734, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.482758620689655e-05, |
|
"loss": 3.1016, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.04633120253510353, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.5689655172413794e-05, |
|
"loss": 3.0119, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.04720537616784134, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.655172413793104e-05, |
|
"loss": 3.0376, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.04807954980057914, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.741379310344828e-05, |
|
"loss": 3.0525, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04895372343331694, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.827586206896552e-05, |
|
"loss": 3.0417, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.04982789706605475, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.913793103448276e-05, |
|
"loss": 2.9921, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.050702070698792547, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 5e-05, |
|
"loss": 2.9874, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.05157624433153035, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.9999895202727756e-05, |
|
"loss": 2.9822, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.05245041796426815, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.9999580811789614e-05, |
|
"loss": 2.9278, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.053324591597005956, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.999905682982135e-05, |
|
"loss": 2.9424, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.054198765229743755, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.999832326121594e-05, |
|
"loss": 2.8771, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.05507293886248156, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.999738011212344e-05, |
|
"loss": 2.9132, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.055947112495219366, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.999622739045101e-05, |
|
"loss": 2.9479, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.056821286127957164, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.999486510586282e-05, |
|
"loss": 2.948, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05769545976069497, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.9993293269779975e-05, |
|
"loss": 2.948, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.05856963339343277, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.9991511895380396e-05, |
|
"loss": 2.9111, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.059443807026170574, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.998952099759874e-05, |
|
"loss": 2.9496, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.06031798065890837, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.998732059312625e-05, |
|
"loss": 2.8007, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.06119215429164618, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.998491070041066e-05, |
|
"loss": 2.8642, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.062066327924383984, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.998229133965596e-05, |
|
"loss": 2.8762, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.06294050155712179, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.997946253282231e-05, |
|
"loss": 2.8961, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.06381467518985959, |
|
"grad_norm": 1.25, |
|
"learning_rate": 4.9976424303625815e-05, |
|
"loss": 2.825, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.06468884882259739, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.997317667753831e-05, |
|
"loss": 2.8532, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.06556302245533518, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.9969719681787196e-05, |
|
"loss": 2.9245, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.066437196088073, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.9966053345355174e-05, |
|
"loss": 2.7549, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0673113697208108, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.9962177698979995e-05, |
|
"loss": 2.8295, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0681855433535486, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.995809277515424e-05, |
|
"loss": 2.7792, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0690597169862864, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.9953798608125025e-05, |
|
"loss": 2.7635, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0699338906190242, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 4.99492952338937e-05, |
|
"loss": 2.8317, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.070808064251762, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.994458269021557e-05, |
|
"loss": 2.7627, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0716822378844998, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.993966101659958e-05, |
|
"loss": 2.8273, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.07255641151723762, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.993453025430797e-05, |
|
"loss": 2.8587, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.07343058514997541, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.992919044635592e-05, |
|
"loss": 2.8023, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.07430475878271321, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.9923641637511226e-05, |
|
"loss": 2.6944, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07517893241545102, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.991788387429388e-05, |
|
"loss": 2.7955, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.07605310604818882, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.9911917204975724e-05, |
|
"loss": 2.8184, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.07692727968092662, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.9905741679580007e-05, |
|
"loss": 2.8002, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.07780145331366442, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.989935734988098e-05, |
|
"loss": 2.7749, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.07867562694640223, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.989276426940348e-05, |
|
"loss": 2.8351, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07954980057914003, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.988596249342244e-05, |
|
"loss": 2.7638, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.08042397421187783, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.987895207896248e-05, |
|
"loss": 2.7492, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.08129814784461564, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.987173308479738e-05, |
|
"loss": 2.7668, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.08217232147735344, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.9864305571449616e-05, |
|
"loss": 2.7527, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.08304649511009124, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.985666960118983e-05, |
|
"loss": 2.7963, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08392066874282904, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.984882523803634e-05, |
|
"loss": 2.7924, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.08479484237556685, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.9840772547754566e-05, |
|
"loss": 2.763, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.08566901600830465, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.983251159785651e-05, |
|
"loss": 2.7398, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.08654318964104245, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.982404245760018e-05, |
|
"loss": 2.7528, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.08741736327378026, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.9815365197988986e-05, |
|
"loss": 2.8205, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08829153690651806, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9806479891771195e-05, |
|
"loss": 2.7228, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.08916571053925586, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.9797386613439265e-05, |
|
"loss": 2.7599, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.09003988417199366, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.978808543922925e-05, |
|
"loss": 2.7388, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.09091405780473147, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.9778576447120184e-05, |
|
"loss": 2.7801, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.09178823143746927, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 4.976885971683337e-05, |
|
"loss": 2.656, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09266240507020707, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.9758935329831754e-05, |
|
"loss": 2.763, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.09353657870294488, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.974880336931923e-05, |
|
"loss": 2.7975, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.09441075233568268, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.9738463920239955e-05, |
|
"loss": 2.7029, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.09528492596842048, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.972791706927759e-05, |
|
"loss": 2.689, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.09615909960115827, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9717162904854664e-05, |
|
"loss": 2.7322, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09703327323389609, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9706201517131725e-05, |
|
"loss": 2.778, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.09790744686663388, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.9695032998006655e-05, |
|
"loss": 2.8284, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.09878162049937168, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9683657441113884e-05, |
|
"loss": 2.71, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0996557941321095, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.967207494182361e-05, |
|
"loss": 2.6782, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.1005299677648473, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.966028559724096e-05, |
|
"loss": 2.706, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.10140414139758509, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.964828950620524e-05, |
|
"loss": 2.7667, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.10227831503032289, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.963608676928905e-05, |
|
"loss": 2.685, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.1031524886630607, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.962367748879748e-05, |
|
"loss": 2.6407, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1040266622957985, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.961106176876723e-05, |
|
"loss": 2.662, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.1049008359285363, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.959823971496574e-05, |
|
"loss": 2.7101, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.10577500956127411, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.958521143489032e-05, |
|
"loss": 2.7607, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.10664918319401191, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.9571977037767217e-05, |
|
"loss": 2.6531, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.10752335682674971, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.955853663455072e-05, |
|
"loss": 2.6706, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.10839753045948751, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.954489033792227e-05, |
|
"loss": 2.6516, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.10927170409222532, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.95310382622894e-05, |
|
"loss": 2.6962, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.11014587772496312, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.951698052378492e-05, |
|
"loss": 2.702, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.11102005135770092, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.950271724026582e-05, |
|
"loss": 2.6833, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.11189422499043873, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.948824853131236e-05, |
|
"loss": 2.691, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.11276839862317653, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.947357451822706e-05, |
|
"loss": 2.64, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.11364257225591433, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.945869532403362e-05, |
|
"loss": 2.6507, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11451674588865213, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.944361107347597e-05, |
|
"loss": 2.7446, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.11539091952138994, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.942832189301716e-05, |
|
"loss": 2.6651, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.11626509315412774, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.941282791083836e-05, |
|
"loss": 2.6495, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.11713926678686554, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9397129256837724e-05, |
|
"loss": 2.6474, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.11801344041960335, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.938122606262936e-05, |
|
"loss": 2.6893, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11888761405234115, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.936511846154215e-05, |
|
"loss": 2.6667, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.11976178768507895, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.934880658861872e-05, |
|
"loss": 2.7114, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.12063596131781675, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.933229058061425e-05, |
|
"loss": 2.6641, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.12151013495055456, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9315570575995364e-05, |
|
"loss": 2.7359, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.12238430858329236, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.92986467149389e-05, |
|
"loss": 2.6406, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12325848221603015, |
|
"grad_norm": 1.0, |
|
"learning_rate": 4.9281519139330846e-05, |
|
"loss": 2.6395, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.12413265584876797, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.926418799276504e-05, |
|
"loss": 2.664, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.12500682948150577, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.924665342054204e-05, |
|
"loss": 2.6725, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.12588100311424358, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.922891556966788e-05, |
|
"loss": 2.6244, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.12675517674698136, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.921097458885282e-05, |
|
"loss": 2.6786, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12762935037971918, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.9192830628510126e-05, |
|
"loss": 2.7084, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.128503524012457, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.9174483840754815e-05, |
|
"loss": 2.688, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.12937769764519477, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.9155934379402335e-05, |
|
"loss": 2.6582, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.13025187127793258, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.9137182399967343e-05, |
|
"loss": 2.6099, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.13112604491067037, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.911822805966232e-05, |
|
"loss": 2.6315, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13200021854340818, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.909907151739633e-05, |
|
"loss": 2.6418, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.132874392176146, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.907971293377365e-05, |
|
"loss": 2.6344, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.13374856580888378, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.9060152471092414e-05, |
|
"loss": 2.6904, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.1346227394416216, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.904039029334326e-05, |
|
"loss": 2.6464, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.1354969130743594, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.9020426566207997e-05, |
|
"loss": 2.6811, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1363710867070972, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.900026145705815e-05, |
|
"loss": 2.6346, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.137245260339835, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.897989513495358e-05, |
|
"loss": 2.6762, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.1381194339725728, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.89593277706411e-05, |
|
"loss": 2.6383, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.1389936076053106, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.8938559536552994e-05, |
|
"loss": 2.634, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.1398677812380484, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.891759060680562e-05, |
|
"loss": 2.6626, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14074195487078622, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.8896421157197896e-05, |
|
"loss": 2.664, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.141616128503524, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.887505136520987e-05, |
|
"loss": 2.6787, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.14249030213626182, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.885348141000122e-05, |
|
"loss": 2.6107, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.1433644757689996, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 4.883171147240975e-05, |
|
"loss": 2.6128, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.14423864940173742, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.880974173494984e-05, |
|
"loss": 2.6087, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14511282303447523, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.8787572381811e-05, |
|
"loss": 2.6377, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.14598699666721301, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.876520359885624e-05, |
|
"loss": 2.6326, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.14686117029995083, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.874263557362056e-05, |
|
"loss": 2.6361, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.14773534393268864, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.871986849530934e-05, |
|
"loss": 2.7243, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.14860951756542642, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.869690255479682e-05, |
|
"loss": 2.6845, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.14948369119816424, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.867373794462442e-05, |
|
"loss": 2.6677, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.15035786483090205, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.8650374858999185e-05, |
|
"loss": 2.659, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.15123203846363983, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.862681349379212e-05, |
|
"loss": 2.6327, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.15210621209637765, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.860305404653657e-05, |
|
"loss": 2.6229, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.15298038572911546, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.857909671642656e-05, |
|
"loss": 2.618, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.15385455936185324, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.8554941704315116e-05, |
|
"loss": 2.5778, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.15472873299459106, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.853058921271259e-05, |
|
"loss": 2.5795, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.15560290662732884, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.850603944578494e-05, |
|
"loss": 2.6069, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.15647708026006665, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.848129260935208e-05, |
|
"loss": 2.6211, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.15735125389280447, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.845634891088608e-05, |
|
"loss": 2.601, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15822542752554225, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.8431208559509456e-05, |
|
"loss": 2.6104, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.15909960115828006, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.8405871765993433e-05, |
|
"loss": 2.6695, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.15997377479101788, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.8380338742756157e-05, |
|
"loss": 2.6339, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.16084794842375566, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.835460970386093e-05, |
|
"loss": 2.6176, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.16172212205649347, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.8328684865014386e-05, |
|
"loss": 2.6188, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.16259629568923128, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.830256444356473e-05, |
|
"loss": 2.5651, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.16347046932196907, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.827624865849987e-05, |
|
"loss": 2.6513, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.16434464295470688, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.82497377304456e-05, |
|
"loss": 2.6408, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.1652188165874447, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.822303188166377e-05, |
|
"loss": 2.6039, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.16609299022018248, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.819613133605036e-05, |
|
"loss": 2.6749, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1669671638529203, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.816903631913372e-05, |
|
"loss": 2.602, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.16784133748565808, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.814174705807252e-05, |
|
"loss": 2.5986, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.1687155111183959, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.811426378165398e-05, |
|
"loss": 2.5921, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.1695896847511337, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.808658672029189e-05, |
|
"loss": 2.5958, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.17046385838387149, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.8058716106024705e-05, |
|
"loss": 2.5892, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1713380320166093, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.803065217251357e-05, |
|
"loss": 2.5633, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.1722122056493471, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.800239515504036e-05, |
|
"loss": 2.6577, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.1730863792820849, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.7973945290505766e-05, |
|
"loss": 2.6721, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.1739605529148227, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.794530281742724e-05, |
|
"loss": 2.6837, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.17483472654756052, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.791646797593702e-05, |
|
"loss": 2.5801, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1757089001802983, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.7887441007780123e-05, |
|
"loss": 2.5675, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.17658307381303612, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.7858222156312316e-05, |
|
"loss": 2.6157, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.17745724744577393, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.782881166649808e-05, |
|
"loss": 2.6109, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.17833142107851171, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.779920978490854e-05, |
|
"loss": 2.5524, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.17920559471124953, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.776941675971941e-05, |
|
"loss": 2.6292, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1800797683439873, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.773943284070892e-05, |
|
"loss": 2.5868, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.18095394197672512, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.7709258279255696e-05, |
|
"loss": 2.5811, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.18182811560946294, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.767889332833667e-05, |
|
"loss": 2.6033, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.18270228924220072, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.764833824252498e-05, |
|
"loss": 2.5816, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.18357646287493853, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.7617593277987794e-05, |
|
"loss": 2.6657, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18445063650767635, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.758665869248417e-05, |
|
"loss": 2.5748, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.18532481014041413, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.755553474536294e-05, |
|
"loss": 2.6091, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.18619898377315194, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 2.5747, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.18707315740588976, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.749271981159855e-05, |
|
"loss": 2.6302, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.18794733103862754, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.7461029351582076e-05, |
|
"loss": 2.6072, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.18882150467136535, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.7429150583196976e-05, |
|
"loss": 2.6458, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.18969567830410317, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.739708377370789e-05, |
|
"loss": 2.5746, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.19056985193684095, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.736482919195593e-05, |
|
"loss": 2.5883, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.19144402556957876, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.733238710835648e-05, |
|
"loss": 2.657, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.19231819920231655, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.729975779489689e-05, |
|
"loss": 2.6394, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.19319237283505436, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.7266941525134215e-05, |
|
"loss": 2.6204, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.19406654646779217, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.7233938574192894e-05, |
|
"loss": 2.5254, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.19494072010052996, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.720074921876245e-05, |
|
"loss": 2.5567, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.19581489373326777, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.716737373709521e-05, |
|
"loss": 2.6215, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.19668906736600558, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.713381240900394e-05, |
|
"loss": 2.5763, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.19756324099874337, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.710006551585946e-05, |
|
"loss": 2.6087, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.19843741463148118, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.7066133340588394e-05, |
|
"loss": 2.5327, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.199311588264219, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.703201616767067e-05, |
|
"loss": 2.5569, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.20018576189695678, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.699771428313722e-05, |
|
"loss": 2.5719, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.2010599355296946, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.696322797456757e-05, |
|
"loss": 2.5906, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2019341091624324, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.69285575310874e-05, |
|
"loss": 2.5452, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.20280828279517019, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.689370324336615e-05, |
|
"loss": 2.6078, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.203682456427908, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.685866540361456e-05, |
|
"loss": 2.561, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.20455663006064578, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.682344430558222e-05, |
|
"loss": 2.6126, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.2054308036933836, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.6788040244555145e-05, |
|
"loss": 2.6181, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2063049773261214, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.6752453517353245e-05, |
|
"loss": 2.5554, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2071791509588592, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.6716684422327886e-05, |
|
"loss": 2.5949, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.208053324591597, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.6680733259359346e-05, |
|
"loss": 2.5931, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.20892749822433482, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.6644600329854325e-05, |
|
"loss": 2.5865, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.2098016718570726, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.6608285936743445e-05, |
|
"loss": 2.5658, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.21067584548981041, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.657179038447862e-05, |
|
"loss": 2.5902, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.21155001912254823, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.653511397903063e-05, |
|
"loss": 2.5303, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.212424192755286, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.649825702788643e-05, |
|
"loss": 2.6264, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.21329836638802382, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.6461219840046654e-05, |
|
"loss": 2.5831, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.21417254002076164, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.642400272602302e-05, |
|
"loss": 2.6215, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.21504671365349942, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.638660599783567e-05, |
|
"loss": 2.5877, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.21592088728623723, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.6349029969010644e-05, |
|
"loss": 2.5607, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.21679506091897502, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.631127495457713e-05, |
|
"loss": 2.5615, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.21766923455171283, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.6273341271064965e-05, |
|
"loss": 2.6131, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.21854340818445064, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.6235229236501845e-05, |
|
"loss": 2.6152, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21941758181718843, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.619693917041076e-05, |
|
"loss": 2.5947, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.22029175544992624, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.615847139380728e-05, |
|
"loss": 2.6395, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.22116592908266405, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.611982622919683e-05, |
|
"loss": 2.5855, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.22204010271540184, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.608100400057206e-05, |
|
"loss": 2.5098, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.22291427634813965, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.604200503341004e-05, |
|
"loss": 2.6061, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.22378844998087746, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.6002829654669616e-05, |
|
"loss": 2.5075, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.22466262361361525, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.596347819278861e-05, |
|
"loss": 2.5869, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.22553679724635306, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.5923950977681084e-05, |
|
"loss": 2.586, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.22641097087909087, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.58842483407346e-05, |
|
"loss": 2.5124, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.22728514451182866, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.584437061480739e-05, |
|
"loss": 2.5364, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.22815931814456647, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.58043181342256e-05, |
|
"loss": 2.5939, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.22903349177730425, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 4.5764091234780504e-05, |
|
"loss": 2.5893, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.22990766541004207, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.572369025372564e-05, |
|
"loss": 2.5496, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.23078183904277988, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.568311552977401e-05, |
|
"loss": 2.6138, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.23165601267551766, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.564236740309525e-05, |
|
"loss": 2.5724, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.23253018630825548, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.560144621531278e-05, |
|
"loss": 2.5762, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.2334043599409933, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.5560352309500886e-05, |
|
"loss": 2.5781, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.23427853357373107, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.551908603018191e-05, |
|
"loss": 2.606, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.2351527072064689, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.547764772332333e-05, |
|
"loss": 2.589, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.2360268808392067, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.5436037736334894e-05, |
|
"loss": 2.6229, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.23690105447194448, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.539425641806562e-05, |
|
"loss": 2.5875, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.2377752281046823, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.535230411880098e-05, |
|
"loss": 2.6023, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.2386494017374201, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.531018119025989e-05, |
|
"loss": 2.5965, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.2395235753701579, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.5267887985591795e-05, |
|
"loss": 2.5359, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.2403977490028957, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 2.5603, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2412719226356335, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.5182792167607155e-05, |
|
"loss": 2.6296, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.2421460962683713, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.513999026771539e-05, |
|
"loss": 2.5896, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.24302026990110911, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.509701951854017e-05, |
|
"loss": 2.5494, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.2438944435338469, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.505388028033888e-05, |
|
"loss": 2.6256, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.2447686171665847, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.501057291478149e-05, |
|
"loss": 2.6245, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.24564279079932252, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.496709778494749e-05, |
|
"loss": 2.5308, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.2465169644320603, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.492345525532288e-05, |
|
"loss": 2.6629, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.24739113806479812, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.487964569179711e-05, |
|
"loss": 2.4932, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.24826531169753593, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.4835669461660004e-05, |
|
"loss": 2.5798, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.24913948533027372, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.479152693359868e-05, |
|
"loss": 2.6232, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.25001365896301153, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.474721847769445e-05, |
|
"loss": 2.5524, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.2508878325957493, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.4702744465419744e-05, |
|
"loss": 2.6093, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.25176200622848716, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.465810526963499e-05, |
|
"loss": 2.5971, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.25263617986122494, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.461330126458544e-05, |
|
"loss": 2.529, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.2535103534939627, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.4568332825898105e-05, |
|
"loss": 2.5475, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25438452712670057, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.452320033057856e-05, |
|
"loss": 2.5431, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.25525870075943835, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.447790415700781e-05, |
|
"loss": 2.5771, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.25613287439217614, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.4432444684939077e-05, |
|
"loss": 2.6166, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.257007048024914, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.438682229549466e-05, |
|
"loss": 2.5507, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.25788122165765176, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.434103737116272e-05, |
|
"loss": 2.5351, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.25875539529038954, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.429509029579405e-05, |
|
"loss": 2.6678, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.25962956892312733, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.4248981454598935e-05, |
|
"loss": 2.5859, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.26050374255586517, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.420271123414381e-05, |
|
"loss": 2.5215, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.26137791618860295, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.415628002234812e-05, |
|
"loss": 2.5394, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.26225208982134074, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.4109688208481015e-05, |
|
"loss": 2.6149, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2631262634540786, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.406293618315809e-05, |
|
"loss": 2.5216, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.26400043708681636, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.4016024338338114e-05, |
|
"loss": 2.5536, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.26487461071955415, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.3968953067319777e-05, |
|
"loss": 2.5415, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.265748784352292, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.3921722764738326e-05, |
|
"loss": 2.5575, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.2666229579850298, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.387433382656232e-05, |
|
"loss": 2.4776, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.26749713161776756, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 2.5806, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.2683713052505054, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.377908163394734e-05, |
|
"loss": 2.5854, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.2692454788832432, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.373121917808196e-05, |
|
"loss": 2.5241, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.27011965251598097, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.368319968376253e-05, |
|
"loss": 2.4803, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.2709938261487188, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.363502355357399e-05, |
|
"loss": 2.5509, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2718679997814566, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.358669119141453e-05, |
|
"loss": 2.5421, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.2727421734141944, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.3538203002492104e-05, |
|
"loss": 2.5374, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.2736163470469322, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.348955939332111e-05, |
|
"loss": 2.5822, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.27449052067967, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.344076077171897e-05, |
|
"loss": 2.5644, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.2753646943124078, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.339180754680267e-05, |
|
"loss": 2.6278, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2762388679451456, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.3342700128985345e-05, |
|
"loss": 2.577, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.2771130415778834, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.3293438929972894e-05, |
|
"loss": 2.5167, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.2779872152106212, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.324402436276046e-05, |
|
"loss": 2.5297, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.27886138884335904, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.319445684162897e-05, |
|
"loss": 2.58, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.2797355624760968, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.3144736782141725e-05, |
|
"loss": 2.5503, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2806097361088346, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.309486460114085e-05, |
|
"loss": 2.4978, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.28148390974157245, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.3044840716743824e-05, |
|
"loss": 2.5319, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.28235808337431023, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.299466554833997e-05, |
|
"loss": 2.5353, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.283232257007048, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.294433951658697e-05, |
|
"loss": 2.6071, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.2841064306397858, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.289386304340727e-05, |
|
"loss": 2.6526, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.28498060427252364, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.284323655198462e-05, |
|
"loss": 2.553, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.2858547779052614, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.2792460466760485e-05, |
|
"loss": 2.5924, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.2867289515379992, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.274153521343046e-05, |
|
"loss": 2.5093, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.28760312517073705, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.269046121894077e-05, |
|
"loss": 2.5962, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.28847729880347484, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.2639238911484633e-05, |
|
"loss": 2.5287, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2893514724362126, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.2587868720498705e-05, |
|
"loss": 2.5151, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.29022564606895046, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.253635107665945e-05, |
|
"loss": 2.5844, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.29109981970168824, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.2484686411879554e-05, |
|
"loss": 2.5545, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.29197399333442603, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.2432875159304295e-05, |
|
"loss": 2.5029, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.29284816696716387, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.2380917753307904e-05, |
|
"loss": 2.5439, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.29372234059990165, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.232881462948994e-05, |
|
"loss": 2.5714, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.29459651423263944, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.227656622467162e-05, |
|
"loss": 2.5515, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.2954706878653773, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.222417297689217e-05, |
|
"loss": 2.5615, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.29634486149811506, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.217163532540514e-05, |
|
"loss": 2.57, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.29721903513085285, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.211895371067474e-05, |
|
"loss": 2.5805, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2980932087635907, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.206612857437213e-05, |
|
"loss": 2.6419, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.2989673823963285, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.2013160359371736e-05, |
|
"loss": 2.5025, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.29984155602906626, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.19600495097475e-05, |
|
"loss": 2.4513, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.3007157296618041, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.1906796470769195e-05, |
|
"loss": 2.6036, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.3015899032945419, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.185340168889868e-05, |
|
"loss": 2.5366, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.30246407692727967, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.179986561178617e-05, |
|
"loss": 2.539, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.3033382505600175, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.1746188688266444e-05, |
|
"loss": 2.5152, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.3042124241927553, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.16923713683551e-05, |
|
"loss": 2.6098, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.3050865978254931, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.163841410324482e-05, |
|
"loss": 2.5229, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.3059607714582309, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.158431734530154e-05, |
|
"loss": 2.5009, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3068349450909687, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.153008154806067e-05, |
|
"loss": 2.4947, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.3077091187237065, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.1475707166223296e-05, |
|
"loss": 2.5652, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.30858329235644427, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.142119465565238e-05, |
|
"loss": 2.5643, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.3094574659891821, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.13665444733689e-05, |
|
"loss": 2.5575, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.3103316396219199, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 4.131175707754807e-05, |
|
"loss": 2.4748, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.3112058132546577, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.125683292751546e-05, |
|
"loss": 2.53, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.3120799868873955, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.120177248374315e-05, |
|
"loss": 2.5582, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.3129541605201333, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.114657620784589e-05, |
|
"loss": 2.5842, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.3138283341528711, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.109124456257721e-05, |
|
"loss": 2.5279, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.31470250778560893, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.103577801182557e-05, |
|
"loss": 2.5657, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3155766814183467, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.098017702061039e-05, |
|
"loss": 2.5622, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.3164508550510845, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.0924442055078276e-05, |
|
"loss": 2.5328, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.31732502868382234, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.0868573582499004e-05, |
|
"loss": 2.5514, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.3181992023165601, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.0812572071261654e-05, |
|
"loss": 2.5575, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.3190733759492979, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.07564379908707e-05, |
|
"loss": 2.5688, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.31994754958203575, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.070017181194199e-05, |
|
"loss": 2.5032, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.32082172321477354, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.0643774006198907e-05, |
|
"loss": 2.5319, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.3216958968475113, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 2.5558, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.32257007048024916, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.053058540667676e-05, |
|
"loss": 2.5876, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.32344424411298694, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.0473795561846215e-05, |
|
"loss": 2.5354, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32431841774572473, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.0416875988090375e-05, |
|
"loss": 2.531, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.32519259137846257, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.035982716261053e-05, |
|
"loss": 2.5584, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.32606676501120035, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.030264956369157e-05, |
|
"loss": 2.4785, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.32694093864393814, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.0245343670698025e-05, |
|
"loss": 2.549, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.327815112276676, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.018790996406998e-05, |
|
"loss": 2.4917, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.32868928590941376, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.01303489253191e-05, |
|
"loss": 2.4882, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.32956345954215155, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.0072661037024596e-05, |
|
"loss": 2.5832, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.3304376331748894, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.0014846782829104e-05, |
|
"loss": 2.5667, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.3313118068076272, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.9956906647434736e-05, |
|
"loss": 2.511, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.33218598044036496, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.989884111659893e-05, |
|
"loss": 2.5146, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33306015407310274, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.984065067713043e-05, |
|
"loss": 2.4662, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.3339343277058406, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.978233581688518e-05, |
|
"loss": 2.5807, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.33480850133857837, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.9723897024762255e-05, |
|
"loss": 2.5095, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.33568267497131615, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.9665334790699714e-05, |
|
"loss": 2.5084, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.336556848604054, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.960664960567057e-05, |
|
"loss": 2.5447, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3374310222367918, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.95478419616786e-05, |
|
"loss": 2.5544, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.33830519586952956, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.948891235175425e-05, |
|
"loss": 2.5338, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.3391793695022674, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.942986126995052e-05, |
|
"loss": 2.5239, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.3400535431350052, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.937068921133879e-05, |
|
"loss": 2.5493, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.34092771676774297, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.931139667200469e-05, |
|
"loss": 2.4874, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3418018904004808, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.9251984149043917e-05, |
|
"loss": 2.5066, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.3426760640332186, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.919245214055812e-05, |
|
"loss": 2.5081, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.3435502376659564, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.913280114565066e-05, |
|
"loss": 2.5536, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.3444244112986942, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.9073031664422444e-05, |
|
"loss": 2.5335, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.345298584931432, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.901314419796778e-05, |
|
"loss": 2.4885, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3461727585641698, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.8953139248370116e-05, |
|
"loss": 2.5373, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.34704693219690763, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.889301731869784e-05, |
|
"loss": 2.563, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.3479211058296454, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.883277891300011e-05, |
|
"loss": 2.5089, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.3487952794623832, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.8772424536302564e-05, |
|
"loss": 2.5444, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.34966945309512104, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.8711954694603126e-05, |
|
"loss": 2.4677, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3505436267278588, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.865136989486776e-05, |
|
"loss": 2.4907, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.3514178003605966, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.8590670645026195e-05, |
|
"loss": 2.4889, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.35229197399333445, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 3.85298574539677e-05, |
|
"loss": 2.5175, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.35316614762607224, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.84689308315368e-05, |
|
"loss": 2.555, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.35404032125881, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 3.8407891288529004e-05, |
|
"loss": 2.4927, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.35491449489154786, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.834673933668651e-05, |
|
"loss": 2.4928, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.35578866852428565, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.828547548869396e-05, |
|
"loss": 2.5426, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.35666284215702343, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.822410025817406e-05, |
|
"loss": 2.5477, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.3575370157897612, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.8162614159683374e-05, |
|
"loss": 2.5466, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.35841118942249905, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.8101017708707906e-05, |
|
"loss": 2.5304, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.35928536305523684, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 3.8039311421658887e-05, |
|
"loss": 2.556, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.3601595366879746, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.797749581586835e-05, |
|
"loss": 2.5913, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.36103371032071246, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.7915571409584836e-05, |
|
"loss": 2.5172, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.36190788395345025, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.7853538721969064e-05, |
|
"loss": 2.4756, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.36278205758618803, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.779139827308956e-05, |
|
"loss": 2.5278, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3636562312189259, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.7729150583918264e-05, |
|
"loss": 2.4925, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.36453040485166366, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.766679617632624e-05, |
|
"loss": 2.5038, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.36540457848440144, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.760433557307922e-05, |
|
"loss": 2.518, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.3662787521171393, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.754176929783327e-05, |
|
"loss": 2.554, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.36715292574987707, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.74790978751304e-05, |
|
"loss": 2.5062, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.36802709938261485, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.7416321830394144e-05, |
|
"loss": 2.5755, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.3689012730153527, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.735344168992515e-05, |
|
"loss": 2.5203, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.3697754466480905, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.7290457980896795e-05, |
|
"loss": 2.4996, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.37064962028082826, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.722737123135075e-05, |
|
"loss": 2.5625, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.3715237939135661, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.716418197019257e-05, |
|
"loss": 2.5665, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3723979675463039, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.710089072718722e-05, |
|
"loss": 2.5188, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.37327214117904167, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.7037498032954664e-05, |
|
"loss": 2.5166, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.3741463148117795, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.697400441896543e-05, |
|
"loss": 2.5166, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3750204884445173, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.691041041753613e-05, |
|
"loss": 2.5436, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.3758946620772551, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.6846716561824965e-05, |
|
"loss": 2.5019, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3767688357099929, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.678292338582735e-05, |
|
"loss": 2.5575, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.3776430093427307, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.671903142437134e-05, |
|
"loss": 2.5161, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.3785171829754685, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.6655041213113184e-05, |
|
"loss": 2.5285, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.37939135660820633, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.659095328853288e-05, |
|
"loss": 2.4936, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.3802655302409441, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.652676818792958e-05, |
|
"loss": 2.5238, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3811397038736819, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.646248644941716e-05, |
|
"loss": 2.4821, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.3820138775064197, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.6398108611919696e-05, |
|
"loss": 2.5309, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.3828880511391575, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.633363521516693e-05, |
|
"loss": 2.508, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.3837622247718953, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.626906679968974e-05, |
|
"loss": 2.5292, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.3846363984046331, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.6204403906815655e-05, |
|
"loss": 2.5175, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.38551057203737094, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.613964707866424e-05, |
|
"loss": 2.5478, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.3863847456701087, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.607479685814261e-05, |
|
"loss": 2.5442, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.3872589193028465, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.600985378894086e-05, |
|
"loss": 2.5198, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.38813309293558435, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 3.594481841552753e-05, |
|
"loss": 2.5001, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.38900726656832213, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.5879691283144964e-05, |
|
"loss": 2.53, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3898814402010599, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.5814472937804865e-05, |
|
"loss": 2.5589, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.39075561383379775, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.574916392628359e-05, |
|
"loss": 2.5402, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.39162978746653554, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.5683764796117634e-05, |
|
"loss": 2.48, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.3925039610992733, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.561827609559905e-05, |
|
"loss": 2.5504, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.39337813473201116, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.55526983737708e-05, |
|
"loss": 2.5011, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.39425230836474895, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.54870321804222e-05, |
|
"loss": 2.4815, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.39512648199748673, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.5421278066084276e-05, |
|
"loss": 2.537, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.3960006556302246, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.535543658202518e-05, |
|
"loss": 2.5111, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.39687482926296236, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.528950828024555e-05, |
|
"loss": 2.4883, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.39774900289570014, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.522349371347387e-05, |
|
"loss": 2.4712, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.398623176528438, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.515739343516188e-05, |
|
"loss": 2.4872, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.39949735016117577, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.509120799947987e-05, |
|
"loss": 2.5711, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.40037152379391355, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.50249379613121e-05, |
|
"loss": 2.5285, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.4012456974266514, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.49585838762521e-05, |
|
"loss": 2.5139, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.4021198710593892, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.489214630059806e-05, |
|
"loss": 2.5236, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.40299404469212696, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.4825625791348096e-05, |
|
"loss": 2.5336, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.4038682183248648, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.475902290619565e-05, |
|
"loss": 2.4917, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.4047423919576026, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.469233820352477e-05, |
|
"loss": 2.5423, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.40561656559034037, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.462557224240545e-05, |
|
"loss": 2.4924, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.40649073922307816, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.455872558258895e-05, |
|
"loss": 2.5107, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.407364912855816, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.449179878450308e-05, |
|
"loss": 2.5197, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.4082390864885538, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.442479240924749e-05, |
|
"loss": 2.4901, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.40911326012129157, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.4357707018589036e-05, |
|
"loss": 2.4912, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.4099874337540294, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.429054317495697e-05, |
|
"loss": 2.4534, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.4108616073867672, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.4223301441438306e-05, |
|
"loss": 2.4801, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.411735781019505, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.415598238177307e-05, |
|
"loss": 2.4984, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.4126099546522428, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.408858656034957e-05, |
|
"loss": 2.5402, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4134841282849806, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.4021114542199664e-05, |
|
"loss": 2.5232, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.4143583019177184, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.395356689299401e-05, |
|
"loss": 2.5168, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.4152324755504562, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.3885944179037395e-05, |
|
"loss": 2.5563, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.416106649183194, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.381824696726386e-05, |
|
"loss": 2.5104, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.4169808228159318, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.3750475825232074e-05, |
|
"loss": 2.5002, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.41785499644866964, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.3682631321120504e-05, |
|
"loss": 2.5262, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.4187291700814074, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.361471402372267e-05, |
|
"loss": 2.5159, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.4196033437141452, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.3546724502442354e-05, |
|
"loss": 2.455, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.42047751734688305, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.347866332728889e-05, |
|
"loss": 2.4299, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.42135169097962083, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.341053106887229e-05, |
|
"loss": 2.5159, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.4222258646123586, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.3342328298398565e-05, |
|
"loss": 2.4763, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.42310003824509645, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.3274055587664856e-05, |
|
"loss": 2.4768, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.42397421187783424, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.320571350905466e-05, |
|
"loss": 2.5295, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.424848385510572, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.313730263553306e-05, |
|
"loss": 2.4913, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.42572255914330986, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.3068823540641886e-05, |
|
"loss": 2.5096, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.42659673277604765, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.300027679849492e-05, |
|
"loss": 2.5255, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.42747090640878543, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.2931662983773106e-05, |
|
"loss": 2.4564, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.4283450800415233, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.286298267171969e-05, |
|
"loss": 2.5294, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.42921925367426106, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.2794236438135405e-05, |
|
"loss": 2.5117, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.43009342730699884, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 2.4564, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.43096760093973663, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.265654851233579e-05, |
|
"loss": 2.4361, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.43184177457247447, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.258760797446598e-05, |
|
"loss": 2.5215, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.43271594820521225, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.251860382374668e-05, |
|
"loss": 2.4979, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.43359012183795004, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.244953663869365e-05, |
|
"loss": 2.5005, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.4344642954706879, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.238040699835106e-05, |
|
"loss": 2.5365, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.43533846910342566, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.231121548228676e-05, |
|
"loss": 2.5102, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.43621264273616345, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.2241962670587314e-05, |
|
"loss": 2.4999, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.4370868163689013, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.2172649143853176e-05, |
|
"loss": 2.4631, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43796099000163907, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.210327548319382e-05, |
|
"loss": 2.5414, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.43883516363437686, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.203384227022291e-05, |
|
"loss": 2.4368, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.4397093372671147, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.196435008705332e-05, |
|
"loss": 2.5089, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.4405835108998525, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.1894799516292374e-05, |
|
"loss": 2.4273, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.44145768453259027, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.1825191141036864e-05, |
|
"loss": 2.4994, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.4423318581653281, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.175552554486822e-05, |
|
"loss": 2.4675, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.4432060317980659, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.1685803311847596e-05, |
|
"loss": 2.4315, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.4440802054308037, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.161602502651099e-05, |
|
"loss": 2.5206, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.4449543790635415, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.1546191273864314e-05, |
|
"loss": 2.4594, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.4458285526962793, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.14763026393785e-05, |
|
"loss": 2.4955, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4467027263290171, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.140635970898462e-05, |
|
"loss": 2.4864, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.4475768999617549, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.133636306906895e-05, |
|
"loss": 2.4598, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.4484510735944927, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.126631330646802e-05, |
|
"loss": 2.5357, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.4493252472272305, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.1196211008463765e-05, |
|
"loss": 2.499, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.45019942085996834, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.112605676277855e-05, |
|
"loss": 2.5166, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.4510735944927061, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.105585115757027e-05, |
|
"loss": 2.4977, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.4519477681254439, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.098559478142739e-05, |
|
"loss": 2.48, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.45282194175818175, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.091528822336405e-05, |
|
"loss": 2.5027, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.45369611539091953, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.084493207281507e-05, |
|
"loss": 2.4363, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.4545702890236573, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.077452691963108e-05, |
|
"loss": 2.4709, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4554444626563951, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.0704073354073524e-05, |
|
"loss": 2.4589, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.45631863628913294, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.063357196680969e-05, |
|
"loss": 2.5196, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.4571928099218707, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 2.4767, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.4580669835546085, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.0492428091832235e-05, |
|
"loss": 2.5096, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.45894115718734635, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.0421786787438046e-05, |
|
"loss": 2.472, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.45981533082008413, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.0351100027966576e-05, |
|
"loss": 2.4269, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.4606895044528219, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 3.028036840604019e-05, |
|
"loss": 2.4802, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.46156367808555976, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.0209592514657365e-05, |
|
"loss": 2.4102, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.46243785171829754, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.0138772947187743e-05, |
|
"loss": 2.5099, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.46331202535103533, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.006791029736711e-05, |
|
"loss": 2.5259, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.46418619898377317, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.999700515929247e-05, |
|
"loss": 2.3805, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.46506037261651095, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.9926058127417018e-05, |
|
"loss": 2.4986, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.46593454624924874, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 2.9855069796545186e-05, |
|
"loss": 2.5136, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.4668087198819866, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.9784040761827658e-05, |
|
"loss": 2.4745, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.46768289351472436, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.9712971618756364e-05, |
|
"loss": 2.4878, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.46855706714746215, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.9641862963159478e-05, |
|
"loss": 2.4917, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.4694312407802, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.9570715391196463e-05, |
|
"loss": 2.4364, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.4703054144129378, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.9499529499353024e-05, |
|
"loss": 2.4615, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.47117958804567556, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 2.942830588443615e-05, |
|
"loss": 2.526, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.4720537616784134, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.935704514356909e-05, |
|
"loss": 2.5232, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.4729279353111512, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.9285747874186342e-05, |
|
"loss": 2.47, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.47380210894388897, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.9214414674028658e-05, |
|
"loss": 2.5342, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.4746762825766268, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.9143046141138015e-05, |
|
"loss": 2.5103, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.4755504562093646, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.9071642873852612e-05, |
|
"loss": 2.4559, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.4764246298421024, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.900020547080188e-05, |
|
"loss": 2.5457, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.4772988034748402, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.8928734530901403e-05, |
|
"loss": 2.5192, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.478172977107578, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 2.8857230653347945e-05, |
|
"loss": 2.414, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.4790471507403158, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.878569443761442e-05, |
|
"loss": 2.5149, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.47992132437305357, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 2.871412648344485e-05, |
|
"loss": 2.412, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.4807954980057914, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.8642527390849326e-05, |
|
"loss": 2.4455, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4816696716385292, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 2.8570897760099042e-05, |
|
"loss": 2.4805, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.482543845271267, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.849923819172117e-05, |
|
"loss": 2.4148, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.4834180189040048, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.8427549286493904e-05, |
|
"loss": 2.4873, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.4842921925367426, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 2.8355831645441388e-05, |
|
"loss": 2.4999, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.4851663661694804, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 2.8284085869828665e-05, |
|
"loss": 2.527, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.48604053980221823, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.821231256115666e-05, |
|
"loss": 2.4385, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.486914713434956, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.8140512321157142e-05, |
|
"loss": 2.5412, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.4877888870676938, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 2.8068685751787636e-05, |
|
"loss": 2.5424, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.48866306070043164, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.799683345522644e-05, |
|
"loss": 2.5117, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.4895372343331694, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.792495603386753e-05, |
|
"loss": 2.4806, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4904114079659072, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.7853054090315505e-05, |
|
"loss": 2.5502, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.49128558159864505, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.778112822738059e-05, |
|
"loss": 2.4464, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.49215975523138283, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.770917904807352e-05, |
|
"loss": 2.4851, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.4930339288641206, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.7637207155600497e-05, |
|
"loss": 2.5079, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.49390810249685846, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.756521315335818e-05, |
|
"loss": 2.5144, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.49478227612959624, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 2.7493197644928563e-05, |
|
"loss": 2.5332, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.49565644976233403, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.742116123407396e-05, |
|
"loss": 2.5106, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.49653062339507187, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.7349104524731916e-05, |
|
"loss": 2.5031, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.49740479702780965, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 2.7277028121010162e-05, |
|
"loss": 2.4668, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.49827897066054744, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.720493262718153e-05, |
|
"loss": 2.5557, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.4991531442932853, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 2.7132818647678916e-05, |
|
"loss": 2.4921, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.5000273179260231, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.7060686787090182e-05, |
|
"loss": 2.496, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.5009014915587608, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.6988537650153107e-05, |
|
"loss": 2.511, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.5017756651914986, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.691637184175031e-05, |
|
"loss": 2.5194, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.5026498388242364, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.6844189966904192e-05, |
|
"loss": 2.4826, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5035240124569743, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.6771992630771824e-05, |
|
"loss": 2.4936, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.5043981860897121, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.6699780438639925e-05, |
|
"loss": 2.5083, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.5052723597224499, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.6627553995919764e-05, |
|
"loss": 2.4806, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.5061465333551877, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.6555313908142053e-05, |
|
"loss": 2.5227, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.5070207069879255, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.648306078095194e-05, |
|
"loss": 2.4796, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5078948806206632, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.6410795220103877e-05, |
|
"loss": 2.4873, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.5087690542534011, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.6338517831456555e-05, |
|
"loss": 2.5188, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.5096432278861389, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.6266229220967818e-05, |
|
"loss": 2.461, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.5105174015188767, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.619392999468962e-05, |
|
"loss": 2.5317, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.5113915751516145, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 2.6121620758762877e-05, |
|
"loss": 2.5001, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.5122657487843523, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.604930211941245e-05, |
|
"loss": 2.5155, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.51313992241709, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.5976974682942046e-05, |
|
"loss": 2.4995, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.514014096049828, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.5904639055729092e-05, |
|
"loss": 2.4771, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.5148882696825657, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.5832295844219696e-05, |
|
"loss": 2.4807, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.5157624433153035, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.5759945654923575e-05, |
|
"loss": 2.4858, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5166366169480413, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.5687589094408908e-05, |
|
"loss": 2.4595, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.5175107905807791, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 2.5615226769297325e-05, |
|
"loss": 2.5661, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.5183849642135169, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.554285928625877e-05, |
|
"loss": 2.5154, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.5192591378462547, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.5470487252006414e-05, |
|
"loss": 2.4824, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.5201333114789926, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.539811127329161e-05, |
|
"loss": 2.4549, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5210074851117303, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.5325731956898767e-05, |
|
"loss": 2.438, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.5218816587444681, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.5253349909640278e-05, |
|
"loss": 2.4597, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.5227558323772059, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.518096573835143e-05, |
|
"loss": 2.5094, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.5236300060099437, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.510858004988533e-05, |
|
"loss": 2.5704, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.5245041796426815, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.5036193451107776e-05, |
|
"loss": 2.4547, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5253783532754194, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.4963806548892233e-05, |
|
"loss": 2.5035, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.5262525269081572, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.489141995011468e-05, |
|
"loss": 2.4283, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.5271267005408949, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.4819034261648573e-05, |
|
"loss": 2.472, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.5280008741736327, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.474665009035973e-05, |
|
"loss": 2.4643, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.5288750478063705, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.4674268043101242e-05, |
|
"loss": 2.5151, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5297492214391083, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.4601888726708393e-05, |
|
"loss": 2.5029, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.5306233950718462, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.4529512747993595e-05, |
|
"loss": 2.4279, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.531497568704584, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.4457140713741237e-05, |
|
"loss": 2.4896, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.5323717423373218, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.4384773230702674e-05, |
|
"loss": 2.5096, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.5332459159700595, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.43124109055911e-05, |
|
"loss": 2.5153, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5341200896027973, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.4240054345076438e-05, |
|
"loss": 2.4421, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.5349942632355351, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.416770415578031e-05, |
|
"loss": 2.5096, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.535868436868273, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.4095360944270917e-05, |
|
"loss": 2.5204, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.5367426105010108, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 2.4023025317057963e-05, |
|
"loss": 2.4526, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.5376167841337486, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.3950697880587547e-05, |
|
"loss": 2.46, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5384909577664864, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.3878379241237136e-05, |
|
"loss": 2.5201, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.5393651313992242, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.3806070005310392e-05, |
|
"loss": 2.441, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.5402393050319619, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.3733770779032184e-05, |
|
"loss": 2.483, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.5411134786646997, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.366148216854345e-05, |
|
"loss": 2.4989, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.5419876522974376, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.3589204779896125e-05, |
|
"loss": 2.5297, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5428618259301754, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.3516939219048058e-05, |
|
"loss": 2.5281, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.5437359995629132, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.344468609185796e-05, |
|
"loss": 2.519, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.544610173195651, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.3372446004080252e-05, |
|
"loss": 2.4291, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.5454843468283888, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.3300219561360077e-05, |
|
"loss": 2.4963, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.5463585204611265, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 2.3228007369228178e-05, |
|
"loss": 2.5143, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5472326940938644, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.3155810033095814e-05, |
|
"loss": 2.4494, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.5481068677266022, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.308362815824969e-05, |
|
"loss": 2.4042, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.54898104135934, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.3011462349846905e-05, |
|
"loss": 2.5261, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.5498552149920778, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.293931321290983e-05, |
|
"loss": 2.4527, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.5507293886248156, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 2.2867181352321093e-05, |
|
"loss": 2.5016, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5516035622575534, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.2795067372818473e-05, |
|
"loss": 2.4918, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.5524777358902913, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.272297187898984e-05, |
|
"loss": 2.4603, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.553351909523029, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.2650895475268086e-05, |
|
"loss": 2.4828, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.5542260831557668, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.257883876592604e-05, |
|
"loss": 2.5157, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.5551002567885046, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.2506802355071443e-05, |
|
"loss": 2.5095, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5559744304212424, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2.2434786846641824e-05, |
|
"loss": 2.5183, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.5568486040539802, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 2.2362792844399505e-05, |
|
"loss": 2.4203, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.5577227776867181, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.2290820951926487e-05, |
|
"loss": 2.4516, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.5585969513194559, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.221887177261941e-05, |
|
"loss": 2.5421, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.5594711249521936, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.214694590968449e-05, |
|
"loss": 2.4276, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5603452985849314, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.2075043966132484e-05, |
|
"loss": 2.4471, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.5612194722176692, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.2003166544773567e-05, |
|
"loss": 2.5014, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.562093645850407, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.1931314248212366e-05, |
|
"loss": 2.4937, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.5629678194831449, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.1859487678842864e-05, |
|
"loss": 2.5088, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.5638419931158827, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.1787687438843344e-05, |
|
"loss": 2.5142, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5647161667486205, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 2.1715914130171337e-05, |
|
"loss": 2.4418, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.5655903403813582, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.164416835455862e-05, |
|
"loss": 2.4465, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.566464514014096, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.1572450713506098e-05, |
|
"loss": 2.4755, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.5673386876468338, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.1500761808278834e-05, |
|
"loss": 2.4652, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.5682128612795716, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.1429102239900967e-05, |
|
"loss": 2.4644, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5690870349123095, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.1357472609150676e-05, |
|
"loss": 2.4921, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.5699612085450473, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.128587351655516e-05, |
|
"loss": 2.4701, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.5708353821777851, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.1214305562385592e-05, |
|
"loss": 2.4611, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.5717095558105229, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.1142769346652064e-05, |
|
"loss": 2.4365, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.5725837294432606, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.1071265469098607e-05, |
|
"loss": 2.4008, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5734579030759984, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 2.0999794529198124e-05, |
|
"loss": 2.394, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.5743320767087363, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.0928357126147387e-05, |
|
"loss": 2.5286, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.5752062503414741, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.0856953858861995e-05, |
|
"loss": 2.4543, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.5760804239742119, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.078558532597135e-05, |
|
"loss": 2.4641, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.5769545976069497, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.0714252125813667e-05, |
|
"loss": 2.5054, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5778287712396875, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.0642954856430913e-05, |
|
"loss": 2.5043, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.5787029448724252, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.057169411556385e-05, |
|
"loss": 2.5082, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.5795771185051631, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2.0500470500646978e-05, |
|
"loss": 2.4855, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.5804512921379009, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.0429284608803546e-05, |
|
"loss": 2.5129, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.5813254657706387, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.0358137036840528e-05, |
|
"loss": 2.4905, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.5821996394033765, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2.0287028381243645e-05, |
|
"loss": 2.4457, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.5830738130361143, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.0215959238172345e-05, |
|
"loss": 2.4677, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.5839479866688521, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2.0144930203454816e-05, |
|
"loss": 2.4793, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.58482216030159, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.0073941872582984e-05, |
|
"loss": 2.4967, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.5856963339343277, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 2.0002994840707534e-05, |
|
"loss": 2.4472, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5865705075670655, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.9932089702632897e-05, |
|
"loss": 2.5045, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.5874446811998033, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.986122705281227e-05, |
|
"loss": 2.4368, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.5883188548325411, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.979040748534264e-05, |
|
"loss": 2.4439, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.5891930284652789, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.9719631593959816e-05, |
|
"loss": 2.4486, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.5900672020980167, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.9648899972033426e-05, |
|
"loss": 2.4085, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5909413757307546, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.9578213212561953e-05, |
|
"loss": 2.3664, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.5918155493634923, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.950757190816777e-05, |
|
"loss": 2.5016, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.5926897229962301, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 2.48, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.5935638966289679, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.9366428033190313e-05, |
|
"loss": 2.4471, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.5944380702617057, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.929592664592649e-05, |
|
"loss": 2.4438, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5953122438944435, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.9225473080368916e-05, |
|
"loss": 2.4818, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.5961864175271814, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.9155067927184926e-05, |
|
"loss": 2.5117, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.5970605911599192, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 1.9084711776635958e-05, |
|
"loss": 2.4846, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.597934764792657, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.901440521857261e-05, |
|
"loss": 2.4458, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.5988089384253947, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.894414884242974e-05, |
|
"loss": 2.517, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5996831120581325, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.8873943237221453e-05, |
|
"loss": 2.4851, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.6005572856908703, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.880378899153624e-05, |
|
"loss": 2.4821, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.6014314593236082, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.8733686693531985e-05, |
|
"loss": 2.4555, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.602305632956346, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.8663636930931063e-05, |
|
"loss": 2.5098, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.6031798065890838, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.859364029101538e-05, |
|
"loss": 2.4382, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6040539802218216, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.8523697360621504e-05, |
|
"loss": 2.5444, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.6049281538545593, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.8453808726135695e-05, |
|
"loss": 2.5187, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.6058023274872971, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.838397497348901e-05, |
|
"loss": 2.5029, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.606676501120035, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 1.8314196688152403e-05, |
|
"loss": 2.5432, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.6075506747527728, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.8244474455131792e-05, |
|
"loss": 2.4449, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6084248483855106, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.8174808858963145e-05, |
|
"loss": 2.4266, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.6092990220182484, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.810520048370763e-05, |
|
"loss": 2.5512, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.6101731956509862, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.8035649912946684e-05, |
|
"loss": 2.5539, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.6110473692837239, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.7966157729777095e-05, |
|
"loss": 2.4313, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.6119215429164618, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.7896724516806175e-05, |
|
"loss": 2.4875, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6127957165491996, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.782735085614683e-05, |
|
"loss": 2.4663, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.6136698901819374, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.77580373294127e-05, |
|
"loss": 2.4446, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.6145440638146752, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.7688784517713248e-05, |
|
"loss": 2.5128, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.615418237447413, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.7619593001648947e-05, |
|
"loss": 2.3915, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.6162924110801508, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.755046336130636e-05, |
|
"loss": 2.4668, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6171665847128885, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.7481396176253313e-05, |
|
"loss": 2.4907, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.6180407583456264, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.7412392025534012e-05, |
|
"loss": 2.447, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.6189149319783642, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.7343451487664214e-05, |
|
"loss": 2.4827, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.619789105611102, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 2.5083, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.6206632792438398, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.72057635618646e-05, |
|
"loss": 2.4876, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6215374528765776, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.713701732828032e-05, |
|
"loss": 2.4839, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.6224116265093154, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.7068337016226893e-05, |
|
"loss": 2.5058, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.6232858001420533, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.6999723201505078e-05, |
|
"loss": 2.5269, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.624159973774791, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.6931176459358126e-05, |
|
"loss": 2.4452, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.6250341474075288, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.686269736446695e-05, |
|
"loss": 2.4149, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6259083210402666, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.6794286490945342e-05, |
|
"loss": 2.4508, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.6267824946730044, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.672594441233515e-05, |
|
"loss": 2.4886, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.6276566683057422, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.6657671701601434e-05, |
|
"loss": 2.4674, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.6285308419384801, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.6589468931127707e-05, |
|
"loss": 2.4671, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.6294050155712179, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.6521336672711123e-05, |
|
"loss": 2.4426, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6302791892039556, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.645327549755765e-05, |
|
"loss": 2.4402, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.6311533628366934, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.6385285976277337e-05, |
|
"loss": 2.4235, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.6320275364694312, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.6317368678879495e-05, |
|
"loss": 2.4949, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.632901710102169, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.624952417476792e-05, |
|
"loss": 2.4693, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.6337758837349069, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.618175303273614e-05, |
|
"loss": 2.4108, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6346500573676447, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.6114055820962617e-05, |
|
"loss": 2.5233, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.6355242310003825, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.6046433107005994e-05, |
|
"loss": 2.4736, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.6363984046331203, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.5978885457800345e-05, |
|
"loss": 2.4318, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.637272578265858, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.5911413439650436e-05, |
|
"loss": 2.4888, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.6381467518985958, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.5844017618226935e-05, |
|
"loss": 2.5133, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6390209255313336, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.5776698558561696e-05, |
|
"loss": 2.4708, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.6398950991640715, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.5709456825043046e-05, |
|
"loss": 2.4479, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.6407692727968093, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.5642292981410976e-05, |
|
"loss": 2.4754, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.6416434464295471, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.557520759075251e-05, |
|
"loss": 2.4537, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.6425176200622849, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.5508201215496926e-05, |
|
"loss": 2.4646, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6433917936950226, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.5441274417411053e-05, |
|
"loss": 2.5035, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.6442659673277604, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 1.5374427757594552e-05, |
|
"loss": 2.4973, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.6451401409604983, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 1.5307661796475247e-05, |
|
"loss": 2.351, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.6460143145932361, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 1.5240977093804365e-05, |
|
"loss": 2.457, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.6468884882259739, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.5174374208651912e-05, |
|
"loss": 2.4152, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6477626618587117, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.5107853699401945e-05, |
|
"loss": 2.4671, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.6486368354914495, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.5041416123747899e-05, |
|
"loss": 2.4371, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.6495110091241872, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.4975062038687904e-05, |
|
"loss": 2.4177, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.6503851827569251, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.4908792000520141e-05, |
|
"loss": 2.4789, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.6512593563896629, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.484260656483813e-05, |
|
"loss": 2.456, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6521335300224007, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.4776506286526131e-05, |
|
"loss": 2.4577, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.6530077036551385, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.4710491719754454e-05, |
|
"loss": 2.4199, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.6538818772878763, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.4644563417974827e-05, |
|
"loss": 2.4352, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.6547560509206141, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.4578721933915723e-05, |
|
"loss": 2.4898, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.655630224553352, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.4512967819577815e-05, |
|
"loss": 2.4195, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6565043981860897, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 1.4447301626229204e-05, |
|
"loss": 2.5405, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.6573785718188275, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.4381723904400957e-05, |
|
"loss": 2.4603, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.6582527454515653, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.4316235203882371e-05, |
|
"loss": 2.4595, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.6591269190843031, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.4250836073716411e-05, |
|
"loss": 2.4936, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.6600010927170409, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.418552706219514e-05, |
|
"loss": 2.4954, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.6608752663497788, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.4120308716855038e-05, |
|
"loss": 2.4924, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.6617494399825166, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.4055181584472488e-05, |
|
"loss": 2.4962, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.6626236136152543, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.399014621105914e-05, |
|
"loss": 2.4441, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.6634977872479921, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 1.3925203141857398e-05, |
|
"loss": 2.5086, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.6643719608807299, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.386035292133577e-05, |
|
"loss": 2.4698, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6652461345134677, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.3795596093184344e-05, |
|
"loss": 2.4269, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.6661203081462055, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.3730933200310252e-05, |
|
"loss": 2.4506, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.6669944817789434, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.3666364784833075e-05, |
|
"loss": 2.4774, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.6678686554116812, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.3601891388080313e-05, |
|
"loss": 2.4573, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.668742829044419, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.3537513550582853e-05, |
|
"loss": 2.4305, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6696170026771567, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.3473231812070427e-05, |
|
"loss": 2.4808, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.6704911763098945, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.3409046711467127e-05, |
|
"loss": 2.4969, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.6713653499426323, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.3344958786886808e-05, |
|
"loss": 2.4678, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.6722395235753702, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.3280968575628674e-05, |
|
"loss": 2.4409, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.673113697208108, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.3217076614172652e-05, |
|
"loss": 2.5037, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6739878708408458, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.3153283438175034e-05, |
|
"loss": 2.4852, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.6748620444735836, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.3089589582463879e-05, |
|
"loss": 2.4512, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.6757362181063213, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.3025995581034561e-05, |
|
"loss": 2.4298, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.6766103917390591, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.2962501967045332e-05, |
|
"loss": 2.4524, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.677484565371797, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.2899109272812788e-05, |
|
"loss": 2.4817, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6783587390045348, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.283581802980744e-05, |
|
"loss": 2.4183, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.6792329126372726, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.2772628768649247e-05, |
|
"loss": 2.4454, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.6801070862700104, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.270954201910321e-05, |
|
"loss": 2.4866, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.6809812599027482, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.264655831007486e-05, |
|
"loss": 2.4648, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.6818554335354859, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.2583678169605857e-05, |
|
"loss": 2.416, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6827296071682238, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.2520902124869605e-05, |
|
"loss": 2.4401, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.6836037808009616, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.245823070216673e-05, |
|
"loss": 2.4564, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.6844779544336994, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.239566442692079e-05, |
|
"loss": 2.5142, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.6853521280664372, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.2333203823673773e-05, |
|
"loss": 2.44, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.686226301699175, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.2270849416081737e-05, |
|
"loss": 2.481, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6871004753319128, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.2208601726910446e-05, |
|
"loss": 2.4822, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.6879746489646507, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.2146461278030938e-05, |
|
"loss": 2.4373, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.6888488225973884, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.2084428590415172e-05, |
|
"loss": 2.4376, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.6897229962301262, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.2022504184131656e-05, |
|
"loss": 2.4519, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.690597169862864, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.1960688578341117e-05, |
|
"loss": 2.3984, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6914713434956018, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.1898982291292096e-05, |
|
"loss": 2.4713, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.6923455171283396, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.1837385840316628e-05, |
|
"loss": 2.453, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.6932196907610774, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.1775899741825947e-05, |
|
"loss": 2.4441, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.6940938643938153, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.1714524511306043e-05, |
|
"loss": 2.4505, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.694968038026553, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 1.165326066331349e-05, |
|
"loss": 2.4896, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6958422116592908, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.1592108711470995e-05, |
|
"loss": 2.4831, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.6967163852920286, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.1531069168463202e-05, |
|
"loss": 2.4131, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.6975905589247664, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.1470142546032304e-05, |
|
"loss": 2.4331, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.6984647325575042, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.1409329354973814e-05, |
|
"loss": 2.4559, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.6993389061902421, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.1348630105132253e-05, |
|
"loss": 2.5032, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7002130798229799, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.1288045305396874e-05, |
|
"loss": 2.4401, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.7010872534557177, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.122757546369744e-05, |
|
"loss": 2.4558, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.7019614270884554, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.1167221086999895e-05, |
|
"loss": 2.5234, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.7028356007211932, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.1106982681302159e-05, |
|
"loss": 2.4717, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.703709774353931, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.10468607516299e-05, |
|
"loss": 2.4759, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.7045839479866689, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.0986855802032225e-05, |
|
"loss": 2.5144, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.7054581216194067, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.0926968335577564e-05, |
|
"loss": 2.4884, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.7063322952521445, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.086719885434935e-05, |
|
"loss": 2.4915, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.7072064688848823, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 1.0807547859441885e-05, |
|
"loss": 2.4426, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.70808064251762, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 1.0748015850956086e-05, |
|
"loss": 2.4774, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7089548161503578, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 1.0688603327995323e-05, |
|
"loss": 2.4552, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.7098289897830957, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.0629310788661222e-05, |
|
"loss": 2.444, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.7107031634158335, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.0570138730049484e-05, |
|
"loss": 2.4437, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.7115773370485713, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.0511087648245757e-05, |
|
"loss": 2.4354, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.7124515106813091, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.0452158038321402e-05, |
|
"loss": 2.499, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.7133256843140469, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 1.0393350394329429e-05, |
|
"loss": 2.4979, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.7141998579467846, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.0334665209300295e-05, |
|
"loss": 2.5171, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.7150740315795224, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 1.0276102975237754e-05, |
|
"loss": 2.4536, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.7159482052122603, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.0217664183114825e-05, |
|
"loss": 2.4536, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.7168223788449981, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.0159349322869574e-05, |
|
"loss": 2.4038, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7176965524777359, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.0101158883401077e-05, |
|
"loss": 2.4728, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.7185707261104737, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.0043093352565272e-05, |
|
"loss": 2.4679, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.7194448997432115, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 9.985153217170903e-06, |
|
"loss": 2.5158, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.7203190733759492, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.927338962975416e-06, |
|
"loss": 2.4396, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.7211932470086871, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.869651074680893e-06, |
|
"loss": 2.4815, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7220674206414249, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.812090035930024e-06, |
|
"loss": 2.3869, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.7229415942741627, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.754656329301976e-06, |
|
"loss": 2.4363, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.7238157679069005, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 9.697350436308427e-06, |
|
"loss": 2.4645, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.7246899415396383, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.640172837389475e-06, |
|
"loss": 2.4348, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.7255641151723761, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.583124011909628e-06, |
|
"loss": 2.5198, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.726438288805114, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.526204438153794e-06, |
|
"loss": 2.4499, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.7273124624378517, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 9.469414593323242e-06, |
|
"loss": 2.4385, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.7281866360705895, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 2.4505, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.7290608097033273, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.356225993801101e-06, |
|
"loss": 2.4464, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.7299349833360651, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 9.299828188058013e-06, |
|
"loss": 2.3962, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7308091569688029, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.243562009129316e-06, |
|
"loss": 2.4827, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.7316833306015408, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 9.187427928738343e-06, |
|
"loss": 2.5232, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.7325575042342786, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.131426417501005e-06, |
|
"loss": 2.4248, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.7334316778670164, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.075557944921728e-06, |
|
"loss": 2.5111, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.7343058514997541, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 9.019822979389614e-06, |
|
"loss": 2.3708, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7351800251324919, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 8.964221988174442e-06, |
|
"loss": 2.4439, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.7360541987652297, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 8.908755437422792e-06, |
|
"loss": 2.451, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.7369283723979676, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.85342379215412e-06, |
|
"loss": 2.4452, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.7378025460307054, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 8.798227516256854e-06, |
|
"loss": 2.4486, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.7386767196634432, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.743167072484549e-06, |
|
"loss": 2.4509, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.739550893296181, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.688242922451928e-06, |
|
"loss": 2.4726, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.7404250669289187, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 8.633455526631098e-06, |
|
"loss": 2.5167, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.7412992405616565, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 8.578805344347623e-06, |
|
"loss": 2.4329, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.7421734141943943, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 8.524292833776706e-06, |
|
"loss": 2.4932, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.7430475878271322, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 8.469918451939334e-06, |
|
"loss": 2.4598, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.74392176145987, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 8.415682654698459e-06, |
|
"loss": 2.4279, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.7447959350926078, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 8.361585896755181e-06, |
|
"loss": 2.459, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.7456701087253456, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.307628631644903e-06, |
|
"loss": 2.4687, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.7465442823580833, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.253811311733567e-06, |
|
"loss": 2.5331, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.7474184559908211, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.200134388213837e-06, |
|
"loss": 2.4478, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.748292629623559, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.146598311101317e-06, |
|
"loss": 2.4979, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.7491668032562968, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 8.09320352923081e-06, |
|
"loss": 2.4402, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.7500409768890346, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 8.039950490252505e-06, |
|
"loss": 2.4791, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.7509151505217724, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.986839640628268e-06, |
|
"loss": 2.4827, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.7517893241545102, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.93387142562787e-06, |
|
"loss": 2.4664, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.752663497787248, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.881046289325268e-06, |
|
"loss": 2.5298, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.7535376714199858, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.82836467459487e-06, |
|
"loss": 2.4936, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.7544118450527236, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.775827023107835e-06, |
|
"loss": 2.4657, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.7552860186854614, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.723433775328384e-06, |
|
"loss": 2.4861, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.7561601923181992, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.671185370510059e-06, |
|
"loss": 2.4551, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.757034365950937, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.619082246692103e-06, |
|
"loss": 2.4114, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.7579085395836748, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 7.567124840695708e-06, |
|
"loss": 2.3837, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.7587827132164127, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.515313588120451e-06, |
|
"loss": 2.4788, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.7596568868491504, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 7.463648923340558e-06, |
|
"loss": 2.45, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.7605310604818882, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.412131279501297e-06, |
|
"loss": 2.4414, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.761405234114626, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.36076108851537e-06, |
|
"loss": 2.4682, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.7622794077473638, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.309538781059239e-06, |
|
"loss": 2.5377, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.7631535813801016, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.258464786569549e-06, |
|
"loss": 2.4896, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.7640277550128394, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 7.207539533239527e-06, |
|
"loss": 2.5085, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.7649019286455773, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.156763448015377e-06, |
|
"loss": 2.4876, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.765776102278315, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.106136956592729e-06, |
|
"loss": 2.4602, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.7666502759110528, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 7.055660483413029e-06, |
|
"loss": 2.5205, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.7675244495437906, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 7.005334451660034e-06, |
|
"loss": 2.4711, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.7683986231765284, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.95515928325618e-06, |
|
"loss": 2.4817, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.7692727968092662, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 6.905135398859156e-06, |
|
"loss": 2.4179, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7701469704420041, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.855263217858279e-06, |
|
"loss": 2.4559, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.7710211440747419, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 6.805543158371028e-06, |
|
"loss": 2.4928, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.7718953177074797, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.7559756372395475e-06, |
|
"loss": 2.4632, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.7727694913402174, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 6.706561070027109e-06, |
|
"loss": 2.4679, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.7736436649729552, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.657299871014664e-06, |
|
"loss": 2.481, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.774517838605693, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.60819245319734e-06, |
|
"loss": 2.4626, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.7753920122384309, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.5592392282810364e-06, |
|
"loss": 2.5, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.7762661858711687, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.5104406066788915e-06, |
|
"loss": 2.4614, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.7771403595039065, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 6.461796997507899e-06, |
|
"loss": 2.5137, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.7780145331366443, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.4133088085854775e-06, |
|
"loss": 2.5302, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.778888706769382, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.3649764464260105e-06, |
|
"loss": 2.4871, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.7797628804021198, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 6.316800316237481e-06, |
|
"loss": 2.5117, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.7806370540348577, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.268780821918044e-06, |
|
"loss": 2.4903, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.7815112276675955, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.220918366052661e-06, |
|
"loss": 2.4107, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.7823854013003333, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.173213349909729e-06, |
|
"loss": 2.44, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7832595749330711, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 6.125666173437678e-06, |
|
"loss": 2.456, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.7841337485658089, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 6.078277235261681e-06, |
|
"loss": 2.4971, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.7850079221985466, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.031046932680229e-06, |
|
"loss": 2.4523, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.7858820958312845, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.983975661661889e-06, |
|
"loss": 2.4813, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.7867562694640223, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 5.93706381684192e-06, |
|
"loss": 2.4376, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7876304430967601, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.8903117915189875e-06, |
|
"loss": 2.466, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.7885046167294979, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.843719977651882e-06, |
|
"loss": 2.47, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.7893787903622357, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.7972887658561955e-06, |
|
"loss": 2.4402, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.7902529639949735, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.751018545401076e-06, |
|
"loss": 2.4675, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.7911271376277113, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.704909704205949e-06, |
|
"loss": 2.4422, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.7920013112604491, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 5.658962628837289e-06, |
|
"loss": 2.4023, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.7928754848931869, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 5.613177704505343e-06, |
|
"loss": 2.4977, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.7937496585259247, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.567555315060918e-06, |
|
"loss": 2.4663, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.7946238321586625, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 5.522095842992195e-06, |
|
"loss": 2.4664, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.7954980057914003, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.476799669421437e-06, |
|
"loss": 2.4706, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.7963721794241381, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 5.431667174101901e-06, |
|
"loss": 2.5069, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.797246353056876, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.3866987354145724e-06, |
|
"loss": 2.4853, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.7981205266896138, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.3418947303650185e-06, |
|
"loss": 2.4523, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.7989947003223515, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.297255534580256e-06, |
|
"loss": 2.4469, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.7998688739550893, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.252781522305556e-06, |
|
"loss": 2.4548, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.8007430475878271, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.208473066401329e-06, |
|
"loss": 2.4469, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.8016172212205649, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 5.164330538339995e-06, |
|
"loss": 2.4885, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.8024913948533028, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.120354308202893e-06, |
|
"loss": 2.4482, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.8033655684860406, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 5.076544744677128e-06, |
|
"loss": 2.5142, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.8042397421187784, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 5.032902215052515e-06, |
|
"loss": 2.524, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8051139157515161, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.989427085218523e-06, |
|
"loss": 2.4633, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.8059880893842539, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.946119719661121e-06, |
|
"loss": 2.4766, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.8068622630169917, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.902980481459835e-06, |
|
"loss": 2.5139, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.8077364366497296, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.860009732284609e-06, |
|
"loss": 2.4214, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.8086106102824674, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.817207832392842e-06, |
|
"loss": 2.4861, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.8094847839152052, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 2.4826, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.810358957547943, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.732112014408213e-06, |
|
"loss": 2.4694, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.8112331311806807, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.689818809740118e-06, |
|
"loss": 2.4753, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.8121073048134185, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.647695881199024e-06, |
|
"loss": 2.4846, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.8129814784461563, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.605743581934385e-06, |
|
"loss": 2.5179, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8138556520788942, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.563962263665114e-06, |
|
"loss": 2.4131, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.814729825711632, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.522352276676661e-06, |
|
"loss": 2.4216, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.8156039993443698, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.480913969818098e-06, |
|
"loss": 2.3917, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.8164781729771076, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.439647690499122e-06, |
|
"loss": 2.4532, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.8173523466098453, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.398553784687226e-06, |
|
"loss": 2.5099, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.8182265202425831, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.357632596904743e-06, |
|
"loss": 2.4413, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.819100693875321, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.31688447022599e-06, |
|
"loss": 2.4713, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.8199748675080588, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.276309746274368e-06, |
|
"loss": 2.4015, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.8208490411407966, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 4.235908765219504e-06, |
|
"loss": 2.4893, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.8217232147735344, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.195681865774406e-06, |
|
"loss": 2.4033, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8225973884062722, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.155629385192619e-06, |
|
"loss": 2.4699, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.82347156203901, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.115751659265407e-06, |
|
"loss": 2.4399, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.8243457356717478, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.0760490223189144e-06, |
|
"loss": 2.4812, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.8252199093044856, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.036521807211393e-06, |
|
"loss": 2.4413, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.8260940829372234, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.997170345330387e-06, |
|
"loss": 2.4954, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8269682565699612, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.957994966589965e-06, |
|
"loss": 2.4214, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.827842430202699, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.918995999427949e-06, |
|
"loss": 2.4462, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.8287166038354368, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.880173770803169e-06, |
|
"loss": 2.3887, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.8295907774681747, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.8415286061927265e-06, |
|
"loss": 2.4514, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.8304649511009125, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.8030608295892416e-06, |
|
"loss": 2.4721, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8313391247336502, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.764770763498163e-06, |
|
"loss": 2.5131, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.832213298366388, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.726658728935048e-06, |
|
"loss": 2.4598, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.8330874719991258, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.688725045422867e-06, |
|
"loss": 2.4106, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.8339616456318636, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.6509700309893618e-06, |
|
"loss": 2.4475, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.8348358192646015, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.613394002164322e-06, |
|
"loss": 2.4791, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8357099928973393, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.575997273976983e-06, |
|
"loss": 2.4782, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.8365841665300771, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 3.5387801599533475e-06, |
|
"loss": 2.4587, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.8374583401628148, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.5017429721135807e-06, |
|
"loss": 2.4663, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.8383325137955526, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.4648860209693794e-06, |
|
"loss": 2.4975, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.8392066874282904, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.428209615521377e-06, |
|
"loss": 2.3964, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8400808610610282, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.3917140632565624e-06, |
|
"loss": 2.4558, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.8409550346937661, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.35539967014567e-06, |
|
"loss": 2.5099, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.8418292083265039, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.319266740640661e-06, |
|
"loss": 2.4713, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.8427033819592417, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.283315577672122e-06, |
|
"loss": 2.4063, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.8435775555919794, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.2475464826467627e-06, |
|
"loss": 2.4831, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8444517292247172, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 3.2119597554448657e-06, |
|
"loss": 2.4549, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.845325902857455, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.1765556944177823e-06, |
|
"loss": 2.4457, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.8462000764901929, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.141334596385448e-06, |
|
"loss": 2.4424, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.8470742501229307, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.106296756633853e-06, |
|
"loss": 2.4523, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.8479484237556685, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.0714424689126024e-06, |
|
"loss": 2.4693, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8488225973884063, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.0367720254324357e-06, |
|
"loss": 2.4814, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.849696771021144, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.002285716862785e-06, |
|
"loss": 2.4373, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.8505709446538818, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 2.967983832329341e-06, |
|
"loss": 2.4637, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.8514451182866197, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.9338666594116134e-06, |
|
"loss": 2.4313, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.8523192919193575, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.8999344841405373e-06, |
|
"loss": 2.4572, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.8531934655520953, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.8661875909960695e-06, |
|
"loss": 2.4709, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.8540676391848331, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.8326262629047917e-06, |
|
"loss": 2.4401, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.8549418128175709, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.7992507812375556e-06, |
|
"loss": 2.46, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.8558159864503087, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 2.766061425807112e-06, |
|
"loss": 2.451, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.8566901600830465, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.733058474865785e-06, |
|
"loss": 2.4501, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8575643337157843, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.700242205103104e-06, |
|
"loss": 2.4789, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.8584385073485221, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.6676128916435256e-06, |
|
"loss": 2.4685, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.8593126809812599, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.635170808044077e-06, |
|
"loss": 2.5047, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.8601868546139977, |
|
"grad_norm": 0.75, |
|
"learning_rate": 2.602916226292121e-06, |
|
"loss": 2.4509, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.8610610282467355, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.5708494168030255e-06, |
|
"loss": 2.4669, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8619352018794733, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 2.538970648417921e-06, |
|
"loss": 2.4744, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.8628093755122112, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.507280188401456e-06, |
|
"loss": 2.4848, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.8636835491449489, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.475778302439524e-06, |
|
"loss": 2.5041, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.8645577227776867, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 2.444465254637063e-06, |
|
"loss": 2.434, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.8654318964104245, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.4133413075158344e-06, |
|
"loss": 2.4779, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8663060700431623, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.382406722012212e-06, |
|
"loss": 2.4035, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.8671802436759001, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.351661757475021e-06, |
|
"loss": 2.428, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.868054417308638, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.3211066716633257e-06, |
|
"loss": 2.4896, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.8689285909413758, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.2907417207443133e-06, |
|
"loss": 2.4964, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.8698027645741135, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.2605671592910824e-06, |
|
"loss": 2.36, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8706769382068513, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.23058324028059e-06, |
|
"loss": 2.4351, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.8715511118395891, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 2.200790215091464e-06, |
|
"loss": 2.51, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.8724252854723269, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 2.1711883335019225e-06, |
|
"loss": 2.4247, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.8732994591050648, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.1417778436876867e-06, |
|
"loss": 2.4565, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.8741736327378026, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.1125589922198845e-06, |
|
"loss": 2.4257, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1143, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 9.143132822175744e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|