Gwanwoo's picture
Upload folder using huggingface_hub
591221e verified
raw
history blame
55.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.974025974025974,
"eval_steps": 39,
"global_step": 308,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006493506493506494,
"grad_norm": 0.39616659283638,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.6277,
"step": 1
},
{
"epoch": 0.006493506493506494,
"eval_loss": 1.6379814147949219,
"eval_runtime": 25.2293,
"eval_samples_per_second": 11.693,
"eval_steps_per_second": 1.467,
"step": 1
},
{
"epoch": 0.012987012987012988,
"grad_norm": 0.42422759532928467,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.6549,
"step": 2
},
{
"epoch": 0.01948051948051948,
"grad_norm": 0.40566202998161316,
"learning_rate": 3e-06,
"loss": 1.6115,
"step": 3
},
{
"epoch": 0.025974025974025976,
"grad_norm": 0.3940100371837616,
"learning_rate": 4.000000000000001e-06,
"loss": 1.617,
"step": 4
},
{
"epoch": 0.032467532467532464,
"grad_norm": 0.3876812756061554,
"learning_rate": 5e-06,
"loss": 1.6561,
"step": 5
},
{
"epoch": 0.03896103896103896,
"grad_norm": 0.41130709648132324,
"learning_rate": 6e-06,
"loss": 1.606,
"step": 6
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.3654179573059082,
"learning_rate": 7e-06,
"loss": 1.605,
"step": 7
},
{
"epoch": 0.05194805194805195,
"grad_norm": 0.35875341296195984,
"learning_rate": 8.000000000000001e-06,
"loss": 1.5651,
"step": 8
},
{
"epoch": 0.05844155844155844,
"grad_norm": 0.3530851900577545,
"learning_rate": 9e-06,
"loss": 1.5201,
"step": 9
},
{
"epoch": 0.06493506493506493,
"grad_norm": 0.38295701146125793,
"learning_rate": 1e-05,
"loss": 1.6376,
"step": 10
},
{
"epoch": 0.07142857142857142,
"grad_norm": 0.39765068888664246,
"learning_rate": 9.999722154604716e-06,
"loss": 1.5167,
"step": 11
},
{
"epoch": 0.07792207792207792,
"grad_norm": 0.44107645750045776,
"learning_rate": 9.99888864929809e-06,
"loss": 1.6259,
"step": 12
},
{
"epoch": 0.08441558441558442,
"grad_norm": 0.4228634238243103,
"learning_rate": 9.997499576714369e-06,
"loss": 1.6767,
"step": 13
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.4122574031352997,
"learning_rate": 9.995555091232516e-06,
"loss": 1.6082,
"step": 14
},
{
"epoch": 0.09740259740259741,
"grad_norm": 0.43088310956954956,
"learning_rate": 9.99305540895907e-06,
"loss": 1.6651,
"step": 15
},
{
"epoch": 0.1038961038961039,
"grad_norm": 0.4325619637966156,
"learning_rate": 9.990000807704114e-06,
"loss": 1.6473,
"step": 16
},
{
"epoch": 0.11038961038961038,
"grad_norm": 0.4201916456222534,
"learning_rate": 9.986391626950405e-06,
"loss": 1.5726,
"step": 17
},
{
"epoch": 0.11688311688311688,
"grad_norm": 0.4616028070449829,
"learning_rate": 9.982228267815644e-06,
"loss": 1.5994,
"step": 18
},
{
"epoch": 0.12337662337662338,
"grad_norm": 0.4239194989204407,
"learning_rate": 9.977511193007896e-06,
"loss": 1.6018,
"step": 19
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.41312310099601746,
"learning_rate": 9.972240926774167e-06,
"loss": 1.6497,
"step": 20
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.45023512840270996,
"learning_rate": 9.966418054842143e-06,
"loss": 1.557,
"step": 21
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.4155130982398987,
"learning_rate": 9.960043224355081e-06,
"loss": 1.5782,
"step": 22
},
{
"epoch": 0.14935064935064934,
"grad_norm": 0.39791709184646606,
"learning_rate": 9.9531171437999e-06,
"loss": 1.548,
"step": 23
},
{
"epoch": 0.15584415584415584,
"grad_norm": 0.40294748544692993,
"learning_rate": 9.945640582928438e-06,
"loss": 1.5745,
"step": 24
},
{
"epoch": 0.16233766233766234,
"grad_norm": 0.3911401927471161,
"learning_rate": 9.937614372671896e-06,
"loss": 1.5653,
"step": 25
},
{
"epoch": 0.16883116883116883,
"grad_norm": 0.407604843378067,
"learning_rate": 9.929039405048502e-06,
"loss": 1.5627,
"step": 26
},
{
"epoch": 0.17532467532467533,
"grad_norm": 0.4176672697067261,
"learning_rate": 9.919916633064363e-06,
"loss": 1.5413,
"step": 27
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.40910884737968445,
"learning_rate": 9.91024707060755e-06,
"loss": 1.5688,
"step": 28
},
{
"epoch": 0.18831168831168832,
"grad_norm": 0.393100768327713,
"learning_rate": 9.900031792335432e-06,
"loss": 1.5259,
"step": 29
},
{
"epoch": 0.19480519480519481,
"grad_norm": 0.39668065309524536,
"learning_rate": 9.889271933555214e-06,
"loss": 1.5471,
"step": 30
},
{
"epoch": 0.2012987012987013,
"grad_norm": 0.37264811992645264,
"learning_rate": 9.877968690097785e-06,
"loss": 1.488,
"step": 31
},
{
"epoch": 0.2077922077922078,
"grad_norm": 0.39069730043411255,
"learning_rate": 9.866123318184803e-06,
"loss": 1.5863,
"step": 32
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.37075284123420715,
"learning_rate": 9.853737134289086e-06,
"loss": 1.4967,
"step": 33
},
{
"epoch": 0.22077922077922077,
"grad_norm": 0.37381497025489807,
"learning_rate": 9.840811514988294e-06,
"loss": 1.4219,
"step": 34
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.37528884410858154,
"learning_rate": 9.827347896811954e-06,
"loss": 1.4754,
"step": 35
},
{
"epoch": 0.23376623376623376,
"grad_norm": 0.3899977505207062,
"learning_rate": 9.81334777608179e-06,
"loss": 1.5201,
"step": 36
},
{
"epoch": 0.24025974025974026,
"grad_norm": 0.37340793013572693,
"learning_rate": 9.798812708745431e-06,
"loss": 1.4467,
"step": 37
},
{
"epoch": 0.24675324675324675,
"grad_norm": 0.3682302236557007,
"learning_rate": 9.783744310203492e-06,
"loss": 1.3911,
"step": 38
},
{
"epoch": 0.2532467532467532,
"grad_norm": 0.37759163975715637,
"learning_rate": 9.76814425513003e-06,
"loss": 1.4242,
"step": 39
},
{
"epoch": 0.2532467532467532,
"eval_loss": 1.451162338256836,
"eval_runtime": 25.2272,
"eval_samples_per_second": 11.694,
"eval_steps_per_second": 1.467,
"step": 39
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.3705059885978699,
"learning_rate": 9.752014277286433e-06,
"loss": 1.4542,
"step": 40
},
{
"epoch": 0.2662337662337662,
"grad_norm": 0.34180477261543274,
"learning_rate": 9.73535616932873e-06,
"loss": 1.4108,
"step": 41
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.34464818239212036,
"learning_rate": 9.718171782608355e-06,
"loss": 1.4691,
"step": 42
},
{
"epoch": 0.2792207792207792,
"grad_norm": 0.37641096115112305,
"learning_rate": 9.7004630269664e-06,
"loss": 1.3784,
"step": 43
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.335043340921402,
"learning_rate": 9.682231870521347e-06,
"loss": 1.4899,
"step": 44
},
{
"epoch": 0.2922077922077922,
"grad_norm": 0.3437371551990509,
"learning_rate": 9.663480339450344e-06,
"loss": 1.4003,
"step": 45
},
{
"epoch": 0.2987012987012987,
"grad_norm": 0.3729651868343353,
"learning_rate": 9.644210517764014e-06,
"loss": 1.3884,
"step": 46
},
{
"epoch": 0.3051948051948052,
"grad_norm": 0.33896854519844055,
"learning_rate": 9.624424547074851e-06,
"loss": 1.4258,
"step": 47
},
{
"epoch": 0.3116883116883117,
"grad_norm": 0.36020371317863464,
"learning_rate": 9.60412462635919e-06,
"loss": 1.3703,
"step": 48
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.3405916392803192,
"learning_rate": 9.583313011712832e-06,
"loss": 1.3407,
"step": 49
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.3447131812572479,
"learning_rate": 9.561992016100293e-06,
"loss": 1.3456,
"step": 50
},
{
"epoch": 0.33116883116883117,
"grad_norm": 0.35361772775650024,
"learning_rate": 9.540164009097756e-06,
"loss": 1.3384,
"step": 51
},
{
"epoch": 0.33766233766233766,
"grad_norm": 0.3525960147380829,
"learning_rate": 9.517831416629717e-06,
"loss": 1.3726,
"step": 52
},
{
"epoch": 0.34415584415584416,
"grad_norm": 0.3269696831703186,
"learning_rate": 9.494996720699363e-06,
"loss": 1.3664,
"step": 53
},
{
"epoch": 0.35064935064935066,
"grad_norm": 0.3153081238269806,
"learning_rate": 9.471662459112747e-06,
"loss": 1.3448,
"step": 54
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.3017883002758026,
"learning_rate": 9.44783122519672e-06,
"loss": 1.3228,
"step": 55
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.314627081155777,
"learning_rate": 9.423505667510724e-06,
"loss": 1.3565,
"step": 56
},
{
"epoch": 0.37012987012987014,
"grad_norm": 0.3048722743988037,
"learning_rate": 9.398688489552437e-06,
"loss": 1.2669,
"step": 57
},
{
"epoch": 0.37662337662337664,
"grad_norm": 0.3107397258281708,
"learning_rate": 9.373382449457305e-06,
"loss": 1.2871,
"step": 58
},
{
"epoch": 0.38311688311688313,
"grad_norm": 0.3061436116695404,
"learning_rate": 9.347590359692015e-06,
"loss": 1.3015,
"step": 59
},
{
"epoch": 0.38961038961038963,
"grad_norm": 0.31199324131011963,
"learning_rate": 9.321315086741916e-06,
"loss": 1.3196,
"step": 60
},
{
"epoch": 0.3961038961038961,
"grad_norm": 0.3002881109714508,
"learning_rate": 9.294559550792451e-06,
"loss": 1.3404,
"step": 61
},
{
"epoch": 0.4025974025974026,
"grad_norm": 0.2965368330478668,
"learning_rate": 9.2673267254046e-06,
"loss": 1.2931,
"step": 62
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.2912820279598236,
"learning_rate": 9.23961963718442e-06,
"loss": 1.2941,
"step": 63
},
{
"epoch": 0.4155844155844156,
"grad_norm": 0.29220762848854065,
"learning_rate": 9.211441365446661e-06,
"loss": 1.306,
"step": 64
},
{
"epoch": 0.42207792207792205,
"grad_norm": 0.2802380323410034,
"learning_rate": 9.182795041872543e-06,
"loss": 1.2641,
"step": 65
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.28170621395111084,
"learning_rate": 9.153683850161706e-06,
"loss": 1.2459,
"step": 66
},
{
"epoch": 0.43506493506493504,
"grad_norm": 0.2919502854347229,
"learning_rate": 9.124111025678378e-06,
"loss": 1.2736,
"step": 67
},
{
"epoch": 0.44155844155844154,
"grad_norm": 0.2793984115123749,
"learning_rate": 9.094079855091797e-06,
"loss": 1.265,
"step": 68
},
{
"epoch": 0.44805194805194803,
"grad_norm": 0.2946363091468811,
"learning_rate": 9.063593676010954e-06,
"loss": 1.2669,
"step": 69
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.2771857678890228,
"learning_rate": 9.032655876613636e-06,
"loss": 1.2479,
"step": 70
},
{
"epoch": 0.461038961038961,
"grad_norm": 0.26628848910331726,
"learning_rate": 9.001269895269886e-06,
"loss": 1.2764,
"step": 71
},
{
"epoch": 0.4675324675324675,
"grad_norm": 0.3006434440612793,
"learning_rate": 8.969439220159861e-06,
"loss": 1.2286,
"step": 72
},
{
"epoch": 0.474025974025974,
"grad_norm": 0.27274397015571594,
"learning_rate": 8.937167388886163e-06,
"loss": 1.3059,
"step": 73
},
{
"epoch": 0.4805194805194805,
"grad_norm": 0.2763414978981018,
"learning_rate": 8.904457988080682e-06,
"loss": 1.2095,
"step": 74
},
{
"epoch": 0.487012987012987,
"grad_norm": 0.28402575850486755,
"learning_rate": 8.871314653005972e-06,
"loss": 1.2601,
"step": 75
},
{
"epoch": 0.4935064935064935,
"grad_norm": 0.2892557680606842,
"learning_rate": 8.837741067151251e-06,
"loss": 1.2342,
"step": 76
},
{
"epoch": 0.5,
"grad_norm": 0.2644696533679962,
"learning_rate": 8.80374096182301e-06,
"loss": 1.2744,
"step": 77
},
{
"epoch": 0.5064935064935064,
"grad_norm": 0.25198492407798767,
"learning_rate": 8.76931811573033e-06,
"loss": 1.2885,
"step": 78
},
{
"epoch": 0.5064935064935064,
"eval_loss": 1.2617864608764648,
"eval_runtime": 25.3815,
"eval_samples_per_second": 11.623,
"eval_steps_per_second": 1.458,
"step": 78
},
{
"epoch": 0.512987012987013,
"grad_norm": 0.2804642915725708,
"learning_rate": 8.734476354564924e-06,
"loss": 1.2583,
"step": 79
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.2729627788066864,
"learning_rate": 8.699219550575954e-06,
"loss": 1.2246,
"step": 80
},
{
"epoch": 0.525974025974026,
"grad_norm": 0.2520177662372589,
"learning_rate": 8.663551622139674e-06,
"loss": 1.2599,
"step": 81
},
{
"epoch": 0.5324675324675324,
"grad_norm": 0.2614675760269165,
"learning_rate": 8.627476533323957e-06,
"loss": 1.2165,
"step": 82
},
{
"epoch": 0.538961038961039,
"grad_norm": 0.3191888928413391,
"learning_rate": 8.590998293447728e-06,
"loss": 1.2558,
"step": 83
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.27159151434898376,
"learning_rate": 8.554120956635375e-06,
"loss": 1.2197,
"step": 84
},
{
"epoch": 0.551948051948052,
"grad_norm": 0.291990727186203,
"learning_rate": 8.516848621366188e-06,
"loss": 1.219,
"step": 85
},
{
"epoch": 0.5584415584415584,
"grad_norm": 0.23849813640117645,
"learning_rate": 8.47918543001886e-06,
"loss": 1.2199,
"step": 86
},
{
"epoch": 0.564935064935065,
"grad_norm": 0.26883506774902344,
"learning_rate": 8.441135568411102e-06,
"loss": 1.1959,
"step": 87
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.2667544484138489,
"learning_rate": 8.402703265334455e-06,
"loss": 1.2682,
"step": 88
},
{
"epoch": 0.577922077922078,
"grad_norm": 0.24013420939445496,
"learning_rate": 8.363892792084291e-06,
"loss": 1.1649,
"step": 89
},
{
"epoch": 0.5844155844155844,
"grad_norm": 0.2544495463371277,
"learning_rate": 8.324708461985124e-06,
"loss": 1.1929,
"step": 90
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.2654297351837158,
"learning_rate": 8.285154629911227e-06,
"loss": 1.18,
"step": 91
},
{
"epoch": 0.5974025974025974,
"grad_norm": 0.26719850301742554,
"learning_rate": 8.245235691802644e-06,
"loss": 1.2933,
"step": 92
},
{
"epoch": 0.6038961038961039,
"grad_norm": 0.2760712802410126,
"learning_rate": 8.20495608417663e-06,
"loss": 1.1612,
"step": 93
},
{
"epoch": 0.6103896103896104,
"grad_norm": 0.2652733623981476,
"learning_rate": 8.164320283634585e-06,
"loss": 1.2125,
"step": 94
},
{
"epoch": 0.6168831168831169,
"grad_norm": 0.24604123830795288,
"learning_rate": 8.123332806364537e-06,
"loss": 1.1801,
"step": 95
},
{
"epoch": 0.6233766233766234,
"grad_norm": 0.23077791929244995,
"learning_rate": 8.081998207639212e-06,
"loss": 1.2016,
"step": 96
},
{
"epoch": 0.6298701298701299,
"grad_norm": 0.25489139556884766,
"learning_rate": 8.040321081309783e-06,
"loss": 1.2049,
"step": 97
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.2564036250114441,
"learning_rate": 7.998306059295302e-06,
"loss": 1.2377,
"step": 98
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.2734230160713196,
"learning_rate": 7.955957811067932e-06,
"loss": 1.1107,
"step": 99
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.2672719359397888,
"learning_rate": 7.913281043133978e-06,
"loss": 1.1863,
"step": 100
},
{
"epoch": 0.6558441558441559,
"grad_norm": 0.263724684715271,
"learning_rate": 7.870280498510824e-06,
"loss": 1.2678,
"step": 101
},
{
"epoch": 0.6623376623376623,
"grad_norm": 0.27098724246025085,
"learning_rate": 7.826960956199796e-06,
"loss": 1.1656,
"step": 102
},
{
"epoch": 0.6688311688311688,
"grad_norm": 0.29257479310035706,
"learning_rate": 7.783327230655036e-06,
"loss": 1.1749,
"step": 103
},
{
"epoch": 0.6753246753246753,
"grad_norm": 0.26874226331710815,
"learning_rate": 7.739384171248436e-06,
"loss": 1.2013,
"step": 104
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.26934632658958435,
"learning_rate": 7.695136661730677e-06,
"loss": 1.1507,
"step": 105
},
{
"epoch": 0.6883116883116883,
"grad_norm": 0.27807483077049255,
"learning_rate": 7.650589619688468e-06,
"loss": 1.1729,
"step": 106
},
{
"epoch": 0.6948051948051948,
"grad_norm": 0.2936646342277527,
"learning_rate": 7.6057479959980145e-06,
"loss": 1.1646,
"step": 107
},
{
"epoch": 0.7012987012987013,
"grad_norm": 0.28149378299713135,
"learning_rate": 7.560616774274775e-06,
"loss": 1.1939,
"step": 108
},
{
"epoch": 0.7077922077922078,
"grad_norm": 0.25706660747528076,
"learning_rate": 7.5152009703196105e-06,
"loss": 1.1708,
"step": 109
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.29964110255241394,
"learning_rate": 7.469505631561318e-06,
"loss": 1.2161,
"step": 110
},
{
"epoch": 0.7207792207792207,
"grad_norm": 0.24933487176895142,
"learning_rate": 7.423535836495683e-06,
"loss": 1.1641,
"step": 111
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.27039459347724915,
"learning_rate": 7.3772966941210585e-06,
"loss": 1.1563,
"step": 112
},
{
"epoch": 0.7337662337662337,
"grad_norm": 0.2490512579679489,
"learning_rate": 7.33079334337056e-06,
"loss": 1.1887,
"step": 113
},
{
"epoch": 0.7402597402597403,
"grad_norm": 0.28315550088882446,
"learning_rate": 7.284030952540937e-06,
"loss": 1.1189,
"step": 114
},
{
"epoch": 0.7467532467532467,
"grad_norm": 0.2557179033756256,
"learning_rate": 7.2370147187181736e-06,
"loss": 1.1812,
"step": 115
},
{
"epoch": 0.7532467532467533,
"grad_norm": 0.2875461280345917,
"learning_rate": 7.189749867199899e-06,
"loss": 1.1534,
"step": 116
},
{
"epoch": 0.7597402597402597,
"grad_norm": 0.26117077469825745,
"learning_rate": 7.142241650914654e-06,
"loss": 1.1618,
"step": 117
},
{
"epoch": 0.7597402597402597,
"eval_loss": 1.2008626461029053,
"eval_runtime": 25.2533,
"eval_samples_per_second": 11.682,
"eval_steps_per_second": 1.465,
"step": 117
},
{
"epoch": 0.7662337662337663,
"grad_norm": 0.29663676023483276,
"learning_rate": 7.094495349838093e-06,
"loss": 1.1064,
"step": 118
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.23107394576072693,
"learning_rate": 7.046516270406174e-06,
"loss": 1.1464,
"step": 119
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.2502164840698242,
"learning_rate": 6.998309744925411e-06,
"loss": 1.1998,
"step": 120
},
{
"epoch": 0.7857142857142857,
"grad_norm": 0.25331148505210876,
"learning_rate": 6.9498811309802595e-06,
"loss": 1.1784,
"step": 121
},
{
"epoch": 0.7922077922077922,
"grad_norm": 0.2596096694469452,
"learning_rate": 6.901235810837668e-06,
"loss": 1.1034,
"step": 122
},
{
"epoch": 0.7987012987012987,
"grad_norm": 0.26797452569007874,
"learning_rate": 6.852379190848923e-06,
"loss": 1.1264,
"step": 123
},
{
"epoch": 0.8051948051948052,
"grad_norm": 0.31169766187667847,
"learning_rate": 6.8033167008487784e-06,
"loss": 1.1386,
"step": 124
},
{
"epoch": 0.8116883116883117,
"grad_norm": 0.26767072081565857,
"learning_rate": 6.754053793552005e-06,
"loss": 1.2137,
"step": 125
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.2596385180950165,
"learning_rate": 6.704595943947385e-06,
"loss": 1.1466,
"step": 126
},
{
"epoch": 0.8246753246753247,
"grad_norm": 0.27891236543655396,
"learning_rate": 6.654948648689228e-06,
"loss": 1.1371,
"step": 127
},
{
"epoch": 0.8311688311688312,
"grad_norm": 0.28439176082611084,
"learning_rate": 6.605117425486483e-06,
"loss": 1.1948,
"step": 128
},
{
"epoch": 0.8376623376623377,
"grad_norm": 0.2944129705429077,
"learning_rate": 6.555107812489513e-06,
"loss": 1.1169,
"step": 129
},
{
"epoch": 0.8441558441558441,
"grad_norm": 0.2609187960624695,
"learning_rate": 6.504925367674595e-06,
"loss": 1.1503,
"step": 130
},
{
"epoch": 0.8506493506493507,
"grad_norm": 0.27614086866378784,
"learning_rate": 6.454575668226215e-06,
"loss": 1.1835,
"step": 131
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.2971368134021759,
"learning_rate": 6.40406430991723e-06,
"loss": 1.1816,
"step": 132
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.28436651825904846,
"learning_rate": 6.353396906486971e-06,
"loss": 1.1947,
"step": 133
},
{
"epoch": 0.8701298701298701,
"grad_norm": 0.2339404821395874,
"learning_rate": 6.302579089017328e-06,
"loss": 1.1027,
"step": 134
},
{
"epoch": 0.8766233766233766,
"grad_norm": 0.27757248282432556,
"learning_rate": 6.251616505306933e-06,
"loss": 1.1294,
"step": 135
},
{
"epoch": 0.8831168831168831,
"grad_norm": 0.27656033635139465,
"learning_rate": 6.200514819243476e-06,
"loss": 1.1313,
"step": 136
},
{
"epoch": 0.8896103896103896,
"grad_norm": 0.26819008588790894,
"learning_rate": 6.149279710174219e-06,
"loss": 1.2036,
"step": 137
},
{
"epoch": 0.8961038961038961,
"grad_norm": 0.3008396029472351,
"learning_rate": 6.097916872274815e-06,
"loss": 1.1512,
"step": 138
},
{
"epoch": 0.9025974025974026,
"grad_norm": 0.29651182889938354,
"learning_rate": 6.046432013916467e-06,
"loss": 1.1412,
"step": 139
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.275259405374527,
"learning_rate": 5.9948308570315e-06,
"loss": 1.1726,
"step": 140
},
{
"epoch": 0.9155844155844156,
"grad_norm": 0.26858457922935486,
"learning_rate": 5.943119136477449e-06,
"loss": 1.1701,
"step": 141
},
{
"epoch": 0.922077922077922,
"grad_norm": 0.273671954870224,
"learning_rate": 5.891302599399686e-06,
"loss": 1.165,
"step": 142
},
{
"epoch": 0.9285714285714286,
"grad_norm": 0.26044774055480957,
"learning_rate": 5.839387004592705e-06,
"loss": 1.1119,
"step": 143
},
{
"epoch": 0.935064935064935,
"grad_norm": 0.24865947663784027,
"learning_rate": 5.78737812186009e-06,
"loss": 1.1598,
"step": 144
},
{
"epoch": 0.9415584415584416,
"grad_norm": 0.2713409960269928,
"learning_rate": 5.735281731373271e-06,
"loss": 1.1543,
"step": 145
},
{
"epoch": 0.948051948051948,
"grad_norm": 0.2865453362464905,
"learning_rate": 5.6831036230291345e-06,
"loss": 1.1379,
"step": 146
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.26891422271728516,
"learning_rate": 5.630849595806534e-06,
"loss": 1.1382,
"step": 147
},
{
"epoch": 0.961038961038961,
"grad_norm": 0.3001209795475006,
"learning_rate": 5.578525457121807e-06,
"loss": 1.1674,
"step": 148
},
{
"epoch": 0.9675324675324676,
"grad_norm": 0.2672886848449707,
"learning_rate": 5.526137022183356e-06,
"loss": 1.1209,
"step": 149
},
{
"epoch": 0.974025974025974,
"grad_norm": 0.27608615159988403,
"learning_rate": 5.473690113345343e-06,
"loss": 1.1855,
"step": 150
},
{
"epoch": 0.9805194805194806,
"grad_norm": 0.2823050618171692,
"learning_rate": 5.4211905594606165e-06,
"loss": 1.1433,
"step": 151
},
{
"epoch": 0.987012987012987,
"grad_norm": 0.2888166308403015,
"learning_rate": 5.368644195232896e-06,
"loss": 1.1413,
"step": 152
},
{
"epoch": 0.9935064935064936,
"grad_norm": 0.2720174193382263,
"learning_rate": 5.316056860568318e-06,
"loss": 1.1657,
"step": 153
},
{
"epoch": 1.0,
"grad_norm": 0.28704148530960083,
"learning_rate": 5.2634343999263985e-06,
"loss": 1.1606,
"step": 154
},
{
"epoch": 1.0064935064935066,
"grad_norm": 0.2424069494009018,
"learning_rate": 5.210782661670486e-06,
"loss": 1.1506,
"step": 155
},
{
"epoch": 1.0129870129870129,
"grad_norm": 0.2754787802696228,
"learning_rate": 5.158107497417795e-06,
"loss": 1.1186,
"step": 156
},
{
"epoch": 1.0129870129870129,
"eval_loss": 1.1744325160980225,
"eval_runtime": 25.2167,
"eval_samples_per_second": 11.699,
"eval_steps_per_second": 1.467,
"step": 156
},
{
"epoch": 1.0194805194805194,
"grad_norm": 0.2637002170085907,
"learning_rate": 5.105414761389056e-06,
"loss": 1.178,
"step": 157
},
{
"epoch": 1.025974025974026,
"grad_norm": 0.26754647493362427,
"learning_rate": 5.052710309757899e-06,
"loss": 1.1329,
"step": 158
},
{
"epoch": 1.0064935064935066,
"grad_norm": 0.2765344977378845,
"learning_rate": 5e-06,
"loss": 1.0933,
"step": 159
},
{
"epoch": 1.0129870129870129,
"grad_norm": 0.2661243677139282,
"learning_rate": 4.947289690242103e-06,
"loss": 1.0931,
"step": 160
},
{
"epoch": 1.0194805194805194,
"grad_norm": 0.2913898527622223,
"learning_rate": 4.894585238610946e-06,
"loss": 1.0963,
"step": 161
},
{
"epoch": 1.025974025974026,
"grad_norm": 0.2579714357852936,
"learning_rate": 4.841892502582206e-06,
"loss": 1.1984,
"step": 162
},
{
"epoch": 1.0324675324675325,
"grad_norm": 0.2802336513996124,
"learning_rate": 4.789217338329515e-06,
"loss": 1.1592,
"step": 163
},
{
"epoch": 1.0389610389610389,
"grad_norm": 0.27885061502456665,
"learning_rate": 4.736565600073602e-06,
"loss": 1.1184,
"step": 164
},
{
"epoch": 1.0454545454545454,
"grad_norm": 0.26521897315979004,
"learning_rate": 4.683943139431683e-06,
"loss": 1.1685,
"step": 165
},
{
"epoch": 1.051948051948052,
"grad_norm": 0.30475732684135437,
"learning_rate": 4.631355804767106e-06,
"loss": 1.105,
"step": 166
},
{
"epoch": 1.0584415584415585,
"grad_norm": 0.2804529070854187,
"learning_rate": 4.578809440539386e-06,
"loss": 1.0735,
"step": 167
},
{
"epoch": 1.0649350649350648,
"grad_norm": 0.2721943259239197,
"learning_rate": 4.526309886654659e-06,
"loss": 1.187,
"step": 168
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.2899441421031952,
"learning_rate": 4.473862977816647e-06,
"loss": 1.1375,
"step": 169
},
{
"epoch": 1.077922077922078,
"grad_norm": 0.27229487895965576,
"learning_rate": 4.4214745428781946e-06,
"loss": 1.1079,
"step": 170
},
{
"epoch": 1.0844155844155845,
"grad_norm": 0.28253939747810364,
"learning_rate": 4.369150404193467e-06,
"loss": 1.1283,
"step": 171
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.31305843591690063,
"learning_rate": 4.316896376970866e-06,
"loss": 1.1576,
"step": 172
},
{
"epoch": 1.0974025974025974,
"grad_norm": 0.26900458335876465,
"learning_rate": 4.264718268626729e-06,
"loss": 1.1694,
"step": 173
},
{
"epoch": 1.103896103896104,
"grad_norm": 0.27618759870529175,
"learning_rate": 4.212621878139912e-06,
"loss": 1.0882,
"step": 174
},
{
"epoch": 1.1103896103896105,
"grad_norm": 0.3342863619327545,
"learning_rate": 4.160612995407296e-06,
"loss": 1.1103,
"step": 175
},
{
"epoch": 1.1168831168831168,
"grad_norm": 0.3191912770271301,
"learning_rate": 4.108697400600316e-06,
"loss": 1.1528,
"step": 176
},
{
"epoch": 1.1233766233766234,
"grad_norm": 0.28136956691741943,
"learning_rate": 4.056880863522553e-06,
"loss": 1.1239,
"step": 177
},
{
"epoch": 1.12987012987013,
"grad_norm": 0.3044598400592804,
"learning_rate": 4.005169142968503e-06,
"loss": 1.1396,
"step": 178
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.26966431736946106,
"learning_rate": 3.953567986083535e-06,
"loss": 1.1166,
"step": 179
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.2642230689525604,
"learning_rate": 3.902083127725186e-06,
"loss": 1.1156,
"step": 180
},
{
"epoch": 1.1493506493506493,
"grad_norm": 0.27723589539527893,
"learning_rate": 3.850720289825783e-06,
"loss": 1.106,
"step": 181
},
{
"epoch": 1.155844155844156,
"grad_norm": 0.25195732712745667,
"learning_rate": 3.799485180756526e-06,
"loss": 1.1473,
"step": 182
},
{
"epoch": 1.1623376623376624,
"grad_norm": 0.2634832561016083,
"learning_rate": 3.7483834946930682e-06,
"loss": 1.12,
"step": 183
},
{
"epoch": 1.1688311688311688,
"grad_norm": 0.2719174027442932,
"learning_rate": 3.6974209109826724e-06,
"loss": 1.1379,
"step": 184
},
{
"epoch": 1.1753246753246753,
"grad_norm": 0.2815384566783905,
"learning_rate": 3.64660309351303e-06,
"loss": 1.1548,
"step": 185
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.2685678005218506,
"learning_rate": 3.595935690082769e-06,
"loss": 1.1321,
"step": 186
},
{
"epoch": 1.1883116883116882,
"grad_norm": 0.282886803150177,
"learning_rate": 3.545424331773787e-06,
"loss": 1.159,
"step": 187
},
{
"epoch": 1.1948051948051948,
"grad_norm": 0.2961716055870056,
"learning_rate": 3.495074632325407e-06,
"loss": 1.0941,
"step": 188
},
{
"epoch": 1.2012987012987013,
"grad_norm": 0.26541033387184143,
"learning_rate": 3.4448921875104898e-06,
"loss": 1.1487,
"step": 189
},
{
"epoch": 1.2077922077922079,
"grad_norm": 0.3059718608856201,
"learning_rate": 3.3948825745135196e-06,
"loss": 1.1529,
"step": 190
},
{
"epoch": 1.2142857142857142,
"grad_norm": 0.2965088486671448,
"learning_rate": 3.345051351310774e-06,
"loss": 1.0652,
"step": 191
},
{
"epoch": 1.2207792207792207,
"grad_norm": 0.2668808102607727,
"learning_rate": 3.295404056052616e-06,
"loss": 1.1231,
"step": 192
},
{
"epoch": 1.2272727272727273,
"grad_norm": 0.2890554666519165,
"learning_rate": 3.2459462064479972e-06,
"loss": 1.15,
"step": 193
},
{
"epoch": 1.2337662337662338,
"grad_norm": 0.25789421796798706,
"learning_rate": 3.1966832991512232e-06,
"loss": 1.1434,
"step": 194
},
{
"epoch": 1.2402597402597402,
"grad_norm": 0.28866487741470337,
"learning_rate": 3.147620809151078e-06,
"loss": 1.1237,
"step": 195
},
{
"epoch": 1.2402597402597402,
"eval_loss": 1.1614325046539307,
"eval_runtime": 25.3147,
"eval_samples_per_second": 11.653,
"eval_steps_per_second": 1.462,
"step": 195
},
{
"epoch": 1.2467532467532467,
"grad_norm": 0.26898378133773804,
"learning_rate": 3.098764189162332e-06,
"loss": 1.1265,
"step": 196
},
{
"epoch": 1.2532467532467533,
"grad_norm": 0.2718038260936737,
"learning_rate": 3.0501188690197418e-06,
"loss": 1.1012,
"step": 197
},
{
"epoch": 1.2597402597402598,
"grad_norm": 0.27878862619400024,
"learning_rate": 3.0016902550745896e-06,
"loss": 1.1454,
"step": 198
},
{
"epoch": 1.2662337662337662,
"grad_norm": 0.28224632143974304,
"learning_rate": 2.9534837295938268e-06,
"loss": 1.1561,
"step": 199
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.2997869849205017,
"learning_rate": 2.9055046501619088e-06,
"loss": 1.1408,
"step": 200
},
{
"epoch": 1.2792207792207793,
"grad_norm": 0.29859447479248047,
"learning_rate": 2.857758349085348e-06,
"loss": 1.1105,
"step": 201
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.2930034101009369,
"learning_rate": 2.810250132800103e-06,
"loss": 1.1113,
"step": 202
},
{
"epoch": 1.2922077922077921,
"grad_norm": 0.29845744371414185,
"learning_rate": 2.762985281281828e-06,
"loss": 1.1702,
"step": 203
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.27331170439720154,
"learning_rate": 2.715969047459066e-06,
"loss": 1.1197,
"step": 204
},
{
"epoch": 1.3051948051948052,
"grad_norm": 0.31652095913887024,
"learning_rate": 2.6692066566294393e-06,
"loss": 1.1422,
"step": 205
},
{
"epoch": 1.3116883116883118,
"grad_norm": 0.3164924383163452,
"learning_rate": 2.622703305878941e-06,
"loss": 1.1179,
"step": 206
},
{
"epoch": 1.3181818181818181,
"grad_norm": 0.25467604398727417,
"learning_rate": 2.5764641635043174e-06,
"loss": 1.0839,
"step": 207
},
{
"epoch": 1.3246753246753247,
"grad_norm": 0.2731601297855377,
"learning_rate": 2.530494368438683e-06,
"loss": 1.1033,
"step": 208
},
{
"epoch": 1.3311688311688312,
"grad_norm": 0.2869837284088135,
"learning_rate": 2.4847990296803907e-06,
"loss": 1.1179,
"step": 209
},
{
"epoch": 1.3376623376623376,
"grad_norm": 0.2819492816925049,
"learning_rate": 2.4393832257252253e-06,
"loss": 1.012,
"step": 210
},
{
"epoch": 1.344155844155844,
"grad_norm": 0.267045259475708,
"learning_rate": 2.394252004001989e-06,
"loss": 1.0931,
"step": 211
},
{
"epoch": 1.3506493506493507,
"grad_norm": 0.32312628626823425,
"learning_rate": 2.349410380311532e-06,
"loss": 1.1414,
"step": 212
},
{
"epoch": 1.3571428571428572,
"grad_norm": 0.25884246826171875,
"learning_rate": 2.304863338269326e-06,
"loss": 1.126,
"step": 213
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.3011741638183594,
"learning_rate": 2.2606158287515662e-06,
"loss": 1.1247,
"step": 214
},
{
"epoch": 1.37012987012987,
"grad_norm": 0.2756052017211914,
"learning_rate": 2.216672769344965e-06,
"loss": 1.1196,
"step": 215
},
{
"epoch": 1.3766233766233766,
"grad_norm": 0.2793034613132477,
"learning_rate": 2.1730390438002056e-06,
"loss": 1.1372,
"step": 216
},
{
"epoch": 1.3831168831168832,
"grad_norm": 0.2940700948238373,
"learning_rate": 2.129719501489177e-06,
"loss": 1.0846,
"step": 217
},
{
"epoch": 1.3896103896103895,
"grad_norm": 0.30984777212142944,
"learning_rate": 2.086718956866024e-06,
"loss": 1.1404,
"step": 218
},
{
"epoch": 1.396103896103896,
"grad_norm": 0.25352832674980164,
"learning_rate": 2.044042188932068e-06,
"loss": 1.1223,
"step": 219
},
{
"epoch": 1.4025974025974026,
"grad_norm": 0.31949445605278015,
"learning_rate": 2.0016939407046987e-06,
"loss": 1.0899,
"step": 220
},
{
"epoch": 1.4090909090909092,
"grad_norm": 0.2721538245677948,
"learning_rate": 1.9596789186902184e-06,
"loss": 1.0571,
"step": 221
},
{
"epoch": 1.4155844155844157,
"grad_norm": 0.2586475610733032,
"learning_rate": 1.9180017923607884e-06,
"loss": 1.0164,
"step": 222
},
{
"epoch": 1.422077922077922,
"grad_norm": 0.29858991503715515,
"learning_rate": 1.8766671936354647e-06,
"loss": 1.155,
"step": 223
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.31114456057548523,
"learning_rate": 1.8356797163654172e-06,
"loss": 1.1546,
"step": 224
},
{
"epoch": 1.435064935064935,
"grad_norm": 0.27457138895988464,
"learning_rate": 1.795043915823373e-06,
"loss": 1.1159,
"step": 225
},
{
"epoch": 1.4415584415584415,
"grad_norm": 0.31147733330726624,
"learning_rate": 1.754764308197358e-06,
"loss": 1.1316,
"step": 226
},
{
"epoch": 1.448051948051948,
"grad_norm": 0.30539748072624207,
"learning_rate": 1.7148453700887747e-06,
"loss": 1.1135,
"step": 227
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.2620432674884796,
"learning_rate": 1.6752915380148772e-06,
"loss": 1.136,
"step": 228
},
{
"epoch": 1.4610389610389611,
"grad_norm": 0.3202950060367584,
"learning_rate": 1.6361072079157092e-06,
"loss": 1.1635,
"step": 229
},
{
"epoch": 1.4675324675324675,
"grad_norm": 0.3076479732990265,
"learning_rate": 1.5972967346655449e-06,
"loss": 1.1305,
"step": 230
},
{
"epoch": 1.474025974025974,
"grad_norm": 0.2635696828365326,
"learning_rate": 1.5588644315888978e-06,
"loss": 1.0887,
"step": 231
},
{
"epoch": 1.4805194805194806,
"grad_norm": 0.30404505133628845,
"learning_rate": 1.5208145699811417e-06,
"loss": 1.1176,
"step": 232
},
{
"epoch": 1.487012987012987,
"grad_norm": 0.29464152455329895,
"learning_rate": 1.4831513786338126e-06,
"loss": 1.1724,
"step": 233
},
{
"epoch": 1.4935064935064934,
"grad_norm": 0.3045382499694824,
"learning_rate": 1.4458790433646264e-06,
"loss": 1.1151,
"step": 234
},
{
"epoch": 1.4935064935064934,
"eval_loss": 1.1553453207015991,
"eval_runtime": 25.2983,
"eval_samples_per_second": 11.661,
"eval_steps_per_second": 1.463,
"step": 234
},
{
"epoch": 1.5,
"grad_norm": 0.2871938943862915,
"learning_rate": 1.4090017065522731e-06,
"loss": 1.1687,
"step": 235
},
{
"epoch": 1.5064935064935066,
"grad_norm": 0.2857874035835266,
"learning_rate": 1.3725234666760428e-06,
"loss": 1.1089,
"step": 236
},
{
"epoch": 1.512987012987013,
"grad_norm": 0.3428025245666504,
"learning_rate": 1.3364483778603272e-06,
"loss": 1.1335,
"step": 237
},
{
"epoch": 1.5194805194805194,
"grad_norm": 0.3187818229198456,
"learning_rate": 1.3007804494240478e-06,
"loss": 1.163,
"step": 238
},
{
"epoch": 1.525974025974026,
"grad_norm": 0.28887903690338135,
"learning_rate": 1.2655236454350777e-06,
"loss": 1.119,
"step": 239
},
{
"epoch": 1.5324675324675323,
"grad_norm": 0.28905734419822693,
"learning_rate": 1.2306818842696716e-06,
"loss": 1.1448,
"step": 240
},
{
"epoch": 1.5389610389610389,
"grad_norm": 0.25675976276397705,
"learning_rate": 1.1962590381769923e-06,
"loss": 1.1513,
"step": 241
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.2782272696495056,
"learning_rate": 1.1622589328487505e-06,
"loss": 1.0953,
"step": 242
},
{
"epoch": 1.551948051948052,
"grad_norm": 0.2892334461212158,
"learning_rate": 1.128685346994028e-06,
"loss": 1.1721,
"step": 243
},
{
"epoch": 1.5584415584415585,
"grad_norm": 0.31845828890800476,
"learning_rate": 1.09554201191932e-06,
"loss": 1.1076,
"step": 244
},
{
"epoch": 1.564935064935065,
"grad_norm": 0.27880439162254333,
"learning_rate": 1.0628326111138377e-06,
"loss": 1.1691,
"step": 245
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.2526175081729889,
"learning_rate": 1.03056077984014e-06,
"loss": 1.1443,
"step": 246
},
{
"epoch": 1.577922077922078,
"grad_norm": 0.2798580527305603,
"learning_rate": 9.98730104730115e-07,
"loss": 1.1156,
"step": 247
},
{
"epoch": 1.5844155844155843,
"grad_norm": 0.3181169927120209,
"learning_rate": 9.673441233863661e-07,
"loss": 1.0997,
"step": 248
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.2808782756328583,
"learning_rate": 9.364063239890476e-07,
"loss": 1.1658,
"step": 249
},
{
"epoch": 1.5974025974025974,
"grad_norm": 0.2726938724517822,
"learning_rate": 9.059201449082045e-07,
"loss": 1.0976,
"step": 250
},
{
"epoch": 1.603896103896104,
"grad_norm": 0.29625144600868225,
"learning_rate": 8.758889743216247e-07,
"loss": 1.1165,
"step": 251
},
{
"epoch": 1.6103896103896105,
"grad_norm": 0.29083919525146484,
"learning_rate": 8.463161498382949e-07,
"loss": 1.158,
"step": 252
},
{
"epoch": 1.616883116883117,
"grad_norm": 0.27802902460098267,
"learning_rate": 8.172049581274571e-07,
"loss": 1.1136,
"step": 253
},
{
"epoch": 1.6233766233766234,
"grad_norm": 0.2722671627998352,
"learning_rate": 7.885586345533397e-07,
"loss": 1.1543,
"step": 254
},
{
"epoch": 1.62987012987013,
"grad_norm": 0.26996558904647827,
"learning_rate": 7.603803628155821e-07,
"loss": 1.1196,
"step": 255
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.2988632023334503,
"learning_rate": 7.326732745954001e-07,
"loss": 1.1126,
"step": 256
},
{
"epoch": 1.6428571428571428,
"grad_norm": 0.2721429467201233,
"learning_rate": 7.054404492075512e-07,
"loss": 1.1334,
"step": 257
},
{
"epoch": 1.6493506493506493,
"grad_norm": 0.2960205078125,
"learning_rate": 6.786849132580841e-07,
"loss": 1.1289,
"step": 258
},
{
"epoch": 1.655844155844156,
"grad_norm": 0.2805614173412323,
"learning_rate": 6.524096403079861e-07,
"loss": 1.1566,
"step": 259
},
{
"epoch": 1.6623376623376624,
"grad_norm": 0.26631271839141846,
"learning_rate": 6.266175505426958e-07,
"loss": 1.1385,
"step": 260
},
{
"epoch": 1.6688311688311688,
"grad_norm": 0.2959340810775757,
"learning_rate": 6.013115104475653e-07,
"loss": 1.061,
"step": 261
},
{
"epoch": 1.6753246753246753,
"grad_norm": 0.3008975088596344,
"learning_rate": 5.76494332489278e-07,
"loss": 1.1383,
"step": 262
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.28491881489753723,
"learning_rate": 5.521687748032805e-07,
"loss": 1.1827,
"step": 263
},
{
"epoch": 1.6883116883116882,
"grad_norm": 0.2625648081302643,
"learning_rate": 5.283375408872538e-07,
"loss": 1.1496,
"step": 264
},
{
"epoch": 1.6948051948051948,
"grad_norm": 0.29310908913612366,
"learning_rate": 5.05003279300637e-07,
"loss": 1.1256,
"step": 265
},
{
"epoch": 1.7012987012987013,
"grad_norm": 0.28792184591293335,
"learning_rate": 4.82168583370285e-07,
"loss": 1.1269,
"step": 266
},
{
"epoch": 1.7077922077922079,
"grad_norm": 0.3059506118297577,
"learning_rate": 4.598359909022443e-07,
"loss": 1.1275,
"step": 267
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.2944253087043762,
"learning_rate": 4.380079838997087e-07,
"loss": 1.0612,
"step": 268
},
{
"epoch": 1.7207792207792207,
"grad_norm": 0.3492392599582672,
"learning_rate": 4.1668698828716994e-07,
"loss": 1.1408,
"step": 269
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.2833608090877533,
"learning_rate": 3.958753736408105e-07,
"loss": 1.1345,
"step": 270
},
{
"epoch": 1.7337662337662336,
"grad_norm": 0.30626702308654785,
"learning_rate": 3.7557545292514987e-07,
"loss": 1.1533,
"step": 271
},
{
"epoch": 1.7402597402597402,
"grad_norm": 0.2807121276855469,
"learning_rate": 3.557894822359864e-07,
"loss": 1.1166,
"step": 272
},
{
"epoch": 1.7467532467532467,
"grad_norm": 0.31310421228408813,
"learning_rate": 3.3651966054965734e-07,
"loss": 1.1238,
"step": 273
},
{
"epoch": 1.7467532467532467,
"eval_loss": 1.153025507926941,
"eval_runtime": 25.365,
"eval_samples_per_second": 11.63,
"eval_steps_per_second": 1.459,
"step": 273
},
{
"epoch": 1.7532467532467533,
"grad_norm": 0.29958927631378174,
"learning_rate": 3.177681294786539e-07,
"loss": 1.1579,
"step": 274
},
{
"epoch": 1.7597402597402598,
"grad_norm": 0.3180530071258545,
"learning_rate": 2.995369730336012e-07,
"loss": 1.0656,
"step": 275
},
{
"epoch": 1.7662337662337664,
"grad_norm": 0.30499908328056335,
"learning_rate": 2.8182821739164534e-07,
"loss": 1.0744,
"step": 276
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.2803168296813965,
"learning_rate": 2.6464383067127175e-07,
"loss": 1.0652,
"step": 277
},
{
"epoch": 1.7792207792207793,
"grad_norm": 0.2866427004337311,
"learning_rate": 2.479857227135685e-07,
"loss": 1.1175,
"step": 278
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.26658734679222107,
"learning_rate": 2.3185574486997264e-07,
"loss": 1.1494,
"step": 279
},
{
"epoch": 1.7922077922077921,
"grad_norm": 0.3132326602935791,
"learning_rate": 2.1625568979651012e-07,
"loss": 1.0939,
"step": 280
},
{
"epoch": 1.7987012987012987,
"grad_norm": 0.29077062010765076,
"learning_rate": 2.0118729125457036e-07,
"loss": 1.1686,
"step": 281
},
{
"epoch": 1.8051948051948052,
"grad_norm": 0.2711000144481659,
"learning_rate": 1.866522239182117e-07,
"loss": 1.1246,
"step": 282
},
{
"epoch": 1.8116883116883118,
"grad_norm": 0.28556856513023376,
"learning_rate": 1.7265210318804683e-07,
"loss": 1.0459,
"step": 283
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.30168288946151733,
"learning_rate": 1.5918848501170647e-07,
"loss": 1.1087,
"step": 284
},
{
"epoch": 1.8246753246753247,
"grad_norm": 0.30205830931663513,
"learning_rate": 1.4626286571091664e-07,
"loss": 1.1333,
"step": 285
},
{
"epoch": 1.8311688311688312,
"grad_norm": 0.2765739858150482,
"learning_rate": 1.338766818151982e-07,
"loss": 1.0988,
"step": 286
},
{
"epoch": 1.8376623376623376,
"grad_norm": 0.3049326241016388,
"learning_rate": 1.2203130990221635e-07,
"loss": 1.0979,
"step": 287
},
{
"epoch": 1.844155844155844,
"grad_norm": 0.2744823694229126,
"learning_rate": 1.107280664447874e-07,
"loss": 1.1496,
"step": 288
},
{
"epoch": 1.8506493506493507,
"grad_norm": 0.2835923731327057,
"learning_rate": 9.996820766456916e-08,
"loss": 1.0894,
"step": 289
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.31562480330467224,
"learning_rate": 8.975292939244928e-08,
"loss": 1.1204,
"step": 290
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.26750344038009644,
"learning_rate": 8.008336693563823e-08,
"loss": 1.1243,
"step": 291
},
{
"epoch": 1.87012987012987,
"grad_norm": 0.26439955830574036,
"learning_rate": 7.096059495149855e-08,
"loss": 1.1027,
"step": 292
},
{
"epoch": 1.8766233766233766,
"grad_norm": 0.2618618905544281,
"learning_rate": 6.238562732810427e-08,
"loss": 1.0328,
"step": 293
},
{
"epoch": 1.883116883116883,
"grad_norm": 0.3233661949634552,
"learning_rate": 5.435941707156389e-08,
"loss": 1.1159,
"step": 294
},
{
"epoch": 1.8896103896103895,
"grad_norm": 0.28882384300231934,
"learning_rate": 4.6882856200101135e-08,
"loss": 1.1039,
"step": 295
},
{
"epoch": 1.896103896103896,
"grad_norm": 0.3103022277355194,
"learning_rate": 3.99567756449204e-08,
"loss": 1.1315,
"step": 296
},
{
"epoch": 1.9025974025974026,
"grad_norm": 0.2719733715057373,
"learning_rate": 3.358194515785784e-08,
"loss": 1.1128,
"step": 297
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.266874223947525,
"learning_rate": 2.77590732258326e-08,
"loss": 1.1333,
"step": 298
},
{
"epoch": 1.9155844155844157,
"grad_norm": 0.2862412631511688,
"learning_rate": 2.2488806992105317e-08,
"loss": 1.1387,
"step": 299
},
{
"epoch": 1.922077922077922,
"grad_norm": 0.3009204864501953,
"learning_rate": 1.7771732184357905e-08,
"loss": 1.1328,
"step": 300
},
{
"epoch": 1.9285714285714286,
"grad_norm": 0.28494757413864136,
"learning_rate": 1.3608373049596724e-08,
"loss": 1.1189,
"step": 301
},
{
"epoch": 1.935064935064935,
"grad_norm": 0.26805561780929565,
"learning_rate": 9.999192295886973e-09,
"loss": 1.0664,
"step": 302
},
{
"epoch": 1.9415584415584415,
"grad_norm": 0.3299916088581085,
"learning_rate": 6.944591040930481e-09,
"loss": 1.1117,
"step": 303
},
{
"epoch": 1.948051948051948,
"grad_norm": 0.2890622913837433,
"learning_rate": 4.444908767484712e-09,
"loss": 1.1396,
"step": 304
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.28758740425109863,
"learning_rate": 2.500423285632381e-09,
"loss": 1.1277,
"step": 305
},
{
"epoch": 1.9610389610389611,
"grad_norm": 0.2983172535896301,
"learning_rate": 1.111350701909486e-09,
"loss": 1.1452,
"step": 306
},
{
"epoch": 1.9675324675324677,
"grad_norm": 0.30814212560653687,
"learning_rate": 2.7784539528397104e-10,
"loss": 1.0888,
"step": 307
},
{
"epoch": 1.974025974025974,
"grad_norm": 0.2840464115142822,
"learning_rate": 0.0,
"loss": 1.1205,
"step": 308
}
],
"logging_steps": 1,
"max_steps": 308,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 77,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.703755799815258e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}