chansung's picture
Model save
c3c5c3b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990224828934506,
"eval_steps": 500,
"global_step": 511,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019550342130987292,
"grad_norm": 28.436507384573893,
"learning_rate": 5.769230769230769e-06,
"loss": 1.9755,
"step": 1
},
{
"epoch": 0.009775171065493646,
"grad_norm": 16.326267191478145,
"learning_rate": 2.8846153846153845e-05,
"loss": 1.7228,
"step": 5
},
{
"epoch": 0.019550342130987292,
"grad_norm": 7.77688805789614,
"learning_rate": 5.769230769230769e-05,
"loss": 1.1824,
"step": 10
},
{
"epoch": 0.02932551319648094,
"grad_norm": 23.182562040558295,
"learning_rate": 8.653846153846152e-05,
"loss": 1.2132,
"step": 15
},
{
"epoch": 0.039100684261974585,
"grad_norm": 3.8046090746822716,
"learning_rate": 0.00011538461538461538,
"loss": 1.0978,
"step": 20
},
{
"epoch": 0.04887585532746823,
"grad_norm": 6.364935901579011,
"learning_rate": 0.00014423076923076922,
"loss": 1.2143,
"step": 25
},
{
"epoch": 0.05865102639296188,
"grad_norm": 4.553589249592707,
"learning_rate": 0.00017307692307692304,
"loss": 1.1357,
"step": 30
},
{
"epoch": 0.06842619745845552,
"grad_norm": 5.819131618520497,
"learning_rate": 0.00020192307692307691,
"loss": 1.2003,
"step": 35
},
{
"epoch": 0.07820136852394917,
"grad_norm": 8.631833955662083,
"learning_rate": 0.00023076923076923076,
"loss": 1.2049,
"step": 40
},
{
"epoch": 0.08797653958944282,
"grad_norm": 27.119527389294134,
"learning_rate": 0.0002596153846153846,
"loss": 2.0482,
"step": 45
},
{
"epoch": 0.09775171065493646,
"grad_norm": 32.33385091368177,
"learning_rate": 0.00028846153846153843,
"loss": 1.6455,
"step": 50
},
{
"epoch": 0.10752688172043011,
"grad_norm": 45.94902359405063,
"learning_rate": 0.0002999683799255387,
"loss": 1.9763,
"step": 55
},
{
"epoch": 0.11730205278592376,
"grad_norm": 128.59363629595714,
"learning_rate": 0.0002997751944121241,
"loss": 1.5422,
"step": 60
},
{
"epoch": 0.1270772238514174,
"grad_norm": 10.71651832916556,
"learning_rate": 0.0002994066160471166,
"loss": 1.7548,
"step": 65
},
{
"epoch": 0.13685239491691104,
"grad_norm": 4.189410908699583,
"learning_rate": 0.0002988630764507904,
"loss": 1.3404,
"step": 70
},
{
"epoch": 0.1466275659824047,
"grad_norm": 33.78267011030123,
"learning_rate": 0.00029814521213014585,
"loss": 1.4341,
"step": 75
},
{
"epoch": 0.15640273704789834,
"grad_norm": 4.447241036402298,
"learning_rate": 0.00029725386373353455,
"loss": 1.4355,
"step": 80
},
{
"epoch": 0.16617790811339198,
"grad_norm": 19.324376604180756,
"learning_rate": 0.00029619007506622504,
"loss": 1.4037,
"step": 85
},
{
"epoch": 0.17595307917888564,
"grad_norm": 56.398325865658165,
"learning_rate": 0.00029495509186806487,
"loss": 2.1883,
"step": 90
},
{
"epoch": 0.18572825024437928,
"grad_norm": 117.39758059322921,
"learning_rate": 0.0002935503603546683,
"loss": 2.0507,
"step": 95
},
{
"epoch": 0.19550342130987292,
"grad_norm": 7.6884925370685995,
"learning_rate": 0.00029197752552383914,
"loss": 1.5932,
"step": 100
},
{
"epoch": 0.20527859237536658,
"grad_norm": 963.0390862127407,
"learning_rate": 0.000290238429229211,
"loss": 5.223,
"step": 105
},
{
"epoch": 0.21505376344086022,
"grad_norm": 258.2156128276988,
"learning_rate": 0.00028833510802336203,
"loss": 4.5213,
"step": 110
},
{
"epoch": 0.22482893450635386,
"grad_norm": 264.762100287562,
"learning_rate": 0.0002862697907729285,
"loss": 2.0849,
"step": 115
},
{
"epoch": 0.23460410557184752,
"grad_norm": 18.694604887076736,
"learning_rate": 0.0002840448960485118,
"loss": 1.8192,
"step": 120
},
{
"epoch": 0.24437927663734116,
"grad_norm": 67.0812503804155,
"learning_rate": 0.00028166302929243326,
"loss": 1.3915,
"step": 125
},
{
"epoch": 0.2541544477028348,
"grad_norm": 21.92554603766507,
"learning_rate": 0.0002791269797676551,
"loss": 1.5317,
"step": 130
},
{
"epoch": 0.26392961876832843,
"grad_norm": 7.078011892011075,
"learning_rate": 0.00027643971729144056,
"loss": 1.4673,
"step": 135
},
{
"epoch": 0.27370478983382207,
"grad_norm": 9.631691414307214,
"learning_rate": 0.0002736043887575761,
"loss": 1.3131,
"step": 140
},
{
"epoch": 0.28347996089931576,
"grad_norm": 23.096760351073254,
"learning_rate": 0.00027062431445123124,
"loss": 1.572,
"step": 145
},
{
"epoch": 0.2932551319648094,
"grad_norm": 3.17350344043114,
"learning_rate": 0.0002675029841607691,
"loss": 1.3668,
"step": 150
},
{
"epoch": 0.30303030303030304,
"grad_norm": 6.678371795997961,
"learning_rate": 0.00026424405309106216,
"loss": 1.3082,
"step": 155
},
{
"epoch": 0.3128054740957967,
"grad_norm": 4.718771263163467,
"learning_rate": 0.00026085133758309883,
"loss": 1.3286,
"step": 160
},
{
"epoch": 0.3225806451612903,
"grad_norm": 8.790382644648172,
"learning_rate": 0.00025732881064489233,
"loss": 1.3241,
"step": 165
},
{
"epoch": 0.33235581622678395,
"grad_norm": 4.416208770248871,
"learning_rate": 0.0002536805972989267,
"loss": 1.3357,
"step": 170
},
{
"epoch": 0.3421309872922776,
"grad_norm": 3.1002678359027285,
"learning_rate": 0.0002499109697515875,
"loss": 1.4037,
"step": 175
},
{
"epoch": 0.3519061583577713,
"grad_norm": 7.124469693986668,
"learning_rate": 0.0002460243423902342,
"loss": 1.625,
"step": 180
},
{
"epoch": 0.3616813294232649,
"grad_norm": 51.006744297270856,
"learning_rate": 0.00024202526661377277,
"loss": 1.6499,
"step": 185
},
{
"epoch": 0.37145650048875856,
"grad_norm": 14.662023479458805,
"learning_rate": 0.00023791842550278217,
"loss": 1.8342,
"step": 190
},
{
"epoch": 0.3812316715542522,
"grad_norm": 4.973242239066626,
"learning_rate": 0.00023370862833543648,
"loss": 1.6823,
"step": 195
},
{
"epoch": 0.39100684261974583,
"grad_norm": 95.12537291599145,
"learning_rate": 0.0002294008049556441,
"loss": 1.5268,
"step": 200
},
{
"epoch": 0.40078201368523947,
"grad_norm": 3.394429345078054,
"learning_rate": 0.000225,
"loss": 1.45,
"step": 205
},
{
"epoch": 0.41055718475073316,
"grad_norm": 3.5902349674560456,
"learning_rate": 0.00022051136699031057,
"loss": 1.2502,
"step": 210
},
{
"epoch": 0.4203323558162268,
"grad_norm": 260.2217345574012,
"learning_rate": 0.00021594016229861007,
"loss": 1.4486,
"step": 215
},
{
"epoch": 0.43010752688172044,
"grad_norm": 4.8647475236367725,
"learning_rate": 0.0002112917389917347,
"loss": 1.486,
"step": 220
},
{
"epoch": 0.4398826979472141,
"grad_norm": 1.9024756731821182,
"learning_rate": 0.0002065715405626634,
"loss": 1.2628,
"step": 225
},
{
"epoch": 0.4496578690127077,
"grad_norm": 10.37907922884646,
"learning_rate": 0.00020178509455596596,
"loss": 1.2518,
"step": 230
},
{
"epoch": 0.45943304007820135,
"grad_norm": 1.9405506982628546,
"learning_rate": 0.00019693800609482315,
"loss": 1.2849,
"step": 235
},
{
"epoch": 0.46920821114369504,
"grad_norm": 2.6160283264932462,
"learning_rate": 0.00019203595131719932,
"loss": 1.2548,
"step": 240
},
{
"epoch": 0.4789833822091887,
"grad_norm": 2.1695347772705373,
"learning_rate": 0.00018708467072885382,
"loss": 1.3377,
"step": 245
},
{
"epoch": 0.4887585532746823,
"grad_norm": 2.2144620011763374,
"learning_rate": 0.00018208996248097458,
"loss": 1.3093,
"step": 250
},
{
"epoch": 0.49853372434017595,
"grad_norm": 2.1880901805448403,
"learning_rate": 0.00017705767558030754,
"loss": 1.245,
"step": 255
},
{
"epoch": 0.5083088954056696,
"grad_norm": 2.991429961153096,
"learning_rate": 0.0001719937030397311,
"loss": 1.2559,
"step": 260
},
{
"epoch": 0.5180840664711632,
"grad_norm": 23.064953881729117,
"learning_rate": 0.00016690397497729818,
"loss": 1.288,
"step": 265
},
{
"epoch": 0.5278592375366569,
"grad_norm": 1.3455036549144204,
"learning_rate": 0.00016179445167182677,
"loss": 1.2717,
"step": 270
},
{
"epoch": 0.5376344086021505,
"grad_norm": 1.1846238387921606,
"learning_rate": 0.00015667111658317054,
"loss": 1.2394,
"step": 275
},
{
"epoch": 0.5474095796676441,
"grad_norm": 137.50996798714343,
"learning_rate": 0.00015153996934534348,
"loss": 1.3296,
"step": 280
},
{
"epoch": 0.5571847507331378,
"grad_norm": 1.1488279921928382,
"learning_rate": 0.00014640701874070455,
"loss": 1.2874,
"step": 285
},
{
"epoch": 0.5669599217986315,
"grad_norm": 1.2689991198054311,
"learning_rate": 0.00014127827566342863,
"loss": 1.2561,
"step": 290
},
{
"epoch": 0.5767350928641252,
"grad_norm": 2.140511295392104,
"learning_rate": 0.0001361597460805047,
"loss": 1.2205,
"step": 295
},
{
"epoch": 0.5865102639296188,
"grad_norm": 12.286388401977892,
"learning_rate": 0.000131057423998504,
"loss": 1.252,
"step": 300
},
{
"epoch": 0.5962854349951124,
"grad_norm": 2.5541457654289395,
"learning_rate": 0.00012597728444435418,
"loss": 1.215,
"step": 305
},
{
"epoch": 0.6060606060606061,
"grad_norm": 1.1732833800621696,
"learning_rate": 0.00012092527646833949,
"loss": 1.2053,
"step": 310
},
{
"epoch": 0.6158357771260997,
"grad_norm": 1.4481798374657,
"learning_rate": 0.00011590731617752066,
"loss": 1.2061,
"step": 315
},
{
"epoch": 0.6256109481915934,
"grad_norm": 0.9912604590459435,
"learning_rate": 0.00011092927980773267,
"loss": 1.1604,
"step": 320
},
{
"epoch": 0.635386119257087,
"grad_norm": 1.0322599469502478,
"learning_rate": 0.00010599699684227311,
"loss": 1.1369,
"step": 325
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.5437893851108073,
"learning_rate": 0.00010111624318534006,
"loss": 1.1721,
"step": 330
},
{
"epoch": 0.6549364613880743,
"grad_norm": 1.383693940282455,
"learning_rate": 9.629273439821313e-05,
"loss": 1.1094,
"step": 335
},
{
"epoch": 0.6647116324535679,
"grad_norm": 2.598856235735631,
"learning_rate": 9.15321190060981e-05,
"loss": 1.1251,
"step": 340
},
{
"epoch": 0.6744868035190615,
"grad_norm": 1.0440965009342513,
"learning_rate": 8.683997188347435e-05,
"loss": 1.0953,
"step": 345
},
{
"epoch": 0.6842619745845552,
"grad_norm": 1.2172823916521676,
"learning_rate": 8.222178772568959e-05,
"loss": 1.0839,
"step": 350
},
{
"epoch": 0.6940371456500489,
"grad_norm": 0.9291103607651107,
"learning_rate": 7.768297461444765e-05,
"loss": 1.0786,
"step": 355
},
{
"epoch": 0.7038123167155426,
"grad_norm": 36.656353739168324,
"learning_rate": 7.32288476847252e-05,
"loss": 1.1001,
"step": 360
},
{
"epoch": 0.7135874877810362,
"grad_norm": 0.7923067460462172,
"learning_rate": 6.886462290053158e-05,
"loss": 1.0793,
"step": 365
},
{
"epoch": 0.7233626588465298,
"grad_norm": 0.8419278909431203,
"learning_rate": 6.4595410946803e-05,
"loss": 1.0869,
"step": 370
},
{
"epoch": 0.7331378299120235,
"grad_norm": 1.0339571657214093,
"learning_rate": 6.04262112445821e-05,
"loss": 1.0128,
"step": 375
},
{
"epoch": 0.7429130009775171,
"grad_norm": 0.7333799573780848,
"learning_rate": 5.636190609649249e-05,
"loss": 1.0101,
"step": 380
},
{
"epoch": 0.7526881720430108,
"grad_norm": 0.8595033406539786,
"learning_rate": 5.240725496936372e-05,
"loss": 1.0224,
"step": 385
},
{
"epoch": 0.7624633431085044,
"grad_norm": 0.7112388580251547,
"learning_rate": 4.8566888920701196e-05,
"loss": 1.0016,
"step": 390
},
{
"epoch": 0.772238514173998,
"grad_norm": 0.8167634152987004,
"learning_rate": 4.48453051755301e-05,
"loss": 0.9793,
"step": 395
},
{
"epoch": 0.7820136852394917,
"grad_norm": 0.6682157986400304,
"learning_rate": 4.12468618599611e-05,
"loss": 1.0015,
"step": 400
},
{
"epoch": 0.7917888563049853,
"grad_norm": 0.7797058649722416,
"learning_rate": 3.777577289764752e-05,
"loss": 0.9784,
"step": 405
},
{
"epoch": 0.8015640273704789,
"grad_norm": 0.6768885831261553,
"learning_rate": 3.443610307510907e-05,
"loss": 0.9605,
"step": 410
},
{
"epoch": 0.8113391984359726,
"grad_norm": 0.6416162504789994,
"learning_rate": 3.1231763281701305e-05,
"loss": 0.971,
"step": 415
},
{
"epoch": 0.8211143695014663,
"grad_norm": 0.6784950124595185,
"learning_rate": 2.816650592980495e-05,
"loss": 0.9553,
"step": 420
},
{
"epoch": 0.83088954056696,
"grad_norm": 0.689853020596839,
"learning_rate": 2.5243920560598184e-05,
"loss": 0.9351,
"step": 425
},
{
"epoch": 0.8406647116324536,
"grad_norm": 0.6460996639595076,
"learning_rate": 2.24674296405579e-05,
"loss": 0.9313,
"step": 430
},
{
"epoch": 0.8504398826979472,
"grad_norm": 0.6636695831614443,
"learning_rate": 1.98402845536117e-05,
"loss": 0.9266,
"step": 435
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.6119153060968766,
"learning_rate": 1.736556179363543e-05,
"loss": 0.9134,
"step": 440
},
{
"epoch": 0.8699902248289345,
"grad_norm": 0.5955119176542452,
"learning_rate": 1.5046159361753224e-05,
"loss": 0.9198,
"step": 445
},
{
"epoch": 0.8797653958944281,
"grad_norm": 0.6324852203366718,
"learning_rate": 1.2884793372660207e-05,
"loss": 0.9051,
"step": 450
},
{
"epoch": 0.8895405669599218,
"grad_norm": 0.6803072103352026,
"learning_rate": 1.0883994873941815e-05,
"loss": 0.8923,
"step": 455
},
{
"epoch": 0.8993157380254154,
"grad_norm": 2.125025596226289,
"learning_rate": 9.046106882113751e-06,
"loss": 0.9315,
"step": 460
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.5812394213670566,
"learning_rate": 7.373281638854328e-06,
"loss": 0.9131,
"step": 465
},
{
"epoch": 0.9188660801564027,
"grad_norm": 0.6819326019234143,
"learning_rate": 5.867478090641892e-06,
"loss": 0.9521,
"step": 470
},
{
"epoch": 0.9286412512218963,
"grad_norm": 0.6009777818830081,
"learning_rate": 4.530459594748592e-06,
"loss": 0.8585,
"step": 475
},
{
"epoch": 0.9384164222873901,
"grad_norm": 0.5996940109383458,
"learning_rate": 3.363791854277348e-06,
"loss": 0.8938,
"step": 480
},
{
"epoch": 0.9481915933528837,
"grad_norm": 0.6175664551394122,
"learning_rate": 2.3688410846596282e-06,
"loss": 0.8891,
"step": 485
},
{
"epoch": 0.9579667644183774,
"grad_norm": 2.0911762607681768,
"learning_rate": 1.5467724137617043e-06,
"loss": 0.924,
"step": 490
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.6696352577624144,
"learning_rate": 8.985485174722973e-07,
"loss": 0.9077,
"step": 495
},
{
"epoch": 0.9775171065493646,
"grad_norm": 0.5766498486118373,
"learning_rate": 4.249284923700358e-07,
"loss": 0.9012,
"step": 500
},
{
"epoch": 0.9872922776148583,
"grad_norm": 0.6254945643083392,
"learning_rate": 1.2646696679042833e-07,
"loss": 0.9035,
"step": 505
},
{
"epoch": 0.9970674486803519,
"grad_norm": 0.6202670688713676,
"learning_rate": 3.5134513334200697e-09,
"loss": 0.9303,
"step": 510
},
{
"epoch": 0.9990224828934506,
"eval_loss": 3.596351385116577,
"eval_runtime": 2.2495,
"eval_samples_per_second": 2.667,
"eval_steps_per_second": 0.445,
"step": 511
},
{
"epoch": 0.9990224828934506,
"step": 511,
"total_flos": 26722078556160.0,
"train_loss": 1.3314139091805235,
"train_runtime": 8881.3456,
"train_samples_per_second": 1.842,
"train_steps_per_second": 0.058
}
],
"logging_steps": 5,
"max_steps": 511,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 26722078556160.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}