{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7582771755054205, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011737089201877935, "grad_norm": 0.27773135900497437, "learning_rate": 4e-05, "loss": 1.1957, "step": 1 }, { "epoch": 0.002347417840375587, "grad_norm": 0.26547771692276, "learning_rate": 8e-05, "loss": 1.1284, "step": 2 }, { "epoch": 0.0035211267605633804, "grad_norm": 0.236787810921669, "learning_rate": 0.00012, "loss": 1.1823, "step": 3 }, { "epoch": 0.004694835680751174, "grad_norm": 0.2459038347005844, "learning_rate": 0.00016, "loss": 1.1409, "step": 4 }, { "epoch": 0.005868544600938967, "grad_norm": 0.2526487410068512, "learning_rate": 0.0002, "loss": 1.12, "step": 5 }, { "epoch": 0.007042253521126761, "grad_norm": 0.2795103192329407, "learning_rate": 0.00019976387249114524, "loss": 1.1579, "step": 6 }, { "epoch": 0.008215962441314555, "grad_norm": 0.2876183092594147, "learning_rate": 0.00019952774498229045, "loss": 1.1211, "step": 7 }, { "epoch": 0.009389671361502348, "grad_norm": 0.3014296293258667, "learning_rate": 0.00019929161747343565, "loss": 1.1118, "step": 8 }, { "epoch": 0.01056338028169014, "grad_norm": 0.29106494784355164, "learning_rate": 0.00019905548996458088, "loss": 1.1787, "step": 9 }, { "epoch": 0.011737089201877934, "grad_norm": 0.3211474120616913, "learning_rate": 0.00019881936245572609, "loss": 1.1004, "step": 10 }, { "epoch": 0.012910798122065728, "grad_norm": 0.3358176350593567, "learning_rate": 0.00019858323494687132, "loss": 1.1099, "step": 11 }, { "epoch": 0.014084507042253521, "grad_norm": 0.3236922323703766, "learning_rate": 0.00019834710743801655, "loss": 1.048, "step": 12 }, { "epoch": 0.015258215962441314, "grad_norm": 0.31388312578201294, "learning_rate": 0.00019811097992916175, "loss": 1.0532, "step": 13 }, { "epoch": 0.01643192488262911, "grad_norm": 0.320402055978775, 
"learning_rate": 0.00019787485242030696, "loss": 1.0757, "step": 14 }, { "epoch": 0.017605633802816902, "grad_norm": 0.32999494671821594, "learning_rate": 0.0001976387249114522, "loss": 1.122, "step": 15 }, { "epoch": 0.018779342723004695, "grad_norm": 0.30936214327812195, "learning_rate": 0.00019740259740259742, "loss": 1.1156, "step": 16 }, { "epoch": 0.01995305164319249, "grad_norm": 0.2863931953907013, "learning_rate": 0.00019716646989374263, "loss": 1.0414, "step": 17 }, { "epoch": 0.02112676056338028, "grad_norm": 0.29143351316452026, "learning_rate": 0.00019693034238488786, "loss": 1.0379, "step": 18 }, { "epoch": 0.022300469483568074, "grad_norm": 0.28874626755714417, "learning_rate": 0.0001966942148760331, "loss": 1.0388, "step": 19 }, { "epoch": 0.023474178403755867, "grad_norm": 0.30588293075561523, "learning_rate": 0.00019645808736717827, "loss": 1.0515, "step": 20 }, { "epoch": 0.02464788732394366, "grad_norm": 0.29231536388397217, "learning_rate": 0.0001962219598583235, "loss": 1.0472, "step": 21 }, { "epoch": 0.025821596244131457, "grad_norm": 0.2783581614494324, "learning_rate": 0.00019598583234946873, "loss": 1.0608, "step": 22 }, { "epoch": 0.02699530516431925, "grad_norm": 0.29816293716430664, "learning_rate": 0.00019574970484061393, "loss": 1.0986, "step": 23 }, { "epoch": 0.028169014084507043, "grad_norm": 0.27919578552246094, "learning_rate": 0.00019551357733175916, "loss": 1.0265, "step": 24 }, { "epoch": 0.029342723004694836, "grad_norm": 0.3144524097442627, "learning_rate": 0.00019527744982290437, "loss": 1.0699, "step": 25 }, { "epoch": 0.03051643192488263, "grad_norm": 0.3090282678604126, "learning_rate": 0.0001950413223140496, "loss": 1.0601, "step": 26 }, { "epoch": 0.03169014084507042, "grad_norm": 0.30304697155952454, "learning_rate": 0.0001948051948051948, "loss": 1.0926, "step": 27 }, { "epoch": 0.03286384976525822, "grad_norm": 0.29015883803367615, "learning_rate": 0.00019456906729634004, "loss": 1.0, "step": 28 }, { "epoch": 
0.03403755868544601, "grad_norm": 0.29359501600265503, "learning_rate": 0.00019433293978748527, "loss": 0.988, "step": 29 }, { "epoch": 0.035211267605633804, "grad_norm": 0.2772333323955536, "learning_rate": 0.00019409681227863047, "loss": 0.9758, "step": 30 }, { "epoch": 0.036384976525821594, "grad_norm": 0.2761421799659729, "learning_rate": 0.00019386068476977568, "loss": 0.9926, "step": 31 }, { "epoch": 0.03755868544600939, "grad_norm": null, "learning_rate": 0.00019386068476977568, "loss": 1.0944, "step": 32 }, { "epoch": 0.03873239436619718, "grad_norm": 0.2766799330711365, "learning_rate": 0.0001936245572609209, "loss": 0.9813, "step": 33 }, { "epoch": 0.03990610328638498, "grad_norm": 0.28922533988952637, "learning_rate": 0.0001933884297520661, "loss": 0.9839, "step": 34 }, { "epoch": 0.04107981220657277, "grad_norm": 0.28271371126174927, "learning_rate": 0.00019315230224321134, "loss": 1.0125, "step": 35 }, { "epoch": 0.04225352112676056, "grad_norm": 0.2955509424209595, "learning_rate": 0.00019291617473435658, "loss": 1.0049, "step": 36 }, { "epoch": 0.04342723004694836, "grad_norm": 0.2909109592437744, "learning_rate": 0.00019268004722550178, "loss": 1.0015, "step": 37 }, { "epoch": 0.04460093896713615, "grad_norm": 0.29657021164894104, "learning_rate": 0.00019244391971664698, "loss": 1.0107, "step": 38 }, { "epoch": 0.045774647887323945, "grad_norm": 0.29010507464408875, "learning_rate": 0.00019220779220779222, "loss": 0.9918, "step": 39 }, { "epoch": 0.046948356807511735, "grad_norm": 0.2906627058982849, "learning_rate": 0.00019197166469893745, "loss": 0.9843, "step": 40 }, { "epoch": 0.04812206572769953, "grad_norm": 0.2919193208217621, "learning_rate": 0.00019173553719008265, "loss": 0.9889, "step": 41 }, { "epoch": 0.04929577464788732, "grad_norm": 0.3219091296195984, "learning_rate": 0.00019149940968122788, "loss": 0.9979, "step": 42 }, { "epoch": 0.05046948356807512, "grad_norm": 0.29512301087379456, "learning_rate": 0.0001912632821723731, "loss": 
0.989, "step": 43 }, { "epoch": 0.051643192488262914, "grad_norm": 0.3190619647502899, "learning_rate": 0.0001910271546635183, "loss": 0.9563, "step": 44 }, { "epoch": 0.0528169014084507, "grad_norm": 0.310253381729126, "learning_rate": 0.00019079102715466352, "loss": 1.037, "step": 45 }, { "epoch": 0.0539906103286385, "grad_norm": 0.3140093684196472, "learning_rate": 0.00019055489964580876, "loss": 0.9687, "step": 46 }, { "epoch": 0.05516431924882629, "grad_norm": 0.2816644310951233, "learning_rate": 0.00019031877213695396, "loss": 0.9372, "step": 47 }, { "epoch": 0.056338028169014086, "grad_norm": 0.3012441396713257, "learning_rate": 0.0001900826446280992, "loss": 0.9968, "step": 48 }, { "epoch": 0.057511737089201875, "grad_norm": 0.29789185523986816, "learning_rate": 0.0001898465171192444, "loss": 0.9143, "step": 49 }, { "epoch": 0.05868544600938967, "grad_norm": 0.29454007744789124, "learning_rate": 0.00018961038961038963, "loss": 0.9837, "step": 50 }, { "epoch": 0.05985915492957746, "grad_norm": 0.321218341588974, "learning_rate": 0.00018937426210153483, "loss": 1.0135, "step": 51 }, { "epoch": 0.06103286384976526, "grad_norm": 0.30039164423942566, "learning_rate": 0.00018913813459268006, "loss": 0.9639, "step": 52 }, { "epoch": 0.062206572769953054, "grad_norm": 0.3052615225315094, "learning_rate": 0.0001889020070838253, "loss": 0.9401, "step": 53 }, { "epoch": 0.06338028169014084, "grad_norm": 0.3177138864994049, "learning_rate": 0.00018866587957497047, "loss": 0.9626, "step": 54 }, { "epoch": 0.06455399061032864, "grad_norm": 0.3098903298377991, "learning_rate": 0.0001884297520661157, "loss": 0.9535, "step": 55 }, { "epoch": 0.06572769953051644, "grad_norm": 0.33165299892425537, "learning_rate": 0.00018819362455726094, "loss": 1.0475, "step": 56 }, { "epoch": 0.06690140845070422, "grad_norm": 0.3054540455341339, "learning_rate": 0.00018795749704840614, "loss": 0.988, "step": 57 }, { "epoch": 0.06807511737089202, "grad_norm": 0.3412969708442688, 
"learning_rate": 0.00018772136953955137, "loss": 0.9531, "step": 58 }, { "epoch": 0.06924882629107981, "grad_norm": 0.3173505663871765, "learning_rate": 0.0001874852420306966, "loss": 1.0037, "step": 59 }, { "epoch": 0.07042253521126761, "grad_norm": 0.29377281665802, "learning_rate": 0.0001872491145218418, "loss": 0.9205, "step": 60 }, { "epoch": 0.0715962441314554, "grad_norm": 0.2970433831214905, "learning_rate": 0.000187012987012987, "loss": 0.8902, "step": 61 }, { "epoch": 0.07276995305164319, "grad_norm": 0.3081493675708771, "learning_rate": 0.00018677685950413224, "loss": 0.9498, "step": 62 }, { "epoch": 0.07394366197183098, "grad_norm": 0.31438371539115906, "learning_rate": 0.00018654073199527747, "loss": 0.9406, "step": 63 }, { "epoch": 0.07511737089201878, "grad_norm": 0.29640915989875793, "learning_rate": 0.00018630460448642268, "loss": 0.8948, "step": 64 }, { "epoch": 0.07629107981220658, "grad_norm": 0.33342233300209045, "learning_rate": 0.00018606847697756788, "loss": 0.941, "step": 65 }, { "epoch": 0.07746478873239436, "grad_norm": 0.31546634435653687, "learning_rate": 0.00018583234946871312, "loss": 0.9392, "step": 66 }, { "epoch": 0.07863849765258216, "grad_norm": 0.31528937816619873, "learning_rate": 0.00018559622195985832, "loss": 0.9293, "step": 67 }, { "epoch": 0.07981220657276995, "grad_norm": 0.33473101258277893, "learning_rate": 0.00018536009445100355, "loss": 0.9214, "step": 68 }, { "epoch": 0.08098591549295775, "grad_norm": 0.6588060259819031, "learning_rate": 0.00018512396694214878, "loss": 0.944, "step": 69 }, { "epoch": 0.08215962441314555, "grad_norm": 0.30120280385017395, "learning_rate": 0.000184887839433294, "loss": 0.9171, "step": 70 }, { "epoch": 0.08333333333333333, "grad_norm": 0.3417011499404907, "learning_rate": 0.0001846517119244392, "loss": 0.9382, "step": 71 }, { "epoch": 0.08450704225352113, "grad_norm": 0.3202987313270569, "learning_rate": 0.00018441558441558442, "loss": 0.931, "step": 72 }, { "epoch": 
0.08568075117370892, "grad_norm": 0.3390517234802246, "learning_rate": 0.00018417945690672965, "loss": 0.9218, "step": 73 }, { "epoch": 0.08685446009389672, "grad_norm": 0.32109472155570984, "learning_rate": 0.00018394332939787486, "loss": 0.9226, "step": 74 }, { "epoch": 0.0880281690140845, "grad_norm": 0.3435365855693817, "learning_rate": 0.0001837072018890201, "loss": 0.9402, "step": 75 }, { "epoch": 0.0892018779342723, "grad_norm": 0.3335697054862976, "learning_rate": 0.00018347107438016532, "loss": 0.9385, "step": 76 }, { "epoch": 0.0903755868544601, "grad_norm": 0.32050758600234985, "learning_rate": 0.0001832349468713105, "loss": 0.8992, "step": 77 }, { "epoch": 0.09154929577464789, "grad_norm": 0.32620421051979065, "learning_rate": 0.00018299881936245573, "loss": 0.9476, "step": 78 }, { "epoch": 0.09272300469483569, "grad_norm": 0.33306750655174255, "learning_rate": 0.00018276269185360096, "loss": 0.9458, "step": 79 }, { "epoch": 0.09389671361502347, "grad_norm": 0.3500649034976959, "learning_rate": 0.00018252656434474617, "loss": 0.9612, "step": 80 }, { "epoch": 0.09507042253521127, "grad_norm": 0.3186359405517578, "learning_rate": 0.0001822904368358914, "loss": 0.9527, "step": 81 }, { "epoch": 0.09624413145539906, "grad_norm": 0.3317716717720032, "learning_rate": 0.0001820543093270366, "loss": 0.9648, "step": 82 }, { "epoch": 0.09741784037558686, "grad_norm": 0.3196907639503479, "learning_rate": 0.00018181818181818183, "loss": 0.9643, "step": 83 }, { "epoch": 0.09859154929577464, "grad_norm": 0.3195818066596985, "learning_rate": 0.00018158205430932704, "loss": 0.9121, "step": 84 }, { "epoch": 0.09976525821596244, "grad_norm": 0.33151793479919434, "learning_rate": 0.00018134592680047227, "loss": 0.9051, "step": 85 }, { "epoch": 0.10093896713615023, "grad_norm": 0.3110804259777069, "learning_rate": 0.00018110979929161747, "loss": 0.9241, "step": 86 }, { "epoch": 0.10211267605633803, "grad_norm": 0.34278568625450134, "learning_rate": 0.0001808736717827627, 
"loss": 0.9634, "step": 87 }, { "epoch": 0.10328638497652583, "grad_norm": 0.34013500809669495, "learning_rate": 0.0001806375442739079, "loss": 0.8822, "step": 88 }, { "epoch": 0.10446009389671361, "grad_norm": 0.3449755012989044, "learning_rate": 0.00018040141676505314, "loss": 0.969, "step": 89 }, { "epoch": 0.1056338028169014, "grad_norm": 0.3166862726211548, "learning_rate": 0.00018016528925619835, "loss": 0.885, "step": 90 }, { "epoch": 0.1068075117370892, "grad_norm": 0.3260084092617035, "learning_rate": 0.00017992916174734358, "loss": 0.8908, "step": 91 }, { "epoch": 0.107981220657277, "grad_norm": 0.32791605591773987, "learning_rate": 0.0001796930342384888, "loss": 0.8822, "step": 92 }, { "epoch": 0.10915492957746478, "grad_norm": 0.31909653544425964, "learning_rate": 0.000179456906729634, "loss": 0.8463, "step": 93 }, { "epoch": 0.11032863849765258, "grad_norm": 0.3413308262825012, "learning_rate": 0.00017922077922077922, "loss": 0.9232, "step": 94 }, { "epoch": 0.11150234741784038, "grad_norm": 0.32644134759902954, "learning_rate": 0.00017898465171192445, "loss": 0.9113, "step": 95 }, { "epoch": 0.11267605633802817, "grad_norm": 0.33090126514434814, "learning_rate": 0.00017874852420306965, "loss": 0.9286, "step": 96 }, { "epoch": 0.11384976525821597, "grad_norm": 0.37200361490249634, "learning_rate": 0.00017851239669421489, "loss": 0.9239, "step": 97 }, { "epoch": 0.11502347417840375, "grad_norm": 0.3274000585079193, "learning_rate": 0.00017827626918536012, "loss": 0.9038, "step": 98 }, { "epoch": 0.11619718309859155, "grad_norm": 0.3768482506275177, "learning_rate": 0.00017804014167650532, "loss": 0.8558, "step": 99 }, { "epoch": 0.11737089201877934, "grad_norm": 0.32970595359802246, "learning_rate": 0.00017780401416765053, "loss": 0.9057, "step": 100 }, { "epoch": 0.11854460093896714, "grad_norm": 0.37230944633483887, "learning_rate": 0.00017756788665879576, "loss": 0.9211, "step": 101 }, { "epoch": 0.11971830985915492, "grad_norm": 0.352201372385025, 
"learning_rate": 0.000177331759149941, "loss": 0.9497, "step": 102 }, { "epoch": 0.12089201877934272, "grad_norm": 0.363364577293396, "learning_rate": 0.0001770956316410862, "loss": 0.9535, "step": 103 }, { "epoch": 0.12206572769953052, "grad_norm": 0.3388724625110626, "learning_rate": 0.00017685950413223143, "loss": 0.8908, "step": 104 }, { "epoch": 0.12323943661971831, "grad_norm": 0.34684258699417114, "learning_rate": 0.00017662337662337663, "loss": 0.8981, "step": 105 }, { "epoch": 0.12441314553990611, "grad_norm": 0.31892621517181396, "learning_rate": 0.00017638724911452183, "loss": 0.8461, "step": 106 }, { "epoch": 0.1255868544600939, "grad_norm": 0.32913845777511597, "learning_rate": 0.00017615112160566707, "loss": 0.9087, "step": 107 }, { "epoch": 0.1267605633802817, "grad_norm": 0.3695410490036011, "learning_rate": 0.0001759149940968123, "loss": 0.8899, "step": 108 }, { "epoch": 0.12793427230046947, "grad_norm": 0.3455798923969269, "learning_rate": 0.0001756788665879575, "loss": 0.9045, "step": 109 }, { "epoch": 0.12910798122065728, "grad_norm": 0.3612275719642639, "learning_rate": 0.0001754427390791027, "loss": 0.8861, "step": 110 }, { "epoch": 0.13028169014084506, "grad_norm": 0.4106651544570923, "learning_rate": 0.00017520661157024794, "loss": 0.9152, "step": 111 }, { "epoch": 0.13145539906103287, "grad_norm": 0.3604993224143982, "learning_rate": 0.00017497048406139317, "loss": 0.9141, "step": 112 }, { "epoch": 0.13262910798122066, "grad_norm": 0.3496919870376587, "learning_rate": 0.00017473435655253837, "loss": 0.9061, "step": 113 }, { "epoch": 0.13380281690140844, "grad_norm": 0.33643972873687744, "learning_rate": 0.0001744982290436836, "loss": 0.8877, "step": 114 }, { "epoch": 0.13497652582159625, "grad_norm": 0.33064204454421997, "learning_rate": 0.00017426210153482884, "loss": 0.8967, "step": 115 }, { "epoch": 0.13615023474178403, "grad_norm": 0.37868356704711914, "learning_rate": 0.00017402597402597401, "loss": 0.8957, "step": 116 }, { "epoch": 
0.13732394366197184, "grad_norm": 0.34379109740257263, "learning_rate": 0.00017378984651711925, "loss": 0.9332, "step": 117 }, { "epoch": 0.13849765258215962, "grad_norm": 0.37193912267684937, "learning_rate": 0.00017355371900826448, "loss": 0.9513, "step": 118 }, { "epoch": 0.1396713615023474, "grad_norm": 0.33701232075691223, "learning_rate": 0.00017331759149940968, "loss": 0.8946, "step": 119 }, { "epoch": 0.14084507042253522, "grad_norm": 0.35765206813812256, "learning_rate": 0.0001730814639905549, "loss": 0.8931, "step": 120 }, { "epoch": 0.142018779342723, "grad_norm": 0.3511311411857605, "learning_rate": 0.00017284533648170012, "loss": 0.9042, "step": 121 }, { "epoch": 0.1431924882629108, "grad_norm": 0.33516445755958557, "learning_rate": 0.00017260920897284535, "loss": 0.8564, "step": 122 }, { "epoch": 0.1443661971830986, "grad_norm": 0.385959267616272, "learning_rate": 0.00017237308146399055, "loss": 0.963, "step": 123 }, { "epoch": 0.14553990610328638, "grad_norm": 0.34608641266822815, "learning_rate": 0.00017213695395513578, "loss": 0.8666, "step": 124 }, { "epoch": 0.1467136150234742, "grad_norm": 0.3705556392669678, "learning_rate": 0.00017190082644628102, "loss": 0.7783, "step": 125 }, { "epoch": 0.14788732394366197, "grad_norm": 0.3213210701942444, "learning_rate": 0.00017166469893742622, "loss": 0.8428, "step": 126 }, { "epoch": 0.14906103286384975, "grad_norm": 0.3903498351573944, "learning_rate": 0.00017142857142857143, "loss": 0.8418, "step": 127 }, { "epoch": 0.15023474178403756, "grad_norm": 0.3556365668773651, "learning_rate": 0.00017119244391971666, "loss": 0.8612, "step": 128 }, { "epoch": 0.15140845070422534, "grad_norm": 0.3734995424747467, "learning_rate": 0.00017095631641086186, "loss": 0.8845, "step": 129 }, { "epoch": 0.15258215962441316, "grad_norm": 0.33735260367393494, "learning_rate": 0.0001707201889020071, "loss": 0.8752, "step": 130 }, { "epoch": 0.15375586854460094, "grad_norm": 0.38340267539024353, "learning_rate": 
0.00017048406139315232, "loss": 0.8847, "step": 131 }, { "epoch": 0.15492957746478872, "grad_norm": 0.3654419779777527, "learning_rate": 0.00017024793388429753, "loss": 0.8448, "step": 132 }, { "epoch": 0.15610328638497653, "grad_norm": 0.3601568341255188, "learning_rate": 0.00017001180637544273, "loss": 0.8981, "step": 133 }, { "epoch": 0.1572769953051643, "grad_norm": 0.40733832120895386, "learning_rate": 0.00016977567886658796, "loss": 0.9135, "step": 134 }, { "epoch": 0.15845070422535212, "grad_norm": 0.34627673029899597, "learning_rate": 0.0001695395513577332, "loss": 0.9164, "step": 135 }, { "epoch": 0.1596244131455399, "grad_norm": 0.3865872621536255, "learning_rate": 0.0001693034238488784, "loss": 0.9222, "step": 136 }, { "epoch": 0.1607981220657277, "grad_norm": 0.4011456072330475, "learning_rate": 0.00016906729634002363, "loss": 0.8843, "step": 137 }, { "epoch": 0.1619718309859155, "grad_norm": 0.32259878516197205, "learning_rate": 0.00016883116883116884, "loss": 0.8427, "step": 138 }, { "epoch": 0.16314553990610328, "grad_norm": 0.3807618319988251, "learning_rate": 0.00016859504132231404, "loss": 0.8684, "step": 139 }, { "epoch": 0.1643192488262911, "grad_norm": 0.3658106327056885, "learning_rate": 0.00016835891381345927, "loss": 0.9024, "step": 140 }, { "epoch": 0.16549295774647887, "grad_norm": 0.3638787865638733, "learning_rate": 0.0001681227863046045, "loss": 0.8582, "step": 141 }, { "epoch": 0.16666666666666666, "grad_norm": 0.3839091360569, "learning_rate": 0.0001678866587957497, "loss": 0.8543, "step": 142 }, { "epoch": 0.16784037558685447, "grad_norm": 0.33579927682876587, "learning_rate": 0.00016765053128689494, "loss": 0.8765, "step": 143 }, { "epoch": 0.16901408450704225, "grad_norm": 0.35091203451156616, "learning_rate": 0.00016741440377804014, "loss": 0.8504, "step": 144 }, { "epoch": 0.17018779342723006, "grad_norm": 0.35823047161102295, "learning_rate": 0.00016717827626918538, "loss": 0.8534, "step": 145 }, { "epoch": 0.17136150234741784, 
"grad_norm": 0.37154486775398254, "learning_rate": 0.00016694214876033058, "loss": 0.851, "step": 146 }, { "epoch": 0.17253521126760563, "grad_norm": 0.33140066266059875, "learning_rate": 0.0001667060212514758, "loss": 0.8136, "step": 147 }, { "epoch": 0.17370892018779344, "grad_norm": 0.37408292293548584, "learning_rate": 0.00016646989374262104, "loss": 0.8933, "step": 148 }, { "epoch": 0.17488262910798122, "grad_norm": 0.36203357577323914, "learning_rate": 0.00016623376623376625, "loss": 0.8747, "step": 149 }, { "epoch": 0.176056338028169, "grad_norm": 0.35033532977104187, "learning_rate": 0.00016599763872491145, "loss": 0.8273, "step": 150 }, { "epoch": 0.1772300469483568, "grad_norm": 0.345048189163208, "learning_rate": 0.00016576151121605668, "loss": 0.8698, "step": 151 }, { "epoch": 0.1784037558685446, "grad_norm": 0.3592989146709442, "learning_rate": 0.0001655253837072019, "loss": 0.8483, "step": 152 }, { "epoch": 0.1795774647887324, "grad_norm": 0.3685864806175232, "learning_rate": 0.00016528925619834712, "loss": 0.915, "step": 153 }, { "epoch": 0.1807511737089202, "grad_norm": 0.3427909314632416, "learning_rate": 0.00016505312868949235, "loss": 0.8321, "step": 154 }, { "epoch": 0.18192488262910797, "grad_norm": 0.34697192907333374, "learning_rate": 0.00016481700118063756, "loss": 0.8801, "step": 155 }, { "epoch": 0.18309859154929578, "grad_norm": 0.3387276530265808, "learning_rate": 0.00016458087367178276, "loss": 0.8237, "step": 156 }, { "epoch": 0.18427230046948356, "grad_norm": 0.3547775447368622, "learning_rate": 0.000164344746162928, "loss": 0.8645, "step": 157 }, { "epoch": 0.18544600938967137, "grad_norm": 0.3342725932598114, "learning_rate": 0.00016410861865407322, "loss": 0.82, "step": 158 }, { "epoch": 0.18661971830985916, "grad_norm": 0.4317960739135742, "learning_rate": 0.00016387249114521843, "loss": 0.8614, "step": 159 }, { "epoch": 0.18779342723004694, "grad_norm": 0.35031062364578247, "learning_rate": 0.00016363636363636366, "loss": 0.8193, 
"step": 160 }, { "epoch": 0.18896713615023475, "grad_norm": 0.3616986572742462, "learning_rate": 0.00016340023612750886, "loss": 0.8571, "step": 161 }, { "epoch": 0.19014084507042253, "grad_norm": 0.36284518241882324, "learning_rate": 0.00016316410861865407, "loss": 0.8555, "step": 162 }, { "epoch": 0.19131455399061034, "grad_norm": 0.42962291836738586, "learning_rate": 0.0001629279811097993, "loss": 0.8574, "step": 163 }, { "epoch": 0.19248826291079812, "grad_norm": 0.330268532037735, "learning_rate": 0.00016269185360094453, "loss": 0.8952, "step": 164 }, { "epoch": 0.1936619718309859, "grad_norm": 0.33917295932769775, "learning_rate": 0.00016245572609208974, "loss": 0.8588, "step": 165 }, { "epoch": 0.19483568075117372, "grad_norm": 0.3963412046432495, "learning_rate": 0.00016221959858323494, "loss": 0.8451, "step": 166 }, { "epoch": 0.1960093896713615, "grad_norm": 0.33864182233810425, "learning_rate": 0.00016198347107438017, "loss": 0.8734, "step": 167 }, { "epoch": 0.19718309859154928, "grad_norm": 0.3751653730869293, "learning_rate": 0.00016174734356552538, "loss": 0.8786, "step": 168 }, { "epoch": 0.1983568075117371, "grad_norm": 0.4138842821121216, "learning_rate": 0.0001615112160566706, "loss": 0.8608, "step": 169 }, { "epoch": 0.19953051643192488, "grad_norm": 0.3747748136520386, "learning_rate": 0.00016127508854781584, "loss": 0.8901, "step": 170 }, { "epoch": 0.2007042253521127, "grad_norm": 0.3302014172077179, "learning_rate": 0.00016103896103896104, "loss": 0.8538, "step": 171 }, { "epoch": 0.20187793427230047, "grad_norm": 0.36144372820854187, "learning_rate": 0.00016080283353010625, "loss": 0.8634, "step": 172 }, { "epoch": 0.20305164319248825, "grad_norm": 0.3579455018043518, "learning_rate": 0.00016056670602125148, "loss": 0.8536, "step": 173 }, { "epoch": 0.20422535211267606, "grad_norm": 0.3475671410560608, "learning_rate": 0.0001603305785123967, "loss": 0.8304, "step": 174 }, { "epoch": 0.20539906103286384, "grad_norm": 0.34114810824394226, 
"learning_rate": 0.00016009445100354192, "loss": 0.8276, "step": 175 }, { "epoch": 0.20657276995305165, "grad_norm": 0.32198190689086914, "learning_rate": 0.00015985832349468715, "loss": 0.815, "step": 176 }, { "epoch": 0.20774647887323944, "grad_norm": 0.4003874361515045, "learning_rate": 0.00015962219598583238, "loss": 0.8523, "step": 177 }, { "epoch": 0.20892018779342722, "grad_norm": 0.32290229201316833, "learning_rate": 0.00015938606847697756, "loss": 0.8465, "step": 178 }, { "epoch": 0.21009389671361503, "grad_norm": 0.35729506611824036, "learning_rate": 0.0001591499409681228, "loss": 0.8437, "step": 179 }, { "epoch": 0.2112676056338028, "grad_norm": 0.33743324875831604, "learning_rate": 0.00015891381345926802, "loss": 0.8351, "step": 180 }, { "epoch": 0.21244131455399062, "grad_norm": 0.34673774242401123, "learning_rate": 0.00015867768595041322, "loss": 0.8146, "step": 181 }, { "epoch": 0.2136150234741784, "grad_norm": 0.37883323431015015, "learning_rate": 0.00015844155844155845, "loss": 0.8889, "step": 182 }, { "epoch": 0.2147887323943662, "grad_norm": 0.34172534942626953, "learning_rate": 0.00015820543093270366, "loss": 0.8479, "step": 183 }, { "epoch": 0.215962441314554, "grad_norm": 0.39948219060897827, "learning_rate": 0.0001579693034238489, "loss": 0.8383, "step": 184 }, { "epoch": 0.21713615023474178, "grad_norm": 0.33746814727783203, "learning_rate": 0.0001577331759149941, "loss": 0.8713, "step": 185 }, { "epoch": 0.21830985915492956, "grad_norm": 0.34141069650650024, "learning_rate": 0.00015749704840613933, "loss": 0.8303, "step": 186 }, { "epoch": 0.21948356807511737, "grad_norm": 0.35994264483451843, "learning_rate": 0.00015726092089728456, "loss": 0.7919, "step": 187 }, { "epoch": 0.22065727699530516, "grad_norm": 0.34234684705734253, "learning_rate": 0.00015702479338842976, "loss": 0.8225, "step": 188 }, { "epoch": 0.22183098591549297, "grad_norm": 0.3601793050765991, "learning_rate": 0.00015678866587957497, "loss": 0.8395, "step": 189 }, { 
"epoch": 0.22300469483568075, "grad_norm": 0.3154338002204895, "learning_rate": 0.0001565525383707202, "loss": 0.7735, "step": 190 }, { "epoch": 0.22417840375586853, "grad_norm": 0.3758296072483063, "learning_rate": 0.0001563164108618654, "loss": 0.8241, "step": 191 }, { "epoch": 0.22535211267605634, "grad_norm": 0.3732200264930725, "learning_rate": 0.00015608028335301063, "loss": 0.8116, "step": 192 }, { "epoch": 0.22652582159624413, "grad_norm": 0.3601556718349457, "learning_rate": 0.00015584415584415587, "loss": 0.8242, "step": 193 }, { "epoch": 0.22769953051643194, "grad_norm": 0.360442191362381, "learning_rate": 0.00015560802833530107, "loss": 0.832, "step": 194 }, { "epoch": 0.22887323943661972, "grad_norm": 0.35598254203796387, "learning_rate": 0.00015537190082644627, "loss": 0.8938, "step": 195 }, { "epoch": 0.2300469483568075, "grad_norm": 0.3962613046169281, "learning_rate": 0.0001551357733175915, "loss": 0.8409, "step": 196 }, { "epoch": 0.2312206572769953, "grad_norm": 0.3521510064601898, "learning_rate": 0.00015489964580873674, "loss": 0.8298, "step": 197 }, { "epoch": 0.2323943661971831, "grad_norm": 0.34407946467399597, "learning_rate": 0.00015466351829988194, "loss": 0.7921, "step": 198 }, { "epoch": 0.2335680751173709, "grad_norm": 0.3572155237197876, "learning_rate": 0.00015442739079102717, "loss": 0.8997, "step": 199 }, { "epoch": 0.2347417840375587, "grad_norm": 0.345745712518692, "learning_rate": 0.00015419126328217238, "loss": 0.8563, "step": 200 }, { "epoch": 0.23591549295774647, "grad_norm": 0.3741077780723572, "learning_rate": 0.00015395513577331758, "loss": 0.8334, "step": 201 }, { "epoch": 0.23708920187793428, "grad_norm": 0.36866459250450134, "learning_rate": 0.00015371900826446281, "loss": 0.8398, "step": 202 }, { "epoch": 0.23826291079812206, "grad_norm": 0.3834739625453949, "learning_rate": 0.00015348288075560805, "loss": 0.8181, "step": 203 }, { "epoch": 0.23943661971830985, "grad_norm": 0.373045951128006, "learning_rate": 
0.00015324675324675325, "loss": 0.8044, "step": 204 }, { "epoch": 0.24061032863849766, "grad_norm": 0.3418562412261963, "learning_rate": 0.00015301062573789848, "loss": 0.8454, "step": 205 }, { "epoch": 0.24178403755868544, "grad_norm": 0.36289098858833313, "learning_rate": 0.00015277449822904369, "loss": 0.8478, "step": 206 }, { "epoch": 0.24295774647887325, "grad_norm": 0.38806968927383423, "learning_rate": 0.00015253837072018892, "loss": 0.804, "step": 207 }, { "epoch": 0.24413145539906103, "grad_norm": 0.34217599034309387, "learning_rate": 0.00015230224321133412, "loss": 0.8391, "step": 208 }, { "epoch": 0.24530516431924881, "grad_norm": 0.3738957643508911, "learning_rate": 0.00015206611570247935, "loss": 0.9026, "step": 209 }, { "epoch": 0.24647887323943662, "grad_norm": 0.3481609523296356, "learning_rate": 0.00015182998819362458, "loss": 0.8674, "step": 210 }, { "epoch": 0.2476525821596244, "grad_norm": 0.38967254757881165, "learning_rate": 0.00015159386068476976, "loss": 0.8796, "step": 211 }, { "epoch": 0.24882629107981222, "grad_norm": 0.34841835498809814, "learning_rate": 0.000151357733175915, "loss": 0.7913, "step": 212 }, { "epoch": 0.25, "grad_norm": 0.33826395869255066, "learning_rate": 0.00015112160566706023, "loss": 0.8539, "step": 213 }, { "epoch": 0.2511737089201878, "grad_norm": 0.35131266713142395, "learning_rate": 0.00015088547815820543, "loss": 0.8072, "step": 214 }, { "epoch": 0.25234741784037557, "grad_norm": 0.3298250734806061, "learning_rate": 0.00015064935064935066, "loss": 0.7688, "step": 215 }, { "epoch": 0.2535211267605634, "grad_norm": 0.33808133006095886, "learning_rate": 0.0001504132231404959, "loss": 0.7609, "step": 216 }, { "epoch": 0.2546948356807512, "grad_norm": 0.37146687507629395, "learning_rate": 0.0001501770956316411, "loss": 0.843, "step": 217 }, { "epoch": 0.25586854460093894, "grad_norm": 0.33817118406295776, "learning_rate": 0.0001499409681227863, "loss": 0.7828, "step": 218 }, { "epoch": 0.25704225352112675, 
"grad_norm": 0.35203686356544495, "learning_rate": 0.00014970484061393153, "loss": 0.8236, "step": 219 }, { "epoch": 0.25821596244131456, "grad_norm": 0.34176716208457947, "learning_rate": 0.00014946871310507676, "loss": 0.8191, "step": 220 }, { "epoch": 0.25938967136150237, "grad_norm": 0.34649035334587097, "learning_rate": 0.00014923258559622197, "loss": 0.8284, "step": 221 }, { "epoch": 0.2605633802816901, "grad_norm": 0.35891467332839966, "learning_rate": 0.00014899645808736717, "loss": 0.8149, "step": 222 }, { "epoch": 0.26173708920187794, "grad_norm": 0.3408451974391937, "learning_rate": 0.0001487603305785124, "loss": 0.8049, "step": 223 }, { "epoch": 0.26291079812206575, "grad_norm": 0.36554664373397827, "learning_rate": 0.0001485242030696576, "loss": 0.8478, "step": 224 }, { "epoch": 0.2640845070422535, "grad_norm": 0.3355228304862976, "learning_rate": 0.00014828807556080284, "loss": 0.815, "step": 225 }, { "epoch": 0.2652582159624413, "grad_norm": 0.3500598669052124, "learning_rate": 0.00014805194805194807, "loss": 0.8571, "step": 226 }, { "epoch": 0.2664319248826291, "grad_norm": 0.3362652659416199, "learning_rate": 0.00014781582054309328, "loss": 0.8363, "step": 227 }, { "epoch": 0.2676056338028169, "grad_norm": 0.34258243441581726, "learning_rate": 0.00014757969303423848, "loss": 0.7648, "step": 228 }, { "epoch": 0.2687793427230047, "grad_norm": 0.34023317694664, "learning_rate": 0.0001473435655253837, "loss": 0.8373, "step": 229 }, { "epoch": 0.2699530516431925, "grad_norm": 0.35829535126686096, "learning_rate": 0.00014710743801652894, "loss": 0.8255, "step": 230 }, { "epoch": 0.2711267605633803, "grad_norm": 0.3499360978603363, "learning_rate": 0.00014687131050767415, "loss": 0.8514, "step": 231 }, { "epoch": 0.27230046948356806, "grad_norm": 0.3703480362892151, "learning_rate": 0.00014663518299881938, "loss": 0.8615, "step": 232 }, { "epoch": 0.2734741784037559, "grad_norm": 0.3460928499698639, "learning_rate": 0.0001463990554899646, "loss": 0.7891, 
"step": 233 }, { "epoch": 0.2746478873239437, "grad_norm": 0.34184372425079346, "learning_rate": 0.0001461629279811098, "loss": 0.8168, "step": 234 }, { "epoch": 0.27582159624413144, "grad_norm": 0.34520068764686584, "learning_rate": 0.00014592680047225502, "loss": 0.8271, "step": 235 }, { "epoch": 0.27699530516431925, "grad_norm": 0.3415423631668091, "learning_rate": 0.00014569067296340025, "loss": 0.783, "step": 236 }, { "epoch": 0.27816901408450706, "grad_norm": 0.34584441781044006, "learning_rate": 0.00014545454545454546, "loss": 0.8488, "step": 237 }, { "epoch": 0.2793427230046948, "grad_norm": 0.33898866176605225, "learning_rate": 0.0001452184179456907, "loss": 0.8786, "step": 238 }, { "epoch": 0.2805164319248826, "grad_norm": 0.3591814339160919, "learning_rate": 0.0001449822904368359, "loss": 0.8081, "step": 239 }, { "epoch": 0.28169014084507044, "grad_norm": 0.34305432438850403, "learning_rate": 0.0001447461629279811, "loss": 0.7911, "step": 240 }, { "epoch": 0.2828638497652582, "grad_norm": 0.35866865515708923, "learning_rate": 0.00014451003541912633, "loss": 0.8393, "step": 241 }, { "epoch": 0.284037558685446, "grad_norm": 0.3422331213951111, "learning_rate": 0.00014427390791027156, "loss": 0.848, "step": 242 }, { "epoch": 0.2852112676056338, "grad_norm": 0.33504337072372437, "learning_rate": 0.00014403778040141676, "loss": 0.7782, "step": 243 }, { "epoch": 0.2863849765258216, "grad_norm": 0.3509252667427063, "learning_rate": 0.000143801652892562, "loss": 0.8535, "step": 244 }, { "epoch": 0.2875586854460094, "grad_norm": 0.3254059851169586, "learning_rate": 0.0001435655253837072, "loss": 0.7642, "step": 245 }, { "epoch": 0.2887323943661972, "grad_norm": 0.33594879508018494, "learning_rate": 0.00014332939787485243, "loss": 0.814, "step": 246 }, { "epoch": 0.289906103286385, "grad_norm": 0.3620656132698059, "learning_rate": 0.00014309327036599764, "loss": 0.8248, "step": 247 }, { "epoch": 0.29107981220657275, "grad_norm": 0.3325202167034149, 
"learning_rate": 0.00014285714285714287, "loss": 0.7408, "step": 248 }, { "epoch": 0.29225352112676056, "grad_norm": 0.33905264735221863, "learning_rate": 0.0001426210153482881, "loss": 0.8446, "step": 249 }, { "epoch": 0.2934272300469484, "grad_norm": 0.3577309548854828, "learning_rate": 0.0001423848878394333, "loss": 0.784, "step": 250 }, { "epoch": 0.29460093896713613, "grad_norm": 0.3840247392654419, "learning_rate": 0.0001421487603305785, "loss": 0.8068, "step": 251 }, { "epoch": 0.29577464788732394, "grad_norm": 0.3539847433567047, "learning_rate": 0.00014191263282172374, "loss": 0.8232, "step": 252 }, { "epoch": 0.29694835680751175, "grad_norm": 0.33225932717323303, "learning_rate": 0.00014167650531286894, "loss": 0.7946, "step": 253 }, { "epoch": 0.2981220657276995, "grad_norm": 0.3429291546344757, "learning_rate": 0.00014144037780401418, "loss": 0.816, "step": 254 }, { "epoch": 0.2992957746478873, "grad_norm": 0.3584197163581848, "learning_rate": 0.0001412042502951594, "loss": 0.8351, "step": 255 }, { "epoch": 0.3004694835680751, "grad_norm": 0.35585007071495056, "learning_rate": 0.0001409681227863046, "loss": 0.8255, "step": 256 }, { "epoch": 0.30164319248826293, "grad_norm": 0.3510012924671173, "learning_rate": 0.00014073199527744982, "loss": 0.7889, "step": 257 }, { "epoch": 0.3028169014084507, "grad_norm": 0.36646419763565063, "learning_rate": 0.00014049586776859505, "loss": 0.8161, "step": 258 }, { "epoch": 0.3039906103286385, "grad_norm": 0.35207659006118774, "learning_rate": 0.00014025974025974028, "loss": 0.8151, "step": 259 }, { "epoch": 0.3051643192488263, "grad_norm": 0.33348143100738525, "learning_rate": 0.00014002361275088548, "loss": 0.8108, "step": 260 }, { "epoch": 0.30633802816901406, "grad_norm": 0.3474767506122589, "learning_rate": 0.00013978748524203072, "loss": 0.8105, "step": 261 }, { "epoch": 0.3075117370892019, "grad_norm": 0.37046462297439575, "learning_rate": 0.00013955135773317592, "loss": 0.867, "step": 262 }, { "epoch": 
0.3086854460093897, "grad_norm": 0.3426377475261688, "learning_rate": 0.00013931523022432112, "loss": 0.8281, "step": 263 }, { "epoch": 0.30985915492957744, "grad_norm": 0.3340952694416046, "learning_rate": 0.00013907910271546636, "loss": 0.7805, "step": 264 }, { "epoch": 0.31103286384976525, "grad_norm": 0.3546634316444397, "learning_rate": 0.0001388429752066116, "loss": 0.824, "step": 265 }, { "epoch": 0.31220657276995306, "grad_norm": 0.3211507499217987, "learning_rate": 0.0001386068476977568, "loss": 0.7572, "step": 266 }, { "epoch": 0.31338028169014087, "grad_norm": 0.3440265357494354, "learning_rate": 0.000138370720188902, "loss": 0.8247, "step": 267 }, { "epoch": 0.3145539906103286, "grad_norm": 0.34174132347106934, "learning_rate": 0.00013813459268004723, "loss": 0.7939, "step": 268 }, { "epoch": 0.31572769953051644, "grad_norm": 0.3415057361125946, "learning_rate": 0.00013789846517119246, "loss": 0.8184, "step": 269 }, { "epoch": 0.31690140845070425, "grad_norm": 0.3313206732273102, "learning_rate": 0.00013766233766233766, "loss": 0.7936, "step": 270 }, { "epoch": 0.318075117370892, "grad_norm": 0.35693395137786865, "learning_rate": 0.0001374262101534829, "loss": 0.7738, "step": 271 }, { "epoch": 0.3192488262910798, "grad_norm": 0.3530910313129425, "learning_rate": 0.00013719008264462813, "loss": 0.7901, "step": 272 }, { "epoch": 0.3204225352112676, "grad_norm": 0.34867924451828003, "learning_rate": 0.0001369539551357733, "loss": 0.8281, "step": 273 }, { "epoch": 0.3215962441314554, "grad_norm": 0.34141889214515686, "learning_rate": 0.00013671782762691854, "loss": 0.7987, "step": 274 }, { "epoch": 0.3227699530516432, "grad_norm": 0.3511849045753479, "learning_rate": 0.00013648170011806377, "loss": 0.8306, "step": 275 }, { "epoch": 0.323943661971831, "grad_norm": 0.343523770570755, "learning_rate": 0.00013624557260920897, "loss": 0.7813, "step": 276 }, { "epoch": 0.32511737089201875, "grad_norm": 0.3539726138114929, "learning_rate": 0.0001360094451003542, 
"loss": 0.8258, "step": 277 }, { "epoch": 0.32629107981220656, "grad_norm": 0.35628989338874817, "learning_rate": 0.00013577331759149943, "loss": 0.829, "step": 278 }, { "epoch": 0.3274647887323944, "grad_norm": 0.3531114459037781, "learning_rate": 0.00013553719008264464, "loss": 0.8475, "step": 279 }, { "epoch": 0.3286384976525822, "grad_norm": 0.35344576835632324, "learning_rate": 0.00013530106257378984, "loss": 0.8343, "step": 280 }, { "epoch": 0.32981220657276994, "grad_norm": 0.37604016065597534, "learning_rate": 0.00013506493506493507, "loss": 0.7598, "step": 281 }, { "epoch": 0.33098591549295775, "grad_norm": 0.35646241903305054, "learning_rate": 0.0001348288075560803, "loss": 0.83, "step": 282 }, { "epoch": 0.33215962441314556, "grad_norm": 0.36084675788879395, "learning_rate": 0.0001345926800472255, "loss": 0.7465, "step": 283 }, { "epoch": 0.3333333333333333, "grad_norm": 0.3514406085014343, "learning_rate": 0.00013435655253837071, "loss": 0.7979, "step": 284 }, { "epoch": 0.3345070422535211, "grad_norm": 0.3554603159427643, "learning_rate": 0.00013412042502951595, "loss": 0.8487, "step": 285 }, { "epoch": 0.33568075117370894, "grad_norm": 0.3360341787338257, "learning_rate": 0.00013388429752066115, "loss": 0.7787, "step": 286 }, { "epoch": 0.3368544600938967, "grad_norm": 0.35026323795318604, "learning_rate": 0.00013364817001180638, "loss": 0.7845, "step": 287 }, { "epoch": 0.3380281690140845, "grad_norm": 0.3419228494167328, "learning_rate": 0.00013341204250295161, "loss": 0.7971, "step": 288 }, { "epoch": 0.3392018779342723, "grad_norm": 0.3314400315284729, "learning_rate": 0.00013317591499409682, "loss": 0.7899, "step": 289 }, { "epoch": 0.3403755868544601, "grad_norm": 0.3434331715106964, "learning_rate": 0.00013293978748524202, "loss": 0.827, "step": 290 }, { "epoch": 0.3415492957746479, "grad_norm": 0.34718382358551025, "learning_rate": 0.00013270365997638725, "loss": 0.7835, "step": 291 }, { "epoch": 0.3427230046948357, "grad_norm": 
0.3585168421268463, "learning_rate": 0.00013246753246753249, "loss": 0.8728, "step": 292 }, { "epoch": 0.3438967136150235, "grad_norm": 0.3508673906326294, "learning_rate": 0.0001322314049586777, "loss": 0.836, "step": 293 }, { "epoch": 0.34507042253521125, "grad_norm": 0.40241560339927673, "learning_rate": 0.00013199527744982292, "loss": 0.8043, "step": 294 }, { "epoch": 0.34624413145539906, "grad_norm": 0.33775267004966736, "learning_rate": 0.00013175914994096813, "loss": 0.8047, "step": 295 }, { "epoch": 0.3474178403755869, "grad_norm": 0.3423898220062256, "learning_rate": 0.00013152302243211333, "loss": 0.7894, "step": 296 }, { "epoch": 0.3485915492957746, "grad_norm": 0.3472992479801178, "learning_rate": 0.00013128689492325856, "loss": 0.8198, "step": 297 }, { "epoch": 0.34976525821596244, "grad_norm": 0.3425481915473938, "learning_rate": 0.0001310507674144038, "loss": 0.8178, "step": 298 }, { "epoch": 0.35093896713615025, "grad_norm": 0.3459112048149109, "learning_rate": 0.000130814639905549, "loss": 0.7749, "step": 299 }, { "epoch": 0.352112676056338, "grad_norm": 0.353595495223999, "learning_rate": 0.00013057851239669423, "loss": 0.7886, "step": 300 }, { "epoch": 0.3532863849765258, "grad_norm": 0.35495465993881226, "learning_rate": 0.00013034238488783943, "loss": 0.771, "step": 301 }, { "epoch": 0.3544600938967136, "grad_norm": 0.34812483191490173, "learning_rate": 0.00013010625737898467, "loss": 0.8335, "step": 302 }, { "epoch": 0.35563380281690143, "grad_norm": 0.3655085861682892, "learning_rate": 0.00012987012987012987, "loss": 0.8117, "step": 303 }, { "epoch": 0.3568075117370892, "grad_norm": 0.35925915837287903, "learning_rate": 0.0001296340023612751, "loss": 0.8147, "step": 304 }, { "epoch": 0.357981220657277, "grad_norm": 0.3293222486972809, "learning_rate": 0.00012939787485242033, "loss": 0.7602, "step": 305 }, { "epoch": 0.3591549295774648, "grad_norm": 0.3486446738243103, "learning_rate": 0.00012916174734356554, "loss": 0.7857, "step": 306 }, { 
"epoch": 0.36032863849765256, "grad_norm": 0.382565975189209, "learning_rate": 0.00012892561983471074, "loss": 0.863, "step": 307 }, { "epoch": 0.3615023474178404, "grad_norm": 0.32544344663619995, "learning_rate": 0.00012868949232585597, "loss": 0.781, "step": 308 }, { "epoch": 0.3626760563380282, "grad_norm": 0.38700491189956665, "learning_rate": 0.00012845336481700118, "loss": 0.8102, "step": 309 }, { "epoch": 0.36384976525821594, "grad_norm": 0.3503759503364563, "learning_rate": 0.0001282172373081464, "loss": 0.7699, "step": 310 }, { "epoch": 0.36502347417840375, "grad_norm": 0.3323630094528198, "learning_rate": 0.00012798110979929164, "loss": 0.7511, "step": 311 }, { "epoch": 0.36619718309859156, "grad_norm": 0.3668995797634125, "learning_rate": 0.00012774498229043685, "loss": 0.7374, "step": 312 }, { "epoch": 0.3673708920187793, "grad_norm": 0.37373387813568115, "learning_rate": 0.00012750885478158205, "loss": 0.8077, "step": 313 }, { "epoch": 0.3685446009389671, "grad_norm": 0.3601135015487671, "learning_rate": 0.00012727272727272728, "loss": 0.7991, "step": 314 }, { "epoch": 0.36971830985915494, "grad_norm": 0.3527435064315796, "learning_rate": 0.00012703659976387249, "loss": 0.7971, "step": 315 }, { "epoch": 0.37089201877934275, "grad_norm": 0.3584372401237488, "learning_rate": 0.00012680047225501772, "loss": 0.7513, "step": 316 }, { "epoch": 0.3720657276995305, "grad_norm": 0.3517726957798004, "learning_rate": 0.00012656434474616295, "loss": 0.8206, "step": 317 }, { "epoch": 0.3732394366197183, "grad_norm": 0.3655302822589874, "learning_rate": 0.00012632821723730815, "loss": 0.771, "step": 318 }, { "epoch": 0.3744131455399061, "grad_norm": 0.3659893274307251, "learning_rate": 0.00012609208972845336, "loss": 0.8048, "step": 319 }, { "epoch": 0.3755868544600939, "grad_norm": 0.36364591121673584, "learning_rate": 0.0001258559622195986, "loss": 0.7832, "step": 320 }, { "epoch": 0.3767605633802817, "grad_norm": 0.37528395652770996, "learning_rate": 
0.00012561983471074382, "loss": 0.7926, "step": 321 }, { "epoch": 0.3779342723004695, "grad_norm": 0.37137654423713684, "learning_rate": 0.00012538370720188903, "loss": 0.8486, "step": 322 }, { "epoch": 0.37910798122065725, "grad_norm": 0.3466728925704956, "learning_rate": 0.00012514757969303423, "loss": 0.7961, "step": 323 }, { "epoch": 0.38028169014084506, "grad_norm": 0.38629114627838135, "learning_rate": 0.00012491145218417946, "loss": 0.8071, "step": 324 }, { "epoch": 0.3814553990610329, "grad_norm": 0.34686383605003357, "learning_rate": 0.00012467532467532467, "loss": 0.7698, "step": 325 }, { "epoch": 0.3826291079812207, "grad_norm": 0.36625292897224426, "learning_rate": 0.0001244391971664699, "loss": 0.8486, "step": 326 }, { "epoch": 0.38380281690140844, "grad_norm": 0.38903650641441345, "learning_rate": 0.00012420306965761513, "loss": 0.8031, "step": 327 }, { "epoch": 0.38497652582159625, "grad_norm": 0.3456287980079651, "learning_rate": 0.00012396694214876033, "loss": 0.7887, "step": 328 }, { "epoch": 0.38615023474178406, "grad_norm": 0.36374613642692566, "learning_rate": 0.00012373081463990554, "loss": 0.7588, "step": 329 }, { "epoch": 0.3873239436619718, "grad_norm": 0.360626220703125, "learning_rate": 0.00012349468713105077, "loss": 0.8239, "step": 330 }, { "epoch": 0.3884976525821596, "grad_norm": 0.40213796496391296, "learning_rate": 0.000123258559622196, "loss": 0.8029, "step": 331 }, { "epoch": 0.38967136150234744, "grad_norm": 0.3273613750934601, "learning_rate": 0.0001230224321133412, "loss": 0.7567, "step": 332 }, { "epoch": 0.3908450704225352, "grad_norm": 0.34953057765960693, "learning_rate": 0.00012278630460448644, "loss": 0.7512, "step": 333 }, { "epoch": 0.392018779342723, "grad_norm": 0.34772762656211853, "learning_rate": 0.00012255017709563167, "loss": 0.7551, "step": 334 }, { "epoch": 0.3931924882629108, "grad_norm": 0.34170207381248474, "learning_rate": 0.00012231404958677685, "loss": 0.7884, "step": 335 }, { "epoch": 
0.39436619718309857, "grad_norm": 0.3696103096008301, "learning_rate": 0.00012207792207792208, "loss": 0.8658, "step": 336 }, { "epoch": 0.3955399061032864, "grad_norm": 0.3513827621936798, "learning_rate": 0.00012184179456906731, "loss": 0.8199, "step": 337 }, { "epoch": 0.3967136150234742, "grad_norm": 0.3454856872558594, "learning_rate": 0.00012160566706021253, "loss": 0.7627, "step": 338 }, { "epoch": 0.397887323943662, "grad_norm": 0.3246639370918274, "learning_rate": 0.00012136953955135774, "loss": 0.7454, "step": 339 }, { "epoch": 0.39906103286384975, "grad_norm": 0.33567938208580017, "learning_rate": 0.00012113341204250295, "loss": 0.7611, "step": 340 }, { "epoch": 0.40023474178403756, "grad_norm": 0.33728334307670593, "learning_rate": 0.00012089728453364817, "loss": 0.7575, "step": 341 }, { "epoch": 0.4014084507042254, "grad_norm": 0.35161352157592773, "learning_rate": 0.0001206611570247934, "loss": 0.8117, "step": 342 }, { "epoch": 0.4025821596244131, "grad_norm": 0.3425585925579071, "learning_rate": 0.00012042502951593862, "loss": 0.8019, "step": 343 }, { "epoch": 0.40375586854460094, "grad_norm": 0.3406507968902588, "learning_rate": 0.00012018890200708383, "loss": 0.8235, "step": 344 }, { "epoch": 0.40492957746478875, "grad_norm": 0.37840309739112854, "learning_rate": 0.00011995277449822907, "loss": 0.7866, "step": 345 }, { "epoch": 0.4061032863849765, "grad_norm": 0.35816213488578796, "learning_rate": 0.00011971664698937426, "loss": 0.8425, "step": 346 }, { "epoch": 0.4072769953051643, "grad_norm": 0.3441546559333801, "learning_rate": 0.00011948051948051949, "loss": 0.8094, "step": 347 }, { "epoch": 0.4084507042253521, "grad_norm": 0.34275054931640625, "learning_rate": 0.0001192443919716647, "loss": 0.7244, "step": 348 }, { "epoch": 0.4096244131455399, "grad_norm": 0.33207401633262634, "learning_rate": 0.00011900826446280992, "loss": 0.8108, "step": 349 }, { "epoch": 0.4107981220657277, "grad_norm": 0.3412252962589264, "learning_rate": 
0.00011877213695395516, "loss": 0.7818, "step": 350 }, { "epoch": 0.4119718309859155, "grad_norm": 0.36701643466949463, "learning_rate": 0.00011853600944510035, "loss": 0.8293, "step": 351 }, { "epoch": 0.4131455399061033, "grad_norm": 0.34462520480155945, "learning_rate": 0.00011829988193624558, "loss": 0.7603, "step": 352 }, { "epoch": 0.41431924882629106, "grad_norm": 0.35232508182525635, "learning_rate": 0.0001180637544273908, "loss": 0.7616, "step": 353 }, { "epoch": 0.4154929577464789, "grad_norm": 0.37428373098373413, "learning_rate": 0.00011782762691853601, "loss": 0.7919, "step": 354 }, { "epoch": 0.4166666666666667, "grad_norm": 0.3429507911205292, "learning_rate": 0.00011759149940968123, "loss": 0.7859, "step": 355 }, { "epoch": 0.41784037558685444, "grad_norm": 0.3584844470024109, "learning_rate": 0.00011735537190082646, "loss": 0.7934, "step": 356 }, { "epoch": 0.41901408450704225, "grad_norm": 0.356391578912735, "learning_rate": 0.00011711924439197165, "loss": 0.8222, "step": 357 }, { "epoch": 0.42018779342723006, "grad_norm": 0.3663417100906372, "learning_rate": 0.00011688311688311689, "loss": 0.7507, "step": 358 }, { "epoch": 0.4213615023474178, "grad_norm": 0.3388553559780121, "learning_rate": 0.0001166469893742621, "loss": 0.8263, "step": 359 }, { "epoch": 0.4225352112676056, "grad_norm": 0.34876593947410583, "learning_rate": 0.00011641086186540732, "loss": 0.7969, "step": 360 }, { "epoch": 0.42370892018779344, "grad_norm": 0.3500271737575531, "learning_rate": 0.00011617473435655255, "loss": 0.7789, "step": 361 }, { "epoch": 0.42488262910798125, "grad_norm": 0.3554798662662506, "learning_rate": 0.00011593860684769777, "loss": 0.7681, "step": 362 }, { "epoch": 0.426056338028169, "grad_norm": 0.34559762477874756, "learning_rate": 0.00011570247933884298, "loss": 0.7676, "step": 363 }, { "epoch": 0.4272300469483568, "grad_norm": 0.3520505726337433, "learning_rate": 0.0001154663518299882, "loss": 0.7494, "step": 364 }, { "epoch": 0.4284037558685446, 
"grad_norm": 0.35454803705215454, "learning_rate": 0.00011523022432113341, "loss": 0.7516, "step": 365 }, { "epoch": 0.4295774647887324, "grad_norm": 0.36526602506637573, "learning_rate": 0.00011499409681227864, "loss": 0.7789, "step": 366 }, { "epoch": 0.4307511737089202, "grad_norm": 0.34084445238113403, "learning_rate": 0.00011475796930342386, "loss": 0.7446, "step": 367 }, { "epoch": 0.431924882629108, "grad_norm": 0.3405500054359436, "learning_rate": 0.00011452184179456907, "loss": 0.8217, "step": 368 }, { "epoch": 0.43309859154929575, "grad_norm": 0.3523256182670593, "learning_rate": 0.00011428571428571428, "loss": 0.7311, "step": 369 }, { "epoch": 0.43427230046948356, "grad_norm": 0.3336530327796936, "learning_rate": 0.0001140495867768595, "loss": 0.7806, "step": 370 }, { "epoch": 0.4354460093896714, "grad_norm": 0.3268769383430481, "learning_rate": 0.00011381345926800473, "loss": 0.7945, "step": 371 }, { "epoch": 0.43661971830985913, "grad_norm": 0.35258617997169495, "learning_rate": 0.00011357733175914995, "loss": 0.7468, "step": 372 }, { "epoch": 0.43779342723004694, "grad_norm": 0.3546913266181946, "learning_rate": 0.00011334120425029517, "loss": 0.7921, "step": 373 }, { "epoch": 0.43896713615023475, "grad_norm": 0.36266180872917175, "learning_rate": 0.00011310507674144037, "loss": 0.7623, "step": 374 }, { "epoch": 0.44014084507042256, "grad_norm": 0.3355543613433838, "learning_rate": 0.00011286894923258559, "loss": 0.7436, "step": 375 }, { "epoch": 0.4413145539906103, "grad_norm": 0.33666127920150757, "learning_rate": 0.00011263282172373082, "loss": 0.7609, "step": 376 }, { "epoch": 0.4424882629107981, "grad_norm": 0.3505670428276062, "learning_rate": 0.00011239669421487604, "loss": 0.7868, "step": 377 }, { "epoch": 0.44366197183098594, "grad_norm": 0.3446255028247833, "learning_rate": 0.00011216056670602126, "loss": 0.765, "step": 378 }, { "epoch": 0.4448356807511737, "grad_norm": 0.3761040270328522, "learning_rate": 0.00011192443919716649, "loss": 
0.8104, "step": 379 }, { "epoch": 0.4460093896713615, "grad_norm": 0.35692986845970154, "learning_rate": 0.00011168831168831168, "loss": 0.7896, "step": 380 }, { "epoch": 0.4471830985915493, "grad_norm": 0.34384050965309143, "learning_rate": 0.00011145218417945691, "loss": 0.7716, "step": 381 }, { "epoch": 0.44835680751173707, "grad_norm": 0.3477395176887512, "learning_rate": 0.00011121605667060213, "loss": 0.8146, "step": 382 }, { "epoch": 0.4495305164319249, "grad_norm": 0.35172998905181885, "learning_rate": 0.00011097992916174735, "loss": 0.7844, "step": 383 }, { "epoch": 0.4507042253521127, "grad_norm": 0.33881857991218567, "learning_rate": 0.00011074380165289258, "loss": 0.7528, "step": 384 }, { "epoch": 0.4518779342723005, "grad_norm": 0.3429534137248993, "learning_rate": 0.00011050767414403777, "loss": 0.7826, "step": 385 }, { "epoch": 0.45305164319248825, "grad_norm": 0.34472665190696716, "learning_rate": 0.000110271546635183, "loss": 0.7153, "step": 386 }, { "epoch": 0.45422535211267606, "grad_norm": 0.3572479486465454, "learning_rate": 0.00011003541912632822, "loss": 0.7811, "step": 387 }, { "epoch": 0.45539906103286387, "grad_norm": 0.3531682789325714, "learning_rate": 0.00010979929161747344, "loss": 0.8016, "step": 388 }, { "epoch": 0.4565727699530516, "grad_norm": 0.3845299780368805, "learning_rate": 0.00010956316410861867, "loss": 0.7817, "step": 389 }, { "epoch": 0.45774647887323944, "grad_norm": 0.35217660665512085, "learning_rate": 0.00010932703659976389, "loss": 0.7495, "step": 390 }, { "epoch": 0.45892018779342725, "grad_norm": 0.35103702545166016, "learning_rate": 0.00010909090909090909, "loss": 0.7602, "step": 391 }, { "epoch": 0.460093896713615, "grad_norm": 0.3511259853839874, "learning_rate": 0.00010885478158205431, "loss": 0.7923, "step": 392 }, { "epoch": 0.4612676056338028, "grad_norm": 0.33732983469963074, "learning_rate": 0.00010861865407319953, "loss": 0.7875, "step": 393 }, { "epoch": 0.4624413145539906, "grad_norm": 
0.35035955905914307, "learning_rate": 0.00010838252656434476, "loss": 0.7737, "step": 394 }, { "epoch": 0.4636150234741784, "grad_norm": 0.3277076482772827, "learning_rate": 0.00010814639905548998, "loss": 0.7619, "step": 395 }, { "epoch": 0.4647887323943662, "grad_norm": 0.34461456537246704, "learning_rate": 0.00010791027154663518, "loss": 0.7394, "step": 396 }, { "epoch": 0.465962441314554, "grad_norm": 0.36000820994377136, "learning_rate": 0.0001076741440377804, "loss": 0.8004, "step": 397 }, { "epoch": 0.4671361502347418, "grad_norm": 0.3291054666042328, "learning_rate": 0.00010743801652892562, "loss": 0.721, "step": 398 }, { "epoch": 0.46830985915492956, "grad_norm": 0.37541574239730835, "learning_rate": 0.00010720188902007085, "loss": 0.7673, "step": 399 }, { "epoch": 0.4694835680751174, "grad_norm": 0.33268067240715027, "learning_rate": 0.00010696576151121607, "loss": 0.7439, "step": 400 }, { "epoch": 0.4706572769953052, "grad_norm": 0.34383484721183777, "learning_rate": 0.00010672963400236129, "loss": 0.7453, "step": 401 }, { "epoch": 0.47183098591549294, "grad_norm": 0.3543702960014343, "learning_rate": 0.00010649350649350649, "loss": 0.7544, "step": 402 }, { "epoch": 0.47300469483568075, "grad_norm": 0.34553685784339905, "learning_rate": 0.00010625737898465171, "loss": 0.7656, "step": 403 }, { "epoch": 0.47417840375586856, "grad_norm": 0.3437071144580841, "learning_rate": 0.00010602125147579694, "loss": 0.773, "step": 404 }, { "epoch": 0.4753521126760563, "grad_norm": 0.34917253255844116, "learning_rate": 0.00010578512396694216, "loss": 0.7607, "step": 405 }, { "epoch": 0.4765258215962441, "grad_norm": 0.33429262042045593, "learning_rate": 0.00010554899645808738, "loss": 0.768, "step": 406 }, { "epoch": 0.47769953051643194, "grad_norm": 0.33842045068740845, "learning_rate": 0.00010531286894923261, "loss": 0.7665, "step": 407 }, { "epoch": 0.4788732394366197, "grad_norm": 0.3419265151023865, "learning_rate": 0.0001050767414403778, "loss": 0.7717, "step": 
408 }, { "epoch": 0.4800469483568075, "grad_norm": 0.3458483815193176, "learning_rate": 0.00010484061393152303, "loss": 0.8031, "step": 409 }, { "epoch": 0.4812206572769953, "grad_norm": 0.37077274918556213, "learning_rate": 0.00010460448642266825, "loss": 0.8009, "step": 410 }, { "epoch": 0.4823943661971831, "grad_norm": 0.35040315985679626, "learning_rate": 0.00010436835891381347, "loss": 0.7545, "step": 411 }, { "epoch": 0.4835680751173709, "grad_norm": 0.3503456115722656, "learning_rate": 0.0001041322314049587, "loss": 0.8515, "step": 412 }, { "epoch": 0.4847417840375587, "grad_norm": 0.34627342224121094, "learning_rate": 0.00010389610389610389, "loss": 0.716, "step": 413 }, { "epoch": 0.4859154929577465, "grad_norm": 0.3596992790699005, "learning_rate": 0.00010365997638724912, "loss": 0.7636, "step": 414 }, { "epoch": 0.48708920187793425, "grad_norm": 0.3346829116344452, "learning_rate": 0.00010342384887839434, "loss": 0.7635, "step": 415 }, { "epoch": 0.48826291079812206, "grad_norm": 0.37179237604141235, "learning_rate": 0.00010318772136953956, "loss": 0.7642, "step": 416 }, { "epoch": 0.4894366197183099, "grad_norm": 0.34897381067276, "learning_rate": 0.00010295159386068479, "loss": 0.7792, "step": 417 }, { "epoch": 0.49061032863849763, "grad_norm": 0.3820830285549164, "learning_rate": 0.00010271546635183, "loss": 0.7722, "step": 418 }, { "epoch": 0.49178403755868544, "grad_norm": 0.3688552677631378, "learning_rate": 0.00010247933884297521, "loss": 0.7927, "step": 419 }, { "epoch": 0.49295774647887325, "grad_norm": 0.35100415349006653, "learning_rate": 0.00010224321133412043, "loss": 0.7848, "step": 420 }, { "epoch": 0.49413145539906106, "grad_norm": 0.3596225082874298, "learning_rate": 0.00010200708382526565, "loss": 0.7383, "step": 421 }, { "epoch": 0.4953051643192488, "grad_norm": 0.36203423142433167, "learning_rate": 0.00010177095631641088, "loss": 0.769, "step": 422 }, { "epoch": 0.4964788732394366, "grad_norm": 0.3776590824127197, "learning_rate": 
0.0001015348288075561, "loss": 0.8007, "step": 423 }, { "epoch": 0.49765258215962443, "grad_norm": 0.36009421944618225, "learning_rate": 0.0001012987012987013, "loss": 0.7557, "step": 424 }, { "epoch": 0.4988262910798122, "grad_norm": 0.3442706763744354, "learning_rate": 0.00010106257378984652, "loss": 0.7488, "step": 425 }, { "epoch": 0.5, "grad_norm": 0.3635407090187073, "learning_rate": 0.00010082644628099174, "loss": 0.7922, "step": 426 }, { "epoch": 0.5011737089201878, "grad_norm": 0.3766370117664337, "learning_rate": 0.00010059031877213697, "loss": 0.7818, "step": 427 }, { "epoch": 0.5023474178403756, "grad_norm": 0.34344202280044556, "learning_rate": 0.00010035419126328218, "loss": 0.8308, "step": 428 }, { "epoch": 0.5035211267605634, "grad_norm": 0.3495674133300781, "learning_rate": 0.0001001180637544274, "loss": 0.799, "step": 429 }, { "epoch": 0.5046948356807511, "grad_norm": 0.36545464396476746, "learning_rate": 9.988193624557262e-05, "loss": 0.7453, "step": 430 }, { "epoch": 0.505868544600939, "grad_norm": 0.3482630252838135, "learning_rate": 9.964580873671782e-05, "loss": 0.7422, "step": 431 }, { "epoch": 0.5070422535211268, "grad_norm": 0.3745418190956116, "learning_rate": 9.940968122786304e-05, "loss": 0.7333, "step": 432 }, { "epoch": 0.5082159624413145, "grad_norm": 0.3470025062561035, "learning_rate": 9.917355371900827e-05, "loss": 0.7907, "step": 433 }, { "epoch": 0.5093896713615024, "grad_norm": 0.38251325488090515, "learning_rate": 9.893742621015348e-05, "loss": 0.7629, "step": 434 }, { "epoch": 0.5105633802816901, "grad_norm": 0.3829626739025116, "learning_rate": 9.870129870129871e-05, "loss": 0.7939, "step": 435 }, { "epoch": 0.5117370892018779, "grad_norm": 0.35726287961006165, "learning_rate": 9.846517119244393e-05, "loss": 0.755, "step": 436 }, { "epoch": 0.5129107981220657, "grad_norm": 0.38168108463287354, "learning_rate": 9.822904368358913e-05, "loss": 0.7396, "step": 437 }, { "epoch": 0.5140845070422535, "grad_norm": 
0.35728660225868225, "learning_rate": 9.799291617473436e-05, "loss": 0.7568, "step": 438 }, { "epoch": 0.5152582159624414, "grad_norm": 0.37819668650627136, "learning_rate": 9.775678866587958e-05, "loss": 0.8046, "step": 439 }, { "epoch": 0.5164319248826291, "grad_norm": 0.4106784760951996, "learning_rate": 9.75206611570248e-05, "loss": 0.7116, "step": 440 }, { "epoch": 0.5176056338028169, "grad_norm": 0.3476578891277313, "learning_rate": 9.728453364817002e-05, "loss": 0.7824, "step": 441 }, { "epoch": 0.5187793427230047, "grad_norm": 0.36705800890922546, "learning_rate": 9.704840613931524e-05, "loss": 0.7631, "step": 442 }, { "epoch": 0.5199530516431925, "grad_norm": 0.3880864977836609, "learning_rate": 9.681227863046045e-05, "loss": 0.7608, "step": 443 }, { "epoch": 0.5211267605633803, "grad_norm": 0.3610959053039551, "learning_rate": 9.657615112160567e-05, "loss": 0.7909, "step": 444 }, { "epoch": 0.5223004694835681, "grad_norm": 0.33494657278060913, "learning_rate": 9.634002361275089e-05, "loss": 0.7108, "step": 445 }, { "epoch": 0.5234741784037559, "grad_norm": 0.352055162191391, "learning_rate": 9.610389610389611e-05, "loss": 0.7177, "step": 446 }, { "epoch": 0.5246478873239436, "grad_norm": 0.35466742515563965, "learning_rate": 9.586776859504133e-05, "loss": 0.7762, "step": 447 }, { "epoch": 0.5258215962441315, "grad_norm": 0.34477657079696655, "learning_rate": 9.563164108618654e-05, "loss": 0.7583, "step": 448 }, { "epoch": 0.5269953051643192, "grad_norm": 0.37008315324783325, "learning_rate": 9.539551357733176e-05, "loss": 0.7954, "step": 449 }, { "epoch": 0.528169014084507, "grad_norm": 0.34141793847084045, "learning_rate": 9.515938606847698e-05, "loss": 0.7444, "step": 450 }, { "epoch": 0.5293427230046949, "grad_norm": 0.3429400622844696, "learning_rate": 9.49232585596222e-05, "loss": 0.7499, "step": 451 }, { "epoch": 0.5305164319248826, "grad_norm": 0.3666730225086212, "learning_rate": 9.468713105076742e-05, "loss": 0.7704, "step": 452 }, { "epoch": 
0.5316901408450704, "grad_norm": 0.34185874462127686, "learning_rate": 9.445100354191265e-05, "loss": 0.7446, "step": 453 }, { "epoch": 0.5328638497652582, "grad_norm": 0.3718375861644745, "learning_rate": 9.421487603305785e-05, "loss": 0.7316, "step": 454 }, { "epoch": 0.534037558685446, "grad_norm": 0.35064697265625, "learning_rate": 9.397874852420307e-05, "loss": 0.7651, "step": 455 }, { "epoch": 0.5352112676056338, "grad_norm": 0.3724139630794525, "learning_rate": 9.37426210153483e-05, "loss": 0.7639, "step": 456 }, { "epoch": 0.5363849765258216, "grad_norm": 0.3420800566673279, "learning_rate": 9.35064935064935e-05, "loss": 0.7578, "step": 457 }, { "epoch": 0.5375586854460094, "grad_norm": 0.3437943160533905, "learning_rate": 9.327036599763874e-05, "loss": 0.7898, "step": 458 }, { "epoch": 0.5387323943661971, "grad_norm": 0.3799413740634918, "learning_rate": 9.303423848878394e-05, "loss": 0.7216, "step": 459 }, { "epoch": 0.539906103286385, "grad_norm": 0.35702013969421387, "learning_rate": 9.279811097992916e-05, "loss": 0.7509, "step": 460 }, { "epoch": 0.5410798122065728, "grad_norm": 0.36074140667915344, "learning_rate": 9.256198347107439e-05, "loss": 0.7448, "step": 461 }, { "epoch": 0.5422535211267606, "grad_norm": 0.34211182594299316, "learning_rate": 9.23258559622196e-05, "loss": 0.7143, "step": 462 }, { "epoch": 0.5434272300469484, "grad_norm": 0.3816893398761749, "learning_rate": 9.208972845336483e-05, "loss": 0.7178, "step": 463 }, { "epoch": 0.5446009389671361, "grad_norm": 0.36033767461776733, "learning_rate": 9.185360094451005e-05, "loss": 0.7406, "step": 464 }, { "epoch": 0.545774647887324, "grad_norm": 0.38050010800361633, "learning_rate": 9.161747343565525e-05, "loss": 0.7528, "step": 465 }, { "epoch": 0.5469483568075117, "grad_norm": 0.3648395240306854, "learning_rate": 9.138134592680048e-05, "loss": 0.7802, "step": 466 }, { "epoch": 0.5481220657276995, "grad_norm": 0.35185542702674866, "learning_rate": 9.11452184179457e-05, "loss": 0.7489, 
"step": 467 }, { "epoch": 0.5492957746478874, "grad_norm": 0.3487717807292938, "learning_rate": 9.090909090909092e-05, "loss": 0.7742, "step": 468 }, { "epoch": 0.5504694835680751, "grad_norm": 0.36121654510498047, "learning_rate": 9.067296340023614e-05, "loss": 0.7974, "step": 469 }, { "epoch": 0.5516431924882629, "grad_norm": 0.3470339775085449, "learning_rate": 9.043683589138135e-05, "loss": 0.723, "step": 470 }, { "epoch": 0.5528169014084507, "grad_norm": 0.33549764752388, "learning_rate": 9.020070838252657e-05, "loss": 0.7334, "step": 471 }, { "epoch": 0.5539906103286385, "grad_norm": 0.36101868748664856, "learning_rate": 8.996458087367179e-05, "loss": 0.6817, "step": 472 }, { "epoch": 0.5551643192488263, "grad_norm": 0.36847153306007385, "learning_rate": 8.9728453364817e-05, "loss": 0.7942, "step": 473 }, { "epoch": 0.5563380281690141, "grad_norm": 0.3564891815185547, "learning_rate": 8.949232585596222e-05, "loss": 0.7071, "step": 474 }, { "epoch": 0.5575117370892019, "grad_norm": 0.36866652965545654, "learning_rate": 8.925619834710744e-05, "loss": 0.7685, "step": 475 }, { "epoch": 0.5586854460093896, "grad_norm": 0.370924711227417, "learning_rate": 8.902007083825266e-05, "loss": 0.7313, "step": 476 }, { "epoch": 0.5598591549295775, "grad_norm": 0.3611142039299011, "learning_rate": 8.878394332939788e-05, "loss": 0.7666, "step": 477 }, { "epoch": 0.5610328638497653, "grad_norm": 0.3418121635913849, "learning_rate": 8.85478158205431e-05, "loss": 0.7194, "step": 478 }, { "epoch": 0.562206572769953, "grad_norm": 0.3478650748729706, "learning_rate": 8.831168831168831e-05, "loss": 0.7145, "step": 479 }, { "epoch": 0.5633802816901409, "grad_norm": 0.3567008078098297, "learning_rate": 8.807556080283353e-05, "loss": 0.7591, "step": 480 }, { "epoch": 0.5645539906103286, "grad_norm": 0.3629607558250427, "learning_rate": 8.783943329397875e-05, "loss": 0.7856, "step": 481 }, { "epoch": 0.5657276995305164, "grad_norm": 0.37257978320121765, "learning_rate": 
8.760330578512397e-05, "loss": 0.709, "step": 482 }, { "epoch": 0.5669014084507042, "grad_norm": 0.3570626676082611, "learning_rate": 8.736717827626919e-05, "loss": 0.7639, "step": 483 }, { "epoch": 0.568075117370892, "grad_norm": 0.34790506958961487, "learning_rate": 8.713105076741442e-05, "loss": 0.7375, "step": 484 }, { "epoch": 0.5692488262910798, "grad_norm": 0.3525756895542145, "learning_rate": 8.689492325855962e-05, "loss": 0.7274, "step": 485 }, { "epoch": 0.5704225352112676, "grad_norm": 0.3545394837856293, "learning_rate": 8.665879574970484e-05, "loss": 0.7531, "step": 486 }, { "epoch": 0.5715962441314554, "grad_norm": 0.35677066445350647, "learning_rate": 8.642266824085006e-05, "loss": 0.7682, "step": 487 }, { "epoch": 0.5727699530516432, "grad_norm": 0.3439461290836334, "learning_rate": 8.618654073199528e-05, "loss": 0.7176, "step": 488 }, { "epoch": 0.573943661971831, "grad_norm": 0.3622515797615051, "learning_rate": 8.595041322314051e-05, "loss": 0.7004, "step": 489 }, { "epoch": 0.5751173708920188, "grad_norm": 0.36056646704673767, "learning_rate": 8.571428571428571e-05, "loss": 0.74, "step": 490 }, { "epoch": 0.5762910798122066, "grad_norm": 0.3509630262851715, "learning_rate": 8.547815820543093e-05, "loss": 0.8006, "step": 491 }, { "epoch": 0.5774647887323944, "grad_norm": 0.3422422707080841, "learning_rate": 8.524203069657616e-05, "loss": 0.7162, "step": 492 }, { "epoch": 0.5786384976525821, "grad_norm": 0.35553744435310364, "learning_rate": 8.500590318772137e-05, "loss": 0.7554, "step": 493 }, { "epoch": 0.57981220657277, "grad_norm": 0.3443603813648224, "learning_rate": 8.47697756788666e-05, "loss": 0.7128, "step": 494 }, { "epoch": 0.5809859154929577, "grad_norm": 0.3314555883407593, "learning_rate": 8.453364817001182e-05, "loss": 0.7123, "step": 495 }, { "epoch": 0.5821596244131455, "grad_norm": 0.33951112627983093, "learning_rate": 8.429752066115702e-05, "loss": 0.7501, "step": 496 }, { "epoch": 0.5833333333333334, "grad_norm": 
0.327809602022171, "learning_rate": 8.406139315230225e-05, "loss": 0.7543, "step": 497 }, { "epoch": 0.5845070422535211, "grad_norm": 0.33205023407936096, "learning_rate": 8.382526564344747e-05, "loss": 0.7395, "step": 498 }, { "epoch": 0.5856807511737089, "grad_norm": 0.3762659430503845, "learning_rate": 8.358913813459269e-05, "loss": 0.7424, "step": 499 }, { "epoch": 0.5868544600938967, "grad_norm": 0.3421575427055359, "learning_rate": 8.33530106257379e-05, "loss": 0.7167, "step": 500 }, { "epoch": 0.5880281690140845, "grad_norm": 0.3560996353626251, "learning_rate": 8.311688311688312e-05, "loss": 0.7464, "step": 501 }, { "epoch": 0.5892018779342723, "grad_norm": 0.3566039800643921, "learning_rate": 8.288075560802834e-05, "loss": 0.715, "step": 502 }, { "epoch": 0.5903755868544601, "grad_norm": 0.3481593430042267, "learning_rate": 8.264462809917356e-05, "loss": 0.7506, "step": 503 }, { "epoch": 0.5915492957746479, "grad_norm": 0.34428590536117554, "learning_rate": 8.240850059031878e-05, "loss": 0.7272, "step": 504 }, { "epoch": 0.5927230046948356, "grad_norm": 0.35629555583000183, "learning_rate": 8.2172373081464e-05, "loss": 0.7334, "step": 505 }, { "epoch": 0.5938967136150235, "grad_norm": 0.37292811274528503, "learning_rate": 8.193624557260921e-05, "loss": 0.7505, "step": 506 }, { "epoch": 0.5950704225352113, "grad_norm": 0.359614759683609, "learning_rate": 8.170011806375443e-05, "loss": 0.8006, "step": 507 }, { "epoch": 0.596244131455399, "grad_norm": 0.3388945460319519, "learning_rate": 8.146399055489965e-05, "loss": 0.7542, "step": 508 }, { "epoch": 0.5974178403755869, "grad_norm": 0.3528054356575012, "learning_rate": 8.122786304604487e-05, "loss": 0.7412, "step": 509 }, { "epoch": 0.5985915492957746, "grad_norm": 0.3354608416557312, "learning_rate": 8.099173553719009e-05, "loss": 0.7062, "step": 510 }, { "epoch": 0.5997652582159625, "grad_norm": 0.35168859362602234, "learning_rate": 8.07556080283353e-05, "loss": 0.7653, "step": 511 }, { "epoch": 
0.6009389671361502, "grad_norm": 0.33843398094177246, "learning_rate": 8.051948051948052e-05, "loss": 0.7339, "step": 512 }, { "epoch": 0.602112676056338, "grad_norm": 0.32910212874412537, "learning_rate": 8.028335301062574e-05, "loss": 0.6966, "step": 513 }, { "epoch": 0.6032863849765259, "grad_norm": 0.3462936580181122, "learning_rate": 8.004722550177096e-05, "loss": 0.7386, "step": 514 }, { "epoch": 0.6044600938967136, "grad_norm": 0.3483426868915558, "learning_rate": 7.981109799291619e-05, "loss": 0.7548, "step": 515 }, { "epoch": 0.6056338028169014, "grad_norm": 0.3555918335914612, "learning_rate": 7.95749704840614e-05, "loss": 0.7144, "step": 516 }, { "epoch": 0.6068075117370892, "grad_norm": 0.3545628786087036, "learning_rate": 7.933884297520661e-05, "loss": 0.7601, "step": 517 }, { "epoch": 0.607981220657277, "grad_norm": 0.3554907441139221, "learning_rate": 7.910271546635183e-05, "loss": 0.7464, "step": 518 }, { "epoch": 0.6091549295774648, "grad_norm": 0.3457619547843933, "learning_rate": 7.886658795749705e-05, "loss": 0.7372, "step": 519 }, { "epoch": 0.6103286384976526, "grad_norm": 0.3450148105621338, "learning_rate": 7.863046044864228e-05, "loss": 0.7265, "step": 520 }, { "epoch": 0.6115023474178404, "grad_norm": 0.3475225567817688, "learning_rate": 7.839433293978748e-05, "loss": 0.798, "step": 521 }, { "epoch": 0.6126760563380281, "grad_norm": 0.34560921788215637, "learning_rate": 7.81582054309327e-05, "loss": 0.7583, "step": 522 }, { "epoch": 0.613849765258216, "grad_norm": 0.33480820059776306, "learning_rate": 7.792207792207793e-05, "loss": 0.7658, "step": 523 }, { "epoch": 0.6150234741784038, "grad_norm": 0.34581395983695984, "learning_rate": 7.768595041322314e-05, "loss": 0.7368, "step": 524 }, { "epoch": 0.6161971830985915, "grad_norm": 0.35383906960487366, "learning_rate": 7.744982290436837e-05, "loss": 0.7963, "step": 525 }, { "epoch": 0.6173708920187794, "grad_norm": 0.352117121219635, "learning_rate": 7.721369539551359e-05, "loss": 0.7589, 
"step": 526 }, { "epoch": 0.6185446009389671, "grad_norm": 0.34420257806777954, "learning_rate": 7.697756788665879e-05, "loss": 0.7209, "step": 527 }, { "epoch": 0.6197183098591549, "grad_norm": 0.3449562191963196, "learning_rate": 7.674144037780402e-05, "loss": 0.7526, "step": 528 }, { "epoch": 0.6208920187793427, "grad_norm": 0.37377694249153137, "learning_rate": 7.650531286894924e-05, "loss": 0.7348, "step": 529 }, { "epoch": 0.6220657276995305, "grad_norm": 0.32662031054496765, "learning_rate": 7.626918536009446e-05, "loss": 0.7125, "step": 530 }, { "epoch": 0.6232394366197183, "grad_norm": 0.3551415801048279, "learning_rate": 7.603305785123968e-05, "loss": 0.7497, "step": 531 }, { "epoch": 0.6244131455399061, "grad_norm": 0.3519802689552307, "learning_rate": 7.579693034238488e-05, "loss": 0.7864, "step": 532 }, { "epoch": 0.6255868544600939, "grad_norm": 0.3773750364780426, "learning_rate": 7.556080283353011e-05, "loss": 0.7681, "step": 533 }, { "epoch": 0.6267605633802817, "grad_norm": 0.3558037281036377, "learning_rate": 7.532467532467533e-05, "loss": 0.7392, "step": 534 }, { "epoch": 0.6279342723004695, "grad_norm": 0.33910447359085083, "learning_rate": 7.508854781582055e-05, "loss": 0.7036, "step": 535 }, { "epoch": 0.6291079812206573, "grad_norm": 0.35620275139808655, "learning_rate": 7.485242030696577e-05, "loss": 0.7272, "step": 536 }, { "epoch": 0.6302816901408451, "grad_norm": 0.3377542495727539, "learning_rate": 7.461629279811098e-05, "loss": 0.7244, "step": 537 }, { "epoch": 0.6314553990610329, "grad_norm": 0.35217198729515076, "learning_rate": 7.43801652892562e-05, "loss": 0.7655, "step": 538 }, { "epoch": 0.6326291079812206, "grad_norm": 0.34656718373298645, "learning_rate": 7.414403778040142e-05, "loss": 0.7474, "step": 539 }, { "epoch": 0.6338028169014085, "grad_norm": 0.34429579973220825, "learning_rate": 7.390791027154664e-05, "loss": 0.7333, "step": 540 }, { "epoch": 0.6349765258215962, "grad_norm": 0.374262273311615, "learning_rate": 
7.367178276269186e-05, "loss": 0.7876, "step": 541 }, { "epoch": 0.636150234741784, "grad_norm": 0.363299161195755, "learning_rate": 7.343565525383707e-05, "loss": 0.7784, "step": 542 }, { "epoch": 0.6373239436619719, "grad_norm": 0.36767125129699707, "learning_rate": 7.31995277449823e-05, "loss": 0.7329, "step": 543 }, { "epoch": 0.6384976525821596, "grad_norm": 0.3338686525821686, "learning_rate": 7.296340023612751e-05, "loss": 0.7737, "step": 544 }, { "epoch": 0.6396713615023474, "grad_norm": 0.3493046164512634, "learning_rate": 7.272727272727273e-05, "loss": 0.7461, "step": 545 }, { "epoch": 0.6408450704225352, "grad_norm": 0.3691573441028595, "learning_rate": 7.249114521841795e-05, "loss": 0.765, "step": 546 }, { "epoch": 0.642018779342723, "grad_norm": 0.3573099374771118, "learning_rate": 7.225501770956316e-05, "loss": 0.7589, "step": 547 }, { "epoch": 0.6431924882629108, "grad_norm": 0.36218926310539246, "learning_rate": 7.201889020070838e-05, "loss": 0.7314, "step": 548 }, { "epoch": 0.6443661971830986, "grad_norm": 0.35753628611564636, "learning_rate": 7.17827626918536e-05, "loss": 0.7564, "step": 549 }, { "epoch": 0.6455399061032864, "grad_norm": 0.3394756615161896, "learning_rate": 7.154663518299882e-05, "loss": 0.7162, "step": 550 }, { "epoch": 0.6467136150234741, "grad_norm": 0.350090891122818, "learning_rate": 7.131050767414405e-05, "loss": 0.7561, "step": 551 }, { "epoch": 0.647887323943662, "grad_norm": 0.328924298286438, "learning_rate": 7.107438016528925e-05, "loss": 0.7143, "step": 552 }, { "epoch": 0.6490610328638498, "grad_norm": 0.3552818298339844, "learning_rate": 7.083825265643447e-05, "loss": 0.7264, "step": 553 }, { "epoch": 0.6502347417840375, "grad_norm": 0.3504960536956787, "learning_rate": 7.06021251475797e-05, "loss": 0.7512, "step": 554 }, { "epoch": 0.6514084507042254, "grad_norm": 0.33755823969841003, "learning_rate": 7.036599763872491e-05, "loss": 0.7621, "step": 555 }, { "epoch": 0.6525821596244131, "grad_norm": 
0.35977354645729065, "learning_rate": 7.012987012987014e-05, "loss": 0.776, "step": 556 }, { "epoch": 0.653755868544601, "grad_norm": 0.37304726243019104, "learning_rate": 6.989374262101536e-05, "loss": 0.7601, "step": 557 }, { "epoch": 0.6549295774647887, "grad_norm": 0.3569071590900421, "learning_rate": 6.965761511216056e-05, "loss": 0.7303, "step": 558 }, { "epoch": 0.6561032863849765, "grad_norm": 0.348264217376709, "learning_rate": 6.94214876033058e-05, "loss": 0.759, "step": 559 }, { "epoch": 0.6572769953051644, "grad_norm": 0.3501366674900055, "learning_rate": 6.9185360094451e-05, "loss": 0.7588, "step": 560 }, { "epoch": 0.6584507042253521, "grad_norm": 0.3633224666118622, "learning_rate": 6.894923258559623e-05, "loss": 0.7741, "step": 561 }, { "epoch": 0.6596244131455399, "grad_norm": 0.35944506525993347, "learning_rate": 6.871310507674145e-05, "loss": 0.756, "step": 562 }, { "epoch": 0.6607981220657277, "grad_norm": 0.3479359745979309, "learning_rate": 6.847697756788665e-05, "loss": 0.7292, "step": 563 }, { "epoch": 0.6619718309859155, "grad_norm": 0.37013959884643555, "learning_rate": 6.824085005903188e-05, "loss": 0.7618, "step": 564 }, { "epoch": 0.6631455399061033, "grad_norm": 0.36679190397262573, "learning_rate": 6.80047225501771e-05, "loss": 0.7797, "step": 565 }, { "epoch": 0.6643192488262911, "grad_norm": 0.35092490911483765, "learning_rate": 6.776859504132232e-05, "loss": 0.705, "step": 566 }, { "epoch": 0.6654929577464789, "grad_norm": 0.3594275712966919, "learning_rate": 6.753246753246754e-05, "loss": 0.7215, "step": 567 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3503059148788452, "learning_rate": 6.729634002361276e-05, "loss": 0.7248, "step": 568 }, { "epoch": 0.6678403755868545, "grad_norm": 0.35919633507728577, "learning_rate": 6.706021251475797e-05, "loss": 0.7718, "step": 569 }, { "epoch": 0.6690140845070423, "grad_norm": 0.36752262711524963, "learning_rate": 6.682408500590319e-05, "loss": 0.7738, "step": 570 }, { "epoch": 
0.67018779342723, "grad_norm": 0.33812567591667175, "learning_rate": 6.658795749704841e-05, "loss": 0.7846, "step": 571 }, { "epoch": 0.6713615023474179, "grad_norm": 0.3429810404777527, "learning_rate": 6.635182998819363e-05, "loss": 0.7371, "step": 572 }, { "epoch": 0.6725352112676056, "grad_norm": 0.3457571864128113, "learning_rate": 6.611570247933885e-05, "loss": 0.7318, "step": 573 }, { "epoch": 0.6737089201877934, "grad_norm": 0.3476294279098511, "learning_rate": 6.587957497048406e-05, "loss": 0.7344, "step": 574 }, { "epoch": 0.6748826291079812, "grad_norm": 0.34464409947395325, "learning_rate": 6.564344746162928e-05, "loss": 0.7429, "step": 575 }, { "epoch": 0.676056338028169, "grad_norm": 0.34444373846054077, "learning_rate": 6.54073199527745e-05, "loss": 0.7663, "step": 576 }, { "epoch": 0.6772300469483568, "grad_norm": 0.3656728267669678, "learning_rate": 6.517119244391972e-05, "loss": 0.7068, "step": 577 }, { "epoch": 0.6784037558685446, "grad_norm": 0.3591727614402771, "learning_rate": 6.493506493506494e-05, "loss": 0.7481, "step": 578 }, { "epoch": 0.6795774647887324, "grad_norm": 0.38865676522254944, "learning_rate": 6.469893742621017e-05, "loss": 0.7659, "step": 579 }, { "epoch": 0.6807511737089202, "grad_norm": 0.3438194990158081, "learning_rate": 6.446280991735537e-05, "loss": 0.6748, "step": 580 }, { "epoch": 0.681924882629108, "grad_norm": 0.34979990124702454, "learning_rate": 6.422668240850059e-05, "loss": 0.7529, "step": 581 }, { "epoch": 0.6830985915492958, "grad_norm": 0.37309062480926514, "learning_rate": 6.399055489964582e-05, "loss": 0.7417, "step": 582 }, { "epoch": 0.6842723004694836, "grad_norm": 0.3737837076187134, "learning_rate": 6.375442739079102e-05, "loss": 0.773, "step": 583 }, { "epoch": 0.6854460093896714, "grad_norm": 0.3397013247013092, "learning_rate": 6.351829988193624e-05, "loss": 0.7093, "step": 584 }, { "epoch": 0.6866197183098591, "grad_norm": 0.37165701389312744, "learning_rate": 6.328217237308147e-05, "loss": 0.7078, 
"step": 585 }, { "epoch": 0.687793427230047, "grad_norm": 0.3533116579055786, "learning_rate": 6.304604486422668e-05, "loss": 0.7105, "step": 586 }, { "epoch": 0.6889671361502347, "grad_norm": 0.35352569818496704, "learning_rate": 6.280991735537191e-05, "loss": 0.7282, "step": 587 }, { "epoch": 0.6901408450704225, "grad_norm": 0.3754810094833374, "learning_rate": 6.257378984651711e-05, "loss": 0.7364, "step": 588 }, { "epoch": 0.6913145539906104, "grad_norm": 0.36235493421554565, "learning_rate": 6.233766233766233e-05, "loss": 0.7024, "step": 589 }, { "epoch": 0.6924882629107981, "grad_norm": 0.3446933627128601, "learning_rate": 6.210153482880756e-05, "loss": 0.7392, "step": 590 }, { "epoch": 0.6936619718309859, "grad_norm": 0.34918078780174255, "learning_rate": 6.186540731995277e-05, "loss": 0.6716, "step": 591 }, { "epoch": 0.6948356807511737, "grad_norm": 0.3438567519187927, "learning_rate": 6.1629279811098e-05, "loss": 0.7812, "step": 592 }, { "epoch": 0.6960093896713615, "grad_norm": 0.346626341342926, "learning_rate": 6.139315230224322e-05, "loss": 0.7538, "step": 593 }, { "epoch": 0.6971830985915493, "grad_norm": 0.3506343960762024, "learning_rate": 6.115702479338842e-05, "loss": 0.7434, "step": 594 }, { "epoch": 0.6983568075117371, "grad_norm": 0.35403555631637573, "learning_rate": 6.0920897284533654e-05, "loss": 0.7333, "step": 595 }, { "epoch": 0.6995305164319249, "grad_norm": 0.3391430377960205, "learning_rate": 6.068476977567887e-05, "loss": 0.7486, "step": 596 }, { "epoch": 0.7007042253521126, "grad_norm": 0.33783578872680664, "learning_rate": 6.044864226682408e-05, "loss": 0.7588, "step": 597 }, { "epoch": 0.7018779342723005, "grad_norm": 0.3333738446235657, "learning_rate": 6.021251475796931e-05, "loss": 0.7268, "step": 598 }, { "epoch": 0.7030516431924883, "grad_norm": 0.3494018316268921, "learning_rate": 5.997638724911453e-05, "loss": 0.7363, "step": 599 }, { "epoch": 0.704225352112676, "grad_norm": 0.34416642785072327, "learning_rate": 
5.9740259740259744e-05, "loss": 0.7322, "step": 600 }, { "epoch": 0.7053990610328639, "grad_norm": 0.3523387312889099, "learning_rate": 5.950413223140496e-05, "loss": 0.6986, "step": 601 }, { "epoch": 0.7065727699530516, "grad_norm": 0.33000361919403076, "learning_rate": 5.926800472255017e-05, "loss": 0.7535, "step": 602 }, { "epoch": 0.7077464788732394, "grad_norm": 0.33932214975357056, "learning_rate": 5.90318772136954e-05, "loss": 0.7051, "step": 603 }, { "epoch": 0.7089201877934272, "grad_norm": 0.3373797833919525, "learning_rate": 5.8795749704840616e-05, "loss": 0.7022, "step": 604 }, { "epoch": 0.710093896713615, "grad_norm": 0.35239875316619873, "learning_rate": 5.855962219598583e-05, "loss": 0.7893, "step": 605 }, { "epoch": 0.7112676056338029, "grad_norm": 0.36973506212234497, "learning_rate": 5.832349468713105e-05, "loss": 0.7157, "step": 606 }, { "epoch": 0.7124413145539906, "grad_norm": 0.3447434604167938, "learning_rate": 5.8087367178276277e-05, "loss": 0.7306, "step": 607 }, { "epoch": 0.7136150234741784, "grad_norm": 0.36380118131637573, "learning_rate": 5.785123966942149e-05, "loss": 0.7238, "step": 608 }, { "epoch": 0.7147887323943662, "grad_norm": 0.33784252405166626, "learning_rate": 5.7615112160566706e-05, "loss": 0.6792, "step": 609 }, { "epoch": 0.715962441314554, "grad_norm": 0.34995025396347046, "learning_rate": 5.737898465171193e-05, "loss": 0.7158, "step": 610 }, { "epoch": 0.7171361502347418, "grad_norm": 0.3586655259132385, "learning_rate": 5.714285714285714e-05, "loss": 0.7345, "step": 611 }, { "epoch": 0.7183098591549296, "grad_norm": 0.3490711450576782, "learning_rate": 5.6906729634002366e-05, "loss": 0.759, "step": 612 }, { "epoch": 0.7194835680751174, "grad_norm": 0.3405636250972748, "learning_rate": 5.6670602125147584e-05, "loss": 0.7069, "step": 613 }, { "epoch": 0.7206572769953051, "grad_norm": 0.3362460136413574, "learning_rate": 5.6434474616292796e-05, "loss": 0.7413, "step": 614 }, { "epoch": 0.721830985915493, "grad_norm": 
0.3571033775806427, "learning_rate": 5.619834710743802e-05, "loss": 0.7138, "step": 615 }, { "epoch": 0.7230046948356808, "grad_norm": 0.33801379799842834, "learning_rate": 5.5962219598583245e-05, "loss": 0.7004, "step": 616 }, { "epoch": 0.7241784037558685, "grad_norm": 0.350063294172287, "learning_rate": 5.5726092089728456e-05, "loss": 0.7342, "step": 617 }, { "epoch": 0.7253521126760564, "grad_norm": 0.3471220135688782, "learning_rate": 5.5489964580873674e-05, "loss": 0.7591, "step": 618 }, { "epoch": 0.7265258215962441, "grad_norm": 0.3600592613220215, "learning_rate": 5.5253837072018886e-05, "loss": 0.7427, "step": 619 }, { "epoch": 0.7276995305164319, "grad_norm": 0.34294822812080383, "learning_rate": 5.501770956316411e-05, "loss": 0.7085, "step": 620 }, { "epoch": 0.7288732394366197, "grad_norm": 0.3481101989746094, "learning_rate": 5.4781582054309335e-05, "loss": 0.7465, "step": 621 }, { "epoch": 0.7300469483568075, "grad_norm": 0.3402861952781677, "learning_rate": 5.4545454545454546e-05, "loss": 0.7613, "step": 622 }, { "epoch": 0.7312206572769953, "grad_norm": 0.3475019335746765, "learning_rate": 5.4309327036599764e-05, "loss": 0.775, "step": 623 }, { "epoch": 0.7323943661971831, "grad_norm": 0.34003034234046936, "learning_rate": 5.407319952774499e-05, "loss": 0.6817, "step": 624 }, { "epoch": 0.7335680751173709, "grad_norm": 0.33620044589042664, "learning_rate": 5.38370720188902e-05, "loss": 0.7392, "step": 625 }, { "epoch": 0.7347417840375586, "grad_norm": 0.34645119309425354, "learning_rate": 5.3600944510035425e-05, "loss": 0.717, "step": 626 }, { "epoch": 0.7359154929577465, "grad_norm": 0.3485560417175293, "learning_rate": 5.336481700118064e-05, "loss": 0.7361, "step": 627 }, { "epoch": 0.7370892018779343, "grad_norm": 0.36997392773628235, "learning_rate": 5.3128689492325854e-05, "loss": 0.7264, "step": 628 }, { "epoch": 0.7382629107981221, "grad_norm": 0.3379404842853546, "learning_rate": 5.289256198347108e-05, "loss": 0.7303, "step": 629 }, { 
"epoch": 0.7394366197183099, "grad_norm": 0.3385223150253296, "learning_rate": 5.2656434474616304e-05, "loss": 0.7174, "step": 630 }, { "epoch": 0.7406103286384976, "grad_norm": 0.38303306698799133, "learning_rate": 5.2420306965761515e-05, "loss": 0.7539, "step": 631 }, { "epoch": 0.7417840375586855, "grad_norm": 0.3544706404209137, "learning_rate": 5.218417945690673e-05, "loss": 0.7108, "step": 632 }, { "epoch": 0.7429577464788732, "grad_norm": 0.35137131810188293, "learning_rate": 5.1948051948051944e-05, "loss": 0.7184, "step": 633 }, { "epoch": 0.744131455399061, "grad_norm": 0.35326629877090454, "learning_rate": 5.171192443919717e-05, "loss": 0.7114, "step": 634 }, { "epoch": 0.7453051643192489, "grad_norm": 0.35051414370536804, "learning_rate": 5.1475796930342393e-05, "loss": 0.6966, "step": 635 }, { "epoch": 0.7464788732394366, "grad_norm": 0.37491628527641296, "learning_rate": 5.1239669421487605e-05, "loss": 0.7061, "step": 636 }, { "epoch": 0.7476525821596244, "grad_norm": 0.37242433428764343, "learning_rate": 5.100354191263282e-05, "loss": 0.6904, "step": 637 }, { "epoch": 0.7488262910798122, "grad_norm": 0.376429945230484, "learning_rate": 5.076741440377805e-05, "loss": 0.7203, "step": 638 }, { "epoch": 0.75, "grad_norm": 0.34106218814849854, "learning_rate": 5.053128689492326e-05, "loss": 0.6878, "step": 639 }, { "epoch": 0.7511737089201878, "grad_norm": 0.37987956404685974, "learning_rate": 5.029515938606848e-05, "loss": 0.7835, "step": 640 }, { "epoch": 0.7523474178403756, "grad_norm": 0.355932354927063, "learning_rate": 5.00590318772137e-05, "loss": 0.7382, "step": 641 }, { "epoch": 0.7535211267605634, "grad_norm": 0.33495378494262695, "learning_rate": 4.982290436835891e-05, "loss": 0.7244, "step": 642 }, { "epoch": 0.7546948356807511, "grad_norm": 0.36573663353919983, "learning_rate": 4.958677685950414e-05, "loss": 0.7339, "step": 643 }, { "epoch": 0.755868544600939, "grad_norm": 0.34233418107032776, "learning_rate": 4.9350649350649355e-05, "loss": 
0.7303, "step": 644 }, { "epoch": 0.7570422535211268, "grad_norm": 0.36358365416526794, "learning_rate": 4.9114521841794566e-05, "loss": 0.7169, "step": 645 }, { "epoch": 0.7582159624413145, "grad_norm": 0.3423750400543213, "learning_rate": 4.887839433293979e-05, "loss": 0.7413, "step": 646 }, { "epoch": 0.7593896713615024, "grad_norm": 0.34080007672309875, "learning_rate": 4.864226682408501e-05, "loss": 0.7319, "step": 647 }, { "epoch": 0.7605633802816901, "grad_norm": 0.35408544540405273, "learning_rate": 4.840613931523023e-05, "loss": 0.6895, "step": 648 }, { "epoch": 0.7617370892018779, "grad_norm": 0.34515753388404846, "learning_rate": 4.8170011806375445e-05, "loss": 0.7181, "step": 649 }, { "epoch": 0.7629107981220657, "grad_norm": 0.3446560502052307, "learning_rate": 4.793388429752066e-05, "loss": 0.7156, "step": 650 }, { "epoch": 0.7640845070422535, "grad_norm": 0.3451150357723236, "learning_rate": 4.769775678866588e-05, "loss": 0.7232, "step": 651 }, { "epoch": 0.7652582159624414, "grad_norm": 0.357740193605423, "learning_rate": 4.74616292798111e-05, "loss": 0.6872, "step": 652 }, { "epoch": 0.7664319248826291, "grad_norm": 0.3685015141963959, "learning_rate": 4.7225501770956324e-05, "loss": 0.735, "step": 653 }, { "epoch": 0.7676056338028169, "grad_norm": 0.3503192961215973, "learning_rate": 4.6989374262101535e-05, "loss": 0.7336, "step": 654 }, { "epoch": 0.7687793427230047, "grad_norm": 0.33453887701034546, "learning_rate": 4.675324675324675e-05, "loss": 0.7101, "step": 655 }, { "epoch": 0.7699530516431925, "grad_norm": 0.3708442747592926, "learning_rate": 4.651711924439197e-05, "loss": 0.7153, "step": 656 }, { "epoch": 0.7711267605633803, "grad_norm": 0.3736172318458557, "learning_rate": 4.6280991735537196e-05, "loss": 0.7071, "step": 657 }, { "epoch": 0.7723004694835681, "grad_norm": 0.35988256335258484, "learning_rate": 4.6044864226682414e-05, "loss": 0.7285, "step": 658 }, { "epoch": 0.7734741784037559, "grad_norm": 0.34314337372779846, 
"learning_rate": 4.5808736717827625e-05, "loss": 0.7137, "step": 659 }, { "epoch": 0.7746478873239436, "grad_norm": 0.3723309338092804, "learning_rate": 4.557260920897285e-05, "loss": 0.7391, "step": 660 }, { "epoch": 0.7758215962441315, "grad_norm": 0.3581268787384033, "learning_rate": 4.533648170011807e-05, "loss": 0.7157, "step": 661 }, { "epoch": 0.7769953051643192, "grad_norm": 0.36784443259239197, "learning_rate": 4.5100354191263286e-05, "loss": 0.6865, "step": 662 }, { "epoch": 0.778169014084507, "grad_norm": 0.36377546191215515, "learning_rate": 4.48642266824085e-05, "loss": 0.7437, "step": 663 }, { "epoch": 0.7793427230046949, "grad_norm": 0.349101722240448, "learning_rate": 4.462809917355372e-05, "loss": 0.7226, "step": 664 }, { "epoch": 0.7805164319248826, "grad_norm": 0.36608216166496277, "learning_rate": 4.439197166469894e-05, "loss": 0.7543, "step": 665 }, { "epoch": 0.7816901408450704, "grad_norm": 0.3495696783065796, "learning_rate": 4.415584415584416e-05, "loss": 0.708, "step": 666 }, { "epoch": 0.7828638497652582, "grad_norm": 0.3664140999317169, "learning_rate": 4.3919716646989375e-05, "loss": 0.7225, "step": 667 }, { "epoch": 0.784037558685446, "grad_norm": 0.3560849726200104, "learning_rate": 4.368358913813459e-05, "loss": 0.6972, "step": 668 }, { "epoch": 0.7852112676056338, "grad_norm": 0.3571857511997223, "learning_rate": 4.344746162927981e-05, "loss": 0.694, "step": 669 }, { "epoch": 0.7863849765258216, "grad_norm": 0.37072160840034485, "learning_rate": 4.321133412042503e-05, "loss": 0.7202, "step": 670 }, { "epoch": 0.7875586854460094, "grad_norm": 0.354948490858078, "learning_rate": 4.2975206611570254e-05, "loss": 0.7481, "step": 671 }, { "epoch": 0.7887323943661971, "grad_norm": 0.3736347258090973, "learning_rate": 4.2739079102715465e-05, "loss": 0.7261, "step": 672 }, { "epoch": 0.789906103286385, "grad_norm": 0.3690294623374939, "learning_rate": 4.250295159386068e-05, "loss": 0.7529, "step": 673 }, { "epoch": 0.7910798122065728, 
"grad_norm": 0.354192316532135, "learning_rate": 4.226682408500591e-05, "loss": 0.7176, "step": 674 }, { "epoch": 0.7922535211267606, "grad_norm": 0.355185866355896, "learning_rate": 4.2030696576151126e-05, "loss": 0.7099, "step": 675 }, { "epoch": 0.7934272300469484, "grad_norm": 0.3503565490245819, "learning_rate": 4.1794569067296344e-05, "loss": 0.7072, "step": 676 }, { "epoch": 0.7946009389671361, "grad_norm": 0.3727845549583435, "learning_rate": 4.155844155844156e-05, "loss": 0.7334, "step": 677 }, { "epoch": 0.795774647887324, "grad_norm": 0.33894312381744385, "learning_rate": 4.132231404958678e-05, "loss": 0.6946, "step": 678 }, { "epoch": 0.7969483568075117, "grad_norm": 0.3385523855686188, "learning_rate": 4.1086186540732e-05, "loss": 0.7096, "step": 679 }, { "epoch": 0.7981220657276995, "grad_norm": 0.3488437235355377, "learning_rate": 4.0850059031877216e-05, "loss": 0.6942, "step": 680 }, { "epoch": 0.7992957746478874, "grad_norm": 0.34666576981544495, "learning_rate": 4.0613931523022434e-05, "loss": 0.7329, "step": 681 }, { "epoch": 0.8004694835680751, "grad_norm": 0.3557136356830597, "learning_rate": 4.037780401416765e-05, "loss": 0.7655, "step": 682 }, { "epoch": 0.8016431924882629, "grad_norm": 0.3647683262825012, "learning_rate": 4.014167650531287e-05, "loss": 0.7578, "step": 683 }, { "epoch": 0.8028169014084507, "grad_norm": 0.3452191650867462, "learning_rate": 3.9905548996458095e-05, "loss": 0.7145, "step": 684 }, { "epoch": 0.8039906103286385, "grad_norm": 0.3540481925010681, "learning_rate": 3.9669421487603306e-05, "loss": 0.7347, "step": 685 }, { "epoch": 0.8051643192488263, "grad_norm": 0.3536418378353119, "learning_rate": 3.9433293978748524e-05, "loss": 0.7103, "step": 686 }, { "epoch": 0.8063380281690141, "grad_norm": 0.34728798270225525, "learning_rate": 3.919716646989374e-05, "loss": 0.7376, "step": 687 }, { "epoch": 0.8075117370892019, "grad_norm": 0.354643851518631, "learning_rate": 3.8961038961038966e-05, "loss": 0.7223, "step": 688 }, 
{ "epoch": 0.8086854460093896, "grad_norm": 0.3438583016395569, "learning_rate": 3.8724911452184184e-05, "loss": 0.6906, "step": 689 }, { "epoch": 0.8098591549295775, "grad_norm": 0.34713107347488403, "learning_rate": 3.8488783943329396e-05, "loss": 0.7361, "step": 690 }, { "epoch": 0.8110328638497653, "grad_norm": 0.3483150005340576, "learning_rate": 3.825265643447462e-05, "loss": 0.7016, "step": 691 }, { "epoch": 0.812206572769953, "grad_norm": 0.34848445653915405, "learning_rate": 3.801652892561984e-05, "loss": 0.6966, "step": 692 }, { "epoch": 0.8133802816901409, "grad_norm": 0.34223318099975586, "learning_rate": 3.7780401416765056e-05, "loss": 0.7088, "step": 693 }, { "epoch": 0.8145539906103286, "grad_norm": 0.33693239092826843, "learning_rate": 3.7544273907910274e-05, "loss": 0.7108, "step": 694 }, { "epoch": 0.8157276995305164, "grad_norm": 0.34613272547721863, "learning_rate": 3.730814639905549e-05, "loss": 0.7075, "step": 695 }, { "epoch": 0.8169014084507042, "grad_norm": 0.3430733382701874, "learning_rate": 3.707201889020071e-05, "loss": 0.7246, "step": 696 }, { "epoch": 0.818075117370892, "grad_norm": 0.35237351059913635, "learning_rate": 3.683589138134593e-05, "loss": 0.6918, "step": 697 }, { "epoch": 0.8192488262910798, "grad_norm": 0.3375650644302368, "learning_rate": 3.659976387249115e-05, "loss": 0.6978, "step": 698 }, { "epoch": 0.8204225352112676, "grad_norm": 0.3585062026977539, "learning_rate": 3.6363636363636364e-05, "loss": 0.7241, "step": 699 }, { "epoch": 0.8215962441314554, "grad_norm": 0.35660460591316223, "learning_rate": 3.612750885478158e-05, "loss": 0.6946, "step": 700 }, { "epoch": 0.8227699530516432, "grad_norm": 0.3468845784664154, "learning_rate": 3.58913813459268e-05, "loss": 0.7535, "step": 701 }, { "epoch": 0.823943661971831, "grad_norm": 0.365291029214859, "learning_rate": 3.5655253837072025e-05, "loss": 0.7438, "step": 702 }, { "epoch": 0.8251173708920188, "grad_norm": 0.353506863117218, "learning_rate": 
3.5419126328217236e-05, "loss": 0.7359, "step": 703 }, { "epoch": 0.8262910798122066, "grad_norm": 0.381610244512558, "learning_rate": 3.5182998819362454e-05, "loss": 0.7821, "step": 704 }, { "epoch": 0.8274647887323944, "grad_norm": 0.37710806727409363, "learning_rate": 3.494687131050768e-05, "loss": 0.7349, "step": 705 }, { "epoch": 0.8286384976525821, "grad_norm": 0.361545592546463, "learning_rate": 3.47107438016529e-05, "loss": 0.7229, "step": 706 }, { "epoch": 0.82981220657277, "grad_norm": 0.3615299463272095, "learning_rate": 3.4474616292798115e-05, "loss": 0.748, "step": 707 }, { "epoch": 0.8309859154929577, "grad_norm": 0.3437252342700958, "learning_rate": 3.4238488783943326e-05, "loss": 0.7165, "step": 708 }, { "epoch": 0.8321596244131455, "grad_norm": 0.35603129863739014, "learning_rate": 3.400236127508855e-05, "loss": 0.7373, "step": 709 }, { "epoch": 0.8333333333333334, "grad_norm": 0.3586898446083069, "learning_rate": 3.376623376623377e-05, "loss": 0.7056, "step": 710 }, { "epoch": 0.8345070422535211, "grad_norm": 0.3558507263660431, "learning_rate": 3.353010625737899e-05, "loss": 0.742, "step": 711 }, { "epoch": 0.8356807511737089, "grad_norm": 0.3359735608100891, "learning_rate": 3.3293978748524205e-05, "loss": 0.6994, "step": 712 }, { "epoch": 0.8368544600938967, "grad_norm": 0.34250345826148987, "learning_rate": 3.305785123966942e-05, "loss": 0.6762, "step": 713 }, { "epoch": 0.8380281690140845, "grad_norm": 0.38417667150497437, "learning_rate": 3.282172373081464e-05, "loss": 0.7213, "step": 714 }, { "epoch": 0.8392018779342723, "grad_norm": 0.3643978536128998, "learning_rate": 3.258559622195986e-05, "loss": 0.6884, "step": 715 }, { "epoch": 0.8403755868544601, "grad_norm": 0.3544299602508545, "learning_rate": 3.234946871310508e-05, "loss": 0.712, "step": 716 }, { "epoch": 0.8415492957746479, "grad_norm": 0.36903661489486694, "learning_rate": 3.2113341204250294e-05, "loss": 0.7227, "step": 717 }, { "epoch": 0.8427230046948356, "grad_norm": 
0.3557377755641937, "learning_rate": 3.187721369539551e-05, "loss": 0.6904, "step": 718 }, { "epoch": 0.8438967136150235, "grad_norm": 0.36762547492980957, "learning_rate": 3.164108618654074e-05, "loss": 0.7469, "step": 719 }, { "epoch": 0.8450704225352113, "grad_norm": 0.35805556178092957, "learning_rate": 3.1404958677685955e-05, "loss": 0.7443, "step": 720 }, { "epoch": 0.846244131455399, "grad_norm": 0.38130536675453186, "learning_rate": 3.1168831168831166e-05, "loss": 0.7664, "step": 721 }, { "epoch": 0.8474178403755869, "grad_norm": 0.3599521219730377, "learning_rate": 3.0932703659976384e-05, "loss": 0.7065, "step": 722 }, { "epoch": 0.8485915492957746, "grad_norm": 0.3531062602996826, "learning_rate": 3.069657615112161e-05, "loss": 0.7451, "step": 723 }, { "epoch": 0.8497652582159625, "grad_norm": 0.36916878819465637, "learning_rate": 3.0460448642266827e-05, "loss": 0.7077, "step": 724 }, { "epoch": 0.8509389671361502, "grad_norm": 0.38139578700065613, "learning_rate": 3.022432113341204e-05, "loss": 0.7452, "step": 725 }, { "epoch": 0.852112676056338, "grad_norm": 0.337944358587265, "learning_rate": 2.9988193624557266e-05, "loss": 0.6596, "step": 726 }, { "epoch": 0.8532863849765259, "grad_norm": 0.36196213960647583, "learning_rate": 2.975206611570248e-05, "loss": 0.7081, "step": 727 }, { "epoch": 0.8544600938967136, "grad_norm": 0.34913602471351624, "learning_rate": 2.95159386068477e-05, "loss": 0.6901, "step": 728 }, { "epoch": 0.8556338028169014, "grad_norm": 0.343414843082428, "learning_rate": 2.9279811097992914e-05, "loss": 0.675, "step": 729 }, { "epoch": 0.8568075117370892, "grad_norm": 0.3704102039337158, "learning_rate": 2.9043683589138138e-05, "loss": 0.7566, "step": 730 }, { "epoch": 0.857981220657277, "grad_norm": 0.3464911878108978, "learning_rate": 2.8807556080283353e-05, "loss": 0.6872, "step": 731 }, { "epoch": 0.8591549295774648, "grad_norm": 0.3615940511226654, "learning_rate": 2.857142857142857e-05, "loss": 0.7755, "step": 732 }, { "epoch": 
0.8603286384976526, "grad_norm": 0.35284191370010376, "learning_rate": 2.8335301062573792e-05, "loss": 0.7483, "step": 733 }, { "epoch": 0.8615023474178404, "grad_norm": 0.3469059467315674, "learning_rate": 2.809917355371901e-05, "loss": 0.6902, "step": 734 }, { "epoch": 0.8626760563380281, "grad_norm": 0.35148003697395325, "learning_rate": 2.7863046044864228e-05, "loss": 0.732, "step": 735 }, { "epoch": 0.863849765258216, "grad_norm": 0.3533206880092621, "learning_rate": 2.7626918536009443e-05, "loss": 0.7287, "step": 736 }, { "epoch": 0.8650234741784038, "grad_norm": 0.383095383644104, "learning_rate": 2.7390791027154668e-05, "loss": 0.8017, "step": 737 }, { "epoch": 0.8661971830985915, "grad_norm": 0.3541397452354431, "learning_rate": 2.7154663518299882e-05, "loss": 0.7291, "step": 738 }, { "epoch": 0.8673708920187794, "grad_norm": 0.35989582538604736, "learning_rate": 2.69185360094451e-05, "loss": 0.7211, "step": 739 }, { "epoch": 0.8685446009389671, "grad_norm": 0.34245404601097107, "learning_rate": 2.668240850059032e-05, "loss": 0.7062, "step": 740 }, { "epoch": 0.8697183098591549, "grad_norm": 0.3396112024784088, "learning_rate": 2.644628099173554e-05, "loss": 0.6946, "step": 741 }, { "epoch": 0.8708920187793427, "grad_norm": 0.34901162981987, "learning_rate": 2.6210153482880757e-05, "loss": 0.7742, "step": 742 }, { "epoch": 0.8720657276995305, "grad_norm": 0.3654363453388214, "learning_rate": 2.5974025974025972e-05, "loss": 0.7894, "step": 743 }, { "epoch": 0.8732394366197183, "grad_norm": 0.3478833734989166, "learning_rate": 2.5737898465171197e-05, "loss": 0.6909, "step": 744 }, { "epoch": 0.8744131455399061, "grad_norm": 0.3447161912918091, "learning_rate": 2.550177095631641e-05, "loss": 0.7166, "step": 745 }, { "epoch": 0.8755868544600939, "grad_norm": 0.35436901450157166, "learning_rate": 2.526564344746163e-05, "loss": 0.6962, "step": 746 }, { "epoch": 0.8767605633802817, "grad_norm": 0.3359661400318146, "learning_rate": 2.502951593860685e-05, "loss": 
0.7345, "step": 747 }, { "epoch": 0.8779342723004695, "grad_norm": 0.35876211524009705, "learning_rate": 2.479338842975207e-05, "loss": 0.6723, "step": 748 }, { "epoch": 0.8791079812206573, "grad_norm": 0.35507625341415405, "learning_rate": 2.4557260920897283e-05, "loss": 0.6744, "step": 749 }, { "epoch": 0.8802816901408451, "grad_norm": 0.3504907786846161, "learning_rate": 2.4321133412042505e-05, "loss": 0.7281, "step": 750 }, { "epoch": 0.8814553990610329, "grad_norm": 0.3498130440711975, "learning_rate": 2.4085005903187723e-05, "loss": 0.7079, "step": 751 }, { "epoch": 0.8826291079812206, "grad_norm": 0.36793026328086853, "learning_rate": 2.384887839433294e-05, "loss": 0.747, "step": 752 }, { "epoch": 0.8838028169014085, "grad_norm": 0.3484232723712921, "learning_rate": 2.3612750885478162e-05, "loss": 0.7347, "step": 753 }, { "epoch": 0.8849765258215962, "grad_norm": 0.34402692317962646, "learning_rate": 2.3376623376623376e-05, "loss": 0.6717, "step": 754 }, { "epoch": 0.886150234741784, "grad_norm": 0.377380907535553, "learning_rate": 2.3140495867768598e-05, "loss": 0.7642, "step": 755 }, { "epoch": 0.8873239436619719, "grad_norm": 0.361382395029068, "learning_rate": 2.2904368358913812e-05, "loss": 0.7081, "step": 756 }, { "epoch": 0.8884976525821596, "grad_norm": 0.3643784821033478, "learning_rate": 2.2668240850059034e-05, "loss": 0.7219, "step": 757 }, { "epoch": 0.8896713615023474, "grad_norm": 0.3974801301956177, "learning_rate": 2.243211334120425e-05, "loss": 0.712, "step": 758 }, { "epoch": 0.8908450704225352, "grad_norm": 0.35573598742485046, "learning_rate": 2.219598583234947e-05, "loss": 0.7335, "step": 759 }, { "epoch": 0.892018779342723, "grad_norm": 0.3532857596874237, "learning_rate": 2.1959858323494688e-05, "loss": 0.7013, "step": 760 }, { "epoch": 0.8931924882629108, "grad_norm": 0.33362728357315063, "learning_rate": 2.1723730814639906e-05, "loss": 0.6739, "step": 761 }, { "epoch": 0.8943661971830986, "grad_norm": 0.3325813412666321, 
"learning_rate": 2.1487603305785127e-05, "loss": 0.7099, "step": 762 }, { "epoch": 0.8955399061032864, "grad_norm": 0.3451225459575653, "learning_rate": 2.125147579693034e-05, "loss": 0.6959, "step": 763 }, { "epoch": 0.8967136150234741, "grad_norm": 0.3604796528816223, "learning_rate": 2.1015348288075563e-05, "loss": 0.737, "step": 764 }, { "epoch": 0.897887323943662, "grad_norm": 0.34980282187461853, "learning_rate": 2.077922077922078e-05, "loss": 0.7206, "step": 765 }, { "epoch": 0.8990610328638498, "grad_norm": 0.35130617022514343, "learning_rate": 2.0543093270366e-05, "loss": 0.7153, "step": 766 }, { "epoch": 0.9002347417840375, "grad_norm": 0.34524810314178467, "learning_rate": 2.0306965761511217e-05, "loss": 0.7237, "step": 767 }, { "epoch": 0.9014084507042254, "grad_norm": 0.35661572217941284, "learning_rate": 2.0070838252656435e-05, "loss": 0.6831, "step": 768 }, { "epoch": 0.9025821596244131, "grad_norm": 0.35206255316734314, "learning_rate": 1.9834710743801653e-05, "loss": 0.7721, "step": 769 }, { "epoch": 0.903755868544601, "grad_norm": 0.35439351201057434, "learning_rate": 1.959858323494687e-05, "loss": 0.7142, "step": 770 }, { "epoch": 0.9049295774647887, "grad_norm": 0.33722493052482605, "learning_rate": 1.9362455726092092e-05, "loss": 0.6968, "step": 771 }, { "epoch": 0.9061032863849765, "grad_norm": 0.3573172092437744, "learning_rate": 1.912632821723731e-05, "loss": 0.7301, "step": 772 }, { "epoch": 0.9072769953051644, "grad_norm": 0.3347008526325226, "learning_rate": 1.8890200708382528e-05, "loss": 0.6721, "step": 773 }, { "epoch": 0.9084507042253521, "grad_norm": 0.3563063144683838, "learning_rate": 1.8654073199527746e-05, "loss": 0.7233, "step": 774 }, { "epoch": 0.9096244131455399, "grad_norm": 0.35159915685653687, "learning_rate": 1.8417945690672964e-05, "loss": 0.7184, "step": 775 }, { "epoch": 0.9107981220657277, "grad_norm": 0.35826948285102844, "learning_rate": 1.8181818181818182e-05, "loss": 0.7301, "step": 776 }, { "epoch": 
0.9119718309859155, "grad_norm": 0.3533133268356323, "learning_rate": 1.79456906729634e-05, "loss": 0.7373, "step": 777 }, { "epoch": 0.9131455399061033, "grad_norm": 0.3495820164680481, "learning_rate": 1.7709563164108618e-05, "loss": 0.7379, "step": 778 }, { "epoch": 0.9143192488262911, "grad_norm": 0.33082085847854614, "learning_rate": 1.747343565525384e-05, "loss": 0.6789, "step": 779 }, { "epoch": 0.9154929577464789, "grad_norm": 0.34669029712677, "learning_rate": 1.7237308146399057e-05, "loss": 0.6962, "step": 780 }, { "epoch": 0.9166666666666666, "grad_norm": 0.3401969373226166, "learning_rate": 1.7001180637544275e-05, "loss": 0.717, "step": 781 }, { "epoch": 0.9178403755868545, "grad_norm": 0.3488728702068329, "learning_rate": 1.6765053128689493e-05, "loss": 0.7087, "step": 782 }, { "epoch": 0.9190140845070423, "grad_norm": 0.39244547486305237, "learning_rate": 1.652892561983471e-05, "loss": 0.7331, "step": 783 }, { "epoch": 0.92018779342723, "grad_norm": 0.33185505867004395, "learning_rate": 1.629279811097993e-05, "loss": 0.6821, "step": 784 }, { "epoch": 0.9213615023474179, "grad_norm": 0.34186288714408875, "learning_rate": 1.6056670602125147e-05, "loss": 0.6766, "step": 785 }, { "epoch": 0.9225352112676056, "grad_norm": 0.34512627124786377, "learning_rate": 1.582054309327037e-05, "loss": 0.6837, "step": 786 }, { "epoch": 0.9237089201877934, "grad_norm": 0.34042122960090637, "learning_rate": 1.5584415584415583e-05, "loss": 0.7266, "step": 787 }, { "epoch": 0.9248826291079812, "grad_norm": 0.34173402190208435, "learning_rate": 1.5348288075560805e-05, "loss": 0.6998, "step": 788 }, { "epoch": 0.926056338028169, "grad_norm": 0.34008073806762695, "learning_rate": 1.511216056670602e-05, "loss": 0.7211, "step": 789 }, { "epoch": 0.9272300469483568, "grad_norm": 0.3400252163410187, "learning_rate": 1.487603305785124e-05, "loss": 0.6771, "step": 790 }, { "epoch": 0.9284037558685446, "grad_norm": 0.3393029570579529, "learning_rate": 1.4639905548996457e-05, "loss": 
0.7274, "step": 791 }, { "epoch": 0.9295774647887324, "grad_norm": 0.3489772379398346, "learning_rate": 1.4403778040141676e-05, "loss": 0.7195, "step": 792 }, { "epoch": 0.9307511737089202, "grad_norm": 0.3434072732925415, "learning_rate": 1.4167650531286896e-05, "loss": 0.6806, "step": 793 }, { "epoch": 0.931924882629108, "grad_norm": 0.35593146085739136, "learning_rate": 1.3931523022432114e-05, "loss": 0.7026, "step": 794 }, { "epoch": 0.9330985915492958, "grad_norm": 0.33654287457466125, "learning_rate": 1.3695395513577334e-05, "loss": 0.6655, "step": 795 }, { "epoch": 0.9342723004694836, "grad_norm": 0.35049983859062195, "learning_rate": 1.345926800472255e-05, "loss": 0.686, "step": 796 }, { "epoch": 0.9354460093896714, "grad_norm": 0.3442087471485138, "learning_rate": 1.322314049586777e-05, "loss": 0.7048, "step": 797 }, { "epoch": 0.9366197183098591, "grad_norm": 0.3569439649581909, "learning_rate": 1.2987012987012986e-05, "loss": 0.7271, "step": 798 }, { "epoch": 0.937793427230047, "grad_norm": 0.3418942391872406, "learning_rate": 1.2750885478158206e-05, "loss": 0.7132, "step": 799 }, { "epoch": 0.9389671361502347, "grad_norm": 0.3399513363838196, "learning_rate": 1.2514757969303425e-05, "loss": 0.7046, "step": 800 }, { "epoch": 0.9401408450704225, "grad_norm": 0.34055379033088684, "learning_rate": 0.00010641553855208948, "loss": 0.7293, "step": 801 }, { "epoch": 0.9413145539906104, "grad_norm": 0.3299119770526886, "learning_rate": 0.0001062978222483814, "loss": 0.6779, "step": 802 }, { "epoch": 0.9424882629107981, "grad_norm": 0.3833242356777191, "learning_rate": 0.00010618010594467334, "loss": 0.6909, "step": 803 }, { "epoch": 0.9436619718309859, "grad_norm": 0.39958855509757996, "learning_rate": 0.00010606238964096529, "loss": 0.7307, "step": 804 }, { "epoch": 0.9448356807511737, "grad_norm": 0.38618725538253784, "learning_rate": 0.00010594467333725722, "loss": 0.6984, "step": 805 }, { "epoch": 0.9460093896713615, "grad_norm": 0.4084942936897278, 
"learning_rate": 0.00010582695703354914, "loss": 0.7456, "step": 806 }, { "epoch": 0.9471830985915493, "grad_norm": 0.4109421372413635, "learning_rate": 0.00010570924072984109, "loss": 0.6991, "step": 807 }, { "epoch": 0.9483568075117371, "grad_norm": 0.382415771484375, "learning_rate": 0.00010559152442613303, "loss": 0.726, "step": 808 }, { "epoch": 0.9495305164319249, "grad_norm": 0.4036392867565155, "learning_rate": 0.00010547380812242496, "loss": 0.7264, "step": 809 }, { "epoch": 0.9507042253521126, "grad_norm": 0.38903331756591797, "learning_rate": 0.00010535609181871691, "loss": 0.691, "step": 810 }, { "epoch": 0.9518779342723005, "grad_norm": 0.3803318440914154, "learning_rate": 0.00010523837551500883, "loss": 0.7271, "step": 811 }, { "epoch": 0.9530516431924883, "grad_norm": 0.3850460350513458, "learning_rate": 0.00010512065921130076, "loss": 0.7111, "step": 812 }, { "epoch": 0.954225352112676, "grad_norm": 0.4110994040966034, "learning_rate": 0.00010500294290759271, "loss": 0.7282, "step": 813 }, { "epoch": 0.9553990610328639, "grad_norm": 0.3853722810745239, "learning_rate": 0.00010488522660388465, "loss": 0.7194, "step": 814 }, { "epoch": 0.9565727699530516, "grad_norm": 0.37440797686576843, "learning_rate": 0.00010476751030017658, "loss": 0.7116, "step": 815 }, { "epoch": 0.9577464788732394, "grad_norm": 0.42637899518013, "learning_rate": 0.00010464979399646853, "loss": 0.7189, "step": 816 }, { "epoch": 0.9589201877934272, "grad_norm": 0.4067356288433075, "learning_rate": 0.00010453207769276045, "loss": 0.7509, "step": 817 }, { "epoch": 0.960093896713615, "grad_norm": 0.3854503929615021, "learning_rate": 0.00010441436138905238, "loss": 0.7426, "step": 818 }, { "epoch": 0.9612676056338029, "grad_norm": 0.4298991858959198, "learning_rate": 0.00010429664508534433, "loss": 0.7528, "step": 819 }, { "epoch": 0.9624413145539906, "grad_norm": 0.3748774826526642, "learning_rate": 0.00010417892878163627, "loss": 0.6512, "step": 820 }, { "epoch": 
0.9636150234741784, "grad_norm": 0.38448989391326904, "learning_rate": 0.00010406121247792819, "loss": 0.6929, "step": 821 }, { "epoch": 0.9647887323943662, "grad_norm": 0.42416030168533325, "learning_rate": 0.00010394349617422015, "loss": 0.7312, "step": 822 }, { "epoch": 0.965962441314554, "grad_norm": 0.3875625729560852, "learning_rate": 0.00010382577987051207, "loss": 0.7121, "step": 823 }, { "epoch": 0.9671361502347418, "grad_norm": 0.4241638481616974, "learning_rate": 0.000103708063566804, "loss": 0.7248, "step": 824 }, { "epoch": 0.9683098591549296, "grad_norm": 0.4026165306568146, "learning_rate": 0.00010359034726309595, "loss": 0.7224, "step": 825 }, { "epoch": 0.9694835680751174, "grad_norm": 0.39895206689834595, "learning_rate": 0.00010347263095938789, "loss": 0.7193, "step": 826 }, { "epoch": 0.9706572769953051, "grad_norm": 0.395463228225708, "learning_rate": 0.00010335491465567981, "loss": 0.7673, "step": 827 }, { "epoch": 0.971830985915493, "grad_norm": 0.4351494312286377, "learning_rate": 0.00010323719835197174, "loss": 0.7684, "step": 828 }, { "epoch": 0.9730046948356808, "grad_norm": 0.4378681182861328, "learning_rate": 0.00010311948204826369, "loss": 0.7277, "step": 829 }, { "epoch": 0.9741784037558685, "grad_norm": 0.4214630722999573, "learning_rate": 0.00010300176574455563, "loss": 0.7107, "step": 830 }, { "epoch": 0.9753521126760564, "grad_norm": 0.41999107599258423, "learning_rate": 0.00010288404944084755, "loss": 0.7328, "step": 831 }, { "epoch": 0.9765258215962441, "grad_norm": 0.49026909470558167, "learning_rate": 0.00010276633313713951, "loss": 0.7345, "step": 832 }, { "epoch": 0.9776995305164319, "grad_norm": 0.4068211317062378, "learning_rate": 0.00010264861683343143, "loss": 0.701, "step": 833 }, { "epoch": 0.9788732394366197, "grad_norm": 0.42514288425445557, "learning_rate": 0.00010253090052972336, "loss": 0.729, "step": 834 }, { "epoch": 0.9800469483568075, "grad_norm": 0.4883005619049072, "learning_rate": 0.00010241318422601531, 
"loss": 0.7183, "step": 835 }, { "epoch": 0.9812206572769953, "grad_norm": 0.38146787881851196, "learning_rate": 0.00010229546792230725, "loss": 0.6977, "step": 836 }, { "epoch": 0.9823943661971831, "grad_norm": 0.3898909389972687, "learning_rate": 0.00010217775161859917, "loss": 0.7131, "step": 837 }, { "epoch": 0.9835680751173709, "grad_norm": 0.39693424105644226, "learning_rate": 0.00010206003531489112, "loss": 0.7184, "step": 838 }, { "epoch": 0.9847417840375586, "grad_norm": 0.3968975841999054, "learning_rate": 0.00010194231901118305, "loss": 0.7536, "step": 839 }, { "epoch": 0.9859154929577465, "grad_norm": 0.4030087888240814, "learning_rate": 0.00010182460270747499, "loss": 0.7156, "step": 840 }, { "epoch": 0.9870892018779343, "grad_norm": 0.37477344274520874, "learning_rate": 0.00010170688640376693, "loss": 0.6815, "step": 841 }, { "epoch": 0.9882629107981221, "grad_norm": 0.40929409861564636, "learning_rate": 0.00010158917010005887, "loss": 0.6827, "step": 842 }, { "epoch": 0.9894366197183099, "grad_norm": 0.36350882053375244, "learning_rate": 0.00010147145379635079, "loss": 0.6927, "step": 843 }, { "epoch": 0.9906103286384976, "grad_norm": 0.3828059434890747, "learning_rate": 0.00010135373749264274, "loss": 0.7254, "step": 844 }, { "epoch": 0.9917840375586855, "grad_norm": 0.4095743000507355, "learning_rate": 0.00010123602118893467, "loss": 0.719, "step": 845 }, { "epoch": 0.9929577464788732, "grad_norm": 0.37418296933174133, "learning_rate": 0.0001011183048852266, "loss": 0.682, "step": 846 }, { "epoch": 0.994131455399061, "grad_norm": 0.39427751302719116, "learning_rate": 0.00010100058858151855, "loss": 0.7742, "step": 847 }, { "epoch": 0.9953051643192489, "grad_norm": 0.3696395754814148, "learning_rate": 0.00010088287227781048, "loss": 0.7377, "step": 848 }, { "epoch": 0.9964788732394366, "grad_norm": 0.36249879002571106, "learning_rate": 0.00010076515597410241, "loss": 0.7237, "step": 849 }, { "epoch": 0.9976525821596244, "grad_norm": 
0.3712272047996521, "learning_rate": 0.00010064743967039436, "loss": 0.6737, "step": 850 }, { "epoch": 0.9988262910798122, "grad_norm": 0.37550613284111023, "learning_rate": 0.00010052972336668629, "loss": 0.7147, "step": 851 }, { "epoch": 1.0, "grad_norm": 0.405351459980011, "learning_rate": 0.00010041200706297821, "loss": 0.7364, "step": 852 }, { "epoch": 1.0011737089201878, "grad_norm": 0.39747750759124756, "learning_rate": 0.00010029429075927018, "loss": 0.6934, "step": 853 }, { "epoch": 1.0023474178403755, "grad_norm": 0.3695623576641083, "learning_rate": 0.0001001765744555621, "loss": 0.6971, "step": 854 }, { "epoch": 1.0035211267605635, "grad_norm": 0.3880208134651184, "learning_rate": 0.00010005885815185403, "loss": 0.7219, "step": 855 }, { "epoch": 1.0046948356807512, "grad_norm": 0.40131011605262756, "learning_rate": 9.994114184814597e-05, "loss": 0.6925, "step": 856 }, { "epoch": 1.005868544600939, "grad_norm": 0.38630256056785583, "learning_rate": 9.982342554443791e-05, "loss": 0.7412, "step": 857 }, { "epoch": 1.0070422535211268, "grad_norm": 0.39141979813575745, "learning_rate": 9.970570924072985e-05, "loss": 0.7089, "step": 858 }, { "epoch": 1.0082159624413145, "grad_norm": 0.3811167776584625, "learning_rate": 9.958799293702178e-05, "loss": 0.6979, "step": 859 }, { "epoch": 1.0093896713615023, "grad_norm": 0.38177528977394104, "learning_rate": 9.947027663331372e-05, "loss": 0.7181, "step": 860 }, { "epoch": 1.0105633802816902, "grad_norm": 0.36225804686546326, "learning_rate": 9.935256032960567e-05, "loss": 0.6495, "step": 861 }, { "epoch": 1.011737089201878, "grad_norm": 0.3796376585960388, "learning_rate": 9.923484402589759e-05, "loss": 0.6661, "step": 862 }, { "epoch": 1.0129107981220657, "grad_norm": 0.3896029591560364, "learning_rate": 9.911712772218953e-05, "loss": 0.6705, "step": 863 }, { "epoch": 1.0140845070422535, "grad_norm": 0.35688912868499756, "learning_rate": 9.899941141848147e-05, "loss": 0.6835, "step": 864 }, { "epoch": 
1.0152582159624413, "grad_norm": 0.3919657766819, "learning_rate": 9.88816951147734e-05, "loss": 0.6771, "step": 865 }, { "epoch": 1.016431924882629, "grad_norm": 0.390311062335968, "learning_rate": 9.876397881106534e-05, "loss": 0.7208, "step": 866 }, { "epoch": 1.017605633802817, "grad_norm": 0.3857402205467224, "learning_rate": 9.864626250735727e-05, "loss": 0.7321, "step": 867 }, { "epoch": 1.0187793427230047, "grad_norm": 0.3688738942146301, "learning_rate": 9.852854620364921e-05, "loss": 0.6853, "step": 868 }, { "epoch": 1.0199530516431925, "grad_norm": 0.3814820647239685, "learning_rate": 9.841082989994114e-05, "loss": 0.664, "step": 869 }, { "epoch": 1.0211267605633803, "grad_norm": 0.3849344253540039, "learning_rate": 9.829311359623309e-05, "loss": 0.6844, "step": 870 }, { "epoch": 1.022300469483568, "grad_norm": 0.36203038692474365, "learning_rate": 9.817539729252502e-05, "loss": 0.7201, "step": 871 }, { "epoch": 1.0234741784037558, "grad_norm": 0.36614471673965454, "learning_rate": 9.805768098881696e-05, "loss": 0.659, "step": 872 }, { "epoch": 1.0246478873239437, "grad_norm": 0.3908173143863678, "learning_rate": 9.79399646851089e-05, "loss": 0.6638, "step": 873 }, { "epoch": 1.0258215962441315, "grad_norm": 0.35966452956199646, "learning_rate": 9.782224838140083e-05, "loss": 0.7187, "step": 874 }, { "epoch": 1.0269953051643192, "grad_norm": 0.40878093242645264, "learning_rate": 9.770453207769276e-05, "loss": 0.691, "step": 875 }, { "epoch": 1.028169014084507, "grad_norm": 0.38903382420539856, "learning_rate": 9.75868157739847e-05, "loss": 0.718, "step": 876 }, { "epoch": 1.0293427230046948, "grad_norm": 0.3865324556827545, "learning_rate": 9.746909947027663e-05, "loss": 0.7331, "step": 877 }, { "epoch": 1.0305164319248827, "grad_norm": 0.37417513132095337, "learning_rate": 9.735138316656858e-05, "loss": 0.677, "step": 878 }, { "epoch": 1.0316901408450705, "grad_norm": 0.38043439388275146, "learning_rate": 9.72336668628605e-05, "loss": 0.6932, "step": 
879 }, { "epoch": 1.0328638497652582, "grad_norm": 0.37418729066848755, "learning_rate": 9.711595055915245e-05, "loss": 0.7119, "step": 880 }, { "epoch": 1.034037558685446, "grad_norm": 0.4013047218322754, "learning_rate": 9.699823425544438e-05, "loss": 0.7041, "step": 881 }, { "epoch": 1.0352112676056338, "grad_norm": 0.38462570309638977, "learning_rate": 9.688051795173632e-05, "loss": 0.6861, "step": 882 }, { "epoch": 1.0363849765258215, "grad_norm": 0.3900148868560791, "learning_rate": 9.676280164802825e-05, "loss": 0.6382, "step": 883 }, { "epoch": 1.0375586854460095, "grad_norm": 0.3882652819156647, "learning_rate": 9.66450853443202e-05, "loss": 0.6948, "step": 884 }, { "epoch": 1.0387323943661972, "grad_norm": 0.36546608805656433, "learning_rate": 9.652736904061212e-05, "loss": 0.7064, "step": 885 }, { "epoch": 1.039906103286385, "grad_norm": 0.3788559138774872, "learning_rate": 9.640965273690407e-05, "loss": 0.7129, "step": 886 }, { "epoch": 1.0410798122065728, "grad_norm": 0.3979467451572418, "learning_rate": 9.6291936433196e-05, "loss": 0.7196, "step": 887 }, { "epoch": 1.0422535211267605, "grad_norm": 0.3777488172054291, "learning_rate": 9.617422012948794e-05, "loss": 0.6922, "step": 888 }, { "epoch": 1.0434272300469483, "grad_norm": 0.39730504155158997, "learning_rate": 9.605650382577987e-05, "loss": 0.6529, "step": 889 }, { "epoch": 1.0446009389671362, "grad_norm": 0.39619576930999756, "learning_rate": 9.593878752207182e-05, "loss": 0.6505, "step": 890 }, { "epoch": 1.045774647887324, "grad_norm": 0.3763888478279114, "learning_rate": 9.582107121836374e-05, "loss": 0.638, "step": 891 }, { "epoch": 1.0469483568075117, "grad_norm": 0.3947450518608093, "learning_rate": 9.570335491465569e-05, "loss": 0.7099, "step": 892 }, { "epoch": 1.0481220657276995, "grad_norm": 0.43239885568618774, "learning_rate": 9.558563861094763e-05, "loss": 0.7112, "step": 893 }, { "epoch": 1.0492957746478873, "grad_norm": 0.37725165486335754, "learning_rate": 
9.546792230723956e-05, "loss": 0.6775, "step": 894 }, { "epoch": 1.050469483568075, "grad_norm": 0.3807140290737152, "learning_rate": 9.53502060035315e-05, "loss": 0.7201, "step": 895 }, { "epoch": 1.051643192488263, "grad_norm": 0.40270236134529114, "learning_rate": 9.523248969982343e-05, "loss": 0.6908, "step": 896 }, { "epoch": 1.0528169014084507, "grad_norm": 0.38907137513160706, "learning_rate": 9.511477339611536e-05, "loss": 0.7274, "step": 897 }, { "epoch": 1.0539906103286385, "grad_norm": 0.35074397921562195, "learning_rate": 9.49970570924073e-05, "loss": 0.6765, "step": 898 }, { "epoch": 1.0551643192488263, "grad_norm": 0.37548649311065674, "learning_rate": 9.487934078869925e-05, "loss": 0.7258, "step": 899 }, { "epoch": 1.056338028169014, "grad_norm": 0.3947518467903137, "learning_rate": 9.476162448499118e-05, "loss": 0.7142, "step": 900 }, { "epoch": 1.057511737089202, "grad_norm": 0.36888387799263, "learning_rate": 9.464390818128312e-05, "loss": 0.664, "step": 901 }, { "epoch": 1.0586854460093897, "grad_norm": 0.3735831379890442, "learning_rate": 9.452619187757505e-05, "loss": 0.6914, "step": 902 }, { "epoch": 1.0598591549295775, "grad_norm": 0.3840358257293701, "learning_rate": 9.440847557386698e-05, "loss": 0.663, "step": 903 }, { "epoch": 1.0610328638497653, "grad_norm": 0.408840537071228, "learning_rate": 9.429075927015892e-05, "loss": 0.7225, "step": 904 }, { "epoch": 1.062206572769953, "grad_norm": 0.36408165097236633, "learning_rate": 9.417304296645085e-05, "loss": 0.6744, "step": 905 }, { "epoch": 1.0633802816901408, "grad_norm": 0.4005196690559387, "learning_rate": 9.405532666274279e-05, "loss": 0.7285, "step": 906 }, { "epoch": 1.0645539906103287, "grad_norm": 0.3824830949306488, "learning_rate": 9.393761035903474e-05, "loss": 0.6978, "step": 907 }, { "epoch": 1.0657276995305165, "grad_norm": 0.38410818576812744, "learning_rate": 9.381989405532666e-05, "loss": 0.6725, "step": 908 }, { "epoch": 1.0669014084507042, "grad_norm": 
0.37026217579841614, "learning_rate": 9.37021777516186e-05, "loss": 0.6908, "step": 909 }, { "epoch": 1.068075117370892, "grad_norm": 0.37652963399887085, "learning_rate": 9.358446144791054e-05, "loss": 0.6674, "step": 910 }, { "epoch": 1.0692488262910798, "grad_norm": 0.40584585070610046, "learning_rate": 9.346674514420247e-05, "loss": 0.7087, "step": 911 }, { "epoch": 1.0704225352112675, "grad_norm": 0.3777616620063782, "learning_rate": 9.334902884049441e-05, "loss": 0.6633, "step": 912 }, { "epoch": 1.0715962441314555, "grad_norm": 0.35584181547164917, "learning_rate": 9.323131253678636e-05, "loss": 0.6484, "step": 913 }, { "epoch": 1.0727699530516432, "grad_norm": 0.40920573472976685, "learning_rate": 9.311359623307828e-05, "loss": 0.6781, "step": 914 }, { "epoch": 1.073943661971831, "grad_norm": 0.37617766857147217, "learning_rate": 9.299587992937023e-05, "loss": 0.6785, "step": 915 }, { "epoch": 1.0751173708920188, "grad_norm": 0.36854755878448486, "learning_rate": 9.287816362566216e-05, "loss": 0.6805, "step": 916 }, { "epoch": 1.0762910798122065, "grad_norm": 0.3820021152496338, "learning_rate": 9.27604473219541e-05, "loss": 0.7413, "step": 917 }, { "epoch": 1.0774647887323943, "grad_norm": 0.3654205799102783, "learning_rate": 9.264273101824603e-05, "loss": 0.6996, "step": 918 }, { "epoch": 1.0786384976525822, "grad_norm": 0.36847448348999023, "learning_rate": 9.252501471453798e-05, "loss": 0.6593, "step": 919 }, { "epoch": 1.07981220657277, "grad_norm": 0.4072454571723938, "learning_rate": 9.24072984108299e-05, "loss": 0.7062, "step": 920 }, { "epoch": 1.0809859154929577, "grad_norm": 0.37201663851737976, "learning_rate": 9.228958210712185e-05, "loss": 0.7188, "step": 921 }, { "epoch": 1.0821596244131455, "grad_norm": 0.40708494186401367, "learning_rate": 9.217186580341378e-05, "loss": 0.6984, "step": 922 }, { "epoch": 1.0833333333333333, "grad_norm": 0.37668758630752563, "learning_rate": 9.205414949970572e-05, "loss": 0.6856, "step": 923 }, { "epoch": 
1.084507042253521, "grad_norm": 0.41518712043762207, "learning_rate": 9.193643319599765e-05, "loss": 0.7093, "step": 924 }, { "epoch": 1.085680751173709, "grad_norm": 0.3661474883556366, "learning_rate": 9.181871689228958e-05, "loss": 0.6765, "step": 925 }, { "epoch": 1.0868544600938967, "grad_norm": 0.3910673260688782, "learning_rate": 9.170100058858152e-05, "loss": 0.6778, "step": 926 }, { "epoch": 1.0880281690140845, "grad_norm": 0.3851100206375122, "learning_rate": 9.158328428487345e-05, "loss": 0.7188, "step": 927 }, { "epoch": 1.0892018779342723, "grad_norm": 0.36254799365997314, "learning_rate": 9.14655679811654e-05, "loss": 0.7182, "step": 928 }, { "epoch": 1.09037558685446, "grad_norm": 0.39364567399024963, "learning_rate": 9.134785167745734e-05, "loss": 0.7208, "step": 929 }, { "epoch": 1.091549295774648, "grad_norm": 0.3755466639995575, "learning_rate": 9.123013537374927e-05, "loss": 0.6771, "step": 930 }, { "epoch": 1.0927230046948357, "grad_norm": 0.361087828874588, "learning_rate": 9.11124190700412e-05, "loss": 0.6541, "step": 931 }, { "epoch": 1.0938967136150235, "grad_norm": 0.37327754497528076, "learning_rate": 9.099470276633314e-05, "loss": 0.698, "step": 932 }, { "epoch": 1.0950704225352113, "grad_norm": 0.38413748145103455, "learning_rate": 9.087698646262507e-05, "loss": 0.6933, "step": 933 }, { "epoch": 1.096244131455399, "grad_norm": 0.4182147681713104, "learning_rate": 9.075927015891701e-05, "loss": 0.6776, "step": 934 }, { "epoch": 1.0974178403755868, "grad_norm": 0.3987724483013153, "learning_rate": 9.064155385520894e-05, "loss": 0.694, "step": 935 }, { "epoch": 1.0985915492957747, "grad_norm": 0.37629225850105286, "learning_rate": 9.052383755150089e-05, "loss": 0.6565, "step": 936 }, { "epoch": 1.0997652582159625, "grad_norm": 0.38973352313041687, "learning_rate": 9.040612124779281e-05, "loss": 0.6739, "step": 937 }, { "epoch": 1.1009389671361502, "grad_norm": 0.3845914900302887, "learning_rate": 9.028840494408476e-05, "loss": 0.6788, 
"step": 938 }, { "epoch": 1.102112676056338, "grad_norm": 0.3861023485660553, "learning_rate": 9.01706886403767e-05, "loss": 0.6763, "step": 939 }, { "epoch": 1.1032863849765258, "grad_norm": 0.37565183639526367, "learning_rate": 9.005297233666863e-05, "loss": 0.6478, "step": 940 }, { "epoch": 1.1044600938967135, "grad_norm": 0.4068315029144287, "learning_rate": 8.993525603296056e-05, "loss": 0.6752, "step": 941 }, { "epoch": 1.1056338028169015, "grad_norm": 0.37796974182128906, "learning_rate": 8.981753972925251e-05, "loss": 0.7355, "step": 942 }, { "epoch": 1.1068075117370892, "grad_norm": 0.4024117887020111, "learning_rate": 8.969982342554443e-05, "loss": 0.6648, "step": 943 }, { "epoch": 1.107981220657277, "grad_norm": 0.404442697763443, "learning_rate": 8.958210712183638e-05, "loss": 0.7, "step": 944 }, { "epoch": 1.1091549295774648, "grad_norm": 0.35948899388313293, "learning_rate": 8.946439081812832e-05, "loss": 0.6859, "step": 945 }, { "epoch": 1.1103286384976525, "grad_norm": 0.4014012813568115, "learning_rate": 8.934667451442025e-05, "loss": 0.7294, "step": 946 }, { "epoch": 1.1115023474178405, "grad_norm": 0.38261109590530396, "learning_rate": 8.922895821071219e-05, "loss": 0.6965, "step": 947 }, { "epoch": 1.1126760563380282, "grad_norm": 0.39297208189964294, "learning_rate": 8.911124190700413e-05, "loss": 0.7153, "step": 948 }, { "epoch": 1.113849765258216, "grad_norm": 0.3710176348686218, "learning_rate": 8.899352560329605e-05, "loss": 0.7085, "step": 949 }, { "epoch": 1.1150234741784038, "grad_norm": 0.3750080168247223, "learning_rate": 8.8875809299588e-05, "loss": 0.6739, "step": 950 }, { "epoch": 1.1161971830985915, "grad_norm": 0.3672105073928833, "learning_rate": 8.875809299587994e-05, "loss": 0.7097, "step": 951 }, { "epoch": 1.1173708920187793, "grad_norm": 0.3663265109062195, "learning_rate": 8.864037669217187e-05, "loss": 0.6594, "step": 952 }, { "epoch": 1.1185446009389672, "grad_norm": 0.4023442268371582, "learning_rate": 
8.85226603884638e-05, "loss": 0.7186, "step": 953 }, { "epoch": 1.119718309859155, "grad_norm": 0.36602139472961426, "learning_rate": 8.840494408475574e-05, "loss": 0.67, "step": 954 }, { "epoch": 1.1208920187793427, "grad_norm": 0.36866381764411926, "learning_rate": 8.828722778104768e-05, "loss": 0.6954, "step": 955 }, { "epoch": 1.1220657276995305, "grad_norm": 0.38905832171440125, "learning_rate": 8.816951147733961e-05, "loss": 0.7214, "step": 956 }, { "epoch": 1.1232394366197183, "grad_norm": 0.3806670010089874, "learning_rate": 8.805179517363156e-05, "loss": 0.6679, "step": 957 }, { "epoch": 1.124413145539906, "grad_norm": 0.3796343505382538, "learning_rate": 8.793407886992349e-05, "loss": 0.6334, "step": 958 }, { "epoch": 1.125586854460094, "grad_norm": 0.4143288731575012, "learning_rate": 8.781636256621543e-05, "loss": 0.7484, "step": 959 }, { "epoch": 1.1267605633802817, "grad_norm": 0.3692832887172699, "learning_rate": 8.769864626250736e-05, "loss": 0.6581, "step": 960 }, { "epoch": 1.1279342723004695, "grad_norm": 0.39971667528152466, "learning_rate": 8.75809299587993e-05, "loss": 0.7252, "step": 961 }, { "epoch": 1.1291079812206573, "grad_norm": 0.391924113035202, "learning_rate": 8.746321365509123e-05, "loss": 0.673, "step": 962 }, { "epoch": 1.130281690140845, "grad_norm": 0.39626866579055786, "learning_rate": 8.734549735138317e-05, "loss": 0.7161, "step": 963 }, { "epoch": 1.131455399061033, "grad_norm": 0.3812800347805023, "learning_rate": 8.72277810476751e-05, "loss": 0.6735, "step": 964 }, { "epoch": 1.1326291079812207, "grad_norm": 0.36054447293281555, "learning_rate": 8.711006474396705e-05, "loss": 0.6861, "step": 965 }, { "epoch": 1.1338028169014085, "grad_norm": 0.41179588437080383, "learning_rate": 8.699234844025897e-05, "loss": 0.7151, "step": 966 }, { "epoch": 1.1349765258215962, "grad_norm": 0.3688051998615265, "learning_rate": 8.687463213655092e-05, "loss": 0.6608, "step": 967 }, { "epoch": 1.136150234741784, "grad_norm": 
0.3877013325691223, "learning_rate": 8.675691583284285e-05, "loss": 0.6826, "step": 968 }, { "epoch": 1.1373239436619718, "grad_norm": 0.38986387848854065, "learning_rate": 8.663919952913479e-05, "loss": 0.6915, "step": 969 }, { "epoch": 1.1384976525821595, "grad_norm": 0.41986656188964844, "learning_rate": 8.652148322542672e-05, "loss": 0.7471, "step": 970 }, { "epoch": 1.1396713615023475, "grad_norm": 0.3977747857570648, "learning_rate": 8.640376692171867e-05, "loss": 0.6844, "step": 971 }, { "epoch": 1.1408450704225352, "grad_norm": 0.3956218361854553, "learning_rate": 8.628605061801059e-05, "loss": 0.6586, "step": 972 }, { "epoch": 1.142018779342723, "grad_norm": 0.3789028227329254, "learning_rate": 8.616833431430254e-05, "loss": 0.7415, "step": 973 }, { "epoch": 1.1431924882629108, "grad_norm": 0.3878764808177948, "learning_rate": 8.605061801059447e-05, "loss": 0.6559, "step": 974 }, { "epoch": 1.1443661971830985, "grad_norm": 0.37901559472084045, "learning_rate": 8.593290170688641e-05, "loss": 0.6685, "step": 975 }, { "epoch": 1.1455399061032865, "grad_norm": 0.40399041771888733, "learning_rate": 8.581518540317834e-05, "loss": 0.6602, "step": 976 }, { "epoch": 1.1467136150234742, "grad_norm": 0.38144391775131226, "learning_rate": 8.569746909947029e-05, "loss": 0.6683, "step": 977 }, { "epoch": 1.147887323943662, "grad_norm": 0.3610433042049408, "learning_rate": 8.557975279576221e-05, "loss": 0.6579, "step": 978 }, { "epoch": 1.1490610328638498, "grad_norm": 0.42147722840309143, "learning_rate": 8.546203649205416e-05, "loss": 0.6997, "step": 979 }, { "epoch": 1.1502347417840375, "grad_norm": 0.3799455761909485, "learning_rate": 8.53443201883461e-05, "loss": 0.7096, "step": 980 }, { "epoch": 1.1514084507042253, "grad_norm": 0.4173739552497864, "learning_rate": 8.522660388463803e-05, "loss": 0.6708, "step": 981 }, { "epoch": 1.1525821596244132, "grad_norm": 0.3997640013694763, "learning_rate": 8.510888758092996e-05, "loss": 0.6514, "step": 982 }, { "epoch": 
1.153755868544601, "grad_norm": 0.3758656978607178, "learning_rate": 8.49911712772219e-05, "loss": 0.6442, "step": 983 }, { "epoch": 1.1549295774647887, "grad_norm": 0.37429675459861755, "learning_rate": 8.487345497351383e-05, "loss": 0.6619, "step": 984 }, { "epoch": 1.1561032863849765, "grad_norm": 0.3747265934944153, "learning_rate": 8.475573866980577e-05, "loss": 0.7107, "step": 985 }, { "epoch": 1.1572769953051643, "grad_norm": 0.37782514095306396, "learning_rate": 8.463802236609771e-05, "loss": 0.7241, "step": 986 }, { "epoch": 1.158450704225352, "grad_norm": 0.3703122138977051, "learning_rate": 8.452030606238965e-05, "loss": 0.6952, "step": 987 }, { "epoch": 1.15962441314554, "grad_norm": 0.37990477681159973, "learning_rate": 8.440258975868158e-05, "loss": 0.7364, "step": 988 }, { "epoch": 1.1607981220657277, "grad_norm": 0.42046844959259033, "learning_rate": 8.428487345497352e-05, "loss": 0.695, "step": 989 }, { "epoch": 1.1619718309859155, "grad_norm": 0.3745966851711273, "learning_rate": 8.416715715126545e-05, "loss": 0.6875, "step": 990 }, { "epoch": 1.1631455399061033, "grad_norm": 0.3496320843696594, "learning_rate": 8.404944084755739e-05, "loss": 0.6826, "step": 991 }, { "epoch": 1.164319248826291, "grad_norm": 0.39181873202323914, "learning_rate": 8.393172454384934e-05, "loss": 0.6937, "step": 992 }, { "epoch": 1.165492957746479, "grad_norm": 0.3910543620586395, "learning_rate": 8.381400824014126e-05, "loss": 0.749, "step": 993 }, { "epoch": 1.1666666666666667, "grad_norm": 0.3770748972892761, "learning_rate": 8.36962919364332e-05, "loss": 0.6743, "step": 994 }, { "epoch": 1.1678403755868545, "grad_norm": 0.3675018846988678, "learning_rate": 8.357857563272513e-05, "loss": 0.6499, "step": 995 }, { "epoch": 1.1690140845070423, "grad_norm": 0.36867639422416687, "learning_rate": 8.346085932901707e-05, "loss": 0.6642, "step": 996 }, { "epoch": 1.17018779342723, "grad_norm": 0.3860320746898651, "learning_rate": 8.334314302530901e-05, "loss": 0.6947, 
"step": 997 }, { "epoch": 1.1713615023474178, "grad_norm": 0.36680731177330017, "learning_rate": 8.322542672160094e-05, "loss": 0.7111, "step": 998 }, { "epoch": 1.1725352112676055, "grad_norm": 0.38997524976730347, "learning_rate": 8.310771041789288e-05, "loss": 0.6842, "step": 999 }, { "epoch": 1.1737089201877935, "grad_norm": 0.3883102834224701, "learning_rate": 8.298999411418483e-05, "loss": 0.6655, "step": 1000 }, { "epoch": 1.1748826291079812, "grad_norm": 0.37893033027648926, "learning_rate": 8.287227781047675e-05, "loss": 0.7422, "step": 1001 }, { "epoch": 1.176056338028169, "grad_norm": 0.3806493580341339, "learning_rate": 8.27545615067687e-05, "loss": 0.7147, "step": 1002 }, { "epoch": 1.1772300469483568, "grad_norm": 0.36736544966697693, "learning_rate": 8.263684520306063e-05, "loss": 0.6813, "step": 1003 }, { "epoch": 1.1784037558685445, "grad_norm": 0.3947063088417053, "learning_rate": 8.251912889935256e-05, "loss": 0.6863, "step": 1004 }, { "epoch": 1.1795774647887325, "grad_norm": 0.3787820339202881, "learning_rate": 8.24014125956445e-05, "loss": 0.6929, "step": 1005 }, { "epoch": 1.1807511737089202, "grad_norm": 0.3705902099609375, "learning_rate": 8.228369629193645e-05, "loss": 0.6536, "step": 1006 }, { "epoch": 1.181924882629108, "grad_norm": 0.38193127512931824, "learning_rate": 8.216597998822837e-05, "loss": 0.6822, "step": 1007 }, { "epoch": 1.1830985915492958, "grad_norm": 0.3939112424850464, "learning_rate": 8.204826368452032e-05, "loss": 0.7211, "step": 1008 }, { "epoch": 1.1842723004694835, "grad_norm": 0.3711683452129364, "learning_rate": 8.193054738081225e-05, "loss": 0.6839, "step": 1009 }, { "epoch": 1.1854460093896715, "grad_norm": 0.3889683187007904, "learning_rate": 8.181283107710418e-05, "loss": 0.7151, "step": 1010 }, { "epoch": 1.1866197183098592, "grad_norm": 0.3716399073600769, "learning_rate": 8.169511477339612e-05, "loss": 0.6706, "step": 1011 }, { "epoch": 1.187793427230047, "grad_norm": 0.3653043806552887, "learning_rate": 
8.157739846968805e-05, "loss": 0.691, "step": 1012 }, { "epoch": 1.1889671361502347, "grad_norm": 0.37435877323150635, "learning_rate": 8.145968216597999e-05, "loss": 0.6431, "step": 1013 }, { "epoch": 1.1901408450704225, "grad_norm": 0.36023011803627014, "learning_rate": 8.134196586227192e-05, "loss": 0.6397, "step": 1014 }, { "epoch": 1.1913145539906103, "grad_norm": 0.3797522485256195, "learning_rate": 8.122424955856387e-05, "loss": 0.6615, "step": 1015 }, { "epoch": 1.192488262910798, "grad_norm": 0.3832303285598755, "learning_rate": 8.11065332548558e-05, "loss": 0.6685, "step": 1016 }, { "epoch": 1.193661971830986, "grad_norm": 0.37008216977119446, "learning_rate": 8.098881695114774e-05, "loss": 0.7051, "step": 1017 }, { "epoch": 1.1948356807511737, "grad_norm": 0.3758820593357086, "learning_rate": 8.087110064743967e-05, "loss": 0.6753, "step": 1018 }, { "epoch": 1.1960093896713615, "grad_norm": 0.3715493977069855, "learning_rate": 8.075338434373161e-05, "loss": 0.6863, "step": 1019 }, { "epoch": 1.1971830985915493, "grad_norm": 0.36335518956184387, "learning_rate": 8.063566804002354e-05, "loss": 0.6865, "step": 1020 }, { "epoch": 1.198356807511737, "grad_norm": 0.36738407611846924, "learning_rate": 8.051795173631549e-05, "loss": 0.6729, "step": 1021 }, { "epoch": 1.199530516431925, "grad_norm": 0.37416642904281616, "learning_rate": 8.040023543260741e-05, "loss": 0.6904, "step": 1022 }, { "epoch": 1.2007042253521127, "grad_norm": 0.37728917598724365, "learning_rate": 8.028251912889936e-05, "loss": 0.6613, "step": 1023 }, { "epoch": 1.2018779342723005, "grad_norm": 0.3821254074573517, "learning_rate": 8.016480282519128e-05, "loss": 0.6567, "step": 1024 }, { "epoch": 1.2030516431924883, "grad_norm": 0.37766242027282715, "learning_rate": 8.004708652148323e-05, "loss": 0.6648, "step": 1025 }, { "epoch": 1.204225352112676, "grad_norm": 0.3994279205799103, "learning_rate": 7.992937021777516e-05, "loss": 0.7085, "step": 1026 }, { "epoch": 1.2053990610328638, 
"grad_norm": 0.3634461760520935, "learning_rate": 7.98116539140671e-05, "loss": 0.69, "step": 1027 }, { "epoch": 1.2065727699530517, "grad_norm": 0.36296284198760986, "learning_rate": 7.969393761035903e-05, "loss": 0.6394, "step": 1028 }, { "epoch": 1.2077464788732395, "grad_norm": 0.38547366857528687, "learning_rate": 7.957622130665098e-05, "loss": 0.6602, "step": 1029 }, { "epoch": 1.2089201877934272, "grad_norm": 0.38083600997924805, "learning_rate": 7.94585050029429e-05, "loss": 0.6559, "step": 1030 }, { "epoch": 1.210093896713615, "grad_norm": 0.377316415309906, "learning_rate": 7.934078869923485e-05, "loss": 0.7096, "step": 1031 }, { "epoch": 1.2112676056338028, "grad_norm": 0.37379539012908936, "learning_rate": 7.922307239552679e-05, "loss": 0.7321, "step": 1032 }, { "epoch": 1.2124413145539905, "grad_norm": 0.3899437487125397, "learning_rate": 7.910535609181872e-05, "loss": 0.6307, "step": 1033 }, { "epoch": 1.2136150234741785, "grad_norm": 0.3994787931442261, "learning_rate": 7.898763978811065e-05, "loss": 0.7189, "step": 1034 }, { "epoch": 1.2147887323943662, "grad_norm": 0.3705124855041504, "learning_rate": 7.88699234844026e-05, "loss": 0.6683, "step": 1035 }, { "epoch": 1.215962441314554, "grad_norm": 0.39496761560440063, "learning_rate": 7.875220718069452e-05, "loss": 0.6961, "step": 1036 }, { "epoch": 1.2171361502347418, "grad_norm": 0.38047799468040466, "learning_rate": 7.863449087698647e-05, "loss": 0.6877, "step": 1037 }, { "epoch": 1.2183098591549295, "grad_norm": 0.3772275745868683, "learning_rate": 7.85167745732784e-05, "loss": 0.6523, "step": 1038 }, { "epoch": 1.2194835680751175, "grad_norm": 0.37646278738975525, "learning_rate": 7.839905826957034e-05, "loss": 0.686, "step": 1039 }, { "epoch": 1.2206572769953052, "grad_norm": 0.3657703101634979, "learning_rate": 7.828134196586228e-05, "loss": 0.6904, "step": 1040 }, { "epoch": 1.221830985915493, "grad_norm": 0.36845406889915466, "learning_rate": 7.816362566215421e-05, "loss": 0.6795, "step": 
1041 }, { "epoch": 1.2230046948356808, "grad_norm": 0.37235668301582336, "learning_rate": 7.804590935844614e-05, "loss": 0.6406, "step": 1042 }, { "epoch": 1.2241784037558685, "grad_norm": 0.3837567865848541, "learning_rate": 7.792819305473808e-05, "loss": 0.7387, "step": 1043 }, { "epoch": 1.2253521126760563, "grad_norm": 0.3731269836425781, "learning_rate": 7.781047675103003e-05, "loss": 0.6767, "step": 1044 }, { "epoch": 1.226525821596244, "grad_norm": 0.3709997534751892, "learning_rate": 7.769276044732196e-05, "loss": 0.6843, "step": 1045 }, { "epoch": 1.227699530516432, "grad_norm": 0.36420220136642456, "learning_rate": 7.75750441436139e-05, "loss": 0.6732, "step": 1046 }, { "epoch": 1.2288732394366197, "grad_norm": 0.35353943705558777, "learning_rate": 7.745732783990583e-05, "loss": 0.6955, "step": 1047 }, { "epoch": 1.2300469483568075, "grad_norm": 0.3663172721862793, "learning_rate": 7.733961153619777e-05, "loss": 0.6509, "step": 1048 }, { "epoch": 1.2312206572769953, "grad_norm": 0.3808366060256958, "learning_rate": 7.72218952324897e-05, "loss": 0.6736, "step": 1049 }, { "epoch": 1.232394366197183, "grad_norm": 0.3684788942337036, "learning_rate": 7.710417892878165e-05, "loss": 0.7294, "step": 1050 }, { "epoch": 1.233568075117371, "grad_norm": 0.3792336881160736, "learning_rate": 7.698646262507357e-05, "loss": 0.6864, "step": 1051 }, { "epoch": 1.2347417840375587, "grad_norm": 0.3681674897670746, "learning_rate": 7.686874632136552e-05, "loss": 0.6406, "step": 1052 }, { "epoch": 1.2359154929577465, "grad_norm": 0.3578929603099823, "learning_rate": 7.675103001765744e-05, "loss": 0.6799, "step": 1053 }, { "epoch": 1.2370892018779343, "grad_norm": 0.3885924816131592, "learning_rate": 7.663331371394939e-05, "loss": 0.6888, "step": 1054 }, { "epoch": 1.238262910798122, "grad_norm": 0.3622206747531891, "learning_rate": 7.651559741024132e-05, "loss": 0.6591, "step": 1055 }, { "epoch": 1.2394366197183098, "grad_norm": 0.36698073148727417, "learning_rate": 
7.639788110653326e-05, "loss": 0.6703, "step": 1056 }, { "epoch": 1.2406103286384977, "grad_norm": 0.3823949992656708, "learning_rate": 7.628016480282519e-05, "loss": 0.6354, "step": 1057 }, { "epoch": 1.2417840375586855, "grad_norm": 0.3905463218688965, "learning_rate": 7.616244849911714e-05, "loss": 0.6875, "step": 1058 }, { "epoch": 1.2429577464788732, "grad_norm": 0.3658163845539093, "learning_rate": 7.604473219540906e-05, "loss": 0.6918, "step": 1059 }, { "epoch": 1.244131455399061, "grad_norm": 0.3887515068054199, "learning_rate": 7.592701589170101e-05, "loss": 0.6636, "step": 1060 }, { "epoch": 1.2453051643192488, "grad_norm": 0.38240671157836914, "learning_rate": 7.580929958799294e-05, "loss": 0.7151, "step": 1061 }, { "epoch": 1.2464788732394365, "grad_norm": 0.3772590458393097, "learning_rate": 7.569158328428488e-05, "loss": 0.646, "step": 1062 }, { "epoch": 1.2476525821596245, "grad_norm": 0.4079722464084625, "learning_rate": 7.557386698057681e-05, "loss": 0.6599, "step": 1063 }, { "epoch": 1.2488262910798122, "grad_norm": 0.36246946454048157, "learning_rate": 7.545615067686876e-05, "loss": 0.6512, "step": 1064 }, { "epoch": 1.25, "grad_norm": 0.4114508330821991, "learning_rate": 7.533843437316068e-05, "loss": 0.6756, "step": 1065 }, { "epoch": 1.2511737089201878, "grad_norm": 0.40775856375694275, "learning_rate": 7.522071806945263e-05, "loss": 0.7285, "step": 1066 }, { "epoch": 1.2523474178403755, "grad_norm": 0.3941507339477539, "learning_rate": 7.510300176574456e-05, "loss": 0.6548, "step": 1067 }, { "epoch": 1.2535211267605635, "grad_norm": 0.3808498978614807, "learning_rate": 7.49852854620365e-05, "loss": 0.6922, "step": 1068 }, { "epoch": 1.2546948356807512, "grad_norm": 0.39144501090049744, "learning_rate": 7.486756915832843e-05, "loss": 0.779, "step": 1069 }, { "epoch": 1.255868544600939, "grad_norm": 0.3885059356689453, "learning_rate": 7.474985285462037e-05, "loss": 0.7023, "step": 1070 }, { "epoch": 1.2570422535211268, "grad_norm": 
0.37694790959358215, "learning_rate": 7.46321365509123e-05, "loss": 0.6306, "step": 1071 }, { "epoch": 1.2582159624413145, "grad_norm": 0.3846953213214874, "learning_rate": 7.451442024720424e-05, "loss": 0.6719, "step": 1072 }, { "epoch": 1.2593896713615025, "grad_norm": 0.3871643841266632, "learning_rate": 7.439670394349618e-05, "loss": 0.7057, "step": 1073 }, { "epoch": 1.26056338028169, "grad_norm": 0.37070584297180176, "learning_rate": 7.427898763978812e-05, "loss": 0.706, "step": 1074 }, { "epoch": 1.261737089201878, "grad_norm": 0.3774947226047516, "learning_rate": 7.416127133608005e-05, "loss": 0.6918, "step": 1075 }, { "epoch": 1.2629107981220657, "grad_norm": 0.36732053756713867, "learning_rate": 7.404355503237199e-05, "loss": 0.6639, "step": 1076 }, { "epoch": 1.2640845070422535, "grad_norm": 0.36723384261131287, "learning_rate": 7.392583872866392e-05, "loss": 0.6819, "step": 1077 }, { "epoch": 1.2652582159624413, "grad_norm": 0.35997796058654785, "learning_rate": 7.380812242495586e-05, "loss": 0.6772, "step": 1078 }, { "epoch": 1.266431924882629, "grad_norm": 0.385193407535553, "learning_rate": 7.36904061212478e-05, "loss": 0.693, "step": 1079 }, { "epoch": 1.267605633802817, "grad_norm": 0.40055274963378906, "learning_rate": 7.357268981753973e-05, "loss": 0.6957, "step": 1080 }, { "epoch": 1.2687793427230047, "grad_norm": 0.35632067918777466, "learning_rate": 7.345497351383167e-05, "loss": 0.685, "step": 1081 }, { "epoch": 1.2699530516431925, "grad_norm": 0.3778478503227234, "learning_rate": 7.333725721012361e-05, "loss": 0.6657, "step": 1082 }, { "epoch": 1.2711267605633803, "grad_norm": 0.38718998432159424, "learning_rate": 7.321954090641554e-05, "loss": 0.6929, "step": 1083 }, { "epoch": 1.272300469483568, "grad_norm": 0.3769080936908722, "learning_rate": 7.310182460270748e-05, "loss": 0.7031, "step": 1084 }, { "epoch": 1.273474178403756, "grad_norm": 0.3617388904094696, "learning_rate": 7.298410829899941e-05, "loss": 0.665, "step": 1085 }, { 
"epoch": 1.2746478873239437, "grad_norm": 0.38341668248176575, "learning_rate": 7.286639199529135e-05, "loss": 0.6921, "step": 1086 }, { "epoch": 1.2758215962441315, "grad_norm": 0.36512598395347595, "learning_rate": 7.27486756915833e-05, "loss": 0.6582, "step": 1087 }, { "epoch": 1.2769953051643192, "grad_norm": 0.3690038025379181, "learning_rate": 7.263095938787522e-05, "loss": 0.6923, "step": 1088 }, { "epoch": 1.278169014084507, "grad_norm": 0.3716605007648468, "learning_rate": 7.251324308416716e-05, "loss": 0.6825, "step": 1089 }, { "epoch": 1.2793427230046948, "grad_norm": 0.3732469379901886, "learning_rate": 7.23955267804591e-05, "loss": 0.7064, "step": 1090 }, { "epoch": 1.2805164319248825, "grad_norm": 0.3982716500759125, "learning_rate": 7.227781047675103e-05, "loss": 0.7379, "step": 1091 }, { "epoch": 1.2816901408450705, "grad_norm": 0.36569979786872864, "learning_rate": 7.216009417304297e-05, "loss": 0.6309, "step": 1092 }, { "epoch": 1.2828638497652582, "grad_norm": 0.3752378523349762, "learning_rate": 7.204237786933492e-05, "loss": 0.655, "step": 1093 }, { "epoch": 1.284037558685446, "grad_norm": 0.3787980377674103, "learning_rate": 7.192466156562684e-05, "loss": 0.6503, "step": 1094 }, { "epoch": 1.2852112676056338, "grad_norm": 0.40723252296447754, "learning_rate": 7.180694526191878e-05, "loss": 0.6949, "step": 1095 }, { "epoch": 1.2863849765258215, "grad_norm": 0.36322662234306335, "learning_rate": 7.168922895821072e-05, "loss": 0.6924, "step": 1096 }, { "epoch": 1.2875586854460095, "grad_norm": 0.38535597920417786, "learning_rate": 7.157151265450265e-05, "loss": 0.6597, "step": 1097 }, { "epoch": 1.2887323943661972, "grad_norm": 0.3948807716369629, "learning_rate": 7.145379635079459e-05, "loss": 0.6594, "step": 1098 }, { "epoch": 1.289906103286385, "grad_norm": 0.3859221935272217, "learning_rate": 7.133608004708652e-05, "loss": 0.6894, "step": 1099 }, { "epoch": 1.2910798122065728, "grad_norm": 0.36739176511764526, "learning_rate": 
7.121836374337846e-05, "loss": 0.6726, "step": 1100 }, { "epoch": 1.290653384119543, "grad_norm": 1.054391622543335, "learning_rate": 7.125220458553792e-05, "loss": 0.8774, "step": 1101 }, { "epoch": 1.2918253735716378, "grad_norm": 1.0680177211761475, "learning_rate": 7.113462669018225e-05, "loss": 0.8413, "step": 1102 }, { "epoch": 1.2929973630237328, "grad_norm": 0.6065592169761658, "learning_rate": 7.101704879482658e-05, "loss": 0.7806, "step": 1103 }, { "epoch": 1.2941693524758278, "grad_norm": 0.6377708315849304, "learning_rate": 7.089947089947089e-05, "loss": 0.7703, "step": 1104 }, { "epoch": 1.2953413419279227, "grad_norm": 0.522957980632782, "learning_rate": 7.078189300411523e-05, "loss": 0.7668, "step": 1105 }, { "epoch": 1.2965133313800177, "grad_norm": 0.5710256099700928, "learning_rate": 7.066431510875956e-05, "loss": 0.7143, "step": 1106 }, { "epoch": 1.2976853208321124, "grad_norm": 0.5131927728652954, "learning_rate": 7.054673721340387e-05, "loss": 0.7151, "step": 1107 }, { "epoch": 1.2988573102842074, "grad_norm": 0.495702862739563, "learning_rate": 7.042915931804821e-05, "loss": 0.7108, "step": 1108 }, { "epoch": 1.3000292997363023, "grad_norm": 0.49212130904197693, "learning_rate": 7.031158142269254e-05, "loss": 0.711, "step": 1109 }, { "epoch": 1.3012012891883973, "grad_norm": 0.4861961305141449, "learning_rate": 7.019400352733686e-05, "loss": 0.7562, "step": 1110 }, { "epoch": 1.3023732786404922, "grad_norm": 0.4575994610786438, "learning_rate": 7.00764256319812e-05, "loss": 0.7036, "step": 1111 }, { "epoch": 1.3035452680925872, "grad_norm": 0.4313446283340454, "learning_rate": 6.995884773662552e-05, "loss": 0.7124, "step": 1112 }, { "epoch": 1.3047172575446822, "grad_norm": 0.4154439866542816, "learning_rate": 6.984126984126984e-05, "loss": 0.6674, "step": 1113 }, { "epoch": 1.305889246996777, "grad_norm": 0.4371664226055145, "learning_rate": 6.972369194591418e-05, "loss": 0.7066, "step": 1114 }, { "epoch": 1.307061236448872, "grad_norm": 
0.40978437662124634, "learning_rate": 6.96061140505585e-05, "loss": 0.6923, "step": 1115 }, { "epoch": 1.3082332259009668, "grad_norm": 0.39959609508514404, "learning_rate": 6.948853615520282e-05, "loss": 0.7224, "step": 1116 }, { "epoch": 1.3094052153530618, "grad_norm": 0.4188673794269562, "learning_rate": 6.937095825984715e-05, "loss": 0.727, "step": 1117 }, { "epoch": 1.3105772048051567, "grad_norm": 0.3936775028705597, "learning_rate": 6.925338036449149e-05, "loss": 0.6825, "step": 1118 }, { "epoch": 1.3117491942572517, "grad_norm": 0.39894506335258484, "learning_rate": 6.91358024691358e-05, "loss": 0.642, "step": 1119 }, { "epoch": 1.3129211837093466, "grad_norm": 0.4340292513370514, "learning_rate": 6.901822457378013e-05, "loss": 0.7259, "step": 1120 }, { "epoch": 1.3140931731614416, "grad_norm": 0.4102571904659271, "learning_rate": 6.890064667842447e-05, "loss": 0.6954, "step": 1121 }, { "epoch": 1.3152651626135365, "grad_norm": 0.4111669063568115, "learning_rate": 6.878306878306878e-05, "loss": 0.671, "step": 1122 }, { "epoch": 1.3164371520656313, "grad_norm": 0.4138496518135071, "learning_rate": 6.866549088771311e-05, "loss": 0.6387, "step": 1123 }, { "epoch": 1.3176091415177265, "grad_norm": 0.3867737054824829, "learning_rate": 6.854791299235745e-05, "loss": 0.6777, "step": 1124 }, { "epoch": 1.3187811309698212, "grad_norm": 0.40671873092651367, "learning_rate": 6.843033509700176e-05, "loss": 0.695, "step": 1125 }, { "epoch": 1.3199531204219161, "grad_norm": 0.3959422707557678, "learning_rate": 6.831275720164609e-05, "loss": 0.6755, "step": 1126 }, { "epoch": 1.321125109874011, "grad_norm": 0.3982046842575073, "learning_rate": 6.819517930629043e-05, "loss": 0.6898, "step": 1127 }, { "epoch": 1.322297099326106, "grad_norm": 0.3888896405696869, "learning_rate": 6.807760141093474e-05, "loss": 0.6595, "step": 1128 }, { "epoch": 1.323469088778201, "grad_norm": 0.4006032347679138, "learning_rate": 6.796002351557907e-05, "loss": 0.6882, "step": 1129 }, { 
"epoch": 1.324641078230296, "grad_norm": 0.37636417150497437, "learning_rate": 6.78424456202234e-05, "loss": 0.6689, "step": 1130 }, { "epoch": 1.325813067682391, "grad_norm": 0.3819728195667267, "learning_rate": 6.772486772486773e-05, "loss": 0.7133, "step": 1131 }, { "epoch": 1.3269850571344857, "grad_norm": 0.37200263142585754, "learning_rate": 6.760728982951205e-05, "loss": 0.6942, "step": 1132 }, { "epoch": 1.3281570465865808, "grad_norm": 0.3960074186325073, "learning_rate": 6.748971193415638e-05, "loss": 0.6271, "step": 1133 }, { "epoch": 1.3293290360386756, "grad_norm": 0.3755532205104828, "learning_rate": 6.737213403880071e-05, "loss": 0.6706, "step": 1134 }, { "epoch": 1.3305010254907705, "grad_norm": 0.387464314699173, "learning_rate": 6.725455614344503e-05, "loss": 0.6958, "step": 1135 }, { "epoch": 1.3316730149428655, "grad_norm": 0.3806421756744385, "learning_rate": 6.713697824808936e-05, "loss": 0.6928, "step": 1136 }, { "epoch": 1.3328450043949605, "grad_norm": 0.38345402479171753, "learning_rate": 6.701940035273369e-05, "loss": 0.7126, "step": 1137 }, { "epoch": 1.3340169938470554, "grad_norm": 0.3863559067249298, "learning_rate": 6.690182245737802e-05, "loss": 0.6914, "step": 1138 }, { "epoch": 1.3351889832991504, "grad_norm": 0.3891694247722626, "learning_rate": 6.678424456202234e-05, "loss": 0.6817, "step": 1139 }, { "epoch": 1.3363609727512453, "grad_norm": 0.4065246284008026, "learning_rate": 6.666666666666667e-05, "loss": 0.7028, "step": 1140 }, { "epoch": 1.33753296220334, "grad_norm": 0.38939380645751953, "learning_rate": 6.6549088771311e-05, "loss": 0.6528, "step": 1141 }, { "epoch": 1.3387049516554352, "grad_norm": 0.3831149935722351, "learning_rate": 6.643151087595532e-05, "loss": 0.675, "step": 1142 }, { "epoch": 1.33987694110753, "grad_norm": 0.4192376434803009, "learning_rate": 6.631393298059965e-05, "loss": 0.7131, "step": 1143 }, { "epoch": 1.341048930559625, "grad_norm": 0.38939863443374634, "learning_rate": 6.619635508524398e-05, 
"loss": 0.6696, "step": 1144 }, { "epoch": 1.3422209200117199, "grad_norm": 0.39097532629966736, "learning_rate": 6.60787771898883e-05, "loss": 0.6845, "step": 1145 }, { "epoch": 1.3433929094638148, "grad_norm": 0.39098918437957764, "learning_rate": 6.596119929453263e-05, "loss": 0.6643, "step": 1146 }, { "epoch": 1.3445648989159098, "grad_norm": 0.41722220182418823, "learning_rate": 6.584362139917696e-05, "loss": 0.6461, "step": 1147 }, { "epoch": 1.3457368883680048, "grad_norm": 0.40393978357315063, "learning_rate": 6.572604350382129e-05, "loss": 0.677, "step": 1148 }, { "epoch": 1.3469088778200997, "grad_norm": 0.37448999285697937, "learning_rate": 6.560846560846561e-05, "loss": 0.6534, "step": 1149 }, { "epoch": 1.3480808672721944, "grad_norm": 0.3823452889919281, "learning_rate": 6.549088771310994e-05, "loss": 0.6269, "step": 1150 }, { "epoch": 1.3492528567242894, "grad_norm": 0.4128420650959015, "learning_rate": 6.537330981775427e-05, "loss": 0.6659, "step": 1151 }, { "epoch": 1.3504248461763844, "grad_norm": 0.385935515165329, "learning_rate": 6.525573192239858e-05, "loss": 0.6501, "step": 1152 }, { "epoch": 1.3515968356284793, "grad_norm": 0.39446333050727844, "learning_rate": 6.513815402704292e-05, "loss": 0.6607, "step": 1153 }, { "epoch": 1.3527688250805743, "grad_norm": 0.39609313011169434, "learning_rate": 6.502057613168725e-05, "loss": 0.6408, "step": 1154 }, { "epoch": 1.3539408145326692, "grad_norm": 0.4118976891040802, "learning_rate": 6.490299823633156e-05, "loss": 0.6783, "step": 1155 }, { "epoch": 1.3551128039847642, "grad_norm": 0.39040106534957886, "learning_rate": 6.47854203409759e-05, "loss": 0.6996, "step": 1156 }, { "epoch": 1.3562847934368591, "grad_norm": 0.3837215304374695, "learning_rate": 6.466784244562023e-05, "loss": 0.6536, "step": 1157 }, { "epoch": 1.357456782888954, "grad_norm": 0.38842037320137024, "learning_rate": 6.455026455026454e-05, "loss": 0.675, "step": 1158 }, { "epoch": 1.3586287723410488, "grad_norm": 
0.3818652927875519, "learning_rate": 6.443268665490889e-05, "loss": 0.6821, "step": 1159 }, { "epoch": 1.3598007617931438, "grad_norm": 0.38457590341567993, "learning_rate": 6.431510875955321e-05, "loss": 0.6801, "step": 1160 }, { "epoch": 1.3609727512452388, "grad_norm": 0.40308085083961487, "learning_rate": 6.419753086419753e-05, "loss": 0.7282, "step": 1161 }, { "epoch": 1.3621447406973337, "grad_norm": 0.3923408091068268, "learning_rate": 6.407995296884187e-05, "loss": 0.6771, "step": 1162 }, { "epoch": 1.3633167301494287, "grad_norm": 0.35983577370643616, "learning_rate": 6.39623750734862e-05, "loss": 0.6684, "step": 1163 }, { "epoch": 1.3644887196015236, "grad_norm": 0.38956505060195923, "learning_rate": 6.384479717813051e-05, "loss": 0.6456, "step": 1164 }, { "epoch": 1.3656607090536186, "grad_norm": 0.3745352327823639, "learning_rate": 6.372721928277484e-05, "loss": 0.6628, "step": 1165 }, { "epoch": 1.3668326985057133, "grad_norm": 0.3648051619529724, "learning_rate": 6.360964138741918e-05, "loss": 0.6341, "step": 1166 }, { "epoch": 1.3680046879578085, "grad_norm": 0.3933015763759613, "learning_rate": 6.349206349206349e-05, "loss": 0.6862, "step": 1167 }, { "epoch": 1.3691766774099032, "grad_norm": 0.3905039429664612, "learning_rate": 6.337448559670782e-05, "loss": 0.6697, "step": 1168 }, { "epoch": 1.3703486668619982, "grad_norm": 0.39760053157806396, "learning_rate": 6.325690770135216e-05, "loss": 0.6903, "step": 1169 }, { "epoch": 1.3715206563140931, "grad_norm": 0.38109833002090454, "learning_rate": 6.313932980599647e-05, "loss": 0.6539, "step": 1170 }, { "epoch": 1.372692645766188, "grad_norm": 0.3650861978530884, "learning_rate": 6.30217519106408e-05, "loss": 0.6749, "step": 1171 }, { "epoch": 1.373864635218283, "grad_norm": 0.35274556279182434, "learning_rate": 6.290417401528514e-05, "loss": 0.6421, "step": 1172 }, { "epoch": 1.375036624670378, "grad_norm": 0.41130268573760986, "learning_rate": 6.278659611992945e-05, "loss": 0.7218, "step": 1173 }, 
{ "epoch": 1.376208614122473, "grad_norm": 0.3993924856185913, "learning_rate": 6.266901822457378e-05, "loss": 0.675, "step": 1174 }, { "epoch": 1.3773806035745677, "grad_norm": 0.3782753348350525, "learning_rate": 6.255144032921812e-05, "loss": 0.6316, "step": 1175 }, { "epoch": 1.3785525930266629, "grad_norm": 0.3624318242073059, "learning_rate": 6.243386243386243e-05, "loss": 0.65, "step": 1176 }, { "epoch": 1.3797245824787576, "grad_norm": 0.3888125717639923, "learning_rate": 6.231628453850676e-05, "loss": 0.7022, "step": 1177 }, { "epoch": 1.3808965719308526, "grad_norm": 0.3882753252983093, "learning_rate": 6.219870664315109e-05, "loss": 0.6866, "step": 1178 }, { "epoch": 1.3820685613829475, "grad_norm": 0.39087679982185364, "learning_rate": 6.208112874779542e-05, "loss": 0.6398, "step": 1179 }, { "epoch": 1.3832405508350425, "grad_norm": 0.40074875950813293, "learning_rate": 6.196355085243974e-05, "loss": 0.6446, "step": 1180 }, { "epoch": 1.3844125402871374, "grad_norm": 0.40180960297584534, "learning_rate": 6.184597295708407e-05, "loss": 0.6994, "step": 1181 }, { "epoch": 1.3855845297392324, "grad_norm": 0.38567396998405457, "learning_rate": 6.17283950617284e-05, "loss": 0.6659, "step": 1182 }, { "epoch": 1.3867565191913274, "grad_norm": 0.3690516948699951, "learning_rate": 6.161081716637272e-05, "loss": 0.6203, "step": 1183 }, { "epoch": 1.387928508643422, "grad_norm": 0.4022299349308014, "learning_rate": 6.149323927101705e-05, "loss": 0.6684, "step": 1184 }, { "epoch": 1.3891004980955173, "grad_norm": 0.384766161441803, "learning_rate": 6.137566137566138e-05, "loss": 0.6941, "step": 1185 }, { "epoch": 1.390272487547612, "grad_norm": 0.3742985427379608, "learning_rate": 6.12580834803057e-05, "loss": 0.6947, "step": 1186 }, { "epoch": 1.391444476999707, "grad_norm": 0.40857595205307007, "learning_rate": 6.114050558495003e-05, "loss": 0.6637, "step": 1187 }, { "epoch": 1.392616466451802, "grad_norm": 0.3824119567871094, "learning_rate": 
6.102292768959435e-05, "loss": 0.6357, "step": 1188 }, { "epoch": 1.3937884559038969, "grad_norm": 0.39211392402648926, "learning_rate": 6.0905349794238687e-05, "loss": 0.644, "step": 1189 }, { "epoch": 1.3949604453559918, "grad_norm": 0.36606866121292114, "learning_rate": 6.0787771898883014e-05, "loss": 0.6388, "step": 1190 }, { "epoch": 1.3961324348080868, "grad_norm": 0.38364294171333313, "learning_rate": 6.0670194003527334e-05, "loss": 0.6561, "step": 1191 }, { "epoch": 1.3973044242601818, "grad_norm": 0.377848356962204, "learning_rate": 6.055261610817167e-05, "loss": 0.6916, "step": 1192 }, { "epoch": 1.3984764137122765, "grad_norm": 0.413018137216568, "learning_rate": 6.0435038212815995e-05, "loss": 0.6752, "step": 1193 }, { "epoch": 1.3996484031643714, "grad_norm": 0.3800462782382965, "learning_rate": 6.0317460317460316e-05, "loss": 0.6916, "step": 1194 }, { "epoch": 1.4008203926164664, "grad_norm": 0.38262689113616943, "learning_rate": 6.019988242210465e-05, "loss": 0.632, "step": 1195 }, { "epoch": 1.4019923820685614, "grad_norm": 0.37755706906318665, "learning_rate": 6.0082304526748977e-05, "loss": 0.608, "step": 1196 }, { "epoch": 1.4031643715206563, "grad_norm": 0.4054010510444641, "learning_rate": 5.99647266313933e-05, "loss": 0.6952, "step": 1197 }, { "epoch": 1.4043363609727513, "grad_norm": 0.38393253087997437, "learning_rate": 5.984714873603763e-05, "loss": 0.6742, "step": 1198 }, { "epoch": 1.4055083504248462, "grad_norm": 0.3715971112251282, "learning_rate": 5.972957084068196e-05, "loss": 0.6944, "step": 1199 }, { "epoch": 1.4066803398769412, "grad_norm": 0.3809766471385956, "learning_rate": 5.961199294532628e-05, "loss": 0.69, "step": 1200 }, { "epoch": 1.4078523293290361, "grad_norm": 0.42656511068344116, "learning_rate": 5.9494415049970606e-05, "loss": 0.6761, "step": 1201 }, { "epoch": 1.4090243187811309, "grad_norm": 0.39485815167427063, "learning_rate": 5.937683715461494e-05, "loss": 0.6581, "step": 1202 }, { "epoch": 1.4101963082332258, 
"grad_norm": 0.39925625920295715, "learning_rate": 5.925925925925926e-05, "loss": 0.6785, "step": 1203 }, { "epoch": 1.4113682976853208, "grad_norm": 0.38978663086891174, "learning_rate": 5.914168136390359e-05, "loss": 0.6814, "step": 1204 }, { "epoch": 1.4125402871374158, "grad_norm": 0.37807661294937134, "learning_rate": 5.902410346854792e-05, "loss": 0.6872, "step": 1205 }, { "epoch": 1.4137122765895107, "grad_norm": 0.3912755846977234, "learning_rate": 5.890652557319224e-05, "loss": 0.6343, "step": 1206 }, { "epoch": 1.4148842660416057, "grad_norm": 0.39116203784942627, "learning_rate": 5.878894767783657e-05, "loss": 0.63, "step": 1207 }, { "epoch": 1.4160562554937006, "grad_norm": 0.4052892029285431, "learning_rate": 5.86713697824809e-05, "loss": 0.6698, "step": 1208 }, { "epoch": 1.4172282449457954, "grad_norm": 0.3755436837673187, "learning_rate": 5.855379188712522e-05, "loss": 0.6966, "step": 1209 }, { "epoch": 1.4184002343978905, "grad_norm": 0.37595874071121216, "learning_rate": 5.843621399176955e-05, "loss": 0.6879, "step": 1210 }, { "epoch": 1.4195722238499853, "grad_norm": 0.4029812514781952, "learning_rate": 5.8318636096413884e-05, "loss": 0.6469, "step": 1211 }, { "epoch": 1.4207442133020802, "grad_norm": 0.37568360567092896, "learning_rate": 5.82010582010582e-05, "loss": 0.663, "step": 1212 }, { "epoch": 1.4219162027541752, "grad_norm": 0.38563036918640137, "learning_rate": 5.808348030570253e-05, "loss": 0.6849, "step": 1213 }, { "epoch": 1.4230881922062701, "grad_norm": 0.3693836033344269, "learning_rate": 5.796590241034686e-05, "loss": 0.6502, "step": 1214 }, { "epoch": 1.424260181658365, "grad_norm": 0.3790949583053589, "learning_rate": 5.784832451499118e-05, "loss": 0.6848, "step": 1215 }, { "epoch": 1.42543217111046, "grad_norm": 0.379332572221756, "learning_rate": 5.773074661963551e-05, "loss": 0.642, "step": 1216 }, { "epoch": 1.426604160562555, "grad_norm": 0.38053902983665466, "learning_rate": 5.761316872427984e-05, "loss": 0.7017, "step": 
1217 }, { "epoch": 1.4277761500146497, "grad_norm": 0.39065247774124146, "learning_rate": 5.749559082892416e-05, "loss": 0.6959, "step": 1218 }, { "epoch": 1.428948139466745, "grad_norm": 0.40035319328308105, "learning_rate": 5.7378012933568494e-05, "loss": 0.6378, "step": 1219 }, { "epoch": 1.4301201289188397, "grad_norm": 0.44022828340530396, "learning_rate": 5.726043503821282e-05, "loss": 0.6805, "step": 1220 }, { "epoch": 1.4312921183709346, "grad_norm": 0.37635087966918945, "learning_rate": 5.714285714285714e-05, "loss": 0.6437, "step": 1221 }, { "epoch": 1.4324641078230296, "grad_norm": 0.3882914185523987, "learning_rate": 5.7025279247501476e-05, "loss": 0.6468, "step": 1222 }, { "epoch": 1.4336360972751245, "grad_norm": 0.3925718665122986, "learning_rate": 5.69077013521458e-05, "loss": 0.6516, "step": 1223 }, { "epoch": 1.4348080867272195, "grad_norm": 0.37410497665405273, "learning_rate": 5.679012345679012e-05, "loss": 0.6963, "step": 1224 }, { "epoch": 1.4359800761793144, "grad_norm": 0.42080360651016235, "learning_rate": 5.667254556143445e-05, "loss": 0.6386, "step": 1225 }, { "epoch": 1.4371520656314094, "grad_norm": 0.41487985849380493, "learning_rate": 5.6554967666078784e-05, "loss": 0.6914, "step": 1226 }, { "epoch": 1.4383240550835041, "grad_norm": 0.36109688878059387, "learning_rate": 5.6437389770723105e-05, "loss": 0.6672, "step": 1227 }, { "epoch": 1.4394960445355993, "grad_norm": 0.37526097893714905, "learning_rate": 5.631981187536743e-05, "loss": 0.6437, "step": 1228 }, { "epoch": 1.440668033987694, "grad_norm": 0.38677671551704407, "learning_rate": 5.6202233980011766e-05, "loss": 0.6432, "step": 1229 }, { "epoch": 1.441840023439789, "grad_norm": 0.37004703283309937, "learning_rate": 5.6084656084656086e-05, "loss": 0.6873, "step": 1230 }, { "epoch": 1.443012012891884, "grad_norm": 0.363845556974411, "learning_rate": 5.596707818930041e-05, "loss": 0.6388, "step": 1231 }, { "epoch": 1.444184002343979, "grad_norm": 0.3900223970413208, 
"learning_rate": 5.584950029394475e-05, "loss": 0.636, "step": 1232 }, { "epoch": 1.4453559917960739, "grad_norm": 0.37807023525238037, "learning_rate": 5.573192239858907e-05, "loss": 0.6699, "step": 1233 }, { "epoch": 1.4465279812481688, "grad_norm": 0.3589508533477783, "learning_rate": 5.5614344503233395e-05, "loss": 0.6256, "step": 1234 }, { "epoch": 1.4476999707002638, "grad_norm": 0.3793289065361023, "learning_rate": 5.549676660787773e-05, "loss": 0.66, "step": 1235 }, { "epoch": 1.4488719601523585, "grad_norm": 0.38588500022888184, "learning_rate": 5.537918871252204e-05, "loss": 0.6795, "step": 1236 }, { "epoch": 1.4500439496044535, "grad_norm": 0.3903558552265167, "learning_rate": 5.5261610817166376e-05, "loss": 0.6871, "step": 1237 }, { "epoch": 1.4512159390565484, "grad_norm": 0.3721795082092285, "learning_rate": 5.51440329218107e-05, "loss": 0.6524, "step": 1238 }, { "epoch": 1.4523879285086434, "grad_norm": 0.41447609663009644, "learning_rate": 5.5026455026455024e-05, "loss": 0.6711, "step": 1239 }, { "epoch": 1.4535599179607384, "grad_norm": 0.3805668354034424, "learning_rate": 5.490887713109936e-05, "loss": 0.6467, "step": 1240 }, { "epoch": 1.4547319074128333, "grad_norm": 0.3742830157279968, "learning_rate": 5.4791299235743685e-05, "loss": 0.6497, "step": 1241 }, { "epoch": 1.4559038968649283, "grad_norm": 0.3923674523830414, "learning_rate": 5.4673721340388005e-05, "loss": 0.6446, "step": 1242 }, { "epoch": 1.4570758863170232, "grad_norm": 0.4189533591270447, "learning_rate": 5.455614344503234e-05, "loss": 0.688, "step": 1243 }, { "epoch": 1.4582478757691182, "grad_norm": 0.38281819224357605, "learning_rate": 5.4438565549676666e-05, "loss": 0.6543, "step": 1244 }, { "epoch": 1.459419865221213, "grad_norm": 0.3858702480792999, "learning_rate": 5.4320987654320986e-05, "loss": 0.6822, "step": 1245 }, { "epoch": 1.4605918546733079, "grad_norm": 0.41707929968833923, "learning_rate": 5.420340975896532e-05, "loss": 0.6261, "step": 1246 }, { "epoch": 
1.4617638441254028, "grad_norm": 0.36858686804771423, "learning_rate": 5.408583186360965e-05, "loss": 0.6457, "step": 1247 }, { "epoch": 1.4629358335774978, "grad_norm": 0.4082486033439636, "learning_rate": 5.396825396825397e-05, "loss": 0.6932, "step": 1248 }, { "epoch": 1.4641078230295927, "grad_norm": 0.3880460858345032, "learning_rate": 5.3850676072898295e-05, "loss": 0.6646, "step": 1249 }, { "epoch": 1.4652798124816877, "grad_norm": 0.3867412209510803, "learning_rate": 5.373309817754263e-05, "loss": 0.7426, "step": 1250 }, { "epoch": 1.4664518019337827, "grad_norm": 0.3632020354270935, "learning_rate": 5.361552028218695e-05, "loss": 0.6446, "step": 1251 }, { "epoch": 1.4676237913858774, "grad_norm": 0.36393246054649353, "learning_rate": 5.3497942386831277e-05, "loss": 0.6056, "step": 1252 }, { "epoch": 1.4687957808379726, "grad_norm": 0.37027665972709656, "learning_rate": 5.338036449147561e-05, "loss": 0.672, "step": 1253 }, { "epoch": 1.4699677702900673, "grad_norm": 0.35585132241249084, "learning_rate": 5.326278659611993e-05, "loss": 0.696, "step": 1254 }, { "epoch": 1.4711397597421623, "grad_norm": 0.39298853278160095, "learning_rate": 5.314520870076426e-05, "loss": 0.667, "step": 1255 }, { "epoch": 1.4723117491942572, "grad_norm": 0.3782164454460144, "learning_rate": 5.302763080540859e-05, "loss": 0.6258, "step": 1256 }, { "epoch": 1.4734837386463522, "grad_norm": 0.38333067297935486, "learning_rate": 5.291005291005291e-05, "loss": 0.666, "step": 1257 }, { "epoch": 1.4746557280984471, "grad_norm": 0.40639209747314453, "learning_rate": 5.279247501469724e-05, "loss": 0.6612, "step": 1258 }, { "epoch": 1.475827717550542, "grad_norm": 0.38118046522140503, "learning_rate": 5.267489711934157e-05, "loss": 0.6475, "step": 1259 }, { "epoch": 1.476999707002637, "grad_norm": 0.3647833466529846, "learning_rate": 5.255731922398589e-05, "loss": 0.6594, "step": 1260 }, { "epoch": 1.4781716964547318, "grad_norm": 0.3835331201553345, "learning_rate": 
5.243974132863022e-05, "loss": 0.6208, "step": 1261 }, { "epoch": 1.479343685906827, "grad_norm": 0.3909193277359009, "learning_rate": 5.232216343327455e-05, "loss": 0.6997, "step": 1262 }, { "epoch": 1.4805156753589217, "grad_norm": 0.377839595079422, "learning_rate": 5.220458553791887e-05, "loss": 0.6687, "step": 1263 }, { "epoch": 1.4816876648110167, "grad_norm": 0.3836138844490051, "learning_rate": 5.20870076425632e-05, "loss": 0.6636, "step": 1264 }, { "epoch": 1.4828596542631116, "grad_norm": 0.37317249178886414, "learning_rate": 5.196942974720753e-05, "loss": 0.6793, "step": 1265 }, { "epoch": 1.4840316437152066, "grad_norm": 0.385690838098526, "learning_rate": 5.185185185185185e-05, "loss": 0.6517, "step": 1266 }, { "epoch": 1.4852036331673015, "grad_norm": 0.37199866771698, "learning_rate": 5.1734273956496184e-05, "loss": 0.6403, "step": 1267 }, { "epoch": 1.4863756226193965, "grad_norm": 0.393108606338501, "learning_rate": 5.161669606114051e-05, "loss": 0.7115, "step": 1268 }, { "epoch": 1.4875476120714914, "grad_norm": 0.40995368361473083, "learning_rate": 5.149911816578483e-05, "loss": 0.6859, "step": 1269 }, { "epoch": 1.4887196015235862, "grad_norm": 0.35813668370246887, "learning_rate": 5.1381540270429165e-05, "loss": 0.6265, "step": 1270 }, { "epoch": 1.4898915909756814, "grad_norm": 0.3874257504940033, "learning_rate": 5.126396237507349e-05, "loss": 0.6407, "step": 1271 }, { "epoch": 1.491063580427776, "grad_norm": 0.39107558131217957, "learning_rate": 5.114638447971781e-05, "loss": 0.6787, "step": 1272 }, { "epoch": 1.492235569879871, "grad_norm": 0.38405880331993103, "learning_rate": 5.102880658436214e-05, "loss": 0.6422, "step": 1273 }, { "epoch": 1.493407559331966, "grad_norm": 0.40102508664131165, "learning_rate": 5.0911228689006474e-05, "loss": 0.6573, "step": 1274 }, { "epoch": 1.494579548784061, "grad_norm": 0.4192945659160614, "learning_rate": 5.0793650793650794e-05, "loss": 0.6181, "step": 1275 }, { "epoch": 1.495751538236156, 
"grad_norm": 0.3956759572029114, "learning_rate": 5.067607289829512e-05, "loss": 0.6583, "step": 1276 }, { "epoch": 1.4969235276882509, "grad_norm": 0.4155990183353424, "learning_rate": 5.0558495002939455e-05, "loss": 0.6849, "step": 1277 }, { "epoch": 1.4980955171403458, "grad_norm": 0.390337198972702, "learning_rate": 5.0440917107583776e-05, "loss": 0.7172, "step": 1278 }, { "epoch": 1.4992675065924406, "grad_norm": 0.39508169889450073, "learning_rate": 5.03233392122281e-05, "loss": 0.6638, "step": 1279 }, { "epoch": 1.5004394960445357, "grad_norm": 0.3898437023162842, "learning_rate": 5.020576131687244e-05, "loss": 0.6085, "step": 1280 }, { "epoch": 1.5016114854966305, "grad_norm": 0.37632593512535095, "learning_rate": 5.008818342151676e-05, "loss": 0.6722, "step": 1281 }, { "epoch": 1.5027834749487254, "grad_norm": 0.39309123158454895, "learning_rate": 4.9970605526161084e-05, "loss": 0.7213, "step": 1282 }, { "epoch": 1.5039554644008204, "grad_norm": 0.3973393440246582, "learning_rate": 4.985302763080541e-05, "loss": 0.6851, "step": 1283 }, { "epoch": 1.5051274538529154, "grad_norm": 0.3938598930835724, "learning_rate": 4.973544973544973e-05, "loss": 0.6219, "step": 1284 }, { "epoch": 1.5062994433050103, "grad_norm": 0.38015884160995483, "learning_rate": 4.9617871840094066e-05, "loss": 0.7159, "step": 1285 }, { "epoch": 1.507471432757105, "grad_norm": 0.3672783076763153, "learning_rate": 4.950029394473839e-05, "loss": 0.6958, "step": 1286 }, { "epoch": 1.5086434222092002, "grad_norm": 0.3825698494911194, "learning_rate": 4.938271604938271e-05, "loss": 0.6572, "step": 1287 }, { "epoch": 1.509815411661295, "grad_norm": 0.38162460923194885, "learning_rate": 4.926513815402705e-05, "loss": 0.6262, "step": 1288 }, { "epoch": 1.5109874011133901, "grad_norm": 0.3847755789756775, "learning_rate": 4.9147560258671374e-05, "loss": 0.6155, "step": 1289 }, { "epoch": 1.5121593905654849, "grad_norm": 0.3806037902832031, "learning_rate": 4.9029982363315695e-05, "loss": 0.6606, 
"step": 1290 }, { "epoch": 1.5133313800175798, "grad_norm": 0.38434869050979614, "learning_rate": 4.891240446796003e-05, "loss": 0.687, "step": 1291 }, { "epoch": 1.5145033694696748, "grad_norm": 0.39996469020843506, "learning_rate": 4.879482657260435e-05, "loss": 0.6714, "step": 1292 }, { "epoch": 1.5156753589217697, "grad_norm": 0.37494775652885437, "learning_rate": 4.8677248677248676e-05, "loss": 0.669, "step": 1293 }, { "epoch": 1.5168473483738647, "grad_norm": 0.3752374053001404, "learning_rate": 4.855967078189301e-05, "loss": 0.6388, "step": 1294 }, { "epoch": 1.5180193378259594, "grad_norm": 0.36477306485176086, "learning_rate": 4.844209288653733e-05, "loss": 0.6531, "step": 1295 }, { "epoch": 1.5191913272780546, "grad_norm": 0.36245569586753845, "learning_rate": 4.832451499118166e-05, "loss": 0.6685, "step": 1296 }, { "epoch": 1.5203633167301494, "grad_norm": 0.355896919965744, "learning_rate": 4.820693709582599e-05, "loss": 0.6594, "step": 1297 }, { "epoch": 1.5215353061822443, "grad_norm": 0.3616688549518585, "learning_rate": 4.808935920047031e-05, "loss": 0.6425, "step": 1298 }, { "epoch": 1.5227072956343393, "grad_norm": 0.3755030632019043, "learning_rate": 4.797178130511464e-05, "loss": 0.6472, "step": 1299 }, { "epoch": 1.5238792850864342, "grad_norm": 0.3720192015171051, "learning_rate": 4.7854203409758966e-05, "loss": 0.6426, "step": 1300 }, { "epoch": 1.5250512745385292, "grad_norm": 0.3859194219112396, "learning_rate": 4.773662551440329e-05, "loss": 0.6385, "step": 1301 }, { "epoch": 1.5262232639906241, "grad_norm": 0.37670567631721497, "learning_rate": 4.761904761904762e-05, "loss": 0.6411, "step": 1302 }, { "epoch": 1.527395253442719, "grad_norm": 0.3701159358024597, "learning_rate": 4.750146972369195e-05, "loss": 0.6767, "step": 1303 }, { "epoch": 1.5285672428948138, "grad_norm": 0.38531991839408875, "learning_rate": 4.7383891828336275e-05, "loss": 0.6948, "step": 1304 }, { "epoch": 1.529739232346909, "grad_norm": 0.4071838855743408, 
"learning_rate": 4.72663139329806e-05, "loss": 0.6502, "step": 1305 }, { "epoch": 1.5309112217990037, "grad_norm": 0.38243693113327026, "learning_rate": 4.714873603762493e-05, "loss": 0.6327, "step": 1306 }, { "epoch": 1.5320832112510987, "grad_norm": 0.37889015674591064, "learning_rate": 4.7031158142269256e-05, "loss": 0.6182, "step": 1307 }, { "epoch": 1.5332552007031937, "grad_norm": 0.36006277799606323, "learning_rate": 4.691358024691358e-05, "loss": 0.6053, "step": 1308 }, { "epoch": 1.5344271901552886, "grad_norm": 0.39384907484054565, "learning_rate": 4.679600235155791e-05, "loss": 0.6609, "step": 1309 }, { "epoch": 1.5355991796073836, "grad_norm": 0.38327935338020325, "learning_rate": 4.667842445620224e-05, "loss": 0.6948, "step": 1310 }, { "epoch": 1.5367711690594783, "grad_norm": 0.38430511951446533, "learning_rate": 4.656084656084656e-05, "loss": 0.6313, "step": 1311 }, { "epoch": 1.5379431585115735, "grad_norm": 0.3972114324569702, "learning_rate": 4.644326866549089e-05, "loss": 0.6396, "step": 1312 }, { "epoch": 1.5391151479636682, "grad_norm": 0.4078161120414734, "learning_rate": 4.632569077013522e-05, "loss": 0.6885, "step": 1313 }, { "epoch": 1.5402871374157634, "grad_norm": 0.3729688227176666, "learning_rate": 4.620811287477954e-05, "loss": 0.6438, "step": 1314 }, { "epoch": 1.5414591268678581, "grad_norm": 0.3863990902900696, "learning_rate": 4.609053497942387e-05, "loss": 0.6424, "step": 1315 }, { "epoch": 1.542631116319953, "grad_norm": 0.3734455406665802, "learning_rate": 4.5972957084068194e-05, "loss": 0.6927, "step": 1316 }, { "epoch": 1.543803105772048, "grad_norm": 0.3718372583389282, "learning_rate": 4.585537918871252e-05, "loss": 0.6325, "step": 1317 }, { "epoch": 1.544975095224143, "grad_norm": 0.36497369408607483, "learning_rate": 4.5737801293356855e-05, "loss": 0.6187, "step": 1318 }, { "epoch": 1.546147084676238, "grad_norm": 0.38579320907592773, "learning_rate": 4.5620223398001175e-05, "loss": 0.6754, "step": 1319 }, { "epoch": 
1.5473190741283327, "grad_norm": 0.374126136302948, "learning_rate": 4.55026455026455e-05, "loss": 0.6472, "step": 1320 }, { "epoch": 1.5484910635804279, "grad_norm": 0.38020315766334534, "learning_rate": 4.5385067607289836e-05, "loss": 0.672, "step": 1321 }, { "epoch": 1.5496630530325226, "grad_norm": 0.38782334327697754, "learning_rate": 4.5267489711934157e-05, "loss": 0.7069, "step": 1322 }, { "epoch": 1.5508350424846178, "grad_norm": 0.36838847398757935, "learning_rate": 4.5149911816578484e-05, "loss": 0.6567, "step": 1323 }, { "epoch": 1.5520070319367125, "grad_norm": 0.37560224533081055, "learning_rate": 4.503233392122281e-05, "loss": 0.6638, "step": 1324 }, { "epoch": 1.5531790213888075, "grad_norm": 0.3812626898288727, "learning_rate": 4.491475602586714e-05, "loss": 0.6841, "step": 1325 }, { "epoch": 1.5543510108409024, "grad_norm": 0.37894850969314575, "learning_rate": 4.4797178130511465e-05, "loss": 0.6505, "step": 1326 }, { "epoch": 1.5555230002929974, "grad_norm": 0.3598770499229431, "learning_rate": 4.467960023515579e-05, "loss": 0.651, "step": 1327 }, { "epoch": 1.5566949897450924, "grad_norm": 0.4064580798149109, "learning_rate": 4.456202233980012e-05, "loss": 0.6593, "step": 1328 }, { "epoch": 1.557866979197187, "grad_norm": 0.3867085576057434, "learning_rate": 4.4444444444444447e-05, "loss": 0.6189, "step": 1329 }, { "epoch": 1.5590389686492823, "grad_norm": 0.3712484836578369, "learning_rate": 4.4326866549088774e-05, "loss": 0.6563, "step": 1330 }, { "epoch": 1.560210958101377, "grad_norm": 0.3924928605556488, "learning_rate": 4.42092886537331e-05, "loss": 0.6168, "step": 1331 }, { "epoch": 1.5613829475534722, "grad_norm": 0.36495327949523926, "learning_rate": 4.409171075837743e-05, "loss": 0.6584, "step": 1332 }, { "epoch": 1.562554937005567, "grad_norm": 0.37702497839927673, "learning_rate": 4.3974132863021755e-05, "loss": 0.6638, "step": 1333 }, { "epoch": 1.5637269264576619, "grad_norm": 0.3681667149066925, "learning_rate": 
4.385655496766608e-05, "loss": 0.6122, "step": 1334 }, { "epoch": 1.5648989159097568, "grad_norm": 0.37110069394111633, "learning_rate": 4.37389770723104e-05, "loss": 0.6791, "step": 1335 }, { "epoch": 1.5660709053618518, "grad_norm": 0.3588317036628723, "learning_rate": 4.3621399176954737e-05, "loss": 0.616, "step": 1336 }, { "epoch": 1.5672428948139467, "grad_norm": 0.39171895384788513, "learning_rate": 4.3503821281599064e-05, "loss": 0.6859, "step": 1337 }, { "epoch": 1.5684148842660415, "grad_norm": 0.39346379041671753, "learning_rate": 4.3386243386243384e-05, "loss": 0.6833, "step": 1338 }, { "epoch": 1.5695868737181367, "grad_norm": 0.37361201643943787, "learning_rate": 4.326866549088772e-05, "loss": 0.637, "step": 1339 }, { "epoch": 1.5707588631702314, "grad_norm": 0.38074547052383423, "learning_rate": 4.315108759553204e-05, "loss": 0.6672, "step": 1340 }, { "epoch": 1.5719308526223263, "grad_norm": 0.36969512701034546, "learning_rate": 4.3033509700176366e-05, "loss": 0.6363, "step": 1341 }, { "epoch": 1.5731028420744213, "grad_norm": 0.36250782012939453, "learning_rate": 4.29159318048207e-05, "loss": 0.6194, "step": 1342 }, { "epoch": 1.5742748315265163, "grad_norm": 0.3632318377494812, "learning_rate": 4.279835390946502e-05, "loss": 0.6229, "step": 1343 }, { "epoch": 1.5754468209786112, "grad_norm": 0.37329503893852234, "learning_rate": 4.268077601410935e-05, "loss": 0.6614, "step": 1344 }, { "epoch": 1.5766188104307062, "grad_norm": 0.3797610402107239, "learning_rate": 4.256319811875368e-05, "loss": 0.6454, "step": 1345 }, { "epoch": 1.5777907998828011, "grad_norm": 0.38016003370285034, "learning_rate": 4.2445620223398e-05, "loss": 0.6362, "step": 1346 }, { "epoch": 1.5789627893348959, "grad_norm": 0.3744441568851471, "learning_rate": 4.232804232804233e-05, "loss": 0.6523, "step": 1347 }, { "epoch": 1.580134778786991, "grad_norm": 0.38999882340431213, "learning_rate": 4.2210464432686656e-05, "loss": 0.6723, "step": 1348 }, { "epoch": 1.5813067682390858, 
"grad_norm": 0.40173429250717163, "learning_rate": 4.209288653733098e-05, "loss": 0.6375, "step": 1349 }, { "epoch": 1.5824787576911807, "grad_norm": 0.38676032423973083, "learning_rate": 4.197530864197531e-05, "loss": 0.6853, "step": 1350 }, { "epoch": 1.5836507471432757, "grad_norm": 0.38119667768478394, "learning_rate": 4.185773074661964e-05, "loss": 0.6388, "step": 1351 }, { "epoch": 1.5848227365953707, "grad_norm": 0.3793523907661438, "learning_rate": 4.1740152851263964e-05, "loss": 0.6269, "step": 1352 }, { "epoch": 1.5859947260474656, "grad_norm": 0.3863115608692169, "learning_rate": 4.162257495590829e-05, "loss": 0.7104, "step": 1353 }, { "epoch": 1.5871667154995603, "grad_norm": 0.3677123785018921, "learning_rate": 4.150499706055262e-05, "loss": 0.6581, "step": 1354 }, { "epoch": 1.5883387049516555, "grad_norm": 0.39373883605003357, "learning_rate": 4.1387419165196946e-05, "loss": 0.6768, "step": 1355 }, { "epoch": 1.5895106944037503, "grad_norm": 0.39006727933883667, "learning_rate": 4.126984126984127e-05, "loss": 0.6636, "step": 1356 }, { "epoch": 1.5906826838558454, "grad_norm": 0.37299513816833496, "learning_rate": 4.11522633744856e-05, "loss": 0.6205, "step": 1357 }, { "epoch": 1.5918546733079402, "grad_norm": 0.374833345413208, "learning_rate": 4.103468547912993e-05, "loss": 0.6734, "step": 1358 }, { "epoch": 1.5930266627600351, "grad_norm": 0.38301679491996765, "learning_rate": 4.091710758377425e-05, "loss": 0.6709, "step": 1359 }, { "epoch": 1.59419865221213, "grad_norm": 0.35621005296707153, "learning_rate": 4.079952968841858e-05, "loss": 0.6523, "step": 1360 }, { "epoch": 1.595370641664225, "grad_norm": 0.3874707520008087, "learning_rate": 4.068195179306291e-05, "loss": 0.6615, "step": 1361 }, { "epoch": 1.59654263111632, "grad_norm": 0.39570900797843933, "learning_rate": 4.056437389770723e-05, "loss": 0.647, "step": 1362 }, { "epoch": 1.5977146205684147, "grad_norm": 0.38016220927238464, "learning_rate": 4.044679600235156e-05, "loss": 0.662, 
"step": 1363 }, { "epoch": 1.59888661002051, "grad_norm": 0.37574926018714905, "learning_rate": 4.032921810699588e-05, "loss": 0.6636, "step": 1364 }, { "epoch": 1.6000585994726046, "grad_norm": 0.40126630663871765, "learning_rate": 4.021164021164021e-05, "loss": 0.6081, "step": 1365 }, { "epoch": 1.6012305889246998, "grad_norm": 0.3685883581638336, "learning_rate": 4.0094062316284544e-05, "loss": 0.6364, "step": 1366 }, { "epoch": 1.6024025783767946, "grad_norm": 0.409110426902771, "learning_rate": 3.9976484420928865e-05, "loss": 0.7165, "step": 1367 }, { "epoch": 1.6035745678288895, "grad_norm": 0.39460107684135437, "learning_rate": 3.985890652557319e-05, "loss": 0.6756, "step": 1368 }, { "epoch": 1.6047465572809845, "grad_norm": 0.3784719407558441, "learning_rate": 3.9741328630217526e-05, "loss": 0.6578, "step": 1369 }, { "epoch": 1.6059185467330794, "grad_norm": 0.36832788586616516, "learning_rate": 3.9623750734861846e-05, "loss": 0.6372, "step": 1370 }, { "epoch": 1.6070905361851744, "grad_norm": 0.3807730972766876, "learning_rate": 3.950617283950617e-05, "loss": 0.6015, "step": 1371 }, { "epoch": 1.6082625256372691, "grad_norm": 0.3825012445449829, "learning_rate": 3.93885949441505e-05, "loss": 0.6982, "step": 1372 }, { "epoch": 1.6094345150893643, "grad_norm": 0.37073245644569397, "learning_rate": 3.927101704879483e-05, "loss": 0.6201, "step": 1373 }, { "epoch": 1.610606504541459, "grad_norm": 0.38484933972358704, "learning_rate": 3.9153439153439155e-05, "loss": 0.6671, "step": 1374 }, { "epoch": 1.6117784939935542, "grad_norm": 0.38665077090263367, "learning_rate": 3.903586125808348e-05, "loss": 0.6841, "step": 1375 }, { "epoch": 1.612950483445649, "grad_norm": 0.3957137167453766, "learning_rate": 3.891828336272781e-05, "loss": 0.6734, "step": 1376 }, { "epoch": 1.614122472897744, "grad_norm": 0.3841049075126648, "learning_rate": 3.8800705467372136e-05, "loss": 0.642, "step": 1377 }, { "epoch": 1.6152944623498389, "grad_norm": 0.3854089081287384, 
"learning_rate": 3.868312757201646e-05, "loss": 0.6292, "step": 1378 }, { "epoch": 1.6164664518019338, "grad_norm": 0.37589502334594727, "learning_rate": 3.856554967666079e-05, "loss": 0.6337, "step": 1379 }, { "epoch": 1.6176384412540288, "grad_norm": 0.3719007968902588, "learning_rate": 3.844797178130512e-05, "loss": 0.6396, "step": 1380 }, { "epoch": 1.6188104307061235, "grad_norm": 0.4034193158149719, "learning_rate": 3.8330393885949445e-05, "loss": 0.6406, "step": 1381 }, { "epoch": 1.6199824201582187, "grad_norm": 0.39838656783103943, "learning_rate": 3.821281599059377e-05, "loss": 0.674, "step": 1382 }, { "epoch": 1.6211544096103134, "grad_norm": 0.3712717592716217, "learning_rate": 3.809523809523809e-05, "loss": 0.6493, "step": 1383 }, { "epoch": 1.6223263990624084, "grad_norm": 0.38980063796043396, "learning_rate": 3.7977660199882426e-05, "loss": 0.6588, "step": 1384 }, { "epoch": 1.6234983885145033, "grad_norm": 0.37699729204177856, "learning_rate": 3.786008230452675e-05, "loss": 0.6591, "step": 1385 }, { "epoch": 1.6246703779665983, "grad_norm": 0.3779825270175934, "learning_rate": 3.7742504409171074e-05, "loss": 0.6293, "step": 1386 }, { "epoch": 1.6258423674186933, "grad_norm": 0.40519246459007263, "learning_rate": 3.762492651381541e-05, "loss": 0.6035, "step": 1387 }, { "epoch": 1.6270143568707882, "grad_norm": 0.3740384876728058, "learning_rate": 3.750734861845973e-05, "loss": 0.6495, "step": 1388 }, { "epoch": 1.6281863463228832, "grad_norm": 0.393038272857666, "learning_rate": 3.7389770723104055e-05, "loss": 0.6375, "step": 1389 }, { "epoch": 1.629358335774978, "grad_norm": 0.3672095835208893, "learning_rate": 3.727219282774839e-05, "loss": 0.6262, "step": 1390 }, { "epoch": 1.630530325227073, "grad_norm": 0.36593756079673767, "learning_rate": 3.715461493239271e-05, "loss": 0.6107, "step": 1391 }, { "epoch": 1.6317023146791678, "grad_norm": 0.38510987162590027, "learning_rate": 3.7037037037037037e-05, "loss": 0.666, "step": 1392 }, { "epoch": 
1.6328743041312628, "grad_norm": 0.37760478258132935, "learning_rate": 3.691945914168137e-05, "loss": 0.6047, "step": 1393 }, { "epoch": 1.6340462935833577, "grad_norm": 0.39161157608032227, "learning_rate": 3.680188124632569e-05, "loss": 0.6353, "step": 1394 }, { "epoch": 1.6352182830354527, "grad_norm": 0.4048272669315338, "learning_rate": 3.668430335097002e-05, "loss": 0.6406, "step": 1395 }, { "epoch": 1.6363902724875476, "grad_norm": 0.3909299671649933, "learning_rate": 3.6566725455614345e-05, "loss": 0.6602, "step": 1396 }, { "epoch": 1.6375622619396424, "grad_norm": 0.3780452609062195, "learning_rate": 3.644914756025867e-05, "loss": 0.6563, "step": 1397 }, { "epoch": 1.6387342513917376, "grad_norm": 0.415984183549881, "learning_rate": 3.6331569664903e-05, "loss": 0.6956, "step": 1398 }, { "epoch": 1.6399062408438323, "grad_norm": 0.39058148860931396, "learning_rate": 3.6213991769547327e-05, "loss": 0.618, "step": 1399 }, { "epoch": 1.6410782302959275, "grad_norm": 0.3820166289806366, "learning_rate": 3.6096413874191654e-05, "loss": 0.6871, "step": 1400 }, { "epoch": 1.6422502197480222, "grad_norm": 0.3850885331630707, "learning_rate": 3.597883597883598e-05, "loss": 0.6564, "step": 1401 }, { "epoch": 1.6434222092001172, "grad_norm": 0.36710235476493835, "learning_rate": 3.586125808348031e-05, "loss": 0.6465, "step": 1402 }, { "epoch": 1.6445941986522121, "grad_norm": 0.37936750054359436, "learning_rate": 3.5743680188124635e-05, "loss": 0.6541, "step": 1403 }, { "epoch": 1.645766188104307, "grad_norm": 0.4197543263435364, "learning_rate": 3.562610229276896e-05, "loss": 0.7147, "step": 1404 }, { "epoch": 1.646938177556402, "grad_norm": 0.39201852679252625, "learning_rate": 3.550852439741329e-05, "loss": 0.6829, "step": 1405 }, { "epoch": 1.6481101670084968, "grad_norm": 0.3781983554363251, "learning_rate": 3.539094650205762e-05, "loss": 0.6482, "step": 1406 }, { "epoch": 1.649282156460592, "grad_norm": 0.3771427869796753, "learning_rate": 3.527336860670194e-05, 
"loss": 0.6526, "step": 1407 }, { "epoch": 1.6504541459126867, "grad_norm": 0.38876059651374817, "learning_rate": 3.515579071134627e-05, "loss": 0.6731, "step": 1408 }, { "epoch": 1.6516261353647819, "grad_norm": 0.38884153962135315, "learning_rate": 3.50382128159906e-05, "loss": 0.6311, "step": 1409 }, { "epoch": 1.6527981248168766, "grad_norm": 0.39201685786247253, "learning_rate": 3.492063492063492e-05, "loss": 0.6902, "step": 1410 }, { "epoch": 1.6539701142689716, "grad_norm": 0.37352827191352844, "learning_rate": 3.480305702527925e-05, "loss": 0.6938, "step": 1411 }, { "epoch": 1.6551421037210665, "grad_norm": 0.37906309962272644, "learning_rate": 3.468547912992357e-05, "loss": 0.6818, "step": 1412 }, { "epoch": 1.6563140931731615, "grad_norm": 0.3947876989841461, "learning_rate": 3.45679012345679e-05, "loss": 0.6529, "step": 1413 }, { "epoch": 1.6574860826252564, "grad_norm": 0.3685528635978699, "learning_rate": 3.4450323339212234e-05, "loss": 0.6282, "step": 1414 }, { "epoch": 1.6586580720773512, "grad_norm": 0.3796926438808441, "learning_rate": 3.4332745443856554e-05, "loss": 0.6472, "step": 1415 }, { "epoch": 1.6598300615294463, "grad_norm": 0.3854847848415375, "learning_rate": 3.421516754850088e-05, "loss": 0.6531, "step": 1416 }, { "epoch": 1.661002050981541, "grad_norm": 0.3648832440376282, "learning_rate": 3.4097589653145215e-05, "loss": 0.6203, "step": 1417 }, { "epoch": 1.6621740404336363, "grad_norm": 0.37867122888565063, "learning_rate": 3.3980011757789536e-05, "loss": 0.657, "step": 1418 }, { "epoch": 1.663346029885731, "grad_norm": 0.3739066421985626, "learning_rate": 3.386243386243386e-05, "loss": 0.6555, "step": 1419 }, { "epoch": 1.664518019337826, "grad_norm": 0.3743496537208557, "learning_rate": 3.374485596707819e-05, "loss": 0.6406, "step": 1420 }, { "epoch": 1.665690008789921, "grad_norm": 0.38990095257759094, "learning_rate": 3.362727807172252e-05, "loss": 0.6006, "step": 1421 }, { "epoch": 1.6668619982420159, "grad_norm": 
0.38039836287498474, "learning_rate": 3.3509700176366844e-05, "loss": 0.6363, "step": 1422 }, { "epoch": 1.6680339876941108, "grad_norm": 0.36417829990386963, "learning_rate": 3.339212228101117e-05, "loss": 0.6366, "step": 1423 }, { "epoch": 1.6692059771462056, "grad_norm": 0.37726080417633057, "learning_rate": 3.32745443856555e-05, "loss": 0.6574, "step": 1424 }, { "epoch": 1.6703779665983007, "grad_norm": 0.3663460910320282, "learning_rate": 3.3156966490299826e-05, "loss": 0.6306, "step": 1425 }, { "epoch": 1.6715499560503955, "grad_norm": 0.4453365206718445, "learning_rate": 3.303938859494415e-05, "loss": 0.6846, "step": 1426 }, { "epoch": 1.6727219455024904, "grad_norm": 0.3600301742553711, "learning_rate": 3.292181069958848e-05, "loss": 0.6361, "step": 1427 }, { "epoch": 1.6738939349545854, "grad_norm": 0.3683728575706482, "learning_rate": 3.280423280423281e-05, "loss": 0.6304, "step": 1428 }, { "epoch": 1.6750659244066803, "grad_norm": 0.4204603135585785, "learning_rate": 3.2686654908877134e-05, "loss": 0.6582, "step": 1429 }, { "epoch": 1.6762379138587753, "grad_norm": 0.3554556667804718, "learning_rate": 3.256907701352146e-05, "loss": 0.6506, "step": 1430 }, { "epoch": 1.6774099033108703, "grad_norm": 0.39126551151275635, "learning_rate": 3.245149911816578e-05, "loss": 0.6441, "step": 1431 }, { "epoch": 1.6785818927629652, "grad_norm": 0.37401652336120605, "learning_rate": 3.2333921222810116e-05, "loss": 0.6569, "step": 1432 }, { "epoch": 1.67975388221506, "grad_norm": 0.38046538829803467, "learning_rate": 3.221634332745444e-05, "loss": 0.642, "step": 1433 }, { "epoch": 1.6809258716671551, "grad_norm": 0.375357061624527, "learning_rate": 3.209876543209876e-05, "loss": 0.66, "step": 1434 }, { "epoch": 1.6820978611192499, "grad_norm": 0.36403539776802063, "learning_rate": 3.19811875367431e-05, "loss": 0.6493, "step": 1435 }, { "epoch": 1.6832698505713448, "grad_norm": 0.3747311234474182, "learning_rate": 3.186360964138742e-05, "loss": 0.6422, "step": 1436 }, 
{ "epoch": 1.6844418400234398, "grad_norm": 0.38533565402030945, "learning_rate": 3.1746031746031745e-05, "loss": 0.6381, "step": 1437 }, { "epoch": 1.6856138294755347, "grad_norm": 0.38410523533821106, "learning_rate": 3.162845385067608e-05, "loss": 0.6695, "step": 1438 }, { "epoch": 1.6867858189276297, "grad_norm": 0.37452232837677, "learning_rate": 3.15108759553204e-05, "loss": 0.6305, "step": 1439 }, { "epoch": 1.6879578083797244, "grad_norm": 0.3752080500125885, "learning_rate": 3.1393298059964726e-05, "loss": 0.6486, "step": 1440 }, { "epoch": 1.6891297978318196, "grad_norm": 0.384127676486969, "learning_rate": 3.127572016460906e-05, "loss": 0.628, "step": 1441 }, { "epoch": 1.6903017872839143, "grad_norm": 0.3655785024166107, "learning_rate": 3.115814226925338e-05, "loss": 0.6325, "step": 1442 }, { "epoch": 1.6914737767360095, "grad_norm": 0.37582463026046753, "learning_rate": 3.104056437389771e-05, "loss": 0.6641, "step": 1443 }, { "epoch": 1.6926457661881043, "grad_norm": 0.3942627012729645, "learning_rate": 3.0922986478542035e-05, "loss": 0.6314, "step": 1444 }, { "epoch": 1.6938177556401992, "grad_norm": 0.38446322083473206, "learning_rate": 3.080540858318636e-05, "loss": 0.6482, "step": 1445 }, { "epoch": 1.6949897450922942, "grad_norm": 0.3782792091369629, "learning_rate": 3.068783068783069e-05, "loss": 0.666, "step": 1446 }, { "epoch": 1.6961617345443891, "grad_norm": 0.3792760372161865, "learning_rate": 3.0570252792475016e-05, "loss": 0.6424, "step": 1447 }, { "epoch": 1.697333723996484, "grad_norm": 0.4009881615638733, "learning_rate": 3.0452674897119343e-05, "loss": 0.6803, "step": 1448 }, { "epoch": 1.6985057134485788, "grad_norm": 0.3797784149646759, "learning_rate": 3.0335097001763667e-05, "loss": 0.6243, "step": 1449 }, { "epoch": 1.699677702900674, "grad_norm": 0.37865757942199707, "learning_rate": 3.0217519106407998e-05, "loss": 0.6354, "step": 1450 }, { "epoch": 1.7008496923527687, "grad_norm": 0.38236260414123535, "learning_rate": 
3.0099941211052325e-05, "loss": 0.6653, "step": 1451 }, { "epoch": 1.702021681804864, "grad_norm": 0.37716102600097656, "learning_rate": 2.998236331569665e-05, "loss": 0.6721, "step": 1452 }, { "epoch": 1.7031936712569586, "grad_norm": 0.38444986939430237, "learning_rate": 2.986478542034098e-05, "loss": 0.6269, "step": 1453 }, { "epoch": 1.7043656607090536, "grad_norm": 0.3900532126426697, "learning_rate": 2.9747207524985303e-05, "loss": 0.6657, "step": 1454 }, { "epoch": 1.7055376501611486, "grad_norm": 0.3843390643596649, "learning_rate": 2.962962962962963e-05, "loss": 0.6847, "step": 1455 }, { "epoch": 1.7067096396132435, "grad_norm": 0.3660966753959656, "learning_rate": 2.951205173427396e-05, "loss": 0.6667, "step": 1456 }, { "epoch": 1.7078816290653385, "grad_norm": 0.3676696717739105, "learning_rate": 2.9394473838918284e-05, "loss": 0.6643, "step": 1457 }, { "epoch": 1.7090536185174332, "grad_norm": 0.3827989101409912, "learning_rate": 2.927689594356261e-05, "loss": 0.6637, "step": 1458 }, { "epoch": 1.7102256079695284, "grad_norm": 0.3773452639579773, "learning_rate": 2.9159318048206942e-05, "loss": 0.6432, "step": 1459 }, { "epoch": 1.7113975974216231, "grad_norm": 0.39179813861846924, "learning_rate": 2.9041740152851266e-05, "loss": 0.6571, "step": 1460 }, { "epoch": 1.7125695868737183, "grad_norm": 0.3847814202308655, "learning_rate": 2.892416225749559e-05, "loss": 0.6611, "step": 1461 }, { "epoch": 1.713741576325813, "grad_norm": 0.3790866732597351, "learning_rate": 2.880658436213992e-05, "loss": 0.6731, "step": 1462 }, { "epoch": 1.714913565777908, "grad_norm": 0.37719854712486267, "learning_rate": 2.8689006466784247e-05, "loss": 0.6456, "step": 1463 }, { "epoch": 1.716085555230003, "grad_norm": 0.3641086220741272, "learning_rate": 2.857142857142857e-05, "loss": 0.6548, "step": 1464 }, { "epoch": 1.717257544682098, "grad_norm": 0.39434361457824707, "learning_rate": 2.84538506760729e-05, "loss": 0.6535, "step": 1465 }, { "epoch": 1.7184295341341929, 
"grad_norm": 0.36238378286361694, "learning_rate": 2.8336272780717225e-05, "loss": 0.6303, "step": 1466 }, { "epoch": 1.7196015235862876, "grad_norm": 0.3763822615146637, "learning_rate": 2.8218694885361552e-05, "loss": 0.7062, "step": 1467 }, { "epoch": 1.7207735130383828, "grad_norm": 0.37111690640449524, "learning_rate": 2.8101116990005883e-05, "loss": 0.6468, "step": 1468 }, { "epoch": 1.7219455024904775, "grad_norm": 0.3747517764568329, "learning_rate": 2.7983539094650207e-05, "loss": 0.663, "step": 1469 }, { "epoch": 1.7231174919425725, "grad_norm": 0.3822534382343292, "learning_rate": 2.7865961199294534e-05, "loss": 0.6306, "step": 1470 }, { "epoch": 1.7242894813946674, "grad_norm": 0.36476072669029236, "learning_rate": 2.7748383303938864e-05, "loss": 0.6283, "step": 1471 }, { "epoch": 1.7254614708467624, "grad_norm": 0.36173558235168457, "learning_rate": 2.7630805408583188e-05, "loss": 0.618, "step": 1472 }, { "epoch": 1.7266334602988573, "grad_norm": 0.36746352910995483, "learning_rate": 2.7513227513227512e-05, "loss": 0.6457, "step": 1473 }, { "epoch": 1.7278054497509523, "grad_norm": 0.37200871109962463, "learning_rate": 2.7395649617871842e-05, "loss": 0.6204, "step": 1474 }, { "epoch": 1.7289774392030473, "grad_norm": 0.3685344159603119, "learning_rate": 2.727807172251617e-05, "loss": 0.6303, "step": 1475 }, { "epoch": 1.730149428655142, "grad_norm": 0.37158438563346863, "learning_rate": 2.7160493827160493e-05, "loss": 0.6278, "step": 1476 }, { "epoch": 1.7313214181072372, "grad_norm": 0.3743583559989929, "learning_rate": 2.7042915931804824e-05, "loss": 0.6088, "step": 1477 }, { "epoch": 1.732493407559332, "grad_norm": 0.37238809466362, "learning_rate": 2.6925338036449148e-05, "loss": 0.6456, "step": 1478 }, { "epoch": 1.7336653970114269, "grad_norm": 0.3718564808368683, "learning_rate": 2.6807760141093475e-05, "loss": 0.6123, "step": 1479 }, { "epoch": 1.7348373864635218, "grad_norm": 0.3883103132247925, "learning_rate": 2.6690182245737805e-05, "loss": 
0.6628, "step": 1480 }, { "epoch": 1.7360093759156168, "grad_norm": 0.376810759305954, "learning_rate": 2.657260435038213e-05, "loss": 0.6465, "step": 1481 }, { "epoch": 1.7371813653677117, "grad_norm": 0.41317370533943176, "learning_rate": 2.6455026455026456e-05, "loss": 0.6835, "step": 1482 }, { "epoch": 1.7383533548198065, "grad_norm": 0.3698785603046417, "learning_rate": 2.6337448559670787e-05, "loss": 0.6174, "step": 1483 }, { "epoch": 1.7395253442719016, "grad_norm": 0.3769229054450989, "learning_rate": 2.621987066431511e-05, "loss": 0.6402, "step": 1484 }, { "epoch": 1.7406973337239964, "grad_norm": 0.3612512946128845, "learning_rate": 2.6102292768959434e-05, "loss": 0.6457, "step": 1485 }, { "epoch": 1.7418693231760916, "grad_norm": 0.36678725481033325, "learning_rate": 2.5984714873603765e-05, "loss": 0.6247, "step": 1486 }, { "epoch": 1.7430413126281863, "grad_norm": 0.39542651176452637, "learning_rate": 2.5867136978248092e-05, "loss": 0.6294, "step": 1487 }, { "epoch": 1.7442133020802812, "grad_norm": 0.3900982439517975, "learning_rate": 2.5749559082892416e-05, "loss": 0.675, "step": 1488 }, { "epoch": 1.7453852915323762, "grad_norm": 0.3719351887702942, "learning_rate": 2.5631981187536746e-05, "loss": 0.6492, "step": 1489 }, { "epoch": 1.7465572809844712, "grad_norm": 0.37937313318252563, "learning_rate": 2.551440329218107e-05, "loss": 0.6232, "step": 1490 }, { "epoch": 1.7477292704365661, "grad_norm": 0.39128735661506653, "learning_rate": 2.5396825396825397e-05, "loss": 0.6404, "step": 1491 }, { "epoch": 1.7489012598886609, "grad_norm": 0.3798913359642029, "learning_rate": 2.5279247501469728e-05, "loss": 0.6588, "step": 1492 }, { "epoch": 1.750073249340756, "grad_norm": 0.3671737015247345, "learning_rate": 2.516166960611405e-05, "loss": 0.671, "step": 1493 }, { "epoch": 1.7512452387928508, "grad_norm": 0.3778979182243347, "learning_rate": 2.504409171075838e-05, "loss": 0.6157, "step": 1494 }, { "epoch": 1.752417228244946, "grad_norm": 
0.38095155358314514, "learning_rate": 2.4926513815402706e-05, "loss": 0.6674, "step": 1495 }, { "epoch": 1.7535892176970407, "grad_norm": 0.39539647102355957, "learning_rate": 2.4808935920047033e-05, "loss": 0.6457, "step": 1496 }, { "epoch": 1.7547612071491356, "grad_norm": 0.3683115541934967, "learning_rate": 2.4691358024691357e-05, "loss": 0.6207, "step": 1497 }, { "epoch": 1.7559331966012306, "grad_norm": 0.38240090012550354, "learning_rate": 2.4573780129335687e-05, "loss": 0.6593, "step": 1498 }, { "epoch": 1.7571051860533256, "grad_norm": 0.36555352807044983, "learning_rate": 2.4456202233980014e-05, "loss": 0.6818, "step": 1499 }, { "epoch": 1.7582771755054205, "grad_norm": 0.3794335722923279, "learning_rate": 2.4338624338624338e-05, "loss": 0.6481, "step": 1500 } ], "logging_steps": 1, "max_steps": 1706, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 2.2263110020544594e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }