diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4186 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8212414090449476, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0000000000000004e-08, + "loss": 2.1648, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 7.500000000000001e-08, + "loss": 2.131, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.2500000000000002e-07, + "loss": 1.9325, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 1.7500000000000002e-07, + "loss": 1.8743, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 2.2500000000000002e-07, + "loss": 1.8232, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 2.75e-07, + "loss": 1.7315, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 3.25e-07, + "loss": 1.656, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 3.75e-07, + "loss": 1.6538, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 4.2500000000000006e-07, + "loss": 1.5483, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 4.7500000000000006e-07, + "loss": 1.5073, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 5.250000000000001e-07, + "loss": 1.501, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 5.750000000000001e-07, + "loss": 1.4804, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 6.25e-07, + "loss": 1.4357, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 6.750000000000001e-07, + "loss": 1.424, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 7.25e-07, + "loss": 1.4579, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 7.750000000000001e-07, + "loss": 1.4185, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 8.250000000000001e-07, + "loss": 1.4141, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 8.75e-07, + "loss": 1.4098, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 9.25e-07, + "loss": 1.4144, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 9.7e-07, + "loss": 1.3644, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.02e-06, + "loss": 1.3524, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 1.0700000000000001e-06, + "loss": 1.3403, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 1.12e-06, + "loss": 1.3355, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 1.1700000000000002e-06, + "loss": 1.3448, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 1.2200000000000002e-06, + "loss": 1.322, + "step": 250 + }, + { + "epoch": 0.04, + "learning_rate": 1.2700000000000001e-06, + "loss": 1.3186, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 1.32e-06, + "loss": 1.3038, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 1.3700000000000002e-06, + "loss": 1.2853, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 1.42e-06, + "loss": 1.2939, + "step": 290 + }, + { + "epoch": 0.05, + "learning_rate": 1.4700000000000001e-06, + "loss": 1.2918, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 1.52e-06, + "loss": 1.2976, + "step": 310 + }, + { + "epoch": 0.05, + "learning_rate": 1.5700000000000002e-06, + "loss": 1.3128, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 1.6200000000000002e-06, + "loss": 1.2433, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 1.6700000000000003e-06, + "loss": 1.2978, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 1.72e-06, + "loss": 1.2964, + "step": 350 + }, + { + "epoch": 0.06, + "learning_rate": 1.77e-06, + "loss": 1.2625, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 1.8200000000000002e-06, + "loss": 1.2837, + "step": 370 + }, + { + "epoch": 0.06, + "learning_rate": 1.87e-06, + "loss": 1.2995, + "step": 380 + }, + { + "epoch": 0.06, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.2706, + "step": 390 + }, + { + "epoch": 0.07, + "learning_rate": 1.97e-06, + "loss": 1.2819, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 2.02e-06, + "loss": 1.2522, + "step": 410 + }, + { + "epoch": 0.07, + "learning_rate": 2.07e-06, + "loss": 1.2955, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 2.12e-06, + "loss": 1.2506, + "step": 430 + }, + { + "epoch": 0.07, + "learning_rate": 2.17e-06, + "loss": 1.249, + "step": 440 + }, + { + "epoch": 0.07, + "learning_rate": 2.2200000000000003e-06, + "loss": 1.2413, + "step": 450 + }, + { + "epoch": 0.08, + "learning_rate": 2.2700000000000003e-06, + "loss": 1.2463, + "step": 460 + }, + { + "epoch": 0.08, + "learning_rate": 2.3200000000000002e-06, + "loss": 1.288, + "step": 470 + }, + { + "epoch": 0.08, + "learning_rate": 2.37e-06, + "loss": 1.2531, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 2.42e-06, + "loss": 1.2314, + "step": 490 + }, + { + "epoch": 0.08, + "learning_rate": 2.47e-06, + "loss": 1.2369, + "step": 500 + }, + { + "epoch": 0.08, + "eval_multi_news_accuracy": 0.5592306537314586, + "eval_multi_news_loss": 1.919921875, + "eval_multi_news_runtime": 374.7444, + "eval_multi_news_samples_per_second": 15.002, + "eval_multi_news_steps_per_second": 1.876, + "step": 500 + }, + { + "epoch": 0.08, + "eval_samsum_accuracy": 0.630043040249728, + "eval_samsum_loss": 1.3271484375, + "eval_samsum_runtime": 37.5336, + "eval_samsum_samples_per_second": 21.794, + "eval_samsum_steps_per_second": 2.744, + "step": 500 + }, + { + "epoch": 0.08, + "eval_billsum_accuracy": 0.6415100921468554, + "eval_billsum_loss": 1.4970703125, + "eval_billsum_runtime": 204.4286, + "eval_billsum_samples_per_second": 15.991, + "eval_billsum_steps_per_second": 2.001, + "step": 500 + }, + { + "epoch": 0.08, + "eval_wmt2019_zh-en_accuracy": 0.5844479239374446, + "eval_wmt2019_zh-en_loss": 1.89453125, + "eval_wmt2019_zh-en_runtime": 43.2897, + "eval_wmt2019_zh-en_samples_per_second": 23.1, + "eval_wmt2019_zh-en_steps_per_second": 2.888, + "step": 500 + }, + { + "epoch": 0.08, + "eval_ted_trans_en-ja_accuracy": 0.5366497079329188, + "eval_ted_trans_en-ja_loss": 2.01953125, + "eval_ted_trans_en-ja_runtime": 36.4641, + "eval_ted_trans_en-ja_samples_per_second": 21.967, + "eval_ted_trans_en-ja_steps_per_second": 2.77, + "step": 500 + }, + { + "epoch": 0.08, + "eval_ted_trans_zh-ja_accuracy": 0.44175365344467643, + "eval_ted_trans_zh-ja_loss": 2.703125, + "eval_ted_trans_zh-ja_runtime": 2.2264, + "eval_ted_trans_zh-ja_samples_per_second": 18.864, + "eval_ted_trans_zh-ja_steps_per_second": 2.695, + "step": 500 + }, + { + "epoch": 0.08, + "eval_sharegpt_accuracy": 0.7056496488080175, + "eval_sharegpt_loss": 1.1474609375, + "eval_sharegpt_runtime": 735.7691, + "eval_sharegpt_samples_per_second": 4.55, + "eval_sharegpt_steps_per_second": 0.569, + "step": 500 + }, + { + "epoch": 0.08, + "eval_dolly15k_accuracy": 0.5961999725877193, + "eval_dolly15k_loss": 1.6650390625, + "eval_dolly15k_runtime": 33.9484, + "eval_dolly15k_samples_per_second": 22.122, + "eval_dolly15k_steps_per_second": 2.769, + "step": 500 + }, + { + "epoch": 0.08, + "eval_ikala_accuracy": 0.7306054447586751, + "eval_ikala_loss": 1.0380859375, + "eval_ikala_runtime": 887.5903, + "eval_ikala_samples_per_second": 16.005, + "eval_ikala_steps_per_second": 2.001, + "step": 500 + }, + { + "epoch": 0.08, + "eval_oasst_export_accuracy": 0.656822117898619, + "eval_oasst_export_loss": 1.60546875, + "eval_oasst_export_runtime": 134.1688, + "eval_oasst_export_samples_per_second": 15.644, + "eval_oasst_export_steps_per_second": 1.96, + "step": 500 + }, + { + "epoch": 0.08, + "eval_joke_accuracy": 0.48218347232752085, + "eval_joke_loss": 2.29296875, + "eval_joke_runtime": 3.5706, + "eval_joke_samples_per_second": 21.285, + "eval_joke_steps_per_second": 2.801, + "step": 500 + }, + { + "epoch": 0.08, + "eval_gsm8k_accuracy": 0.7402563310685608, + "eval_gsm8k_loss": 1.0068359375, + "eval_gsm8k_runtime": 56.8505, + "eval_gsm8k_samples_per_second": 23.201, + "eval_gsm8k_steps_per_second": 2.902, + "step": 500 + }, + { + "epoch": 0.08, + "eval_webgpt_accuracy": 0.4973525539337287, + "eval_webgpt_loss": 2.21484375, + "eval_webgpt_runtime": 155.091, + "eval_webgpt_samples_per_second": 22.974, + "eval_webgpt_steps_per_second": 2.876, + "step": 500 + }, + { + "epoch": 0.08, + "learning_rate": 2.52e-06, + "loss": 1.2409, + "step": 510 + }, + { + "epoch": 0.09, + "learning_rate": 2.5700000000000004e-06, + "loss": 1.2076, + "step": 520 + }, + { + "epoch": 0.09, + "learning_rate": 2.6200000000000003e-06, + "loss": 1.2425, + "step": 530 + }, + { + "epoch": 0.09, + "learning_rate": 2.6700000000000003e-06, + "loss": 1.267, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.238, + "step": 550 + }, + { + "epoch": 0.09, + "learning_rate": 2.7700000000000006e-06, + "loss": 1.2176, + "step": 560 + }, + { + "epoch": 0.09, + "learning_rate": 2.82e-06, + "loss": 1.2168, + "step": 570 + }, + { + "epoch": 0.1, + "learning_rate": 2.87e-06, + "loss": 1.2262, + "step": 580 + }, + { + "epoch": 0.1, + "learning_rate": 2.92e-06, + "loss": 1.2125, + "step": 590 + }, + { + "epoch": 0.1, + "learning_rate": 2.97e-06, + "loss": 1.2092, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 3.0200000000000003e-06, + "loss": 1.2521, + "step": 610 + }, + { + "epoch": 0.1, + "learning_rate": 3.0700000000000003e-06, + "loss": 1.2297, + "step": 620 + }, + { + "epoch": 0.1, + "learning_rate": 3.12e-06, + "loss": 1.2317, + "step": 630 + }, + { + "epoch": 0.11, + "learning_rate": 3.17e-06, + "loss": 1.2225, + "step": 640 + }, + { + "epoch": 0.11, + "learning_rate": 3.2200000000000005e-06, + "loss": 1.2227, + "step": 650 + }, + { + "epoch": 0.11, + "learning_rate": 3.2700000000000005e-06, + "loss": 1.2172, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 3.3200000000000004e-06, + "loss": 1.22, + "step": 670 + }, + { + "epoch": 0.11, + "learning_rate": 3.3700000000000003e-06, + "loss": 1.2164, + "step": 680 + }, + { + "epoch": 0.11, + "learning_rate": 3.4200000000000007e-06, + "loss": 1.2045, + "step": 690 + }, + { + "epoch": 0.11, + "learning_rate": 3.4700000000000002e-06, + "loss": 1.2334, + "step": 700 + }, + { + "epoch": 0.12, + "learning_rate": 3.52e-06, + "loss": 1.1979, + "step": 710 + }, + { + "epoch": 0.12, + "learning_rate": 3.57e-06, + "loss": 1.2066, + "step": 720 + }, + { + "epoch": 0.12, + "learning_rate": 3.62e-06, + "loss": 1.2153, + "step": 730 + }, + { + "epoch": 0.12, + "learning_rate": 3.6700000000000004e-06, + "loss": 1.2246, + "step": 740 + }, + { + "epoch": 0.12, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.2027, + "step": 750 + }, + { + "epoch": 0.12, + "learning_rate": 3.7700000000000003e-06, + "loss": 1.233, + "step": 760 + }, + { + "epoch": 0.13, + "learning_rate": 3.820000000000001e-06, + "loss": 1.2156, + "step": 770 + }, + { + "epoch": 0.13, + "learning_rate": 3.87e-06, + "loss": 1.2067, + "step": 780 + }, + { + "epoch": 0.13, + "learning_rate": 3.920000000000001e-06, + "loss": 1.2077, + "step": 790 + }, + { + "epoch": 0.13, + "learning_rate": 3.97e-06, + "loss": 1.184, + "step": 800 + }, + { + "epoch": 0.13, + "learning_rate": 4.0200000000000005e-06, + "loss": 1.1747, + "step": 810 + }, + { + "epoch": 0.13, + "learning_rate": 4.07e-06, + "loss": 1.2055, + "step": 820 + }, + { + "epoch": 0.14, + "learning_rate": 4.12e-06, + "loss": 1.2137, + "step": 830 + }, + { + "epoch": 0.14, + "learning_rate": 4.17e-06, + "loss": 1.1934, + "step": 840 + }, + { + "epoch": 0.14, + "learning_rate": 4.22e-06, + "loss": 1.2154, + "step": 850 + }, + { + "epoch": 0.14, + "learning_rate": 4.270000000000001e-06, + "loss": 1.2216, + "step": 860 + }, + { + "epoch": 0.14, + "learning_rate": 4.32e-06, + "loss": 1.2002, + "step": 870 + }, + { + "epoch": 0.14, + "learning_rate": 4.3700000000000005e-06, + "loss": 1.1698, + "step": 880 + }, + { + "epoch": 0.15, + "learning_rate": 4.42e-06, + "loss": 1.2006, + "step": 890 + }, + { + "epoch": 0.15, + "learning_rate": 4.47e-06, + "loss": 1.1706, + "step": 900 + }, + { + "epoch": 0.15, + "learning_rate": 4.520000000000001e-06, + "loss": 1.1898, + "step": 910 + }, + { + "epoch": 0.15, + "learning_rate": 4.57e-06, + "loss": 1.1941, + "step": 920 + }, + { + "epoch": 0.15, + "learning_rate": 4.620000000000001e-06, + "loss": 1.1978, + "step": 930 + }, + { + "epoch": 0.15, + "learning_rate": 4.670000000000001e-06, + "loss": 1.1871, + "step": 940 + }, + { + "epoch": 0.16, + "learning_rate": 4.7200000000000005e-06, + "loss": 1.1673, + "step": 950 + }, + { + "epoch": 0.16, + "learning_rate": 4.77e-06, + "loss": 1.1938, + "step": 960 + }, + { + "epoch": 0.16, + "learning_rate": 4.8200000000000004e-06, + "loss": 1.1601, + "step": 970 + }, + { + "epoch": 0.16, + "learning_rate": 4.87e-06, + "loss": 1.1815, + "step": 980 + }, + { + "epoch": 0.16, + "learning_rate": 4.92e-06, + "loss": 1.1985, + "step": 990 + }, + { + "epoch": 0.16, + "learning_rate": 4.970000000000001e-06, + "loss": 1.1755, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_multi_news_accuracy": 0.5616533126883595, + "eval_multi_news_loss": 1.9033203125, + "eval_multi_news_runtime": 374.666, + "eval_multi_news_samples_per_second": 15.005, + "eval_multi_news_steps_per_second": 1.876, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_samsum_accuracy": 0.6358605685096722, + "eval_samsum_loss": 1.2763671875, + "eval_samsum_runtime": 36.4854, + "eval_samsum_samples_per_second": 22.42, + "eval_samsum_steps_per_second": 2.823, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_billsum_accuracy": 0.645555269329641, + "eval_billsum_loss": 1.466796875, + "eval_billsum_runtime": 205.3486, + "eval_billsum_samples_per_second": 15.919, + "eval_billsum_steps_per_second": 1.992, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_wmt2019_zh-en_accuracy": 0.5821662271706222, + "eval_wmt2019_zh-en_loss": 1.908203125, + "eval_wmt2019_zh-en_runtime": 42.6249, + "eval_wmt2019_zh-en_samples_per_second": 23.46, + "eval_wmt2019_zh-en_steps_per_second": 2.933, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_ted_trans_en-ja_accuracy": 0.5513235961740165, + "eval_ted_trans_en-ja_loss": 1.9208984375, + "eval_ted_trans_en-ja_runtime": 35.6003, + "eval_ted_trans_en-ja_samples_per_second": 22.5, + "eval_ted_trans_en-ja_steps_per_second": 2.837, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_ted_trans_zh-ja_accuracy": 0.4552332912988651, + "eval_ted_trans_zh-ja_loss": 2.595703125, + "eval_ted_trans_zh-ja_runtime": 2.6463, + "eval_ted_trans_zh-ja_samples_per_second": 15.871, + "eval_ted_trans_zh-ja_steps_per_second": 2.267, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_sharegpt_accuracy": 0.7199542010473684, + "eval_sharegpt_loss": 1.0751953125, + "eval_sharegpt_runtime": 733.0519, + "eval_sharegpt_samples_per_second": 4.567, + "eval_sharegpt_steps_per_second": 0.572, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_dolly15k_accuracy": 0.5963712993421053, + "eval_dolly15k_loss": 1.6484375, + "eval_dolly15k_runtime": 33.8269, + "eval_dolly15k_samples_per_second": 22.201, + "eval_dolly15k_steps_per_second": 2.779, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_ikala_accuracy": 0.7374268761235112, + "eval_ikala_loss": 0.98876953125, + "eval_ikala_runtime": 886.0533, + "eval_ikala_samples_per_second": 16.033, + "eval_ikala_steps_per_second": 2.004, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_oasst_export_accuracy": 0.6594323119298394, + "eval_oasst_export_loss": 1.580078125, + "eval_oasst_export_runtime": 134.3333, + "eval_oasst_export_samples_per_second": 15.625, + "eval_oasst_export_steps_per_second": 1.958, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_joke_accuracy": 0.4916603487490523, + "eval_joke_loss": 2.20703125, + "eval_joke_runtime": 3.5959, + "eval_joke_samples_per_second": 21.135, + "eval_joke_steps_per_second": 2.781, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_gsm8k_accuracy": 0.760284126003706, + "eval_gsm8k_loss": 0.89794921875, + "eval_gsm8k_runtime": 57.2198, + "eval_gsm8k_samples_per_second": 23.051, + "eval_gsm8k_steps_per_second": 2.884, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_webgpt_accuracy": 0.4994055667344498, + "eval_webgpt_loss": 2.18359375, + "eval_webgpt_runtime": 155.137, + "eval_webgpt_samples_per_second": 22.967, + "eval_webgpt_steps_per_second": 2.875, + "step": 1000 + }, + { + "epoch": 0.17, + "learning_rate": 5.02e-06, + "loss": 1.1772, + "step": 1010 + }, + { + "epoch": 0.17, + "learning_rate": 5.070000000000001e-06, + "loss": 1.2069, + "step": 1020 + }, + { + "epoch": 0.17, + "learning_rate": 5.12e-06, + "loss": 1.1755, + "step": 1030 + }, + { + "epoch": 0.17, + "learning_rate": 5.1700000000000005e-06, + "loss": 1.1658, + "step": 1040 + }, + { + "epoch": 0.17, + "learning_rate": 5.220000000000001e-06, + "loss": 1.1896, + "step": 1050 + }, + { + "epoch": 0.17, + "learning_rate": 5.27e-06, + "loss": 1.1743, + "step": 1060 + }, + { + "epoch": 0.18, + "learning_rate": 5.320000000000001e-06, + "loss": 1.1444, + "step": 1070 + }, + { + "epoch": 0.18, + "learning_rate": 5.370000000000001e-06, + "loss": 1.1812, + "step": 1080 + }, + { + "epoch": 0.18, + "learning_rate": 5.420000000000001e-06, + "loss": 1.1549, + "step": 1090 + }, + { + "epoch": 0.18, + "learning_rate": 5.470000000000001e-06, + "loss": 1.1929, + "step": 1100 + }, + { + "epoch": 0.18, + "learning_rate": 5.5200000000000005e-06, + "loss": 1.1317, + "step": 1110 + }, + { + "epoch": 0.18, + "learning_rate": 5.570000000000001e-06, + "loss": 1.1531, + "step": 1120 + }, + { + "epoch": 0.19, + "learning_rate": 5.620000000000001e-06, + "loss": 1.1871, + "step": 1130 + }, + { + "epoch": 0.19, + "learning_rate": 5.67e-06, + "loss": 1.1507, + "step": 1140 + }, + { + "epoch": 0.19, + "learning_rate": 5.72e-06, + "loss": 1.1916, + "step": 1150 + }, + { + "epoch": 0.19, + "learning_rate": 5.77e-06, + "loss": 1.1532, + "step": 1160 + }, + { + "epoch": 0.19, + "learning_rate": 5.82e-06, + "loss": 1.1763, + "step": 1170 + }, + { + "epoch": 0.19, + "learning_rate": 5.8700000000000005e-06, + "loss": 1.1719, + "step": 1180 + }, + { + "epoch": 0.2, + "learning_rate": 5.92e-06, + "loss": 1.1784, + "step": 1190 + }, + { + "epoch": 0.2, + "learning_rate": 5.9700000000000004e-06, + "loss": 1.1597, + "step": 1200 + }, + { + "epoch": 0.2, + "learning_rate": 6.02e-06, + "loss": 1.1594, + "step": 1210 + }, + { + "epoch": 0.2, + "learning_rate": 6.07e-06, + "loss": 1.1769, + "step": 1220 + }, + { + "epoch": 0.2, + "learning_rate": 6.120000000000001e-06, + "loss": 1.1692, + "step": 1230 + }, + { + "epoch": 0.2, + "learning_rate": 6.17e-06, + "loss": 1.1327, + "step": 1240 + }, + { + "epoch": 0.21, + "learning_rate": 6.220000000000001e-06, + "loss": 1.1733, + "step": 1250 + }, + { + "epoch": 0.21, + "learning_rate": 6.27e-06, + "loss": 1.16, + "step": 1260 + }, + { + "epoch": 0.21, + "learning_rate": 6.3200000000000005e-06, + "loss": 1.1701, + "step": 1270 + }, + { + "epoch": 0.21, + "learning_rate": 6.370000000000001e-06, + "loss": 1.1649, + "step": 1280 + }, + { + "epoch": 0.21, + "learning_rate": 6.42e-06, + "loss": 1.1477, + "step": 1290 + }, + { + "epoch": 0.21, + "learning_rate": 6.470000000000001e-06, + "loss": 1.1498, + "step": 1300 + }, + { + "epoch": 0.22, + "learning_rate": 6.520000000000001e-06, + "loss": 1.1881, + "step": 1310 + }, + { + "epoch": 0.22, + "learning_rate": 6.570000000000001e-06, + "loss": 1.1414, + "step": 1320 + }, + { + "epoch": 0.22, + "learning_rate": 6.620000000000001e-06, + "loss": 1.1663, + "step": 1330 + }, + { + "epoch": 0.22, + "learning_rate": 6.6700000000000005e-06, + "loss": 1.1555, + "step": 1340 + }, + { + "epoch": 0.22, + "learning_rate": 6.720000000000001e-06, + "loss": 1.1652, + "step": 1350 + }, + { + "epoch": 0.22, + "learning_rate": 6.770000000000001e-06, + "loss": 1.1539, + "step": 1360 + }, + { + "epoch": 0.23, + "learning_rate": 6.820000000000001e-06, + "loss": 1.1633, + "step": 1370 + }, + { + "epoch": 0.23, + "learning_rate": 6.870000000000001e-06, + "loss": 1.1583, + "step": 1380 + }, + { + "epoch": 0.23, + "learning_rate": 6.92e-06, + "loss": 1.1404, + "step": 1390 + }, + { + "epoch": 0.23, + "learning_rate": 6.97e-06, + "loss": 1.1436, + "step": 1400 + }, + { + "epoch": 0.23, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.1856, + "step": 1410 + }, + { + "epoch": 0.23, + "learning_rate": 7.07e-06, + "loss": 1.1587, + "step": 1420 + }, + { + "epoch": 0.23, + "learning_rate": 7.1200000000000004e-06, + "loss": 1.1296, + "step": 1430 + }, + { + "epoch": 0.24, + "learning_rate": 7.17e-06, + "loss": 1.1171, + "step": 1440 + }, + { + "epoch": 0.24, + "learning_rate": 7.22e-06, + "loss": 1.1459, + "step": 1450 + }, + { + "epoch": 0.24, + "learning_rate": 7.270000000000001e-06, + "loss": 1.1621, + "step": 1460 + }, + { + "epoch": 0.24, + "learning_rate": 7.32e-06, + "loss": 1.1345, + "step": 1470 + }, + { + "epoch": 0.24, + "learning_rate": 7.370000000000001e-06, + "loss": 1.1711, + "step": 1480 + }, + { + "epoch": 0.24, + "learning_rate": 7.420000000000001e-06, + "loss": 1.1852, + "step": 1490 + }, + { + "epoch": 0.25, + "learning_rate": 7.4700000000000005e-06, + "loss": 1.1361, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_multi_news_accuracy": 0.5626650769023163, + "eval_multi_news_loss": 1.9013671875, + "eval_multi_news_runtime": 374.2125, + "eval_multi_news_samples_per_second": 15.024, + "eval_multi_news_steps_per_second": 1.879, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_samsum_accuracy": 0.641110533036939, + "eval_samsum_loss": 1.267578125, + "eval_samsum_runtime": 37.1994, + "eval_samsum_samples_per_second": 21.99, + "eval_samsum_steps_per_second": 2.769, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_billsum_accuracy": 0.648249370750216, + "eval_billsum_loss": 1.453125, + "eval_billsum_runtime": 204.445, + "eval_billsum_samples_per_second": 15.99, + "eval_billsum_steps_per_second": 2.001, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_wmt2019_zh-en_accuracy": 0.5873898487705391, + "eval_wmt2019_zh-en_loss": 1.892578125, + "eval_wmt2019_zh-en_runtime": 43.8258, + "eval_wmt2019_zh-en_samples_per_second": 22.818, + "eval_wmt2019_zh-en_steps_per_second": 2.852, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_ted_trans_en-ja_accuracy": 0.5575474107655961, + "eval_ted_trans_en-ja_loss": 1.8818359375, + "eval_ted_trans_en-ja_runtime": 35.7188, + "eval_ted_trans_en-ja_samples_per_second": 22.425, + "eval_ted_trans_en-ja_steps_per_second": 2.828, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_ted_trans_zh-ja_accuracy": 0.45999153259949194, + "eval_ted_trans_zh-ja_loss": 2.556640625, + "eval_ted_trans_zh-ja_runtime": 2.58, + "eval_ted_trans_zh-ja_samples_per_second": 16.279, + "eval_ted_trans_zh-ja_steps_per_second": 2.326, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_sharegpt_accuracy": 0.7297610954662402, + "eval_sharegpt_loss": 1.0302734375, + "eval_sharegpt_runtime": 732.545, + "eval_sharegpt_samples_per_second": 4.57, + "eval_sharegpt_steps_per_second": 0.572, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_dolly15k_accuracy": 0.5962685032894737, + "eval_dolly15k_loss": 1.646484375, + "eval_dolly15k_runtime": 33.5813, + "eval_dolly15k_samples_per_second": 22.364, + "eval_dolly15k_steps_per_second": 2.799, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_ikala_accuracy": 0.7406414384414164, + "eval_ikala_loss": 0.96875, + "eval_ikala_runtime": 885.454, + "eval_ikala_samples_per_second": 16.044, + "eval_ikala_steps_per_second": 2.006, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_oasst_export_accuracy": 0.6599712470813749, + "eval_oasst_export_loss": 1.578125, + "eval_oasst_export_runtime": 133.2511, + "eval_oasst_export_samples_per_second": 15.752, + "eval_oasst_export_steps_per_second": 1.974, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_joke_accuracy": 0.49838893100833964, + "eval_joke_loss": 2.1953125, + "eval_joke_runtime": 4.5928, + "eval_joke_samples_per_second": 16.548, + "eval_joke_steps_per_second": 2.177, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_gsm8k_accuracy": 0.7668082149474984, + "eval_gsm8k_loss": 0.85791015625, + "eval_gsm8k_runtime": 57.7515, + "eval_gsm8k_samples_per_second": 22.839, + "eval_gsm8k_steps_per_second": 2.857, + "step": 1500 + }, + { + "epoch": 0.25, + "eval_webgpt_accuracy": 0.4995741373619939, + "eval_webgpt_loss": 2.181640625, + "eval_webgpt_runtime": 154.199, + "eval_webgpt_samples_per_second": 23.107, + "eval_webgpt_steps_per_second": 2.892, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 7.520000000000001e-06, + "loss": 1.1574, + "step": 1510 + }, + { + "epoch": 0.25, + "learning_rate": 7.57e-06, + "loss": 1.1593, + "step": 1520 + }, + { + "epoch": 0.25, + "learning_rate": 7.620000000000001e-06, + "loss": 1.1255, + "step": 1530 + }, + { + "epoch": 0.25, + "learning_rate": 7.670000000000001e-06, + "loss": 1.1665, + "step": 1540 + }, + { + "epoch": 0.25, + "learning_rate": 7.72e-06, + "loss": 1.1459, + "step": 1550 + }, + { + "epoch": 0.26, + "learning_rate": 7.77e-06, + "loss": 1.1187, + "step": 1560 + }, + { + "epoch": 0.26, + "learning_rate": 7.820000000000001e-06, + "loss": 1.1469, + "step": 1570 + }, + { + "epoch": 0.26, + "learning_rate": 7.870000000000001e-06, + "loss": 1.1648, + "step": 1580 + }, + { + "epoch": 0.26, + "learning_rate": 7.92e-06, + "loss": 1.1314, + "step": 1590 + }, + { + "epoch": 0.26, + "learning_rate": 7.970000000000002e-06, + "loss": 1.1213, + "step": 1600 + }, + { + "epoch": 0.26, + "learning_rate": 8.020000000000001e-06, + "loss": 1.1424, + "step": 1610 + }, + { + "epoch": 0.27, + "learning_rate": 8.07e-06, + "loss": 1.1637, + "step": 1620 + }, + { + "epoch": 0.27, + "learning_rate": 8.120000000000002e-06, + "loss": 1.1403, + "step": 1630 + }, + { + "epoch": 0.27, + "learning_rate": 8.17e-06, + "loss": 1.1299, + "step": 1640 + }, + { + "epoch": 0.27, + "learning_rate": 8.220000000000001e-06, + "loss": 1.1361, + "step": 1650 + }, + { + "epoch": 0.27, + "learning_rate": 8.27e-06, + "loss": 1.1484, + "step": 1660 + }, + { + "epoch": 0.27, + "learning_rate": 8.32e-06, + "loss": 1.1292, + "step": 1670 + }, + { + "epoch": 0.28, + "learning_rate": 8.370000000000001e-06, + "loss": 1.1395, + "step": 1680 + }, + { + "epoch": 0.28, + "learning_rate": 8.42e-06, + "loss": 1.1299, + "step": 1690 + }, + { + "epoch": 0.28, + "learning_rate": 8.47e-06, + "loss": 1.145, + "step": 1700 + }, + { + "epoch": 0.28, + "learning_rate": 8.52e-06, + "loss": 1.1351, + "step": 1710 + }, + { + "epoch": 0.28, + "learning_rate": 8.570000000000001e-06, + "loss": 1.1579, + "step": 1720 + }, + { + "epoch": 0.28, + "learning_rate": 8.62e-06, + "loss": 1.1483, + "step": 1730 + }, + { + "epoch": 0.29, + "learning_rate": 8.67e-06, + "loss": 1.1278, + "step": 1740 + }, + { + "epoch": 0.29, + "learning_rate": 8.720000000000001e-06, + "loss": 1.1375, + "step": 1750 + }, + { + "epoch": 0.29, + "learning_rate": 8.77e-06, + "loss": 1.1526, + "step": 1760 + }, + { + "epoch": 0.29, + "learning_rate": 8.82e-06, + "loss": 1.1535, + "step": 1770 + }, + { + "epoch": 0.29, + "learning_rate": 8.870000000000001e-06, + "loss": 1.1377, + "step": 1780 + }, + { + "epoch": 0.29, + "learning_rate": 8.920000000000001e-06, + "loss": 1.1578, + "step": 1790 + }, + { + "epoch": 0.3, + "learning_rate": 8.97e-06, + "loss": 1.1598, + "step": 1800 + }, + { + "epoch": 0.3, + "learning_rate": 9.020000000000002e-06, + "loss": 1.1601, + "step": 1810 + }, + { + "epoch": 0.3, + "learning_rate": 9.070000000000001e-06, + "loss": 1.1292, + "step": 1820 + }, + { + "epoch": 0.3, + "learning_rate": 9.12e-06, + "loss": 1.111, + "step": 1830 + }, + { + "epoch": 0.3, + "learning_rate": 9.17e-06, + "loss": 1.12, + "step": 1840 + }, + { + "epoch": 0.3, + "learning_rate": 9.220000000000002e-06, + "loss": 1.1, + "step": 1850 + }, + { + "epoch": 0.31, + "learning_rate": 9.270000000000001e-06, + "loss": 1.099, + "step": 1860 + }, + { + "epoch": 0.31, + "learning_rate": 9.32e-06, + "loss": 1.1333, + "step": 1870 + }, + { + "epoch": 0.31, + "learning_rate": 9.370000000000002e-06, + "loss": 1.1386, + "step": 1880 + }, + { + "epoch": 0.31, + "learning_rate": 9.42e-06, + "loss": 1.1389, + "step": 1890 + }, + { + "epoch": 0.31, + "learning_rate": 9.47e-06, + "loss": 1.1294, + "step": 1900 + }, + { + "epoch": 0.31, + "learning_rate": 9.52e-06, + "loss": 1.1326, + "step": 1910 + }, + { + "epoch": 0.32, + "learning_rate": 9.57e-06, + "loss": 1.129, + "step": 1920 + }, + { + "epoch": 0.32, + "learning_rate": 9.620000000000001e-06, + "loss": 1.1224, + "step": 1930 + }, + { + "epoch": 0.32, + "learning_rate": 9.67e-06, + "loss": 1.1168, + "step": 1940 + }, + { + "epoch": 0.32, + "learning_rate": 9.72e-06, + "loss": 1.1223, + "step": 1950 + }, + { + "epoch": 0.32, + "learning_rate": 9.770000000000001e-06, + "loss": 1.1064, + "step": 1960 + }, + { + "epoch": 0.32, + "learning_rate": 9.820000000000001e-06, + "loss": 1.1303, + "step": 1970 + }, + { + "epoch": 0.33, + "learning_rate": 9.87e-06, + "loss": 1.1134, + "step": 1980 + }, + { + "epoch": 0.33, + "learning_rate": 9.920000000000002e-06, + "loss": 1.1396, + "step": 1990 + }, + { + "epoch": 0.33, + "learning_rate": 9.970000000000001e-06, + "loss": 1.1418, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_multi_news_accuracy": 0.5614524803428215, + "eval_multi_news_loss": 1.9052734375, + "eval_multi_news_runtime": 373.3978, + "eval_multi_news_samples_per_second": 15.056, + "eval_multi_news_steps_per_second": 1.883, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_samsum_accuracy": 0.6388875750839521, + "eval_samsum_loss": 1.265625, + "eval_samsum_runtime": 37.3723, + "eval_samsum_samples_per_second": 21.888, + "eval_samsum_steps_per_second": 2.756, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_billsum_accuracy": 0.6493294263495999, + "eval_billsum_loss": 1.4462890625, + "eval_billsum_runtime": 203.77, + "eval_billsum_samples_per_second": 16.043, + "eval_billsum_steps_per_second": 2.007, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_wmt2019_zh-en_accuracy": 0.5823181343543334, + "eval_wmt2019_zh-en_loss": 1.9228515625, + "eval_wmt2019_zh-en_runtime": 43.5037, + "eval_wmt2019_zh-en_samples_per_second": 22.987, + "eval_wmt2019_zh-en_steps_per_second": 2.873, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_ted_trans_en-ja_accuracy": 0.5623202978930665, + "eval_ted_trans_en-ja_loss": 1.869140625, + "eval_ted_trans_en-ja_runtime": 35.4889, + "eval_ted_trans_en-ja_samples_per_second": 22.57, + "eval_ted_trans_en-ja_steps_per_second": 2.846, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_ted_trans_zh-ja_accuracy": 0.46688327918020495, + "eval_ted_trans_zh-ja_loss": 2.46875, + "eval_ted_trans_zh-ja_runtime": 2.6642, + "eval_ted_trans_zh-ja_samples_per_second": 15.765, + "eval_ted_trans_zh-ja_steps_per_second": 2.252, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_sharegpt_accuracy": 0.7361997474453208, + "eval_sharegpt_loss": 1.001953125, + "eval_sharegpt_runtime": 732.4255, + "eval_sharegpt_samples_per_second": 4.571, + "eval_sharegpt_steps_per_second": 0.572, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_dolly15k_accuracy": 0.5939898574561403, + "eval_dolly15k_loss": 1.65625, + "eval_dolly15k_runtime": 33.8567, + "eval_dolly15k_samples_per_second": 22.182, + "eval_dolly15k_steps_per_second": 2.776, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_ikala_accuracy": 0.7422763555784087, + "eval_ikala_loss": 0.9580078125, + "eval_ikala_runtime": 885.2845, + "eval_ikala_samples_per_second": 16.047, + "eval_ikala_steps_per_second": 2.006, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_oasst_export_accuracy": 0.6593580262738169, + "eval_oasst_export_loss": 1.578125, + "eval_oasst_export_runtime": 132.7253, + "eval_oasst_export_samples_per_second": 15.815, + "eval_oasst_export_steps_per_second": 1.982, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_joke_accuracy": 0.49895754359363154, + "eval_joke_loss": 2.171875, + "eval_joke_runtime": 4.5049, + "eval_joke_samples_per_second": 16.871, + "eval_joke_steps_per_second": 2.22, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_gsm8k_accuracy": 0.775555898702903, + "eval_gsm8k_loss": 0.8232421875, + "eval_gsm8k_runtime": 56.3886, + "eval_gsm8k_samples_per_second": 23.391, + "eval_gsm8k_steps_per_second": 2.926, + "step": 2000 + }, + { + "epoch": 0.33, + "eval_webgpt_accuracy": 0.4990524556304364, + "eval_webgpt_loss": 2.185546875, + "eval_webgpt_runtime": 154.0524, + "eval_webgpt_samples_per_second": 23.128, + "eval_webgpt_steps_per_second": 2.895, + "step": 2000 + }, + { + "epoch": 0.33, + "learning_rate": 9.99914354230901e-06, + "loss": 1.116, + "step": 2010 + }, + { + "epoch": 0.33, + "learning_rate": 9.997002398081536e-06, + "loss": 1.1243, + "step": 2020 + }, + { + "epoch": 0.33, + "learning_rate": 9.99486125385406e-06, + "loss": 1.1183, + "step": 2030 + }, + { + "epoch": 0.34, + "learning_rate": 9.992720109626585e-06, + "loss": 1.1324, + "step": 2040 + }, + { + "epoch": 0.34, + "learning_rate": 9.99057896539911e-06, + "loss": 1.1033, + "step": 2050 + }, + { + "epoch": 0.34, + "learning_rate": 9.988437821171634e-06, + "loss": 1.0962, + "step": 2060 + }, + { + "epoch": 0.34, + "learning_rate": 9.98629667694416e-06, + "loss": 1.1253, + "step": 2070 + }, + { + "epoch": 0.34, + "learning_rate": 9.984155532716685e-06, + "loss": 1.1522, + "step": 2080 + }, + { + "epoch": 0.34, + "learning_rate": 9.98201438848921e-06, + "loss": 1.142, + "step": 2090 + }, + { + "epoch": 0.34, + "learning_rate": 9.979873244261734e-06, + "loss": 1.1289, + "step": 2100 + }, + { + "epoch": 0.35, + "learning_rate": 9.97773210003426e-06, + "loss": 1.1367, + "step": 2110 + }, + { + "epoch": 0.35, + "learning_rate": 9.975590955806785e-06, + "loss": 1.1303, + "step": 2120 + }, + { + "epoch": 0.35, + "learning_rate": 9.973449811579308e-06, + "loss": 1.1041, + "step": 2130 + }, + { + "epoch": 0.35, + "learning_rate": 9.971308667351834e-06, + "loss": 1.1325, + "step": 2140 + }, + { + "epoch": 0.35, + "learning_rate": 9.969167523124359e-06, + "loss": 1.1371, + "step": 2150 + }, + { + "epoch": 0.35, + "learning_rate": 9.967026378896883e-06, + "loss": 1.112, + "step": 2160 + }, + { + "epoch": 0.36, + "learning_rate": 9.964885234669408e-06, + "loss": 1.1172, + "step": 2170 + }, + { + "epoch": 0.36, + "learning_rate": 9.962744090441932e-06, + "loss": 1.0959, + "step": 2180 + }, + { + "epoch": 0.36, + "learning_rate": 9.960602946214459e-06, + "loss": 1.1322, + "step": 2190 + }, + { + "epoch": 0.36, + "learning_rate": 9.958461801986983e-06, + "loss": 1.1098, + "step": 2200 + }, + { + "epoch": 0.36, + "learning_rate": 9.956320657759508e-06, + "loss": 1.1185, + "step": 2210 + }, + { + "epoch": 0.36, + "learning_rate": 9.954179513532032e-06, + "loss": 1.1027, + "step": 2220 + }, + { + "epoch": 0.37, + "learning_rate": 9.952038369304557e-06, + "loss": 1.1217, + "step": 2230 + }, + { + "epoch": 0.37, + "learning_rate": 9.949897225077082e-06, + "loss": 1.115, + "step": 2240 + }, + { + "epoch": 0.37, + "learning_rate": 9.947756080849606e-06, + "loss": 1.1197, + "step": 2250 + }, + { + "epoch": 0.37, + "learning_rate": 9.945614936622131e-06, + "loss": 1.0926, + "step": 2260 + }, + { + "epoch": 0.37, + "learning_rate": 9.943473792394657e-06, + "loss": 1.1085, + "step": 2270 + }, + { + "epoch": 0.37, + "learning_rate": 9.94133264816718e-06, + "loss": 1.139, + "step": 2280 + }, + { + "epoch": 0.38, + "learning_rate": 9.939191503939706e-06, + "loss": 1.1131, + "step": 2290 + }, + { + "epoch": 0.38, + "learning_rate": 9.937050359712231e-06, + "loss": 1.1281, + "step": 2300 + }, + { + "epoch": 0.38, + "learning_rate": 9.934909215484757e-06, + "loss": 1.0962, + "step": 2310 + }, + { + "epoch": 0.38, + "learning_rate": 9.93276807125728e-06, + "loss": 1.107, + "step": 2320 + }, + { + "epoch": 0.38, + "learning_rate": 9.930626927029806e-06, + "loss": 1.1082, + "step": 2330 + }, + { + "epoch": 0.38, + "learning_rate": 9.928485782802331e-06, + "loss": 1.1323, + "step": 2340 + }, + { + "epoch": 0.39, + "learning_rate": 9.926344638574855e-06, + "loss": 1.0984, + "step": 2350 + }, + { + "epoch": 0.39, + "learning_rate": 9.92420349434738e-06, + "loss": 1.118, + "step": 2360 + }, + { + "epoch": 0.39, + "learning_rate": 9.922062350119905e-06, + "loss": 1.1003, + "step": 2370 + }, + { + "epoch": 0.39, + "learning_rate": 9.919921205892429e-06, + "loss": 1.115, + "step": 2380 + }, + { + "epoch": 0.39, + "learning_rate": 9.917780061664954e-06, + "loss": 1.0974, + "step": 2390 + }, + { + "epoch": 0.39, + "learning_rate": 9.915638917437478e-06, + "loss": 1.107, + "step": 2400 + }, + { + "epoch": 0.4, + "learning_rate": 9.913497773210005e-06, + "loss": 1.1101, + "step": 2410 + }, + { + "epoch": 0.4, + "learning_rate": 9.911356628982529e-06, + "loss": 1.1115, + "step": 2420 + }, + { + "epoch": 0.4, + "learning_rate": 9.909215484755054e-06, + "loss": 1.0951, + "step": 2430 + }, + { + "epoch": 0.4, + "learning_rate": 9.907074340527578e-06, + "loss": 1.0938, + "step": 2440 + }, + { + "epoch": 0.4, + "learning_rate": 9.904933196300103e-06, + "loss": 1.0772, + "step": 2450 + }, + { + "epoch": 0.4, + "learning_rate": 9.902792052072629e-06, + "loss": 1.1028, + "step": 2460 + }, + { + "epoch": 0.41, + "learning_rate": 9.900650907845152e-06, + "loss": 1.0923, + "step": 2470 + }, + { + "epoch": 0.41, + "learning_rate": 9.898509763617678e-06, + "loss": 1.1238, + "step": 2480 + }, + { + "epoch": 0.41, + "learning_rate": 9.896368619390203e-06, + "loss": 1.1401, + "step": 2490 + }, + { + "epoch": 0.41, + "learning_rate": 9.894227475162728e-06, + "loss": 1.1142, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_multi_news_accuracy": 0.5619751512736382, + "eval_multi_news_loss": 1.904296875, + "eval_multi_news_runtime": 373.6389, + "eval_multi_news_samples_per_second": 15.047, + "eval_multi_news_steps_per_second": 1.881, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_samsum_accuracy": 0.6433334909899258, + "eval_samsum_loss": 1.2607421875, + "eval_samsum_runtime": 37.306, + "eval_samsum_samples_per_second": 21.927, + "eval_samsum_steps_per_second": 2.761, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_billsum_accuracy": 0.650991772793869, + "eval_billsum_loss": 1.439453125, + "eval_billsum_runtime": 203.4152, + "eval_billsum_samples_per_second": 16.071, + "eval_billsum_steps_per_second": 2.011, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_wmt2019_zh-en_accuracy": 0.5893870117057808, + "eval_wmt2019_zh-en_loss": 1.904296875, + "eval_wmt2019_zh-en_runtime": 43.0674, + "eval_wmt2019_zh-en_samples_per_second": 23.219, + "eval_wmt2019_zh-en_steps_per_second": 2.902, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_ted_trans_en-ja_accuracy": 0.563225558860213, + "eval_ted_trans_en-ja_loss": 1.8388671875, + "eval_ted_trans_en-ja_runtime": 35.7494, + "eval_ted_trans_en-ja_samples_per_second": 22.406, + "eval_ted_trans_en-ja_steps_per_second": 2.825, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_ted_trans_zh-ja_accuracy": 0.4661596958174905, + "eval_ted_trans_zh-ja_loss": 2.453125, + "eval_ted_trans_zh-ja_runtime": 2.525, + "eval_ted_trans_zh-ja_samples_per_second": 16.634, + "eval_ted_trans_zh-ja_steps_per_second": 2.376, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_sharegpt_accuracy": 0.7423548548826656, + "eval_sharegpt_loss": 0.97265625, + "eval_sharegpt_runtime": 732.2894, + "eval_sharegpt_samples_per_second": 4.572, + "eval_sharegpt_steps_per_second": 0.572, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_dolly15k_accuracy": 0.5930646929824561, + "eval_dolly15k_loss": 1.6513671875, + "eval_dolly15k_runtime": 33.3723, + "eval_dolly15k_samples_per_second": 22.504, + "eval_dolly15k_steps_per_second": 2.817, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_ikala_accuracy": 0.7440914978067725, + "eval_ikala_loss": 0.9453125, + "eval_ikala_runtime": 884.831, + "eval_ikala_samples_per_second": 16.055, + "eval_ikala_steps_per_second": 2.007, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_oasst_export_accuracy": 0.6600834038561538, + "eval_oasst_export_loss": 1.5791015625, + "eval_oasst_export_runtime": 133.5652, + "eval_oasst_export_samples_per_second": 15.715, + "eval_oasst_export_steps_per_second": 1.969, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_joke_accuracy": 0.5195223654283548, + "eval_joke_loss": 2.078125, + "eval_joke_runtime": 4.5929, + "eval_joke_samples_per_second": 16.547, + "eval_joke_steps_per_second": 2.177, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_gsm8k_accuracy": 0.782682211241507, + "eval_gsm8k_loss": 0.796875, + "eval_gsm8k_runtime": 56.5404, + "eval_gsm8k_samples_per_second": 23.328, + "eval_gsm8k_steps_per_second": 2.918, + "step": 2500 + }, + { + "epoch": 0.41, + "eval_webgpt_accuracy": 0.49846334564786127, + "eval_webgpt_loss": 2.189453125, + "eval_webgpt_runtime": 154.9389, + "eval_webgpt_samples_per_second": 22.996, + "eval_webgpt_steps_per_second": 2.879, + "step": 2500 + }, + { + "epoch": 0.41, + "learning_rate": 9.892086330935252e-06, + "loss": 1.1335, + "step": 2510 + }, + { + "epoch": 0.41, + "learning_rate": 9.889945186707778e-06, + "loss": 1.0999, + "step": 2520 + }, + { + "epoch": 0.42, + "learning_rate": 9.887804042480303e-06, + "loss": 1.1324, + "step": 2530 + }, + { + "epoch": 0.42, + "learning_rate": 9.885662898252827e-06, + "loss": 1.0832, + "step": 2540 + }, + { + "epoch": 0.42, + "learning_rate": 9.883521754025352e-06, + "loss": 1.1, + "step": 2550 + }, + { + "epoch": 0.42, + "learning_rate": 9.881380609797877e-06, + "loss": 1.1153, + "step": 2560 + }, + { + "epoch": 0.42, + "learning_rate": 9.879239465570401e-06, + "loss": 1.0958, + "step": 2570 + }, + { + "epoch": 0.42, + "learning_rate": 9.877098321342926e-06, + "loss": 1.1154, + "step": 2580 + }, + { + "epoch": 0.43, + "learning_rate": 9.874957177115452e-06, + "loss": 1.1139, + "step": 2590 + }, + { + "epoch": 0.43, + "learning_rate": 9.872816032887977e-06, + "loss": 1.0739, + "step": 2600 + }, + { + "epoch": 0.43, + "learning_rate": 9.8706748886605e-06, + "loss": 1.1105, + "step": 2610 + }, + { + "epoch": 0.43, + "learning_rate": 9.868533744433026e-06, + "loss": 1.0969, + "step": 2620 + }, + { + "epoch": 0.43, + "learning_rate": 9.866392600205552e-06, + "loss": 1.1207, + "step": 2630 + }, + { + "epoch": 0.43, + "learning_rate": 9.864251455978075e-06, + "loss": 1.1392, + "step": 2640 + }, + { + "epoch": 0.44, + "learning_rate": 9.8621103117506e-06, + "loss": 1.1161, + "step": 2650 + }, + { + "epoch": 0.44, + "learning_rate": 9.859969167523126e-06, + "loss": 1.0767, + "step": 2660 + }, + { + "epoch": 0.44, + "learning_rate": 9.85782802329565e-06, + "loss": 1.1113, + "step": 2670 + }, + { + "epoch": 0.44, + "learning_rate": 9.855686879068175e-06, + "loss": 1.0801, + "step": 2680 + }, + { + "epoch": 0.44, + "learning_rate": 9.853545734840699e-06, + "loss": 1.0835, + "step": 2690 + }, + { + "epoch": 0.44, + "learning_rate": 9.851404590613226e-06, + "loss": 1.0635, + "step": 2700 + }, + { + "epoch": 0.45, + "learning_rate": 9.84926344638575e-06, + "loss": 1.095, + "step": 2710 + }, + { + "epoch": 0.45, + "learning_rate": 9.847122302158275e-06, + "loss": 1.0822, + "step": 2720 + }, + { + "epoch": 0.45, + "learning_rate": 9.844981157930798e-06, + "loss": 1.0983, + "step": 2730 + }, + { + "epoch": 0.45, + "learning_rate": 9.842840013703324e-06, + "loss": 1.1245, + "step": 2740 + }, + { + "epoch": 0.45, + "learning_rate": 9.84069886947585e-06, + "loss": 1.0768, + "step": 2750 + }, + { + "epoch": 0.45, + "learning_rate": 9.838557725248373e-06, + "loss": 1.0958, + "step": 2760 + }, + { + "epoch": 0.45, + "learning_rate": 9.836416581020898e-06, + "loss": 1.0869, + "step": 2770 + }, + { + "epoch": 0.46, + "learning_rate": 9.834275436793424e-06, + "loss": 1.126, + "step": 2780 + }, + { + "epoch": 0.46, + "learning_rate": 9.832134292565947e-06, + "loss": 1.0823, + "step": 2790 + }, + { + "epoch": 0.46, + "learning_rate": 9.829993148338473e-06, + "loss": 1.1057, + "step": 2800 + }, + { + "epoch": 0.46, + "learning_rate": 9.827852004110998e-06, + "loss": 1.0717, + "step": 2810 + }, + { + "epoch": 0.46, + "learning_rate": 9.825710859883523e-06, + "loss": 1.0835, + "step": 2820 + }, + { + "epoch": 0.46, + "learning_rate": 9.823569715656047e-06, + "loss": 1.1291, + "step": 2830 + }, + { + "epoch": 0.47, + "learning_rate": 9.821428571428573e-06, + "loss": 1.0856, + "step": 2840 + }, + { + "epoch": 0.47, + "learning_rate": 9.819287427201098e-06, + "loss": 1.0972, + "step": 2850 + }, + { + "epoch": 0.47, + "learning_rate": 9.817146282973622e-06, + "loss": 1.0833, + "step": 2860 + }, + { + "epoch": 0.47, + "learning_rate": 9.815005138746147e-06, + "loss": 1.1124, + "step": 2870 + }, + { + "epoch": 0.47, + "learning_rate": 9.812863994518672e-06, + "loss": 1.0905, + "step": 2880 + }, + { + "epoch": 0.47, + "learning_rate": 9.810722850291196e-06, + "loss": 1.0891, + "step": 2890 + }, + { + "epoch": 0.48, + "learning_rate": 9.808581706063721e-06, + "loss": 1.0931, + "step": 2900 + }, + { + "epoch": 0.48, + "learning_rate": 9.806440561836245e-06, + "loss": 1.1066, + "step": 2910 + }, + { + "epoch": 0.48, + "learning_rate": 9.804299417608772e-06, + "loss": 1.0759, + "step": 2920 + }, + { + "epoch": 0.48, + "learning_rate": 9.802158273381296e-06, + "loss": 1.0996, + "step": 2930 + }, + { + "epoch": 0.48, + "learning_rate": 9.800017129153821e-06, + "loss": 1.0868, + "step": 2940 + }, + { + "epoch": 0.48, + "learning_rate": 9.797875984926345e-06, + "loss": 1.0799, + "step": 2950 + }, + { + "epoch": 0.49, + "learning_rate": 9.79573484069887e-06, + "loss": 1.0989, + "step": 2960 + }, + { + "epoch": 0.49, + "learning_rate": 9.793593696471396e-06, + "loss": 1.0841, + "step": 2970 + }, + { + "epoch": 0.49, + "learning_rate": 9.79145255224392e-06, + "loss": 1.0745, + "step": 2980 + }, + { + "epoch": 0.49, + "learning_rate": 9.789311408016445e-06, + "loss": 1.0742, + "step": 2990 + }, + { + "epoch": 0.49, + "learning_rate": 9.78717026378897e-06, + "loss": 1.0745, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_multi_news_accuracy": 0.5619935239487821, + "eval_multi_news_loss": 1.9033203125, + "eval_multi_news_runtime": 373.8934, + "eval_multi_news_samples_per_second": 15.036, + "eval_multi_news_steps_per_second": 1.88, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_samsum_accuracy": 0.6469753582746063, + "eval_samsum_loss": 1.24609375, + "eval_samsum_runtime": 37.2777, + "eval_samsum_samples_per_second": 21.943, + "eval_samsum_steps_per_second": 2.763, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_billsum_accuracy": 0.6516022390022165, + "eval_billsum_loss": 1.431640625, + "eval_billsum_runtime": 204.7394, + "eval_billsum_samples_per_second": 15.967, + "eval_billsum_steps_per_second": 1.998, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_wmt2019_zh-en_accuracy": 0.5844544095665172, + "eval_wmt2019_zh-en_loss": 1.8984375, + "eval_wmt2019_zh-en_runtime": 42.4024, + "eval_wmt2019_zh-en_samples_per_second": 23.584, + "eval_wmt2019_zh-en_steps_per_second": 2.948, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_ted_trans_en-ja_accuracy": 0.577170182658057, + "eval_ted_trans_en-ja_loss": 1.7958984375, + "eval_ted_trans_en-ja_runtime": 35.5789, + "eval_ted_trans_en-ja_samples_per_second": 22.513, + "eval_ted_trans_en-ja_steps_per_second": 2.839, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_ted_trans_zh-ja_accuracy": 0.46690610569522834, + "eval_ted_trans_zh-ja_loss": 2.515625, + "eval_ted_trans_zh-ja_runtime": 2.5484, + "eval_ted_trans_zh-ja_samples_per_second": 16.481, + "eval_ted_trans_zh-ja_steps_per_second": 2.354, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_sharegpt_accuracy": 0.7475412115956699, + "eval_sharegpt_loss": 0.9453125, + "eval_sharegpt_runtime": 731.2857, + "eval_sharegpt_samples_per_second": 4.578, + "eval_sharegpt_steps_per_second": 0.573, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_dolly15k_accuracy": 0.5928933662280702, + "eval_dolly15k_loss": 1.65625, + "eval_dolly15k_runtime": 34.5989, + "eval_dolly15k_samples_per_second": 21.706, + "eval_dolly15k_steps_per_second": 2.717, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_ikala_accuracy": 0.7469942144047141, + "eval_ikala_loss": 0.9296875, + "eval_ikala_runtime": 884.7774, + "eval_ikala_samples_per_second": 16.056, + "eval_ikala_steps_per_second": 2.007, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_oasst_export_accuracy": 0.6607141036415994, + "eval_oasst_export_loss": 1.5732421875, + "eval_oasst_export_runtime": 132.9167, + "eval_oasst_export_samples_per_second": 15.792, + "eval_oasst_export_steps_per_second": 1.979, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_joke_accuracy": 0.5242608036391205, + "eval_joke_loss": 2.025390625, + "eval_joke_runtime": 4.5573, + "eval_joke_samples_per_second": 16.677, + "eval_joke_steps_per_second": 2.194, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_gsm8k_accuracy": 0.7849444101297097, + "eval_gsm8k_loss": 0.783203125, + "eval_gsm8k_runtime": 56.5634, + "eval_gsm8k_samples_per_second": 23.319, + "eval_gsm8k_steps_per_second": 2.917, + "step": 3000 + }, + { + "epoch": 0.49, + "eval_webgpt_accuracy": 0.498837749883775, + "eval_webgpt_loss": 2.19140625, + "eval_webgpt_runtime": 153.8546, + "eval_webgpt_samples_per_second": 23.158, + "eval_webgpt_steps_per_second": 2.899, + "step": 3000 + }, + { + "epoch": 0.49, + "learning_rate": 9.785029119561494e-06, + "loss": 1.0876, + "step": 3010 + }, + { + "epoch": 0.5, + "learning_rate": 9.782887975334019e-06, + "loss": 1.0761, + "step": 3020 + }, + { + "epoch": 0.5, + "learning_rate": 9.780746831106544e-06, + "loss": 1.103, + "step": 3030 + }, + { + "epoch": 0.5, + "learning_rate": 9.77860568687907e-06, + "loss": 1.0891, + "step": 3040 + }, + { + "epoch": 0.5, + "learning_rate": 9.776464542651593e-06, + "loss": 1.0852, + "step": 3050 + }, + { + "epoch": 0.5, + "learning_rate": 9.774323398424119e-06, + "loss": 1.1041, + "step": 3060 + }, + { + "epoch": 0.5, + "learning_rate": 9.772182254196644e-06, + "loss": 1.0801, + "step": 3070 + }, + { + "epoch": 0.51, + "learning_rate": 9.770041109969168e-06, + "loss": 1.0851, + "step": 3080 + }, + { + "epoch": 0.51, + "learning_rate": 9.767899965741693e-06, + "loss": 1.0839, + "step": 3090 + }, + { + "epoch": 0.51, + "learning_rate": 9.765758821514219e-06, + "loss": 1.0756, + "step": 3100 + }, + { + "epoch": 0.51, + "learning_rate": 9.763617677286742e-06, + "loss": 1.0604, + "step": 3110 + }, + { + "epoch": 0.51, + "learning_rate": 9.761476533059268e-06, + "loss": 1.0613, + "step": 3120 + }, + { + "epoch": 0.51, + "learning_rate": 9.759335388831791e-06, + "loss": 1.0839, + "step": 3130 + }, + { + "epoch": 0.52, + "learning_rate": 9.757194244604318e-06, + "loss": 1.0873, + "step": 3140 + }, + { + "epoch": 0.52, + "learning_rate": 9.755053100376842e-06, + "loss": 1.0935, + "step": 3150 + }, + { + "epoch": 0.52, + "learning_rate": 9.752911956149367e-06, + "loss": 1.0821, + "step": 3160 + }, + { + "epoch": 0.52, + "learning_rate": 9.750770811921891e-06, + "loss": 1.0679, + "step": 3170 + }, + { + "epoch": 0.52, + "learning_rate": 9.748629667694417e-06, + "loss": 1.0939, + "step": 3180 + }, + { + "epoch": 0.52, + "learning_rate": 9.746488523466942e-06, + "loss": 1.0764, + "step": 3190 + }, + { + "epoch": 0.53, + "learning_rate": 9.744347379239466e-06, + "loss": 1.0772, + "step": 3200 + }, + { + "epoch": 0.53, + "learning_rate": 9.742206235011991e-06, + "loss": 1.0983, + "step": 3210 + }, + { + "epoch": 0.53, + "learning_rate": 9.740065090784516e-06, + "loss": 1.0649, + "step": 3220 + }, + { + "epoch": 0.53, + "learning_rate": 9.73792394655704e-06, + "loss": 1.0829, + "step": 3230 + }, + { + "epoch": 0.53, + "learning_rate": 9.735782802329565e-06, + "loss": 1.0914, + "step": 3240 + }, + { + "epoch": 0.53, + "learning_rate": 9.73364165810209e-06, + "loss": 1.0776, + "step": 3250 + }, + { + "epoch": 0.54, + "learning_rate": 9.731500513874616e-06, + "loss": 1.0698, + "step": 3260 + }, + { + "epoch": 0.54, + "learning_rate": 9.72935936964714e-06, + "loss": 1.074, + "step": 3270 + }, + { + "epoch": 0.54, + "learning_rate": 9.727218225419665e-06, + "loss": 1.0951, + "step": 3280 + }, + { + "epoch": 0.54, + "learning_rate": 9.72507708119219e-06, + "loss": 1.0586, + "step": 3290 + }, + { + "epoch": 0.54, + "learning_rate": 9.722935936964714e-06, + "loss": 1.066, + "step": 3300 + }, + { + "epoch": 0.54, + "learning_rate": 9.72079479273724e-06, + "loss": 1.0897, + "step": 3310 + }, + { + "epoch": 0.55, + "learning_rate": 9.718653648509765e-06, + "loss": 1.079, + "step": 3320 + }, + { + "epoch": 0.55, + "learning_rate": 9.716512504282289e-06, + "loss": 1.063, + "step": 3330 + }, + { + "epoch": 0.55, + "learning_rate": 9.714371360054814e-06, + "loss": 1.0688, + "step": 3340 + }, + { + "epoch": 0.55, + "learning_rate": 9.712230215827338e-06, + "loss": 1.0845, + "step": 3350 + }, + { + "epoch": 0.55, + "learning_rate": 9.710089071599865e-06, + "loss": 1.0421, + "step": 3360 + }, + { + "epoch": 0.55, + "learning_rate": 9.707947927372388e-06, + "loss": 1.0735, + "step": 3370 + }, + { + "epoch": 0.56, + "learning_rate": 9.705806783144914e-06, + "loss": 1.0848, + "step": 3380 + }, + { + "epoch": 0.56, + "learning_rate": 9.703665638917438e-06, + "loss": 1.0863, + "step": 3390 + }, + { + "epoch": 0.56, + "learning_rate": 9.701524494689963e-06, + "loss": 1.0372, + "step": 3400 + }, + { + "epoch": 0.56, + "learning_rate": 9.699383350462488e-06, + "loss": 1.0741, + "step": 3410 + }, + { + "epoch": 0.56, + "learning_rate": 9.697242206235012e-06, + "loss": 1.0988, + "step": 3420 + }, + { + "epoch": 0.56, + "learning_rate": 9.695101062007537e-06, + "loss": 1.0808, + "step": 3430 + }, + { + "epoch": 0.57, + "learning_rate": 9.692959917780063e-06, + "loss": 1.0717, + "step": 3440 + }, + { + "epoch": 0.57, + "learning_rate": 9.690818773552586e-06, + "loss": 1.0632, + "step": 3450 + }, + { + "epoch": 0.57, + "learning_rate": 9.688677629325112e-06, + "loss": 1.0539, + "step": 3460 + }, + { + "epoch": 0.57, + "learning_rate": 9.686536485097637e-06, + "loss": 1.0944, + "step": 3470 + }, + { + "epoch": 0.57, + "learning_rate": 9.684395340870162e-06, + "loss": 1.0682, + "step": 3480 + }, + { + "epoch": 0.57, + "learning_rate": 9.682254196642686e-06, + "loss": 1.066, + "step": 3490 + }, + { + "epoch": 0.57, + "learning_rate": 9.680113052415212e-06, + "loss": 1.0649, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_multi_news_accuracy": 0.5626207290657621, + "eval_multi_news_loss": 1.90234375, + "eval_multi_news_runtime": 374.9582, + "eval_multi_news_samples_per_second": 14.994, + "eval_multi_news_steps_per_second": 1.875, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_samsum_accuracy": 0.6482050796954074, + "eval_samsum_loss": 1.244140625, + "eval_samsum_runtime": 36.4554, + "eval_samsum_samples_per_second": 22.438, + "eval_samsum_steps_per_second": 2.825, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_billsum_accuracy": 0.6542856069509964, + "eval_billsum_loss": 1.427734375, + "eval_billsum_runtime": 204.9118, + "eval_billsum_samples_per_second": 15.953, + "eval_billsum_steps_per_second": 1.996, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_wmt2019_zh-en_accuracy": 0.5960585499733171, + "eval_wmt2019_zh-en_loss": 1.8671875, + "eval_wmt2019_zh-en_runtime": 42.5542, + "eval_wmt2019_zh-en_samples_per_second": 23.499, + "eval_wmt2019_zh-en_steps_per_second": 2.937, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_ted_trans_en-ja_accuracy": 0.5799230113905279, + "eval_ted_trans_en-ja_loss": 1.7705078125, + "eval_ted_trans_en-ja_runtime": 35.599, + "eval_ted_trans_en-ja_samples_per_second": 22.501, + "eval_ted_trans_en-ja_steps_per_second": 2.837, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_ted_trans_zh-ja_accuracy": 0.48124428179322964, + "eval_ted_trans_zh-ja_loss": 2.44140625, + "eval_ted_trans_zh-ja_runtime": 2.5311, + "eval_ted_trans_zh-ja_samples_per_second": 16.594, + "eval_ted_trans_zh-ja_steps_per_second": 2.371, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_sharegpt_accuracy": 0.7523055854464493, + "eval_sharegpt_loss": 0.92236328125, + "eval_sharegpt_runtime": 732.6588, + "eval_sharegpt_samples_per_second": 4.57, + "eval_sharegpt_steps_per_second": 0.572, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_dolly15k_accuracy": 0.5933216831140351, + "eval_dolly15k_loss": 1.65625, + "eval_dolly15k_runtime": 33.8299, + "eval_dolly15k_samples_per_second": 22.199, + "eval_dolly15k_steps_per_second": 2.779, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_ikala_accuracy": 0.7489160065784373, + "eval_ikala_loss": 0.9208984375, + "eval_ikala_runtime": 886.4258, + "eval_ikala_samples_per_second": 16.026, + "eval_ikala_steps_per_second": 2.004, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_oasst_export_accuracy": 0.6617497330814418, + "eval_oasst_export_loss": 1.572265625, + "eval_oasst_export_runtime": 135.1792, + "eval_oasst_export_samples_per_second": 15.528, + "eval_oasst_export_steps_per_second": 1.946, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_joke_accuracy": 0.5256823351023503, + "eval_joke_loss": 2.013671875, + "eval_joke_runtime": 3.6235, + "eval_joke_samples_per_second": 20.974, + "eval_joke_steps_per_second": 2.76, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_gsm8k_accuracy": 0.7892603458925262, + "eval_gsm8k_loss": 0.77001953125, + "eval_gsm8k_runtime": 56.8179, + "eval_gsm8k_samples_per_second": 23.215, + "eval_gsm8k_steps_per_second": 2.904, + "step": 3500 + }, + { + "epoch": 0.57, + "eval_webgpt_accuracy": 0.49841011281811054, + "eval_webgpt_loss": 2.19140625, + "eval_webgpt_runtime": 157.0655, + "eval_webgpt_samples_per_second": 22.685, + "eval_webgpt_steps_per_second": 2.84, + "step": 3500 + }, + { + "epoch": 0.58, + "learning_rate": 9.677971908187737e-06, + "loss": 1.0692, + "step": 3510 + }, + { + "epoch": 0.58, + "learning_rate": 9.67583076396026e-06, + "loss": 1.1059, + "step": 3520 + }, + { + "epoch": 0.58, + "learning_rate": 9.673689619732786e-06, + "loss": 1.0758, + "step": 3530 + }, + { + "epoch": 0.58, + "learning_rate": 9.671548475505311e-06, + "loss": 1.0386, + "step": 3540 + }, + { + "epoch": 0.58, + "learning_rate": 9.669407331277835e-06, + "loss": 1.0865, + "step": 3550 + }, + { + "epoch": 0.58, + "learning_rate": 9.66726618705036e-06, + "loss": 1.0537, + "step": 3560 + }, + { + "epoch": 0.59, + "learning_rate": 9.665125042822884e-06, + "loss": 1.0481, + "step": 3570 + }, + { + "epoch": 0.59, + "learning_rate": 9.662983898595411e-06, + "loss": 1.0811, + "step": 3580 + }, + { + "epoch": 0.59, + "learning_rate": 9.660842754367935e-06, + "loss": 1.0518, + "step": 3590 + }, + { + "epoch": 0.59, + "learning_rate": 9.65870161014046e-06, + "loss": 1.0756, + "step": 3600 + }, + { + "epoch": 0.59, + "learning_rate": 9.656560465912986e-06, + "loss": 1.0594, + "step": 3610 + }, + { + "epoch": 0.59, + "learning_rate": 9.65441932168551e-06, + "loss": 1.0842, + "step": 3620 + }, + { + "epoch": 0.6, + "learning_rate": 9.652278177458035e-06, + "loss": 1.0703, + "step": 3630 + }, + { + "epoch": 0.6, + "learning_rate": 9.650137033230558e-06, + "loss": 1.0649, + "step": 3640 + }, + { + "epoch": 0.6, + "learning_rate": 9.647995889003084e-06, + "loss": 1.0869, + "step": 3650 + }, + { + "epoch": 0.6, + "learning_rate": 9.645854744775609e-06, + "loss": 1.0494, + "step": 3660 + }, + { + "epoch": 0.6, + "learning_rate": 9.643713600548134e-06, + "loss": 1.0575, + "step": 3670 + }, + { + "epoch": 0.6, + "learning_rate": 9.641572456320658e-06, + "loss": 1.0846, + "step": 3680 + }, + { + "epoch": 0.61, + "learning_rate": 9.639431312093183e-06, + "loss": 1.0815, + "step": 3690 + }, + { + "epoch": 0.61, + "learning_rate": 9.637290167865709e-06, + "loss": 1.0593, + "step": 3700 + }, + { + "epoch": 0.61, + "learning_rate": 9.635149023638232e-06, + "loss": 1.0936, + "step": 3710 + }, + { + "epoch": 0.61, + "learning_rate": 9.633007879410758e-06, + "loss": 1.0249, + "step": 3720 + }, + { + "epoch": 0.61, + "learning_rate": 9.630866735183283e-06, + "loss": 1.0382, + "step": 3730 + }, + { + "epoch": 0.61, + "learning_rate": 9.628725590955807e-06, + "loss": 1.0528, + "step": 3740 + }, + { + "epoch": 0.62, + "learning_rate": 9.626584446728332e-06, + "loss": 1.0469, + "step": 3750 + }, + { + "epoch": 0.62, + "learning_rate": 9.624443302500858e-06, + "loss": 1.053, + "step": 3760 + }, + { + "epoch": 0.62, + "learning_rate": 9.622302158273383e-06, + "loss": 1.0301, + "step": 3770 + }, + { + "epoch": 0.62, + "learning_rate": 9.620161014045907e-06, + "loss": 1.0913, + "step": 3780 + }, + { + "epoch": 0.62, + "learning_rate": 9.618019869818432e-06, + "loss": 1.0633, + "step": 3790 + }, + { + "epoch": 0.62, + "learning_rate": 9.615878725590957e-06, + "loss": 1.0743, + "step": 3800 + }, + { + "epoch": 0.63, + "learning_rate": 9.613737581363481e-06, + "loss": 1.0486, + "step": 3810 + }, + { + "epoch": 0.63, + "learning_rate": 9.611596437136006e-06, + "loss": 1.0491, + "step": 3820 + }, + { + "epoch": 0.63, + "learning_rate": 9.609455292908532e-06, + "loss": 1.0736, + "step": 3830 + }, + { + "epoch": 0.63, + "learning_rate": 9.607314148681056e-06, + "loss": 1.0729, + "step": 3840 + }, + { + "epoch": 0.63, + "learning_rate": 9.605173004453581e-06, + "loss": 1.0625, + "step": 3850 + }, + { + "epoch": 0.63, + "learning_rate": 9.603031860226105e-06, + "loss": 1.0726, + "step": 3860 + }, + { + "epoch": 0.64, + "learning_rate": 9.600890715998632e-06, + "loss": 1.0666, + "step": 3870 + }, + { + "epoch": 0.64, + "learning_rate": 9.598749571771155e-06, + "loss": 1.0773, + "step": 3880 + }, + { + "epoch": 0.64, + "learning_rate": 9.59660842754368e-06, + "loss": 1.065, + "step": 3890 + }, + { + "epoch": 0.64, + "learning_rate": 9.594467283316204e-06, + "loss": 1.0404, + "step": 3900 + }, + { + "epoch": 0.64, + "learning_rate": 9.59232613908873e-06, + "loss": 1.0717, + "step": 3910 + }, + { + "epoch": 0.64, + "learning_rate": 9.590184994861255e-06, + "loss": 1.0667, + "step": 3920 + }, + { + "epoch": 0.65, + "learning_rate": 9.588043850633779e-06, + "loss": 1.0603, + "step": 3930 + }, + { + "epoch": 0.65, + "learning_rate": 9.585902706406304e-06, + "loss": 1.0452, + "step": 3940 + }, + { + "epoch": 0.65, + "learning_rate": 9.58376156217883e-06, + "loss": 1.0681, + "step": 3950 + }, + { + "epoch": 0.65, + "learning_rate": 9.581620417951353e-06, + "loss": 1.075, + "step": 3960 + }, + { + "epoch": 0.65, + "learning_rate": 9.579479273723879e-06, + "loss": 1.0735, + "step": 3970 + }, + { + "epoch": 0.65, + "learning_rate": 9.577338129496404e-06, + "loss": 1.0859, + "step": 3980 + }, + { + "epoch": 0.66, + "learning_rate": 9.57519698526893e-06, + "loss": 1.0498, + "step": 3990 + }, + { + "epoch": 0.66, + "learning_rate": 9.573055841041453e-06, + "loss": 1.0353, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_multi_news_accuracy": 0.5627512384133357, + "eval_multi_news_loss": 1.9013671875, + "eval_multi_news_runtime": 374.2642, + "eval_multi_news_samples_per_second": 15.021, + "eval_multi_news_steps_per_second": 1.878, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_samsum_accuracy": 0.6499550678711631, + "eval_samsum_loss": 1.228515625, + "eval_samsum_runtime": 37.472, + "eval_samsum_samples_per_second": 21.83, + "eval_samsum_steps_per_second": 2.749, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_billsum_accuracy": 0.6559680786548813, + "eval_billsum_loss": 1.4189453125, + "eval_billsum_runtime": 204.7196, + "eval_billsum_samples_per_second": 15.968, + "eval_billsum_steps_per_second": 1.998, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_wmt2019_zh-en_accuracy": 0.605999539382773, + "eval_wmt2019_zh-en_loss": 1.8330078125, + "eval_wmt2019_zh-en_runtime": 43.2969, + "eval_wmt2019_zh-en_samples_per_second": 23.096, + "eval_wmt2019_zh-en_steps_per_second": 2.887, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_ted_trans_en-ja_accuracy": 0.591362074351765, + "eval_ted_trans_en-ja_loss": 1.7236328125, + "eval_ted_trans_en-ja_runtime": 36.1634, + "eval_ted_trans_en-ja_samples_per_second": 22.149, + "eval_ted_trans_en-ja_steps_per_second": 2.793, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_ted_trans_zh-ja_accuracy": 0.4934623430962343, + "eval_ted_trans_zh-ja_loss": 2.33984375, + "eval_ted_trans_zh-ja_runtime": 2.8371, + "eval_ted_trans_zh-ja_samples_per_second": 14.804, + "eval_ted_trans_zh-ja_steps_per_second": 2.115, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_sharegpt_accuracy": 0.7566927466258041, + "eval_sharegpt_loss": 0.90234375, + "eval_sharegpt_runtime": 732.9729, + "eval_sharegpt_samples_per_second": 4.568, + "eval_sharegpt_steps_per_second": 0.572, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_dolly15k_accuracy": 0.5929618969298246, + "eval_dolly15k_loss": 1.65625, + "eval_dolly15k_runtime": 33.6288, + "eval_dolly15k_samples_per_second": 22.332, + "eval_dolly15k_steps_per_second": 2.795, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_ikala_accuracy": 0.7508814779446873, + "eval_ikala_loss": 0.91015625, + "eval_ikala_runtime": 887.5958, + "eval_ikala_samples_per_second": 16.005, + "eval_ikala_steps_per_second": 2.001, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_oasst_export_accuracy": 0.6615836827915093, + "eval_oasst_export_loss": 1.57421875, + "eval_oasst_export_runtime": 134.7449, + "eval_oasst_export_samples_per_second": 15.578, + "eval_oasst_export_steps_per_second": 1.952, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_joke_accuracy": 0.535538286580743, + "eval_joke_loss": 1.9736328125, + "eval_joke_runtime": 3.6334, + "eval_joke_samples_per_second": 20.917, + "eval_joke_steps_per_second": 2.752, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_gsm8k_accuracy": 0.7943020382952439, + "eval_gsm8k_loss": 0.74560546875, + "eval_gsm8k_runtime": 57.4917, + "eval_gsm8k_samples_per_second": 22.942, + "eval_gsm8k_steps_per_second": 2.87, + "step": 4000 + }, + { + "epoch": 0.66, + "eval_webgpt_accuracy": 0.49873483307959016, + "eval_webgpt_loss": 2.19140625, + "eval_webgpt_runtime": 155.9655, + "eval_webgpt_samples_per_second": 22.845, + "eval_webgpt_steps_per_second": 2.86, + "step": 4000 + }, + { + "epoch": 0.66, + "learning_rate": 9.570914696813978e-06, + "loss": 1.0657, + "step": 4010 + }, + { + "epoch": 0.66, + "learning_rate": 9.568773552586504e-06, + "loss": 1.0743, + "step": 4020 + }, + { + "epoch": 0.66, + "learning_rate": 9.566632408359027e-06, + "loss": 1.0543, + "step": 4030 + }, + { + "epoch": 0.66, + "learning_rate": 9.564491264131553e-06, + "loss": 1.0457, + "step": 4040 + }, + { + "epoch": 0.67, + "learning_rate": 9.562350119904078e-06, + "loss": 1.0546, + "step": 4050 + }, + { + "epoch": 0.67, + "learning_rate": 9.560208975676602e-06, + "loss": 1.0485, + "step": 4060 + }, + { + "epoch": 0.67, + "learning_rate": 9.558067831449127e-06, + "loss": 1.0535, + "step": 4070 + }, + { + "epoch": 0.67, + "learning_rate": 9.555926687221651e-06, + "loss": 1.0603, + "step": 4080 + }, + { + "epoch": 0.67, + "learning_rate": 9.553785542994178e-06, + "loss": 1.0444, + "step": 4090 + }, + { + "epoch": 0.67, + "learning_rate": 9.551644398766702e-06, + "loss": 1.0482, + "step": 4100 + }, + { + "epoch": 0.68, + "learning_rate": 9.549503254539227e-06, + "loss": 1.0509, + "step": 4110 + }, + { + "epoch": 0.68, + "learning_rate": 9.54736211031175e-06, + "loss": 1.036, + "step": 4120 + }, + { + "epoch": 0.68, + "learning_rate": 9.545220966084276e-06, + "loss": 1.0457, + "step": 4130 + }, + { + "epoch": 0.68, + "learning_rate": 9.543079821856801e-06, + "loss": 1.065, + "step": 4140 + }, + { + "epoch": 0.68, + "learning_rate": 9.540938677629325e-06, + "loss": 1.0441, + "step": 4150 + }, + { + "epoch": 0.68, + "learning_rate": 9.53879753340185e-06, + "loss": 1.047, + "step": 4160 + }, + { + "epoch": 0.68, + "learning_rate": 9.536656389174376e-06, + "loss": 1.05, + "step": 4170 + }, + { + "epoch": 0.69, + "learning_rate": 9.5345152449469e-06, + "loss": 1.0615, + "step": 4180 + }, + { + "epoch": 0.69, + "learning_rate": 9.532374100719425e-06, + "loss": 1.0575, + "step": 4190 + }, + { + "epoch": 0.69, + "learning_rate": 9.53023295649195e-06, + "loss": 1.0614, + "step": 4200 + }, + { + "epoch": 0.69, + "learning_rate": 9.528091812264476e-06, + "loss": 1.0504, + "step": 4210 + }, + { + "epoch": 0.69, + "learning_rate": 9.525950668037e-06, + "loss": 1.0401, + "step": 4220 + }, + { + "epoch": 0.69, + "learning_rate": 9.524023638232272e-06, + "loss": 1.0376, + "step": 4230 + }, + { + "epoch": 0.7, + "learning_rate": 9.521882494004797e-06, + "loss": 1.0265, + "step": 4240 + }, + { + "epoch": 0.7, + "learning_rate": 9.519741349777321e-06, + "loss": 1.0636, + "step": 4250 + }, + { + "epoch": 0.7, + "learning_rate": 9.517600205549846e-06, + "loss": 1.059, + "step": 4260 + }, + { + "epoch": 0.7, + "learning_rate": 9.515459061322372e-06, + "loss": 1.0552, + "step": 4270 + }, + { + "epoch": 0.7, + "learning_rate": 9.513317917094897e-06, + "loss": 1.0577, + "step": 4280 + }, + { + "epoch": 0.7, + "learning_rate": 9.51117677286742e-06, + "loss": 1.034, + "step": 4290 + }, + { + "epoch": 0.71, + "learning_rate": 9.509035628639946e-06, + "loss": 1.0697, + "step": 4300 + }, + { + "epoch": 0.71, + "learning_rate": 9.506894484412471e-06, + "loss": 1.0392, + "step": 4310 + }, + { + "epoch": 0.71, + "learning_rate": 9.504753340184995e-06, + "loss": 1.0069, + "step": 4320 + }, + { + "epoch": 0.71, + "learning_rate": 9.50261219595752e-06, + "loss": 1.0583, + "step": 4330 + }, + { + "epoch": 0.71, + "learning_rate": 9.500471051730046e-06, + "loss": 1.0522, + "step": 4340 + }, + { + "epoch": 0.71, + "learning_rate": 9.49832990750257e-06, + "loss": 1.0315, + "step": 4350 + }, + { + "epoch": 0.72, + "learning_rate": 9.496188763275095e-06, + "loss": 1.057, + "step": 4360 + }, + { + "epoch": 0.72, + "learning_rate": 9.494047619047619e-06, + "loss": 1.0513, + "step": 4370 + }, + { + "epoch": 0.72, + "learning_rate": 9.491906474820146e-06, + "loss": 1.0342, + "step": 4380 + }, + { + "epoch": 0.72, + "learning_rate": 9.48976533059267e-06, + "loss": 1.0559, + "step": 4390 + }, + { + "epoch": 0.72, + "learning_rate": 9.487624186365195e-06, + "loss": 1.0377, + "step": 4400 + }, + { + "epoch": 0.72, + "learning_rate": 9.485483042137718e-06, + "loss": 1.0512, + "step": 4410 + }, + { + "epoch": 0.73, + "learning_rate": 9.483341897910244e-06, + "loss": 1.0439, + "step": 4420 + }, + { + "epoch": 0.73, + "learning_rate": 9.48120075368277e-06, + "loss": 1.0344, + "step": 4430 + }, + { + "epoch": 0.73, + "learning_rate": 9.479059609455293e-06, + "loss": 1.0343, + "step": 4440 + }, + { + "epoch": 0.73, + "learning_rate": 9.477132579650567e-06, + "loss": 1.0463, + "step": 4450 + }, + { + "epoch": 0.73, + "learning_rate": 9.47499143542309e-06, + "loss": 1.0443, + "step": 4460 + }, + { + "epoch": 0.73, + "learning_rate": 9.472850291195616e-06, + "loss": 1.0559, + "step": 4470 + }, + { + "epoch": 0.74, + "learning_rate": 9.47070914696814e-06, + "loss": 1.0555, + "step": 4480 + }, + { + "epoch": 0.74, + "learning_rate": 9.468568002740665e-06, + "loss": 1.0267, + "step": 4490 + }, + { + "epoch": 0.74, + "learning_rate": 9.46642685851319e-06, + "loss": 1.042, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_multi_news_accuracy": 0.5627252632519255, + "eval_multi_news_loss": 1.9013671875, + "eval_multi_news_runtime": 374.4153, + "eval_multi_news_samples_per_second": 15.015, + "eval_multi_news_steps_per_second": 1.878, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_samsum_accuracy": 0.6482050796954074, + "eval_samsum_loss": 1.2255859375, + "eval_samsum_runtime": 38.0285, + "eval_samsum_samples_per_second": 21.51, + "eval_samsum_steps_per_second": 2.708, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_billsum_accuracy": 0.6552127105772998, + "eval_billsum_loss": 1.4169921875, + "eval_billsum_runtime": 204.5119, + "eval_billsum_samples_per_second": 15.984, + "eval_billsum_steps_per_second": 2.0, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_wmt2019_zh-en_accuracy": 0.5987996483045988, + "eval_wmt2019_zh-en_loss": 1.8505859375, + "eval_wmt2019_zh-en_runtime": 43.4755, + "eval_wmt2019_zh-en_samples_per_second": 23.001, + "eval_wmt2019_zh-en_steps_per_second": 2.875, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_ted_trans_en-ja_accuracy": 0.5948422811429342, + "eval_ted_trans_en-ja_loss": 1.69140625, + "eval_ted_trans_en-ja_runtime": 36.0142, + "eval_ted_trans_en-ja_samples_per_second": 22.241, + "eval_ted_trans_en-ja_steps_per_second": 2.804, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_ted_trans_zh-ja_accuracy": 0.521213679609154, + "eval_ted_trans_zh-ja_loss": 2.265625, + "eval_ted_trans_zh-ja_runtime": 2.289, + "eval_ted_trans_zh-ja_samples_per_second": 18.349, + "eval_ted_trans_zh-ja_steps_per_second": 2.621, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_sharegpt_accuracy": 0.7602375296761273, + "eval_sharegpt_loss": 0.884765625, + "eval_sharegpt_runtime": 733.0075, + "eval_sharegpt_samples_per_second": 4.567, + "eval_sharegpt_steps_per_second": 0.572, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_dolly15k_accuracy": 0.5921566611842105, + "eval_dolly15k_loss": 1.65234375, + "eval_dolly15k_runtime": 33.7747, + "eval_dolly15k_samples_per_second": 22.236, + "eval_dolly15k_steps_per_second": 2.783, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_ikala_accuracy": 0.7515879865982705, + "eval_ikala_loss": 0.90576171875, + "eval_ikala_runtime": 884.7883, + "eval_ikala_samples_per_second": 16.056, + "eval_ikala_steps_per_second": 2.007, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_oasst_export_accuracy": 0.6615137668799588, + "eval_oasst_export_loss": 1.5693359375, + "eval_oasst_export_runtime": 134.5394, + "eval_oasst_export_samples_per_second": 15.601, + "eval_oasst_export_steps_per_second": 1.955, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_joke_accuracy": 0.5379075056861259, + "eval_joke_loss": 1.966796875, + "eval_joke_runtime": 4.5957, + "eval_joke_samples_per_second": 16.537, + "eval_joke_steps_per_second": 2.176, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_gsm8k_accuracy": 0.7959928968499074, + "eval_gsm8k_loss": 0.74072265625, + "eval_gsm8k_runtime": 57.0884, + "eval_gsm8k_samples_per_second": 23.105, + "eval_gsm8k_steps_per_second": 2.89, + "step": 4500 + }, + { + "epoch": 0.74, + "eval_webgpt_accuracy": 0.4989797040964437, + "eval_webgpt_loss": 2.19140625, + "eval_webgpt_runtime": 157.3673, + "eval_webgpt_samples_per_second": 22.641, + "eval_webgpt_steps_per_second": 2.834, + "step": 4500 + }, + { + "epoch": 0.74, + "learning_rate": 9.464285714285714e-06, + "loss": 1.0577, + "step": 4510 + }, + { + "epoch": 0.74, + "learning_rate": 9.46214457005824e-06, + "loss": 1.0465, + "step": 4520 + }, + { + "epoch": 0.74, + "learning_rate": 9.460003425830765e-06, + "loss": 1.0635, + "step": 4530 + }, + { + "epoch": 0.75, + "learning_rate": 9.457862281603289e-06, + "loss": 1.035, + "step": 4540 + }, + { + "epoch": 0.75, + "learning_rate": 9.455721137375814e-06, + "loss": 1.0623, + "step": 4550 + }, + { + "epoch": 0.75, + "learning_rate": 9.45357999314834e-06, + "loss": 1.0279, + "step": 4560 + }, + { + "epoch": 0.75, + "learning_rate": 9.451438848920865e-06, + "loss": 1.0287, + "step": 4570 + }, + { + "epoch": 0.75, + "learning_rate": 9.449297704693388e-06, + "loss": 1.0567, + "step": 4580 + }, + { + "epoch": 0.75, + "learning_rate": 9.447156560465914e-06, + "loss": 1.0246, + "step": 4590 + }, + { + "epoch": 0.76, + "learning_rate": 9.445015416238439e-06, + "loss": 1.0352, + "step": 4600 + }, + { + "epoch": 0.76, + "learning_rate": 9.442874272010963e-06, + "loss": 1.0493, + "step": 4610 + }, + { + "epoch": 0.76, + "learning_rate": 9.440733127783488e-06, + "loss": 1.0435, + "step": 4620 + }, + { + "epoch": 0.76, + "learning_rate": 9.438591983556014e-06, + "loss": 1.0418, + "step": 4630 + }, + { + "epoch": 0.76, + "learning_rate": 9.436450839328539e-06, + "loss": 1.0586, + "step": 4640 + }, + { + "epoch": 0.76, + "learning_rate": 9.434309695101063e-06, + "loss": 1.0041, + "step": 4650 + }, + { + "epoch": 0.77, + "learning_rate": 9.432168550873588e-06, + "loss": 1.0236, + "step": 4660 + }, + { + "epoch": 0.77, + "learning_rate": 9.430027406646113e-06, + "loss": 1.0377, + "step": 4670 + }, + { + "epoch": 0.77, + "learning_rate": 9.427886262418637e-06, + "loss": 1.0385, + "step": 4680 + }, + { + "epoch": 0.77, + "learning_rate": 9.425745118191162e-06, + "loss": 1.0418, + "step": 4690 + }, + { + "epoch": 0.77, + "learning_rate": 9.423603973963686e-06, + "loss": 1.0304, + "step": 4700 + }, + { + "epoch": 0.77, + "learning_rate": 9.421462829736211e-06, + "loss": 1.0376, + "step": 4710 + }, + { + "epoch": 0.78, + "learning_rate": 9.419321685508737e-06, + "loss": 1.0377, + "step": 4720 + }, + { + "epoch": 0.78, + "learning_rate": 9.41718054128126e-06, + "loss": 1.0451, + "step": 4730 + }, + { + "epoch": 0.78, + "learning_rate": 9.415039397053788e-06, + "loss": 1.0359, + "step": 4740 + }, + { + "epoch": 0.78, + "learning_rate": 9.412898252826311e-06, + "loss": 1.0375, + "step": 4750 + }, + { + "epoch": 0.78, + "learning_rate": 9.410757108598837e-06, + "loss": 1.0575, + "step": 4760 + }, + { + "epoch": 0.78, + "learning_rate": 9.40861596437136e-06, + "loss": 1.042, + "step": 4770 + }, + { + "epoch": 0.79, + "learning_rate": 9.406474820143886e-06, + "loss": 1.0405, + "step": 4780 + }, + { + "epoch": 0.79, + "learning_rate": 9.404333675916411e-06, + "loss": 1.0538, + "step": 4790 + }, + { + "epoch": 0.79, + "learning_rate": 9.402192531688935e-06, + "loss": 1.0168, + "step": 4800 + }, + { + "epoch": 0.79, + "learning_rate": 9.40005138746146e-06, + "loss": 1.0406, + "step": 4810 + }, + { + "epoch": 0.79, + "learning_rate": 9.397910243233985e-06, + "loss": 1.0419, + "step": 4820 + }, + { + "epoch": 0.79, + "learning_rate": 9.39576909900651e-06, + "loss": 1.0249, + "step": 4830 + }, + { + "epoch": 0.79, + "learning_rate": 9.393627954779035e-06, + "loss": 1.0455, + "step": 4840 + }, + { + "epoch": 0.8, + "learning_rate": 9.39148681055156e-06, + "loss": 1.0314, + "step": 4850 + }, + { + "epoch": 0.8, + "learning_rate": 9.389345666324085e-06, + "loss": 1.0365, + "step": 4860 + }, + { + "epoch": 0.8, + "learning_rate": 9.387204522096609e-06, + "loss": 1.0503, + "step": 4870 + }, + { + "epoch": 0.8, + "learning_rate": 9.385063377869134e-06, + "loss": 1.0134, + "step": 4880 + }, + { + "epoch": 0.8, + "learning_rate": 9.38292223364166e-06, + "loss": 1.0655, + "step": 4890 + }, + { + "epoch": 0.8, + "learning_rate": 9.380781089414183e-06, + "loss": 1.0403, + "step": 4900 + }, + { + "epoch": 0.81, + "learning_rate": 9.378639945186709e-06, + "loss": 1.042, + "step": 4910 + }, + { + "epoch": 0.81, + "learning_rate": 9.376498800959234e-06, + "loss": 1.0564, + "step": 4920 + }, + { + "epoch": 0.81, + "learning_rate": 9.374357656731758e-06, + "loss": 1.0469, + "step": 4930 + }, + { + "epoch": 0.81, + "learning_rate": 9.372216512504283e-06, + "loss": 1.0323, + "step": 4940 + }, + { + "epoch": 0.81, + "learning_rate": 9.370075368276807e-06, + "loss": 1.0434, + "step": 4950 + }, + { + "epoch": 0.81, + "learning_rate": 9.367934224049334e-06, + "loss": 1.0474, + "step": 4960 + }, + { + "epoch": 0.82, + "learning_rate": 9.365793079821858e-06, + "loss": 1.0576, + "step": 4970 + }, + { + "epoch": 0.82, + "learning_rate": 9.363651935594383e-06, + "loss": 1.0588, + "step": 4980 + }, + { + "epoch": 0.82, + "learning_rate": 9.361510791366907e-06, + "loss": 1.0369, + "step": 4990 + }, + { + "epoch": 0.82, + "learning_rate": 9.359369647139432e-06, + "loss": 1.0159, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_multi_news_accuracy": 0.5636508659548628, + "eval_multi_news_loss": 1.9013671875, + "eval_multi_news_runtime": 374.2666, + "eval_multi_news_samples_per_second": 15.021, + "eval_multi_news_steps_per_second": 1.878, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_samsum_accuracy": 0.6542117958662441, + "eval_samsum_loss": 1.22265625, + "eval_samsum_runtime": 37.6015, + "eval_samsum_samples_per_second": 21.754, + "eval_samsum_steps_per_second": 2.739, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_billsum_accuracy": 0.6560928552644996, + "eval_billsum_loss": 1.412109375, + "eval_billsum_runtime": 204.697, + "eval_billsum_samples_per_second": 15.97, + "eval_billsum_steps_per_second": 1.998, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_wmt2019_zh-en_accuracy": 0.6045209655463313, + "eval_wmt2019_zh-en_loss": 1.826171875, + "eval_wmt2019_zh-en_runtime": 43.7238, + "eval_wmt2019_zh-en_samples_per_second": 22.871, + "eval_wmt2019_zh-en_steps_per_second": 2.859, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_ted_trans_en-ja_accuracy": 0.5986221480612599, + "eval_ted_trans_en-ja_loss": 1.6884765625, + "eval_ted_trans_en-ja_runtime": 35.5277, + "eval_ted_trans_en-ja_samples_per_second": 22.546, + "eval_ted_trans_en-ja_steps_per_second": 2.843, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_ted_trans_zh-ja_accuracy": 0.49819293855991104, + "eval_ted_trans_zh-ja_loss": 2.29296875, + "eval_ted_trans_zh-ja_runtime": 2.6448, + "eval_ted_trans_zh-ja_samples_per_second": 15.88, + "eval_ted_trans_zh-ja_steps_per_second": 2.269, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_sharegpt_accuracy": 0.7644941905120852, + "eval_sharegpt_loss": 0.86474609375, + "eval_sharegpt_runtime": 732.8989, + "eval_sharegpt_samples_per_second": 4.568, + "eval_sharegpt_steps_per_second": 0.572, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_dolly15k_accuracy": 0.5935786732456141, + "eval_dolly15k_loss": 1.6474609375, + "eval_dolly15k_runtime": 33.5372, + "eval_dolly15k_samples_per_second": 22.393, + "eval_dolly15k_steps_per_second": 2.803, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_ikala_accuracy": 0.7535167297168253, + "eval_ikala_loss": 0.89990234375, + "eval_ikala_runtime": 887.8229, + "eval_ikala_samples_per_second": 16.001, + "eval_ikala_steps_per_second": 2.0, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_oasst_export_accuracy": 0.6621677719692545, + "eval_oasst_export_loss": 1.5673828125, + "eval_oasst_export_runtime": 134.0182, + "eval_oasst_export_samples_per_second": 15.662, + "eval_oasst_export_steps_per_second": 1.962, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_joke_accuracy": 0.5397081122062168, + "eval_joke_loss": 1.966796875, + "eval_joke_runtime": 4.7628, + "eval_joke_samples_per_second": 15.957, + "eval_joke_steps_per_second": 2.1, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_gsm8k_accuracy": 0.7978304508956146, + "eval_gsm8k_loss": 0.73388671875, + "eval_gsm8k_runtime": 58.2683, + "eval_gsm8k_samples_per_second": 22.637, + "eval_gsm8k_steps_per_second": 2.832, + "step": 5000 + }, + { + "epoch": 0.82, + "eval_webgpt_accuracy": 0.4992032819813969, + "eval_webgpt_loss": 2.189453125, + "eval_webgpt_runtime": 158.2652, + "eval_webgpt_samples_per_second": 22.513, + "eval_webgpt_steps_per_second": 2.818, + "step": 5000 + } + ], + "max_steps": 48704, + "num_train_epochs": 8, + "total_flos": 3017973738504192.0, + "trial_name": null, + "trial_params": null +}