besimray committed on
Commit
89b64bd
1 Parent(s): 9a151ef

Training in progress, step 260, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0b38996684554d1f09b04a3e5ab2b0b03df762b983defcab7835951fdacba51
3
  size 90207248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05f6ed908b6c0a5b0bf09c0573053a4af39c4c92f5bf61fe05d9df2519c97031
3
  size 90207248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31eea40405f95b3ea2b2b047c33bd41db6558f1265fd847d6fd69497f9cc08db
3
- size 46057082
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e3c04601c84fd5ba7eeb5da4ff1003918f386a2d1833589198a44ba65f0d8d
3
+ size 46057338
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e0f15e8250e550076d6f480971ea58fe35e1e1ccca0d097b04c361be146de54
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b90b2977b40565c7b47786677726b21587b954d614cb5eddcc13e2d79ccfddfd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11fce6896214284d141064893064ad6e844c1b7b446de9ee050ea3044b9b2a3b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24163fa211a98380eaaf8162c38702de00babc1b46887461983dfe21c7fd7b23
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.151181697845459,
3
  "best_model_checkpoint": "miner_id_besimray/checkpoint-80",
4
- "epoch": 5.052631578947368,
5
  "eval_steps": 20,
6
- "global_step": 240,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1791,6 +1791,154 @@
1791
  "eval_samples_per_second": 6.623,
1792
  "eval_steps_per_second": 0.662,
1793
  "step": 240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1794
  }
1795
  ],
1796
  "logging_steps": 1,
@@ -1805,7 +1953,7 @@
1805
  "early_stopping_threshold": 0.0
1806
  },
1807
  "attributes": {
1808
- "early_stopping_patience_counter": 8
1809
  }
1810
  },
1811
  "TrainerControl": {
@@ -1819,7 +1967,7 @@
1819
  "attributes": {}
1820
  }
1821
  },
1822
- "total_flos": 2.349128604450816e+17,
1823
  "train_batch_size": 10,
1824
  "trial_name": null,
1825
  "trial_params": null
 
1
  {
2
  "best_metric": 1.151181697845459,
3
  "best_model_checkpoint": "miner_id_besimray/checkpoint-80",
4
+ "epoch": 5.473684210526316,
5
  "eval_steps": 20,
6
+ "global_step": 260,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1791
  "eval_samples_per_second": 6.623,
1792
  "eval_steps_per_second": 0.662,
1793
  "step": 240
1794
+ },
1795
+ {
1796
+ "epoch": 5.073684210526316,
1797
+ "grad_norm": 0.4180259704589844,
1798
+ "learning_rate": 0.00019883026221137652,
1799
+ "loss": 0.526,
1800
+ "step": 241
1801
+ },
1802
+ {
1803
+ "epoch": 5.094736842105263,
1804
+ "grad_norm": 0.42054420709609985,
1805
+ "learning_rate": 0.00019882013269579584,
1806
+ "loss": 0.4412,
1807
+ "step": 242
1808
+ },
1809
+ {
1810
+ "epoch": 5.11578947368421,
1811
+ "grad_norm": 0.5846607089042664,
1812
+ "learning_rate": 0.00019880995977034584,
1813
+ "loss": 0.5306,
1814
+ "step": 243
1815
+ },
1816
+ {
1817
+ "epoch": 5.136842105263158,
1818
+ "grad_norm": 0.6321561932563782,
1819
+ "learning_rate": 0.00019879974343949526,
1820
+ "loss": 0.575,
1821
+ "step": 244
1822
+ },
1823
+ {
1824
+ "epoch": 5.157894736842105,
1825
+ "grad_norm": 0.48956233263015747,
1826
+ "learning_rate": 0.00019878948370773193,
1827
+ "loss": 0.4667,
1828
+ "step": 245
1829
+ },
1830
+ {
1831
+ "epoch": 5.178947368421053,
1832
+ "grad_norm": 0.49197542667388916,
1833
+ "learning_rate": 0.00019877918057956278,
1834
+ "loss": 0.473,
1835
+ "step": 246
1836
+ },
1837
+ {
1838
+ "epoch": 5.2,
1839
+ "grad_norm": 0.5268818736076355,
1840
+ "learning_rate": 0.00019876883405951377,
1841
+ "loss": 0.6249,
1842
+ "step": 247
1843
+ },
1844
+ {
1845
+ "epoch": 5.221052631578948,
1846
+ "grad_norm": 0.4883573651313782,
1847
+ "learning_rate": 0.00019875844415212997,
1848
+ "loss": 0.5239,
1849
+ "step": 248
1850
+ },
1851
+ {
1852
+ "epoch": 5.242105263157895,
1853
+ "grad_norm": 0.45860010385513306,
1854
+ "learning_rate": 0.00019874801086197544,
1855
+ "loss": 0.5462,
1856
+ "step": 249
1857
+ },
1858
+ {
1859
+ "epoch": 5.2631578947368425,
1860
+ "grad_norm": 0.41302675008773804,
1861
+ "learning_rate": 0.00019873753419363336,
1862
+ "loss": 0.5144,
1863
+ "step": 250
1864
+ },
1865
+ {
1866
+ "epoch": 5.284210526315789,
1867
+ "grad_norm": 0.550791323184967,
1868
+ "learning_rate": 0.00019872701415170593,
1869
+ "loss": 0.5071,
1870
+ "step": 251
1871
+ },
1872
+ {
1873
+ "epoch": 5.3052631578947365,
1874
+ "grad_norm": 0.4419604539871216,
1875
+ "learning_rate": 0.00019871645074081434,
1876
+ "loss": 0.4598,
1877
+ "step": 252
1878
+ },
1879
+ {
1880
+ "epoch": 5.326315789473684,
1881
+ "grad_norm": 0.5271047353744507,
1882
+ "learning_rate": 0.00019870584396559902,
1883
+ "loss": 0.5444,
1884
+ "step": 253
1885
+ },
1886
+ {
1887
+ "epoch": 5.347368421052631,
1888
+ "grad_norm": 0.4978967308998108,
1889
+ "learning_rate": 0.00019869519383071928,
1890
+ "loss": 0.5829,
1891
+ "step": 254
1892
+ },
1893
+ {
1894
+ "epoch": 5.368421052631579,
1895
+ "grad_norm": 0.5046519041061401,
1896
+ "learning_rate": 0.00019868450034085352,
1897
+ "loss": 0.5343,
1898
+ "step": 255
1899
+ },
1900
+ {
1901
+ "epoch": 5.389473684210526,
1902
+ "grad_norm": 0.5924373865127563,
1903
+ "learning_rate": 0.0001986737635006992,
1904
+ "loss": 0.514,
1905
+ "step": 256
1906
+ },
1907
+ {
1908
+ "epoch": 5.410526315789474,
1909
+ "grad_norm": 0.47235432267189026,
1910
+ "learning_rate": 0.00019866298331497283,
1911
+ "loss": 0.4899,
1912
+ "step": 257
1913
+ },
1914
+ {
1915
+ "epoch": 5.431578947368421,
1916
+ "grad_norm": 0.49679791927337646,
1917
+ "learning_rate": 0.0001986521597884099,
1918
+ "loss": 0.5483,
1919
+ "step": 258
1920
+ },
1921
+ {
1922
+ "epoch": 5.4526315789473685,
1923
+ "grad_norm": 0.4871433973312378,
1924
+ "learning_rate": 0.00019864129292576505,
1925
+ "loss": 0.5544,
1926
+ "step": 259
1927
+ },
1928
+ {
1929
+ "epoch": 5.473684210526316,
1930
+ "grad_norm": 0.5678947567939758,
1931
+ "learning_rate": 0.00019863038273181186,
1932
+ "loss": 0.5298,
1933
+ "step": 260
1934
+ },
1935
+ {
1936
+ "epoch": 5.473684210526316,
1937
+ "eval_loss": 1.5484461784362793,
1938
+ "eval_runtime": 15.0951,
1939
+ "eval_samples_per_second": 6.625,
1940
+ "eval_steps_per_second": 0.662,
1941
+ "step": 260
1942
  }
1943
  ],
1944
  "logging_steps": 1,
 
1953
  "early_stopping_threshold": 0.0
1954
  },
1955
  "attributes": {
1956
+ "early_stopping_patience_counter": 9
1957
  }
1958
  },
1959
  "TrainerControl": {
 
1967
  "attributes": {}
1968
  }
1969
  },
1970
+ "total_flos": 2.544889321488384e+17,
1971
  "train_batch_size": 10,
1972
  "trial_name": null,
1973
  "trial_params": null