victormiller committed on
Commit
ba24833
1 Parent(s): 7c015e6

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +1 -1
main.py CHANGED
@@ -788,7 +788,7 @@ def intro():
788
  B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
789
  ),
790
  P(
791
- "Building on top of the prior studies on pre-training data,"
792
  D_cite(bibtex_key="refinedweb"), D_cite(bibtex_key="fineweb"), D_cite(bibtex_key="c4"), D_cite(bibtex_key="muennighoff2023scaling"),
793
  "TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
794
  ),
 
788
  B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
789
  ),
790
  P(
791
+ "Building on top of the prior studies on pre-training data,",
792
  D_cite(bibtex_key="refinedweb"), D_cite(bibtex_key="fineweb"), D_cite(bibtex_key="c4"), D_cite(bibtex_key="muennighoff2023scaling"),
793
  "TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
794
  ),