omkarenator committed
Commit 3c38447
1 Parent(s): e0a8eb9

checkpointing

Files changed (2)
  1. main.py +182 -155
  2. style.css +5 -1
main.py CHANGED
@@ -1,20 +1,20 @@
-from fasthtml_hf import setup_hf_backup
 from fasthtml.common import *
 from fasthtml.components import *
 from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
 
 
-app, rt = fast_app()
+app, rt = fast_app(live=True)
 
 
-@rt("/")
-def get():
+@app.get("/")
+def main():
     return Html(
         Head(
             Meta(charset="UTF-8"),
             Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
             Link(rel="stylesheet", href="style.css"),
             Script(src="https://distill.pub/template.v2.js"),
+            Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
         ),
         Body(
             D_title(
@@ -22,171 +22,198 @@ def get():
                 "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
                 cls="l-body",
                 style="text-align: center;",
-            )
+            ),
+            Div(
+                Img(src="images/llm360_logo.png"),
+                id="title-plot",
+                cls="main-plot-container l-page",
+            ),
         ),
         D_article(
             D_contents(
                 Nav(
                     H3("Table of Contents"),
-                    Div(A("TxT360")),
-                    Ul(
-                        Li(A("Introduction", href="#section1")),
-                        Li(A("Background", href="#section2")),
-                        Li(A("Main Content", href="#section3")),
-                        Li(A("Conclusion", href="#section4")),
+                    Div(
+                        A("TxT360", href="#section1"),
+                        hx_get="/intro",
+                        hx_target="#inner-text",
+                        hx_swap="innerHTML",
                     ),
-                    Div(A("Web Data", href="#section5")),
-                    Div(A("Curated Sources", href="#section3")),
-                    Div(A("Common Steps", href="#section4")),
-                    Div(A("TxT360 Results", href="#section4")),
-                    role="navigation",
-                    cls="l-text figcaption",
-                ),
-            ),
-            Div(
-                Section(
-                    H2("Introduction"),
-                    P("""We are excited to introduce TxT360, a
-                        large-scale, comprehensive, and fully transparent
-                        dataset designed for Large Language Model (LLM)
-                        pre-training. TxT360 is engineered to strike a
-                        balance between the quantity and quality of
-                        pre-training data, pushing the limit on both
-                        fronts. This comprehensive dataset encompasses both
-                        expansive web-based data and highly curated data
-                        sources, making it one of the most robust LLM
-                        pre-training corpora available today. Our web data
-                        component includes 99 snapshots from Common Crawl,
-                        amassing 5.7 trillion tokens and occupying 11 TB of
-                        disk space in jsonl.gz format. On the curated side,
-                        TxT360 integrates one of the most extensive
-                        collections of high-quality sources across multiple
-                        domains, ensuring diverse and rich content referred
-                        to as curated sources, 14 sources across 10
-                        domains. To maintain the highest quality, we
-                        meticulously pre-processed the web data to filter
-                        out low-quality content and conducted thorough
-                        reviews of the curated sources. This process not
-                        only unified their formats but also identified and
-                        rectified any anomalies. Not only do we 100%
-                        open-source our processing scripts, but we also
-                        release the details of our data reviews, revealing
-                        the decision-making processes behind data selection
-                        and quality assurance. This level of transparency
-                        allows researchers and practitioners to fully
-                        understand the dataset’s composition and make
-                        informed decisions when using TxT360 for training.
-                        Additionally, TxT360 includes detailed
-                        documentation and analysis of the data, covering
-                        distribution statistics, domain coverage, and
-                        processing pipeline, which helps users navigate and
-                        utilize the dataset effectively. Overall, TxT360
-                        represents a significant step forward in the
-                        availability and transparency of large-scale
-                        training data for language models, setting a new
-                        standard for dataset quality and openness."""),
-                    id="section1",
-                ),
-                Section(
-                    H2("Background"),
-                    P(
-                        """ The quality and size of a pre-training dataset
-                        play a crucial role in the performance of large
-                        language models (LLMs). The community has
-                        introduced a variety of datasets for this purpose,
-                        including purely web-based datasets like RefinedWeb
-                        [1], RedPajama-Data-V2 [2], DCLM [3], and
-                        FineWeb [4], as well as comprehensive datasets
-                        derived from multiple highly-curated data sources
-                        such as The Pile [5], RedPajama-Data-V1 [6], and
-                        Dolma [7] . It is commonly known that web-based
-                        datasets provide a vast quantity of data, while
-                        highly-curated multi-source datasets consistently
-                        deliver high quality and diversity, both critical
-                        for effective LLM pre-training. However, despite
-                        the advancements in both types of data, each type
-                        of dataset has its limitations. For instance, the
-                        processing scripts for the web dataset, RefinedWeb,
-                        known for its high quality, are not public, and
-                        only about 10% of the entire dataset has been
-                        disclosed. Conversely, the web component of
-                        existing highly-curated multi-source datasets is
-                        relatively small compared to purely web-based
-                        datasets, limiting their coverage and diversity
-                        compared to the scale of information from the
-                        internet. By integrating the extensive reach of
-                        web data with the exceptional quality of curated
-                        sources, TxT360 is crafted to meet and surpass the
-                        rigorous standards required for state-of-the-art
-                        LLM pre-training. """
+                    Div(
+                        Ul(
+                            Li(A("Introduction", href="#section1")),
+                            Li(A("Background", href="#section2")),
+                            Li(A("Main Content", href="#section3")),
+                            Li(A("Conclusion", href="#section4")),
+                        ),
+                        hx_get="/intro",
+                        hx_target="#inner-text",
+                        hx_swap="innerHTML",
                     ),
-                    id="section2",
-                ),
-                Section(
-                    H2("Main Content"),
-                    P(
-                        """The performance of a large language model (LLM)
-                        depends heavily on the quality and size of its
-                        pretraining dataset. However, the pretraining
-                        datasets for state-of-the-art open LLMs like Llama
-                        3 and Mixtral are not publicly available and very
-                        little is known about how they were created.
-                        Reading time: 45 min. For the best reading
-                        experience, we recommend not using a mobile phone.
-                        Recently, we released 🍷 FineWeb, a new,
-                        large-scale (15-trillion tokens, 44TB disk space)
-                        dataset for LLM pretraining. FineWeb is derived
-                        from 96 CommonCrawl snapshots and produces
-                        better-performing LLMs than other open pretraining
-                        datasets. To bring more clarity in machine learning
-                        and advance the open understanding of how to train
-                        good quality large language models, we carefully
-                        documented and ablated all of the design choices
-                        used in FineWeb, including in-depth investigations
-                        of deduplication and filtering strategies. The
-                        present long form report is a deep dive in how to
-                        create a large and high-quality web-scale dataset
-                        for LLM pretraining. The dataset itself, 🍷
-                        FineWeb, is available here. We are extremely
-                        thankful to the whole distill.pub team (Christopher
-                        Olah, Shan Carter, Ludwig Schubert in particular)
-                        for creating the template on which we based this
-                        blog post. Thanks also for inspiring us with
-                        exquisitely crafted articles and blog posts. In
-                        this report we also introduce 📚 FineWeb-Edu, a
-                        subset of FineWeb constructed using scalable
-                        automated high-quality annotations for educational
-                        value, and which outperforms all openly accessible
-                        web-datasets on a number of educational benchmarks
-                        such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
-                        is available in two sizes/filtering-level: 1.3
-                        trillion (very high educational content) and 5.4
-                        trillion (high educational content) tokens (all
-                        tokens are measured with GPT2 tokenizer). You can
-                        download it here. Both datasets are released under
-                        the permissive ODC-By 1.0 license TLDR: This blog
-                        covers a discussion on processing and evaluating
-                        data quality at scale, the 🍷 FineWeb recipe
-                        (listing and explaining all of our design choices),
-                        and the process followed to create its 📚
-                        FineWeb-Edu subset."""
+                    Div(
+                        A("Web Data", href="#inner-text"),
+                        hx_get="/web_data",
+                        hx_target="#inner-text",
+                        hx_swap="innerHTML",
                     ),
-                    id="section3",
-                ),
-                Section(
-                    H2("Conclusion"),
-                    P("""This is the conclusion section where we
-                        summarize the key points discussed in the blog post
-                        and provide final thoughts.
-                        """),
-                    id="section4",
+                    Div(A("Curated Sources")),
+                    Div(A("Common Steps")),
+                    Div(A("TxT360 Results")),
+                    role="navigation",
+                    cls="l-text figcaption",
                 ),
             ),
+            intro(),
         ),
     ),
     lang="en",
 )
 
 
-setup_hf_backup(app)
+@app.get("/intro")
+def intro():
+    return Div(
+        Section(
+            H2("Introduction"),
+            P("""We are excited to introduce TxT360, a
+                large-scale, comprehensive, and fully transparent
+                dataset designed for Large Language Model (LLM)
+                pre-training. TxT360 is engineered to strike a
+                balance between the quantity and quality of
+                pre-training data, pushing the limit on both
+                fronts. This comprehensive dataset encompasses both
+                expansive web-based data and highly curated data
+                sources, making it one of the most robust LLM
+                pre-training corpora available today. Our web data
+                component includes 99 snapshots from Common Crawl,
+                amassing 5.7 trillion tokens and occupying 11 TB of
+                disk space in jsonl.gz format. On the curated side,
+                TxT360 integrates one of the most extensive
+                collections of high-quality sources across multiple
+                domains, ensuring diverse and rich content referred
+                to as curated sources, 14 sources across 10
+                domains. To maintain the highest quality, we
+                meticulously pre-processed the web data to filter
+                out low-quality content and conducted thorough
+                reviews of the curated sources. This process not
+                only unified their formats but also identified and
+                rectified any anomalies. Not only do we 100%
+                open-source our processing scripts, but we also
+                release the details of our data reviews, revealing
+                the decision-making processes behind data selection
+                and quality assurance. This level of transparency
+                allows researchers and practitioners to fully
+                understand the dataset’s composition and make
+                informed decisions when using TxT360 for training.
+                Additionally, TxT360 includes detailed
+                documentation and analysis of the data, covering
+                distribution statistics, domain coverage, and
+                processing pipeline, which helps users navigate and
+                utilize the dataset effectively. Overall, TxT360
+                represents a significant step forward in the
+                availability and transparency of large-scale
+                training data for language models, setting a new
+                standard for dataset quality and openness."""),
+            id="section1",
+        ),
+        Section(
+            H2("Background"),
+            P(
+                """ The quality and size of a pre-training dataset
+                play a crucial role in the performance of large
+                language models (LLMs). The community has
+                introduced a variety of datasets for this purpose,
+                including purely web-based datasets like RefinedWeb
+                [1], RedPajama-Data-V2 [2], DCLM [3], and
+                FineWeb [4], as well as comprehensive datasets
+                derived from multiple highly-curated data sources
+                such as The Pile [5], RedPajama-Data-V1 [6], and
+                Dolma [7] . It is commonly known that web-based
+                datasets provide a vast quantity of data, while
+                highly-curated multi-source datasets consistently
+                deliver high quality and diversity, both critical
+                for effective LLM pre-training. However, despite
+                the advancements in both types of data, each type
+                of dataset has its limitations. For instance, the
+                processing scripts for the web dataset, RefinedWeb,
+                known for its high quality, are not public, and
+                only about 10% of the entire dataset has been
+                disclosed. Conversely, the web component of
+                existing highly-curated multi-source datasets is
+                relatively small compared to purely web-based
+                datasets, limiting their coverage and diversity
+                compared to the scale of information from the
+                internet. By integrating the extensive reach of
+                web data with the exceptional quality of curated
+                sources, TxT360 is crafted to meet and surpass the
+                rigorous standards required for state-of-the-art
+                LLM pre-training. """
+            ),
+            id="section2",
+        ),
+        Section(
+            H2("Main Content"),
+            P("""The performance of a large language model (LLM)
+                depends heavily on the quality and size of its
+                pretraining dataset. However, the pretraining
+                datasets for state-of-the-art open LLMs like Llama
+                3 and Mixtral are not publicly available and very
+                little is known about how they were created.
+                Reading time: 45 min. For the best reading
+                experience, we recommend not using a mobile phone.
+                Recently, we released 🍷 FineWeb, a new,
+                large-scale (15-trillion tokens, 44TB disk space)
+                dataset for LLM pretraining. FineWeb is derived
+                from 96 CommonCrawl snapshots and produces
+                better-performing LLMs than other open pretraining
+                datasets. To bring more clarity in machine learning
+                and advance the open understanding of how to train
+                good quality large language models, we carefully
+                documented and ablated all of the design choices
+                used in FineWeb, including in-depth investigations
+                of deduplication and filtering strategies. The
+                present long form report is a deep dive in how to
+                create a large and high-quality web-scale dataset
+                for LLM pretraining. The dataset itself, 🍷
+                FineWeb, is available here. We are extremely
+                thankful to the whole distill.pub team (Christopher
+                Olah, Shan Carter, Ludwig Schubert in particular)
+                for creating the template on which we based this
+                blog post. Thanks also for inspiring us with
+                exquisitely crafted articles and blog posts. In
+                this report we also introduce 📚 FineWeb-Edu, a
+                subset of FineWeb constructed using scalable
+                automated high-quality annotations for educational
+                value, and which outperforms all openly accessible
+                web-datasets on a number of educational benchmarks
+                such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
+                is available in two sizes/filtering-level: 1.3
+                trillion (very high educational content) and 5.4
+                trillion (high educational content) tokens (all
+                tokens are measured with GPT2 tokenizer). You can
+                download it here. Both datasets are released under
+                the permissive ODC-By 1.0 license TLDR: This blog
+                covers a discussion on processing and evaluating
+                data quality at scale, the 🍷 FineWeb recipe
+                (listing and explaining all of our design choices),
+                and the process followed to create its 📚
+                FineWeb-Edu subset."""),
+            id="section3",
+        ),
+        Section(
+            H2("Conclusion"),
+            P("""This is the conclusion section where we
+                summarize the key points discussed in the blog post
+                and provide final thoughts."""),
+            id="section4",
+        ),
+        id="inner-text",
+    )
+
+
+@app.get("/web_data")
+def web_data():
+    return Div(Section(H1("Web Data"), id="inner-text"))
+
+
 serve()
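The new navigation relies on FastHTML's htmx integration: each table-of-contents entry carries hx_get, hx_target, and hx_swap keyword arguments, so clicking it fetches a route such as /intro or /web_data and swaps the returned fragment into the #inner-text container without a full page reload. Below is a minimal sketch of that pattern, not the app itself; the /section/{name} route, the home() view, and the placeholder content are illustrative assumptions.

```python
from fasthtml.common import *

# live=True enables auto-reload during development, as in the commit above.
app, rt = fast_app(live=True)


@app.get("/section/{name}")
def section(name: str):
    # Return only a fragment; htmx swaps it into #inner-text on the client.
    return Div(
        Section(H1(name.replace("_", " ").title()), P(f"Placeholder content for {name}.")),
        id="inner-text",
    )


@app.get("/")
def home():
    return Div(
        Nav(
            # hx_* keyword arguments render as hx-get / hx-target / hx-swap attributes.
            A("Web Data", hx_get="/section/web_data", hx_target="#inner-text", hx_swap="innerHTML"),
            A("Curated Sources", hx_get="/section/curated_sources", hx_target="#inner-text", hx_swap="innerHTML"),
        ),
        Div(id="inner-text"),  # container that the fetched fragments replace
    )


serve()
```

In the commit itself, intro() is also called directly when the page is first built, so the introduction renders on initial load and is only re-fetched when a navigation entry is clicked.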
style.css CHANGED
@@ -112,10 +112,15 @@
   margin-bottom: 0px;
   margin-top: 0px;
 }
+
 .main-plot-container > div {
   display: none !important;
 }
 
+.main-plot-container img {
+  max-width: 100%;
+  height: auto;
+}
 
 @media (min-width: 768px) {
   .main-plot-container > figure {
@@ -256,4 +261,3 @@ d-contents nav > div > a:hover,
 d-contents nav > ul > li > a:hover {
   text-decoration: none;
 }
-