Spaces:
Sleeping
Sleeping
omkarenator
committed on
Commit
•
888beee
1
Parent(s):
b6c56e9
add more stuff
Browse files
main.py
CHANGED
@@ -346,23 +346,92 @@ def curated(request):
|
|
346 |
)
|
347 |
|
348 |
table_html = data_preparation_steps.to_html(index=False, border=0)
|
349 |
-
table_div = Div(NotStr(table_html),
|
350 |
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
)
|
357 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
return Div(
|
359 |
Section(
|
360 |
H2("Curated Sources"),
|
361 |
plotly2fasthtml(get_chart_28168342()),
|
362 |
-
|
363 |
-
|
364 |
-
H3("Data Preprocessing"),
|
365 |
-
expander,
|
366 |
id="inner-text",
|
367 |
)
|
368 |
)
|
|
|
346 |
)
|
347 |
|
348 |
table_html = data_preparation_steps.to_html(index=False, border=0)
|
349 |
+
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
350 |
|
351 |
+
text = P("""This initial stage serves as the foundation for the entire
|
352 |
+
process. Here, we focus on acquiring and extracting the raw data, which can
|
353 |
+
come from various sources such as crawling websites, using HTTP/FTP dumps,
|
354 |
+
or working with archive dumps. For instance, to download and prepare a
|
355 |
+
dataset, we can use specific downloaders based on the data source. Each dataset
|
356 |
+
might have its own downloader script which can be updated in real time to
|
357 |
+
handle changes in the data source. Here is a general outline of the data
|
358 |
+
preparation process: It's worth noting that some pipelines might require
|
359 |
+
invoking additional functions or scripts to handle specific data sources or
|
360 |
+
formats. These helper scripts can be located within specific directories
|
361 |
+
or modules dedicated to the dataset.""")
|
362 |
+
|
363 |
+
data_preparation_div = Div(
|
364 |
+
H3("Data Preparation"),
|
365 |
+
text,
|
366 |
+
table_div,
|
367 |
+
Div(get_data(), style="border: 1px solid #ccc; padding: 20px;"),
|
368 |
)
|
369 |
|
370 |
+
text = P("""Data preprocessing is a crucial step in the data science
|
371 |
+
pipeline. It involves cleaning and transforming raw data into a format that
|
372 |
+
is suitable for analysis. This process includes handling missing values,
|
373 |
+
normalizing data, encoding categorical variables, and more.""")
|
374 |
+
|
375 |
+
preprocessing_steps = pd.DataFrame(
|
376 |
+
{
|
377 |
+
"Step": [
|
378 |
+
"Language Filter",
|
379 |
+
"Min Word Count",
|
380 |
+
"Title Abstract",
|
381 |
+
"Majority Language",
|
382 |
+
"Paragraph Count",
|
383 |
+
"Frequency",
|
384 |
+
"Unigram Log Probability",
|
385 |
+
],
|
386 |
+
"Description": [
|
387 |
+
"Filtering data based on language",
|
388 |
+
"Setting a minimum word count threshold",
|
389 |
+
"Extracting information from the title and abstract",
|
390 |
+
"Identifying the majority language in the dataset",
|
391 |
+
"Counting the number of paragraphs in each document",
|
392 |
+
"Calculating the frequency of each word in the dataset",
|
393 |
+
"Calculating the log probability of each unigram",
|
394 |
+
],
|
395 |
+
"Need": [
|
396 |
+
"To remove documents in unwanted languages",
|
397 |
+
"To filter out documents with very few words",
|
398 |
+
"To extract relevant information for analysis",
|
399 |
+
"To understand the distribution of languages in the dataset",
|
400 |
+
"To analyze the structure and length of documents",
|
401 |
+
"To identify important words in the dataset",
|
402 |
+
"To measure the significance of individual words",
|
403 |
+
],
|
404 |
+
"Pros": [
|
405 |
+
"Improves data quality by removing irrelevant documents",
|
406 |
+
"Filters out low-quality or incomplete documents",
|
407 |
+
"Provides additional information for analysis",
|
408 |
+
"Enables language-specific analysis and insights",
|
409 |
+
"Helps understand the complexity and content of documents",
|
410 |
+
"Identifies important terms and topics in the dataset",
|
411 |
+
"Quantifies the importance of individual words",
|
412 |
+
],
|
413 |
+
"Cons": [
|
414 |
+
"May exclude documents in less common languages",
|
415 |
+
"May remove documents with valuable information",
|
416 |
+
"May introduce bias in the analysis",
|
417 |
+
"May not accurately represent the language distribution",
|
418 |
+
"May not capture the complexity of document structure",
|
419 |
+
"May be sensitive to noise and outliers",
|
420 |
+
"May not capture the semantic meaning of words",
|
421 |
+
],
|
422 |
+
}
|
423 |
+
)
|
424 |
+
|
425 |
+
table_html = preprocessing_steps.to_html(index=False, border=0)
|
426 |
+
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
427 |
+
data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
|
428 |
+
|
429 |
return Div(
|
430 |
Section(
|
431 |
H2("Curated Sources"),
|
432 |
plotly2fasthtml(get_chart_28168342()),
|
433 |
+
data_preparation_div,
|
434 |
+
data_preprocessing_div,
|
|
|
|
|
435 |
id="inner-text",
|
436 |
)
|
437 |
)
|