polinaeterna HF staff committed on
Commit
d211b3d
1 Parent(s): 2ad6313

update app

Browse files
Files changed (1) hide show
  1. app.py +192 -15
app.py CHANGED
@@ -1,25 +1,202 @@
1
- import gradio as gd
 
2
  import polars as pl
 
3
 
4
 
5
- data = pl.read_parquet("hf://datasets/polinaeterna/text_unnested/data/*.parquet")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  min_min = data["min"].min()
7
- min_max = data["max"].max()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- def filter(min_value: min_min, max_value: min_max):
11
- df = data.select((pl.col("min") >= min_value) & (pl.col("min") <= max_value)).to_pandas()
12
- if df.shape[0] > 100:
13
- return df.head(100)
14
- return df
15
-
16
 
17
  with gr.Blocks() as demo:
18
- gr.Markdown("# 💫 Filter text datasets by string lengths distribution 💫")
19
- min_value = gr.Slider(min_min, min_max, 0, step=1, label="Min min value")
20
- max_value = gr.Slider(min_min, min_max, 0, step=1, label="Max min value")
21
- btn = gr.Button("Get datasets ")
22
- datasets = gr.DataFrame()
23
- btn.click(filter, inputs=[min_value, max_value], outputs=[datasets])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  demo.launch(debug=True)
 
1
import gradio as gr
import pandas as pd
import polars as pl
import logging


logging.basicConfig(format='%(name)s - %(asctime)s - %(message)s', level=logging.INFO)

# Eagerly load the per-column string statistics for Hub datasets.
# NOTE(review): this downloads every parquet shard at import time — app
# startup blocks until the read finishes.
logging.info("loading data")
data = pl.read_parquet("hf://datasets/polinaeterna/hub_datasets_string_statistics/data/*.parquet")
logging.info("data loaded")


# Global min/max of each statistic column; used below as slider bounds
# and default values in the gradio UI.
min_num_examples = data["num_examples"].min()
max_num_examples = data["num_examples"].max()

min_null_count = data["null_count"].min()
max_null_count = data["null_count"].max()

min_null_prop = data["null_proportion"].min()
max_null_prop = data["null_proportion"].max()

min_min = data["min"].min()
max_min = data["min"].max()

min_max = data["max"].min()
max_max = data["max"].max()

min_mean = data["mean"].min()
max_mean = data["mean"].max()

min_median = data["median"].min()
max_median = data["median"].max()

min_std = data["std"].min()
max_std = data["std"].max()
37
+
38
+
39
def urlize(dataset_name):
    """Return a markdown link from a dataset name to its Hub page."""
    url = f"https://huggingface.co/datasets/{dataset_name}"
    return f"[{dataset_name}]({url})"
41
+
42
+
43
def filter_data(
    min_num_examples_input, max_num_examples_input,
    min_null_count_input, max_null_count_input,
    min_null_prop_input, max_null_prop_input,
    min_min_input, max_min_input,
    min_max_input, max_max_input,
    min_mean_input, max_mean_input,
    min_median_input, max_median_input,
    min_std_input, max_std_input,
    sort_by,
    column_name,
    include_partial=False,
):
    """Filter the global ``data`` frame by per-column string statistics.

    Each (min, max) input pair bounds one statistic column inclusively.
    ``sort_by`` is an optional "<col>:asc|desc;..." string (see
    ``parse_sort_by``); ``column_name`` optionally restricts to one column.

    Returns a (pandas.DataFrame, summary_markdown) pair; at most
    ``max_rows`` rows are returned for display.
    """
    df = data.filter(
        (pl.col("num_examples") >= min_num_examples_input) & (pl.col("num_examples") <= max_num_examples_input) &
        (pl.col("null_count") >= min_null_count_input) & (pl.col("null_count") <= max_null_count_input) &
        (pl.col("null_proportion") >= min_null_prop_input) & (pl.col("null_proportion") <= max_null_prop_input) &
        (pl.col("min") >= min_min_input) & (pl.col("min") <= max_min_input) &
        (pl.col("max") >= min_max_input) & (pl.col("max") <= max_max_input) &
        (pl.col("mean") >= min_mean_input) & (pl.col("mean") <= max_mean_input) &
        (pl.col("median") >= min_median_input) & (pl.col("median") <= max_median_input) &
        (pl.col("std") >= min_std_input) & (pl.col("std") <= max_std_input)
    )
    if not include_partial:
        # keep only fully-converted splits (partial == False)
        df = df.filter(pl.col("partial") == include_partial)
    if column_name:
        df = df.filter(pl.col("column_name") == column_name)
    if sort_by:
        try:
            sort_cols, sort_descs = parse_sort_by(sort_by)
        except ValueError:
            # was a bare `except:` — only the malformed-sort-string parse
            # failure should be reported to the user, nothing else hidden
            return [pd.DataFrame(), "incorrect sort string format"]
        logging.info(sort_cols)
        logging.info(sort_descs)
        df = df.sort(
            *sort_cols, descending=sort_descs if len(sort_descs) > 1 else sort_descs[0],
        )
    n_rows = df.shape[0]
    n_splits = df.group_by(["dataset", "config", "split"]).len().shape[0]
    n_datasets = df["dataset"].n_unique()

    max_rows = 100

    # use max_rows in the message instead of a hard-coded "100"
    text = f"{n_rows} rows / {n_splits} unique splits / {n_datasets} unique datasets found{f' ({max_rows} rows displayed).' if n_rows > max_rows else '.'} \n"
    df = df.to_pandas()
    # turn dataset names into clickable markdown links for the DataFrame view
    df["dataset"] = df["dataset"].apply(urlize)
    # histogram is a nested structure the grid cannot render
    df = df.drop("histogram", axis=1)
    logging.info(df.head(2))
    if df.shape[0] > max_rows:
        return df.head(max_rows), text
    return df, text
95
+
96
+
97
def parse_sort_by(sort_string):
    """Parse "col1:desc;col2:asc" into (["col1", "col2"], [True, False]).

    The boolean list marks descending order per column, matching polars'
    ``DataFrame.sort(descending=...)``.

    Raises:
        ValueError: if any ";"-separated segment is not exactly
            "<column>:<asc|desc>" (the caller reports this to the user).
    """
    col_names, descs = [], []
    for arg in sort_string.split(";"):
        col_name, desc = arg.split(":")
        col_names.append(col_name)
        # idiomatic boolean instead of `True if desc == "desc" else False`
        descs.append(desc == "desc")
    return col_names, descs
105
 
 
 
 
 
 
 
106
 
107
with gr.Blocks() as demo:
    gr.Markdown(
        """
# 💫 Filter text datasets by string statistics 💫

### The raw data is here:
""")

    # Embedded Hub dataset viewer for the raw statistics dataset.
    html_code = f"""
<iframe
  src="https://huggingface.co/datasets/polinaeterna/hub_datasets_string_statistics/embed/viewer/default/train"
  frameborder="0"
  width="100%"
  height="560px"
></iframe>
"""
    gr.HTML(value=html_code)

    gr.Markdown("- Num examples range")
    with gr.Row():
        with gr.Column():
            # FIX: labels were copy-pasted from the null-count block below
            min_num_examples_input = gr.Slider(min_num_examples, max_num_examples, min_num_examples, step=1, label="Min num examples value")
        with gr.Column():
            max_num_examples_input = gr.Slider(min_num_examples, max_num_examples, max_num_examples, step=1, label="Max num examples value")

    gr.Markdown("- Null count range")
    with gr.Row():
        with gr.Column():
            min_null_count_input = gr.Slider(min_null_count, max_null_count, min_null_count, step=1, label="Min null count value")
        with gr.Column():
            max_null_count_input = gr.Slider(min_null_count, max_null_count, max_null_count, step=1, label="Max null count value")

    gr.Markdown("- Null proportion range")
    with gr.Row():
        with gr.Column():
            # FIX: step was 1, which made a 0..1 proportion slider all-or-nothing;
            # match the 0.01 step of the max slider below
            min_null_prop_input = gr.Slider(min_null_prop, max_null_prop, min_null_prop, step=0.01, label="Min null proportion value")
        with gr.Column():
            max_null_prop_input = gr.Slider(min_null_prop, max_null_prop, max_null_prop, step=0.01, label="Max null proportion value")

    gr.Markdown("- Minimum string length (in symbols) range")
    with gr.Row():
        with gr.Column():
            min_min_input = gr.Slider(min_min, max_min, min_min, step=1, label="Min min value")
        with gr.Column():
            max_min_input = gr.Slider(min_min, max_min, max_min, step=1, label="Max min value")

    gr.Markdown("- Maximum string length (in symbols) range")
    with gr.Row():
        with gr.Column():
            min_max_input = gr.Slider(min_max, max_max, min_max, step=1, label="Min max value")
        with gr.Column():
            max_max_input = gr.Slider(min_max, max_max, max_max, step=1, label="Max max value")

    gr.Markdown("- Mean string length (in symbols) range")
    with gr.Row():
        with gr.Column():
            min_mean_input = gr.Slider(min_mean, max_mean, min_mean, step=1, label="Min mean value")
        with gr.Column():
            max_mean_input = gr.Slider(min_mean, max_mean, max_mean, step=1, label="Max mean value")

    gr.Markdown("- Median string length (in symbols) range")
    with gr.Row():
        with gr.Column():
            min_median_input = gr.Slider(min_median, max_median, min_median, step=1, label="Min median value")
        with gr.Column():
            max_median_input = gr.Slider(min_median, max_median, max_median, step=1, label="Max median value")

    gr.Markdown("- Standard deviation of string length (in symbols) range")
    with gr.Row():
        with gr.Column():
            min_std_input = gr.Slider(min_std, max_std, min_std, step=1, label="Min std value")
        with gr.Column():
            max_std_input = gr.Slider(min_std, max_std, max_std, step=1, label="Max std value")

    sort_by = gr.Textbox(placeholder="num_examples:desc;std:asc;null_proportion:asc", label="Sort by (optional), in the following format: '<column_name_1>:desc/asc;<column_name_2>:desc/asc'")
    column_name = gr.Textbox(placeholder="text", label="Column name, if you want to check only specific column (optional)")
    include_partial = gr.Checkbox(False, label="Include partial datasets")
    btn = gr.Button("Get datasets")
    summary = gr.Markdown()
    # markdown datatype so the urlize() links in the "dataset" column render
    datasets = gr.DataFrame(datatype="markdown")
    btn.click(filter_data, inputs=[
        min_num_examples_input, max_num_examples_input,
        min_null_count_input, max_null_count_input,
        min_null_prop_input, max_null_prop_input,
        min_min_input, max_min_input,
        min_max_input, max_max_input,
        min_mean_input, max_mean_input,
        min_median_input, max_median_input,
        min_std_input, max_std_input,
        sort_by,
        column_name,
        include_partial,
    ], outputs=[datasets, summary])

demo.launch(debug=True)