Commit d211b3d
Parent(s): 2ad6313
update app

app.py CHANGED
@@ -1,25 +1,202 @@
-import gradio as gr
+import gradio as gr
+import pandas as pd
 import polars as pl
+import logging
 
 
-
+logging.basicConfig(format='%(name)s - %(asctime)s - %(message)s', level=logging.INFO)
+
+logging.info("loading data")
+data = pl.read_parquet("hf://datasets/polinaeterna/hub_datasets_string_statistics/data/*.parquet")
+logging.info("data loaded")
+
+
+min_num_examples = data["num_examples"].min()
+max_num_examples = data["num_examples"].max()
+
+min_null_count = data["null_count"].min()
+max_null_count = data["null_count"].max()
+
+min_null_prop = data["null_proportion"].min()
+max_null_prop = data["null_proportion"].max()
+
 min_min = data["min"].min()
-
+max_min = data["min"].max()
+
+min_max = data["max"].min()
+max_max = data["max"].max()
+
+min_mean = data["mean"].min()
+max_mean = data["mean"].max()
+
+min_median = data["median"].min()
+max_median = data["median"].max()
+
+min_std = data["std"].min()
+max_std = data["std"].max()
+
+
+def urlize(dataset_name):
+    return f"[{dataset_name}](https://huggingface.co/datasets/{dataset_name})"
+
+
+def filter_data(
+    min_num_examples_input, max_num_examples_input,
+    min_null_count_input, max_null_count_input,
+    min_null_prop_input, max_null_prop_input,
+    min_min_input, max_min_input,
+    min_max_input, max_max_input,
+    min_mean_input, max_mean_input,
+    min_median_input, max_median_input,
+    min_std_input, max_std_input,
+    sort_by,
+    column_name,
+    include_partial = False,
+):
 
+    df = data.filter(
+        (pl.col("num_examples") >= min_num_examples_input) & (pl.col("num_examples") <= max_num_examples_input) &
+        (pl.col("null_count") >= min_null_count_input) & (pl.col("null_count") <= max_null_count_input) &
+        (pl.col("null_proportion") >= min_null_prop_input) & (pl.col("null_proportion") <= max_null_prop_input) &
+        (pl.col("min") >= min_min_input) & (pl.col("min") <= max_min_input) &
+        (pl.col("max") >= min_max_input) & (pl.col("max") <= max_max_input) &
+        (pl.col("mean") >= min_mean_input) & (pl.col("mean") <= max_mean_input) &
+        (pl.col("median") >= min_median_input) & (pl.col("median") <= max_median_input) &
+        (pl.col("std") >= min_std_input) & (pl.col("std") <= max_std_input)
+    )
+    if not include_partial:
+        df = df.filter((pl.col("partial") == include_partial))
+    if column_name:
+        df = df.filter(pl.col("column_name") == column_name)
+    if sort_by:
+        try:
+            sort_cols, sort_descs = parse_sort_by(sort_by)
+        except:
+            return [pd.DataFrame(), "incorrect sort string format"]
+        logging.info(sort_cols)
+        logging.info(sort_descs)
+        df = df.sort(
+            *sort_cols, descending=sort_descs if len(sort_descs) > 1 else sort_descs[0],
+        )
+    n_rows = df.shape[0]
+    n_splits = df.group_by(["dataset", "config", "split"]).len().shape[0]
+    n_datasets = df["dataset"].n_unique()
+
+    max_rows = 100
+
+    text = f"{n_rows} rows / {n_splits} unique splits / {n_datasets} unique datasets found{' (100 rows displayed).' if n_rows > max_rows else '.'} \n"
+    df = df.to_pandas()
+    df["dataset"] = df["dataset"].apply(urlize)
+    df = df.drop("histogram", axis=1)
+    logging.info(df.head(2))
+    if df.shape[0] > max_rows:
+        return df.head(max_rows), text
+    return df, text
+
+
+def parse_sort_by(sort_string):
+    args = sort_string.split(";")
+    col_names, descs = [], []
+    for arg in args:
+        col_name, desc = arg.split(":")
+        col_names.append(col_name)
+        descs.append(True if desc == "desc" else False)
+    return col_names, descs
 
-def filter(min_value: min_min, max_value: min_max):
-    df = data.select((pl.col("min") >= min_value) & (pl.col("min") <= max_value)).to_pandas()
-    if df.shape[0] > 100:
-        return df.head(100)
-    return df
-
 
 with gr.Blocks() as demo:
-    gr.Markdown(
-
-
-
-
-
+    gr.Markdown(
+        """
+        # 💫 Filter text datasets by string statistics 💫
+
+        ### The raw data is here:
+        """)
+
+    html_code = f"""
+    <iframe
+        src="https://huggingface.co/datasets/polinaeterna/hub_datasets_string_statistics/embed/viewer/default/train"
+        frameborder="0"
+        width="100%"
+        height="560px"
+    ></iframe>
+    """
+    gr.HTML(value=html_code)
+
+    gr.Markdown("- Num examples range")
+    with gr.Row():
+        with gr.Column():
+            min_num_examples_input = gr.Slider(min_num_examples, max_num_examples, min_num_examples, step=1, label="Min null count value")
+        with gr.Column():
+            max_num_examples_input = gr.Slider(min_num_examples, max_num_examples, max_num_examples, step=1, label="Max null count value")
+
+    gr.Markdown("- Null count range")
+    with gr.Row():
+        with gr.Column():
+            min_null_count_input = gr.Slider(min_null_count, max_null_count, min_null_count, step=1, label="Min null count value")
+        with gr.Column():
+            max_null_count_input = gr.Slider(min_null_count, max_null_count, max_null_count, step=1, label="Max null count value")
+
+    gr.Markdown("- Null proportion range")
+    with gr.Row():
+        with gr.Column():
+            min_null_prop_input = gr.Slider(min_null_prop, max_null_prop, min_null_prop, step=1, label="Min null proportion value")
+        with gr.Column():
+            max_null_prop_input = gr.Slider(min_null_prop, max_null_prop, max_null_prop, step=0.01, label="Max null proportion value")
+
+    gr.Markdown("- Minimum string length (in symbols) range")
+    with gr.Row():
+        with gr.Column():
+            min_min_input = gr.Slider(min_min, max_min, min_min, step=1, label="Min min value")
+        with gr.Column():
+            max_min_input = gr.Slider(min_min, max_min, max_min, step=1, label="Max min value")
+
+    gr.Markdown("- Maximum string length (in symbols) range")
+    with gr.Row():
+        with gr.Column():
+            min_max_input = gr.Slider(min_max, max_max, min_max, step=1, label="Min max value")
+        with gr.Column():
+            max_max_input = gr.Slider(min_max, max_max, max_max, step=1, label="Max max value")
+
+    gr.Markdown("- Mean string length (in symbols) range")
+    with gr.Row():
+        with gr.Column():
+            min_mean_input = gr.Slider(min_mean, max_mean, min_mean, step=1, label="Min mean value")
+        with gr.Column():
+            max_mean_input = gr.Slider(min_mean, max_mean, max_mean, step=1, label="Max mean value")
+
+    gr.Markdown("- Median string length (in symbols) range")
+    with gr.Row():
+        with gr.Column():
+            min_median_input = gr.Slider(min_median, max_median, min_median, step=1, label="Min median value")
+        with gr.Column():
+            max_median_input = gr.Slider(min_median, max_median, max_median, step=1, label="Max median value")
+
+    gr.Markdown("- Standard deviation of string length (in symbols) range")
+    with gr.Row():
+        with gr.Column():
+            min_std_input = gr.Slider(min_std, max_std, min_std, step=1, label="Min std value")
+        with gr.Column():
+            max_std_input = gr.Slider(min_std, max_std, max_std, step=1, label="Max std value")
+
+    sort_by = gr.Textbox(placeholder="num_examples:desc;std:asc;null_proportion:asc", label="Sort by (optional), in the following format: '<column_name_1>:desc/asc;<column_name_2>:desc/asc'")
+    column_name = gr.Textbox(placeholder="text", label="Column name, if you want to check only specific column (optional)")
+    include_partial = gr.Checkbox(False, label="Include partial datasets")
+    # max_rows = gr.Number(100, )
+    btn = gr.Button("Get datasets")
+    summary = gr.Markdown()
+    datasets = gr.DataFrame(datatype="markdown")
+    btn.click(filter_data, inputs=[
+        min_num_examples_input, max_num_examples_input,
+        min_null_count_input, max_null_count_input,
+        min_null_prop_input, max_null_prop_input,
+        min_min_input, max_min_input,
+        min_max_input, max_max_input,
+        min_mean_input, max_mean_input,
+        min_median_input, max_median_input,
+        min_std_input, max_std_input,
+        sort_by,
+        column_name,
+        include_partial,
+    ], outputs=[datasets, summary])
 
 demo.launch(debug=True)
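
For context on the sorting option introduced in this commit: `parse_sort_by` turns a string such as `num_examples:desc;std:asc` into parallel lists of column names and descending flags, which `filter_data` forwards to Polars' `DataFrame.sort`. Below is a minimal, self-contained sketch of that convention; the toy frame and its values are hypothetical stand-ins, not the Space's real statistics dataset.

import polars as pl

def parse_sort_by(sort_string):
    # "col_a:desc;col_b:asc" -> (["col_a", "col_b"], [True, False])
    col_names, descs = [], []
    for arg in sort_string.split(";"):
        col_name, desc = arg.split(":")
        col_names.append(col_name)
        descs.append(desc == "desc")
    return col_names, descs

# Hypothetical stand-in for the statistics table.
df = pl.DataFrame({"num_examples": [10, 10, 3], "std": [2.5, 1.0, 4.2]})
cols, descending = parse_sort_by("num_examples:desc;std:asc")
print(df.sort(*cols, descending=descending))

A malformed sort string (for example, an entry without a `:asc`/`:desc` part) makes the two-value unpacking of `arg.split(":")` raise ValueError, which is what the bare `except` in `filter_data` catches before returning the "incorrect sort string format" message.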