cankoban committed on
Commit
b67dfe3
1 Parent(s): 37d70c1

Upload util.py

Browse files
Files changed (1) hide show
  1. util.py +87 -0
util.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import streamlit.components.v1 as components
4
+ from pandas.api.types import (
5
+ is_categorical_dtype,
6
+ is_datetime64_any_dtype,
7
+ is_numeric_dtype,
8
+ is_object_dtype,
9
+ )
10
+
11
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns.

    An "Add filters" checkbox gates the whole UI. When it is checked, the
    viewer picks columns and gets one widget per picked column:

    * object/categorical columns -> multiselect of the distinct values
    * numeric columns            -> range slider
    * datetime columns           -> date range picker (both ends inclusive)
    * anything else              -> substring / regex text box

    Filters are applied cumulatively in the order the columns were picked,
    so every widget only offers values still present after earlier filters.

    Args:
        df (pd.DataFrame): Original dataframe. It is never mutated; all
            filtering happens on a copy.

    Returns:
        pd.DataFrame: ``df`` itself when the checkbox is unchecked,
        otherwise the filtered copy. In the copy, object columns that
        parse as dates are converted to naive datetimes, and object
        columns picked for filtering are converted to categorical dtype.
    """
    import re  # local import: only needed to catch invalid user-typed regexes

    modify = st.checkbox("Add filters")

    if not modify:
        return df

    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                # Deliberate best effort: most object columns are not dates
                # and pandas raises several different exception types here.
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        # Columns with fewer distinct values than this get the multiselect
        # widget regardless of dtype. With a limit of 1 that only matches
        # columns with no non-null values at all; object columns are routed
        # to the multiselect through the categorical conversion below.
        limit_non_unique = 1
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            if df[column].dtype == 'O':  # 'object' dtype, i.e. typically strings
                df[column] = df[column].astype(pd.CategoricalDtype())
            # The narrow first column only indents the widget.
            _spacer, right = st.columns((1, 20))
            # isinstance check replaces the deprecated is_categorical_dtype().
            is_categorical = isinstance(df[column].dtype, pd.CategoricalDtype)
            if is_categorical or df[column].nunique() < limit_non_unique:
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                if pd.isna(_min) or pd.isna(_max):
                    # Empty / all-NaN column (e.g. earlier filters removed
                    # every row): there are no bounds to build a slider from.
                    right.info(f"No values to filter in {column}")
                    continue
                if _min == _max:
                    # A zero-width range would give the slider a step of 0.
                    right.info(f"{column} has a single value ({_min})")
                    continue
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    min_value=_min,
                    max_value=_max,
                    value=(_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                first_date = df[column].min()
                last_date = df[column].max()
                if pd.isna(first_date) or pd.isna(last_date):
                    # NaT bounds cannot be used as the picker's default range.
                    right.info(f"No values to filter in {column}")
                    continue
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(first_date, last_date),
                )
                # While the viewer is mid-selection only one date is returned;
                # wait for the full (start, end) pair before filtering.
                if len(user_date_input) == 2:
                    start_date, end_date = map(pd.to_datetime, user_date_input)
                    # The picker yields dates (midnight). Compare against the
                    # start of the following day so rows with a time-of-day on
                    # the end date are kept instead of silently dropped.
                    next_day = end_date + pd.Timedelta(days=1)
                    in_range = (df[column] >= start_date) & (df[column] < next_day)
                    df = df.loc[in_range]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    try:
                        matches = df[column].astype(str).str.contains(user_text_input)
                    except re.error as exc:
                        # A half-typed pattern such as "(" must not crash the
                        # app; report it and leave this column unfiltered.
                        right.error(f"Invalid regular expression: {exc}")
                    else:
                        df = df[matches]

    return df