Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
""" | |
Preliminary preprocessing on the data, such as:
- correcting column names
- encoding the target column
""" | |
import pandas as pd | |
from sklearn import preprocessing | |
# Columns removed from both the training and the testing split.
# NOTE(review): "Unnamed: 133" is presumably an empty trailing column in the
# raw CSV files -- confirm against the data.
COLUMNS_TO_DROP = ["Unnamed: 133"]

# Name of the label column, kept as a one-element list because it is used
# for DataFrame column selection and for Index.drop.
TARGET_COLUMN = ["prognosis"]

# Column-name corrections, mapping the raw column name to its replacement.
# NOTE(review): the replacement "scurving" looks like it may itself be a
# misspelling -- confirm the intended symptom name before changing it, since
# downstream code may already rely on this spelling.
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}
def pretty_print(input):
    """
    Prettify symptom or disease names for display.

    Underscores are replaced by spaces and every word is title-cased,
    e.g. ``"skin_rash"`` becomes ``"Skin Rash"``.

    Args:
        input: Either a single name (``str``) or an iterable of names.
            Items that are themselves lists are flattened by one level.
            The parameter shadows the built-in ``input``; the name is kept
            so that existing keyword callers keep working.

    Returns:
        list: The prettified names, sorted alphabetically.
    """
    # A bare string must be wrapped in a list: iterating over it directly
    # would yield its individual characters instead of the name itself.
    if isinstance(input, str):
        input = [input]

    # Flatten one level of nesting so that mixed inputs such as
    # [["a", "b"], "c"] are handled uniformly.
    names = []
    for item in input:
        if isinstance(item, list):
            names.extend(item)
        else:
            names.append(item)

    # Prettify each name and return them in sorted order.
    return sorted(name.replace("_", " ").title() for name in names)
def map_prediction(target_columns=("y", "prognosis"), label=1):
    """
    Look up the prognosis name associated with an encoded label.

    The mapping is read from ``Training_preprocessed.csv`` in the current
    working directory.

    Args:
        target_columns: Sequence whose first item is the name of the encoded
            label column and whose second item is the name of the column
            holding the human-readable prognosis.
        label: Encoded label value to look up. Defaults to 1, the value this
            function previously hard-coded.

    Returns:
        The first non-null value of the prognosis column among the rows
        whose encoded label equals ``label``.

    Raises:
        ValueError: If no row with a non-null prognosis matches ``label``.
    """
    label_column, name_column = target_columns[0], target_columns[1]

    df = pd.read_csv("Training_preprocessed.csv")

    # Select the prognosis values of the rows carrying the requested label;
    # rows without a prognosis cannot provide a name and are discarded.
    matches = df.loc[df[label_column] == label, name_column].dropna()
    if matches.empty:
        raise ValueError(
            f"No row with {label_column} == {label!r} in Training_preprocessed.csv"
        )

    return matches.iloc[0]
if __name__ == "__main__":
    # Load the raw training and testing splits from the working directory.
    df_train = pd.read_csv("Training.csv")
    df_test = pd.read_csv("Testing.csv")

    # Remove useless columns. errors="ignore" tolerates a split in which a
    # listed column is absent.
    df_train.drop(columns=COLUMNS_TO_DROP, errors="ignore", inplace=True)
    df_test.drop(columns=COLUMNS_TO_DROP, errors="ignore", inplace=True)

    # Correct typos in some column names.
    df_train.rename(columns=RENAME_COLUMNS, inplace=True)
    df_test.rename(columns=RENAME_COLUMNS, inplace=True)

    # Encode the prognosis labels into a new "y" column. The encoder is
    # fitted on the training labels only and reused for the testing split so
    # that both splits share a single label-to-code mapping.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(df_train[TARGET_COLUMN].values.flatten())
    df_train["y"] = label_encoder.transform(df_train[TARGET_COLUMN].values.flatten())
    df_test["y"] = label_encoder.transform(df_test[TARGET_COLUMN].values.flatten())

    # Cast every column except the prognosis column to float32.
    # NOTE(review): "y" was added above, so it is part of this selection and
    # is stored as float32 as well -- confirm that downstream code expects a
    # float label before narrowing the selection to the feature columns.
    float_columns = df_train.columns.drop(TARGET_COLUMN)
    df_train[float_columns] = df_train[float_columns].astype("float32")
    df_test[float_columns] = df_test[float_columns].astype("float32")

    # Save the preprocessed splits next to the raw files.
    df_train.to_csv(path_or_buf="Training_preprocessed.csv", index=False)
    df_test.to_csv(path_or_buf="Testing_preprocessed.csv", index=False)