Spaces:
Build error
Build error
import pandas as pd | |
import numpy as np | |
import random | |
# Input CSV files, given as relative paths.
CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
CORPUS_ALL = "data/migration/corpus_all.csv"

# Thresholds for the random split: a row goes to dev when its draw is below
# RATIO_DEV, to test when below RATIO_DEV + RATIO_TEST, otherwise to train.
RATIO_DEV = 0.05
RATIO_TEST = 0.25

# Seed the module-level RNG so the random split is repeatable between runs.
random.seed(1996)
def preprocess_annotated():
    """Split the annotated corpus into train/dev/test CSV files.

    Reads ``CORPUS_ANNOTATED`` as a latin1-encoded CSV and draws one
    ``random.random()`` value per row. A row is assigned to dev when its draw
    is below ``RATIO_DEV``, to test when it is below
    ``RATIO_DEV + RATIO_TEST``, and to train otherwise. The three subsets are
    written under ``output/migration/preprocess/``.
    """
    print("Loading corpus...")
    annotated = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    print(f"\tfound {len(annotated)} annotated headlines")

    print("Making random train/dev/test split...")
    train_rows, dev_rows, test_rows = [], [], []
    test_cutoff = RATIO_DEV + RATIO_TEST
    for position in range(len(annotated)):
        # Exactly one draw per row, in row order, so the seeded split is stable.
        draw = random.random()
        if draw < RATIO_DEV:
            bucket = dev_rows
        elif draw < test_cutoff:
            bucket = test_rows
        else:
            bucket = train_rows
        bucket.append(position)

    print(f"\tassigned {len(train_rows)} samples to train")
    print(f"\tassigned {len(dev_rows)} samples to dev")
    print(f"\tassigned {len(test_rows)} samples to test")

    # Select each subset by integer position and write it out.
    annotated.iloc[train_rows].to_csv("output/migration/preprocess/annotations_train.csv")
    annotated.iloc[dev_rows].to_csv("output/migration/preprocess/annotations_dev.csv")
    annotated.iloc[test_rows].to_csv("output/migration/preprocess/annotations_test.csv")
def preprocess_all():
    """Load the full corpus and iterate over its rows.

    Reads ``CORPUS_ALL`` as a latin1-encoded CSV. The per-row loop body is
    empty, so nothing is computed or written.

    NOTE(review): this previously read ``CORPUS_ANNOTATED``, which left
    ``CORPUS_ALL`` unused despite the function's name; it now reads
    ``CORPUS_ALL`` on the assumption that was a copy-paste slip. Confirm that
    the full corpus file is latin1-encoded as well.
    """
    df = pd.read_csv(CORPUS_ALL, encoding="latin1")
    for _, row in df.iterrows():
        # Placeholder: no per-row processing is performed.
        pass
if __name__ == "__main__":
    # Only the full-corpus pass runs when executed as a script;
    # preprocess_annotated() (the train/dev/test split) is not invoked here.
    preprocess_all()