File size: 1,084 Bytes
73d5951 7cfca48 73d5951 7cfca48 73d5951 7cfca48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from datasets import load_dataset, DatasetDict
from hazm import sent_tokenize
from normalizer import normalize
class Prep_dataset:
    """Load and preprocess the Persian (fa) OSCAR corpus.

    Pipeline: load ``oscar/unshuffled_deduplicated_fa``, optionally keep a
    small 100-example subsample, then filter out short / sentence-poor
    documents and normalize the remaining text.
    """

    def __init__(self, subsample=False, *args, **kwargs):
        """Download/load the raw dataset.

        Args:
            subsample: if True, keep only the first 100 training examples
                (re-exposed under the standard "train" split name so the
                rest of the pipeline is unchanged). Useful for smoke tests.
            *args, **kwargs: accepted for interface compatibility; unused.
        """
        raw_dataset = load_dataset("oscar", "unshuffled_deduplicated_fa")
        if subsample:
            # One-step replacement for the original copy/insert/pop/rename
            # shuffle: build a fresh DatasetDict with the truncated split.
            self.raw_dataset = DatasetDict(
                {"train": raw_dataset["train"].select(range(100))}
            )
        else:
            self.raw_dataset = raw_dataset

    def _normalize(self, example):
        """Normalize the ``text`` field of one example in place (for `.map`)."""
        example["text"] = normalize(example["text"])
        return example

    def prepare_dataset(self):
        """Filter and normalize the dataset.

        Keeps documents longer than 500 characters with more than 2
        sentences (per hazm's tokenizer), then applies text normalization.

        Returns:
            The filtered, normalized dataset (same container type as
            ``self.raw_dataset``).
        """
        long_docs = self.raw_dataset.filter(lambda x: len(x["text"]) > 500)
        rich_docs = long_docs.filter(
            lambda x: len(sent_tokenize(x["text"])) > 2
        )
        return rich_docs.map(self._normalize)

    def preprare_dataset(self):
        """Deprecated misspelled alias of :meth:`prepare_dataset`.

        Kept so existing callers of the original (typo'd) name still work.
        """
        return self.prepare_dataset()
|