adding dataset preparation module

Files changed (14) hide show

src/__pycache__/dictionary.cpython-38.pyc ADDED Viewed

Binary file (2.01 kB). View file

src/__pycache__/normalizer.cpython-38.pyc ADDED Viewed

Binary file (2.51 kB). View file

src/normalizer.py CHANGED Viewed

@@ -81,13 +81,13 @@ def normalize(text, zwnj="\u200c", tokenized=False):
     return " ".join(tokens)
-if __name__ == '__main__':
-    import textwrap
-    input_text = "دارهٔ تحقیقات فدرال در سال ۱۹۰۸ به نام ادارهٔ تحقیقات (BOI یا BI) بنیان‌گذاری شد. نام این سازمان در سال ۱۹۳۵ به ادارهٔ تحقیقات فدرال تغییر یافت. دفتر مرکزی اف‌بی‌آی در ساختمان جی. ادگار هوور در شهر واشینگتن، دی.سی. واقع شده‌است."
-    input_text = "یونان (به یونانی: Ελλάδα, اِلادا)"
-    input_text = "نسخهٔ"
-    input_text = "ὑ蕉Ұ제ṅ尘̲改座◦花芝秀黄天자埃澤ಿ ˈazbab اینجا ایران خانه‌شما است؟!۱۲۳۱۲۳۱۳۱۲ اَلْحُرُوفُ ٱلْعَرَبِیَّة"
-    input_text = normalize(input_text)
-    print(textwrap.fill(input_text))
-    print(normalize(input_text, tokenized=True))

     return " ".join(tokens)
+# if __name__ == '__main__':
+#     import textwrap
+#     input_text = "دارهٔ تحقیقات فدرال در سال ۱۹۰۸ به نام ادارهٔ تحقیقات (BOI یا BI) بنیان‌گذاری شد. نام این سازمان در سال ۱۹۳۵ به ادارهٔ تحقیقات فدرال تغییر یافت. دفتر مرکزی اف‌بی‌آی در ساختمان جی. ادگار هوور در شهر واشینگتن، دی.سی. واقع شده‌است."
+#     input_text = "یونان (به یونانی: Ελλάδα, اِلادا)"
+#     input_text = "نسخهٔ"
+#     input_text = "ὑ蕉Ұ제ṅ尘̲改座◦花芝秀黄天자埃澤ಿ ˈazbab اینجا ایران خانه‌شما است؟!۱۲۳۱۲۳۱۳۱۲ اَلْحُرُوفُ ٱلْعَرَبِیَّة"
+#     input_text = normalize(input_text)
+#     print(textwrap.fill(input_text))
+#     print(normalize(input_text, tokenized=True))

src/prep_dataset.py ADDED Viewed

+from datasets import load_dataset, DatasetDict
+from hazm import sent_tokenize
+from normalizer import normalize
+class Prep_dataset:
+    def __init__(self, subsample=False,*args, **kwargs):
+        raw_dataset = load_dataset("oscar", f"unshuffled_deduplicated_fa")
+        if subsample:
+            sample_dataset = raw_dataset.copy()
+            sample_dataset["sample"] = sample_dataset["train"].select(range(100))
+            sample_dataset.pop("train")
+            sample_dataset["train"] = sample_dataset.pop("sample")
+            final = DatasetDict(sample_dataset)
+            self.raw_dataset = final
+        else:
+            self.raw_dataset = raw_dataset
+    def _normalize(self, example):
+        example["text"] = normalize(example["text"])
+        return example
+    def preprare_dataset(self):
+        big_dataset = self.raw_dataset.filter(lambda x: len(x["text"])>500)
+        richSent_dataset = big_dataset.filter(lambda x: len(sent_tokenize(x["text"]))>2)
+        normalized_dataset = richSent_dataset.map(self._normalize)
+        return normalized_dataset

src/regexes/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (168 Bytes). View file

src/regexes/__pycache__/currency.cpython-38.pyc ADDED Viewed

Binary file (691 Bytes). View file

src/regexes/__pycache__/email.cpython-38.pyc ADDED Viewed

Binary file (482 Bytes). View file

src/regexes/__pycache__/latin.cpython-38.pyc ADDED Viewed

Binary file (382 Bytes). View file

src/regexes/__pycache__/number.cpython-38.pyc ADDED Viewed

Binary file (348 Bytes). View file

src/regexes/__pycache__/persian.cpython-38.pyc ADDED Viewed

Binary file (549 Bytes). View file

src/regexes/__pycache__/phone.cpython-38.pyc ADDED Viewed

Binary file (378 Bytes). View file

src/regexes/__pycache__/punk.cpython-38.pyc ADDED Viewed

Binary file (309 Bytes). View file

src/regexes/__pycache__/quote.cpython-38.pyc ADDED Viewed

Binary file (589 Bytes). View file

src/regexes/__pycache__/url.cpython-38.pyc ADDED Viewed

Binary file (777 Bytes). View file