Adding dataset preparation module
Browse files- src/__pycache__/dictionary.cpython-38.pyc +0 -0
- src/__pycache__/normalizer.cpython-38.pyc +0 -0
- src/normalizer.py +10 -10
- src/prep_dataset.py +30 -0
- src/regexes/__pycache__/__init__.cpython-38.pyc +0 -0
- src/regexes/__pycache__/currency.cpython-38.pyc +0 -0
- src/regexes/__pycache__/email.cpython-38.pyc +0 -0
- src/regexes/__pycache__/latin.cpython-38.pyc +0 -0
- src/regexes/__pycache__/number.cpython-38.pyc +0 -0
- src/regexes/__pycache__/persian.cpython-38.pyc +0 -0
- src/regexes/__pycache__/phone.cpython-38.pyc +0 -0
- src/regexes/__pycache__/punk.cpython-38.pyc +0 -0
- src/regexes/__pycache__/quote.cpython-38.pyc +0 -0
- src/regexes/__pycache__/url.cpython-38.pyc +0 -0
src/__pycache__/dictionary.cpython-38.pyc
ADDED
Binary file (2.01 kB). View file
|
|
src/__pycache__/normalizer.cpython-38.pyc
ADDED
Binary file (2.51 kB). View file
|
|
src/normalizer.py
CHANGED
@@ -81,13 +81,13 @@ def normalize(text, zwnj="\u200c", tokenized=False):
|
|
81 |
return " ".join(tokens)
|
82 |
|
83 |
|
84 |
-
if __name__ == '__main__':
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
81 |
return " ".join(tokens)
|
82 |
|
83 |
|
84 |
+
# if __name__ == '__main__':
|
85 |
+
# import textwrap
|
86 |
+
|
87 |
+
# input_text = "دارهٔ تحقیقات فدرال در سال ۱۹۰۸ به نام ادارهٔ تحقیقات (BOI یا BI) بنیانگذاری شد. نام این سازمان در سال ۱۹۳۵ به ادارهٔ تحقیقات فدرال تغییر یافت. دفتر مرکزی افبیآی در ساختمان جی. ادگار هوور در شهر واشینگتن، دی.سی. واقع شدهاست."
|
88 |
+
# input_text = "یونان (به یونانی: Ελλάδα, اِلادا)"
|
89 |
+
# input_text = "نسخهٔ"
|
90 |
+
# input_text = "ὑ蕉Ұ제ṅ尘̲改座◦花芝秀黄天자埃澤ಿ ˈazbab اینجا ایران خانهشما است؟!۱۲۳۱۲۳۱۳۱۲ اَلْحُرُوفُ ٱلْعَرَبِیَّة"
|
91 |
+
# input_text = normalize(input_text)
|
92 |
+
# print(textwrap.fill(input_text))
|
93 |
+
# print(normalize(input_text, tokenized=True))
|
src/prep_dataset.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset, DatasetDict
|
2 |
+
from hazm import sent_tokenize
|
3 |
+
from normalizer import normalize
|
4 |
+
|
5 |
+
|
6 |
+
class Prep_dataset:
    """Load the OSCAR ``unshuffled_deduplicated_fa`` corpus and prepare it.

    The constructor loads the dataset with ``datasets.load_dataset`` and,
    optionally, shrinks its ``"train"`` split to a small sample for quick
    experiments.  :meth:`prepare_dataset` then drops short documents and
    normalizes the text of the remaining ones.

    Attributes are documented next to their definitions below; the loaded
    (and possibly subsampled) dataset is stored on ``self.raw_dataset``.
    """

    # Number of rows kept from the "train" split when ``subsample=True``.
    SAMPLE_SIZE = 100
    # A document is kept only if its text is strictly longer than this many
    # characters.
    TEXT_LENGTH_THRESHOLD = 500
    # A document is kept only if ``sent_tokenize`` finds strictly more than
    # this many sentences in it.
    SENTENCE_COUNT_THRESHOLD = 2

    def __init__(self, subsample=False, *args, **kwargs):
        """Load the corpus, optionally keeping only a small train sample.

        Args:
            subsample: When true, the ``"train"`` split is replaced by its
                first ``SAMPLE_SIZE`` rows (or by all rows if the split is
                smaller than that).
            *args: Accepted for backward compatibility; unused.
            **kwargs: Accepted for backward compatibility; unused.
        """
        raw_dataset = load_dataset("oscar", "unshuffled_deduplicated_fa")
        if subsample:
            train_split = raw_dataset["train"]
            # Clamp so a split smaller than SAMPLE_SIZE does not make
            # ``select`` fail on out-of-range indices.
            keep = min(self.SAMPLE_SIZE, len(train_split))
            # Build a new mapping instead of mutating the loaded DatasetDict.
            splits = dict(raw_dataset)
            splits["train"] = train_split.select(range(keep))
            raw_dataset = DatasetDict(splits)
        self.raw_dataset = raw_dataset

    def _normalize(self, example):
        """Replace ``example["text"]`` with its normalized form.

        The example is modified in place and also returned, which is the
        shape ``map`` expects from its callback.
        """
        example["text"] = normalize(example["text"])
        return example

    def prepare_dataset(self):
        """Return the dataset filtered to long documents, with normalized text.

        A document is kept when its text has more than
        ``TEXT_LENGTH_THRESHOLD`` characters and more than
        ``SENTENCE_COUNT_THRESHOLD`` sentences according to
        ``sent_tokenize``.  The surviving documents are passed through
        :meth:`_normalize`.

        Returns:
            The result of ``filter`` followed by ``map`` on
            ``self.raw_dataset``.
        """
        min_chars = self.TEXT_LENGTH_THRESHOLD
        min_sentences = self.SENTENCE_COUNT_THRESHOLD

        def is_rich(example):
            text = example["text"]
            # Cheap length test first; ``and`` short-circuits so sentence
            # tokenization only runs on documents that are long enough.
            return len(text) > min_chars and len(sent_tokenize(text)) > min_sentences

        rich_dataset = self.raw_dataset.filter(is_rich)
        return rich_dataset.map(self._normalize)

    def preprare_dataset(self):
        """Backward-compatible alias for :meth:`prepare_dataset`.

        The original method name was misspelled; it is kept so existing
        callers continue to work.
        """
        return self.prepare_dataset()
|
src/regexes/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (168 Bytes). View file
|
|
src/regexes/__pycache__/currency.cpython-38.pyc
ADDED
Binary file (691 Bytes). View file
|
|
src/regexes/__pycache__/email.cpython-38.pyc
ADDED
Binary file (482 Bytes). View file
|
|
src/regexes/__pycache__/latin.cpython-38.pyc
ADDED
Binary file (382 Bytes). View file
|
|
src/regexes/__pycache__/number.cpython-38.pyc
ADDED
Binary file (348 Bytes). View file
|
|
src/regexes/__pycache__/persian.cpython-38.pyc
ADDED
Binary file (549 Bytes). View file
|
|
src/regexes/__pycache__/phone.cpython-38.pyc
ADDED
Binary file (378 Bytes). View file
|
|
src/regexes/__pycache__/punk.cpython-38.pyc
ADDED
Binary file (309 Bytes). View file
|
|
src/regexes/__pycache__/quote.cpython-38.pyc
ADDED
Binary file (589 Bytes). View file
|
|
src/regexes/__pycache__/url.cpython-38.pyc
ADDED
Binary file (777 Bytes). View file
|
|