Add remove_adds and remove_tags functions
Browse files- src/__pycache__/dictionary.cpython-38.pyc +0 -0
- src/data_utils.py +8 -0
- src/normalizer.py +8 -0
- src/regexes/__pycache__/__init__.cpython-38.pyc +0 -0
- src/regexes/__pycache__/currency.cpython-38.pyc +0 -0
- src/regexes/__pycache__/email.cpython-38.pyc +0 -0
- src/regexes/__pycache__/latin.cpython-38.pyc +0 -0
- src/regexes/__pycache__/number.cpython-38.pyc +0 -0
- src/regexes/__pycache__/persian.cpython-38.pyc +0 -0
- src/regexes/__pycache__/phone.cpython-38.pyc +0 -0
- src/regexes/__pycache__/punk.cpython-38.pyc +0 -0
- src/regexes/__pycache__/quote.cpython-38.pyc +0 -0
- src/regexes/__pycache__/url.cpython-38.pyc +0 -0
src/__pycache__/dictionary.cpython-38.pyc
ADDED
Binary file (2.07 kB). View file
|
|
src/data_utils.py
CHANGED
@@ -23,6 +23,14 @@ def filter_by_num_tokens(text, gt=64):
|
|
23 |
def filter_by_num_sents(text, gt=2):
    """Return True when *text* splits into more than *gt* sentences.

    Sentence splitting is delegated to ``sent_tokenize``.

    Args:
        text: Text to split into sentences.
        gt: Exclusive lower bound on the sentence count.

    Returns:
        True if the number of sentences is strictly greater than ``gt``.
    """
    # The comparison already yields a bool; no conditional expression needed.
    return len(sent_tokenize(text)) > gt
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
def normalizer(text, do_lowercase=False):
|
28 |
text = normalize(text)
|
|
|
23 |
def filter_by_num_sents(text, gt=2):
    """Return True when *text* splits into more than *gt* sentences.

    Sentence splitting is delegated to ``sent_tokenize``.

    Args:
        text: Text to split into sentences.
        gt: Exclusive lower bound on the sentence count.

    Returns:
        True if the number of sentences is strictly greater than ``gt``.
    """
    # The comparison already yields a bool; no conditional expression needed.
    return len(sent_tokenize(text)) > gt
|
25 |
|
26 |
+
def remove_adds(text, ratio=50):
    """Return True when *text* has fewer than *ratio* list-like separators.

    The score is the number of segments produced by splitting on the Latin
    comma, plus the number of segments produced by splitting on the Persian
    comma, plus the number of ``word:word`` pairs. Because a split always
    yields at least one segment, text with no separators at all scores 2.

    NOTE(review): presumably used to drop advertisement-like texts that are
    mostly comma/colon separated keyword lists -- confirm against callers.

    Args:
        text: String to inspect.
        ratio: Exclusive upper bound on the score (an absolute count, not a
            fraction, despite the name).

    Returns:
        True if the score is strictly below ``ratio``, otherwise False.
    """
    # count(sep) + 1 equals len(text.split(sep)) without building the list.
    comma_segments = text.count(",") + 1
    persian_comma_segments = text.count("،") + 1
    # Non-overlapping word:word pairs; "a:b:c" counts once.
    colon_pairs = len(re.findall(r"\w+:\w+", text))
    return comma_segments + persian_comma_segments + colon_pairs < ratio
|
33 |
+
|
34 |
|
35 |
def normalizer(text, do_lowercase=False):
|
36 |
text = normalize(text)
|
src/normalizer.py
CHANGED
@@ -25,6 +25,13 @@ def multiple_replace(text, chars_to_mapping):
|
|
25 |
pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
|
26 |
return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
def clean_url(text):
|
30 |
# removing html tags
|
@@ -79,6 +86,7 @@ def normalize(text, zwnj="\u200c", tokenized=False):
|
|
79 |
text = DOUBLE_QUOTE_REGEX.sub('"', text)
|
80 |
text = CURRENCY_REGEX.sub(r" \1 ", text)
|
81 |
text = clean_url(text)
|
|
|
82 |
text = URL_REGEX.sub(" ", text)
|
83 |
text = EMAIL_REGEX.sub(" ", text)
|
84 |
text = PHONE_REGEX.sub(r" \1 ", text)
|
|
|
25 |
pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
|
26 |
return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
|
27 |
|
28 |
+
def remove_tags(text):
    """Drop the tags marker and everything after it from *text*.

    The text is cut at the first occurrence of the literal marker
    "برچسب ها :"; when the marker is absent the text is returned unchanged.

    Args:
        text: String to truncate.

    Returns:
        The part of ``text`` before the marker, or ``text`` itself when the
        marker is not found or ``text`` does not support the lookup.
    """
    tag = "برچسب ها :"
    try:
        cut = text.index(tag)
    except (ValueError, AttributeError, TypeError):
        # ValueError: marker not present. AttributeError/TypeError: input is
        # not a string; keep the best-effort behaviour of returning it as is.
        return text
    return text[:cut]
|
35 |
|
36 |
def clean_url(text):
|
37 |
# removing html tags
|
|
|
86 |
text = DOUBLE_QUOTE_REGEX.sub('"', text)
|
87 |
text = CURRENCY_REGEX.sub(r" \1 ", text)
|
88 |
text = clean_url(text)
|
89 |
+
text = remove_tags(text)
|
90 |
text = URL_REGEX.sub(" ", text)
|
91 |
text = EMAIL_REGEX.sub(" ", text)
|
92 |
text = PHONE_REGEX.sub(r" \1 ", text)
|
src/regexes/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (168 Bytes). View file
|
|
src/regexes/__pycache__/currency.cpython-38.pyc
ADDED
Binary file (691 Bytes). View file
|
|
src/regexes/__pycache__/email.cpython-38.pyc
ADDED
Binary file (482 Bytes). View file
|
|
src/regexes/__pycache__/latin.cpython-38.pyc
ADDED
Binary file (382 Bytes). View file
|
|
src/regexes/__pycache__/number.cpython-38.pyc
ADDED
Binary file (348 Bytes). View file
|
|
src/regexes/__pycache__/persian.cpython-38.pyc
ADDED
Binary file (549 Bytes). View file
|
|
src/regexes/__pycache__/phone.cpython-38.pyc
ADDED
Binary file (378 Bytes). View file
|
|
src/regexes/__pycache__/punk.cpython-38.pyc
ADDED
Binary file (309 Bytes). View file
|
|
src/regexes/__pycache__/quote.cpython-38.pyc
ADDED
Binary file (589 Bytes). View file
|
|
src/regexes/__pycache__/url.cpython-38.pyc
ADDED
Binary file (777 Bytes). View file
|
|