Add remove_adds and remove_tags functions

Files changed (13) hide show

src/__pycache__/dictionary.cpython-38.pyc ADDED Viewed

Binary file (2.07 kB). View file

src/data_utils.py CHANGED Viewed

@@ -23,6 +23,14 @@ def filter_by_num_tokens(text, gt=64):
 def filter_by_num_sents(text, gt=2):
     """Return True when sent_tokenize(text) yields more than ``gt`` items."""
     sentence_count = len(sent_tokenize(text))
     return sentence_count > gt
 def normalizer(text, do_lowercase=False):
     text = normalize(text)

 def filter_by_num_sents(text, gt=2):
     """Return True when sent_tokenize(text) yields more than ``gt`` items."""
     sentence_count = len(sent_tokenize(text))
     return sentence_count > gt
+def remove_adds(text, ratio=50):
+    """Report whether *text* scores below a separator-count cutoff.
+
+    The score is the number of pieces obtained by splitting on "," plus the
+    number of pieces obtained by splitting on "،" plus the number of
+    ``word:word`` regex matches. Returns True when the score is strictly
+    less than ``ratio``, otherwise False.
+    """
+    # Splitting on a one-character separator yields (occurrences + 1) pieces.
+    comma_pieces = text.count(",") + 1
+    virgool_pieces = text.count("،") + 1
+    pair_hits = re.findall(r'(?:([^\W]+):([^\W]+))', text)
+    score = comma_pieces + len(pair_hits) + virgool_pieces
+    return score < ratio
 def normalizer(text, do_lowercase=False):
     text = normalize(text)

src/normalizer.py CHANGED Viewed

@@ -25,6 +25,13 @@ def multiple_replace(text, chars_to_mapping):
     pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
     return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
 def clean_url(text):
     # removing html tags
@@ -79,6 +86,7 @@ def normalize(text, zwnj="\u200c", tokenized=False):
     text = DOUBLE_QUOTE_REGEX.sub('"', text)
     text = CURRENCY_REGEX.sub(r" \1 ", text)
     text = clean_url(text)
     text = URL_REGEX.sub(" ", text)
     text = EMAIL_REGEX.sub(" ", text)
     text = PHONE_REGEX.sub(r" \1 ", text)

     pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
     return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
+def remove_tags(text):
+    """Cut *text* off at the first tag-list marker, if one is present.
+
+    Everything from the first occurrence of the literal marker
+    "برچسب ها :" (Persian for "tags :") to the end of the string is
+    dropped. Input that does not contain the marker is returned unchanged.
+    """
+    tag = "برچسب ها :"
+    try:
+        cut = text.index(tag)
+    except (ValueError, AttributeError, TypeError):
+        # ValueError: marker absent. AttributeError/TypeError: non-str input,
+        # which is still passed through untouched (best-effort). Unlike a bare
+        # ``except:``, this no longer swallows KeyboardInterrupt/SystemExit.
+        return text
+    return text[:cut]
 def clean_url(text):
     # removing html tags
     text = DOUBLE_QUOTE_REGEX.sub('"', text)
     text = CURRENCY_REGEX.sub(r" \1 ", text)
     text = clean_url(text)
+    text = remove_tags(text)
     text = URL_REGEX.sub(" ", text)
     text = EMAIL_REGEX.sub(" ", text)
     text = PHONE_REGEX.sub(r" \1 ", text)

src/regexes/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (168 Bytes). View file

src/regexes/__pycache__/currency.cpython-38.pyc ADDED Viewed

Binary file (691 Bytes). View file

src/regexes/__pycache__/email.cpython-38.pyc ADDED Viewed

Binary file (482 Bytes). View file

src/regexes/__pycache__/latin.cpython-38.pyc ADDED Viewed

Binary file (382 Bytes). View file

src/regexes/__pycache__/number.cpython-38.pyc ADDED Viewed

Binary file (348 Bytes). View file

src/regexes/__pycache__/persian.cpython-38.pyc ADDED Viewed

Binary file (549 Bytes). View file

src/regexes/__pycache__/phone.cpython-38.pyc ADDED Viewed

Binary file (378 Bytes). View file

src/regexes/__pycache__/punk.cpython-38.pyc ADDED Viewed

Binary file (309 Bytes). View file

src/regexes/__pycache__/quote.cpython-38.pyc ADDED Viewed

Binary file (589 Bytes). View file

src/regexes/__pycache__/url.cpython-38.pyc ADDED Viewed

Binary file (777 Bytes). View file