flax-community
/

gpt2-medium-persian

@@ -2,11 +2,12 @@ from hazm import word_tokenize
 from hazm import sent_tokenize
 import re
 import six
 from normalizer import normalize
 persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
 def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
     candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
@@ -25,7 +26,39 @@ def filter_by_num_sents(text, gt=2):
 def normalizer(text, do_lowercase=False):
     """Normalize ``text`` and optionally lowercase the result.

     ``text`` is passed through the imported ``normalize`` function. When
     ``do_lowercase`` is truthy, ``.lower()`` is applied to that result
     before it is returned; otherwise the normalized value is returned as is.
     """
     normalized = normalize(text)
     return normalized.lower() if do_lowercase else normalized

 from hazm import sent_tokenize
 import re
 import six
+import string
 from normalizer import normalize
 persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
+allowed_char = string.ascii_letters + string.digits + ':/@_-. '
 def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
     candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
 def normalizer(text, do_lowercase=False):
     """Normalize ``text``, drop the 'ـ' character, optionally lowercase.

     ``text`` is first passed through the imported ``normalize`` function,
     then every occurrence of 'ـ' (the tatweel elongation mark) is removed.
     When ``do_lowercase`` is truthy the result is additionally lowercased
     with ``.lower()`` before being returned.
     """
     cleaned = normalize(text).replace('ـ', '')
     if not do_lowercase:
         return cleaned
     return cleaned.lower()
def clean_url(text):
    """Strip HTML tags, URLs and ``@`` fragments from ``text``.

    Three passes are applied in order:

    1. Every ``<...>`` span is deleted (non-greedy match).
    2. Every whitespace-free run that starts with ``http`` and has at
       least one more non-space character is deleted.
    3. The characters of ``text`` that appear in the module-level
       ``allowed_char`` (spaces excluded) are joined and split on ``:``.
       Each resulting fragment is then handled as follows:

       * if it contains ``//``, the strings ``'https :' + fragment``,
         ``'https:' + fragment``, ``'http :' + fragment`` and
         ``'http:' + fragment`` are removed from ``text``;
       * otherwise, if it contains ``@``, the fragment itself is removed
         from ``text``.

       Fragments matching neither rule leave ``text`` untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the matched tags, URLs and ``@`` fragments removed.
        Surrounding whitespace is not collapsed.
    """
    # Remove HTML tags.
    text = re.sub('<.*?>', '', text)
    # Remove ordinary URLs (no whitespace inside them).
    text = re.sub(r'http\S+', "", text)

    # Remove URLs written with a space before the colon, and @ fragments.
    # Fragments are compared without spaces, so spaces are dropped up front
    # (equivalent to filtering first and stripping spaces per fragment).
    squeezed = "".join(
        ch for ch in text if ch != ' ' and ch in allowed_char
    )

    # NOTE: the `https` variants are kept from the original code, but the
    # `http\S+` pass above already consumes any token that starts with
    # `http` followed by a non-space character, so in practice only the
    # 'http :' form can still be present at this point.
    scheme_prefixes = ('https :', 'https:', 'http :', 'http:')

    for fragment in squeezed.split(':'):
        # The original conditions were written as `"/ /" or "//" in p`,
        # which is always truthy; test membership explicitly instead.
        if '//' in fragment:
            for prefix in scheme_prefixes:
                text = text.replace(prefix + fragment, '')
        elif '@' in fragment:
            text = text.replace(fragment, '')
        # Any other fragment is plain ASCII content (words, punctuation)
        # and is deliberately left in place.
    return text