# gpt2-medium-persian/src/data_utils.py
from hazm import word_tokenize, sent_tokenize
import re
import six
import string

from normalizer import normalize

# Persian letters, Persian/ASCII digits, and the zero-width non-joiner (\u200c)
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
# ASCII characters commonly found in URLs, e-mail addresses, and handles
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Keep text only if the share of Persian characters exceeds `ratio`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # guard against empty or whitespace-only input
        return False
    return len(candidate_text) / len(text) > ratio


def filter_by_num_tokens(text, gt=64):
    """Keep text with more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Keep text with more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def normalizer(text, do_lowercase=False):
    """Normalize text and drop the Arabic tatweel (kashida) used to stretch letters."""
    text = normalize(text)
    text = text.replace('ـ', '')
    if do_lowercase:
        text = text.lower()
    return text


def clean_url(text):
    ## remove HTML tags
    text = re.sub('<.*?>', '', text)
    ## remove well-formed URLs (no internal spaces)
    text = re.sub(r'http\S+', "", text)
    ## remove URLs that contain spaces: collect the URL-safe ASCII characters,
    ## split the scheme off on ':', then strip each candidate from the text
    result = ''
    for char in text:
        if char in allowed_char:
            result += char
    result = result.replace(' ', '')
    result = result.split(':')
    for p in result:
        if "/ /" in p or "//" in p:
            if ('https :' + p) in text or ('https:' + p) in text:
                text = text.replace('https :' + p, '')
                text = text.replace('https:' + p, '')
            elif ('http :' + p) in text or ('http:' + p) in text:
                text = text.replace('http :' + p, '')
                text = text.replace('http:' + p, '')
        elif '@' in p:
            if p in text:
                text = text.replace(p, '')
        else:
            text = text.replace(p, "")
    return text
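

# A minimal usage sketch (an addition, not part of the original pipeline):
# chains the helpers above on a single document. The sample string is an
# assumption for illustration; `normalize` comes from the local `normalizer`
# module imported above and must be importable for this to run.
if __name__ == "__main__":
    sample = "<p>این یک متن نمونه است. https://example.com را ببینید.</p>"
    sample = clean_url(sample)
    sample = normalizer(sample)
    # Each filter returns a bool; a corpus pipeline would keep only documents
    # that pass all of them.
    keep = (
        filter_by_lang_regex(sample)
        and filter_by_num_tokens(sample, gt=2)
        and filter_by_num_sents(sample, gt=1)
    )
    print(keep, repr(sample))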