# gpt2-medium-persian/src/data_utils.py
import re

import six
from hazm import sent_tokenize, word_tokenize

from normalizer import normalize  # local normalization module (normalizer.py)

# Characters treated as "Persian" when filtering by language: ASCII and
# Persian digits, the Persian alphabet, and the zero-width non-joiner (\u200c).
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"

def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Keep a text only if more than `ratio` of its non-space characters are Persian."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # guard against division by zero on empty/whitespace-only input
        return False
    return (len(candidate_text) / len(text)) > ratio

def filter_by_num_tokens(text, gt=64):
    """Keep a text only if it has more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt

def filter_by_num_sents(text, gt=2):
    """Keep a text only if it has more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt

def normalizer(text, do_lowercase=False):
    """Normalize a text; optionally lowercase it (relevant for Latin characters)."""
    text = normalize(text)
    if do_lowercase:
        text = text.lower()
    return text
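
if __name__ == "__main__":
    # --- Illustrative usage sketch, not part of the original script. ---
    # The sample text and thresholds below are assumptions chosen only to
    # demonstrate the API of the helpers defined above.
    sample = "زبان فارسی یکی از زبان‌های هندواروپایی است. این زبان در ایران صحبت می‌شود."
    sample = normalizer(sample)
    print(filter_by_lang_regex(sample))        # True: text is almost entirely Persian
    print(filter_by_num_tokens(sample, gt=5))  # True: more than 5 word tokens
    print(filter_by_num_sents(sample, gt=1))   # True: more than 1 sentence

    # If the corpus is loaded with Hugging Face `datasets` (an assumption;
    # loading is not shown in this file), the same predicates plug into
    # `Dataset.filter`, e.g.:
    #   dataset = dataset.filter(lambda ex: filter_by_lang_regex(ex["text"]))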