from hazm import word_tokenize, sent_tokenize
import re
import six
import string
from normalizer import normalize

# Persian letters and digits (plus ASCII digits and the zero-width
# non-joiner \u200c) used by the language-ratio filter below.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
# Characters that may legitimately appear inside a URL or handle.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Return True if more than `ratio` of the non-space characters match `regex`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # avoid ZeroDivisionError on empty or whitespace-only input
        return False
    return len(candidate_text) / len(text) > ratio


def filter_by_num_tokens(text, gt=64):
    """Return True if the text has more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Return True if the text has more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def normalizer(text, do_lowercase=False):
    """Normalize the text and strip the tatweel/kashida character (ـ)."""
    text = normalize(text)
    text = text.replace('ـ', '')
    if do_lowercase:
        text = text.lower()
    return text


def clean_url(text):
    # Remove HTML tags.
    text = re.sub('<.*?>', '', text)
    # Remove well-formed (space-free) URLs.
    text = re.sub(r'http\S+', "", text)
    # Remove URLs that contain stray spaces: keep only URL-safe characters,
    # drop the spaces, and split on ':' to recover candidate fragments.
    result = ''.join(char for char in text if char in allowed_char)
    result = result.replace(' ', '')
    for p in result.split(':'):
        if "/ /" in p or "//" in p:  # fragment looks like the path part of a URL
            if ('https :' + p) in text or ('https:' + p) in text:
                text = text.replace('https :' + p, '')
                text = text.replace('https:' + p, '')
            elif ('http :' + p) in text or ('http:' + p) in text:
                text = text.replace('http :' + p, '')
                text = text.replace('http:' + p, '')
            elif '@' in p and p in text:  # e-mail address or handle
                text = text.replace(p, '')
        else:
            # Leftover URL-safe fragment without a scheme; drop it verbatim.
            text = text.replace(p, "")
    return text
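
# --- Usage sketch (illustrative, not part of the original pipeline) ---
# Assumes hazm and six are installed and that the local `normalizer` module
# imported above provides `normalize`. The sample string is made up, and the
# `gt` thresholds are lowered from their defaults so a short sample is not
# trivially rejected.
if __name__ == '__main__':
    sample = "این یک متن نمونه است. جمله‌ی دوم هم اینجاست: https://example.com/page"
    cleaned = normalizer(clean_url(sample))
    # Each filter returns a bool; a document is kept only if all of them pass.
    print(cleaned)
    print(filter_by_lang_regex(cleaned, ratio=0.7),
          filter_by_num_tokens(cleaned, gt=4),
          filter_by_num_sents(cleaned, gt=1))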