import re
import string

from hazm import sent_tokenize, word_tokenize
import six

from normalizer import normalize

# Characters treated as "Persian" for language filtering: ASCII/Persian digits,
# Persian letters and the zero-width non-joiner (ZWNJ).
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
# Characters that may appear in URLs, e-mail addresses and @-handles; used by clean_url.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Return True if more than `ratio` of the non-space characters are Persian."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:
        # Avoid division by zero on empty or whitespace-only input.
        return False
    return len(candidate_text) / len(text) > ratio


def filter_by_num_tokens(text, gt=64):
    """Return True if the text has more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Return True if the text has more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def normalizer(text, do_lowercase=False):
    """Normalize the text and strip the tatweel (kashida) character."""
    text = normalize(text)
    text = text.replace('ـ', '')
    if do_lowercase:
        text = text.lower()
    return text


def clean_url(text):
    """Remove HTML tags, URLs (including ones broken by stray spaces) and @-handles."""
    # Drop HTML tags and well-formed URLs.
    text = re.sub('<.*?>', '', text)
    text = re.sub(r'http\S+', "", text)

    # Keep only URL-like ASCII characters, then split on ':' to recover fragments of
    # URLs whose scheme was separated from the rest (e.g. "https : //example.com").
    result = ''
    for char in text:
        if char in allowed_char:
            result += char
    result = result.replace(' ', '')
    result = result.split(':')

    for phrase in result:
        p = phrase.replace(' ', '')
        if "/ /" in p or "//" in p:
            if ('https :' + p) in text or ('https:' + p) in text:
                text = text.replace('https :' + p, '')
                text = text.replace('https:' + p, '')
            elif ('http :' + p) in text or ('http:' + p) in text:
                text = text.replace('http :' + p, '')
                text = text.replace('http:' + p, '')
            elif '@' in p:
                # Remove @-handles (e.g. usernames) that survived the URL regex.
                if p in text:
                    text = text.replace(p, '')
            else:
                # Remove any other leftover URL-like fragment.
                text = text.replace(p, "")

    return text
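
# A minimal usage sketch (an illustrative assumption, not part of the original module):
# chain clean_url and normalizer, then keep documents that pass the language and
# token-count filters. The sample documents and thresholds below are made up, and
# running this requires hazm plus the local normalizer module.
if __name__ == "__main__":
    docs = [
        "این یک متن نمونه است. آدرس https://example.com را ببینید.",
        "short latin-only text",
    ]
    kept = []
    for doc in docs:
        doc = normalizer(clean_url(doc))
        if filter_by_lang_regex(doc, ratio=0.7) and filter_by_num_tokens(doc, gt=3):
            kept.append(doc)
    print(kept)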