from hazm import word_tokenize, sent_tokenize
import re
import six
import string
from normalizer import normalize

# Persian letters and digits (plus ASCII digits and the zero-width
# non-joiner \u200c) used by the language-ratio filter below.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
# Characters that may legitimately appear inside a URL or handle.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Return True if more than `ratio` of the non-space characters match `regex`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # avoid ZeroDivisionError on empty or whitespace-only input
        return False
    return len(candidate_text) / len(text) > ratio


def filter_by_num_tokens(text, gt=64):
    """Return True if the text has more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Return True if the text has more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def normalizer(text, do_lowercase=False):
    """Normalize the text and strip the tatweel/kashida character (ـ)."""
    text = normalize(text)
    text = text.replace('ـ', '')
    if do_lowercase:
        text = text.lower()
    return text


def clean_url(text):
    # Remove HTML tags.
    text = re.sub('<.*?>', '', text)
    # Remove well-formed (space-free) URLs.
    text = re.sub(r'http\S+', "", text)
    # Remove URLs that contain stray spaces: keep only URL-safe characters,
    # drop the spaces, and split on ':' to recover candidate fragments.
    result = ''.join(char for char in text if char in allowed_char)
    result = result.replace(' ', '')
    for p in result.split(':'):
        if "/ /" in p or "//" in p:  # fragment looks like the path part of a URL
            if ('https :' + p) in text or ('https:' + p) in text:
                text = text.replace('https :' + p, '')
                text = text.replace('https:' + p, '')
            elif ('http :' + p) in text or ('http:' + p) in text:
                text = text.replace('http :' + p, '')
                text = text.replace('http:' + p, '')
            elif '@' in p and p in text:  # e-mail address or handle
                text = text.replace(p, '')
        else:
            # Leftover URL-safe fragment without a scheme; drop it verbatim.
            text = text.replace(p, "")
    return text
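
# --- Usage sketch (illustrative, not part of the original pipeline) ---
# Assumes hazm and six are installed and that the local `normalizer` module
# imported above provides `normalize`. The sample string is made up, and the
# `gt` thresholds are lowered from their defaults so a short sample is not
# trivially rejected.
if __name__ == '__main__':
    sample = "این یک متن نمونه است. جمله‌ی دوم هم اینجاست: https://example.com/page"
    cleaned = normalizer(clean_url(sample))
    # Each filter returns a bool; a document is kept only if all of them pass.
    print(cleaned)
    print(filter_by_lang_regex(cleaned, ratio=0.7),
          filter_by_num_tokens(cleaned, gt=4),
          filter_by_num_sents(cleaned, gt=1))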