# gpt2-medium-persian/src/data_utils.py
from hazm import word_tokenize, sent_tokenize
import re
import six
import string

from normalizer import normalize

# Persian letters, Persian/ASCII digits, and the zero-width non-joiner (\u200c)
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
# ASCII characters commonly found in URLs, e-mail addresses, and handles
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Keep text only if the share of Persian characters exceeds `ratio`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # guard against empty or whitespace-only input
        return False
    return len(candidate_text) / len(text) > ratio


def filter_by_num_tokens(text, gt=64):
    """Keep text with more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Keep text with more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def normalizer(text, do_lowercase=False):
    """Normalize text and drop the Arabic tatweel (kashida) used to stretch letters."""
    text = normalize(text)
    text = text.replace('ـ', '')
    if do_lowercase:
        text = text.lower()
    return text


def clean_url(text):
    ## remove HTML tags
    text = re.sub('<.*?>', '', text)
    ## remove well-formed URLs (no internal spaces)
    text = re.sub(r'http\S+', "", text)
    ## remove URLs that contain spaces: collect the URL-safe ASCII characters,
    ## split the scheme off on ':', then strip each candidate from the text
    result = ''
    for char in text:
        if char in allowed_char:
            result += char
    result = result.replace(' ', '')
    result = result.split(':')
    for p in result:
        if "/ /" in p or "//" in p:
            if ('https :' + p) in text or ('https:' + p) in text:
                text = text.replace('https :' + p, '')
                text = text.replace('https:' + p, '')
            elif ('http :' + p) in text or ('http:' + p) in text:
                text = text.replace('http :' + p, '')
                text = text.replace('http:' + p, '')
        elif '@' in p:
            if p in text:
                text = text.replace(p, '')
        else:
            text = text.replace(p, "")
    return text
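

# A minimal usage sketch (an addition, not part of the original pipeline):
# chains the helpers above on a single document. The sample string is an
# assumption for illustration; `normalize` comes from the local `normalizer`
# module imported above and must be importable for this to run.
if __name__ == "__main__":
    sample = "<p>این یک متن نمونه است. https://example.com را ببینید.</p>"
    sample = clean_url(sample)
    sample = normalizer(sample)
    # Each filter returns a bool; a corpus pipeline would keep only documents
    # that pass all of them.
    keep = (
        filter_by_lang_regex(sample)
        and filter_by_num_tokens(sample, gt=2)
        and filter_by_num_sents(sample, gt=1)
    )
    print(keep, repr(sample))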