flax-community
/

gpt2-medium-persian

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

gpt2-medium-persian / src /normalizer.py

saied's picture

adding dataset preparation module

73d5951 over 3 years ago

3.5 kB

	import hazm
	import re

	from regexes.currency import CURRENCY_REGEX
	from regexes.email import EMAIL_REGEX
	from regexes.latin import LATIN_REGEX
	from regexes.latin import LATIN_REGEX, LATIN_WITH_SPECIAL_REGEX
	from regexes.number import NUMBERS_REGEX
	from regexes.phone import PHONE_REGEX
	from regexes.quote import DOUBLE_QUOTE_REGEX, SINGLE_QUOTE_REGEX
	from regexes.url import URL_REGEX
	from regexes.persian import PERSIAN_REGEX
	from regexes.punk import PUNK_REGEX
	import dictionary


	def make_trans(list_a, list_b):
	    """Build a translation table suitable for ``str.translate``.

	    Items of ``list_a`` and ``list_b`` are paired by position. Each item of
	    ``list_a`` must be a single character because it is passed to ``ord``;
	    items of ``list_b`` are used as the replacement values unchanged.
	    Pairing stops at the end of the shorter input (``zip`` semantics).

	    Returns:
	        A dict mapping the code point of every ``list_a`` character to the
	        matching ``list_b`` item.
	    """
	    return {ord(a): b for a, b in zip(list_a, list_b)}


	def multiple_replace(text, chars_to_mapping):
	    """Replace every occurrence of each mapping key in ``text`` in one pass.

	    Args:
	        text: Value to process; it is converted with ``str()`` first.
	        chars_to_mapping: Dict of literal substring -> replacement string.
	            When several keys could match at the same position, the key
	            listed first in the mapping wins (regex alternation order).

	    Returns:
	        The converted string with all matched keys replaced by their values.
	        With an empty mapping the converted string is returned unchanged.
	    """
	    text = str(text)
	    # An empty mapping would produce an empty pattern, which matches at
	    # every position and then fails the dictionary lookup below.
	    if not chars_to_mapping:
	        return text
	    # Keys are escaped so they match literally and are joined with a bare
	    # "|": a backslash-escaped pipe is a literal "|" in a regex, not an
	    # alternation, and would make the pattern match none of the keys.
	    pattern = "|".join(map(re.escape, chars_to_mapping))
	    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)


	# Digit alphabets used to build the ``str.translate`` tables below. Each one
	# ends with its percent sign so that character is translated with the digits.
	_ARABIC_DIGITS = "٠١٢٣٤٥٦٧٨٩٪"
	_PERSIAN_DIGITS = "۰۱۲۳۴۵۶۷۸۹٪"
	_ASCII_DIGITS = "0123456789%"

	# Arabic-Indic digits -> Persian digits, then Persian digits -> ASCII digits.
	ar2fa_digits = make_trans(_ARABIC_DIGITS, _PERSIAN_DIGITS)
	fa2en_digits = make_trans(_PERSIAN_DIGITS, _ASCII_DIGITS)

	# Single hazm normalizer instance, created at import time and reused by
	# normalize().
	normalizer = hazm.Normalizer(persian_numbers=True)


	def normalize(text, zwnj="\u200c", tokenized=False):
	    """Clean up ``text`` and return it as a string or as a list of tokens.

	    Steps, in order:
	      1. Newlines and tabs become spaces; runs of U+200C collapse to one
	         (this collapse is hard-coded to U+200C, independent of ``zwnj``).
	      2. The module-level hazm ``normalizer`` is applied.
	      3. ``dictionary.characters`` replacements are applied, if any.
	      4. Digits are translated with ``ar2fa_digits`` then ``fa2en_digits``.
	      5. Quote variants are unified to ``'`` and ``"``; currency, URL,
	         e-mail, phone, number, Latin and punctuation matches are padded
	         with a space on each side.
	      6. Everything matched by ``PERSIAN_REGEX`` is replaced by a space.
	      7. Spaces next to ``zwnj`` are removed.
	      8. ``dictionary.special_tokens`` replacements are applied, if any.
	      9. The text is split on whitespace and ``zwnj`` is trimmed from both
	         ends of every token; tokens left empty by trimming are dropped.

	    Args:
	        text: Input string.
	        zwnj: Joiner string whose surrounding spaces are removed and which
	            is trimmed from token edges (default: zero-width non-joiner).
	        tokenized: When true, return the list of tokens instead of a string.

	    Returns:
	        The tokens joined by single spaces, or the token list itself when
	        ``tokenized`` is true.
	    """
	    text = text.replace("\n", " ").replace("\t", " ")
	    text = re.sub(r"\u200c+", "\u200c", text)

	    text = normalizer.normalize(text)

	    if dictionary.characters:
	        text = multiple_replace(text, dictionary.characters)

	    text = text.translate(ar2fa_digits)
	    text = text.translate(fa2en_digits)

	    text = SINGLE_QUOTE_REGEX.sub("'", text)
	    text = DOUBLE_QUOTE_REGEX.sub('"', text)
	    # Pad each recognised entity with spaces so it splits into its own token.
	    text = CURRENCY_REGEX.sub(r" \1 ", text)
	    text = URL_REGEX.sub(r" \1 ", text)
	    text = EMAIL_REGEX.sub(r" \1 ", text)
	    text = PHONE_REGEX.sub(r" \1 ", text)
	    text = NUMBERS_REGEX.sub(r" \1 ", text)
	    text = LATIN_REGEX.sub(r" \1 ", text)
	    text = PUNK_REGEX.sub(r" \1 ", text)

	    # Allow only english and persian characters: whatever PERSIAN_REGEX
	    # (defined in regexes.persian) matches is blanked out.
	    text = re.sub(PERSIAN_REGEX, " ", text)

	    # The padding above can leave spaces around the joiner; glue it back to
	    # its neighbours.
	    text = text.replace(f" {zwnj} ", f"{zwnj}")
	    text = text.replace(f"{zwnj} ", f"{zwnj}")
	    text = text.replace(f" {zwnj}", f"{zwnj}")

	    if dictionary.special_tokens:
	        text = multiple_replace(text, dictionary.special_tokens)

	    tokens = []
	    for token in text.split():
	        token = _trim_zwnj(token, zwnj)
	        # A token made only of joiners trims down to nothing; skip it so no
	        # empty entries or doubled spaces reach the output.
	        if token:
	            tokens.append(token)

	    if tokenized:
	        return tokens

	    return " ".join(tokens)


	def _trim_zwnj(token, zwnj):
	    """Remove every leading and trailing occurrence of ``zwnj`` from ``token``."""
	    if not zwnj:
	        # An empty affix matches everywhere; there is nothing to trim.
	        return token
	    width = len(zwnj)
	    while token.startswith(zwnj):
	        token = token[width:]
	    while token.endswith(zwnj):
	        token = token[:-width]
	    return token


	# if __name__ == '__main__':
	# import textwrap

	# input_text = "دارهٔ تحقیقات فدرال در سال ۱۹۰۸ به نام ادارهٔ تحقیقات (BOI یا BI) بنیان‌گذاری شد. نام این سازمان در سال ۱۹۳۵ به ادارهٔ تحقیقات فدرال تغییر یافت. دفتر مرکزی اف‌بی‌آی در ساختمان جی. ادگار هوور در شهر واشینگتن، دی.سی. واقع شده‌است."
	# input_text = "یونان (به یونانی: Ελλάδα, اِلادا)"
	# input_text = "نسخهٔ"
	# input_text = "ὑ蕉Ұ제ṅ尘̲改座◦花芝秀黄天자埃澤ಿ ˈazbab اینجا ایران خانه‌شما است؟!۱۲۳۱۲۳۱۳۱۲ اَلْحُرُوفُ ٱلْعَرَبِیَّة"
	# input_text = normalize(input_text)
	# print(textwrap.fill(input_text))
	# print(normalize(input_text, tokenized=True))