"""Streamlit page for running and undoing AraBERT text pre-processing."""

import re

import awesome_streamlit as ast
import streamlit as st

from .preprocess import (
    ArabertPreprocessor,
    left_and_right_spaced_chars,
    left_spaced_chars,
    right_spaced_chars,
    white_spaced_back_quotation_regex,
    white_spaced_double_quotation_regex,
    white_spaced_em_dash,
    white_spaced_single_quotation_regex,
)

# Options offered in the "Model" select box. "None" unlocks the manual
# filter checkboxes rendered by `write`.
MODELS_to_SELECT = [
    "None",
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]


def unpreprocess(text: str) -> str:
    """Re-format pre-processed text so that it reads like natural text.

    Farasa segmentation markers are removed first, then the whitespace that
    pre-processing put around quotation marks, em dashes, punctuation and
    brackets is collapsed again.

    Args:
        text: Input text to be un-preprocessed.

    Returns:
        The un-preprocessed, Farasa-desegmented text.
    """
    text = desegment(text)

    # Remove the spaces just inside paired quoting characters,
    # e.g.  i " ate " an apple  -->  i "ate" an apple
    # https://stackoverflow.com/a/53436792/5381220
    #
    # The replacement templates are plain raw strings: the former "\`" and
    # "\—" literals were invalid string escapes that re.sub left untouched,
    # so a stray backslash leaked into the output.
    text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
    text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
    text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
    # Em dashes have their own pattern; the back-quotation pattern used to be
    # applied a second time here, so em-dash spacing was never collapsed.
    text = re.sub(white_spaced_em_dash, r"—\1—", text)

    # During generation the models sometimes omit the space after a dot;
    # isolate every dot, then normalise all runs of whitespace.
    text = text.replace(".", " . ")
    text = " ".join(text.split())

    # Re-attach decimal and thousands separators that the step above split.
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text


def desegment(text: str) -> str:
    """Remove Farasa segmentation from a whole text.

    Use this function if sentence tokenization was done using
    `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
    AraBERT segmentation using Farasa adds a space after the '+' for
    prefixes, and before the '+' for suffixes.

    Example:
        >>> desegment('ال+ دراس +ات')
        'الدراسات'
    """
    # Glue every prefix/suffix marker back onto its word before splitting.
    text = text.replace("+ ", "+")
    text = text.replace(" +", "+")
    return " ".join(_desegmentword(word) for word in text.split(" "))


def _desegmentword(orig_word: str) -> str:
    """Remove the '+' signs from a single Farasa-segmented word.

    Example:
        >>> _desegmentword("ال+يومي+ة")
        'اليومية'
    """
    word = orig_word.replace("ل+ال+", "لل")
    # Only contract a trailing "ل+ال" when the word does not contain "ال+ال".
    if "ال+ال" not in orig_word:
        word = word.replace("ل+ال", "لل")
    word = word.replace("+", "")
    # The contractions above can leave three consecutive lams; keep two.
    word = word.replace("للل", "لل")
    return word


def write():
    """Render the pre-processor page.

    The page has two parts: a text box whose content is run through
    `ArabertPreprocessor` when "Run Pre-Processor" is pressed, and a second
    text box whose content is passed to `unpreprocess` when
    "Run Un-Pre-Processor" is pressed.
    """
    col1, _ = st.columns(2)

    with col1:
        col1.title("Arabic Text Pre-Processor")

    # NOTE(review): the markup body is blank in this file; presumably an
    # HTML/CSS snippet was meant to go here - confirm.
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )

    input_text = st.text_input(
        "Text to Pre-Process",
        value="ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري",
    )

    aligning_cols = st.columns(5)
    model_selector = aligning_cols[0].selectbox("Model", options=MODELS_to_SELECT)
    aligning_cols[1].write("#")
    aligning_cols[1].write("Select None to enable further filters")

    # The individual filters are only offered when no model is selected;
    # the names bound here are read again below under the same condition.
    if model_selector == "None":
        cols = st.columns(5)
        keep_emojis = cols[0].checkbox("Keep emojis", False)
        remove_html_markup = cols[0].checkbox("Remove html markup", True)
        strip_tashkeel = cols[1].checkbox("Strip tashkeel", True)
        replace_urls_emails_mentions = cols[1].checkbox("Replace urls and emails", True)
        strip_tatweel = cols[2].checkbox("Strip tatweel", True)
        insert_white_spaces = cols[2].checkbox("Insert white spaces", True)
        remove_non_digit_repetition = cols[3].checkbox(
            "Remove non-digit repetition", True
        )
        # These three previously passed None as the initial value; an
        # unchecked box is spelled False explicitly.
        replace_slash_with_dash = cols[3].checkbox("Replace slash with dash", False)
        map_hindi_numbers_to_arabic = cols[4].checkbox(
            "Map hindi numbers to arabic", False
        )
        apply_farasa_segmentation = cols[4].checkbox("Apply farasa segmentation", False)

    run_preprocessor = st.button("Run Pre-Processor")

    prep_text = None
    if run_preprocessor:
        if model_selector == "None":
            # Positional order must match ArabertPreprocessor's signature.
            arabert_preprocessor = ArabertPreprocessor(
                model_selector,
                keep_emojis,
                remove_html_markup,
                replace_urls_emails_mentions,
                strip_tashkeel,
                strip_tatweel,
                insert_white_spaces,
                remove_non_digit_repetition,
                replace_slash_with_dash,
                map_hindi_numbers_to_arabic,
                apply_farasa_segmentation,
            )
        else:
            arabert_preprocessor = ArabertPreprocessor(model_name=model_selector)
        # NOTE(review): this calls a private method for every model choice;
        # confirm that is intended rather than a public entry point.
        prep_text = arabert_preprocessor._preprocess_v3(input_text)
        st.write(prep_text)

    st.write("-----")

    # Pre-fill the second box with the freshly pre-processed text when there
    # is one, otherwise with a segmented sample sentence.
    input_text_unprep = st.text_input(
        "Text to Undo the Pre-Processing",
        value=prep_text
        if prep_text
        else "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري",
    )
    run_unpreprocessor = st.button("Run Un-Pre-Processor")

    if run_unpreprocessor:
        st.write(unpreprocess(input_text_unprep))