import re

# Punctuation to normalise, mapped to its replacement text ("" means "drop").
# Every key is a single character, so the alternation built from it below
# cannot suffer from one key being a prefix of another.
SYMBOLS_MAPPING = {
    "\n": "",
    "…": ".",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "(": "",
    ")": "",
    # Fullwidth forms are written as escapes so they cannot be confused with
    # (or silently normalised into) their ASCII look-alikes, which would
    # produce duplicate dict keys and leave the fullwidth forms unhandled.
    "\uff08": "",  # fullwidth left parenthesis
    "\uff09": "",  # fullwidth right parenthesis
    "・": "",
    "·": "",
    "「": "'",
    "」": "'",
    "《": "'",
    "》": "'",
    "—": "",
    "~": "",
    "\uff5e": "",  # fullwidth tilde
    ":": ",",
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    "\uff1a": ",",  # fullwidth colon
}

# Matches any single key of SYMBOLS_MAPPING.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(re.escape(symbol) for symbol in SYMBOLS_MAPPING)
)

# Matches runs of characters from the listed emoji code-point ranges.
EMOJI_REGEX = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# Matches two or more consecutive periods/commas (possibly mixed, e.g. ".,").
_REPEATED_PUNCTUATION_REGEX = re.compile(r"[.,]{2,}")


def clean_text(text: str) -> str:
    """Normalise punctuation in *text* and strip emojis.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Replace every character that is a key of ``SYMBOLS_MAPPING`` with its
       mapped value (brackets and newlines are dropped, CJK/curly quotes
       become ``'``, colons and semicolons become ``,``, ``…`` becomes ``.``).
    3. Remove characters matched by ``EMOJI_REGEX``.
    4. Collapse each run of two or more periods/commas to the first
       character of that run.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.

    >>> clean_text("  wait,,, what…  ")
    'wait, what.'
    """
    # Stripping happens first, so whitespace exposed by removing a leading or
    # trailing symbol (e.g. "[ hi ]") is intentionally left in place.
    text = text.strip()

    # Each match is a single mapped character; look up its replacement.
    text = REPLACE_SYMBOL_REGEX.sub(
        lambda match: SYMBOLS_MAPPING[match.group()], text
    )

    text = EMOJI_REGEX.sub("", text)

    # Runs may mix "." and ","; the first character of the run is kept.
    return _REPEATED_PUNCTUATION_REGEX.sub(
        lambda match: match.group()[0], text
    )