Spaces:
Paused
Paused
import re | |
# Punctuation normalization table: each key is a single character to rewrite
# and each value is its replacement (an empty string deletes the character).
#
# Non-ASCII keys are spelled as \u escapes so the table survives being re-saved
# or copied through a tool that mis-decodes the file; a mis-decoded table
# collapses distinct keys into duplicates and later entries silently win.
# NOTE(review): the non-ASCII code points below were reconstructed from a
# mis-decoded copy of this table -- confirm them against the upstream file.
SYMBOLS_MAPPING = {
    "\n": "",
    "\u2026": ".",  # horizontal ellipsis
    "\u201c": "'",  # left double quotation mark
    "\u201d": "'",  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u3010": "",  # left black lenticular bracket
    "\u3011": "",  # right black lenticular bracket
    "[": "",
    "]": "",
    "\uff08": "",  # fullwidth left parenthesis
    "\uff09": "",  # fullwidth right parenthesis
    "(": "",
    ")": "",
    "\u30fb": "",  # katakana middle dot
    "\u00b7": "",  # middle dot
    "\u300c": "'",  # left corner bracket
    "\u300d": "'",  # right corner bracket
    "\u300a": "'",  # left double angle bracket
    "\u300b": "'",  # right double angle bracket
    "\u2014": "",  # em dash
    "\uff5e": "",  # fullwidth tilde
    "~": "",
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    ";": ",",
    ":": ",",
}
# One alternation that matches any key of SYMBOLS_MAPPING. Keys are escaped so
# regex metacharacters such as "[" and "(" are matched literally.
# `|` takes the first alternative that matches, not the longest one, so keys
# are ordered longest-first: otherwise a key that is a prefix of a longer key
# would shadow it. sorted() is stable, so equal-length keys keep dict order.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(
        re.escape(key)
        for key in sorted(SYMBOLS_MAPPING, key=len, reverse=True)
    )
)
# Inclusive (first, last) code point pairs that EMOJI_REGEX treats as emoji.
_EMOJI_RANGES = (
    ("\U0001F600", "\U0001F64F"),  # emoticons
    ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
    ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
    ("\U0001F1E0", "\U0001F1FF"),  # flags (iOS)
)

# Matches one or more consecutive characters from any of the ranges above, so
# a whole run of emoji is consumed by a single match.
EMOJI_REGEX = re.compile(
    "[" + "".join("-".join(bounds) for bounds in _EMOJI_RANGES) + "]+",
    flags=re.UNICODE,
)
def clean_text(text):
    """Return *text* with punctuation normalized and emoji removed.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Replace every match of ``REPLACE_SYMBOL_REGEX`` with the value stored
       for it in ``SYMBOLS_MAPPING``.
    3. Delete every match of ``EMOJI_REGEX``.
    4. Collapse each run of two or more ``.``/``,`` characters to the first
       character of that run (so ``"..."`` becomes ``"."`` and ``".,"``
       becomes ``"."``).
    """
    stripped = text.strip()

    def _mapped_symbol(match):
        # The regex is built from the mapping's keys, so the lookup is by the
        # exact text that matched.
        return SYMBOLS_MAPPING[match.group()]

    normalized = REPLACE_SYMBOL_REGEX.sub(_mapped_symbol, stripped)
    without_emoji = EMOJI_REGEX.sub("", normalized)

    # Step 3 runs before this on purpose of ordering only: the collapse sees
    # the text exactly as the earlier substitutions left it.
    return re.sub(r"[.,]{2,}", lambda run: run.group()[0], without_emoji)