multilingual-dokugpt / insert_spaces.py
ffreemt
Update insert_spaces.py
767bc4e
raw
history blame
1.43 kB
"""Insert spaces, mypython/split_chinese.py."""
import re
def insert_spaces(text: str, method: int = None) -> str:
r"""Insert space between Chinese characters.
To speed up, first check text contains more latin letters or Chinese charas, if more latin letters use insert_spaces(text,, 3) else use insert_spaces(text, None)
Args:
text: string of latin and Chinese chars
method:
None: default, re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " ")) # NOQA
1: re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
>>> insert_spaces("test亨利it四世上").strip()
'test 亨 利 it 四 世 上'
>>> insert_spaces("test亨利it四世上").strip().__len__()
17
"""
if method is None:
if re.findall(r"[a-zA-Z ]+", text).__len__() > len(text) // 2: # more latin # NOQA
method = 3
else: # more Chinese
method = 0
if method == 0:
return re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))
elif method == 1:
return re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
elif method == 2:
return re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
elif method == 3:
return re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)
else:
return re.sub(
r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " ")
) # NOQA