Spaces:
Runtime error
Runtime error
ffreemt
committed on
Commit
•
767bc4e
1
Parent(s):
fa65d76
Update insert_spaces.py
Browse files- insert_spaces.py +39 -0
insert_spaces.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Insert spaces, mypython/split_chinese.py."""
|
2 |
+
import re
|
3 |
+
|
4 |
+
|
5 |
+
def insert_spaces(text: str, method: int = None) -> str:
    r"""Insert spaces between Chinese characters, keeping latin/digit runs intact.

    When ``method`` is None the strategy is chosen automatically to speed
    things up: if more than half of the characters in ``text`` are latin
    letters or spaces, method 3 is used, otherwise method 0. Methods 0 and 3
    can differ in leading whitespace, so callers normally ``.strip()`` the
    result (as the examples below do).

    Args:
        text: string of latin and Chinese chars
        method:
            None: default, pick method 3 or method 0 as described above
            0: re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))  # NOQA
            1: re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
            2: re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
            3: re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)
            any other value: handled like method 0

    Returns:
        ``text`` with spaces inserted according to the selected method.

    >>> insert_spaces("test亨利it四世上").strip()
    'test 亨 利 it 四 世 上'
    >>> insert_spaces("test亨利it四世上").strip().__len__()
    17

    """
    if method is None:
        # Count latin *characters* (not runs of them): the number of runs
        # can almost never exceed half the text length, which would leave
        # the latin-heavy fast path unused.
        latin_count = len(re.findall(r"[a-zA-Z ]", text))
        method = 3 if latin_count > len(text) // 2 else 0

    if method == 1:
        # Append a space after each Chinese char and after each run of
        # non-space, non-Chinese chars.
        return re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
    if method == 2:
        return re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
    if method == 3:
        # Zero-width matches: insert a space at every position that is
        # adjacent to a character other than an ASCII letter or digit.
        return re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)

    # Method 0 and any unrecognized value: put a space around every
    # character, then remove the spaces separating ASCII letters/digits.
    return re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))
|