ffreemt committed on
Commit
767bc4e
1 Parent(s): fa65d76

Update insert_spaces.py

Browse files
Files changed (1) hide show
  1. insert_spaces.py +39 -0
insert_spaces.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Insert spaces, mypython/split_chinese.py."""
2
+ import re
3
+
4
+
5
def insert_spaces(text: str, method: int = None) -> str:
    r"""Insert spaces so that each Chinese character stands alone.

    Runs of ASCII letters/digits are kept together as one token; every
    other character is separated from its neighbours by a space.

    Methods 0 and 3 produce the same spacing in the interior of the string
    and differ only at its two ends: method 0 always adds a leading and a
    trailing space, method 3 adds one only next to a character that is not
    an ASCII letter or digit. Call ``.strip()`` on the result if boundary
    whitespace matters.

    Args:
        text: string of latin and Chinese chars.
        method: which implementation to use.
            None: default, pick automatically -- 3 when more than half of
                the characters are ASCII letters or spaces, otherwise 0.
            0: put a space between every pair of characters with
                ``text.replace("", " ")``, then delete the spaces that
                ended up between two ASCII letters/digits.
            1: append a space after every character in the range 一-龟 and
                after every run of characters that are neither a space
                nor in that range.
            2: append a space after every match of ``[一-龟]|\d+|\w+``.
            3: insert a space at every position that is preceded or
                followed by a character other than an ASCII letter/digit.
            any other value: same as 0.

    Returns:
        The text with spaces inserted.

    >>> insert_spaces("test亨利it四世上").strip()
    'test 亨 利 it 四 世 上'
    >>> insert_spaces("test亨利it四世上").strip().__len__()
    17

    """
    if method is None:
        # Count latin *characters*, not runs of them: the number of runs can
        # never meaningfully exceed half the text length, so counting runs
        # would make the latin-heavy branch practically unreachable.
        latin_chars = len(re.findall(r"[a-zA-Z ]", text))
        method = 3 if latin_chars > len(text) // 2 else 0

    if method == 1:
        return re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)

    if method == 2:
        # NOTE(review): for str patterns \w also matches Chinese characters,
        # so a latin run directly followed by Chinese is kept as one token
        # here -- confirm whether that is intended.
        return re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)

    if method == 3:
        return re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)

    # method == 0 and every unrecognised value share this implementation.
    return re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))