File size: 1,433 Bytes
767bc4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""Insert spaces, mypython/split_chinese.py."""
import re
from typing import Optional


def insert_spaces(text: str, method: Optional[int] = None) -> str:
    r"""Insert spaces between Chinese characters.

    Runs of ASCII letters/digits are kept together; other characters
    (e.g. Chinese) are separated from their neighbours by a space.

    Args:
        text: string of latin and Chinese chars.
        method: which implementation to use.
            None: default. Picks 3 when more than half of the characters
                in ``text`` are ASCII letters or spaces, otherwise 0.
            0: re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))  # NOQA
            1: re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
            2: re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
            3: re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)
            Any other value behaves like 0.

    Returns:
        The text with spaces inserted. Methods 0 and 3 insert the same
        interior spaces and differ only at the ends (0 always pads both
        ends with a space), so ``.strip()`` the result when leading or
        trailing whitespace matters.

    >>> insert_spaces("test亨利it四世上").strip()
    'test 亨 利 it 四 世 上'
    >>> insert_spaces("test亨利it四世上").strip().__len__()
    17

    """
    if method is None:
        # Count latin *characters*; counting runs (one per word) would
        # almost never exceed half the text length, so method 3 would
        # hardly ever be selected.
        latin_count = len(re.findall(r"[a-zA-Z ]", text))
        method = 3 if latin_count > len(text) // 2 else 0

    if method == 1:
        return re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
    if method == 2:
        return re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
    if method == 3:
        return re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)

    # Method 0, also the fallback for unrecognised values: put a space at
    # every position, then drop the ones sitting between two ASCII
    # alphanumerics so latin words and numbers stay intact.
    spaced = text.replace("", " ")
    return re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", spaced)