add models
- app.py +25 -13
- requirements.txt +4 -0
- saved_model/5/config.json +3 -0
- saved_model/5/cover.jpg +3 -0
- saved_model/5/model.pth +3 -0
- saved_model/6/config.json +3 -0
- saved_model/6/cover.jpg +3 -0
- saved_model/6/model.pth +3 -0
- saved_model/{names.json → info.json} +2 -2
- text/__init__.py +17 -16
- text/cleaners.py +442 -4
- text/jieba_dict.txt +0 -0
app.py
CHANGED
@@ -73,6 +73,12 @@ def create_vc_fn(model, hps, speaker_ids):
     return vc_fn
 
 
+def create_to_phoneme_fn(hps):
+    def to_phoneme_fn(text):
+        return _clean_text(text, hps.data.text_cleaners) if text != "" else ""
+    return to_phoneme_fn
+
+
 css = """
 #advanced-btn {
     color: white;
@@ -93,9 +99,12 @@ css = """
 
 if __name__ == '__main__':
     models = []
-    with open("saved_model/…
-    …
-    for i, …
+    with open("saved_model/info.json", "r", encoding="utf-8") as f:
+        models_info = json.load(f)
+    for i, info in models_info.items():
+        name = info["title"]
+        lang = info["lang"]
+        example = info["example"]
         config_path = f"saved_model/{i}/config.json"
         model_path = f"saved_model/{i}/model.pth"
         cover_path = f"saved_model/{i}/cover.jpg"
@@ -111,8 +120,9 @@ if __name__ == '__main__':
         speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
         speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
 
-        models.append((…
-                       create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids)…
+        models.append((name, lang, example, cover_path, speakers, hps.symbols,
+                       create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids),
+                       create_to_phoneme_fn(hps)))
 
     app = gr.Blocks(css=css)
 
@@ -126,12 +136,14 @@ if __name__ == '__main__':
         with gr.Tabs():
             with gr.TabItem("TTS"):
                 with gr.Tabs():
-                    for i, (…
+                    for i, (name, lang, example, cover_path, speakers,
+                            symbols, tts_fn, vc_fn, to_phoneme_fn) in enumerate(models):
                         with gr.TabItem(f"model{i}"):
                             with gr.Column():
-                                gr.Markdown(f"## {…
-                                            f"![cover](file/{cover_path})"…
-                                …
+                                gr.Markdown(f"## {name}\n\n"
+                                            f"![cover](file/{cover_path})\n\n"
+                                            f"lang: {lang}")
+                                tts_input1 = gr.TextArea(label="Text (60 words limitation)", value=example)
                                 tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
                                                          type="index", value=speakers[0])
                                 tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
@@ -157,16 +169,16 @@ if __name__ == '__main__':
                                     }""")
                             tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
                                              [tts_output1, tts_output2])
-                            to_phoneme_btn.click(…
-                                                 [tts_input1], [tts_input1])
+                            to_phoneme_btn.click(to_phoneme_fn, [tts_input1], [tts_input1])
                             phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
                                                _js="(i,phonemes, text) => text + phonemes[i]")
 
             with gr.TabItem("Voice Conversion"):
                 with gr.Tabs():
-                    for i, (…
+                    for i, (name, lang, example, cover_path, speakers,
+                            symbols, tts_fn, vc_fn, to_phoneme_fn) in enumerate(models):
                         with gr.TabItem(f"model{i}"):
-                            gr.Markdown(f"## {…
+                            gr.Markdown(f"## {name}\n\n"
                                         f"![cover](file/{cover_path})")
                             vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
                                                     value=speakers[0])
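For context, the loader above reads saved_model/info.json (renamed from names.json below) and expects each entry, keyed by the model's directory index, to carry "title", "lang" and "example" fields. A minimal sketch of that assumed shape; the real file is a Git LFS object, so the values here are purely illustrative:

import json

# Hypothetical saved_model/info.json contents; only the keys that app.py
# actually reads ("title", "lang", "example") are grounded in the diff above.
models_info = json.loads('''{
    "0": {"title": "Example VITS model", "lang": "Japanese", "example": "こんにちは。"},
    "5": {"title": "Newly added model", "lang": "Chinese", "example": "你好。"}
}''')

for i, info in models_info.items():
    print(i, info["title"], info["lang"], info["example"])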
requirements.txt
CHANGED
@@ -9,4 +9,8 @@ torch
 torchvision
 Unidecode
 pyopenjtalk
+jamo
+pypinyin
+jieba
+cn2an
 gradio
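The four added packages back the new Korean and Chinese cleaners in text/cleaners.py. A quick sanity-check sketch of what each one is used for there; outputs are approximate and version-dependent:

import cn2an                                # Arabic numerals -> Chinese numerals
import jieba                                # Chinese word segmentation
from jamo import h2j, j2hcj                 # Hangul syllables -> compatibility jamo
from pypinyin import lazy_pinyin, BOPOMOFO  # Chinese characters -> bopomofo

print(cn2an.an2cn('123'))             # '一百二十三'
print(jieba.lcut('我爱北京'))          # ['我', '爱', '北京']
print(j2hcj(h2j('한글')))             # 'ㅎㅏㄴㄱㅡㄹ'
print(lazy_pinyin('中文', BOPOMOFO))   # ['ㄓㄨㄥ', 'ㄨㄣˊ']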
saved_model/5/config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d08ea4e940cd92bebaa656762031eb085439d47d6f636cdafb37411f24c927d1
+size 1262
saved_model/5/cover.jpg
ADDED
Git LFS Details
saved_model/5/model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edfb6b428c398fab83a85b5ae41e13cb5a9f7be12692129e8a880d4553701f7b
+size 158888013
saved_model/6/config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a7d6956086537898264526d08e780c9abc4af8533bf75358dd960016c13da8b
+size 1218
saved_model/6/cover.jpg
ADDED
Git LFS Details
saved_model/6/model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b545a33fe870c214e3828da9ab8e756c6c75a30a6acee74670637fbbd3a58a0d
+size 158875981
saved_model/{names.json → info.json}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:1ae450ecf80251796929594abecca61537612c4115cf947d363c805055f0b199
+size 905
text/__init__.py
CHANGED
@@ -3,30 +3,31 @@ from text import cleaners
 
 
 def text_to_sequence(text, symbols, cleaner_names):
-    …
+    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
     Args:
       text: string to convert to a sequence
       cleaner_names: names of the cleaner functions to run the text through
     Returns:
       List of integers corresponding to the symbols in the text
     '''
-    …
+    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 
-    …
+    sequence = []
 
-    …
+    clean_text = _clean_text(text, cleaner_names)
+    for symbol in clean_text:
+        if symbol not in _symbol_to_id.keys():
+            continue
+        symbol_id = _symbol_to_id[symbol]
+        sequence += [symbol_id]
+    return sequence
 
 
 def _clean_text(text, cleaner_names):
-    …
+    for name in cleaner_names:
+        cleaner = getattr(cleaners, name)
+        if not cleaner:
+            raise Exception('Unknown cleaner: %s' % name)
+        text = cleaner(text)
+        print(text, cleaner_names)
+    return text
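A short usage sketch of the rewritten functions; the symbols list here is a hypothetical toy (real callers pass each model's hps.symbols), and note that characters missing from the list are now skipped rather than raising:

from text import text_to_sequence, _clean_text

# Hypothetical toy symbol set; any character the cleaner emits that is not
# in it is silently dropped by the new lookup loop.
symbols = ['_', ',', '.', '!', '?', ' '] + list('abcdefghijklmnopqrstuvwxyzN↑↓')

print(_clean_text('こんにちは', ['japanese_cleaners']))               # e.g. 'koNnichiwa.' plus accent marks
print(text_to_sequence('こんにちは', symbols, ['japanese_cleaners']))  # list of symbol IDs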
text/cleaners.py
CHANGED
@@ -1,9 +1,37 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+'''
+
+import os
 import re
-…
+import sys
+
+import cn2an
+import jieba
 import pyopenjtalk
+from jamo import h2j, j2hcj
+from pypinyin import lazy_pinyin, BOPOMOFO
+from unidecode import unidecode
 
+jieba.set_dictionary(os.path.dirname(sys.argv[0]) + '/text/jieba_dict.txt')
+jieba.initialize()
 pyopenjtalk._lazy_init()
 
+# This is a list of Korean classifiers preceded by pure Korean numerals.
+_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
 # Regular expression matching Japanese without punctuation marks:
 _japanese_characters = re.compile(
     r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
@@ -12,10 +40,209 @@ _japanese_characters = re.compile(
 _japanese_marks = re.compile(
     r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
 
-…
-…
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+# List of (symbol, Japanese) pairs for marks:
+_symbols_to_japanese = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('%', 'パーセント')
+]]
+
+# List of (hangul, hangul divided) pairs:
+_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ㄳ', 'ㄱㅅ'),
+    ('ㄵ', 'ㄴㅈ'),
+    ('ㄶ', 'ㄴㅎ'),
+    ('ㄺ', 'ㄹㄱ'),
+    ('ㄻ', 'ㄹㅁ'),
+    ('ㄼ', 'ㄹㅂ'),
+    ('ㄽ', 'ㄹㅅ'),
+    ('ㄾ', 'ㄹㅌ'),
+    ('ㄿ', 'ㄹㅍ'),
+    ('ㅀ', 'ㄹㅎ'),
+    ('ㅄ', 'ㅂㅅ'),
+    ('ㅘ', 'ㅗㅏ'),
+    ('ㅙ', 'ㅗㅐ'),
+    ('ㅚ', 'ㅗㅣ'),
+    ('ㅝ', 'ㅜㅓ'),
+    ('ㅞ', 'ㅜㅔ'),
+    ('ㅟ', 'ㅜㅣ'),
+    ('ㅢ', 'ㅡㅣ'),
+    ('ㅑ', 'ㅣㅏ'),
+    ('ㅒ', 'ㅣㅐ'),
+    ('ㅕ', 'ㅣㅓ'),
+    ('ㅖ', 'ㅣㅔ'),
+    ('ㅛ', 'ㅣㅗ'),
+    ('ㅠ', 'ㅣㅜ')
+]]
+
+# List of (Latin alphabet, hangul) pairs:
+_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('a', '에이'),
+    ('b', '비'),
+    ('c', '시'),
+    ('d', '디'),
+    ('e', '이'),
+    ('f', '에프'),
+    ('g', '지'),
+    ('h', '에이치'),
+    ('i', '아이'),
+    ('j', '제이'),
+    ('k', '케이'),
+    ('l', '엘'),
+    ('m', '엠'),
+    ('n', '엔'),
+    ('o', '오'),
+    ('p', '피'),
+    ('q', '큐'),
+    ('r', '아르'),
+    ('s', '에스'),
+    ('t', '티'),
+    ('u', '유'),
+    ('v', '브이'),
+    ('w', '더블유'),
+    ('x', '엑스'),
+    ('y', '와이'),
+    ('z', '제트')
+]]
+
+# List of (Latin alphabet, bopomofo) pairs:
+_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('a', 'ㄟˉ'),
+    ('b', 'ㄅㄧˋ'),
+    ('c', 'ㄙㄧˉ'),
+    ('d', 'ㄉㄧˋ'),
+    ('e', 'ㄧˋ'),
+    ('f', 'ㄝˊㄈㄨˋ'),
+    ('g', 'ㄐㄧˋ'),
+    ('h', 'ㄝˇㄑㄩˋ'),
+    ('i', 'ㄞˋ'),
+    ('j', 'ㄐㄟˋ'),
+    ('k', 'ㄎㄟˋ'),
+    ('l', 'ㄝˊㄛˋ'),
+    ('m', 'ㄝˊㄇㄨˋ'),
+    ('n', 'ㄣˉ'),
+    ('o', 'ㄡˉ'),
+    ('p', 'ㄆㄧˉ'),
+    ('q', 'ㄎㄧㄡˉ'),
+    ('r', 'ㄚˋ'),
+    ('s', 'ㄝˊㄙˋ'),
+    ('t', 'ㄊㄧˋ'),
+    ('u', 'ㄧㄡˉ'),
+    ('v', 'ㄨㄧˉ'),
+    ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
+    ('x', 'ㄝˉㄎㄨˋㄙˋ'),
+    ('y', 'ㄨㄞˋ'),
+    ('z', 'ㄗㄟˋ')
+]]
+
+# List of (bopomofo, romaji) pairs:
+_bopomofo_to_romaji = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('ㄅㄛ', 'p⁼wo'),
+    ('ㄆㄛ', 'pʰwo'),
+    ('ㄇㄛ', 'mwo'),
+    ('ㄈㄛ', 'fwo'),
+    ('ㄅ', 'p⁼'),
+    ('ㄆ', 'pʰ'),
+    ('ㄇ', 'm'),
+    ('ㄈ', 'f'),
+    ('ㄉ', 't⁼'),
+    ('ㄊ', 'tʰ'),
+    ('ㄋ', 'n'),
+    ('ㄌ', 'l'),
+    ('ㄍ', 'k⁼'),
+    ('ㄎ', 'kʰ'),
+    ('ㄏ', 'h'),
+    ('ㄐ', 'ʧ⁼'),
+    ('ㄑ', 'ʧʰ'),
+    ('ㄒ', 'ʃ'),
+    ('ㄓ', 'ʦ`⁼'),
+    ('ㄔ', 'ʦ`ʰ'),
+    ('ㄕ', 's`'),
+    ('ㄖ', 'ɹ`'),
+    ('ㄗ', 'ʦ⁼'),
+    ('ㄘ', 'ʦʰ'),
+    ('ㄙ', 's'),
+    ('ㄚ', 'a'),
+    ('ㄛ', 'o'),
+    ('ㄜ', 'ə'),
+    ('ㄝ', 'e'),
+    ('ㄞ', 'ai'),
+    ('ㄟ', 'ei'),
+    ('ㄠ', 'au'),
+    ('ㄡ', 'ou'),
+    ('ㄧㄢ', 'yeNN'),
+    ('ㄢ', 'aNN'),
+    ('ㄧㄣ', 'iNN'),
+    ('ㄣ', 'əNN'),
+    ('ㄤ', 'aNg'),
+    ('ㄧㄥ', 'iNg'),
+    ('ㄨㄥ', 'uNg'),
+    ('ㄩㄥ', 'yuNg'),
+    ('ㄥ', 'əNg'),
+    ('ㄦ', 'əɻ'),
+    ('ㄧ', 'i'),
+    ('ㄨ', 'u'),
+    ('ㄩ', 'ɥ'),
+    ('ˉ', '→'),
+    ('ˊ', '↑'),
+    ('ˇ', '↓↑'),
+    ('ˋ', '↓'),
+    ('˙', ''),
+    (',', ','),
+    ('。', '.'),
+    ('!', '!'),
+    ('?', '?'),
+    ('—', '-')
+]]
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def lowercase(text):
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, ' ', text)
+
+
+def convert_to_ascii(text):
+    return unidecode(text)
+
+
+def symbols_to_japanese(text):
+    for regex, replacement in _symbols_to_japanese:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def japanese_to_romaji_with_accent(text):
     '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
+    text = symbols_to_japanese(text)
     sentences = re.split(_japanese_marks, text)
     marks = re.findall(_japanese_marks, text)
     text = ''
@@ -49,10 +276,221 @@ def japanese_cleaners(text):
                 text += '↑'
         if i < len(marks):
             text += unidecode(marks[i]).replace(' ', '')
+    return text
+
+
+def latin_to_hangul(text):
+    for regex, replacement in _latin_to_hangul:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def divide_hangul(text):
+    for regex, replacement in _hangul_divided:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def hangul_number(num, sino=True):
+    '''Reference https://github.com/Kyubyong/g2pK'''
+    num = re.sub(',', '', num)
+
+    if num == '0':
+        return '영'
+    if not sino and num == '20':
+        return '스무'
+
+    digits = '123456789'
+    names = '일이삼사오육칠팔구'
+    digit2name = {d: n for d, n in zip(digits, names)}
+
+    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
+    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
+    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
+    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
+
+    spelledout = []
+    for i, digit in enumerate(num):
+        i = len(num) - i - 1
+        if sino:
+            if i == 0:
+                name = digit2name.get(digit, '')
+            elif i == 1:
+                name = digit2name.get(digit, '') + '십'
+                name = name.replace('일십', '십')
+        else:
+            if i == 0:
+                name = digit2mod.get(digit, '')
+            elif i == 1:
+                name = digit2dec.get(digit, '')
+        if digit == '0':
+            if i % 4 == 0:
+                last_three = spelledout[-min(3, len(spelledout)):]
+                if ''.join(last_three) == '':
+                    spelledout.append('')
+                    continue
+            else:
+                spelledout.append('')
+                continue
+        if i == 2:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 3:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 4:
+            name = digit2name.get(digit, '') + '만'
+            name = name.replace('일만', '만')
+        elif i == 5:
+            name = digit2name.get(digit, '') + '십'
+            name = name.replace('일십', '십')
+        elif i == 6:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 7:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 8:
+            name = digit2name.get(digit, '') + '억'
+        elif i == 9:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 10:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 11:
+            name = digit2name.get(digit, '') + '천'
+        elif i == 12:
+            name = digit2name.get(digit, '') + '조'
+        elif i == 13:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 14:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 15:
+            name = digit2name.get(digit, '') + '천'
+        spelledout.append(name)
+    return ''.join(elem for elem in spelledout)
+
+
+def number_to_hangul(text):
+    '''Reference https://github.com/Kyubyong/g2pK'''
+    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
+    for token in tokens:
+        num, classifier = token
+        if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
+            spelledout = hangul_number(num, sino=False)
+        else:
+            spelledout = hangul_number(num, sino=True)
+        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
+    # digit by digit for remaining digits
+    digits = '0123456789'
+    names = '영일이삼사오육칠팔구'
+    for d, n in zip(digits, names):
+        text = text.replace(d, n)
+    return text
+
+
+def number_to_chinese(text):
+    numbers = re.findall(r'\d+(?:\.?\d+)?', text)
+    for number in numbers:
+        text = text.replace(number, cn2an.an2cn(number), 1)
+    return text
+
+
+def chinese_to_bopomofo(text):
+    text = text.replace('、', ',').replace(';', ',').replace(':', ',')
+    words = jieba.lcut(text, cut_all=False)
+    text = ''
+    for word in words:
+        bopomofos = lazy_pinyin(word, BOPOMOFO)
+        if not re.search('[\u4e00-\u9fff]', word):
+            text += word
+            continue
+        for i in range(len(bopomofos)):
+            if re.match('[\u3105-\u3129]', bopomofos[i][-1]):
+                bopomofos[i] += 'ˉ'
+        if text != '':
+            text += ' '
+        text += ''.join(bopomofos)
+    return text
+
+
+def latin_to_bopomofo(text):
+    for regex, replacement in _latin_to_bopomofo:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def bopomofo_to_romaji(text):
+    for regex, replacement in _bopomofo_to_romaji:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def japanese_cleaners(text):
+    text = japanese_to_romaji_with_accent(text)
     if re.match('[A-Za-z]', text[-1]):
         text += '.'
     return text
 
 
 def japanese_cleaners2(text):
-    return japanese_cleaners(text).replace('ts', 'ʦ')
+    return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
+
+
+def korean_cleaners(text):
+    '''Pipeline for Korean text'''
+    text = latin_to_hangul(text)
+    text = number_to_hangul(text)
+    text = j2hcj(h2j(text))
+    text = divide_hangul(text)
+    if re.match('[\u3131-\u3163]', text[-1]):
+        text += '.'
+    return text
+
+
+def chinese_cleaners(text):
+    '''Pipeline for Chinese text'''
+    text = number_to_chinese(text)
+    text = chinese_to_bopomofo(text)
+    text = latin_to_bopomofo(text)
+    if re.match('[ˉˊˇˋ˙]', text[-1]):
+        text += '。'
+    return text
+
+
+def zh_ja_mixture_cleaners(text):
+    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    for chinese_text in chinese_texts:
+        cleaned_text = number_to_chinese(chinese_text[4:-4])
+        cleaned_text = chinese_to_bopomofo(cleaned_text)
+        cleaned_text = latin_to_bopomofo(cleaned_text)
+        cleaned_text = bopomofo_to_romaji(cleaned_text)
+        cleaned_text = re.sub('i[aoe]', lambda x: 'y' + x.group(0)[1:], cleaned_text)
+        cleaned_text = re.sub('u[aoəe]', lambda x: 'w' + x.group(0)[1:], cleaned_text)
+        cleaned_text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑]+)', lambda x: x.group(1) + 'ɹ`' + x.group(2), cleaned_text).replace(
+            'ɻ', 'ɹ`')
+        cleaned_text = re.sub('([ʦs][⁼ʰ]?)([→↓↑]+)', lambda x: x.group(1) + 'ɹ' + x.group(2), cleaned_text)
+        text = text.replace(chinese_text, cleaned_text + ' ', 1)
+    for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_romaji_with_accent(japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace(
+            '...', '…')
+        text = text.replace(japanese_text, cleaned_text + ' ', 1)
+    text = text[:-1]
+    if re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
+        text += '.'
+    return text
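A rough end-to-end sketch of the new language pipelines above; exact accent and tone marks depend on the installed pyopenjtalk/pypinyin versions (and jieba's dictionary path resolves relative to the launched script), so treat the outputs as approximate:

from text.cleaners import korean_cleaners, chinese_cleaners, japanese_cleaners2

print(korean_cleaners('3개'))      # numeral + classifier -> native numeral, decomposed jamo: 'ㅅㅔㄱㅐ.'
print(chinese_cleaners('你好'))     # numerals -> Chinese, then bopomofo with tones: 'ㄋㄧˇㄏㄠˇ。'
print(japanese_cleaners2('つき'))   # romaji with 'ts' -> 'ʦ' and '...' -> '…', e.g. 'ʦuki' plus pitch marks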
text/jieba_dict.txt
ADDED
The diff for this file is too large to render.
See raw diff