Spaces:
Runtime error
Runtime error
import re | |
from typing import Dict, List | |
from pypinyin import lazy_pinyin, Style | |
from .custom_pypinyin_dict import phrase_pinyin_data | |
import jieba | |
from .cn2an import an2cn | |
# 加载自定义拼音词典数据 | |
phrase_pinyin_data.load() | |
# 标点符号正则 | |
PUNC_MAP: Dict[str, str] = { | |
":": ",", | |
";": ",", | |
",": ",", | |
"。": ".", | |
"!": "!", | |
"?": "?", | |
"\n": ".", | |
"·": ",", | |
"、": ",", | |
"$": ".", | |
"/": ",", | |
"“": "'", | |
"”": "'", | |
'"': "'", | |
"‘": "'", | |
"’": "'", | |
"(": "'", | |
")": "'", | |
"(": "'", | |
")": "'", | |
"《": "'", | |
"》": "'", | |
"【": "'", | |
"】": "'", | |
"[": "'", | |
"]": "'", | |
"—": "-", | |
"~": "~", | |
"「": "'", | |
"」": "'", | |
"『": "'", | |
"』": "'", | |
} | |
# from GPT_SoVITS.text.zh_normalization.text_normlization | |
PUNC_MAP.update ({ | |
'/': '每', | |
'①': '一', | |
'②': '二', | |
'③': '三', | |
'④': '四', | |
'⑤': '五', | |
'⑥': '六', | |
'⑦': '七', | |
'⑧': '八', | |
'⑨': '九', | |
'⑩': '十', | |
'α': '阿尔法', | |
'β': '贝塔', | |
'γ': '伽玛', | |
'Γ': '伽玛', | |
'δ': '德尔塔', | |
'Δ': '德尔塔', | |
'ε': '艾普西龙', | |
'ζ': '捷塔', | |
'η': '依塔', | |
'θ': '西塔', | |
'Θ': '西塔', | |
'ι': '艾欧塔', | |
'κ': '喀帕', | |
'λ': '拉姆达', | |
'Λ': '拉姆达', | |
'μ': '缪', | |
'ν': '拗', | |
'ξ': '克西', | |
'Ξ': '克西', | |
'ο': '欧米克伦', | |
'π': '派', | |
'Π': '派', | |
'ρ': '肉', | |
'ς': '西格玛', | |
'σ': '西格玛', | |
'Σ': '西格玛', | |
'τ': '套', | |
'υ': '宇普西龙', | |
'φ': '服艾', | |
'Φ': '服艾', | |
'χ': '器', | |
'ψ': '普赛', | |
'Ψ': '普赛', | |
'ω': '欧米伽', | |
'Ω': '欧米伽', | |
'+': '加', | |
'-': '减', | |
'×': '乘', | |
'÷': '除', | |
'=': '等', | |
"嗯": "恩", | |
"呣": "母" | |
}) | |
PUNC_TABLE = str.maketrans(PUNC_MAP) | |
# 数字正则化 | |
NUMBER_PATTERN: re.Pattern = re.compile(r'\d+(?:\.?\d+)?') | |
# 阿拉伯数字转汉字 | |
def replace_number(match: re.Match) -> str: | |
return an2cn(match.group()) | |
def normalize_number(text: str) -> str: | |
return NUMBER_PATTERN.sub(replace_number, text) | |
# get symbols of phones, not used | |
def load_pinyin_symbols(path): | |
pinyin_dict={} | |
temp = [] | |
with open(path, "r", encoding='utf-8') as f: | |
content = f.readlines() | |
for line in content: | |
cuts = line.strip().split(',') | |
pinyin = cuts[0] | |
phones = cuts[1].split(' ') | |
pinyin_dict[pinyin] = phones | |
temp.extend(phones) | |
temp = list(set(temp)) | |
tone = [] | |
for phone in temp: | |
for i in range(1, 6): | |
phone2 = phone + str(i) | |
tone.append(phone2) | |
print(sorted(tone, key=lambda x: len(x))) | |
return pinyin_dict | |
def load_pinyin_dict(path: str) -> Dict[str, List[str]]: | |
pinyin_dict = {} | |
with open(path, "r", encoding='utf-8') as f: | |
for line in f: | |
key, value = line.strip().split(',', 1) | |
pinyin_dict[key] = value.split() | |
return pinyin_dict | |
import os | |
pinyin_dict = load_pinyin_dict(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cnm3', 'ds_CNM3.txt')) | |
# pinyin_dict = load_pinyin_dict('text/cnm3/ds_CNM3.txt') | |
def chinese_to_cnm3(text: str) -> List[str]: | |
# 标点符号和数字正则化 | |
text = text.translate(PUNC_TABLE) | |
text = normalize_number(text) | |
# 过滤掉特殊字符 | |
text = re.sub(r'[#&@“”^_|\\]', '', text) | |
words = jieba.lcut(text, cut_all=False) | |
phones = [] | |
for word in words: | |
pinyin_list: List[str] = lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True) | |
for pinyin in pinyin_list: | |
if pinyin[-1].isdigit(): | |
tone = pinyin[-1] | |
syllable = pinyin[:-1] | |
phone = pinyin_dict[syllable] | |
phones.extend([ph + tone for ph in phone]) | |
elif pinyin[-1].isalpha(): | |
pass | |
else: | |
phones.extend(pinyin) | |
return phones |