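"""Validate Piper voice models and build the repository's voices.json index.

Scans the repository for *.onnx voice models, back-fills missing language,
dataset, and quality metadata in each model's JSON config, runs consistency
tests, and writes a combined voices.json with file sizes and MD5 digests.
"""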
import hashlib
import json
import unittest
from dataclasses import dataclass
from pathlib import Path
from typing import Set

_DIR = Path(__file__).parent
_REPO_DIR = _DIR.parent


@dataclass
class Language:
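    """Display names for a language: native name, English name, and country."""
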
    native: str
    english: str
    country: str


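# Display names for each supported language code.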
_LANGUAGES = {
    "ar_JO": Language("العربية", "Arabic", "Jordan"),
    "ca_ES": Language("Català", "Catalan", "Spain"),
    "cs_CZ": Language("Čeština", "Czech", "Czech Republic"),
    "da_DK": Language("Dansk", "Danish", "Denmark"),
    "de_DE": Language("Deutsch", "German", "Germany"),
    "el_GR": Language("Ελληνικά", "Greek", "Greece"),
    "en_GB": Language("English", "English", "Great Britain"),
    "en_US": Language("English", "English", "United States"),
    "es_ES": Language("Español", "Spanish", "Spain"),
    "es_MX": Language("Español", "Spanish", "Mexico"),
    "fa_IR": Language("فارسی", "Farsi", "Iran"),
    "fi_FI": Language("Suomi", "Finnish", "Finland"),
    "fr_FR": Language("Français", "French", "France"),
    "hu_HU": Language("Magyar", "Hungarian", "Hungary"),
    "is_IS": Language("íslenska", "Icelandic", "Iceland"),
    "it_IT": Language("Italiano", "Italian", "Italy"),
    "ka_GE": Language("ქართული ენა", "Georgian", "Georgia"),
    "kk_KZ": Language("қазақша", "Kazakh", "Kazakhstan"),
    "lb_LU": Language("Lëtzebuergesch", "Luxembourgish", "Luxembourg"),
    "ne_NP": Language("नेपाली", "Nepali", "Nepal"),
    "nl_BE": Language("Nederlands", "Dutch", "Belgium"),
    "nl_NL": Language("Nederlands", "Dutch", "Netherlands"),
    "no_NO": Language("Norsk", "Norwegian", "Norway"),
    "pl_PL": Language("Polski", "Polish", "Poland"),
    "pt_BR": Language("Português", "Portuguese", "Brazil"),
    "pt_PT": Language("Português", "Portuguese", "Portugal"),
    "ro_RO": Language("Română", "Romanian", "Romania"),
    "ru_RU": Language("Русский", "Russian", "Russia"),
    "sk_SK": Language("Slovenčina", "Slovak", "Slovakia"),
    "sl_SI": Language("Slovenščina", "Slovenian", "Slovenia"),
    "sr_RS": Language("srpski", "Serbian", "Serbia"),
    "sv_SE": Language("Svenska", "Swedish", "Sweden"),
    "sw_CD": Language("Kiswahili", "Swahili", "Democratic Republic of the Congo"),
    "tr_TR": Language("Türkçe", "Turkish", "Turkey"),
    "uk_UA": Language("украї́нська мо́ва", "Ukrainian", "Ukraine"),
    "vi_VN": Language("Tiếng Việt", "Vietnamese", "Vietnam"),
    "zh_CN": Language("简体中文", "Chinese", "China"),
}


def add_languages() -> None:
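    """Back-fill missing language, dataset, and quality fields in voice configs.

    Assumes voice models are named ``<lang_code>-<dataset>-<quality>.onnx``
    (e.g. a hypothetical en_US-mydataset-high.onnx) with a JSON config stored
    alongside at ``<model>.onnx.json``.
    """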
    for onnx_path in _REPO_DIR.rglob("*.onnx"):
        config_path = f"{onnx_path}.json"
        with open(config_path, "r", encoding="utf-8") as config_file:
            config = json.load(config_file)

        lang_code, dataset, quality = onnx_path.stem.split("-")
        is_changed = False

        lang_info = _LANGUAGES.get(lang_code)
        assert lang_info is not None, f"Missing name for language: {lang_code}"

        lang_family, lang_region = lang_code.split("_", maxsplit=1)
        lang_dict = {
            "code": lang_code,
            "family": lang_family,
            "region": lang_region,
            "name_native": lang_info.native,
            "name_english": lang_info.english,
            "country_english": lang_info.country,
        }

        if "language" not in config:
            config["language"] = lang_dict
            is_changed = True
        else:
            # Fill in any missing language fields without overwriting existing ones
            current_lang_dict = config["language"]
            for field, value in lang_dict.items():
                if field not in current_lang_dict:
                    current_lang_dict[field] = value
                    is_changed = True

        if "dataset" not in config:
            config["dataset"] = dataset
            is_changed = True

        if "quality" not in config["audio"]:
            config["audio"]["quality"] = quality
            is_changed = True

        if is_changed:
            with open(config_path, "w", encoding="utf-8") as config_file:
                json.dump(config, config_file, ensure_ascii=False, indent=2)


class VoiceTest(unittest.TestCase):
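    """Sanity checks for every voice model and its config in the repository."""
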
    def test_voices(self):
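        """Validate configs, directory layout, file naming, and alias uniqueness."""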
        used_aliases: Set[str] = set()

        for onnx_path in _REPO_DIR.rglob("*.onnx"):
            with self.subTest(onnx_path=onnx_path):
                self.assertGreater(onnx_path.stat().st_size, 0, "Empty onnx file")

                config_path = onnx_path.parent / f"{onnx_path.name}.json"
                with open(config_path, "r", encoding="utf-8") as config_file:
                    config = json.load(config_file)

                self.assertIn(
                    "piper_version", config, "Missing piper_version in config"
                )
                self.assertIn("language", config, "Missing language in config")
                self.assertIn("dataset", config, "Missing dataset in config")
                self.assertIn(
                    "quality", config["audio"], "Missing audio quality in config"
                )

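                # Voices live at <family>/<code>/<dataset>/<quality>/<name>.onnx
                # relative to the repository root; check each directory level.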
                quality_dir = onnx_path.parent
                dataset_dir = quality_dir.parent
                lang_code_dir = dataset_dir.parent
                lang_family_dir = lang_code_dir.parent

                self.assertEqual(
                    lang_family_dir.name,
                    config["language"]["family"],
                    "Wrong lang family dir",
                )
                self.assertEqual(
                    lang_code_dir.name,
                    config["language"]["code"],
                    "Wrong lang code dir",
                )
                self.assertEqual(
                    dataset_dir.name, config["dataset"], "Wrong dataset dir"
                )
                self.assertEqual(
                    quality_dir.name, config["audio"]["quality"], "Wrong quality dir"
                )
                self.assertIn(lang_code_dir.name, _LANGUAGES, "Unknown language code")

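                # The model's file name must agree with its config as well.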
                file_lang_code, file_dataset, file_quality = onnx_path.stem.split("-")
                file_lang_family = file_lang_code.split("_", maxsplit=1)[0]

                self.assertEqual(
                    file_lang_family,
                    config["language"]["family"],
                    "Wrong lang family file",
                )
                self.assertEqual(
                    file_lang_code, config["language"]["code"], "Wrong lang code file"
                )
                self.assertEqual(file_dataset, config["dataset"], "Wrong dataset file")
                self.assertEqual(
                    file_quality, config["audio"]["quality"], "Wrong quality file"
                )

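                # Aliases must be unique across the whole repository.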
                aliases_path = onnx_path.parent / "ALIASES"
                if aliases_path.exists():
                    with open(aliases_path, "r", encoding="utf-8") as aliases_file:
                        for alias in aliases_file:
                            alias = alias.strip()
                            self.assertNotIn(
                                alias,
                                used_aliases,
                                "Alias is already in use by another voice",
                            )
                            used_aliases.add(alias)


def run_tests() -> None:
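    """Run the VoiceTest suite and raise if any test fails or errors."""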
    runner = unittest.TextTestRunner()
    result = runner.run(
        unittest.defaultTestLoader.loadTestsFromTestCase(VoiceTest)
    )
    assert result.wasSuccessful(), "Test failures"


def write_voices_json() -> None:
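    """Write voices.json at the repository root.

    Each entry is keyed by ``<lang_code>-<dataset>-<quality>`` and records
    the voice's language metadata, quality, speaker information, aliases, and
    the size and MD5 digest of its .onnx, .onnx.json, and MODEL_CARD files.
    """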
    voices = {}

    for onnx_path in sorted(_REPO_DIR.rglob("*.onnx")):
        voice_dir = onnx_path.parent
        config_path = voice_dir / f"{onnx_path.name}.json"
        with open(config_path, "r", encoding="utf-8") as config_file:
            config = json.load(config_file)

        quality = config["audio"]["quality"]
        dataset = config["dataset"]
        lang_code = config["language"]["code"]
        lang_family, lang_region = lang_code.split("_", maxsplit=1)
        lang_names = _LANGUAGES[lang_code]
        voice_key = f"{lang_code}-{dataset}-{quality}"

        model_card_path = voice_dir / "MODEL_CARD"
        assert model_card_path.exists(), f"Missing {model_card_path}"

        aliases: Set[str] = set()
        aliases_path = voice_dir / "ALIASES"
        if aliases_path.exists():
            with open(aliases_path, "r", encoding="utf-8") as aliases_file:
                for alias in aliases_file:
                    aliases.add(alias.strip())

        voices[voice_key] = {
            "key": voice_key,
            "name": dataset,
            "language": {
                "code": lang_code,
                "family": lang_family,
                "region": lang_region,
                "name_native": lang_names.native,
                "name_english": lang_names.english,
                "country_english": lang_names.country,
            },
            "quality": quality,
            "num_speakers": config["num_speakers"],
            "speaker_id_map": config.get("speaker_id_map", {}),
            "files": {
                str(file_path.relative_to(_REPO_DIR)): {
                    "size_bytes": file_path.stat().st_size,
                    "md5_digest": get_file_hash(file_path),
                }
                for file_path in (onnx_path, config_path, model_card_path)
            },
            "aliases": sorted(aliases),
        }

    with open(_REPO_DIR / "voices.json", "w", encoding="utf-8") as voices_file:
        json.dump(voices, voices_file, indent=4, ensure_ascii=False)


def get_file_hash(path, bytes_per_chunk: int = 8192) -> str:
    """Hash a file in chunks using md5."""
    path_hash = hashlib.md5()
    with open(path, "rb") as path_file:
        while chunk := path_file.read(bytes_per_chunk):
            path_hash.update(chunk)

    return path_hash.hexdigest()


if __name__ == "__main__":
    add_languages()
    run_tests()

    print("Writing voices.json")
    write_voices_json()