import gradio as gr
import os

# Build the monotonic alignment search extension in place before the model code imports it.
# (os.system runs in a subshell, so no trailing "cd .." is needed.)
os.system('cd monotonic_align && python setup.py build_ext --inplace')

import torch

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write

def get_text(text, hps):
    """Normalise input text and convert it to a tensor of symbol IDs."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Intersperse a blank token (ID 0) between symbols, matching VITS training.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
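
# For reference: commons.intersperse in the VITS repo pads the blank around every
# symbol, e.g. intersperse([5, 3, 7], 0) -> [0, 5, 0, 3, 0, 7, 0].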

def load_model(model_path, hps):
    """Build a SynthesizerTrn and load its weights from a checkpoint."""
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    _ = net_g.eval()
    # Optimizer argument is None: only the model weights are needed for inference.
    _ = utils.load_checkpoint(model_path, net_g, None)
    return net_g
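
# In the VITS codebase, filter_length // 2 + 1 is the number of spectrogram bins
# and segment_size // hop_length the training segment length in frames; the
# constructor expects both even though the latter is a training-time quantity.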

hps = utils.get_hparams_from_file("configs/vctk_base.json")
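# vctk_base.json is the multi-speaker VITS config; it supplies hps.data.text_cleaners,
# hps.data.add_blank, hps.data.sampling_rate and hps.data.n_speakers used below.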

# Map each tab to its model checkpoint.
model_paths = {
    "Phonemes_finetuned": "fr_wa_finetuned_pho/G_125000.pth",
    "Graphemes_finetuned": "fr_wa_finetuned/G_198000.pth",
    "Phonemes": "path_to_phonemes_model.pth",  # placeholder path
    "Graphemes": "wa_graphemes/G_168000.pth"
}

# Cache loaded models so each checkpoint is read from disk only once,
# starting with the default tab's model.
loaded_models = {"Phonemes_finetuned": load_model(model_paths["Phonemes_finetuned"], hps)}

def tts(text, speaker_id, tab_name):
    if tab_name not in loaded_models:
        loaded_models[tab_name] = load_model(model_paths[tab_name], hps)
    net_g = loaded_models[tab_name]

    sid = torch.LongTensor([speaker_id])  # speaker identity (dropdown index)
    stn_tst = get_text(text, hps)

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667,
                            noise_scale_w=0.8, length_scale=1)[0][0, 0].data.float().numpy()
    return "Success", (hps.data.sampling_rate, audio)

def create_tab(tab_name):
    """Build one Gradio tab wired to the model named by tab_name."""
    with gr.TabItem(tab_name):
        gr.Markdown(f"### {tab_name} TTS Model")
        tts_input1 = gr.TextArea(label="Text in Walloon (depending on the model, the input should be phonemes or graphemes)", value="")
        tts_input2 = gr.Dropdown(label="Speaker", choices=["Male", "Female"], type="index", value="Male")
        tts_submit = gr.Button("Generate", variant="primary")
        tts_output1 = gr.Textbox(label="Message")
        tts_output2 = gr.Audio(label="Output")
        # tab_name is captured per create_tab call, so each tab routes to its own model.
        tts_submit.click(lambda text, speaker_id: tts(text, speaker_id, tab_name),
                         [tts_input1, tts_input2], [tts_output1, tts_output2])

app = gr.Blocks()
with app:
    gr.Markdown(
        """
        # First Text-to-Speech (TTS) for Walloon
        Based on VITS (https://github.com/jaywalnut310/vits).

        Select the desired model, then enter the text in phonemes or graphemes, depending on the model.

        Short sentences are recommended for faster inference.
        """
    )
    with gr.Tabs():
        create_tab("Phonemes_finetuned")
        create_tab("Graphemes_finetuned")
        create_tab("Phonemes")
        create_tab("Graphemes")

    gr.Markdown(
        """
        ### Examples
        | Input Text | Speaker | Input Method  |
        |------------|---------|---------------|
        | li biːç ɛ l sɔlja ɛstẽ ki s maʁɡajẽ pɔ sawɛ kiː ski , dɛ døː , ɛstøː l py fwaʁ . m ɛ̃ s koː la , la k i vɛjɛ õ tsminɔː k aʁivef pjim pjam , d ɛ̃ õ bja nuː tsoː paltɔ .  | Female | Phonemes |
        | Li bijhe et l’ solea estént ki s’ margayént po sawè kî çki, des deus, esteut l’ pus foirt. Mins ç’ côp la, la k’ i veyèt on tchminåd k' arivéve pyim piam, dins on bea noû tchôd paltot.  | Male | Graphemes |
        """
    )

app.launch()