import torch
import torchaudio

import utils
from models import SynthesizerTrn

import gradio

# HuBERT-Soft content encoder: maps 16 kHz mono audio to "soft" speech units
# that keep phonetic content while largely discarding speaker identity.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")

# Hyperparameters for the multi-speaker SOVITS model trained on Overwatch 2 voice lines.
hps = utils.get_hparams_from_file("configs/sovits_ow2.json")

# Build the SOVITS generator: a VITS-style synthesizer conditioned on speech
# units and a speaker ID, put into eval mode since we only run inference.
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
_ = net_g.eval()

# Restore generator weights; the optimizer argument is None because we never train here.
_ = utils.load_checkpoint("logs/ow2/G_195000.pth", net_g, None)


def infer(md, mic_audio, audio, speaker_id, pitch_shift, length_scale, noise_scale=.667, noise_scale_w=0.8):
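    """Convert a mic recording or an uploaded file into the selected target voice.

    `md` carries the Markdown component's value and is unused. If both audio
    inputs are given, the uploaded file takes precedence over the recording.
    """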
    
    source = None
    sr = None

    if mic_audio:
        # Gradio delivers mic input as (sample_rate, numpy array), usually int16 PCM.
        sr, source = mic_audio
        source = torch.Tensor(source)
        if source.abs().max() > 1.0:
            # Scale int16-range samples to [-1, 1] to match torchaudio.load's output.
            source = source / 32768.0

        # Reshape to (channels, samples).
        if source.dim() == 1:
            source = source.unsqueeze(1)
        source = source.T

    if audio:
        # An uploaded file takes precedence over a mic recording.
        source, sr = torchaudio.load(audio)

    if source is None:
        # Neither a recording nor an upload was provided; return nothing so the output stays empty.
        return None

    if int(pitch_shift) != 0:
        # Optional preprocessing; torchaudio's pitch_shift is slow and can be unstable.
        source = torchaudio.functional.pitch_shift(source, sr, int(pitch_shift))

    # HuBERT expects 16 kHz mono audio with shape (batch, 1, samples).
    source = torchaudio.functional.resample(source, sr, 16000)
    source = torch.mean(source, dim=0, keepdim=True)
    source = source.unsqueeze(0)
    
    with torch.inference_mode():
        # Extract speech units
        unit = hubert.units(source)
        unit_lengths = torch.LongTensor([unit.size(1)])
        
        # Target speaker index for multi-speaker inference (matches the dropdown order)
        sid = torch.LongTensor([speaker_id])
        
        # Synthesize audio
        audio_out = net_g.infer(
            unit, unit_lengths, sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.float().numpy()

    # 22050 Hz should match the generator's output sampling rate (hps.data.sampling_rate).
    return (22050, audio_out)
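
# Quick offline sanity check, bypassing the UI ("input.wav" is a placeholder path;
# speaker index 0 maps to the first dropdown entry, "Ana"):
#   sr_out, wav_out = infer(None, None, "input.wav", speaker_id=0,
#                           pitch_shift=0, length_scale=1.0)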

demo = gradio.Interface(
    fn=infer,
    inputs=[
        gradio.Markdown(
        """
        # SOVITS Any-to-Many VC | Overwatch 2
        Upload any voice recording and turn it into a mangled approximation of any* Overwatch 2 Hero!

        For a higher-quality single-speaker model, check out my [soft-vc-widowmaker](https://huggingface.co/spaces/cjayic/soft-vc-widowmaker) space!

        SOVITS doesn't really appear to match the pitch of the target speaker, so it helps if your input voice sits at a similar pitch to the target's.
        I added a pitch shift option to preprocess the input voice, but it's slow and sometimes outright broken; use at your own risk.
        
        ( * up to Kiriko and without Bastion. Please forgive. )
        """),
        gradio.Audio(label="Record Input Audio", source="microphone"),
        gradio.Audio(label="Upload Input Audio", type="filepath"),
        gradio.Dropdown(label="Target Voice", choices=["Ana", "Ashe", "Baptiste", "Brigitte", "Cassidy", "Doomfist", "D.Va", "Echo", "Genji", "Hanzo", "Junker Queen", "Junkrat", "Kiriko", "Lúcio", "Mei", "Mercy", "Moira", "Orisa", "Pharah", "Reaper", "Reinhardt", "Roadhog", "Sigma", "Sojourn", "Soldier: 76", "Sombra", "Symmetra", "Torbjörn", "Tracer", "Widowmaker", "Winston", "Zarya", "Zenyatta"], type="index", value="Ana"),
        gradio.Slider(label="Pitch Shift Input (+12 = up one octave, ⚠️ broken AF ⚠️)", minimum=-12.0, maximum=12.0, value=0, step=1),
        gradio.Slider(label="Length Factor (higher = slower speech)", minimum=0.1, maximum=2.0, value=1.0),
        gradio.Slider(label="Noise Scale (higher = more expressive and erratic)", minimum=0.0, maximum=2.0, value=.667),
        gradio.Slider(label="Noise Scale W (higher = more variation in cadence)", minimum=0.0, maximum=2.0, value=.8)
    ],
    outputs=[gradio.Audio(label="Audio as Target Voice")],
)
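
# The Markdown blurb is passed as the first "input" so it renders above the
# controls; its value reaches infer() as the unused `md` argument.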
# demo.launch(share=True)  # alternative: tunnel a public share link
demo.launch(server_name="0.0.0.0")  # bind all interfaces so the app is reachable in a container