init and interface
This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -1
- .gitignore +18 -0
- app.py +78 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/args.json +256 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json +17 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/mel_min_max_stats/mel_max.npy +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/mel_min_max_stats/mel_min.npy +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/meta_info.json +31 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/pitches/statistics.json +242 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/singers.json +17 -0
- egs/svc/MultipleContentsSVC/README.md +153 -0
- egs/svc/MultipleContentsSVC/exp_config.json +126 -0
- egs/svc/MultipleContentsSVC/run.sh +1 -0
- egs/svc/README.md +34 -0
- egs/svc/_template/run.sh +150 -0
- inference.py +258 -0
- models/__init__.py +0 -0
- models/base/__init__.py +7 -0
- models/base/base_dataset.py +350 -0
- models/base/base_inference.py +220 -0
- models/base/base_sampler.py +136 -0
- models/base/base_trainer.py +348 -0
- models/base/new_dataset.py +50 -0
- models/base/new_inference.py +249 -0
- models/base/new_trainer.py +722 -0
- models/svc/__init__.py +0 -0
- models/svc/base/__init__.py +7 -0
- models/svc/base/svc_dataset.py +425 -0
- models/svc/base/svc_inference.py +15 -0
- models/svc/base/svc_trainer.py +111 -0
- models/svc/comosvc/__init__.py +4 -0
- models/svc/comosvc/comosvc.py +377 -0
- models/svc/comosvc/comosvc_inference.py +39 -0
- models/svc/comosvc/comosvc_trainer.py +295 -0
- models/svc/comosvc/utils.py +31 -0
- models/svc/diffusion/__init__.py +0 -0
- models/svc/diffusion/diffusion_inference.py +63 -0
- models/svc/diffusion/diffusion_inference_pipeline.py +47 -0
- models/svc/diffusion/diffusion_trainer.py +88 -0
- models/svc/diffusion/diffusion_wrapper.py +73 -0
- models/svc/transformer/__init__.py +0 -0
- models/svc/transformer/conformer.py +405 -0
- models/svc/transformer/transformer.py +82 -0
- models/svc/transformer/transformer_inference.py +45 -0
- models/svc/transformer/transformer_trainer.py +52 -0
.gitattributes
CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,18 @@
+__pycache__
+flagged
+result
+
+# Developing mode
+_*.sh
+_*.json
+*.lst
+yard*
+*.out
+evaluation/evalset_selection
+mfa
+egs/svc/*wavmark
+egs/svc/custom
+egs/svc/*/dev*
+egs/svc/dev_exp_config.json
+bins/svc/demo*
+bins/svc/preprocess_custom.py
app.py
ADDED
@@ -0,0 +1,78 @@
+import gradio as gr
+
+
+SUPPORTED_TARGET_SINGERS = {
+    "Adele": "vocalist_l1_Adele",
+    "Beyonce": "vocalist_l1_Beyonce",
+    "Bruno Mars": "vocalist_l1_BrunoMars",
+    "John Mayer": "vocalist_l1_JohnMayer",
+    "Michael Jackson": "vocalist_l1_MichaelJackson",
+    "Taylor Swift": "vocalist_l1_TaylorSwift",
+    "Jacky Cheung 张学友": "vocalist_l1_张学友",
+    "Jian Li 李健": "vocalist_l1_李健",
+    "Feng Wang 汪峰": "vocalist_l1_汪峰",
+    "Faye Wong 王菲": "vocalist_l1_王菲",
+    "Yijie Shi 石倚洁": "vocalist_l1_石倚洁",
+    "Tsai Chin 蔡琴": "vocalist_l1_蔡琴",
+    "Ying Na 那英": "vocalist_l1_那英",
+    "Eason Chan 陈奕迅": "vocalist_l1_陈奕迅",
+    "David Tao 陶喆": "vocalist_l1_陶喆",
+}
+
+
+def svc_inference(
+    source_audio,
+    target_singer,
+    diffusion_steps=1000,
+    key_shift_mode="auto",
+    key_shift_num=0,
+):
+    pass
+
+
+demo_inputs = [
+    gr.Audio(
+        sources=["upload", "microphone"],
+        label="Upload (or record) a song you want to listen",
+    ),
+    gr.Radio(
+        choices=list(SUPPORTED_TARGET_SINGERS.keys()),
+        label="Target Singer",
+        value="Jian Li 李健",
+    ),
+    gr.Slider(
+        1,
+        1000,
+        value=1000,
+        step=1,
+        label="Diffusion Inference Steps",
+        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
+    ),
+    gr.Radio(
+        choices=["Auto Shift", "Key Shift"],
+        value="Auto Shift",
+        label="Pitch Shift Control",
+        info='If you want to control the specific pitch shift value, you need to choose "Key Shift"',
+    ),
+    gr.Slider(
+        -6,
+        6,
+        value=0,
+        step=1,
+        label="Key Shift Values",
+        info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
+    ),
+]
+
+demo_outputs = gr.Audio(label="")
+
+
+demo = gr.Interface(
+    fn=svc_inference,
+    inputs=demo_inputs,
+    outputs=demo_outputs,
+    title="Amphion Singing Voice Conversion",
+)
+
+if __name__ == "__main__":
+    demo.launch(show_api=False)
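Note that the `svc_inference` callback above is committed as a stub (`pass`). Purely as a hypothetical sketch (not part of this commit), the Gradio inputs could be mapped onto the repository's `inference.py` entry point roughly as follows; the checkpoint, vocoder, and output paths here are assumptions, the flags mirror `egs/svc/_template/run.sh`, and the output-file naming follows `merge_for_audio_segments()` in `inference.py`.

```python
import os
import subprocess


def svc_inference_sketch(
    source_audio,            # file path from gr.Audio
    target_singer,           # display name, e.g. "Jian Li 李健"
    diffusion_steps=1000,    # would need to be written into the inference config; not wired here
    key_shift_mode="Auto Shift",
    key_shift_num=0,
):
    # Display name -> trained speaker id (SUPPORTED_TARGET_SINGERS is defined in app.py above).
    singer_id = SUPPORTED_TARGET_SINGERS[target_singer]
    # "autoshift" or an explicit semitone shift, matching egs/svc/_template/run.sh.
    trans_key = "autoshift" if key_shift_mode == "Auto Shift" else str(key_shift_num)

    expt_dir = "ckpts/svc/vocalist_l1_contentvec+whisper"  # assumed released checkpoint dir
    vocoder_dir = "pretrained/bigvgan"                     # assumed vocoder location
    output_dir = "result"

    subprocess.run(
        [
            "python", "inference.py",
            "--config", os.path.join(expt_dir, "args.json"),
            "--acoustics_dir", expt_dir,
            "--vocoder_dir", vocoder_dir,
            "--target_singer", singer_id,
            "--trans_key", trans_key,
            "--source", source_audio,
            "--output_dir", output_dir,
            "--log_level", "debug",
        ],
        check=True,
    )

    # inference.py names the merged output "<source audio name>_<target singer>.wav".
    src_name = os.path.splitext(os.path.basename(source_audio))[0]
    return os.path.join(output_dir, f"{src_name}_{singer_id}.wav")
```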
ckpts/svc/vocalist_l1_contentvec+whisper/args.json
ADDED
@@ -0,0 +1,256 @@
+{
+    "base_config": "config/diffusion.json",
+    "dataset": [
+        "vocalist_l1",
+    ],
+    "exp_name": "vocalist_l1_contentvec+whisper",
+    "inference": {
+        "diffusion": {
+            "scheduler": "pndm",
+            "scheduler_settings": {
+                "num_inference_timesteps": 1000,
+            },
+        },
+    },
+    "model": {
+        "condition_encoder": {
+            "content_encoder_dim": 384,
+            "contentvec_dim": 256,
+            "f0_max": 1100,
+            "f0_min": 50,
+            "input_loudness_dim": 1,
+            "input_melody_dim": 1,
+            "merge_mode": "add",
+            "mert_dim": 256,
+            "n_bins_loudness": 256,
+            "n_bins_melody": 256,
+            "output_content_dim": 384,
+            "output_loudness_dim": 384,
+            "output_melody_dim": 384,
+            "output_singer_dim": 384,
+            "pitch_max": 1100,
+            "pitch_min": 50,
+            "singer_table_size": 512,
+            "use_conformer_for_content_features": false,
+            "use_contentvec": true,
+            "use_log_f0": true,
+            "use_log_loudness": true,
+            "use_mert": false,
+            "use_singer_encoder": true,
+            "use_spkid": true,
+            "use_wenet": false,
+            "use_whisper": true,
+            "wenet_dim": 512,
+            "whisper_dim": 1024,
+        },
+        "diffusion": {
+            "bidilconv": {
+                "base_channel": 384,
+                "conditioner_size": 384,
+                "conv_kernel_size": 3,
+                "dilation_cycle_length": 4,
+                "n_res_block": 20,
+            },
+            "model_type": "bidilconv",
+            "scheduler": "ddpm",
+            "scheduler_settings": {
+                "beta_end": 0.02,
+                "beta_schedule": "linear",
+                "beta_start": 0.0001,
+                "num_train_timesteps": 1000,
+            },
+            "step_encoder": {
+                "activation": "SiLU",
+                "dim_hidden_layer": 512,
+                "dim_raw_embedding": 128,
+                "max_period": 10000,
+                "num_layer": 2,
+            },
+            "unet2d": {
+                "down_block_types": [
+                    "CrossAttnDownBlock2D",
+                    "CrossAttnDownBlock2D",
+                    "CrossAttnDownBlock2D",
+                    "DownBlock2D",
+                ],
+                "in_channels": 1,
+                "mid_block_type": "UNetMidBlock2DCrossAttn",
+                "only_cross_attention": false,
+                "out_channels": 1,
+                "up_block_types": [
+                    "UpBlock2D",
+                    "CrossAttnUpBlock2D",
+                    "CrossAttnUpBlock2D",
+                    "CrossAttnUpBlock2D",
+                ],
+            },
+        },
+    },
+    "model_type": "DiffWaveNetSVC",
+    "preprocess": {
+        "audio_dir": "audios",
+        "bits": 8,
+        "content_feature_batch_size": 16,
+        "contentvec_batch_size": 1,
+        "contentvec_dir": "contentvec",
+        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
+        "contentvec_frameshift": 0.02,
+        "contentvec_sample_rate": 16000,
+        "dur_dir": "durs",
+        "duration_dir": "duration",
+        "emo2id": "emo2id.json",
+        "energy_dir": "energys",
+        "extract_audio": false,
+        "extract_contentvec_feature": true,
+        "extract_energy": true,
+        "extract_label": false,
+        "extract_mcep": false,
+        "extract_mel": true,
+        "extract_mert_feature": false,
+        "extract_pitch": true,
+        "extract_uv": true,
+        "extract_wenet_feature": false,
+        "extract_whisper_feature": true,
+        "f0_max": 1100,
+        "f0_min": 50,
+        "file_lst": "file.lst",
+        "fmax": 12000,
+        "fmin": 0,
+        "hop_size": 256,
+        "is_label": true,
+        "is_mu_law": true,
+        "lab_dir": "labs",
+        "label_dir": "labels",
+        "mcep_dir": "mcep",
+        "mel_dir": "mels",
+        "mel_min_max_norm": true,
+        "mel_min_max_stats_dir": "mel_min_max_stats",
+        "mert_dir": "mert",
+        "mert_feature_layer": -1,
+        "mert_frameshit": 0.01333,
+        "mert_hop_size": 320,
+        "mert_model": "m-a-p/MERT-v1-330M",
+        "min_level_db": -115,
+        "mu_law_norm": false,
+        "n_fft": 1024,
+        "n_mel": 100,
+        "num_silent_frames": 8,
+        "num_workers": 8,
+        "phone_seq_file": "phone_seq_file",
+        "pin_memory": true,
+        "pitch_bin": 256,
+        "pitch_dir": "pitches",
+        "pitch_extractor": "parselmouth",
+        "pitch_max": 1100.0,
+        "pitch_min": 50.0,
+        "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
+        "ref_level_db": 20,
+        "sample_rate": 24000,
+        "spk2id": "singers.json",
+        "train_file": "train.json",
+        "trim_fft_size": 512,
+        "trim_hop_size": 128,
+        "trim_silence": false,
+        "trim_top_db": 30,
+        "trimmed_wav_dir": "trimmed_wavs",
+        "use_audio": false,
+        "use_contentvec": true,
+        "use_dur": false,
+        "use_emoid": false,
+        "use_frame_duration": false,
+        "use_frame_energy": true,
+        "use_frame_pitch": true,
+        "use_lab": false,
+        "use_label": false,
+        "use_log_scale_energy": false,
+        "use_log_scale_pitch": false,
+        "use_mel": true,
+        "use_mert": false,
+        "use_min_max_norm_mel": true,
+        "use_one_hot": false,
+        "use_phn_seq": false,
+        "use_phone_duration": false,
+        "use_phone_energy": false,
+        "use_phone_pitch": false,
+        "use_spkid": true,
+        "use_uv": true,
+        "use_wav": false,
+        "use_wenet": false,
+        "use_whisper": true,
+        "utt2emo": "utt2emo",
+        "utt2spk": "utt2singer",
+        "uv_dir": "uvs",
+        "valid_file": "test.json",
+        "wav_dir": "wavs",
+        "wenet_batch_size": 1,
+        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
+        "wenet_dir": "wenet",
+        "wenet_downsample_rate": 4,
+        "wenet_frameshift": 0.01,
+        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
+        "wenet_sample_rate": 16000,
+        "whisper_batch_size": 30,
+        "whisper_dir": "whisper",
+        "whisper_downsample_rate": 2,
+        "whisper_frameshift": 0.01,
+        "whisper_model": "medium",
+        "whisper_model_path": "pretrained/whisper/medium.pt",
+        "win_size": 1024,
+    },
+    "supported_model_type": [
+        "Fastspeech2",
+        "DiffSVC",
+        "Transformer",
+        "EDM",
+        "CD",
+    ],
+    "train": {
+        "adamw": {
+            "lr": 0.0004,
+        },
+        "batch_size": 32,
+        "dataloader": {
+            "num_worker": 8,
+            "pin_memory": true,
+        },
+        "ddp": true,
+        "epochs": 50000,
+        "gradient_accumulation_step": 1,
+        "keep_checkpoint_max": 5,
+        "keep_last": [
+            5,
+            -1,
+        ],
+        "max_epoch": -1,
+        "max_steps": 1000000,
+        "multi_speaker_training": false,
+        "optimizer": "AdamW",
+        "random_seed": 10086,
+        "reducelronplateau": {
+            "factor": 0.8,
+            "min_lr": 0.0001,
+            "patience": 10,
+        },
+        "run_eval": [
+            false,
+            true,
+        ],
+        "sampler": {
+            "drop_last": true,
+            "holistic_shuffle": false,
+        },
+        "save_checkpoint_stride": [
+            3,
+            10,
+        ],
+        "save_checkpoints_steps": 10000,
+        "save_summary_steps": 500,
+        "scheduler": "ReduceLROnPlateau",
+        "total_training_steps": 50000,
+        "tracker": [
+            "tensorboard",
+        ],
+        "valid_interval": 10000,
+    },
+    "use_custom_dataset": true,
+}
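For orientation, the fields of this dumped config that matter most at conversion time are `model_type`, `preprocess.sample_rate`, `preprocess.spk2id`, and `inference.diffusion.scheduler_settings.num_inference_timesteps`. A small, hypothetical snippet for peeking at them with the repository's `load_config` helper (the import and attribute-style access are assumed from their usage in `inference.py` below) might look like this:

```python
# Hypothetical sketch: inspect the released checkpoint's args.json with Amphion's config loader.
from utils.util import load_config

cfg = load_config("ckpts/svc/vocalist_l1_contentvec+whisper/args.json")

print(cfg.model_type)                     # "DiffWaveNetSVC"
print(cfg.preprocess.sample_rate)         # 24000
print(cfg.preprocess.spk2id)              # "singers.json" (the speaker look-up table)
print(cfg.inference.diffusion.scheduler)  # "pndm", run with 1000 inference timesteps
```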
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:836af10b834c7aec9209eb19ce43559e6ef1e3a59bd6468e90cadbc9a18749ef
+size 249512389
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d54eed12bef331095fc367f196d07c5061d5cb72dd6fe0e1e4453b997bf1d68d
+size 124755137
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6798ddffadcd7d5405a77e667c674c474e4fef0cba817fdd300c7c985c1e82fe
+size 14599
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json
ADDED
@@ -0,0 +1,17 @@
+{
+    "vocalist_l1_Adele": 0,
+    "vocalist_l1_Beyonce": 1,
+    "vocalist_l1_BrunoMars": 2,
+    "vocalist_l1_JohnMayer": 3,
+    "vocalist_l1_MichaelJackson": 4,
+    "vocalist_l1_TaylorSwift": 5,
+    "vocalist_l1_张学友": 6,
+    "vocalist_l1_李健": 7,
+    "vocalist_l1_汪峰": 8,
+    "vocalist_l1_王菲": 9,
+    "vocalist_l1_石倚洁": 10,
+    "vocalist_l1_蔡琴": 11,
+    "vocalist_l1_那英": 12,
+    "vocalist_l1_陈奕迅": 13,
+    "vocalist_l1_陶喆": 14
+}
ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/mel_min_max_stats/mel_max.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04131849378aa4f525a701909f743c303f8d56571682572b888046ead9f3e2ab
+size 528
ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/mel_min_max_stats/mel_min.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef4895ebef0e9949a6e623315bdc8a68490ba95d2f81b2be9f5146f904203016
+size 528
ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/meta_info.json
ADDED
@@ -0,0 +1,31 @@
+{
+    "dataset": "vocalist_l1",
+    "train": {
+        "size": 3180,
+        "hours": 6.1643
+    },
+    "test": {
+        "size": 114,
+        "hours": 0.2224
+    },
+    "singers": {
+        "size": 15,
+        "training_minutes": {
+            "vocalist_l1_陶喆": 45.51,
+            "vocalist_l1_陈奕迅": 43.36,
+            "vocalist_l1_汪峰": 41.08,
+            "vocalist_l1_李健": 38.9,
+            "vocalist_l1_JohnMayer": 30.83,
+            "vocalist_l1_Adele": 27.23,
+            "vocalist_l1_那英": 27.02,
+            "vocalist_l1_石倚洁": 24.93,
+            "vocalist_l1_张学友": 18.31,
+            "vocalist_l1_TaylorSwift": 18.31,
+            "vocalist_l1_王菲": 16.78,
+            "vocalist_l1_MichaelJackson": 15.13,
+            "vocalist_l1_蔡琴": 10.12,
+            "vocalist_l1_BrunoMars": 6.29,
+            "vocalist_l1_Beyonce": 6.06
+        }
+    }
+}
ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/pitches/statistics.json
ADDED
@@ -0,0 +1,242 @@
+{
+    "vocalist_l1_Adele": {
+        "voiced_positions": {
+            "mean": 336.5038018286193,
+            "std": 100.2148774476881,
+            "median": 332.98363792619296,
+            "min": 59.99838412340723,
+            "max": 1099.849325287837
+        },
+        "total_positions": {
+            "mean": 231.79366581704338,
+            "std": 176.6042850107386,
+            "median": 273.2844263775394,
+            "min": 0.0,
+            "max": 1099.849325287837
+        }
+    },
+    "vocalist_l1_Beyonce": {
+        "voiced_positions": {
+            "mean": 357.5678927636881,
+            "std": 130.1132620135807,
+            "median": 318.2981879228934,
+            "min": 70.29719673914867,
+            "max": 1050.354470112099
+        },
+        "total_positions": {
+            "mean": 267.5248026267327,
+            "std": 191.71600807951046,
+            "median": 261.91981963774066,
+            "min": 0.0,
+            "max": 1050.354470112099
+        }
+    },
+    "vocalist_l1_BrunoMars": {
+        "voiced_positions": {
+            "mean": 330.92612740814315,
+            "std": 86.51034158515388,
+            "median": 324.65585832605217,
+            "min": 58.74277302450286,
+            "max": 999.2818302992808
+        },
+        "total_positions": {
+            "mean": 237.26076288057826,
+            "std": 166.09898203490803,
+            "median": 286.3097386522132,
+            "min": 0.0,
+            "max": 999.2818302992808
+        }
+    },
+    "vocalist_l1_JohnMayer": {
+        "voiced_positions": {
+            "mean": 218.3531239166661,
+            "std": 77.89887175223768,
+            "median": 200.19060542586652,
+            "min": 53.371912740674716,
+            "max": 1098.1986774161685
+        },
+        "total_positions": {
+            "mean": 112.95331907131244,
+            "std": 122.65534824070893,
+            "median": 124.71389285965317,
+            "min": 0.0,
+            "max": 1098.1986774161685
+        }
+    },
+    "vocalist_l1_MichaelJackson": {
+        "voiced_positions": {
+            "mean": 293.4663654519906,
+            "std": 89.02211325650234,
+            "median": 284.4323483619402,
+            "min": 61.14507754070825,
+            "max": 1096.4247902272325
+        },
+        "total_positions": {
+            "mean": 172.1013565770682,
+            "std": 159.79551912957191,
+            "median": 212.82938711725973,
+            "min": 0.0,
+            "max": 1096.4247902272325
+        }
+    },
+    "vocalist_l1_TaylorSwift": {
+        "voiced_positions": {
+            "mean": 302.5346928039029,
+            "std": 87.1724728626562,
+            "median": 286.91670244246586,
+            "min": 51.31173137207717,
+            "max": 1098.9374311806605
+        },
+        "total_positions": {
+            "mean": 169.90968097339214,
+            "std": 163.7133164876362,
+            "median": 220.90943653386546,
+            "min": 0.0,
+            "max": 1098.9374311806605
+        }
+    },
+    "vocalist_l1_张学友": {
+        "voiced_positions": {
+            "mean": 233.6845479691867,
+            "std": 66.47140810463938,
+            "median": 228.28695118043396,
+            "min": 51.65338480121057,
+            "max": 1094.4381927885959
+        },
+        "total_positions": {
+            "mean": 167.79543637603194,
+            "std": 119.28338415844308,
+            "median": 194.81504136428546,
+            "min": 0.0,
+            "max": 1094.4381927885959
+        }
+    },
+    "vocalist_l1_李健": {
+        "voiced_positions": {
+            "mean": 234.98401896504657,
+            "std": 71.3955175177514,
+            "median": 221.86415264367847,
+            "min": 54.070687769392585,
+            "max": 1096.3342286660531
+        },
+        "total_positions": {
+            "mean": 148.74760079412246,
+            "std": 126.70486473504008,
+            "median": 180.21374566147688,
+            "min": 0.0,
+            "max": 1096.3342286660531
+        }
+    },
+    "vocalist_l1_汪峰": {
+        "voiced_positions": {
+            "mean": 284.27752567207864,
+            "std": 78.51774150654873,
+            "median": 278.26186808969493,
+            "min": 54.30945929095861,
+            "max": 1053.6870553733015
+        },
+        "total_positions": {
+            "mean": 172.41584497486713,
+            "std": 151.74272125914902,
+            "median": 216.27534661524862,
+            "min": 0.0,
+            "max": 1053.6870553733015
+        }
+    },
+    "vocalist_l1_王菲": {
+        "voiced_positions": {
+            "mean": 339.1661679865587,
+            "std": 86.86768172635271,
+            "median": 327.4151031268507,
+            "min": 51.21299842481366,
+            "max": 1096.7044574066776
+        },
+        "total_positions": {
+            "mean": 217.726880186,
+            "std": 176.8748978138034,
+            "median": 277.8608050501477,
+            "min": 0.0,
+            "max": 1096.7044574066776
+        }
+    },
+    "vocalist_l1_石倚洁": {
+        "voiced_positions": {
+            "mean": 279.67710779262256,
+            "std": 87.82306577322389,
+            "median": 271.13024912248443,
+            "min": 59.604772357481075,
+            "max": 1098.0574674417153
+        },
+        "total_positions": {
+            "mean": 205.49634806008135,
+            "std": 144.6064344590865,
+            "median": 234.19454400899718,
+            "min": 0.0,
+            "max": 1098.0574674417153
+        }
+    },
+    "vocalist_l1_蔡琴": {
+        "voiced_positions": {
+            "mean": 258.9105806499278,
+            "std": 67.4079737418162,
+            "median": 250.29778287949176,
+            "min": 54.81875790199644,
+            "max": 930.3733192171918
+        },
+        "total_positions": {
+            "mean": 197.64675891035662,
+            "std": 124.80889987119957,
+            "median": 228.14775033720753,
+            "min": 0.0,
+            "max": 930.3733192171918
+        }
+    },
+    "vocalist_l1_那英": {
+        "voiced_positions": {
+            "mean": 358.98655838013195,
+            "std": 91.30591323348871,
+            "median": 346.95185476261275,
+            "min": 71.62879029165369,
+            "max": 1085.4349856526985
+        },
+        "total_positions": {
+            "mean": 243.83317702162077,
+            "std": 183.68660712060583,
+            "median": 294.9745603259994,
+            "min": 0.0,
+            "max": 1085.4349856526985
+        }
+    },
+    "vocalist_l1_陈奕迅": {
+        "voiced_positions": {
+            "mean": 222.0124146654594,
+            "std": 68.65002654904572,
+            "median": 218.9200565540147,
+            "min": 50.48503062529368,
+            "max": 1084.6336454006018
+        },
+        "total_positions": {
+            "mean": 154.2275169157727,
+            "std": 117.16740631313343,
+            "median": 176.89315636838086,
+            "min": 0.0,
+            "max": 1084.6336454006018
+        }
+    },
+    "vocalist_l1_陶喆": {
+        "voiced_positions": {
+            "mean": 242.58206762395713,
+            "std": 69.61805791083957,
+            "median": 227.5222796096177,
+            "min": 50.44809060945403,
+            "max": 1098.4942623171203
+        },
+        "total_positions": {
+            "mean": 171.59040988406485,
+            "std": 124.93911390018495,
+            "median": 204.4328861811408,
+            "min": 0.0,
+            "max": 1098.4942623171203
+        }
+    }
+}
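These per-singer pitch statistics are what make an automatic key decision possible at conversion time. Purely as an illustration (the actual `autoshift` behavior lives in Amphion's inference code and is not shown in this diff), a shift could be derived from the voiced-pitch medians like this:

```python
import json
import math


def estimate_key_shift(stats_path, source_median_f0, target_singer):
    """Hypothetical sketch: semitone distance between the source vocal's median F0
    and the target singer's median F0 from pitches/statistics.json."""
    with open(stats_path) as f:
        stats = json.load(f)
    target_median_f0 = stats[target_singer]["voiced_positions"]["median"]
    # 12 * log2(f_target / f_source) converts a frequency ratio into semitones.
    semitones = 12 * math.log2(target_median_f0 / source_median_f0)
    return round(semitones)


# Example: a source vocal with a 220 Hz median converted to vocalist_l1_Adele
# (median ~333 Hz) comes out to roughly +7 semitones.
shift = estimate_key_shift(
    "ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1/pitches/statistics.json",
    source_median_f0=220.0,
    target_singer="vocalist_l1_Adele",
)
```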
ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7f490fd0c97876e24bfc44413365ded7ff5d22c1c79f0dac0b754f3b32df76f
+size 88
ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e01bcf2fa621ba563b70568c18fe0742d0f48cafae83a6e8beb0bb6d1f6d146d
+size 77413046
ckpts/svc/vocalist_l1_contentvec+whisper/singers.json
ADDED
@@ -0,0 +1,17 @@
+{
+    "vocalist_l1_Adele": 0,
+    "vocalist_l1_Beyonce": 1,
+    "vocalist_l1_BrunoMars": 2,
+    "vocalist_l1_JohnMayer": 3,
+    "vocalist_l1_MichaelJackson": 4,
+    "vocalist_l1_TaylorSwift": 5,
+    "vocalist_l1_张学友": 6,
+    "vocalist_l1_李健": 7,
+    "vocalist_l1_汪峰": 8,
+    "vocalist_l1_王菲": 9,
+    "vocalist_l1_石倚洁": 10,
+    "vocalist_l1_蔡琴": 11,
+    "vocalist_l1_那英": 12,
+    "vocalist_l1_陈奕迅": 13,
+    "vocalist_l1_陶喆": 14
+}
egs/svc/MultipleContentsSVC/README.md
ADDED
@@ -0,0 +1,153 @@
+# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
+
+[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
+[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
+
+<br>
+<div align="center">
+<img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
+</div>
+<br>
+
+This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
+
+- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
+- The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
+- The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference/conversion
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
+    "log_dir": "ckpts/svc",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`).
+
+```bash
+sh egs/svc/MultipleContentsSVC/run.sh --stage 1
+```
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+    "batch_size": 32,
+    ...
+    "adamw": {
+        "lr": 2.0e-4
+    },
+    ...
+}
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
+
+```bash
+sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
+
+## 4. Inference/Conversion
+
+### Pretrained Vocoder Download
+
+We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
+
+### Run
+
+For inference/conversion, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| ---------- | ----------- | ------- |
+| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
+| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
+| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
+| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
+
+For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
+
+```bash
+sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
+    --infer_source_audio_dir [Your Audios Folder] \
+    --infer_target_speaker "opencpop_female1" \
+    --infer_key_shift "autoshift"
+```
+
+## Citations
+
+```bibtex
+@article{zhang2023leveraging,
+  title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
+  author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
+  journal={Machine Learning for Audio Workshop, NeurIPS 2023},
+  year={2023}
+}
+```
egs/svc/MultipleContentsSVC/exp_config.json
ADDED
@@ -0,0 +1,126 @@
+{
+    "base_config": "config/diffusion.json",
+    "model_type": "DiffWaveNetSVC",
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
+    "log_dir": "ckpts/svc",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        // Config for features extraction
+        "extract_mel": true,
+        "extract_pitch": true,
+        "extract_energy": true,
+        "extract_whisper_feature": true,
+        "extract_contentvec_feature": true,
+        "extract_wenet_feature": false,
+        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
+        "contentvec_batch_size": 1,
+        // Fill in the content-based pretrained model's path
+        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
+        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
+        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
+        "whisper_model": "medium",
+        "whisper_model_path": "pretrained/whisper/medium.pt",
+        // Config for features usage
+        "use_mel": true,
+        "use_min_max_norm_mel": true,
+        "use_frame_pitch": true,
+        "use_frame_energy": true,
+        "use_spkid": true,
+        "use_whisper": true,
+        "use_contentvec": true,
+        "use_wenet": false,
+        "n_mel": 100,
+        "sample_rate": 24000
+    },
+    "model": {
+        "condition_encoder": {
+            // Config for features usage
+            "use_whisper": true,
+            "use_contentvec": true,
+            "use_wenet": false,
+            "whisper_dim": 1024,
+            "contentvec_dim": 256,
+            "wenet_dim": 512,
+            "use_singer_encoder": false,
+            "pitch_min": 50,
+            "pitch_max": 1100
+        },
+        "diffusion": {
+            "scheduler": "ddpm",
+            "scheduler_settings": {
+                "num_train_timesteps": 1000,
+                "beta_start": 1.0e-4,
+                "beta_end": 0.02,
+                "beta_schedule": "linear"
+            },
+            // Diffusion steps encoder
+            "step_encoder": {
+                "dim_raw_embedding": 128,
+                "dim_hidden_layer": 512,
+                "activation": "SiLU",
+                "num_layer": 2,
+                "max_period": 10000
+            },
+            // Diffusion decoder
+            "model_type": "bidilconv",
+            // bidilconv, unet2d, TODO: unet1d
+            "bidilconv": {
+                "base_channel": 512,
+                "n_res_block": 40,
+                "conv_kernel_size": 3,
+                "dilation_cycle_length": 4,
+                // specially, 1 means no dilation
+                "conditioner_size": 384
+            }
+        }
+    },
+    "train": {
+        "batch_size": 32,
+        "gradient_accumulation_step": 1,
+        "max_epoch": -1, // -1 means no limit
+        "save_checkpoint_stride": [
+            3,
+            50
+        ],
+        "keep_last": [
+            3,
+            2
+        ],
+        "run_eval": [
+            true,
+            true
+        ],
+        "adamw": {
+            "lr": 2.0e-4
+        },
+        "reducelronplateau": {
+            "factor": 0.8,
+            "patience": 30,
+            "min_lr": 1.0e-4
+        },
+        "dataloader": {
+            "num_worker": 8,
+            "pin_memory": true
+        },
+        "sampler": {
+            "holistic_shuffle": false,
+            "drop_last": true
+        }
+    }
+}
egs/svc/MultipleContentsSVC/run.sh
ADDED
@@ -0,0 +1 @@
+../_template/run.sh
egs/svc/README.md
ADDED
@@ -0,0 +1,34 @@
+# Amphion Singing Voice Conversion (SVC) Recipe
+
+## Quick Start
+
+We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
+
+## Supported Model Architectures
+
+The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
+
+<br>
+<div align="center">
+<img src="../../imgs/svc/pipeline.png" width="70%">
+</div>
+<br>
+
+Until now, Amphion SVC has supported the following features and models:
+
+- **Speaker-agnostic Representations**:
+  - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
+  - Prosody Features: F0 and energy.
+- **Speaker Embeddings**:
+  - Speaker Look-Up Table.
+  - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
+- **Acoustic Decoders**:
+  - Diffusion-based models:
+    - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
+    - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
+  - Transformer-based models:
+    - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
+  - VAE- and Flow-based models:
+    - **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
+- **Waveform Synthesizers (Vocoders)**:
+  - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/_template/run.sh
ADDED
@@ -0,0 +1,150 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Parse the Given Parameters from the Command ###########
+options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
+eval set -- "$options"
+
+while true; do
+  case $1 in
+    # Experimental Configuration File
+    -c | --config) shift; exp_config=$1 ; shift ;;
+    # Experimental Name
+    -n | --name) shift; exp_name=$1 ; shift ;;
+    # Running Stage
+    -s | --stage) shift; running_stage=$1 ; shift ;;
+    # Visible GPU machines. The default value is "0".
+    --gpu) shift; gpu=$1 ; shift ;;
+
+    # [Only for Training] Resume configuration
+    --resume) shift; resume=$1 ; shift ;;
+    # [Only for Training] The specific checkpoint path that you want to resume from.
+    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
+    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
+    --resume_type) shift; resume_type=$1 ; shift ;;
+
+    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
+    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+    # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
+    --infer_source_file) shift; infer_source_file=$1 ; shift ;;
+    --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
+    # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
+    --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
+    # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
+    --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
+    # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
+    --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1" exit 1 ;;
+  esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+  echo "[Error] Please specify the running stage"
+  exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+  exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+  gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+  CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
+    --config $exp_config \
+    --num_workers 4
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+  if [ -z "$exp_name" ]; then
+    echo "[Error] Please specify the experiments name"
+    exit 1
+  fi
+  echo "Experimental Name: $exp_name"
+
+  if [ "$resume" = true ]; then
+    echo "Automatically resume from the experimental dir..."
+    CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
+      --config "$exp_config" \
+      --exp_name "$exp_name" \
+      --log_level info \
+      --resume
+  else
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
+      --config "$exp_config" \
+      --exp_name "$exp_name" \
+      --log_level info \
+      --resume_from_ckpt_path "$resume_from_ckpt_path" \
+      --resume_type "$resume_type"
+  fi
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+  if [ -z "$infer_expt_dir" ]; then
+    echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+    exit 1
+  fi
+
+  if [ -z "$infer_output_dir" ]; then
+    infer_output_dir="$expt_dir/result"
+  fi
+
+  if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
+    echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+    exit 1
+  fi
+
+  if [ -z "$infer_source_file" ]; then
+    infer_source=$infer_source_audio_dir
+  fi
+
+  if [ -z "$infer_source_audio_dir" ]; then
+    infer_source=$infer_source_file
+  fi
+
+  if [ -z "$infer_target_speaker" ]; then
+    echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
+    exit 1
+  fi
+
+  if [ -z "$infer_key_shift" ]; then
+    infer_key_shift="autoshift"
+  fi
+
+  if [ -z "$infer_vocoder_dir" ]; then
+    infer_vocoder_dir="$work_dir"/pretrained/bigvgan
+    echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
+  fi
+
+  CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
+    --config $exp_config \
+    --acoustics_dir $infer_expt_dir \
+    --vocoder_dir $infer_vocoder_dir \
+    --target_singer $infer_target_speaker \
+    --trans_key $infer_key_shift \
+    --source $infer_source \
+    --output_dir $infer_output_dir \
+    --log_level debug
+fi
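Note that the resume-related options parsed above are only consumed in the training stage: `--resume true` re-launches training and picks up the latest checkpoint in the experiment directory, whereas passing `--resume_from_ckpt_path` together with `--resume_type finetune` loads only the model weights from a specific checkpoint (for example, one of the `epoch-*_step-*_loss-*` directories shipped under `ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint` in this commit).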
inference.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
import glob
from tqdm import tqdm
import json
import torch
import time

from models.svc.diffusion.diffusion_inference import DiffusionInference
from models.svc.comosvc.comosvc_inference import ComoSVCInference
from models.svc.transformer.transformer_inference import TransformerInference
from utils.util import load_config
from utils.audio_slicer import split_audio, merge_segments_encodec
from processors import acoustic_extractor, content_extractor


def build_inference(args, cfg, infer_type="from_dataset"):
    supported_inference = {
        "DiffWaveNetSVC": DiffusionInference,
        "DiffComoSVC": ComoSVCInference,
        "TransformerSVC": TransformerInference,
    }

    inference_class = supported_inference[cfg.model_type]
    return inference_class(args, cfg, infer_type)


def prepare_for_audio_file(args, cfg, num_workers=1):
    preprocess_path = cfg.preprocess.processed_dir
    audio_name = cfg.inference.source_audio_name
    temp_audio_dir = os.path.join(preprocess_path, audio_name)

    ### eval file
    t = time.time()
    eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
    args.source = eval_file
    with open(eval_file, "r") as f:
        metadata = json.load(f)
    print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))

    ### acoustic features
    t = time.time()
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, temp_audio_dir, cfg
    )
    acoustic_extractor.cal_mel_min_max(
        dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
    )
    acoustic_extractor.cal_pitch_statistics_svc(
        dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
    )
    print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))

    ### content features
    t = time.time()
    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
    print("Prepare for content features: {:.1f}s".format(time.time() - t))
    return args, cfg, temp_audio_dir


def merge_for_audio_segments(audio_files, args, cfg):
    audio_name = cfg.inference.source_audio_name
    target_singer_name = args.target_singer

    merge_segments_encodec(
        wav_files=audio_files,
        fs=cfg.preprocess.sample_rate,
        output_path=os.path.join(
            args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
        ),
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    for tmp_file in audio_files:
        os.remove(tmp_file)


def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
    """
    Prepare the eval file (json) for an audio
    """

    audio_chunks_results = split_audio(
        wav_file=cfg.inference.source_audio_path,
        target_sr=cfg.preprocess.sample_rate,
        output_dir=os.path.join(temp_audio_dir, "wavs"),
        max_duration_of_segment=cfg.inference.segments_max_duration,
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    metadata = []
    for i, res in enumerate(audio_chunks_results):
        res["index"] = i
        res["Dataset"] = audio_name
        res["Singer"] = audio_name
        res["Uid"] = "{}_{}".format(audio_name, res["Uid"])
        metadata.append(res)

    eval_file = os.path.join(temp_audio_dir, "eval.json")
    with open(eval_file, "w") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)

    return eval_file


def cuda_relevant(deterministic=False):
    torch.cuda.empty_cache()
    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.allow_tf32 = True
    # Deterministic
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)


def infer(args, cfg, infer_type):
    # Build inference
    t = time.time()
    trainer = build_inference(args, cfg, infer_type)
    print("Model Init: {:.1f}s".format(time.time() - t))

    # Run inference
    t = time.time()
    output_audio_files = trainer.inference()
    print("Model inference: {:.1f}s".format(time.time() - t))
    return output_audio_files


def build_parser():
    r"""Build argument parser for inference.py.
    Anything else should be put in an extra config YAML file.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        help="Acoustics model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        required=True,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--target_singer",
        type=str,
        required=True,
        help="convert to a specific singer (e.g. --target_singers singer_id).",
    )
    parser.add_argument(
        "--trans_key",
        default=0,
        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="source_audio",
        help="Source audio file or directory. If a JSON file is given, "
        "inference from dataset is applied. If a directory is given, "
        "inference from all wav/flac/mp3 audio files in the directory is applied. "
        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="conversion_results",
        help="Output directory. Default: ./conversion_results",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    parser.add_argument(
        "--keep_cache",
        action="store_true",
        default=True,
        help="Keep cache files. Only applicable to inference from files.",
    )
    parser.add_argument(
        "--diffusion_inference_steps",
        type=int,
        default=1000,
        help="Number of inference steps. Only applicable to diffusion inference.",
    )
    return parser


def main():
    ### Parse arguments and config
    args = build_parser().parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    if os.path.isdir(args.source):
        ### Infer from file

        # Get all the source audio files (.wav, .flac, .mp3)
        source_audio_dir = args.source
        audio_list = []
        for suffix in ["wav", "flac", "mp3"]:
            audio_list += glob.glob(
                os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
            )
        print("There are {} source audios: ".format(len(audio_list)))

        # Infer for every file as dataset
        output_root_path = args.output_dir
        for audio_path in tqdm(audio_list):
            audio_name = audio_path.split("/")[-1].split(".")[0]
            args.output_dir = os.path.join(output_root_path, audio_name)
            print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))

            cfg.inference.source_audio_path = audio_path
            cfg.inference.source_audio_name = audio_name
            cfg.inference.segments_max_duration = 10.0
            cfg.inference.segments_overlap_duration = 1.0

            # Prepare metadata and features
            args, cfg, cache_dir = prepare_for_audio_file(args, cfg)

            # Infer from file
            output_audio_files = infer(args, cfg, infer_type="from_file")

            # Merge the split segments
            merge_for_audio_segments(output_audio_files, args, cfg)

            # Keep or remove caches
            if not args.keep_cache:
                os.removedirs(cache_dir)

    else:
        ### Infer from dataset
        infer(args, cfg, infer_type="from_dataset")
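A minimal sketch of driving this entry point programmatically rather than via the command line; the checkpoint paths and singer id below are hypothetical placeholders, not files shipped in this diff.

    from inference import build_parser, cuda_relevant, infer
    from utils.util import load_config

    # Hypothetical paths and singer id -- substitute a real checkpoint/config.
    args = build_parser().parse_args([
        "--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json",
        "--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper",
        "--vocoder_dir", "pretrained/bigvgan",
        "--target_singer", "some_singer_id",
        "--source", "examples/eval.json",
        "--output_dir", "conversion_results",
    ])
    cfg = load_config(args.config)
    cuda_relevant()  # enable TF32, non-deterministic cuDNN
    # A JSON --source takes the "from_dataset" branch of main()
    infer(args, cfg, infer_type="from_dataset")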
models/__init__.py
ADDED
File without changes
models/base/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .new_trainer import BaseTrainer
from .new_inference import BaseInference
models/base/base_dataset.py
ADDED
@@ -0,0 +1,350 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import numpy as np
import torch.utils.data
from torch.nn.utils.rnn import pad_sequence
from utils.data_utils import *
from processors.acoustic_extractor import cal_normalized_mel
from text import text_to_sequence
from text.text_token_collation import phoneIDCollation


class BaseDataset(torch.utils.data.Dataset):
    def __init__(self, cfg, dataset, is_valid=False):
        """
        Args:
            cfg: config
            dataset: dataset name
            is_valid: whether to use train or valid dataset
        """

        assert isinstance(dataset, str)

        # self.data_root = processed_data_dir
        self.cfg = cfg

        processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
        meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
        self.metafile_path = os.path.join(processed_data_dir, meta_file)
        self.metadata = self.get_metadata()

        '''
        load spk2id and utt2spk from json file
            spk2id: {spk1: 0, spk2: 1, ...}
            utt2spk: {dataset_uid: spk1, ...}
        '''
        if cfg.preprocess.use_spkid:
            spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id)
            with open(spk2id_path, "r") as f:
                self.spk2id = json.load(f)

            utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk)
            self.utt2spk = dict()
            with open(utt2spk_path, "r") as f:
                for line in f.readlines():
                    utt, spk = line.strip().split('\t')
                    self.utt2spk[utt] = spk

        if cfg.preprocess.use_uv:
            self.utt2uv_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)
                self.utt2uv_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.uv_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_frame_pitch:
            self.utt2frame_pitch_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2frame_pitch_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.pitch_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_frame_energy:
            self.utt2frame_energy_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2frame_energy_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.energy_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_mel:
            self.utt2mel_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2mel_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.mel_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_linear:
            self.utt2linear_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2linear_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.linear_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_audio:
            self.utt2audio_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2audio_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.audio_dir,
                    uid + ".npy",
                )
        elif cfg.preprocess.use_label:
            self.utt2label_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2label_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.label_dir,
                    uid + ".npy",
                )
        elif cfg.preprocess.use_one_hot:
            self.utt2one_hot_path = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                self.utt2one_hot_path[utt] = os.path.join(
                    cfg.preprocess.processed_dir,
                    dataset,
                    cfg.preprocess.one_hot_dir,
                    uid + ".npy",
                )

        if cfg.preprocess.use_text or cfg.preprocess.use_phone:
            self.utt2seq = {}
            for utt_info in self.metadata:
                dataset = utt_info["Dataset"]
                uid = utt_info["Uid"]
                utt = "{}_{}".format(dataset, uid)

                if cfg.preprocess.use_text:
                    text = utt_info["Text"]
                    sequence = text_to_sequence(text, cfg.preprocess.text_cleaners)
                elif cfg.preprocess.use_phone:
                    # load phoneme squence from phone file
                    phone_path = os.path.join(processed_data_dir,
                                              cfg.preprocess.phone_dir,
                                              uid + '.phone'
                                              )
                    with open(phone_path, 'r') as fin:
                        phones = fin.readlines()
                        assert len(phones) == 1
                        phones = phones[0].strip()
                    phones_seq = phones.split(' ')

                    phon_id_collator = phoneIDCollation(cfg, dataset=dataset)
                    sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq)

                self.utt2seq[utt] = sequence

    def get_metadata(self):
        with open(self.metafile_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        return metadata

    def get_dataset_name(self):
        return self.metadata[0]["Dataset"]

    def __getitem__(self, index):
        utt_info = self.metadata[index]

        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        single_feature = dict()

        if self.cfg.preprocess.use_spkid:
            single_feature["spk_id"] = np.array(
                [self.spk2id[self.utt2spk[utt]]], dtype=np.int32
            )

        if self.cfg.preprocess.use_mel:
            mel = np.load(self.utt2mel_path[utt])
            assert mel.shape[0] == self.cfg.preprocess.n_mel  # [n_mels, T]
            if self.cfg.preprocess.use_min_max_norm_mel:
                # do mel norm
                mel = cal_normalized_mel(mel, utt_info["Dataset"], self.cfg.preprocess)

            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = mel.shape[1]
            single_feature["mel"] = mel.T  # [T, n_mels]

        if self.cfg.preprocess.use_linear:
            linear = np.load(self.utt2linear_path[utt])
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = linear.shape[1]
            single_feature["linear"] = linear.T  # [T, n_linear]

        if self.cfg.preprocess.use_frame_pitch:
            frame_pitch_path = self.utt2frame_pitch_path[utt]
            frame_pitch = np.load(frame_pitch_path)
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_pitch)
            aligned_frame_pitch = align_length(
                frame_pitch, single_feature["target_len"]
            )
            single_feature["frame_pitch"] = aligned_frame_pitch

            if self.cfg.preprocess.use_uv:
                frame_uv_path = self.utt2uv_path[utt]
                frame_uv = np.load(frame_uv_path)
                aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
                aligned_frame_uv = [
                    0 if frame_uv else 1 for frame_uv in aligned_frame_uv
                ]
                aligned_frame_uv = np.array(aligned_frame_uv)
                single_feature["frame_uv"] = aligned_frame_uv

        if self.cfg.preprocess.use_frame_energy:
            frame_energy_path = self.utt2frame_energy_path[utt]
            frame_energy = np.load(frame_energy_path)
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_energy)
            aligned_frame_energy = align_length(
                frame_energy, single_feature["target_len"]
            )
            single_feature["frame_energy"] = aligned_frame_energy

        if self.cfg.preprocess.use_audio:
            audio = np.load(self.utt2audio_path[utt])
            single_feature["audio"] = audio
            single_feature["audio_len"] = audio.shape[0]

        if self.cfg.preprocess.use_phone or self.cfg.preprocess.use_text:
            single_feature["phone_seq"] = np.array(self.utt2seq[utt])
            single_feature["phone_len"] = len(self.utt2seq[utt])

        return single_feature

    def __len__(self):
        return len(self.metadata)


class BaseCollator(object):
    """Zero-pads model inputs and targets based on number of frames per step"""

    def __init__(self, cfg):
        self.cfg = cfg

    def __call__(self, batch):
        packed_batch_features = dict()

        # mel: [b, T, n_mels]
        # frame_pitch, frame_energy: [1, T]
        # target_len: [1]
        # spk_id: [b, 1]
        # mask: [b, T, 1]

        for key in batch[0].keys():
            if key == "target_len":
                packed_batch_features["target_len"] = torch.LongTensor(
                    [b["target_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
                ]
                packed_batch_features["mask"] = pad_sequence(
                    masks, batch_first=True, padding_value=0
                )
            elif key == "phone_len":
                packed_batch_features["phone_len"] = torch.LongTensor(
                    [b["phone_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["phone_len"], 1), dtype=torch.long) for b in batch
                ]
                packed_batch_features["phn_mask"] = pad_sequence(
                    masks, batch_first=True, padding_value=0
                )
            elif key == "audio_len":
                packed_batch_features["audio_len"] = torch.LongTensor(
                    [b["audio_len"] for b in batch]
                )
                masks = [
                    torch.ones((b["audio_len"], 1), dtype=torch.long) for b in batch
                ]
            else:
                values = [torch.from_numpy(b[key]) for b in batch]
                packed_batch_features[key] = pad_sequence(
                    values, batch_first=True, padding_value=0
                )
        return packed_batch_features


class BaseTestDataset(torch.utils.data.Dataset):
    def __init__(self, cfg, args):
        raise NotImplementedError

    def get_metadata(self):
        raise NotImplementedError

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.metadata)


class BaseTestCollator(object):
    """Zero-pads model inputs and targets based on number of frames per step"""

    def __init__(self, cfg):
        raise NotImplementedError

    def __call__(self, batch):
        raise NotImplementedError
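A minimal sketch of how BaseCollator pads a batch, using synthetic features; note that __call__ never touches self.cfg, so a placeholder config object is enough for this toy run.

    import numpy as np
    from models.base.base_dataset import BaseCollator

    collate = BaseCollator(cfg=None)  # cfg is unused inside __call__
    batch = [
        {"target_len": 3, "mel": np.zeros((3, 80), dtype=np.float32)},
        {"target_len": 5, "mel": np.zeros((5, 80), dtype=np.float32)},
    ]
    packed = collate(batch)
    print(packed["mel"].shape)   # torch.Size([2, 5, 80]) -- padded to the longest item
    print(packed["mask"].shape)  # torch.Size([2, 5, 1])  -- per-frame validity mask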
models/base/base_inference.py
ADDED
@@ -0,0 +1,220 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
import re
import time
from pathlib import Path

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from models.vocoders.vocoder_inference import synthesis
from torch.utils.data import DataLoader
from utils.util import set_all_random_seed
from utils.util import load_config


def parse_vocoder(vocoder_dir):
    r"""Parse vocoder config"""
    vocoder_dir = os.path.abspath(vocoder_dir)
    ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
    ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
    ckpt_path = str(ckpt_list[0])
    vocoder_cfg = load_config(os.path.join(vocoder_dir, "args.json"), lowercase=True)
    vocoder_cfg.model.bigvgan = vocoder_cfg.vocoder
    return vocoder_cfg, ckpt_path


class BaseInference(object):
    def __init__(self, cfg, args):
        self.cfg = cfg
        self.args = args
        self.model_type = cfg.model_type
        self.avg_rtf = list()
        set_all_random_seed(10086)
        os.makedirs(args.output_dir, exist_ok=True)

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
            torch.set_num_threads(10)  # inference on 1 core cpu.

        # Load acoustic model
        self.model = self.create_model().to(self.device)
        state_dict = self.load_state_dict()
        self.load_model(state_dict)
        self.model.eval()

        # Load vocoder model if necessary
        if self.args.checkpoint_dir_vocoder is not None:
            self.get_vocoder_info()

    def create_model(self):
        raise NotImplementedError

    def load_state_dict(self):
        self.checkpoint_file = self.args.checkpoint_file
        if self.checkpoint_file is None:
            assert self.args.checkpoint_dir is not None
            checkpoint_path = os.path.join(self.args.checkpoint_dir, "checkpoint")
            checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
            self.checkpoint_file = os.path.join(
                self.args.checkpoint_dir, checkpoint_filename
            )

        self.checkpoint_dir = os.path.split(self.checkpoint_file)[0]

        print("Restore acoustic model from {}".format(self.checkpoint_file))
        raw_state_dict = torch.load(self.checkpoint_file, map_location=self.device)
        self.am_restore_step = re.findall(r"step-(.+?)_loss", self.checkpoint_file)[0]

        return raw_state_dict

    def load_model(self, model):
        raise NotImplementedError

    def get_vocoder_info(self):
        self.checkpoint_dir_vocoder = self.args.checkpoint_dir_vocoder
        self.vocoder_cfg = os.path.join(
            os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
        )
        self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
        self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
        self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]

    def build_test_utt_data(self):
        raise NotImplementedError

    def build_testdata_loader(self, args, target_speaker=None):
        datasets, collate = self.build_test_dataset()
        self.test_dataset = datasets(self.cfg, args, target_speaker)
        self.test_collate = collate(self.cfg)
        self.test_batch_size = min(
            self.cfg.train.batch_size, len(self.test_dataset.metadata)
        )
        test_loader = DataLoader(
            self.test_dataset,
            collate_fn=self.test_collate,
            num_workers=self.args.num_workers,
            batch_size=self.test_batch_size,
            shuffle=False,
        )
        return test_loader

    def inference_each_batch(self, batch_data):
        raise NotImplementedError

    def inference_for_batches(self, args, target_speaker=None):
        ###### Construct test_batch ######
        loader = self.build_testdata_loader(args, target_speaker)

        n_batch = len(loader)
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        print(
            "Model eval time: {}, batch_size = {}, n_batch = {}".format(
                now, self.test_batch_size, n_batch
            )
        )
        self.model.eval()

        ###### Inference for each batch ######
        pred_res = []
        with torch.no_grad():
            for i, batch_data in enumerate(loader if n_batch == 1 else tqdm(loader)):
                # Put the data to device
                for k, v in batch_data.items():
                    batch_data[k] = batch_data[k].to(self.device)

                y_pred, stats = self.inference_each_batch(batch_data)

                pred_res += y_pred

        return pred_res

    def inference(self, feature):
        raise NotImplementedError

    def synthesis_by_vocoder(self, pred):
        audios_pred = synthesis(
            self.vocoder_cfg,
            self.checkpoint_dir_vocoder,
            len(pred),
            pred,
        )
        return audios_pred

    def __call__(self, utt):
        feature = self.build_test_utt_data(utt)
        start_time = time.time()
        with torch.no_grad():
            outputs = self.inference(feature)[0]
        time_used = time.time() - start_time
        rtf = time_used / (
            outputs.shape[1]
            * self.cfg.preprocess.hop_size
            / self.cfg.preprocess.sample_rate
        )
        print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))
        self.avg_rtf.append(rtf)
        audios = outputs.cpu().squeeze().numpy().reshape(-1, 1)
        return audios


def base_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--use_ddp_inference", default=False)
    parser.add_argument("--n_workers", default=1, type=int)
    parser.add_argument("--local_rank", default=-1, type=int)
    parser.add_argument(
        "--batch_size", default=1, type=int, help="Batch size for inference"
    )
    parser.add_argument(
        "--num_workers",
        default=1,
        type=int,
        help="Worker number for inference dataloader",
    )
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default=None,
        help="Checkpoint dir including model file and configuration",
    )
    parser.add_argument(
        "--checkpoint_file", help="checkpoint file", type=str, default=None
    )
    parser.add_argument(
        "--test_list", help="test utterance list for testing", type=str, default=None
    )
    parser.add_argument(
        "--checkpoint_dir_vocoder",
        help="Vocoder's checkpoint dir including model file and configuration",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    return parser


if __name__ == "__main__":
    parser = base_parser()
    args = parser.parse_args()
    cfg = load_config(args.config)

    # Build inference
    inference = BaseInference(cfg, args)
    inference()
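Note that parse_vocoder() above picks the newest weights by sorting "*.pt" files on int(x.stem), so it expects an args.json plus step-numbered checkpoint files in the vocoder directory. A minimal sketch of that expectation, with a hypothetical directory and file names:

    from pathlib import Path

    vocoder_dir = Path("pretrained/bigvgan")  # hypothetical location
    # Expected layout: args.json plus integer-stem weights, e.g. 400000.pt, 490000.pt
    ckpts = sorted(vocoder_dir.glob("*.pt"), key=lambda p: int(p.stem), reverse=True)
    print(ckpts[0])  # newest checkpoint, the same one parse_vocoder() returns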
models/base/base_sampler.py
ADDED
@@ -0,0 +1,136 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import random

from torch.utils.data import ConcatDataset, Dataset
from torch.utils.data.sampler import (
    BatchSampler,
    RandomSampler,
    Sampler,
    SequentialSampler,
)


class ScheduledSampler(Sampler):
    """A sampler that samples data from a given concat-dataset.

    Args:
        concat_dataset (ConcatDataset): a concatenated dataset consisting of all datasets
        batch_size (int): batch size
        holistic_shuffle (bool): whether to shuffle the whole dataset or not
        logger (logging.Logger): logger to print warning message

    Usage:
        For cfg.train.batch_size = 3, cfg.train.holistic_shuffle = False, cfg.train.drop_last = True:
        >>> list(ScheduledSampler(ConcatDataset([0, 1, 2], [3, 4, 5], [6, 7, 8]])))
        [3, 4, 5, 0, 1, 2, 6, 7, 8]
    """

    def __init__(
        self,
        concat_dataset,
        batch_size,
        holistic_shuffle,
        logger=None,
        loader_type="train",
    ):
        if not isinstance(concat_dataset, ConcatDataset):
            raise ValueError(
                "concat_dataset must be an instance of ConcatDataset, but got {}".format(
                    type(concat_dataset)
                )
            )
        if not isinstance(batch_size, int):
            raise ValueError(
                "batch_size must be an integer, but got {}".format(type(batch_size))
            )
        if not isinstance(holistic_shuffle, bool):
            raise ValueError(
                "holistic_shuffle must be a boolean, but got {}".format(
                    type(holistic_shuffle)
                )
            )

        self.concat_dataset = concat_dataset
        self.batch_size = batch_size
        self.holistic_shuffle = holistic_shuffle

        affected_dataset_name = []
        affected_dataset_len = []
        for dataset in concat_dataset.datasets:
            dataset_len = len(dataset)
            dataset_name = dataset.get_dataset_name()
            if dataset_len < batch_size:
                affected_dataset_name.append(dataset_name)
                affected_dataset_len.append(dataset_len)

        self.type = loader_type
        for dataset_name, dataset_len in zip(
            affected_dataset_name, affected_dataset_len
        ):
            if not loader_type == "valid":
                logger.warning(
                    "The {} dataset {} has a length of {}, which is smaller than the batch size {}. This may cause unexpected behavior.".format(
                        loader_type, dataset_name, dataset_len, batch_size
                    )
                )

    def __len__(self):
        # the number of batches with drop last
        num_of_batches = sum(
            [
                math.floor(len(dataset) / self.batch_size)
                for dataset in self.concat_dataset.datasets
            ]
        )
        # if samples are not enough for one batch, we don't drop last
        if self.type == "valid" and num_of_batches < 1:
            return len(self.concat_dataset)
        return num_of_batches * self.batch_size

    def __iter__(self):
        iters = []
        for dataset in self.concat_dataset.datasets:
            iters.append(
                SequentialSampler(dataset).__iter__()
                if not self.holistic_shuffle
                else RandomSampler(dataset).__iter__()
            )
        # e.g. [0, 200, 400]
        init_indices = [0] + self.concat_dataset.cumulative_sizes[:-1]
        output_batches = []
        for dataset_idx in range(len(self.concat_dataset.datasets)):
            cur_batch = []
            for idx in iters[dataset_idx]:
                cur_batch.append(idx + init_indices[dataset_idx])
                if len(cur_batch) == self.batch_size:
                    output_batches.append(cur_batch)
                    cur_batch = []
            # if loader_type is valid, we don't need to drop last
            if self.type == "valid" and len(cur_batch) > 0:
                output_batches.append(cur_batch)

        # force drop last in training
        random.shuffle(output_batches)
        output_indices = [item for sublist in output_batches for item in sublist]
        return iter(output_indices)


def build_samplers(concat_dataset: Dataset, cfg, logger, loader_type):
    sampler = ScheduledSampler(
        concat_dataset,
        cfg.train.batch_size,
        cfg.train.sampler.holistic_shuffle,
        logger,
        loader_type,
    )
    batch_sampler = BatchSampler(
        sampler,
        cfg.train.batch_size,
        cfg.train.sampler.drop_last if not loader_type == "valid" else False,
    )
    return sampler, batch_sampler
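A minimal sketch, using toy datasets, of how ScheduledSampler builds per-dataset batches before shuffling whole batches; each sub-dataset only needs __len__, __getitem__, and get_dataset_name(), which is what the sampler relies on.

    import logging
    from torch.utils.data import ConcatDataset, Dataset
    from models.base.base_sampler import ScheduledSampler

    class ToyDataset(Dataset):
        def __init__(self, name, n):
            self.name, self.n = name, n
        def __len__(self):
            return self.n
        def __getitem__(self, idx):
            return idx
        def get_dataset_name(self):
            return self.name

    concat = ConcatDataset([ToyDataset("a", 4), ToyDataset("b", 5)])
    sampler = ScheduledSampler(
        concat, batch_size=2, holistic_shuffle=False,
        logger=logging.getLogger(__name__), loader_type="train",
    )
    # 8 indices grouped in batches of 2: each batch stays within one sub-dataset,
    # the leftover sample of "b" is dropped, and only the batch order is shuffled.
    print(list(sampler))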
models/base/base_trainer.py
ADDED
@@ -0,0 +1,348 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import collections
import json
import os
import sys
import time

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import ConcatDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from models.base.base_sampler import BatchSampler
from utils.util import (
    Logger,
    remove_older_ckpt,
    save_config,
    set_all_random_seed,
    ValueWindow,
)


class BaseTrainer(object):
    def __init__(self, args, cfg):
        self.args = args
        self.log_dir = args.log_dir
        self.cfg = cfg

        self.checkpoint_dir = os.path.join(args.log_dir, "checkpoints")
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        if not cfg.train.ddp or args.local_rank == 0:
            self.sw = SummaryWriter(os.path.join(args.log_dir, "events"))
            self.logger = self.build_logger()
        self.time_window = ValueWindow(50)

        self.step = 0
        self.epoch = -1
        self.max_epochs = self.cfg.train.epochs
        self.max_steps = self.cfg.train.max_steps

        # set random seed & init distributed training
        set_all_random_seed(self.cfg.train.random_seed)
        if cfg.train.ddp:
            dist.init_process_group(backend="nccl")

        if cfg.model_type not in ["AutoencoderKL", "AudioLDM"]:
            self.singers = self.build_singers_lut()

        # setup data_loader
        self.data_loader = self.build_data_loader()

        # setup model & enable distributed training
        self.model = self.build_model()
        print(self.model)

        if isinstance(self.model, dict):
            for key, value in self.model.items():
                value.cuda(self.args.local_rank)
                if key == "PQMF":
                    continue
                if cfg.train.ddp:
                    self.model[key] = DistributedDataParallel(
                        value, device_ids=[self.args.local_rank]
                    )
        else:
            self.model.cuda(self.args.local_rank)
            if cfg.train.ddp:
                self.model = DistributedDataParallel(
                    self.model, device_ids=[self.args.local_rank]
                )

        # create criterion
        self.criterion = self.build_criterion()
        if isinstance(self.criterion, dict):
            for key, value in self.criterion.items():
                self.criterion[key].cuda(args.local_rank)
        else:
            self.criterion.cuda(self.args.local_rank)

        # optimizer
        self.optimizer = self.build_optimizer()
        self.scheduler = self.build_scheduler()

        # save config file
        self.config_save_path = os.path.join(self.checkpoint_dir, "args.json")

    def build_logger(self):
        log_file = os.path.join(self.checkpoint_dir, "train.log")
        logger = Logger(log_file, level=self.args.log_level).logger

        return logger

    def build_dataset(self):
        raise NotImplementedError

    def build_data_loader(self):
        Dataset, Collator = self.build_dataset()
        # build dataset instance for each dataset and combine them by ConcatDataset
        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=False)
            datasets_list.append(subdataset)
        train_dataset = ConcatDataset(datasets_list)

        train_collate = Collator(self.cfg)
        # TODO: multi-GPU training
        if self.cfg.train.ddp:
            raise NotImplementedError("DDP is not supported yet.")

        # sampler will provide indices to batch_sampler, which will perform batching and yield batch indices
        batch_sampler = BatchSampler(
            cfg=self.cfg, concat_dataset=train_dataset, dataset_list=datasets_list
        )

        # use batch_sampler argument instead of (sampler, shuffle, drop_last, batch_size)
        train_loader = DataLoader(
            train_dataset,
            collate_fn=train_collate,
            num_workers=self.args.num_workers,
            batch_sampler=batch_sampler,
            pin_memory=False,
        )
        if not self.cfg.train.ddp or self.args.local_rank == 0:
            datasets_list = []
            for dataset in self.cfg.dataset:
                subdataset = Dataset(self.cfg, dataset, is_valid=True)
                datasets_list.append(subdataset)
            valid_dataset = ConcatDataset(datasets_list)
            valid_collate = Collator(self.cfg)
            batch_sampler = BatchSampler(
                cfg=self.cfg, concat_dataset=valid_dataset, dataset_list=datasets_list
            )
            valid_loader = DataLoader(
                valid_dataset,
                collate_fn=valid_collate,
                num_workers=1,
                batch_sampler=batch_sampler,
            )
        else:
            raise NotImplementedError("DDP is not supported yet.")
            # valid_loader = None
        data_loader = {"train": train_loader, "valid": valid_loader}
        return data_loader

    def build_singers_lut(self):
        # combine singers
        if not os.path.exists(os.path.join(self.log_dir, self.cfg.preprocess.spk2id)):
            singers = collections.OrderedDict()
        else:
            with open(
                os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "r"
            ) as singer_file:
                singers = json.load(singer_file)
        singer_count = len(singers)
        for dataset in self.cfg.dataset:
            singer_lut_path = os.path.join(
                self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
            )
            with open(singer_lut_path, "r") as singer_lut_path:
                singer_lut = json.load(singer_lut_path)
            for singer in singer_lut.keys():
                if singer not in singers:
                    singers[singer] = singer_count
                    singer_count += 1
        with open(
            os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "w"
        ) as singer_file:
            json.dump(singers, singer_file, indent=4, ensure_ascii=False)
        print(
            "singers have been dumped to {}".format(
                os.path.join(self.log_dir, self.cfg.preprocess.spk2id)
            )
        )
        return singers

    def build_model(self):
        raise NotImplementedError()

    def build_optimizer(self):
        raise NotImplementedError

    def build_scheduler(self):
        raise NotImplementedError()

    def build_criterion(self):
        raise NotImplementedError

    def get_state_dict(self):
        raise NotImplementedError

    def save_config_file(self):
        save_config(self.config_save_path, self.cfg)

    # TODO, save without module.
    def save_checkpoint(self, state_dict, saved_model_path):
        torch.save(state_dict, saved_model_path)

    def load_checkpoint(self):
        checkpoint_path = os.path.join(self.checkpoint_dir, "checkpoint")
        assert os.path.exists(checkpoint_path)
        checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
        model_path = os.path.join(self.checkpoint_dir, checkpoint_filename)
        assert os.path.exists(model_path)
        if not self.cfg.train.ddp or self.args.local_rank == 0:
            self.logger.info(f"Re(store) from {model_path}")
        checkpoint = torch.load(model_path, map_location="cpu")
        return checkpoint

    def load_model(self, checkpoint):
        raise NotImplementedError

    def restore(self):
        checkpoint = self.load_checkpoint()
        self.load_model(checkpoint)

    def train_step(self, data):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    @torch.no_grad()
    def eval_step(self):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def write_summary(self, losses, stats):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def write_valid_summary(self, losses, stats):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def echo_log(self, losses, mode="Training"):
        message = [
            "{} - Epoch {} Step {}: [{:.3f} s/step]".format(
                mode, self.epoch + 1, self.step, self.time_window.average
            )
        ]

        for key in sorted(losses.keys()):
            if isinstance(losses[key], dict):
                for k, v in losses[key].items():
                    message.append(
                        str(k).split("/")[-1] + "=" + str(round(float(v), 5))
                    )
            else:
                message.append(
                    str(key).split("/")[-1] + "=" + str(round(float(losses[key]), 5))
                )
        self.logger.info(", ".join(message))

    def eval_epoch(self):
        self.logger.info("Validation...")
        valid_losses = {}
        for i, batch_data in enumerate(self.data_loader["valid"]):
            for k, v in batch_data.items():
                if isinstance(v, torch.Tensor):
                    batch_data[k] = v.cuda()
            valid_loss, valid_stats, total_valid_loss = self.eval_step(batch_data, i)
            for key in valid_loss:
                if key not in valid_losses:
                    valid_losses[key] = 0
                valid_losses[key] += valid_loss[key]

        # Add mel and audio to the Tensorboard
        # Average loss
        for key in valid_losses:
            valid_losses[key] /= i + 1
        self.echo_log(valid_losses, "Valid")
        return valid_losses, valid_stats

    def train_epoch(self):
        for i, batch_data in enumerate(self.data_loader["train"]):
            start_time = time.time()
            # Put the data to cuda device
            for k, v in batch_data.items():
                if isinstance(v, torch.Tensor):
                    batch_data[k] = v.cuda(self.args.local_rank)

            # Training step
            train_losses, train_stats, total_loss = self.train_step(batch_data)
            self.time_window.append(time.time() - start_time)

            if self.args.local_rank == 0 or not self.cfg.train.ddp:
                if self.step % self.args.stdout_interval == 0:
                    self.echo_log(train_losses, "Training")

                if self.step % self.cfg.train.save_summary_steps == 0:
                    self.logger.info(f"Save summary as step {self.step}")
                    self.write_summary(train_losses, train_stats)

                if (
                    self.step % self.cfg.train.save_checkpoints_steps == 0
                    and self.step != 0
                ):
                    saved_model_name = "step-{:07d}_loss-{:.4f}.pt".format(
                        self.step, total_loss
                    )
                    saved_model_path = os.path.join(
                        self.checkpoint_dir, saved_model_name
                    )
                    saved_state_dict = self.get_state_dict()
                    self.save_checkpoint(saved_state_dict, saved_model_path)
                    self.save_config_file()
                    # keep max n models
                    remove_older_ckpt(
                        saved_model_name,
                        self.checkpoint_dir,
                        max_to_keep=self.cfg.train.keep_checkpoint_max,
                    )

                if self.step != 0 and self.step % self.cfg.train.valid_interval == 0:
                    if isinstance(self.model, dict):
                        for key in self.model.keys():
                            self.model[key].eval()
                    else:
                        self.model.eval()
                    # Evaluate one epoch and get average loss
                    valid_losses, valid_stats = self.eval_epoch()
                    if isinstance(self.model, dict):
                        for key in self.model.keys():
                            self.model[key].train()
                    else:
                        self.model.train()
                    # Write validation losses to summary.
                    self.write_valid_summary(valid_losses, valid_stats)
            self.step += 1

    def train(self):
        for epoch in range(max(0, self.epoch), self.max_epochs):
            self.train_epoch()
            self.epoch += 1
            if self.step > self.max_steps:
                self.logger.info("Training finished!")
                break
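Both this trainer's load_checkpoint() and BaseInference.load_state_dict() above resolve the latest weights through a plain-text bookkeeping file named "checkpoint" inside the checkpoint directory, whose last line is the newest model filename. A minimal sketch of reading it the same way (the directory name is a hypothetical example):

    import os

    checkpoint_dir = "logs/my_exp/checkpoints"  # hypothetical experiment directory
    with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
        latest = f.readlines()[-1].strip()      # e.g. "step-0678447_loss-1.9468.pt"
    model_path = os.path.join(checkpoint_dir, latest)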
models/base/new_dataset.py
ADDED
@@ -0,0 +1,50 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from abc import abstractmethod
from pathlib import Path

import json5
import torch
import yaml


# TODO: for training and validating
class BaseDataset(torch.utils.data.Dataset):
    r"""Base dataset for training and validating."""

    def __init__(self, args, cfg, is_valid=False):
        pass


class BaseTestDataset(torch.utils.data.Dataset):
    r"""Test dataset for inference."""

    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        assert infer_type in ["from_dataset", "from_file"]

        self.args = args
        self.cfg = cfg
        self.infer_type = infer_type

    @abstractmethod
    def __getitem__(self, index):
        pass

    def __len__(self):
        return len(self.metadata)

    def get_metadata(self):
        path = Path(self.args.source)
        if path.suffix == ".json" or path.suffix == ".jsonc":
            metadata = json5.load(open(self.args.source, "r"))
        elif path.suffix == ".yaml" or path.suffix == ".yml":
            metadata = yaml.full_load(open(self.args.source, "r"))
        else:
            raise ValueError(f"Unsupported file type: {path.suffix}")

        return metadata
models/base/new_inference.py
ADDED
@@ -0,0 +1,249 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import random
import re
import time
from abc import abstractmethod
from pathlib import Path

import accelerate
import json5
import numpy as np
import torch
from accelerate.logging import get_logger
from torch.utils.data import DataLoader

from models.vocoders.vocoder_inference import synthesis
from utils.io import save_audio
from utils.util import load_config
from utils.audio_slicer import is_silence

EPS = 1.0e-12


class BaseInference(object):
    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        super().__init__()

        start = time.monotonic_ns()
        self.args = args
        self.cfg = cfg

        assert infer_type in ["from_dataset", "from_file"]
        self.infer_type = infer_type

        # init with accelerate
        self.accelerator = accelerate.Accelerator()
        self.accelerator.wait_for_everyone()

        # Use accelerate logger for distributed inference
        with self.accelerator.main_process_first():
            self.logger = get_logger("inference", log_level=args.log_level)

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")
        self.logger.debug(f"Using {args.log_level.upper()} logging level.")

        self.acoustics_dir = args.acoustics_dir
        self.logger.debug(f"Acoustic dir: {args.acoustics_dir}")
        self.vocoder_dir = args.vocoder_dir
        self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")
        # should be in svc inferencer
        # self.target_singer = args.target_singer
        # self.logger.info(f"Target singers: {args.target_singer}")
        # self.trans_key = args.trans_key
        # self.logger.info(f"Trans key: {args.trans_key}")

        os.makedirs(args.output_dir, exist_ok=True)

        # set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # setup data_loader
        with self.accelerator.main_process_first():
            self.logger.info("Building dataset...")
            start = time.monotonic_ns()
            self.test_dataloader = self._build_dataloader()
            end = time.monotonic_ns()
            self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")

        # setup model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            # self.logger.debug(self.model)
            self.logger.info(f"Building model done in {(end - start) / 1e6:.3f}ms")

        # init with accelerate
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        self.accelerator = accelerate.Accelerator()
        self.model = self.accelerator.prepare(self.model)
        end = time.monotonic_ns()
        self.accelerator.wait_for_everyone()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.3f}ms")

        with self.accelerator.main_process_first():
            self.logger.info("Loading checkpoint...")
            start = time.monotonic_ns()
            # TODO: Also, suppose only use latest one yet
            self.__load_model(os.path.join(args.acoustics_dir, "checkpoint"))
            end = time.monotonic_ns()
            self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.3f}ms")

        self.model.eval()
        self.accelerator.wait_for_everyone()

    ### Abstract methods ###
    @abstractmethod
    def _build_test_dataset(self):
        pass

    @abstractmethod
    def _build_model(self):
        pass

    @abstractmethod
    @torch.inference_mode()
    def _inference_each_batch(self, batch_data):
        pass

    ### Abstract methods end ###

    @torch.inference_mode()
    def inference(self):
        for i, batch in enumerate(self.test_dataloader):
            y_pred = self._inference_each_batch(batch).cpu()
            mel_min, mel_max = self.test_dataset.target_mel_extrema
            y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
            y_ls = y_pred.chunk(self.test_batch_size)
            tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
            j = 0
            for it, l in zip(y_ls, tgt_ls):
                l = l.item()
                it = it.squeeze(0)[:l]
                uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
                torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
                j += 1

        vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir)

        res = synthesis(
            cfg=vocoder_cfg,
            vocoder_weight_file=vocoder_ckpt,
            n_samples=None,
            pred=[
                torch.load(
                    os.path.join(self.args.output_dir, "{}.pt".format(i["Uid"]))
                ).numpy(force=True)
                for i in self.test_dataset.metadata
            ],
        )

        output_audio_files = []
        for it, wav in zip(self.test_dataset.metadata, res):
            uid = it["Uid"]
            file = os.path.join(self.args.output_dir, f"{uid}.wav")
            output_audio_files.append(file)

            wav = wav.numpy(force=True)
            save_audio(
                file,
                wav,
                self.cfg.preprocess.sample_rate,
                add_silence=False,
                turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate),
            )
            os.remove(os.path.join(self.args.output_dir, f"{uid}.pt"))

        return sorted(output_audio_files)

    # TODO: LEGACY CODE
    def _build_dataloader(self):
        datasets, collate = self._build_test_dataset()
        self.test_dataset = datasets(self.args, self.cfg, self.infer_type)
        self.test_collate = collate(self.cfg)
        self.test_batch_size = min(
            self.cfg.train.batch_size, len(self.test_dataset.metadata)
        )
        test_dataloader = DataLoader(
            self.test_dataset,
            collate_fn=self.test_collate,
            num_workers=1,
            batch_size=self.test_batch_size,
            shuffle=False,
        )
        return test_dataloader

    def __load_model(self, checkpoint_dir: str = None, checkpoint_path: str = None):
        r"""Load model from checkpoint. If checkpoint_path is None, it will
        load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
        None, it will load the checkpoint specified by checkpoint_path. **Only use this
        method after** ``accelerator.prepare()``.
        """
        if checkpoint_path is None:
            ls = []
            for i in Path(checkpoint_dir).iterdir():
for i in Path(checkpoint_dir).iterdir():
|
203 |
+
if re.match(r"epoch-\d+_step-\d+_loss-[\d.]+", str(i.stem)):
|
204 |
+
ls.append(i)
|
205 |
+
ls.sort(
|
206 |
+
key=lambda x: int(x.stem.split("_")[-3].split("-")[-1]), reverse=True
|
207 |
+
)
|
208 |
+
checkpoint_path = ls[0]
|
209 |
+
else:
|
210 |
+
checkpoint_path = Path(checkpoint_path)
|
211 |
+
self.accelerator.load_state(str(checkpoint_path))
|
212 |
+
# set epoch and step
|
213 |
+
self.epoch = int(checkpoint_path.stem.split("_")[-3].split("-")[-1])
|
214 |
+
self.step = int(checkpoint_path.stem.split("_")[-2].split("-")[-1])
|
215 |
+
return str(checkpoint_path)
|
216 |
+
|
217 |
+
@staticmethod
|
218 |
+
def _set_random_seed(seed):
|
219 |
+
r"""Set random seed for all possible random modules."""
|
220 |
+
random.seed(seed)
|
221 |
+
np.random.seed(seed)
|
222 |
+
torch.random.manual_seed(seed)
|
223 |
+
|
224 |
+
@staticmethod
|
225 |
+
def _parse_vocoder(vocoder_dir):
|
226 |
+
r"""Parse vocoder config"""
|
227 |
+
vocoder_dir = os.path.abspath(vocoder_dir)
|
228 |
+
ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
|
229 |
+
ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
|
230 |
+
ckpt_path = str(ckpt_list[0])
|
231 |
+
vocoder_cfg = load_config(
|
232 |
+
os.path.join(vocoder_dir, "args.json"), lowercase=True
|
233 |
+
)
|
234 |
+
return vocoder_cfg, ckpt_path
|
235 |
+
|
236 |
+
@staticmethod
|
237 |
+
def __count_parameters(model):
|
238 |
+
return sum(p.numel() for p in model.parameters())
|
239 |
+
|
240 |
+
def __dump_cfg(self, path):
|
241 |
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
242 |
+
json5.dump(
|
243 |
+
self.cfg,
|
244 |
+
open(path, "w"),
|
245 |
+
indent=4,
|
246 |
+
sort_keys=True,
|
247 |
+
ensure_ascii=False,
|
248 |
+
quote_keys=True,
|
249 |
+
)
|
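For orientation, a minimal sketch of how a concrete subclass of BaseInference above is driven. It assumes the DiffusionInference subclass added elsewhere in this PR keeps the same (args, cfg, infer_type) constructor; the argparse flags only mirror the attributes the constructor reads (acoustics_dir, vocoder_dir, output_dir, log_level) plus the target_singer / trans_key fields consumed by the SVC test dataset. This is an illustrative sketch, not part of the diff.

# Illustrative sketch only (not part of this diff). Assumes DiffusionInference
# from models/svc/diffusion/diffusion_inference.py keeps the
# (args, cfg, infer_type) signature of BaseInference.
import argparse

from models.svc.diffusion.diffusion_inference import DiffusionInference
from utils.util import load_config

parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--acoustics_dir", type=str, required=True)
parser.add_argument("--vocoder_dir", type=str, required=True)
parser.add_argument("--output_dir", type=str, default="result")
parser.add_argument("--log_level", type=str, default="info")
parser.add_argument("--target_singer", type=str, required=True)
parser.add_argument("--trans_key", type=str, default=None)
args = parser.parse_args()

cfg = load_config(args.config)
inferencer = DiffusionInference(args, cfg, infer_type="from_dataset")
wav_paths = inferencer.inference()  # sorted list of generated .wav files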
models/base/new_trainer.py
ADDED
@@ -0,0 +1,722 @@
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import random
|
9 |
+
import shutil
|
10 |
+
import time
|
11 |
+
from abc import abstractmethod
|
12 |
+
from pathlib import Path
|
13 |
+
|
14 |
+
import accelerate
|
15 |
+
import json5
|
16 |
+
import numpy as np
|
17 |
+
import torch
|
18 |
+
from accelerate.logging import get_logger
|
19 |
+
from accelerate.utils import ProjectConfiguration
|
20 |
+
from torch.utils.data import ConcatDataset, DataLoader
|
21 |
+
from tqdm import tqdm
|
22 |
+
|
23 |
+
from models.base.base_sampler import build_samplers
|
24 |
+
from optimizer.optimizers import NoamLR
|
25 |
+
|
26 |
+
|
27 |
+
class BaseTrainer(object):
|
28 |
+
r"""The base trainer for all tasks. Any trainer should inherit from this class."""
|
29 |
+
|
30 |
+
def __init__(self, args=None, cfg=None):
|
31 |
+
super().__init__()
|
32 |
+
|
33 |
+
self.args = args
|
34 |
+
self.cfg = cfg
|
35 |
+
|
36 |
+
cfg.exp_name = args.exp_name
|
37 |
+
|
38 |
+
# init with accelerate
|
39 |
+
self._init_accelerator()
|
40 |
+
self.accelerator.wait_for_everyone()
|
41 |
+
|
42 |
+
# Use accelerate logger for distributed training
|
43 |
+
with self.accelerator.main_process_first():
|
44 |
+
self.logger = get_logger(args.exp_name, log_level=args.log_level)
|
45 |
+
|
46 |
+
# Log some info
|
47 |
+
self.logger.info("=" * 56)
|
48 |
+
self.logger.info("||\t\t" + "New training process started." + "\t\t||")
|
49 |
+
self.logger.info("=" * 56)
|
50 |
+
self.logger.info("\n")
|
51 |
+
self.logger.debug(f"Using {args.log_level.upper()} logging level.")
|
52 |
+
self.logger.info(f"Experiment name: {args.exp_name}")
|
53 |
+
self.logger.info(f"Experiment directory: {self.exp_dir}")
|
54 |
+
self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
|
55 |
+
if self.accelerator.is_main_process:
|
56 |
+
os.makedirs(self.checkpoint_dir, exist_ok=True)
|
57 |
+
self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")
|
58 |
+
|
59 |
+
# init counts
|
60 |
+
self.batch_count: int = 0
|
61 |
+
self.step: int = 0
|
62 |
+
self.epoch: int = 0
|
63 |
+
self.max_epoch = (
|
64 |
+
self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
|
65 |
+
)
|
66 |
+
self.logger.info(
|
67 |
+
"Max epoch: {}".format(
|
68 |
+
self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
|
69 |
+
)
|
70 |
+
)
|
71 |
+
|
72 |
+
# Check values
|
73 |
+
if self.accelerator.is_main_process:
|
74 |
+
self.__check_basic_configs()
|
75 |
+
# Set runtime configs
|
76 |
+
self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
|
77 |
+
self.checkpoints_path = [
|
78 |
+
[] for _ in range(len(self.save_checkpoint_stride))
|
79 |
+
]
|
80 |
+
self.keep_last = [
|
81 |
+
i if i > 0 else float("inf") for i in self.cfg.train.keep_last
|
82 |
+
]
|
83 |
+
self.run_eval = self.cfg.train.run_eval
|
84 |
+
|
85 |
+
# set random seed
|
86 |
+
with self.accelerator.main_process_first():
|
87 |
+
start = time.monotonic_ns()
|
88 |
+
self._set_random_seed(self.cfg.train.random_seed)
|
89 |
+
end = time.monotonic_ns()
|
90 |
+
self.logger.debug(
|
91 |
+
f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
|
92 |
+
)
|
93 |
+
self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")
|
94 |
+
|
95 |
+
# setup data_loader
|
96 |
+
with self.accelerator.main_process_first():
|
97 |
+
self.logger.info("Building dataset...")
|
98 |
+
start = time.monotonic_ns()
|
99 |
+
self.train_dataloader, self.valid_dataloader = self._build_dataloader()
|
100 |
+
end = time.monotonic_ns()
|
101 |
+
self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")
|
102 |
+
|
103 |
+
# setup model
|
104 |
+
with self.accelerator.main_process_first():
|
105 |
+
self.logger.info("Building model...")
|
106 |
+
start = time.monotonic_ns()
|
107 |
+
self.model = self._build_model()
|
108 |
+
end = time.monotonic_ns()
|
109 |
+
self.logger.debug(self.model)
|
110 |
+
self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
|
111 |
+
self.logger.info(
|
112 |
+
f"Model parameters: {self.__count_parameters(self.model)/1e6:.2f}M"
|
113 |
+
)
|
114 |
+
# optimizer & scheduler
|
115 |
+
with self.accelerator.main_process_first():
|
116 |
+
self.logger.info("Building optimizer and scheduler...")
|
117 |
+
start = time.monotonic_ns()
|
118 |
+
self.optimizer = self.__build_optimizer()
|
119 |
+
self.scheduler = self.__build_scheduler()
|
120 |
+
end = time.monotonic_ns()
|
121 |
+
self.logger.info(
|
122 |
+
f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
|
123 |
+
)
|
124 |
+
|
125 |
+
# accelerate prepare
|
126 |
+
self.logger.info("Initializing accelerate...")
|
127 |
+
start = time.monotonic_ns()
|
128 |
+
(
|
129 |
+
self.train_dataloader,
|
130 |
+
self.valid_dataloader,
|
131 |
+
self.model,
|
132 |
+
self.optimizer,
|
133 |
+
self.scheduler,
|
134 |
+
) = self.accelerator.prepare(
|
135 |
+
self.train_dataloader,
|
136 |
+
self.valid_dataloader,
|
137 |
+
self.model,
|
138 |
+
self.optimizer,
|
139 |
+
self.scheduler,
|
140 |
+
)
|
141 |
+
end = time.monotonic_ns()
|
142 |
+
self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")
|
143 |
+
|
144 |
+
# create criterion
|
145 |
+
with self.accelerator.main_process_first():
|
146 |
+
self.logger.info("Building criterion...")
|
147 |
+
start = time.monotonic_ns()
|
148 |
+
self.criterion = self._build_criterion()
|
149 |
+
end = time.monotonic_ns()
|
150 |
+
self.logger.info(f"Building criterion done in {(end - start) / 1e6:.2f}ms")
|
151 |
+
|
152 |
+
# Resume or Finetune
|
153 |
+
with self.accelerator.main_process_first():
|
154 |
+
if args.resume:
|
155 |
+
## Automatically resume according to the current experimental name
|
156 |
+
self.logger.info("Resuming from {}...".format(self.checkpoint_dir))
|
157 |
+
start = time.monotonic_ns()
|
158 |
+
ckpt_path = self.__load_model(
|
159 |
+
checkpoint_dir=self.checkpoint_dir, resume_type=args.resume_type
|
160 |
+
)
|
161 |
+
end = time.monotonic_ns()
|
162 |
+
self.logger.info(
|
163 |
+
f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
|
164 |
+
)
|
165 |
+
self.checkpoints_path = json.load(
|
166 |
+
open(os.path.join(ckpt_path, "ckpts.json"), "r")
|
167 |
+
)
|
168 |
+
elif args.resume_from_ckpt_path and args.resume_from_ckpt_path != "":
|
169 |
+
## Resume from the given checkpoint path
|
170 |
+
if not os.path.exists(args.resume_from_ckpt_path):
|
171 |
+
raise ValueError(
|
172 |
+
"[Error] The resumed checkpoint path {} don't exist.".format(
|
173 |
+
args.resume_from_ckpt_path
|
174 |
+
)
|
175 |
+
)
|
176 |
+
|
177 |
+
self.logger.info(
|
178 |
+
"Resuming from {}...".format(args.resume_from_ckpt_path)
|
179 |
+
)
|
180 |
+
start = time.monotonic_ns()
|
181 |
+
ckpt_path = self.__load_model(
|
182 |
+
checkpoint_path=args.resume_from_ckpt_path,
|
183 |
+
resume_type=args.resume_type,
|
184 |
+
)
|
185 |
+
end = time.monotonic_ns()
|
186 |
+
self.logger.info(
|
187 |
+
f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
|
188 |
+
)
|
189 |
+
|
190 |
+
# save config file path
|
191 |
+
self.config_save_path = os.path.join(self.exp_dir, "args.json")
|
192 |
+
|
193 |
+
### Following are abstract methods that should be implemented in child classes ###
|
194 |
+
@abstractmethod
|
195 |
+
def _build_dataset(self):
|
196 |
+
r"""Build dataset for model training/validating/evaluating."""
|
197 |
+
pass
|
198 |
+
|
199 |
+
@staticmethod
|
200 |
+
@abstractmethod
|
201 |
+
def _build_criterion():
|
202 |
+
r"""Build criterion function for model loss calculation."""
|
203 |
+
pass
|
204 |
+
|
205 |
+
@abstractmethod
|
206 |
+
def _build_model(self):
|
207 |
+
r"""Build model for training/validating/evaluating."""
|
208 |
+
pass
|
209 |
+
|
210 |
+
@abstractmethod
|
211 |
+
def _forward_step(self, batch):
|
212 |
+
r"""One forward step of the neural network. This abstract method is trying to
|
213 |
+
unify ``_train_step`` and ``_valid_step`` and avoid redundant implementation.
|
214 |
+
However, for special case that using different forward step pattern for
|
215 |
+
training and validating, you could just override this method with ``pass`` and
|
216 |
+
implement ``_train_step`` and ``_valid_step`` separately.
|
217 |
+
"""
|
218 |
+
pass
|
219 |
+
|
220 |
+
@abstractmethod
|
221 |
+
def _save_auxiliary_states(self):
|
222 |
+
r"""To save some auxiliary states when saving model's ckpt"""
|
223 |
+
pass
|
224 |
+
|
225 |
+
### Abstract methods end ###
|
226 |
+
|
227 |
+
### THIS IS MAIN ENTRY ###
|
228 |
+
def train_loop(self):
|
229 |
+
r"""Training loop. The public entry of training process."""
|
230 |
+
# Wait everyone to prepare before we move on
|
231 |
+
self.accelerator.wait_for_everyone()
|
232 |
+
# dump config file
|
233 |
+
if self.accelerator.is_main_process:
|
234 |
+
self.__dump_cfg(self.config_save_path)
|
235 |
+
self.model.train()
|
236 |
+
self.optimizer.zero_grad()
|
237 |
+
# Wait to ensure good to go
|
238 |
+
self.accelerator.wait_for_everyone()
|
239 |
+
while self.epoch < self.max_epoch:
|
240 |
+
self.logger.info("\n")
|
241 |
+
self.logger.info("-" * 32)
|
242 |
+
self.logger.info("Epoch {}: ".format(self.epoch))
|
243 |
+
|
244 |
+
### TODO: change the return values of _train_epoch() to a loss dict, or (total_loss, loss_dict)
|
245 |
+
### It's inconvenient for the model with multiple losses
|
246 |
+
# Do training & validating epoch
|
247 |
+
train_loss = self._train_epoch()
|
248 |
+
self.logger.info(" |- Train/Loss: {:.6f}".format(train_loss))
|
249 |
+
valid_loss = self._valid_epoch()
|
250 |
+
self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_loss))
|
251 |
+
self.accelerator.log(
|
252 |
+
{"Epoch/Train Loss": train_loss, "Epoch/Valid Loss": valid_loss},
|
253 |
+
step=self.epoch,
|
254 |
+
)
|
255 |
+
|
256 |
+
self.accelerator.wait_for_everyone()
|
257 |
+
# TODO: what is scheduler?
|
258 |
+
self.scheduler.step(valid_loss) # FIXME: use epoch track correct?
|
259 |
+
|
260 |
+
# Check if hit save_checkpoint_stride and run_eval
|
261 |
+
run_eval = False
|
262 |
+
if self.accelerator.is_main_process:
|
263 |
+
save_checkpoint = False
|
264 |
+
hit_dix = []
|
265 |
+
for i, num in enumerate(self.save_checkpoint_stride):
|
266 |
+
if self.epoch % num == 0:
|
267 |
+
save_checkpoint = True
|
268 |
+
hit_dix.append(i)
|
269 |
+
run_eval |= self.run_eval[i]
|
270 |
+
|
271 |
+
self.accelerator.wait_for_everyone()
|
272 |
+
if self.accelerator.is_main_process and save_checkpoint:
|
273 |
+
path = os.path.join(
|
274 |
+
self.checkpoint_dir,
|
275 |
+
"epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
|
276 |
+
self.epoch, self.step, train_loss
|
277 |
+
),
|
278 |
+
)
|
279 |
+
self.tmp_checkpoint_save_path = path
|
280 |
+
self.accelerator.save_state(path)
|
281 |
+
print(f"save checkpoint in {path}")
|
282 |
+
json.dump(
|
283 |
+
self.checkpoints_path,
|
284 |
+
open(os.path.join(path, "ckpts.json"), "w"),
|
285 |
+
ensure_ascii=False,
|
286 |
+
indent=4,
|
287 |
+
)
|
288 |
+
self._save_auxiliary_states()
|
289 |
+
|
290 |
+
# Remove old checkpoints
|
291 |
+
to_remove = []
|
292 |
+
for idx in hit_dix:
|
293 |
+
self.checkpoints_path[idx].append(path)
|
294 |
+
while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
|
295 |
+
to_remove.append((idx, self.checkpoints_path[idx].pop(0)))
|
296 |
+
|
297 |
+
# Search conflicts
|
298 |
+
total = set()
|
299 |
+
for i in self.checkpoints_path:
|
300 |
+
total |= set(i)
|
301 |
+
do_remove = set()
|
302 |
+
for idx, path in to_remove[::-1]:
|
303 |
+
if path in total:
|
304 |
+
self.checkpoints_path[idx].insert(0, path)
|
305 |
+
else:
|
306 |
+
do_remove.add(path)
|
307 |
+
|
308 |
+
# Remove old checkpoints
|
309 |
+
for path in do_remove:
|
310 |
+
shutil.rmtree(path, ignore_errors=True)
|
311 |
+
self.logger.debug(f"Remove old checkpoint: {path}")
|
312 |
+
|
313 |
+
self.accelerator.wait_for_everyone()
|
314 |
+
if run_eval:
|
315 |
+
# TODO: run evaluation
|
316 |
+
pass
|
317 |
+
|
318 |
+
# Update info for each epoch
|
319 |
+
self.epoch += 1
|
320 |
+
|
321 |
+
# Finish training and save final checkpoint
|
322 |
+
self.accelerator.wait_for_everyone()
|
323 |
+
if self.accelerator.is_main_process:
|
324 |
+
self.accelerator.save_state(
|
325 |
+
os.path.join(
|
326 |
+
self.checkpoint_dir,
|
327 |
+
"final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
|
328 |
+
self.epoch, self.step, valid_loss
|
329 |
+
),
|
330 |
+
)
|
331 |
+
)
|
332 |
+
self._save_auxiliary_states()
|
333 |
+
|
334 |
+
self.accelerator.end_training()
|
335 |
+
|
336 |
+
### Following are methods that can be used directly in child classes ###
|
337 |
+
def _train_epoch(self):
|
338 |
+
r"""Training epoch. Should return average loss of a batch (sample) over
|
339 |
+
one epoch. See ``train_loop`` for usage.
|
340 |
+
"""
|
341 |
+
self.model.train()
|
342 |
+
epoch_sum_loss: float = 0.0
|
343 |
+
epoch_step: int = 0
|
344 |
+
for batch in tqdm(
|
345 |
+
self.train_dataloader,
|
346 |
+
desc=f"Training Epoch {self.epoch}",
|
347 |
+
unit="batch",
|
348 |
+
colour="GREEN",
|
349 |
+
leave=False,
|
350 |
+
dynamic_ncols=True,
|
351 |
+
smoothing=0.04,
|
352 |
+
disable=not self.accelerator.is_main_process,
|
353 |
+
):
|
354 |
+
# Do training step and BP
|
355 |
+
with self.accelerator.accumulate(self.model):
|
356 |
+
loss = self._train_step(batch)
|
357 |
+
self.accelerator.backward(loss)
|
358 |
+
self.optimizer.step()
|
359 |
+
self.optimizer.zero_grad()
|
360 |
+
self.batch_count += 1
|
361 |
+
|
362 |
+
# Update info for each step
|
363 |
+
# TODO: step means BP counts or batch counts?
|
364 |
+
if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
|
365 |
+
epoch_sum_loss += loss
|
366 |
+
self.accelerator.log(
|
367 |
+
{
|
368 |
+
"Step/Train Loss": loss,
|
369 |
+
"Step/Learning Rate": self.optimizer.param_groups[0]["lr"],
|
370 |
+
},
|
371 |
+
step=self.step,
|
372 |
+
)
|
373 |
+
self.step += 1
|
374 |
+
epoch_step += 1
|
375 |
+
|
376 |
+
self.accelerator.wait_for_everyone()
|
377 |
+
return (
|
378 |
+
epoch_sum_loss
|
379 |
+
/ len(self.train_dataloader)
|
380 |
+
* self.cfg.train.gradient_accumulation_step
|
381 |
+
)
|
382 |
+
|
383 |
+
@torch.inference_mode()
|
384 |
+
def _valid_epoch(self):
|
385 |
+
r"""Testing epoch. Should return average loss of a batch (sample) over
|
386 |
+
one epoch. See ``train_loop`` for usage.
|
387 |
+
"""
|
388 |
+
self.model.eval()
|
389 |
+
epoch_sum_loss = 0.0
|
390 |
+
for batch in tqdm(
|
391 |
+
self.valid_dataloader,
|
392 |
+
desc=f"Validating Epoch {self.epoch}",
|
393 |
+
unit="batch",
|
394 |
+
colour="GREEN",
|
395 |
+
leave=False,
|
396 |
+
dynamic_ncols=True,
|
397 |
+
smoothing=0.04,
|
398 |
+
disable=not self.accelerator.is_main_process,
|
399 |
+
):
|
400 |
+
batch_loss = self._valid_step(batch)
|
401 |
+
epoch_sum_loss += batch_loss.item()
|
402 |
+
|
403 |
+
self.accelerator.wait_for_everyone()
|
404 |
+
return epoch_sum_loss / len(self.valid_dataloader)
|
405 |
+
|
406 |
+
def _train_step(self, batch):
|
407 |
+
r"""Training forward step. Should return average loss of a sample over
|
408 |
+
one batch. Invoking ``_forward_step`` is recommended except for special cases.
|
409 |
+
See ``_train_epoch`` for usage.
|
410 |
+
"""
|
411 |
+
return self._forward_step(batch)
|
412 |
+
|
413 |
+
@torch.inference_mode()
|
414 |
+
def _valid_step(self, batch):
|
415 |
+
r"""Testing forward step. Should return average loss of a sample over
|
416 |
+
one batch. Invoking ``_forward_step`` is recommended except for special cases.
|
417 |
+
See ``_valid_epoch`` for usage.
|
418 |
+
"""
|
419 |
+
return self._forward_step(batch)
|
420 |
+
|
421 |
+
def __load_model(
|
422 |
+
self,
|
423 |
+
checkpoint_dir: str = None,
|
424 |
+
checkpoint_path: str = None,
|
425 |
+
resume_type: str = "",
|
426 |
+
):
|
427 |
+
r"""Load model from checkpoint. If checkpoint_path is None, it will
|
428 |
+
load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
|
429 |
+
None, it will load the checkpoint specified by checkpoint_path. **Only use this
|
430 |
+
method after** ``accelerator.prepare()``.
|
431 |
+
"""
|
432 |
+
if checkpoint_path is None:
|
433 |
+
ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
|
434 |
+
ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
|
435 |
+
checkpoint_path = ls[0]
|
436 |
+
self.logger.info("Resume from {}...".format(checkpoint_path))
|
437 |
+
|
438 |
+
if resume_type in ["resume", ""]:
|
439 |
+
# Load all the things, including model weights, optimizer, scheduler, and random states.
|
440 |
+
self.accelerator.load_state(input_dir=checkpoint_path)
|
441 |
+
|
442 |
+
# set epoch and step
|
443 |
+
self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
|
444 |
+
self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
|
445 |
+
|
446 |
+
elif resume_type == "finetune":
|
447 |
+
# Load only the model weights
|
448 |
+
accelerate.load_checkpoint_and_dispatch(
|
449 |
+
self.accelerator.unwrap_model(self.model),
|
450 |
+
os.path.join(checkpoint_path, "pytorch_model.bin"),
|
451 |
+
)
|
452 |
+
self.logger.info("Load model weights for finetune...")
|
453 |
+
|
454 |
+
else:
|
455 |
+
raise ValueError("Resume_type must be `resume` or `finetune`.")
|
456 |
+
|
457 |
+
return checkpoint_path
|
458 |
+
|
459 |
+
# TODO: LEGACY CODE
|
460 |
+
def _build_dataloader(self):
|
461 |
+
Dataset, Collator = self._build_dataset()
|
462 |
+
|
463 |
+
# build dataset instance for each dataset and combine them by ConcatDataset
|
464 |
+
datasets_list = []
|
465 |
+
for dataset in self.cfg.dataset:
|
466 |
+
subdataset = Dataset(self.cfg, dataset, is_valid=False)
|
467 |
+
datasets_list.append(subdataset)
|
468 |
+
train_dataset = ConcatDataset(datasets_list)
|
469 |
+
train_collate = Collator(self.cfg)
|
470 |
+
_, batch_sampler = build_samplers(train_dataset, self.cfg, self.logger, "train")
|
471 |
+
self.logger.debug(f"train batch_sampler: {list(batch_sampler)}")
|
472 |
+
self.logger.debug(f"length: {train_dataset.cumulative_sizes}")
|
473 |
+
# TODO: use config instead of (sampler, shuffle, drop_last, batch_size)
|
474 |
+
train_loader = DataLoader(
|
475 |
+
train_dataset,
|
476 |
+
collate_fn=train_collate,
|
477 |
+
batch_sampler=batch_sampler,
|
478 |
+
num_workers=self.cfg.train.dataloader.num_worker,
|
479 |
+
pin_memory=self.cfg.train.dataloader.pin_memory,
|
480 |
+
)
|
481 |
+
|
482 |
+
# Build valid dataloader
|
483 |
+
datasets_list = []
|
484 |
+
for dataset in self.cfg.dataset:
|
485 |
+
subdataset = Dataset(self.cfg, dataset, is_valid=True)
|
486 |
+
datasets_list.append(subdataset)
|
487 |
+
valid_dataset = ConcatDataset(datasets_list)
|
488 |
+
valid_collate = Collator(self.cfg)
|
489 |
+
_, batch_sampler = build_samplers(valid_dataset, self.cfg, self.logger, "valid")
|
490 |
+
self.logger.debug(f"valid batch_sampler: {list(batch_sampler)}")
|
491 |
+
self.logger.debug(f"length: {valid_dataset.cumulative_sizes}")
|
492 |
+
valid_loader = DataLoader(
|
493 |
+
valid_dataset,
|
494 |
+
collate_fn=valid_collate,
|
495 |
+
batch_sampler=batch_sampler,
|
496 |
+
num_workers=self.cfg.train.dataloader.num_worker,
|
497 |
+
pin_memory=self.cfg.train.dataloader.pin_memory,
|
498 |
+
)
|
499 |
+
return train_loader, valid_loader
|
500 |
+
|
501 |
+
@staticmethod
|
502 |
+
def _set_random_seed(seed):
|
503 |
+
r"""Set random seed for all possible random modules."""
|
504 |
+
random.seed(seed)
|
505 |
+
np.random.seed(seed)
|
506 |
+
torch.random.manual_seed(seed)
|
507 |
+
|
508 |
+
def _check_nan(self, loss, y_pred, y_gt):
|
509 |
+
if torch.any(torch.isnan(loss)):
|
510 |
+
self.logger.fatal("Fatal Error: Training stopped because the loss is NaN!")
|
511 |
+
self.logger.error("loss = {:.6f}".format(loss.item()), in_order=True)
|
512 |
+
if torch.any(torch.isnan(y_pred)):
|
513 |
+
self.logger.error(
|
514 |
+
f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
|
515 |
+
)
|
516 |
+
else:
|
517 |
+
self.logger.debug(
|
518 |
+
f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
|
519 |
+
)
|
520 |
+
if torch.any(torch.isnan(y_gt)):
|
521 |
+
self.logger.error(
|
522 |
+
f"y_gt has Nan: {torch.any(torch.isnan(y_gt))}", in_order=True
|
523 |
+
)
|
524 |
+
else:
|
525 |
+
self.logger.debug(
|
526 |
+
f"y_gt has nan: {torch.any(torch.isnan(y_gt))}", in_order=True
|
527 |
+
)
|
528 |
+
if torch.any(torch.isnan(y_pred)):
|
529 |
+
self.logger.error(f"y_pred: {y_pred}", in_order=True)
|
530 |
+
else:
|
531 |
+
self.logger.debug(f"y_pred: {y_pred}", in_order=True)
|
532 |
+
if torch.any(torch.isnan(y_gt)):
|
533 |
+
self.logger.error(f"y_gt: {y_gt}", in_order=True)
|
534 |
+
else:
|
535 |
+
self.logger.debug(f"y_gt: {y_gt}", in_order=True)
|
536 |
+
|
537 |
+
# TODO: still OK to save tracking?
|
538 |
+
self.accelerator.end_training()
|
539 |
+
raise RuntimeError("Loss has Nan! See log for more info.")
|
540 |
+
|
541 |
+
### Protected methods end ###
|
542 |
+
|
543 |
+
## Following are private methods ##
|
544 |
+
## !!! These are inconvenient for GAN-based model training. It'd be better to move these to svc_trainer.py if needed.
|
545 |
+
def __build_optimizer(self):
|
546 |
+
r"""Build optimizer for model."""
|
547 |
+
# Make case-insensitive matching
|
548 |
+
if self.cfg.train.optimizer.lower() == "adadelta":
|
549 |
+
optimizer = torch.optim.Adadelta(
|
550 |
+
self.model.parameters(), **self.cfg.train.adadelta
|
551 |
+
)
|
552 |
+
self.logger.info("Using Adadelta optimizer.")
|
553 |
+
elif self.cfg.train.optimizer.lower() == "adagrad":
|
554 |
+
optimizer = torch.optim.Adagrad(
|
555 |
+
self.model.parameters(), **self.cfg.train.adagrad
|
556 |
+
)
|
557 |
+
self.logger.info("Using Adagrad optimizer.")
|
558 |
+
elif self.cfg.train.optimizer.lower() == "adam":
|
559 |
+
optimizer = torch.optim.Adam(self.model.parameters(), **self.cfg.train.adam)
|
560 |
+
self.logger.info("Using Adam optimizer.")
|
561 |
+
elif self.cfg.train.optimizer.lower() == "adamw":
|
562 |
+
optimizer = torch.optim.AdamW(
|
563 |
+
self.model.parameters(), **self.cfg.train.adamw
|
564 |
+
)
|
565 |
+
elif self.cfg.train.optimizer.lower() == "sparseadam":
|
566 |
+
optimizer = torch.optim.SparseAdam(
|
567 |
+
self.model.parameters(), **self.cfg.train.sparseadam
|
568 |
+
)
|
569 |
+
elif self.cfg.train.optimizer.lower() == "adamax":
|
570 |
+
optimizer = torch.optim.Adamax(
|
571 |
+
self.model.parameters(), **self.cfg.train.adamax
|
572 |
+
)
|
573 |
+
elif self.cfg.train.optimizer.lower() == "asgd":
|
574 |
+
optimizer = torch.optim.ASGD(self.model.parameters(), **self.cfg.train.asgd)
|
575 |
+
elif self.cfg.train.optimizer.lower() == "lbfgs":
|
576 |
+
optimizer = torch.optim.LBFGS(
|
577 |
+
self.model.parameters(), **self.cfg.train.lbfgs
|
578 |
+
)
|
579 |
+
elif self.cfg.train.optimizer.lower() == "nadam":
|
580 |
+
optimizer = torch.optim.NAdam(
|
581 |
+
self.model.parameters(), **self.cfg.train.nadam
|
582 |
+
)
|
583 |
+
elif self.cfg.train.optimizer.lower() == "radam":
|
584 |
+
optimizer = torch.optim.RAdam(
|
585 |
+
self.model.parameters(), **self.cfg.train.radam
|
586 |
+
)
|
587 |
+
elif self.cfg.train.optimizer.lower() == "rmsprop":
|
588 |
+
optimizer = torch.optim.RMSprop(
|
589 |
+
self.model.parameters(), **self.cfg.train.rmsprop
|
590 |
+
)
|
591 |
+
elif self.cfg.train.optimizer.lower() == "rprop":
|
592 |
+
optimizer = torch.optim.Rprop(
|
593 |
+
self.model.parameters(), **self.cfg.train.rprop
|
594 |
+
)
|
595 |
+
elif self.cfg.train.optimizer.lower() == "sgd":
|
596 |
+
optimizer = torch.optim.SGD(self.model.parameters(), **self.cfg.train.sgd)
|
597 |
+
else:
|
598 |
+
raise NotImplementedError(
|
599 |
+
f"Optimizer {self.cfg.train.optimizer} not supported yet!"
|
600 |
+
)
|
601 |
+
return optimizer
|
602 |
+
|
603 |
+
def __build_scheduler(self):
|
604 |
+
r"""Build scheduler for optimizer."""
|
605 |
+
# Make case-insensitive matching
|
606 |
+
if self.cfg.train.scheduler.lower() == "lambdalr":
|
607 |
+
scheduler = torch.optim.lr_scheduler.LambdaLR(
|
608 |
+
self.optimizer, **self.cfg.train.lambdalr
|
609 |
+
)
|
610 |
+
elif self.cfg.train.scheduler.lower() == "multiplicativelr":
|
611 |
+
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
|
612 |
+
self.optimizer, **self.cfg.train.multiplicativelr
|
613 |
+
)
|
614 |
+
elif self.cfg.train.scheduler.lower() == "steplr":
|
615 |
+
scheduler = torch.optim.lr_scheduler.StepLR(
|
616 |
+
self.optimizer, **self.cfg.train.steplr
|
617 |
+
)
|
618 |
+
elif self.cfg.train.scheduler.lower() == "multisteplr":
|
619 |
+
scheduler = torch.optim.lr_scheduler.MultiStepLR(
|
620 |
+
self.optimizer, **self.cfg.train.multisteplr
|
621 |
+
)
|
622 |
+
elif self.cfg.train.scheduler.lower() == "constantlr":
|
623 |
+
scheduler = torch.optim.lr_scheduler.ConstantLR(
|
624 |
+
self.optimizer, **self.cfg.train.constantlr
|
625 |
+
)
|
626 |
+
elif self.cfg.train.scheduler.lower() == "linearlr":
|
627 |
+
scheduler = torch.optim.lr_scheduler.LinearLR(
|
628 |
+
self.optimizer, **self.cfg.train.linearlr
|
629 |
+
)
|
630 |
+
elif self.cfg.train.scheduler.lower() == "exponentiallr":
|
631 |
+
scheduler = torch.optim.lr_scheduler.ExponentialLR(
|
632 |
+
self.optimizer, **self.cfg.train.exponentiallr
|
633 |
+
)
|
634 |
+
elif self.cfg.train.scheduler.lower() == "polynomiallr":
|
635 |
+
scheduler = torch.optim.lr_scheduler.PolynomialLR(
|
636 |
+
self.optimizer, **self.cfg.train.polynomiallr
|
637 |
+
)
|
638 |
+
elif self.cfg.train.scheduler.lower() == "cosineannealinglr":
|
639 |
+
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
|
640 |
+
self.optimizer, **self.cfg.train.cosineannealinglr
|
641 |
+
)
|
642 |
+
elif self.cfg.train.scheduler.lower() == "sequentiallr":
|
643 |
+
scheduler = torch.optim.lr_scheduler.SequentialLR(
|
644 |
+
self.optimizer, **self.cfg.train.sequentiallr
|
645 |
+
)
|
646 |
+
elif self.cfg.train.scheduler.lower() == "reducelronplateau":
|
647 |
+
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
648 |
+
self.optimizer, **self.cfg.train.reducelronplateau
|
649 |
+
)
|
650 |
+
elif self.cfg.train.scheduler.lower() == "cycliclr":
|
651 |
+
scheduler = torch.optim.lr_scheduler.CyclicLR(
|
652 |
+
self.optimizer, **self.cfg.train.cycliclr
|
653 |
+
)
|
654 |
+
elif self.cfg.train.scheduler.lower() == "onecyclelr":
|
655 |
+
scheduler = torch.optim.lr_scheduler.OneCycleLR(
|
656 |
+
self.optimizer, **self.cfg.train.onecyclelr
|
657 |
+
)
|
658 |
+
elif self.cfg.train.scheduler.lower() == "cosineannearingwarmrestarts":
|
659 |
+
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
|
660 |
+
self.optimizer, **self.cfg.train.cosineannearingwarmrestarts
|
661 |
+
)
|
662 |
+
elif self.cfg.train.scheduler.lower() == "noamlr":
|
663 |
+
scheduler = NoamLR(self.optimizer, **self.cfg.train.lr_scheduler)
|
664 |
+
else:
|
665 |
+
raise NotImplementedError(
|
666 |
+
f"Scheduler {self.cfg.train.scheduler} not supported yet!"
|
667 |
+
)
|
668 |
+
return scheduler
|
669 |
+
|
670 |
+
def _init_accelerator(self):
|
671 |
+
self.exp_dir = os.path.join(
|
672 |
+
os.path.abspath(self.cfg.log_dir), self.args.exp_name
|
673 |
+
)
|
674 |
+
project_config = ProjectConfiguration(
|
675 |
+
project_dir=self.exp_dir,
|
676 |
+
logging_dir=os.path.join(self.exp_dir, "log"),
|
677 |
+
)
|
678 |
+
self.accelerator = accelerate.Accelerator(
|
679 |
+
gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
|
680 |
+
log_with=self.cfg.train.tracker,
|
681 |
+
project_config=project_config,
|
682 |
+
)
|
683 |
+
if self.accelerator.is_main_process:
|
684 |
+
os.makedirs(project_config.project_dir, exist_ok=True)
|
685 |
+
os.makedirs(project_config.logging_dir, exist_ok=True)
|
686 |
+
with self.accelerator.main_process_first():
|
687 |
+
self.accelerator.init_trackers(self.args.exp_name)
|
688 |
+
|
689 |
+
def __check_basic_configs(self):
|
690 |
+
if self.cfg.train.gradient_accumulation_step <= 0:
|
691 |
+
self.logger.fatal("Invalid gradient_accumulation_step value!")
|
692 |
+
self.logger.error(
|
693 |
+
f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
|
694 |
+
)
|
695 |
+
self.accelerator.end_training()
|
696 |
+
raise ValueError(
|
697 |
+
f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
|
698 |
+
)
|
699 |
+
# TODO: check other values
|
700 |
+
|
701 |
+
@staticmethod
|
702 |
+
def __count_parameters(model):
|
703 |
+
model_param = 0.0
|
704 |
+
if isinstance(model, dict):
|
705 |
+
for key, value in model.items():
|
706 |
+
model_param += sum(p.numel() for p in model[key].parameters())
|
707 |
+
else:
|
708 |
+
model_param = sum(p.numel() for p in model.parameters())
|
709 |
+
return model_param
|
710 |
+
|
711 |
+
def __dump_cfg(self, path):
|
712 |
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
713 |
+
json5.dump(
|
714 |
+
self.cfg,
|
715 |
+
open(path, "w"),
|
716 |
+
indent=4,
|
717 |
+
sort_keys=True,
|
718 |
+
ensure_ascii=False,
|
719 |
+
quote_keys=True,
|
720 |
+
)
|
721 |
+
|
722 |
+
### Private methods end ###
|
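A reading aid for __build_optimizer and __build_scheduler above: the optimizer and scheduler are selected by a case-insensitive name in cfg.train.optimizer / cfg.train.scheduler, and the keyword arguments are passed through verbatim from the config section of the same lowercased name (e.g. cfg.train.adamw, cfg.train.reducelronplateau). The sketch below shows the calls such a config resolves to; the numeric values are made-up examples, not defaults from this PR.

# Illustrative only: with cfg.train.optimizer == "AdamW" and cfg.train.adamw
# holding {"lr": 4.0e-4, "betas": [0.9, 0.98]}, and cfg.train.scheduler ==
# "ReduceLROnPlateau" with cfg.train.reducelronplateau == {"factor": 0.8,
# "patience": 10}, the builders above reduce to these two calls.
import torch

model = torch.nn.Linear(8, 8)  # stand-in for the real acoustic model
optimizer = torch.optim.AdamW(model.parameters(), lr=4.0e-4, betas=(0.9, 0.98))
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.8, patience=10
)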
models/svc/__init__.py
ADDED
File without changes
|
models/svc/base/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .svc_inference import SVCInference
from .svc_trainer import SVCTrainer
models/svc/base/svc_dataset.py
ADDED
@@ -0,0 +1,425 @@
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import random
|
7 |
+
import torch
|
8 |
+
from torch.nn.utils.rnn import pad_sequence
|
9 |
+
import json
|
10 |
+
import os
|
11 |
+
import numpy as np
|
12 |
+
from utils.data_utils import *
|
13 |
+
from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema
|
14 |
+
from processors.content_extractor import (
|
15 |
+
ContentvecExtractor,
|
16 |
+
WhisperExtractor,
|
17 |
+
WenetExtractor,
|
18 |
+
)
|
19 |
+
from models.base.base_dataset import (
|
20 |
+
BaseCollator,
|
21 |
+
BaseDataset,
|
22 |
+
)
|
23 |
+
from models.base.new_dataset import BaseTestDataset
|
24 |
+
|
25 |
+
EPS = 1.0e-12
|
26 |
+
|
27 |
+
|
28 |
+
class SVCDataset(BaseDataset):
|
29 |
+
def __init__(self, cfg, dataset, is_valid=False):
|
30 |
+
BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid)
|
31 |
+
|
32 |
+
cfg = self.cfg
|
33 |
+
|
34 |
+
if cfg.model.condition_encoder.use_whisper:
|
35 |
+
self.whisper_aligner = WhisperExtractor(self.cfg)
|
36 |
+
self.utt2whisper_path = load_content_feature_path(
|
37 |
+
self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
|
38 |
+
)
|
39 |
+
|
40 |
+
if cfg.model.condition_encoder.use_contentvec:
|
41 |
+
self.contentvec_aligner = ContentvecExtractor(self.cfg)
|
42 |
+
self.utt2contentVec_path = load_content_feature_path(
|
43 |
+
self.metadata,
|
44 |
+
cfg.preprocess.processed_dir,
|
45 |
+
cfg.preprocess.contentvec_dir,
|
46 |
+
)
|
47 |
+
|
48 |
+
if cfg.model.condition_encoder.use_mert:
|
49 |
+
self.utt2mert_path = load_content_feature_path(
|
50 |
+
self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
|
51 |
+
)
|
52 |
+
if cfg.model.condition_encoder.use_wenet:
|
53 |
+
self.wenet_aligner = WenetExtractor(self.cfg)
|
54 |
+
self.utt2wenet_path = load_content_feature_path(
|
55 |
+
self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
|
56 |
+
)
|
57 |
+
|
58 |
+
def __getitem__(self, index):
|
59 |
+
single_feature = BaseDataset.__getitem__(self, index)
|
60 |
+
|
61 |
+
utt_info = self.metadata[index]
|
62 |
+
dataset = utt_info["Dataset"]
|
63 |
+
uid = utt_info["Uid"]
|
64 |
+
utt = "{}_{}".format(dataset, uid)
|
65 |
+
|
66 |
+
if self.cfg.model.condition_encoder.use_whisper:
|
67 |
+
assert "target_len" in single_feature.keys()
|
68 |
+
aligned_whisper_feat = self.whisper_aligner.offline_align(
|
69 |
+
np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
|
70 |
+
)
|
71 |
+
single_feature["whisper_feat"] = aligned_whisper_feat
|
72 |
+
|
73 |
+
if self.cfg.model.condition_encoder.use_contentvec:
|
74 |
+
assert "target_len" in single_feature.keys()
|
75 |
+
aligned_contentvec = self.contentvec_aligner.offline_align(
|
76 |
+
np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
|
77 |
+
)
|
78 |
+
single_feature["contentvec_feat"] = aligned_contentvec
|
79 |
+
|
80 |
+
if self.cfg.model.condition_encoder.use_mert:
|
81 |
+
assert "target_len" in single_feature.keys()
|
82 |
+
aligned_mert_feat = align_content_feature_length(
|
83 |
+
np.load(self.utt2mert_path[utt]),
|
84 |
+
single_feature["target_len"],
|
85 |
+
source_hop=self.cfg.preprocess.mert_hop_size,
|
86 |
+
)
|
87 |
+
single_feature["mert_feat"] = aligned_mert_feat
|
88 |
+
|
89 |
+
if self.cfg.model.condition_encoder.use_wenet:
|
90 |
+
assert "target_len" in single_feature.keys()
|
91 |
+
aligned_wenet_feat = self.wenet_aligner.offline_align(
|
92 |
+
np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
|
93 |
+
)
|
94 |
+
single_feature["wenet_feat"] = aligned_wenet_feat
|
95 |
+
|
96 |
+
# print(single_feature.keys())
|
97 |
+
# for k, v in single_feature.items():
|
98 |
+
# if type(v) in [torch.Tensor, np.ndarray]:
|
99 |
+
# print(k, v.shape)
|
100 |
+
# else:
|
101 |
+
# print(k, v)
|
102 |
+
# exit()
|
103 |
+
|
104 |
+
return self.clip_if_too_long(single_feature)
|
105 |
+
|
106 |
+
def __len__(self):
|
107 |
+
return len(self.metadata)
|
108 |
+
|
109 |
+
def random_select(self, feature_seq_len, max_seq_len, ending_ts=2812):
|
110 |
+
"""
|
111 |
+
ending_ts: to avoid invalid whisper features for over 30s audios
|
112 |
+
2812 = 30 * 24000 // 256
|
113 |
+
"""
|
114 |
+
ts = max(feature_seq_len - max_seq_len, 0)
|
115 |
+
ts = min(ts, ending_ts - max_seq_len)
|
116 |
+
|
117 |
+
start = random.randint(0, ts)
|
118 |
+
end = start + max_seq_len
|
119 |
+
return start, end
|
120 |
+
|
121 |
+
def clip_if_too_long(self, sample, max_seq_len=512):
|
122 |
+
"""
|
123 |
+
sample :
|
124 |
+
{
|
125 |
+
'spk_id': (1,),
|
126 |
+
'target_len': int
|
127 |
+
'mel': (seq_len, dim),
|
128 |
+
'frame_pitch': (seq_len,)
|
129 |
+
'frame_energy': (seq_len,)
|
130 |
+
'content_vector_feat': (seq_len, dim)
|
131 |
+
}
|
132 |
+
"""
|
133 |
+
if sample["target_len"] <= max_seq_len:
|
134 |
+
return sample
|
135 |
+
|
136 |
+
start, end = self.random_select(sample["target_len"], max_seq_len)
|
137 |
+
sample["target_len"] = end - start
|
138 |
+
|
139 |
+
for k in sample.keys():
|
140 |
+
if k not in ["spk_id", "target_len"]:
|
141 |
+
sample[k] = sample[k][start:end]
|
142 |
+
|
143 |
+
return sample
|
144 |
+
|
145 |
+
|
146 |
+
class SVCCollator(BaseCollator):
|
147 |
+
"""Zero-pads model inputs and targets based on number of frames per step"""
|
148 |
+
|
149 |
+
def __init__(self, cfg):
|
150 |
+
BaseCollator.__init__(self, cfg)
|
151 |
+
|
152 |
+
def __call__(self, batch):
|
153 |
+
parsed_batch_features = BaseCollator.__call__(self, batch)
|
154 |
+
return parsed_batch_features
|
155 |
+
|
156 |
+
|
157 |
+
class SVCTestDataset(BaseTestDataset):
|
158 |
+
def __init__(self, args, cfg, infer_type):
|
159 |
+
BaseTestDataset.__init__(self, args, cfg, infer_type)
|
160 |
+
self.metadata = self.get_metadata()
|
161 |
+
|
162 |
+
target_singer = args.target_singer
|
163 |
+
self.cfg = cfg
|
164 |
+
self.trans_key = args.trans_key
|
165 |
+
assert type(target_singer) == str
|
166 |
+
|
167 |
+
self.target_singer = target_singer.split("_")[-1]
|
168 |
+
self.target_dataset = target_singer.replace(
|
169 |
+
"_{}".format(self.target_singer), ""
|
170 |
+
)
|
171 |
+
|
172 |
+
self.target_mel_extrema = load_mel_extrema(cfg.preprocess, self.target_dataset)
|
173 |
+
self.target_mel_extrema = torch.as_tensor(
|
174 |
+
self.target_mel_extrema[0]
|
175 |
+
), torch.as_tensor(self.target_mel_extrema[1])
|
176 |
+
|
177 |
+
######### Load source acoustic features #########
|
178 |
+
if cfg.preprocess.use_spkid:
|
179 |
+
spk2id_path = os.path.join(args.acoustics_dir, cfg.preprocess.spk2id)
|
180 |
+
# utt2sp_path = os.path.join(self.data_root, cfg.preprocess.utt2spk)
|
181 |
+
|
182 |
+
with open(spk2id_path, "r") as f:
|
183 |
+
self.spk2id = json.load(f)
|
184 |
+
# print("self.spk2id", self.spk2id)
|
185 |
+
|
186 |
+
if cfg.preprocess.use_uv:
|
187 |
+
self.utt2uv_path = {
|
188 |
+
f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
|
189 |
+
cfg.preprocess.processed_dir,
|
190 |
+
utt_info["Dataset"],
|
191 |
+
cfg.preprocess.uv_dir,
|
192 |
+
utt_info["Uid"] + ".npy",
|
193 |
+
)
|
194 |
+
for utt_info in self.metadata
|
195 |
+
}
|
196 |
+
|
197 |
+
if cfg.preprocess.use_frame_pitch:
|
198 |
+
self.utt2frame_pitch_path = {
|
199 |
+
f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
|
200 |
+
cfg.preprocess.processed_dir,
|
201 |
+
utt_info["Dataset"],
|
202 |
+
cfg.preprocess.pitch_dir,
|
203 |
+
utt_info["Uid"] + ".npy",
|
204 |
+
)
|
205 |
+
for utt_info in self.metadata
|
206 |
+
}
|
207 |
+
|
208 |
+
# Target F0 median
|
209 |
+
target_f0_statistics_path = os.path.join(
|
210 |
+
cfg.preprocess.processed_dir,
|
211 |
+
self.target_dataset,
|
212 |
+
cfg.preprocess.pitch_dir,
|
213 |
+
"statistics.json",
|
214 |
+
)
|
215 |
+
self.target_pitch_median = json.load(open(target_f0_statistics_path, "r"))[
|
216 |
+
f"{self.target_dataset}_{self.target_singer}"
|
217 |
+
]["voiced_positions"]["median"]
|
218 |
+
|
219 |
+
# Source F0 median (if infer from file)
|
220 |
+
if infer_type == "from_file":
|
221 |
+
source_audio_name = cfg.inference.source_audio_name
|
222 |
+
source_f0_statistics_path = os.path.join(
|
223 |
+
cfg.preprocess.processed_dir,
|
224 |
+
source_audio_name,
|
225 |
+
cfg.preprocess.pitch_dir,
|
226 |
+
"statistics.json",
|
227 |
+
)
|
228 |
+
self.source_pitch_median = json.load(
|
229 |
+
open(source_f0_statistics_path, "r")
|
230 |
+
)[f"{source_audio_name}_{source_audio_name}"]["voiced_positions"][
|
231 |
+
"median"
|
232 |
+
]
|
233 |
+
else:
|
234 |
+
self.source_pitch_median = None
|
235 |
+
|
236 |
+
if cfg.preprocess.use_frame_energy:
|
237 |
+
self.utt2frame_energy_path = {
|
238 |
+
f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
|
239 |
+
cfg.preprocess.processed_dir,
|
240 |
+
utt_info["Dataset"],
|
241 |
+
cfg.preprocess.energy_dir,
|
242 |
+
utt_info["Uid"] + ".npy",
|
243 |
+
)
|
244 |
+
for utt_info in self.metadata
|
245 |
+
}
|
246 |
+
|
247 |
+
if cfg.preprocess.use_mel:
|
248 |
+
self.utt2mel_path = {
|
249 |
+
f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
|
250 |
+
cfg.preprocess.processed_dir,
|
251 |
+
utt_info["Dataset"],
|
252 |
+
cfg.preprocess.mel_dir,
|
253 |
+
utt_info["Uid"] + ".npy",
|
254 |
+
)
|
255 |
+
for utt_info in self.metadata
|
256 |
+
}
|
257 |
+
|
258 |
+
######### Load source content features' path #########
|
259 |
+
if cfg.model.condition_encoder.use_whisper:
|
260 |
+
self.whisper_aligner = WhisperExtractor(cfg)
|
261 |
+
self.utt2whisper_path = load_content_feature_path(
|
262 |
+
self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
|
263 |
+
)
|
264 |
+
|
265 |
+
if cfg.model.condition_encoder.use_contentvec:
|
266 |
+
self.contentvec_aligner = ContentvecExtractor(cfg)
|
267 |
+
self.utt2contentVec_path = load_content_feature_path(
|
268 |
+
self.metadata,
|
269 |
+
cfg.preprocess.processed_dir,
|
270 |
+
cfg.preprocess.contentvec_dir,
|
271 |
+
)
|
272 |
+
|
273 |
+
if cfg.model.condition_encoder.use_mert:
|
274 |
+
self.utt2mert_path = load_content_feature_path(
|
275 |
+
self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
|
276 |
+
)
|
277 |
+
if cfg.model.condition_encoder.use_wenet:
|
278 |
+
self.wenet_aligner = WenetExtractor(cfg)
|
279 |
+
self.utt2wenet_path = load_content_feature_path(
|
280 |
+
self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
|
281 |
+
)
|
282 |
+
|
283 |
+
def __getitem__(self, index):
|
284 |
+
single_feature = {}
|
285 |
+
|
286 |
+
utt_info = self.metadata[index]
|
287 |
+
dataset = utt_info["Dataset"]
|
288 |
+
uid = utt_info["Uid"]
|
289 |
+
utt = "{}_{}".format(dataset, uid)
|
290 |
+
|
291 |
+
source_dataset = self.metadata[index]["Dataset"]
|
292 |
+
|
293 |
+
if self.cfg.preprocess.use_spkid:
|
294 |
+
single_feature["spk_id"] = np.array(
|
295 |
+
[self.spk2id[f"{self.target_dataset}_{self.target_singer}"]],
|
296 |
+
dtype=np.int32,
|
297 |
+
)
|
298 |
+
|
299 |
+
######### Get Acoustic Features Item #########
|
300 |
+
if self.cfg.preprocess.use_mel:
|
301 |
+
mel = np.load(self.utt2mel_path[utt])
|
302 |
+
assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
|
303 |
+
if self.cfg.preprocess.use_min_max_norm_mel:
|
304 |
+
# mel norm
|
305 |
+
mel = cal_normalized_mel(mel, source_dataset, self.cfg.preprocess)
|
306 |
+
|
307 |
+
if "target_len" not in single_feature.keys():
|
308 |
+
single_feature["target_len"] = mel.shape[1]
|
309 |
+
single_feature["mel"] = mel.T # [T, n_mels]
|
310 |
+
|
311 |
+
if self.cfg.preprocess.use_frame_pitch:
|
312 |
+
frame_pitch_path = self.utt2frame_pitch_path[utt]
|
313 |
+
frame_pitch = np.load(frame_pitch_path)
|
314 |
+
|
315 |
+
if self.trans_key:
|
316 |
+
try:
|
317 |
+
self.trans_key = int(self.trans_key)
|
318 |
+
except:
|
319 |
+
pass
|
320 |
+
if type(self.trans_key) == int:
|
321 |
+
frame_pitch = transpose_key(frame_pitch, self.trans_key)
|
322 |
+
elif self.trans_key:
|
323 |
+
assert self.target_singer
|
324 |
+
|
325 |
+
frame_pitch = pitch_shift_to_target(
|
326 |
+
frame_pitch, self.target_pitch_median, self.source_pitch_median
|
327 |
+
)
|
328 |
+
|
329 |
+
if "target_len" not in single_feature.keys():
|
330 |
+
single_feature["target_len"] = len(frame_pitch)
|
331 |
+
aligned_frame_pitch = align_length(
|
332 |
+
frame_pitch, single_feature["target_len"]
|
333 |
+
)
|
334 |
+
single_feature["frame_pitch"] = aligned_frame_pitch
|
335 |
+
|
336 |
+
if self.cfg.preprocess.use_uv:
|
337 |
+
frame_uv_path = self.utt2uv_path[utt]
|
338 |
+
frame_uv = np.load(frame_uv_path)
|
339 |
+
aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
|
340 |
+
aligned_frame_uv = [
|
341 |
+
0 if frame_uv else 1 for frame_uv in aligned_frame_uv
|
342 |
+
]
|
343 |
+
aligned_frame_uv = np.array(aligned_frame_uv)
|
344 |
+
single_feature["frame_uv"] = aligned_frame_uv
|
345 |
+
|
346 |
+
if self.cfg.preprocess.use_frame_energy:
|
347 |
+
frame_energy_path = self.utt2frame_energy_path[utt]
|
348 |
+
frame_energy = np.load(frame_energy_path)
|
349 |
+
if "target_len" not in single_feature.keys():
|
350 |
+
single_feature["target_len"] = len(frame_energy)
|
351 |
+
aligned_frame_energy = align_length(
|
352 |
+
frame_energy, single_feature["target_len"]
|
353 |
+
)
|
354 |
+
single_feature["frame_energy"] = aligned_frame_energy
|
355 |
+
|
356 |
+
######### Get Content Features Item #########
|
357 |
+
if self.cfg.model.condition_encoder.use_whisper:
|
358 |
+
assert "target_len" in single_feature.keys()
|
359 |
+
aligned_whisper_feat = self.whisper_aligner.offline_align(
|
360 |
+
np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
|
361 |
+
)
|
362 |
+
single_feature["whisper_feat"] = aligned_whisper_feat
|
363 |
+
|
364 |
+
if self.cfg.model.condition_encoder.use_contentvec:
|
365 |
+
assert "target_len" in single_feature.keys()
|
366 |
+
aligned_contentvec = self.contentvec_aligner.offline_align(
|
367 |
+
np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
|
368 |
+
)
|
369 |
+
single_feature["contentvec_feat"] = aligned_contentvec
|
370 |
+
|
371 |
+
if self.cfg.model.condition_encoder.use_mert:
|
372 |
+
assert "target_len" in single_feature.keys()
|
373 |
+
aligned_mert_feat = align_content_feature_length(
|
374 |
+
np.load(self.utt2mert_path[utt]),
|
375 |
+
single_feature["target_len"],
|
376 |
+
source_hop=self.cfg.preprocess.mert_hop_size,
|
377 |
+
)
|
378 |
+
single_feature["mert_feat"] = aligned_mert_feat
|
379 |
+
|
380 |
+
if self.cfg.model.condition_encoder.use_wenet:
|
381 |
+
assert "target_len" in single_feature.keys()
|
382 |
+
aligned_wenet_feat = self.wenet_aligner.offline_align(
|
383 |
+
np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
|
384 |
+
)
|
385 |
+
single_feature["wenet_feat"] = aligned_wenet_feat
|
386 |
+
|
387 |
+
return single_feature
|
388 |
+
|
389 |
+
def __len__(self):
|
390 |
+
return len(self.metadata)
|
391 |
+
|
392 |
+
|
393 |
+
class SVCTestCollator:
|
394 |
+
"""Zero-pads model inputs and targets based on number of frames per step"""
|
395 |
+
|
396 |
+
def __init__(self, cfg):
|
397 |
+
self.cfg = cfg
|
398 |
+
|
399 |
+
def __call__(self, batch):
|
400 |
+
packed_batch_features = dict()
|
401 |
+
|
402 |
+
# mel: [b, T, n_mels]
|
403 |
+
# frame_pitch, frame_energy: [1, T]
|
404 |
+
# target_len: [1]
|
405 |
+
# spk_id: [b, 1]
|
406 |
+
# mask: [b, T, 1]
|
407 |
+
|
408 |
+
for key in batch[0].keys():
|
409 |
+
if key == "target_len":
|
410 |
+
packed_batch_features["target_len"] = torch.LongTensor(
|
411 |
+
[b["target_len"] for b in batch]
|
412 |
+
)
|
413 |
+
masks = [
|
414 |
+
torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
|
415 |
+
]
|
416 |
+
packed_batch_features["mask"] = pad_sequence(
|
417 |
+
masks, batch_first=True, padding_value=0
|
418 |
+
)
|
419 |
+
else:
|
420 |
+
values = [torch.from_numpy(b[key]) for b in batch]
|
421 |
+
packed_batch_features[key] = pad_sequence(
|
422 |
+
values, batch_first=True, padding_value=0
|
423 |
+
)
|
424 |
+
|
425 |
+
return packed_batch_features
|
models/svc/base/svc_inference.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from models.base.new_inference import BaseInference
from models.svc.base.svc_dataset import SVCTestCollator, SVCTestDataset


class SVCInference(BaseInference):
    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        BaseInference.__init__(self, args, cfg, infer_type)

    def _build_test_dataset(self):
        return SVCTestDataset, SVCTestCollator
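A note on the contract of _build_test_dataset: it returns the dataset and collator classes themselves, not instances. The base _build_dataloader in models/base/new_inference.py (shown earlier) instantiates them; the helper below only restates that wiring for clarity, with an illustrative function name, and adds no new behavior.

# Restates what BaseInference._build_dataloader does with the pair returned by
# _build_test_dataset; for illustration only.
from torch.utils.data import DataLoader


def build_test_dataloader(inferencer):
    dataset_cls, collator_cls = inferencer._build_test_dataset()
    test_dataset = dataset_cls(inferencer.args, inferencer.cfg, inferencer.infer_type)
    test_collate = collator_cls(inferencer.cfg)
    batch_size = min(inferencer.cfg.train.batch_size, len(test_dataset.metadata))
    return DataLoader(
        test_dataset,
        collate_fn=test_collate,
        num_workers=1,
        batch_size=batch_size,
        shuffle=False,
    )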
models/svc/base/svc_trainer.py
ADDED
@@ -0,0 +1,111 @@
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
|
12 |
+
from models.base.new_trainer import BaseTrainer
|
13 |
+
from models.svc.base.svc_dataset import SVCCollator, SVCDataset
|
14 |
+
|
15 |
+
|
16 |
+
class SVCTrainer(BaseTrainer):
|
17 |
+
r"""The base trainer for all SVC models. It inherits from BaseTrainer and implements
|
18 |
+
``build_criterion``, ``_build_dataset`` and ``_build_singer_lut`` methods. You can inherit from this
|
19 |
+
class, and implement ``_build_model``, ``_forward_step``.
|
20 |
+
"""
|
21 |
+
|
22 |
+
def __init__(self, args=None, cfg=None):
|
23 |
+
self.args = args
|
24 |
+
self.cfg = cfg
|
25 |
+
|
26 |
+
self._init_accelerator()
|
27 |
+
|
28 |
+
# Only for SVC tasks
|
29 |
+
with self.accelerator.main_process_first():
|
30 |
+
self.singers = self._build_singer_lut()
|
31 |
+
|
32 |
+
# Super init
|
33 |
+
BaseTrainer.__init__(self, args, cfg)
|
34 |
+
|
35 |
+
# Only for SVC tasks
|
36 |
+
self.task_type = "SVC"
|
37 |
+
self.logger.info("Task type: {}".format(self.task_type))
|
38 |
+
|
39 |
+
### Following are methods only for SVC tasks ###
|
40 |
+
# TODO: LEGACY CODE, NEED TO BE REFACTORED
|
41 |
+
def _build_dataset(self):
|
42 |
+
return SVCDataset, SVCCollator
|
43 |
+
|
44 |
+
@staticmethod
|
45 |
+
def _build_criterion():
|
46 |
+
criterion = nn.MSELoss(reduction="none")
|
47 |
+
return criterion
|
48 |
+
|
49 |
+
@staticmethod
|
50 |
+
def _compute_loss(criterion, y_pred, y_gt, loss_mask):
|
51 |
+
"""
|
52 |
+
Args:
|
53 |
+
criterion: MSELoss(reduction='none')
|
54 |
+
y_pred, y_gt: (bs, seq_len, D)
|
55 |
+
loss_mask: (bs, seq_len, 1)
|
56 |
+
Returns:
|
57 |
+
loss: Tensor of shape []
|
58 |
+
"""
|
59 |
+
|
60 |
+
# (bs, seq_len, D)
|
61 |
+
loss = criterion(y_pred, y_gt)
|
62 |
+
# expand loss_mask to (bs, seq_len, D)
|
63 |
+
loss_mask = loss_mask.repeat(1, 1, loss.shape[-1])
|
64 |
+
|
65 |
+
loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask)
|
66 |
+
return loss
|
67 |
+
|
68 |
+
def _save_auxiliary_states(self):
|
69 |
+
"""
|
70 |
+
To save the singer's look-up table in the checkpoint saving path
|
71 |
+
"""
|
72 |
+
with open(
|
73 |
+
os.path.join(self.tmp_checkpoint_save_path, self.cfg.preprocess.spk2id), "w"
|
74 |
+
) as f:
|
75 |
+
json.dump(self.singers, f, indent=4, ensure_ascii=False)
|
76 |
+
|
77 |
+
def _build_singer_lut(self):
|
78 |
+
resumed_singer_path = None
|
79 |
+
if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "":
|
80 |
+
resumed_singer_path = os.path.join(
|
81 |
+
self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id
|
82 |
+
)
|
83 |
+
if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
|
84 |
+
resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
|
85 |
+
|
86 |
+
if resumed_singer_path:
|
87 |
+
with open(resumed_singer_path, "r") as f:
|
88 |
+
singers = json.load(f)
|
89 |
+
else:
|
90 |
+
singers = dict()
|
91 |
+
|
92 |
+
for dataset in self.cfg.dataset:
|
93 |
+
singer_lut_path = os.path.join(
|
94 |
+
self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
|
95 |
+
)
|
96 |
+
with open(singer_lut_path, "r") as singer_lut_path:
|
97 |
+
singer_lut = json.load(singer_lut_path)
|
98 |
+
for singer in singer_lut.keys():
|
99 |
+
if singer not in singers:
|
100 |
+
singers[singer] = len(singers)
|
101 |
+
|
102 |
+
with open(
|
103 |
+
os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
|
104 |
+
) as singer_file:
|
105 |
+
json.dump(singers, singer_file, indent=4, ensure_ascii=False)
|
106 |
+
print(
|
107 |
+
"singers have been dumped to {}".format(
|
108 |
+
os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
|
109 |
+
)
|
110 |
+
)
|
111 |
+
return singers
|
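Illustrative aside (not part of the committed files): a minimal sketch of the masked MSE reduction that ``SVCTrainer._compute_loss`` above performs. The batch size, sequence length, and mel dimension below are made-up values for demonstration only.

# Sketch: masked MSE over valid frames only (assumed toy shapes).
import torch
import torch.nn as nn

criterion = nn.MSELoss(reduction="none")
y_pred = torch.randn(2, 5, 80)      # (bs, seq_len, D)
y_gt = torch.randn(2, 5, 80)
loss_mask = torch.ones(2, 5, 1)
loss_mask[1, 3:] = 0                # pretend the second sample has 2 padded frames

loss = criterion(y_pred, y_gt)                       # (bs, seq_len, D)
loss_mask = loss_mask.repeat(1, 1, loss.shape[-1])   # expand mask to (bs, seq_len, D)
loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask)
print(loss.item())  # scalar averaged over unmasked positions only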
models/svc/comosvc/__init__.py
ADDED
@@ -0,0 +1,4 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
models/svc/comosvc/comosvc.py
ADDED
@@ -0,0 +1,377 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Adapted from https://github.com/zhenye234/CoMoSpeech"""
+
+import torch
+import torch.nn as nn
+import copy
+import numpy as np
+import math
+from tqdm.auto import tqdm
+
+from utils.ssim import SSIM
+
+from models.svc.transformer.conformer import Conformer, BaseModule
+from models.svc.diffusion.diffusion_wrapper import DiffusionWrapper
+from models.svc.comosvc.utils import slice_segments, rand_ids_segments
+
+
+class Consistency(nn.Module):
+    def __init__(self, cfg, distill=False):
+        super().__init__()
+        self.cfg = cfg
+        # self.denoise_fn = GradLogPEstimator2d(96)
+        self.denoise_fn = DiffusionWrapper(self.cfg)
+        self.cfg = cfg.model.comosvc
+        self.teacher = not distill
+        self.P_mean = self.cfg.P_mean
+        self.P_std = self.cfg.P_std
+        self.sigma_data = self.cfg.sigma_data
+        self.sigma_min = self.cfg.sigma_min
+        self.sigma_max = self.cfg.sigma_max
+        self.rho = self.cfg.rho
+        self.N = self.cfg.n_timesteps
+        self.ssim_loss = SSIM()
+
+        # Time step discretization
+        step_indices = torch.arange(self.N)
+        # karras boundaries formula
+        t_steps = (
+            self.sigma_min ** (1 / self.rho)
+            + step_indices
+            / (self.N - 1)
+            * (self.sigma_max ** (1 / self.rho) - self.sigma_min ** (1 / self.rho))
+        ) ** self.rho
+        self.t_steps = torch.cat(
+            [torch.zeros_like(t_steps[:1]), self.round_sigma(t_steps)]
+        )
+
+    def init_consistency_training(self):
+        self.denoise_fn_ema = copy.deepcopy(self.denoise_fn)
+        self.denoise_fn_pretrained = copy.deepcopy(self.denoise_fn)
+
+    def EDMPrecond(self, x, sigma, cond, denoise_fn, mask, spk=None):
+        """
+        karras diffusion reverse process
+
+        Args:
+            x: noisy mel-spectrogram [B x n_mel x L]
+            sigma: noise level [B x 1 x 1]
+            cond: output of conformer encoder [B x n_mel x L]
+            denoise_fn: denoiser neural network e.g. DilatedCNN
+            mask: mask of padded frames [B x n_mel x L]
+
+        Returns:
+            denoised mel-spectrogram [B x n_mel x L]
+        """
+        sigma = sigma.reshape(-1, 1, 1)
+
+        c_skip = self.sigma_data**2 / (sigma**2 + self.sigma_data**2)
+        c_out = sigma * self.sigma_data / (sigma**2 + self.sigma_data**2).sqrt()
+        c_in = 1 / (self.sigma_data**2 + sigma**2).sqrt()
+        c_noise = sigma.log() / 4
+
+        x_in = c_in * x
+        x_in = x_in.transpose(1, 2)
+        x = x.transpose(1, 2)
+        cond = cond.transpose(1, 2)
+        F_x = denoise_fn(x_in, c_noise.squeeze(), cond)
+        # F_x = denoise_fn((c_in * x), mask, cond, c_noise.flatten())
+        D_x = c_skip * x + c_out * (F_x)
+        D_x = D_x.transpose(1, 2)
+        return D_x
+
+    def EDMLoss(self, x_start, cond, mask):
+        """
+        compute loss for EDM model
+
+        Args:
+            x_start: ground truth mel-spectrogram [B x n_mel x L]
+            cond: output of conformer encoder [B x n_mel x L]
+            mask: mask of padded frames [B x n_mel x L]
+        """
+        rnd_normal = torch.randn([x_start.shape[0], 1, 1], device=x_start.device)
+        sigma = (rnd_normal * self.P_std + self.P_mean).exp()
+        weight = (sigma**2 + self.sigma_data**2) / (sigma * self.sigma_data) ** 2
+
+        # follow Grad-TTS, start from Gaussian noise with mean cond and std I
+        noise = (torch.randn_like(x_start) + cond) * sigma
+        D_yn = self.EDMPrecond(x_start + noise, sigma, cond, self.denoise_fn, mask)
+        loss = weight * ((D_yn - x_start) ** 2)
+        loss = torch.sum(loss * mask) / torch.sum(mask)
+        return loss
+
+    def round_sigma(self, sigma):
+        return torch.as_tensor(sigma)
+
+    def edm_sampler(
+        self,
+        latents,
+        cond,
+        nonpadding,
+        num_steps=50,
+        sigma_min=0.002,
+        sigma_max=80,
+        rho=7,
+        S_churn=0,
+        S_min=0,
+        S_max=float("inf"),
+        S_noise=1,
+        # S_churn=40 ,S_min=0.05,S_max=50,S_noise=1.003,# S_churn=0, S_min=0, S_max=float('inf'), S_noise=1,
+        # S_churn=30 ,S_min=0.01,S_max=30,S_noise=1.007,
+        # S_churn=30 ,S_min=0.01,S_max=1,S_noise=1.007,
+        # S_churn=80 ,S_min=0.05,S_max=50,S_noise=1.003,
+    ):
+        """
+        karras diffusion sampler
+
+        Args:
+            latents: noisy mel-spectrogram [B x n_mel x L]
+            cond: output of conformer encoder [B x n_mel x L]
+            nonpadding: mask of padded frames [B x n_mel x L]
+            num_steps: number of steps for diffusion inference
+
+        Returns:
+            denoised mel-spectrogram [B x n_mel x L]
+        """
+        # Time step discretization.
+        step_indices = torch.arange(num_steps, device=latents.device)
+
+        num_steps = num_steps + 1
+        t_steps = (
+            sigma_max ** (1 / rho)
+            + step_indices
+            / (num_steps - 1)
+            * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))
+        ) ** rho
+        t_steps = torch.cat([self.round_sigma(t_steps), torch.zeros_like(t_steps[:1])])
+
+        # Main sampling loop.
+        x_next = latents * t_steps[0]
+        # wrap in tqdm for progress bar
+        bar = tqdm(enumerate(zip(t_steps[:-1], t_steps[1:])))
+        for i, (t_cur, t_next) in bar:
+            x_cur = x_next
+            # Increase noise temporarily.
+            gamma = (
+                min(S_churn / num_steps, np.sqrt(2) - 1)
+                if S_min <= t_cur <= S_max
+                else 0
+            )
+            t_hat = self.round_sigma(t_cur + gamma * t_cur)
+            t = torch.zeros((x_cur.shape[0], 1, 1), device=x_cur.device)
+            t[:, 0, 0] = t_hat
+            t_hat = t
+            x_hat = x_cur + (
+                t_hat**2 - t_cur**2
+            ).sqrt() * S_noise * torch.randn_like(x_cur)
+            # Euler step.
+            denoised = self.EDMPrecond(x_hat, t_hat, cond, self.denoise_fn, nonpadding)
+            d_cur = (x_hat - denoised) / t_hat
+            x_next = x_hat + (t_next - t_hat) * d_cur
+
+        return x_next
+
+    def CTLoss_D(self, y, cond, mask):
+        """
+        compute loss for consistency distillation
+
+        Args:
+            y: ground truth mel-spectrogram [B x n_mel x L]
+            cond: output of conformer encoder [B x n_mel x L]
+            mask: mask of padded frames [B x n_mel x L]
+        """
+        with torch.no_grad():
+            mu = 0.95
+            for p, ema_p in zip(
+                self.denoise_fn.parameters(), self.denoise_fn_ema.parameters()
+            ):
+                ema_p.mul_(mu).add_(p, alpha=1 - mu)
+
+        n = torch.randint(1, self.N, (y.shape[0],))
+        z = torch.randn_like(y) + cond
+
+        tn_1 = self.t_steps[n + 1].reshape(-1, 1, 1).to(y.device)
+        f_theta = self.EDMPrecond(y + tn_1 * z, tn_1, cond, self.denoise_fn, mask)
+
+        with torch.no_grad():
+            tn = self.t_steps[n].reshape(-1, 1, 1).to(y.device)
+
+            # euler step
+            x_hat = y + tn_1 * z
+            denoised = self.EDMPrecond(
+                x_hat, tn_1, cond, self.denoise_fn_pretrained, mask
+            )
+            d_cur = (x_hat - denoised) / tn_1
+            y_tn = x_hat + (tn - tn_1) * d_cur
+
+            f_theta_ema = self.EDMPrecond(y_tn, tn, cond, self.denoise_fn_ema, mask)
+
+        # loss = (f_theta - f_theta_ema.detach()) ** 2
+        # loss = torch.sum(loss * mask) / torch.sum(mask)
+        loss = self.ssim_loss(f_theta, f_theta_ema.detach())
+        loss = torch.sum(loss * mask) / torch.sum(mask)
+
+        return loss
+
+    def get_t_steps(self, N):
+        N = N + 1
+        step_indices = torch.arange(N)  # , device=latents.device)
+        t_steps = (
+            self.sigma_min ** (1 / self.rho)
+            + step_indices
+            / (N - 1)
+            * (self.sigma_max ** (1 / self.rho) - self.sigma_min ** (1 / self.rho))
+        ) ** self.rho
+
+        return t_steps.flip(0)
+
+    def CT_sampler(self, latents, cond, nonpadding, t_steps=1):
+        """
+        consistency distillation sampler
+
+        Args:
+            latents: noisy mel-spectrogram [B x n_mel x L]
+            cond: output of conformer encoder [B x n_mel x L]
+            nonpadding: mask of padded frames [B x n_mel x L]
+            t_steps: number of steps for diffusion inference
+
+        Returns:
+            denoised mel-spectrogram [B x n_mel x L]
+        """
+        # one-step
+        if t_steps == 1:
+            t_steps = [80]
+        # multi-step
+        else:
+            t_steps = self.get_t_steps(t_steps)
+
+        t_steps = torch.as_tensor(t_steps).to(latents.device)
+        latents = latents * t_steps[0]
+        _t = torch.zeros((latents.shape[0], 1, 1), device=latents.device)
+        _t[:, 0, 0] = t_steps
+        x = self.EDMPrecond(latents, _t, cond, self.denoise_fn_ema, nonpadding)
+
+        for t in t_steps[1:-1]:
+            z = torch.randn_like(x) + cond
+            x_tn = x + (t**2 - self.sigma_min**2).sqrt() * z
+            _t = torch.zeros((x.shape[0], 1, 1), device=x.device)
+            _t[:, 0, 0] = t
+            t = _t
+            print(t)
+            x = self.EDMPrecond(x_tn, t, cond, self.denoise_fn_ema, nonpadding)
+        return x
+
+    def forward(self, x, nonpadding, cond, t_steps=1, infer=False):
+        """
+        calculate loss or sample mel-spectrogram
+
+        Args:
+            x:
+                training: ground truth mel-spectrogram [B x n_mel x L]
+                inference: output of encoder [B x n_mel x L]
+        """
+        if self.teacher:  # teacher model -- karras diffusion
+            if not infer:
+                loss = self.EDMLoss(x, cond, nonpadding)
+                return loss
+            else:
+                shape = (cond.shape[0], self.cfg.n_mel, cond.shape[2])
+                x = torch.randn(shape, device=x.device) + cond
+                x = self.edm_sampler(x, cond, nonpadding, t_steps)
+
+            return x
+        else:  # Consistency distillation
+            if not infer:
+                loss = self.CTLoss_D(x, cond, nonpadding)
+                return loss
+
+            else:
+                shape = (cond.shape[0], self.cfg.n_mel, cond.shape[2])
+                x = torch.randn(shape, device=x.device) + cond
+                x = self.CT_sampler(x, cond, nonpadding, t_steps=1)
+
+            return x
+
+
+class ComoSVC(BaseModule):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.cfg.model.comosvc.n_mel = self.cfg.preprocess.n_mel
+        self.distill = self.cfg.model.comosvc.distill
+        self.encoder = Conformer(self.cfg.model.comosvc)
+        self.decoder = Consistency(self.cfg, distill=self.distill)
+        self.ssim_loss = SSIM()
+
+    @torch.no_grad()
+    def forward(self, x_mask, x, n_timesteps, temperature=1.0):
+        """
+        Generates mel-spectrogram from pitch, content vector, energy. Returns:
+            1. encoder outputs (from conformer)
+            2. decoder outputs (from diffusion-based decoder)
+
+        Args:
+            x_mask : mask of padded frames in mel-spectrogram. [B x L x n_mel]
+            x : output of encoder framework. [B x L x d_condition]
+            n_timesteps : number of steps to use for reverse diffusion in decoder.
+            temperature : controls variance of terminal distribution.
+        """
+
+        # Get encoder_outputs `mu_x`
+        mu_x = self.encoder(x, x_mask)
+        encoder_outputs = mu_x
+
+        mu_x = mu_x.transpose(1, 2)
+        x_mask = x_mask.transpose(1, 2)
+
+        # Generate sample by performing reverse dynamics
+        decoder_outputs = self.decoder(
+            mu_x, x_mask, mu_x, t_steps=n_timesteps, infer=True
+        )
+        decoder_outputs = decoder_outputs.transpose(1, 2)
+        return encoder_outputs, decoder_outputs
+
+    def compute_loss(self, x_mask, x, mel, out_size=None, skip_diff=False):
+        """
+        Computes 2 losses:
+            1. prior loss: loss between mel-spectrogram and encoder outputs.
+            2. diffusion loss: loss between gaussian noise and its reconstruction by diffusion-based decoder.
+
+        Args:
+            x_mask : mask of padded frames in mel-spectrogram. [B x L x n_mel]
+            x : output of encoder framework. [B x L x d_condition]
+            mel : ground truth mel-spectrogram. [B x L x n_mel]
+        """
+
+        mu_x = self.encoder(x, x_mask)
+        # prior loss
+        prior_loss = torch.sum(
+            0.5 * ((mel - mu_x) ** 2 + math.log(2 * math.pi)) * x_mask
+        )
+        prior_loss = prior_loss / (torch.sum(x_mask) * self.cfg.model.comosvc.n_mel)
+        # ssim loss
+        ssim_loss = self.ssim_loss(mu_x, mel)
+        ssim_loss = torch.sum(ssim_loss * x_mask) / torch.sum(x_mask)
+
+        x_mask = x_mask.transpose(1, 2)
+        mu_x = mu_x.transpose(1, 2)
+        mel = mel.transpose(1, 2)
+        if not self.distill and skip_diff:
+            diff_loss = prior_loss.clone()
+            diff_loss.fill_(0)
+
+        # Cut a small segment of mel-spectrogram in order to increase batch size
+        else:
+            if self.distill:
+                mu_y = mu_x.detach()
+            else:
+                mu_y = mu_x
+            mask_y = x_mask
+
+            diff_loss = self.decoder(mel, mask_y, mu_y, infer=False)
+
+        return ssim_loss, prior_loss, diff_loss
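Illustrative aside (not part of the committed files): the EDM preconditioning coefficients that ``EDMPrecond`` above computes, evaluated for a single noise level. ``sigma_data`` and ``sigma`` are made-up values used only for demonstration.

# Sketch: Karras/EDM preconditioning coefficients for one assumed noise level.
import torch

sigma_data = 0.5
sigma = torch.tensor(2.0)

c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
c_out = sigma * sigma_data / (sigma**2 + sigma_data**2).sqrt()
c_in = 1 / (sigma_data**2 + sigma**2).sqrt()
c_noise = sigma.log() / 4

# The denoised output is then D_x = c_skip * x + c_out * F_x(c_in * x, c_noise, cond).
print(c_skip.item(), c_out.item(), c_in.item(), c_noise.item())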
models/svc/comosvc/comosvc_inference.py
ADDED
@@ -0,0 +1,39 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from models.svc.base import SVCInference
+from modules.encoder.condition_encoder import ConditionEncoder
+from models.svc.comosvc.comosvc import ComoSVC
+
+
+class ComoSVCInference(SVCInference):
+    def __init__(self, args, cfg, infer_type="from_dataset"):
+        SVCInference.__init__(self, args, cfg, infer_type)
+
+    def _build_model(self):
+        # TODO: sort out the config
+        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
+        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
+        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+        self.acoustic_mapper = ComoSVC(self.cfg)
+        if self.cfg.model.comosvc.distill:
+            self.acoustic_mapper.decoder.init_consistency_training()
+        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
+        return model
+
+    def _inference_each_batch(self, batch_data):
+        device = self.accelerator.device
+        for k, v in batch_data.items():
+            batch_data[k] = v.to(device)
+
+        cond = self.condition_encoder(batch_data)
+        mask = batch_data["mask"]
+        encoder_pred, decoder_pred = self.acoustic_mapper(
+            mask, cond, self.cfg.inference.comosvc.inference_steps
+        )
+
+        return decoder_pred
models/svc/comosvc/comosvc_trainer.py
ADDED
@@ -0,0 +1,295 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import os
+import json5
+from collections import OrderedDict
+from tqdm import tqdm
+import json
+import shutil
+
+from models.svc.base import SVCTrainer
+from modules.encoder.condition_encoder import ConditionEncoder
+from models.svc.comosvc.comosvc import ComoSVC
+
+
+class ComoSVCTrainer(SVCTrainer):
+    r"""The base trainer for all diffusion models. It inherits from SVCTrainer and
+    implements ``_build_model`` and ``_forward_step`` methods.
+    """
+
+    def __init__(self, args=None, cfg=None):
+        SVCTrainer.__init__(self, args, cfg)
+        self.distill = cfg.model.comosvc.distill
+        self.skip_diff = True
+        if self.distill:  # and args.resume is None:
+            self.teacher_model_path = cfg.model.teacher_model_path
+            self.teacher_state_dict = self._load_teacher_state_dict()
+            self._load_teacher_model(self.teacher_state_dict)
+            self.acoustic_mapper.decoder.init_consistency_training()
+
+    ### Following are methods only for comoSVC models ###
+    def _load_teacher_state_dict(self):
+        self.checkpoint_file = self.teacher_model_path
+        print("Load teacher acoustic model from {}".format(self.checkpoint_file))
+        raw_state_dict = torch.load(self.checkpoint_file)  # , map_location=self.device)
+        return raw_state_dict
+
+    def _load_teacher_model(self, state_dict):
+        raw_dict = state_dict
+        clean_dict = OrderedDict()
+        for k, v in raw_dict.items():
+            if k.startswith("module."):
+                clean_dict[k[7:]] = v
+            else:
+                clean_dict[k] = v
+        self.model.load_state_dict(clean_dict)
+
+    def _build_model(self):
+        r"""Build the model for training. This function is called in ``__init__`` function."""
+
+        # TODO: sort out the config
+        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
+        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
+        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+        self.acoustic_mapper = ComoSVC(self.cfg)
+        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
+        return model
+
+    def _forward_step(self, batch):
+        r"""Forward step for training and inference. This function is called
+        in ``_train_step`` & ``_test_step`` function.
+        """
+        loss = {}
+        mask = batch["mask"]
+        mel_input = batch["mel"]
+        cond = self.condition_encoder(batch)
+        if self.distill:
+            cond = cond.detach()
+        self.skip_diff = True if self.step < self.cfg.train.fast_steps else False
+        ssim_loss, prior_loss, diff_loss = self.acoustic_mapper.compute_loss(
+            mask, cond, mel_input, skip_diff=self.skip_diff
+        )
+        if self.distill:
+            loss["distil_loss"] = diff_loss
+        else:
+            loss["ssim_loss_encoder"] = ssim_loss
+            loss["prior_loss_encoder"] = prior_loss
+            loss["diffusion_loss_decoder"] = diff_loss
+
+        return loss
+
+    def _train_epoch(self):
+        r"""Training epoch. Should return average loss of a batch (sample) over
+        one epoch. See ``train_loop`` for usage.
+        """
+        self.model.train()
+        epoch_sum_loss: float = 0.0
+        epoch_step: int = 0
+        for batch in tqdm(
+            self.train_dataloader,
+            desc=f"Training Epoch {self.epoch}",
+            unit="batch",
+            colour="GREEN",
+            leave=False,
+            dynamic_ncols=True,
+            smoothing=0.04,
+            disable=not self.accelerator.is_main_process,
+        ):
+            # Do training step and BP
+            with self.accelerator.accumulate(self.model):
+                loss = self._train_step(batch)
+                total_loss = 0
+                for k, v in loss.items():
+                    total_loss += v
+                self.accelerator.backward(total_loss)
+                enc_grad_norm = torch.nn.utils.clip_grad_norm_(
+                    self.acoustic_mapper.encoder.parameters(), max_norm=1
+                )
+                dec_grad_norm = torch.nn.utils.clip_grad_norm_(
+                    self.acoustic_mapper.decoder.parameters(), max_norm=1
+                )
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+            self.batch_count += 1
+
+            # Update info for each step
+            # TODO: step means BP counts or batch counts?
+            if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
+                epoch_sum_loss += total_loss
+                log_info = {}
+                for k, v in loss.items():
+                    key = "Step/Train Loss/{}".format(k)
+                    log_info[key] = v
+                log_info["Step/Learning Rate"]: self.optimizer.param_groups[0]["lr"]
+                self.accelerator.log(
+                    log_info,
+                    step=self.step,
+                )
+                self.step += 1
+                epoch_step += 1
+
+        self.accelerator.wait_for_everyone()
+        return (
+            epoch_sum_loss
+            / len(self.train_dataloader)
+            * self.cfg.train.gradient_accumulation_step,
+            loss,
+        )
+
+    def train_loop(self):
+        r"""Training loop. The public entry of training process."""
+        # Wait everyone to prepare before we move on
+        self.accelerator.wait_for_everyone()
+        # dump config file
+        if self.accelerator.is_main_process:
+            self.__dump_cfg(self.config_save_path)
+        self.model.train()
+        self.optimizer.zero_grad()
+        # Wait to ensure good to go
+        self.accelerator.wait_for_everyone()
+        while self.epoch < self.max_epoch:
+            self.logger.info("\n")
+            self.logger.info("-" * 32)
+            self.logger.info("Epoch {}: ".format(self.epoch))
+
+            ### TODO: change the return values of _train_epoch() to a loss dict, or (total_loss, loss_dict)
+            ### It's inconvenient for the model with multiple losses
+            # Do training & validating epoch
+            train_loss, loss = self._train_epoch()
+            self.logger.info("  |- Train/Loss: {:.6f}".format(train_loss))
+            for k, v in loss.items():
+                self.logger.info("  |- Train/Loss/{}: {:.6f}".format(k, v))
+            valid_loss = self._valid_epoch()
+            self.logger.info("  |- Valid/Loss: {:.6f}".format(valid_loss))
+            self.accelerator.log(
+                {"Epoch/Train Loss": train_loss, "Epoch/Valid Loss": valid_loss},
+                step=self.epoch,
+            )
+
+            self.accelerator.wait_for_everyone()
+            # TODO: what is scheduler?
+            self.scheduler.step(valid_loss)  # FIXME: use epoch track correct?
+
+            # Check if hit save_checkpoint_stride and run_eval
+            run_eval = False
+            if self.accelerator.is_main_process:
+                save_checkpoint = False
+                hit_dix = []
+                for i, num in enumerate(self.save_checkpoint_stride):
+                    if self.epoch % num == 0:
+                        save_checkpoint = True
+                        hit_dix.append(i)
+                        run_eval |= self.run_eval[i]
+
+            self.accelerator.wait_for_everyone()
+            if (
+                self.accelerator.is_main_process
+                and save_checkpoint
+                and (self.distill or not self.skip_diff)
+            ):
+                path = os.path.join(
+                    self.checkpoint_dir,
+                    "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
+                        self.epoch, self.step, train_loss
+                    ),
+                )
+                self.accelerator.save_state(path)
+                json.dump(
+                    self.checkpoints_path,
+                    open(os.path.join(path, "ckpts.json"), "w"),
+                    ensure_ascii=False,
+                    indent=4,
+                )
+
+                # Remove old checkpoints
+                to_remove = []
+                for idx in hit_dix:
+                    self.checkpoints_path[idx].append(path)
+                    while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
+                        to_remove.append((idx, self.checkpoints_path[idx].pop(0)))
+
+                # Search conflicts
+                total = set()
+                for i in self.checkpoints_path:
+                    total |= set(i)
+                do_remove = set()
+                for idx, path in to_remove[::-1]:
+                    if path in total:
+                        self.checkpoints_path[idx].insert(0, path)
+                    else:
+                        do_remove.add(path)
+
+                # Remove old checkpoints
+                for path in do_remove:
+                    shutil.rmtree(path, ignore_errors=True)
+                    self.logger.debug(f"Remove old checkpoint: {path}")
+
+            self.accelerator.wait_for_everyone()
+            if run_eval:
+                # TODO: run evaluation
+                pass
+
+            # Update info for each epoch
+            self.epoch += 1
+
+        # Finish training and save final checkpoint
+        self.accelerator.wait_for_everyone()
+        if self.accelerator.is_main_process:
+            self.accelerator.save_state(
+                os.path.join(
+                    self.checkpoint_dir,
+                    "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
+                        self.epoch, self.step, valid_loss
+                    ),
+                )
+            )
+        self.accelerator.end_training()
+
+    @torch.inference_mode()
+    def _valid_epoch(self):
+        r"""Testing epoch. Should return average loss of a batch (sample) over
+        one epoch. See ``train_loop`` for usage.
+        """
+        self.model.eval()
+        epoch_sum_loss = 0.0
+        for batch in tqdm(
+            self.valid_dataloader,
+            desc=f"Validating Epoch {self.epoch}",
+            unit="batch",
+            colour="GREEN",
+            leave=False,
+            dynamic_ncols=True,
+            smoothing=0.04,
+            disable=not self.accelerator.is_main_process,
+        ):
+            batch_loss = self._valid_step(batch)
+            for k, v in batch_loss.items():
+                epoch_sum_loss += v
+
+        self.accelerator.wait_for_everyone()
+        return epoch_sum_loss / len(self.valid_dataloader)
+
+    @staticmethod
+    def __count_parameters(model):
+        model_param = 0.0
+        if isinstance(model, dict):
+            for key, value in model.items():
+                model_param += sum(p.numel() for p in model[key].parameters())
+        else:
+            model_param = sum(p.numel() for p in model.parameters())
+        return model_param
+
+    def __dump_cfg(self, path):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        json5.dump(
+            self.cfg,
+            open(path, "w"),
+            indent=4,
+            sort_keys=True,
+            ensure_ascii=False,
+            quote_keys=True,
+        )
models/svc/comosvc/utils.py
ADDED
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def slice_segments(x, ids_str, segment_size=200):
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+    return ret
+
+
+def rand_ids_segments(lengths, segment_size=200):
+    b = lengths.shape[0]
+    ids_str_max = lengths - segment_size
+    ids_str = (torch.rand([b]).to(device=lengths.device) * ids_str_max).to(
+        dtype=torch.long
+    )
+    return ids_str
+
+
+def fix_len_compatibility(length, num_downsamplings_in_unet=2):
+    while True:
+        if length % (2**num_downsamplings_in_unet) == 0:
+            return length
+        length += 1
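Illustrative aside (not part of the committed files): a sketch of how ``rand_ids_segments`` and ``slice_segments`` can be combined to crop fixed-length training segments from variable-length features, as hinted by the "cut a small segment" comment in comosvc.py. The import path assumes this repository layout, and the shapes below are made up for demonstration.

# Sketch: random fixed-size crops from padded mel features (assumed toy shapes).
import torch
from models.svc.comosvc.utils import rand_ids_segments, slice_segments

mel = torch.randn(4, 80, 600)                        # [B, n_mel, T]
lengths = torch.tensor([600, 550, 480, 300])         # valid frames per sample
ids = rand_ids_segments(lengths, segment_size=200)   # one random start index per sample
segments = slice_segments(mel, ids, segment_size=200)
print(segments.shape)                                 # torch.Size([4, 80, 200])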
models/svc/diffusion/__init__.py
ADDED
File without changes
models/svc/diffusion/diffusion_inference.py
ADDED
@@ -0,0 +1,63 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from diffusers import DDIMScheduler, DDPMScheduler, PNDMScheduler
+
+from models.svc.base import SVCInference
+from models.svc.diffusion.diffusion_inference_pipeline import DiffusionInferencePipeline
+from models.svc.diffusion.diffusion_wrapper import DiffusionWrapper
+from modules.encoder.condition_encoder import ConditionEncoder
+
+
+class DiffusionInference(SVCInference):
+    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
+        SVCInference.__init__(self, args, cfg, infer_type)
+
+        settings = {
+            **cfg.model.diffusion.scheduler_settings,
+            **cfg.inference.diffusion.scheduler_settings,
+        }
+        settings.pop("num_inference_timesteps")
+
+        if cfg.inference.diffusion.scheduler.lower() == "ddpm":
+            self.scheduler = DDPMScheduler(**settings)
+            self.logger.info("Using DDPM scheduler.")
+        elif cfg.inference.diffusion.scheduler.lower() == "ddim":
+            self.scheduler = DDIMScheduler(**settings)
+            self.logger.info("Using DDIM scheduler.")
+        elif cfg.inference.diffusion.scheduler.lower() == "pndm":
+            self.scheduler = PNDMScheduler(**settings)
+            self.logger.info("Using PNDM scheduler.")
+        else:
+            raise NotImplementedError(
+                "Unsupported scheduler type: {}".format(
+                    cfg.inference.diffusion.scheduler.lower()
+                )
+            )
+
+        self.pipeline = DiffusionInferencePipeline(
+            self.model[1],
+            self.scheduler,
+            args.diffusion_inference_steps,
+        )
+
+    def _build_model(self):
+        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
+        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
+        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+        self.acoustic_mapper = DiffusionWrapper(self.cfg)
+        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
+        return model
+
+    def _inference_each_batch(self, batch_data):
+        device = self.accelerator.device
+        for k, v in batch_data.items():
+            batch_data[k] = v.to(device)
+
+        conditioner = self.model[0](batch_data)
+        noise = torch.randn_like(batch_data["mel"], device=device)
+        y_pred = self.pipeline(noise, conditioner)
+        return y_pred
models/svc/diffusion/diffusion_inference_pipeline.py
ADDED
@@ -0,0 +1,47 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from diffusers import DiffusionPipeline
+
+
+class DiffusionInferencePipeline(DiffusionPipeline):
+    def __init__(self, network, scheduler, num_inference_timesteps=1000):
+        super().__init__()
+
+        self.register_modules(network=network, scheduler=scheduler)
+        self.num_inference_timesteps = num_inference_timesteps
+
+    @torch.inference_mode()
+    def __call__(
+        self,
+        initial_noise: torch.Tensor,
+        conditioner: torch.Tensor = None,
+    ):
+        r"""
+        Args:
+            initial_noise: The initial noise to be denoised.
+            conditioner: The conditioner.
+            n_inference_steps: The number of denoising steps. More denoising steps
+                usually lead to a higher quality at the expense of slower inference.
+        """
+
+        mel = initial_noise
+        batch_size = mel.size(0)
+        self.scheduler.set_timesteps(self.num_inference_timesteps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            timestep = torch.full((batch_size,), t, device=mel.device, dtype=torch.long)
+
+            # 1. predict noise model_output
+            model_output = self.network(mel, timestep, conditioner)
+
+            # 2. denoise, compute previous step: x_t -> x_t-1
+            mel = self.scheduler.step(model_output, t, mel).prev_sample
+
+            # 3. clamp
+            mel = mel.clamp(-1.0, 1.0)
+
+        return mel
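Illustrative aside (not part of the committed files): a sketch of wiring ``DiffusionInferencePipeline`` to a ``diffusers`` scheduler, mirroring what ``DiffusionInference`` does. The ``ToyDenoiser`` below is a stand-in for the real ``DiffusionWrapper``, the import path assumes this repository layout, the shapes are made up, and exact scheduler behavior may vary across ``diffusers`` versions.

# Sketch: driving the pipeline with a DDPM scheduler and a stand-in denoiser.
import torch
import torch.nn as nn
from diffusers import DDPMScheduler
from models.svc.diffusion.diffusion_inference_pipeline import DiffusionInferencePipeline

class ToyDenoiser(nn.Module):
    # Stand-in network with the same (mel, timestep, conditioner) call signature.
    def forward(self, mel, timestep, conditioner):
        return mel - conditioner

scheduler = DDPMScheduler(num_train_timesteps=1000)
pipeline = DiffusionInferencePipeline(ToyDenoiser(), scheduler, num_inference_timesteps=50)

noise = torch.randn(1, 128, 80)        # [B, T, n_mel], assumed shape
conditioner = torch.zeros(1, 128, 80)  # stand-in for the condition encoder output
mel = pipeline(noise, conditioner)
print(mel.shape)                        # same shape as the initial noise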
models/svc/diffusion/diffusion_trainer.py
ADDED
@@ -0,0 +1,88 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from diffusers import DDPMScheduler
+
+from models.svc.base import SVCTrainer
+from modules.encoder.condition_encoder import ConditionEncoder
+from .diffusion_wrapper import DiffusionWrapper
+
+
+class DiffusionTrainer(SVCTrainer):
+    r"""The base trainer for all diffusion models. It inherits from SVCTrainer and
+    implements ``_build_model`` and ``_forward_step`` methods.
+    """
+
+    def __init__(self, args=None, cfg=None):
+        SVCTrainer.__init__(self, args, cfg)
+
+        # Only for SVC tasks using diffusion
+        self.noise_scheduler = DDPMScheduler(
+            **self.cfg.model.diffusion.scheduler_settings,
+        )
+        self.diffusion_timesteps = (
+            self.cfg.model.diffusion.scheduler_settings.num_train_timesteps
+        )
+
+    ### Following are methods only for diffusion models ###
+    def _build_model(self):
+        r"""Build the model for training. This function is called in ``__init__`` function."""
+
+        # TODO: sort out the config
+        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
+        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
+        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+        self.acoustic_mapper = DiffusionWrapper(self.cfg)
+        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
+
+        num_of_params_encoder = self.count_parameters(self.condition_encoder)
+        num_of_params_am = self.count_parameters(self.acoustic_mapper)
+        num_of_params = num_of_params_encoder + num_of_params_am
+        log = "Diffusion Model's Parameters: #Encoder is {:.2f}M, #Diffusion is {:.2f}M. The total is {:.2f}M".format(
+            num_of_params_encoder / 1e6, num_of_params_am / 1e6, num_of_params / 1e6
+        )
+        self.logger.info(log)
+
+        return model
+
+    def count_parameters(self, model):
+        model_param = 0.0
+        if isinstance(model, dict):
+            for key, value in model.items():
+                model_param += sum(p.numel() for p in model[key].parameters())
+        else:
+            model_param = sum(p.numel() for p in model.parameters())
+        return model_param
+
+    def _forward_step(self, batch):
+        r"""Forward step for training and inference. This function is called
+        in ``_train_step`` & ``_test_step`` function.
+        """
+
+        device = self.accelerator.device
+
+        mel_input = batch["mel"]
+        noise = torch.randn_like(mel_input, device=device, dtype=torch.float32)
+        batch_size = mel_input.size(0)
+        timesteps = torch.randint(
+            0,
+            self.diffusion_timesteps,
+            (batch_size,),
+            device=device,
+            dtype=torch.long,
+        )
+
+        noisy_mel = self.noise_scheduler.add_noise(mel_input, noise, timesteps)
+        conditioner = self.condition_encoder(batch)
+
+        y_pred = self.acoustic_mapper(noisy_mel, timesteps, conditioner)
+
+        # TODO: Predict noise or gt should be configurable
+        loss = self._compute_loss(self.criterion, y_pred, noise, batch["mask"])
+        self._check_nan(loss, y_pred, noise)
+
+        # FIXME: Clarify that we should not divide it with batch size here
+        return loss
models/svc/diffusion/diffusion_wrapper.py
ADDED
@@ -0,0 +1,73 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+from modules.diffusion import BiDilConv
+from modules.encoder.position_encoder import PositionEncoder
+
+
+class DiffusionWrapper(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.cfg = cfg
+        self.diff_cfg = cfg.model.diffusion
+
+        self.diff_encoder = PositionEncoder(
+            d_raw_emb=self.diff_cfg.step_encoder.dim_raw_embedding,
+            d_out=self.diff_cfg.bidilconv.base_channel,
+            d_mlp=self.diff_cfg.step_encoder.dim_hidden_layer,
+            activation_function=self.diff_cfg.step_encoder.activation,
+            n_layer=self.diff_cfg.step_encoder.num_layer,
+            max_period=self.diff_cfg.step_encoder.max_period,
+        )
+
+        # FIXME: Only support BiDilConv now for debug
+        if self.diff_cfg.model_type.lower() == "bidilconv":
+            self.neural_network = BiDilConv(
+                input_channel=self.cfg.preprocess.n_mel, **self.diff_cfg.bidilconv
+            )
+        else:
+            raise ValueError(
+                f"Unsupported diffusion model type: {self.diff_cfg.model_type}"
+            )
+
+    def forward(self, x, t, c):
+        """
+        Args:
+            x: [N, T, mel_band] of mel spectrogram
+            t: Diffusion time step with shape of [N]
+            c: [N, T, conditioner_size] of conditioner
+
+        Returns:
+            [N, T, mel_band] of mel spectrogram
+        """
+
+        assert (
+            x.size()[:-1] == c.size()[:-1]
+        ), "x mismatch with c, got \n x: {} \n c: {}".format(x.size(), c.size())
+        assert x.size(0) == t.size(
+            0
+        ), "x mismatch with t, got \n x: {} \n t: {}".format(x.size(), t.size())
+        assert t.dim() == 1, "t must be 1D tensor, got {}".format(t.dim())
+
+        N, T, mel_band = x.size()
+
+        x = x.transpose(1, 2).contiguous()  # [N, mel_band, T]
+        c = c.transpose(1, 2).contiguous()  # [N, conditioner_size, T]
+        t = self.diff_encoder(t).contiguous()  # [N, base_channel]
+
+        h = self.neural_network(x, t, c)
+        h = h.transpose(1, 2).contiguous()  # [N, T, mel_band]
+
+        assert h.size() == (
+            N,
+            T,
+            mel_band,
+        ), "h mismatch with input x, got \n h: {} \n x: {}".format(
+            h.size(), (N, T, mel_band)
+        )
+        return h
models/svc/transformer/__init__.py
ADDED
File without changes
models/svc/transformer/conformer.py
ADDED
@@ -0,0 +1,405 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch
+import numpy as np
+import torch.nn as nn
+from utils.util import convert_pad_shape
+
+
+class BaseModule(torch.nn.Module):
+    def __init__(self):
+        super(BaseModule, self).__init__()
+
+    @property
+    def nparams(self):
+        """
+        Returns number of trainable parameters of the module.
+        """
+        num_params = 0
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                num_params += np.prod(param.detach().cpu().numpy().shape)
+        return num_params
+
+    def relocate_input(self, x: list):
+        """
+        Relocates provided tensors to the same device set for the module.
+        """
+        device = next(self.parameters()).device
+        for i in range(len(x)):
+            if isinstance(x[i], torch.Tensor) and x[i].device != device:
+                x[i] = x[i].to(device)
+        return x
+
+
+class LayerNorm(BaseModule):
+    def __init__(self, channels, eps=1e-4):
+        super(LayerNorm, self).__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = torch.nn.Parameter(torch.ones(channels))
+        self.beta = torch.nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class ConvReluNorm(BaseModule):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        kernel_size,
+        n_layers,
+        p_dropout,
+        eps=1e-5,
+    ):
+        super(ConvReluNorm, self).__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        self.eps = eps
+
+        self.conv_layers = torch.nn.ModuleList()
+        self.conv_layers.append(
+            torch.nn.Conv1d(
+                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+            )
+        )
+        self.relu_drop = torch.nn.Sequential(
+            torch.nn.ReLU(), torch.nn.Dropout(p_dropout)
+        )
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                torch.nn.Conv1d(
+                    hidden_channels,
+                    hidden_channels,
+                    kernel_size,
+                    padding=kernel_size // 2,
+                )
+            )
+        self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.instance_norm(x, x_mask)
+            x = self.relu_drop(x)
+        x = self.proj(x)
+        return x * x_mask
+
+    def instance_norm(self, x, mask, return_mean_std=False):
+        mean, std = self.calc_mean_std(x, mask)
+        x = (x - mean) / std
+        if return_mean_std:
+            return x, mean, std
+        else:
+            return x
+
+    def calc_mean_std(self, x, mask=None):
+        x = x * mask
+        B, C = x.shape[:2]
+        mn = x.view(B, C, -1).mean(-1)
+        sd = (x.view(B, C, -1).var(-1) + self.eps).sqrt()
+        mn = mn.view(B, C, *((len(x.shape) - 2) * [1]))
+        sd = sd.view(B, C, *((len(x.shape) - 2) * [1]))
+        return mn, sd
+
+
+class MultiHeadAttention(BaseModule):
+    def __init__(
+        self,
+        channels,
+        out_channels,
+        n_heads,
+        window_size=None,
+        heads_share=True,
+        p_dropout=0.0,
+        proximal_bias=False,
+        proximal_init=False,
+    ):
+        super(MultiHeadAttention, self).__init__()
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.proximal_bias = proximal_bias
+        self.p_dropout = p_dropout
+        self.attn = None
+
+        self.k_channels = channels // n_heads
+        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
+        if window_size is not None:
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels**-0.5
+            self.emb_rel_k = torch.nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+            self.emb_rel_v = torch.nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
+        self.drop = torch.nn.Dropout(p_dropout)
+
+        torch.nn.init.xavier_uniform_(self.conv_q.weight)
+        torch.nn.init.xavier_uniform_(self.conv_k.weight)
+        if proximal_init:
+            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
+            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
+        torch.nn.init.xavier_uniform_(self.conv_v.weight)
+
+    def forward(self, x, c, attn_mask=None):
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
+        if self.window_size is not None:
+            assert (
+                t_s == t_t
+            ), "Relative attention is only available for self-attention."
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
+            rel_logits = self._relative_position_to_absolute_position(rel_logits)
+            scores_local = rel_logits / math.sqrt(self.k_channels)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(
+                device=scores.device, dtype=scores.dtype
+            )
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, -1e4)
+        p_attn = torch.nn.functional.softmax(scores, dim=-1)
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(
+                self.emb_rel_v, t_s
+            )
+            output = output + self._matmul_with_relative_values(
+                relative_weights, value_relative_embeddings
+            )
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
+        return output, p_attn
+
+    def _matmul_with_relative_values(self, x, y):
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+
+    def _matmul_with_relative_keys(self, x, y):
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = torch.nn.functional.pad(
+                relative_embeddings,
+                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[
+            :, slice_start_position:slice_end_position
+        ]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        batch, heads, length, _ = x.size()
+        x = torch.nn.functional.pad(
+            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
+        )
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = torch.nn.functional.pad(
+            x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+        )
+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+            :, :, :length, length - 1 :
+        ]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        batch, heads, length, _ = x.size()
+        x = torch.nn.functional.pad(
+            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+        )
+        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+        x_flat = torch.nn.functional.pad(
+            x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
+        )
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(BaseModule):
+    def __init__(
+        self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0
+    ):
+        super(FFN, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+
+        self.conv_1 = torch.nn.Conv1d(
+            in_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import math
|
7 |
+
import torch
|
8 |
+
import numpy as np
|
9 |
+
import torch.nn as nn
|
10 |
+
from utils.util import convert_pad_shape
|
11 |
+
|
12 |
+
|
13 |
+
class BaseModule(torch.nn.Module):
|
14 |
+
def __init__(self):
|
15 |
+
super(BaseModule, self).__init__()
|
16 |
+
|
17 |
+
@property
|
18 |
+
def nparams(self):
|
19 |
+
"""
|
20 |
+
Returns number of trainable parameters of the module.
|
21 |
+
"""
|
22 |
+
num_params = 0
|
23 |
+
for name, param in self.named_parameters():
|
24 |
+
if param.requires_grad:
|
25 |
+
num_params += np.prod(param.detach().cpu().numpy().shape)
|
26 |
+
return num_params
|
27 |
+
|
28 |
+
def relocate_input(self, x: list):
|
29 |
+
"""
|
30 |
+
Relocates provided tensors to the same device set for the module.
|
31 |
+
"""
|
32 |
+
device = next(self.parameters()).device
|
33 |
+
for i in range(len(x)):
|
34 |
+
if isinstance(x[i], torch.Tensor) and x[i].device != device:
|
35 |
+
x[i] = x[i].to(device)
|
36 |
+
return x
|
37 |
+
|
38 |
+
|
39 |
+
class LayerNorm(BaseModule):
|
40 |
+
def __init__(self, channels, eps=1e-4):
|
41 |
+
super(LayerNorm, self).__init__()
|
42 |
+
self.channels = channels
|
43 |
+
self.eps = eps
|
44 |
+
|
45 |
+
self.gamma = torch.nn.Parameter(torch.ones(channels))
|
46 |
+
self.beta = torch.nn.Parameter(torch.zeros(channels))
|
47 |
+
|
48 |
+
def forward(self, x):
|
49 |
+
n_dims = len(x.shape)
|
50 |
+
mean = torch.mean(x, 1, keepdim=True)
|
51 |
+
variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
|
52 |
+
|
53 |
+
x = (x - mean) * torch.rsqrt(variance + self.eps)
|
54 |
+
|
55 |
+
shape = [1, -1] + [1] * (n_dims - 2)
|
56 |
+
x = x * self.gamma.view(*shape) + self.beta.view(*shape)
|
57 |
+
return x
|
58 |
+
|
59 |
+
|
60 |
+
class ConvReluNorm(BaseModule):
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
in_channels,
|
64 |
+
hidden_channels,
|
65 |
+
out_channels,
|
66 |
+
kernel_size,
|
67 |
+
n_layers,
|
68 |
+
p_dropout,
|
69 |
+
eps=1e-5,
|
70 |
+
):
|
71 |
+
super(ConvReluNorm, self).__init__()
|
72 |
+
self.in_channels = in_channels
|
73 |
+
self.hidden_channels = hidden_channels
|
74 |
+
self.out_channels = out_channels
|
75 |
+
self.kernel_size = kernel_size
|
76 |
+
self.n_layers = n_layers
|
77 |
+
self.p_dropout = p_dropout
|
78 |
+
self.eps = eps
|
79 |
+
|
80 |
+
self.conv_layers = torch.nn.ModuleList()
|
81 |
+
self.conv_layers.append(
|
82 |
+
torch.nn.Conv1d(
|
83 |
+
in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
|
84 |
+
)
|
85 |
+
)
|
86 |
+
self.relu_drop = torch.nn.Sequential(
|
87 |
+
torch.nn.ReLU(), torch.nn.Dropout(p_dropout)
|
88 |
+
)
|
89 |
+
for _ in range(n_layers - 1):
|
90 |
+
self.conv_layers.append(
|
91 |
+
torch.nn.Conv1d(
|
92 |
+
hidden_channels,
|
93 |
+
hidden_channels,
|
94 |
+
kernel_size,
|
95 |
+
padding=kernel_size // 2,
|
96 |
+
)
|
97 |
+
)
|
98 |
+
self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
|
99 |
+
self.proj.weight.data.zero_()
|
100 |
+
self.proj.bias.data.zero_()
|
101 |
+
|
102 |
+
def forward(self, x, x_mask):
|
103 |
+
for i in range(self.n_layers):
|
104 |
+
x = self.conv_layers[i](x * x_mask)
|
105 |
+
x = self.instance_norm(x, x_mask)
|
106 |
+
x = self.relu_drop(x)
|
107 |
+
x = self.proj(x)
|
108 |
+
return x * x_mask
|
109 |
+
|
110 |
+
def instance_norm(self, x, mask, return_mean_std=False):
|
111 |
+
mean, std = self.calc_mean_std(x, mask)
|
112 |
+
x = (x - mean) / std
|
113 |
+
if return_mean_std:
|
114 |
+
return x, mean, std
|
115 |
+
else:
|
116 |
+
return x
|
117 |
+
|
118 |
+
def calc_mean_std(self, x, mask=None):
|
119 |
+
x = x * mask
|
120 |
+
B, C = x.shape[:2]
|
121 |
+
mn = x.view(B, C, -1).mean(-1)
|
122 |
+
sd = (x.view(B, C, -1).var(-1) + self.eps).sqrt()
|
123 |
+
mn = mn.view(B, C, *((len(x.shape) - 2) * [1]))
|
124 |
+
        sd = sd.view(B, C, *((len(x.shape) - 2) * [1]))
        return mn, sd


class MultiHeadAttention(BaseModule):
    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        window_size=None,
        heads_share=True,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=False,
    ):
        super(MultiHeadAttention, self).__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.window_size = window_size
        self.heads_share = heads_share
        self.proximal_bias = proximal_bias
        self.p_dropout = p_dropout
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = torch.nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = torch.nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
        self.drop = torch.nn.Dropout(p_dropout)

        torch.nn.init.xavier_uniform_(self.conv_q.weight)
        torch.nn.init.xavier_uniform_(self.conv_k.weight)
        if proximal_init:
            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
        torch.nn.init.xavier_uniform_(self.conv_v.weight)

    def forward(self, x, c, attn_mask=None):
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
            rel_logits = self._relative_position_to_absolute_position(rel_logits)
            scores_local = rel_logits / math.sqrt(self.k_channels)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
        p_attn = torch.nn.functional.softmax(scores, dim=-1)
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = torch.nn.functional.pad(
                relative_embeddings,
                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
        )
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = torch.nn.functional.pad(
            x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        x_flat = torch.nn.functional.pad(
            x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
        )
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


class FFN(BaseModule):
    def __init__(
        self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0
    ):
        super(FFN, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.conv_1 = torch.nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=kernel_size // 2
        )
        self.conv_2 = torch.nn.Conv1d(
            filter_channels, out_channels, kernel_size, padding=kernel_size // 2
        )
        self.drop = torch.nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        return x * x_mask


class Encoder(BaseModule):
    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads=2,
        n_layers=6,
        kernel_size=3,
        p_dropout=0.1,
        window_size=4,
        **kwargs
    ):
        super(Encoder, self).__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = torch.nn.Dropout(p_dropout)
        self.attn_layers = torch.nn.ModuleList()
        self.norm_layers_1 = torch.nn.ModuleList()
        self.ffn_layers = torch.nn.ModuleList()
        self.norm_layers_2 = torch.nn.ModuleList()
        for _ in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    window_size=window_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        for i in range(self.n_layers):
            x = x * x_mask
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)
            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class Conformer(BaseModule):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.n_heads = self.cfg.n_heads
        self.n_layers = self.cfg.n_layers
        self.hidden_channels = self.cfg.input_dim
        self.filter_channels = self.cfg.filter_channels
        self.output_dim = self.cfg.output_dim
        self.dropout = self.cfg.dropout

        self.conformer_encoder = Encoder(
            self.hidden_channels,
            self.filter_channels,
            n_heads=self.n_heads,
            n_layers=self.n_layers,
            kernel_size=3,
            p_dropout=self.dropout,
            window_size=4,
        )
        self.projection = nn.Conv1d(self.hidden_channels, self.output_dim, 1)

    def forward(self, x, x_mask):
        """
        Args:
            x: (N, seq_len, input_dim)
        Returns:
            output: (N, seq_len, output_dim)
        """
        # (N, seq_len, d_model)
        x = x.transpose(1, 2)
        x_mask = x_mask.transpose(1, 2)
        output = self.conformer_encoder(x, x_mask)
        # (N, seq_len, output_dim)
        output = self.projection(output)
        output = output.transpose(1, 2)
        return output
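A minimal usage sketch for the Conformer mapper above (not part of the commit): the `cfg` fields mirror what `Conformer.__init__` reads, while the concrete sizes, batch shape, and mask layout are illustrative assumptions.

# Minimal sketch; sizes and mask layout are assumptions for illustration.
import torch
from types import SimpleNamespace
from models.svc.transformer.conformer import Conformer

cfg = SimpleNamespace(
    input_dim=384,         # hidden size; must be divisible by n_heads
    output_dim=100,        # e.g. number of mel bins
    filter_channels=1536,  # FFN inner size
    n_heads=4,
    n_layers=6,
    dropout=0.1,
)
model = Conformer(cfg)

x = torch.randn(2, 128, cfg.input_dim)  # (N, seq_len, input_dim)
x_mask = torch.ones(2, 128, 1)          # 1.0 for valid frames, 0.0 for padding

y = model(x, x_mask)                    # (N, seq_len, output_dim) -> (2, 128, 100)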
models/svc/transformer/transformer.py
ADDED
@@ -0,0 +1,82 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer


class Transformer(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        dropout = self.cfg.dropout
        nhead = self.cfg.n_heads
        nlayers = self.cfg.n_layers
        input_dim = self.cfg.input_dim
        output_dim = self.cfg.output_dim

        d_model = input_dim
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(
            d_model, nhead, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        self.output_mlp = nn.Linear(d_model, output_dim)

    def forward(self, x, mask=None):
        """
        Args:
            x: (N, seq_len, input_dim)
        Returns:
            output: (N, seq_len, output_dim)
        """
        # (N, seq_len, d_model)
        src = self.pos_encoder(x)
        # model_stats["pos_embedding"] = x
        # (N, seq_len, d_model)
        output = self.transformer_encoder(src)
        # (N, seq_len, output_dim)
        output = self.output_mlp(output)
        return output


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )

        # Assume that x is (seq_len, N, d)
        # pe = torch.zeros(max_len, 1, d_model)
        # pe[:, 0, 0::2] = torch.sin(position * div_term)
        # pe[:, 0, 1::2] = torch.cos(position * div_term)

        # Assume that x is (N, seq_len, d)
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)

        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [N, seq_len, d]
        """
        # Old: Assume that x is (seq_len, N, d), and self.pe is (max_len, 1, d_model)
        # x = x + self.pe[: x.size(0)]

        # Now: self.pe is (1, max_len, d)
        x = x + self.pe[:, : x.size(1), :]

        return self.dropout(x)
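A matching usage sketch for the Transformer mapper above (not part of the commit): the `cfg` fields mirror what `Transformer.__init__` reads; the sizes are illustrative assumptions, and `input_dim` must be divisible by `n_heads` for `TransformerEncoderLayer`.

# Minimal sketch; sizes are assumptions for illustration.
import torch
from types import SimpleNamespace
from models.svc.transformer.transformer import Transformer

cfg = SimpleNamespace(
    input_dim=384,   # also used as d_model
    output_dim=100,
    n_heads=4,
    n_layers=6,
    dropout=0.1,
)
model = Transformer(cfg)

x = torch.randn(2, 128, cfg.input_dim)  # (N, seq_len, input_dim)
y = model(x)                            # (N, seq_len, output_dim) -> (2, 128, 100)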
models/svc/transformer/transformer_inference.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import time
import numpy as np
import torch
from tqdm import tqdm
import torch.nn as nn
from collections import OrderedDict

from models.svc.base import SVCInference
from modules.encoder.condition_encoder import ConditionEncoder
from models.svc.transformer.transformer import Transformer
from models.svc.transformer.conformer import Conformer


class TransformerInference(SVCInference):
    def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
        SVCInference.__init__(self, args, cfg, infer_type)

    def _build_model(self):
        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
        if self.cfg.model.transformer.type == "transformer":
            self.acoustic_mapper = Transformer(self.cfg.model.transformer)
        elif self.cfg.model.transformer.type == "conformer":
            self.acoustic_mapper = Conformer(self.cfg.model.transformer)
        else:
            raise NotImplementedError
        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
        return model

    def _inference_each_batch(self, batch_data):
        device = self.accelerator.device
        for k, v in batch_data.items():
            batch_data[k] = v.to(device)

        condition = self.condition_encoder(batch_data)
        y_pred = self.acoustic_mapper(condition, batch_data["mask"])

        return y_pred
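For reference, a sketch (not part of the commit) of the `cfg.model.transformer` subtree that the dispatch in `_build_model()` reads: `type` selects `Transformer` versus `Conformer`, and the remaining fields are consumed by the chosen mapper's `__init__`. The concrete values below are illustrative assumptions, not the values shipped in `exp_config.json`.

# Illustrative assumption: field names come from the code above, values are examples.
transformer_cfg = {
    "type": "conformer",      # or "transformer"
    "input_dim": 384,
    "output_dim": 100,
    "filter_channels": 1536,  # read by the Conformer branch only
    "n_heads": 4,
    "n_layers": 6,
    "dropout": 0.1,
}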
models/svc/transformer/transformer_trainer.py
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from models.svc.base import SVCTrainer
from modules.encoder.condition_encoder import ConditionEncoder
from models.svc.transformer.transformer import Transformer
from models.svc.transformer.conformer import Conformer
from utils.ssim import SSIM


class TransformerTrainer(SVCTrainer):
    def __init__(self, args, cfg):
        SVCTrainer.__init__(self, args, cfg)
        self.ssim_loss = SSIM()

    def _build_model(self):
        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
        if self.cfg.model.transformer.type == "transformer":
            self.acoustic_mapper = Transformer(self.cfg.model.transformer)
        elif self.cfg.model.transformer.type == "conformer":
            self.acoustic_mapper = Conformer(self.cfg.model.transformer)
        else:
            raise NotImplementedError
        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
        return model

    def _forward_step(self, batch):
        total_loss = 0
        device = self.accelerator.device
        mel = batch["mel"]
        mask = batch["mask"]

        condition = self.condition_encoder(batch)
        mel_pred = self.acoustic_mapper(condition, mask)

        l1_loss = torch.sum(torch.abs(mel_pred - mel) * batch["mask"]) / torch.sum(
            batch["mask"]
        )
        self._check_nan(l1_loss, mel_pred, mel)
        total_loss += l1_loss
        ssim_loss = self.ssim_loss(mel_pred, mel)
        ssim_loss = torch.sum(ssim_loss * batch["mask"]) / torch.sum(batch["mask"])
        self._check_nan(ssim_loss, mel_pred, mel)
        total_loss += ssim_loss

        return total_loss
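A small sketch (not part of the commit) of the mask-weighted L1 term used in `_forward_step` above; the SSIM term is accumulated the same way. It assumes a frame-level mask of shape (N, seq_len, 1) that broadcasts over the mel bins, whereas the real mask layout is defined by the SVC dataset collator.

# Sketch of the masked L1 objective; mask shape is an assumption for illustration.
import torch

def masked_l1(mel_pred: torch.Tensor, mel: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # Weight absolute errors by the mask, then normalize by the mask sum,
    # as in the trainer above, so padded frames do not dilute the loss.
    return torch.sum(torch.abs(mel_pred - mel) * mask) / torch.sum(mask)

mel = torch.randn(2, 128, 100)       # (N, seq_len, n_mels)
mel_pred = torch.randn(2, 128, 100)
mask = torch.ones(2, 128, 1)         # 1.0 for valid frames, 0.0 for padding
loss = masked_l1(mel_pred, mel, mask)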