Mahiruoshi committed
Commit 69fa064 • 1 Parent: ea40339

Update app.py

This commit comments out the ASS-subtitle version of generate_audio_and_srt_for_group (wrapping it in a triple-quoted string tagged '''srt格式, "srt format"), adds an SRT-based replacement with two small helpers, and completes the groupSize slider configuration.
app.py CHANGED
@@ -189,7 +189,7 @@ def infer(
     torch.cuda.empty_cache()
     return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
 
-
+'''srt格式
 def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
     audio_fin = []
     ass_entries = []
@@ -244,6 +244,58 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
     with open(ass_filename, 'w', encoding='utf-8') as f:
         f.write(ass_header + '\n'.join(ass_entries))
     return (hps.data.sampling_rate, np.concatenate(audio_fin))
+'''
+
+
+def format_srt_timestamp(seconds):  # float seconds -> "HH:MM:SS,mmm"
+    ms = int((seconds - int(seconds)) * 1000)
+    seconds = int(seconds)
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    return f"{hours:02}:{minutes:02}:{seconds:02},{ms:03}"
+
+def clean_sentence(sentence):  # strip newlines and spaces before synthesis
+    return sentence.replace('\n', '').replace('\r', '').replace(' ', '')
+
+def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime):
+    audio_fin = []
+    srt_entries = []
+    start_time = 0
+
+    for i, sentence in enumerate(group):
+        try:
+            FakeSpeaker = sentence.split("|")[0]
+            SpeakersList = re.split('\n', spealerList)
+            if FakeSpeaker in list(hps.data.spk2id.keys()):
+                speaker = FakeSpeaker
+            for s in SpeakersList:
+                if FakeSpeaker == s.split("|")[1]:
+                    speaker = s.split("|")[0]
+            if len(sentence) > 2:  # skip blank/whitespace-only lines
+                clean_msg = clean_sentence(sentence.split("|")[-1])
+                audio = infer_simple((remove_annotations(clean_msg) + "。").replace(",。", "。").replace("。。", "。"), sdp_ratio, noise_scale, noise_scale_w, length_scale, speaker)
+                silence_frames = int(silenceTime * 44100)  # silence is sized at a hard-coded 44100 Hz
+                silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
+                audio_fin.append(audio)
+                audio_fin.append(silence_data)
+
+                duration = len(audio) / sampling_rate
+                end_time = start_time + duration + silenceTime
+                srt_entries.append(f"{i+1}\n{format_srt_timestamp(start_time)} --> {format_srt_timestamp(end_time)}\n{clean_msg.replace('|', ':')}\n\n")
+                start_time = end_time
+        except Exception:  # skip sentences that fail to synthesize
+            pass
+
+    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
+    srt_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.srt')
+
+    write(wav_filename, sampling_rate, np.concatenate(audio_fin))
+
+    with open(srt_filename, 'w', encoding='utf-8') as f:
+        f.writelines(srt_entries)
+    return (hps.data.sampling_rate, np.concatenate(audio_fin))
+
 
 
 def infer_simple(
@@ -446,7 +498,7 @@ if __name__ == "__main__":
                     value="つくし|なんではるひかげやったの?!!",
                 )
                 groupSize = gr.Slider(
-                    minimum=10, maximum=
+                    minimum=10, maximum=1000000 if torch.cuda.is_available() else 50, value=50, step=1, label="单个音频文件包含的最大字数"
                 )
                 silenceTime = gr.Slider(
                     minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
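One inconsistency worth flagging in the new function: silence between sentences is sized with a hard-coded 44100 Hz (silence_frames = int(silenceTime * 44100)), while subtitle durations divide by the sampling_rate argument, so audio and SRT timing drift apart whenever the model rate is not 44.1 kHz. A minimal sketch of rate-consistent padding; make_silence is a hypothetical helper, not part of the commit:

    import numpy as np

    def make_silence(silence_seconds, sampling_rate, dtype=np.float32):
        # Derive the pad length from the actual model rate instead of a
        # hard-coded 44100, so audio length and SRT timing stay in step.
        return np.zeros(int(silence_seconds * sampling_rate), dtype=dtype)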