gorkemgoknar committed
Commit 3b69bc5
1 Parent(s): 4281c8c

make voice cleaner optional

Files changed (1)
  1. app.py +65 -25
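In outline, the commit threads a new `voice_cleanup` flag from a "Cleanup Reference Voice" checkbox into `predict()` and adds a third output that returns the reference audio actually used. A minimal sketch of that wiring, assuming Gradio 3.x as the Space used at the time; the stub body, the `demo` name, and the shortened labels/choices are illustrative, not from the commit:

import gradio as gr

def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree):
    # Stub: the real Space runs XTTS here; this only echoes the chosen reference back.
    speaker_wav = mic_file_path if use_mic else audio_file_pth
    # voice_cleanup would gate the ffmpeg filtering pass added in the diff below.
    return None, speaker_wav, speaker_wav

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Text Prompt"),
        gr.Dropdown(label="Language", choices=["en", "es", "fr", "de"]),
        gr.Audio(label="Reference Audio", type="filepath"),
        gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference"),
        gr.Checkbox(label="Use Microphone", value=False),
        gr.Checkbox(label="Cleanup Reference Voice", value=False,
                    info="This check can improve output if your microphone or reference voice is noisy"),
        gr.Checkbox(label="Do not use language auto-detect", value=False),
        gr.Checkbox(label="Agree", value=False),
    ],
    outputs=[
        gr.Video(label="Waveform Visual"),
        gr.Audio(label="Synthesised Audio"),
        gr.Audio(label="Reference Audio Used"),
    ],
)

if __name__ == "__main__":
    demo.queue().launch()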
app.py CHANGED
@@ -37,7 +37,7 @@ DEVICE_ASSERT_DETECTED=0
 DEVICE_ASSERT_PROMPT=None
 DEVICE_ASSERT_LANG=None

-def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_auto_detect, agree):
+def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
     if agree == True:
         supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]

@@ -47,6 +47,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
             return (
                 None,
                 None,
+                None,
             )

         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
@@ -70,54 +71,71 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
             return (
                 None,
                 None,
+                None,
             )


         if use_mic == True:
             if mic_file_path is not None:
-                try:
-                    # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
-                    # This is fast filtering not perfect
-
-                    # better to remove silence in beginning and end for microphone
-                    trim_silence="areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
-
-                    speechnorm="e=6.25:r=0.00001:l=1,"
-
-                    out_filename = mic_file_path + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
-
-                    #we will use newer ffmpeg as that has afftn denoise filter
-                    shell_command = f"./ffmpeg -y -i {mic_file_path} -af {trim_silence},{speechnorm} {out_filename}".split(" ")
-
-                    command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
-                    speaker_wav=out_filename
-                    print("Filtered microphone input")
-                except subprocess.CalledProcessError:
-                    # There was an error - command exited with non-zero code
-                    print("Error: failed filtering, use original microphone input")
-                    speaker_wav=mic_file_path
+                speaker_wav=mic_file_path
             else:
                 gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
                 return (
                     None,
                     None,
+                    None,
                 )

         else:
             speaker_wav=audio_file_pth
+
+
+        # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
+        # This is fast filtering not perfect
+
+        # Apply all on demand
+        lowpassfilter=denoise=trim=loudness=True
+
+        if lowpassfilter:
+            lowpass_highpass="lowpass=8000,highpass=75,"
+        else:
+            lowpass_highpass=""
+
+        if trim:
+            # better to remove silence in beginning and end for microphone
+            trim_silence="areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+        else:
+            trim_silence=""

+        if (voice_cleanup):
+            try:
+                out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
+
+                #we will use newer ffmpeg as that has afftn denoise filter
+                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
+
+                command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
+                speaker_wav=out_filename
+                print("Filtered microphone input")
+            except subprocess.CalledProcessError:
+                # There was an error - command exited with non-zero code
+                print("Error: failed filtering, use original microphone input")
+        else:
+            speaker_wav=speaker_wav

         if len(prompt)<2:
             gr.Warning("Please give a longer prompt text")
             return (
                 None,
                 None,
+                None,
             )
         if len(prompt)>200:
             gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
             return (
                 None,
                 None,
+                None,
             )
         global DEVICE_ASSERT_DETECTED
         if DEVICE_ASSERT_DETECTED:
@@ -155,12 +173,14 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
                 audio="output.wav",
             ),
             "output.wav",
+            speaker_wav,
         )
     else:
         gr.Warning("Please accept the Terms & Condition!")
         return (
             None,
             None,
+            None,
         )


@@ -200,7 +220,9 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
+
     ],
     [
         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
@@ -209,7 +231,9 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
+        False,
     ],
     [
         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
@@ -218,6 +242,7 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
     ],
     [
@@ -227,6 +252,7 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
     ],
     [
@@ -236,6 +262,7 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
     ],
     [
@@ -245,6 +272,7 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
     ],
     [
@@ -252,6 +280,7 @@ examples = [
         "it",
         "examples/female.wav",
         None,
+        False,
         False,
         False,
         True,
@@ -263,6 +292,7 @@ examples = [
         None,
         False,
         False,
+        False,
         True,
     ],
     [
@@ -270,6 +300,7 @@ examples = [
         "ru",
         "examples/female.wav",
         None,
+        False,
         False,
         False,
         True,
@@ -279,6 +310,7 @@ examples = [
         "nl",
         "examples/male.wav",
         None,
+        False,
         False,
         False,
         True,
@@ -288,6 +320,7 @@ examples = [
         "cs",
         "examples/female.wav",
         None,
+        False,
         False,
         False,
         True,
@@ -297,6 +330,7 @@ examples = [
         "zh-cn",
         "examples/female.wav",
         None,
+        False,
         False,
         False,
         True,
@@ -344,9 +378,13 @@ gr.Interface(
                  type="filepath",
                  info="Use your microphone to record audio",
                  label="Use Microphone for Reference"),
-        gr.Checkbox(label="Check to use Microphone as Reference",
+        gr.Checkbox(label="Use Microphone",
                     value=False,
                     info="Notice: Microphone input may not work properly under traffic",),
+        gr.Checkbox(label="Cleanup Reference Voice",
+                    value=False,
+                    info="This check can improve output if your microphone or reference voice is noisy",
+                    ),
         gr.Checkbox(label="Do not use language auto-detect",
                     value=False,
                     info="Check to disable language auto-detection",),
@@ -355,14 +393,16 @@ gr.Interface(
                     value=False,
                     info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
                     ),
+
+
     ],
     outputs=[
         gr.Video(label="Waveform Visual"),
         gr.Audio(label="Synthesised Audio"),
+        gr.Audio(label="Reference Audio Used"),
     ],
     title=title,
     description=description,
     article=article,
     examples=examples,
-).queue().launch(debug=True)
-
+).queue().launch(debug=True)
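For reference, the cleanup pass added above reduces to composing an ffmpeg -af chain (low-pass/high-pass plus leading- and trailing-silence removal) and writing a filtered copy of the reference wav, falling back to the original file if filtering fails. A standalone sketch of that path, assuming a system ffmpeg on PATH instead of the Space's bundled ./ffmpeg, and covering only the lowpassfilter and trim switches this commit actually uses (the denoise and loudness flags are set but not wired up here); the cleanup_reference name is illustrative:

import subprocess
import uuid

def cleanup_reference(speaker_wav, lowpassfilter=True, trim=True):
    """Return the path of a filtered copy of speaker_wav, or the original path on failure."""
    lowpass_highpass = "lowpass=8000,highpass=75," if lowpassfilter else ""
    # silenceremove's start_* options only trim the head of a stream, so
    # reverse, trim, reverse, trim strips silence from both ends.
    trim_silence = (
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    ) if trim else ""

    audio_filters = (lowpass_highpass + trim_silence).rstrip(",")
    if not audio_filters:
        return speaker_wav  # nothing to do

    out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # extension tells ffmpeg the output format
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", speaker_wav, "-af", audio_filters, out_filename],
            check=True,
        )
        return out_filename
    except subprocess.CalledProcessError:
        print("Error: failed filtering, using original reference audio")
        return speaker_wav

In the Space itself this path runs only when the "Cleanup Reference Voice" checkbox is ticked; otherwise speaker_wav is passed to the model untouched.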