Spaces:

gradio
/

omni-mini

Running on T4

App Files Files Community

freddyaboulton HF staff committed on Sep 23

Commit

5f58cac

•

1 Parent(s): 6cba8bb

Stop recording

Browse files

Files changed (2) hide show

app.py +30 -21
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -7,10 +7,7 @@ import numpy as np
 import requests
 import traceback
 from dataclasses import dataclass
-from pathlib import Path
 import io
-import wave
-import tempfile
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
@@ -87,7 +84,15 @@ def warm_up():
 warm_up()
-def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
     """Take in the stream, determine if a pause happened"""
     temp_audio = audio
@@ -95,6 +100,11 @@ def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
     dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
     duration = len(audio) / sampling_rate
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
     return (duration - dur_vad) > 0.5
@@ -102,14 +112,14 @@ def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
 def speaking(audio: np.ndarray, sampling_rate: int):
     audio_buffer = io.BytesIO()
-    audio = AudioSegment(
         audio.tobytes(),
         frame_rate=sampling_rate,
-        sample_width=data.dtype.itemsize,
-        channels=(1 if len(data.shape) == 1 else data.shape[1]),
     )
-    file = audio.export(audio_buffer, format="wav")
     with open("input_audio.wav", "wb") as f:
         f.write(audio_buffer.getvalue())
@@ -144,12 +154,6 @@ def speaking(audio: np.ndarray, sampling_rate: int):
-@dataclass
-class AppState:
-    stream: np.ndarray | None = None
-    sampling_rate: int = 0
-    pause_detected: bool = False
 def process_audio(audio: tuple, state: AppState):
     if state.stream is None:
@@ -158,22 +162,22 @@ def process_audio(audio: tuple, state: AppState):
     else:
         state.stream =  np.concatenate((state.stream, audio[1]))
-    pause_detected = determine_pause(state.stream, state.sampling_rate)
     state.pause_detected = pause_detected
-    if state.pause_detected:
         return gr.Audio(recording=False), state
     return None, state
 def response(state: AppState):
     if not state.pause_detected:
-        return None, None, AppState()
     for mp3_bytes in speaking(state.stream, state.sampling_rate):
-        yield None, mp3_bytes, state
-    yield gr.Audio(recording=True), None, AppState()
 with gr.Blocks() as demo:
@@ -196,7 +200,12 @@ with gr.Blocks() as demo:
     respond = input_audio.stop_recording(
         response,
         [state],
-        [input_audio, output_audio, state]
     )
     cancel = gr.Button("Stop Conversation", variant="stop")
     cancel.click(lambda: AppState(), None, [state], cancels=[respond])

 import requests
 import traceback
 from dataclasses import dataclass
 import io
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
 warm_up()
+@dataclass
+class AppState:
+    stream: np.ndarray | None = None
+    sampling_rate: int = 0
+    pause_detected: bool = False
+    started_talking = False
+def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     """Take in the stream, determine if a pause happened"""
     temp_audio = audio
     dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
     duration = len(audio) / sampling_rate
+    if dur_vad > 0.5 and not state.started_talking:
+        print("started talking")
+        state.started_talking = True
+        return False
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
     return (duration - dur_vad) > 0.5
 def speaking(audio: np.ndarray, sampling_rate: int):
     audio_buffer = io.BytesIO()
+    segment = AudioSegment(
         audio.tobytes(),
         frame_rate=sampling_rate,
+        sample_width=audio.dtype.itemsize,
+        channels=(1 if len(audio.shape) == 1 else audio.shape[1]),
     )
+    segment.export(audio_buffer, format="wav")
     with open("input_audio.wav", "wb") as f:
         f.write(audio_buffer.getvalue())
 def process_audio(audio: tuple, state: AppState):
     if state.stream is None:
     else:
         state.stream =  np.concatenate((state.stream, audio[1]))
+    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
     state.pause_detected = pause_detected
+    if state.pause_detected and state.started_talking:
         return gr.Audio(recording=False), state
     return None, state
 def response(state: AppState):
     if not state.pause_detected:
+        return None, AppState()
     for mp3_bytes in speaking(state.stream, state.sampling_rate):
+        yield mp3_bytes, state
+    yield None, AppState()
 with gr.Blocks() as demo:
     respond = input_audio.stop_recording(
         response,
         [state],
+        [output_audio, state]
+    )
+    output_audio.stop(
+        lambda: gr.Audio(recording=True),
+        None,
+        [input_audio]
     )
     cancel = gr.Button("Stop Conversation", variant="stop")
     cancel.click(lambda: AppState(), None, [state], cancels=[respond])

requirements.txt CHANGED Viewed

@@ -11,7 +11,7 @@ streamlit==1.37.1
 pydub==0.25.1
 onnxruntime==1.19.0
 # numpy==1.26.3
-https://gradio-builds.s3.amazonaws.com/e3011b3b19ee8f7b7fc2dbba848d56a0b30b6cdb/gradio-5.0.0b1-py3-none-any.whl
 fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3

 pydub==0.25.1
 onnxruntime==1.19.0
 # numpy==1.26.3
+https://gradio-builds.s3.amazonaws.com/cffe9a7ab7f71e76d7214dc57c6278ffaf5bcdf9/gradio-5.0.0b1-py3-none-any.whl
 fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3