Spaces:

gradio
/

omni-mini

Running on T4

App Files Community

freddyaboulton HF staff committed on Sep 23

Commit

2084afa

•

1 Parent(s): c4d6bf6

try again

Browse files

Files changed (2) hide show

app.py +48 -44
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -51,7 +51,7 @@ OUT_CHANNELS = 1
 def run_vad(ori_audio, sr):
     _st = time.time()
     try:
-        audio = np.frombuffer(ori_audio, dtype=np.int16)
         audio = audio.astype(np.float32) / 32768.0
         sampling_rate = 16000
         if sr != sampling_rate:
@@ -87,42 +87,32 @@ def warm_up():
 warm_up()
-def determine_pause(stream: bytes, start_talking: bool) -> tuple[bool, bool]:
     """Take in the stream, determine if a pause happened"""
-    temp_audio = stream
-    if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
-        dur_vad, _, time_vad = run_vad(temp_audio, IN_RATE)
-        print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-        if dur_vad > 0.2 and not start_talking:
-            start_talking = True
-            pause = False
-            return pause, start_talking
-        if dur_vad < 0.1 and start_talking:
-            print("pause detected")
-            return True, start_talking
-        return False, start_talking
-    return False, start_talking
-def speaking(total_frames: bytes):
     audio_buffer = io.BytesIO()
-    wf = wave.open(audio_buffer, "wb")
-    wf.setnchannels(IN_CHANNELS)
-    wf.setsampwidth(IN_SAMPLE_WIDTH)
-    wf.setframerate(IN_RATE)
-    dur = len(total_frames) / (IN_RATE * IN_CHANNELS * IN_SAMPLE_WIDTH)
-    print(f"Speaking... recorded audio duration: {dur:.3f} s")
-    wf.writeframes(total_frames)
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
-        with open(tmpfile.name, "wb") as f:
-            f.write(audio_buffer.getvalue())
     audio_bytes = audio_buffer.getvalue()
@@ -152,31 +142,38 @@ def speaking(total_frames: bytes):
         except Exception as e:
             raise gr.Error(f"Error during audio streaming: {e}")
-    wf.close()
 @dataclass
 class AppState:
-    start_talking: bool = False
-    stream: bytes = b""
     pause_detected: bool = False
-def process_audio(audio: str, state: AppState):
-    state.stream += Path(audio).read_bytes()
-    pause_detected, start_talking = determine_pause(state.stream, state.pause_detected)
     state.pause_detected = pause_detected
-    state.start_talking = start_talking
-    if not state.pause_detected:
-        yield None, state
-    for out_bytes in speaking(state.stream):
-        yield out_bytes, state
-    state = AppState()
-    yield None, state
 with gr.Blocks() as demo:
@@ -189,13 +186,20 @@ with gr.Blocks() as demo:
             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
     state = gr.State(value=AppState())
-    input_audio.stop_recording(
         process_audio,
         [input_audio, state],
-        [output_audio, state],
         stream_every=0.5,
         time_limit=30,
     )
 demo.launch()

 def run_vad(ori_audio, sr):
     _st = time.time()
     try:
+        audio = ori_audio
         audio = audio.astype(np.float32) / 32768.0
         sampling_rate = 16000
         if sr != sampling_rate:
 warm_up()
+def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
     """Take in the stream, determine if a pause happened"""
+    temp_audio = audio
+    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
+    duration = len(audio) / sampling_rate
+    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return (duration - dur_vad) > 0.5
+def speaking(audio: np.ndarray, sampling_rate: int):
     audio_buffer = io.BytesIO()
+    audio = AudioSegment(
+        data.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=data.dtype.itemsize,
+        channels=(1 if len(data.shape) == 1 else data.shape[1]),
+    )
+    file = audio.export(audio_buffer, format="wav")
+    with open("input_audio.wav", "wb") as f:
+        f.write(audio_buffer.getvalue())
     audio_bytes = audio_buffer.getvalue()
         except Exception as e:
             raise gr.Error(f"Error during audio streaming: {e}")
 @dataclass
 class AppState:
+    stream: np.ndarray | None = None
+    sampling_rate: int = 0
     pause_detected: bool = False
+def process_audio(audio: tuple, state: AppState):
+    if state.stream is None:
+        state.stream = audio[1]
+        state.sampling_rate = audio[0]
+    else:
+        state.stream =  np.concatenate((state.stream, audio[1]))
+    pause_detected = determine_pause(state.stream, state.sampling_rate)
     state.pause_detected = pause_detected
+    if state.pause_detected:
+        return gr.Audio(recording=False), state
+    return None, state
+def response(state: AppState):
+    if not state.pause_detected:
+        return None, None, AppState()
+    for mp3_bytes in speaking(state.stream, state.sampling_rate):
+        yield None, mp3_bytes, state
+    yield gr.Audio(recording=True), None, AppState()
 with gr.Blocks() as demo:
             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
     state = gr.State(value=AppState())
+    stream = input_audio.stream(
         process_audio,
         [input_audio, state],
+        [input_audio, state],
         stream_every=0.5,
         time_limit=30,
     )
+    respond = inp.stop_recording(
+        response,
+        [state],
+        [input_audio, output_audio, state]
+    )
+    cancel = gr.Button("Stop Conversation", variant="stop")
+    cancel.click(lambda: AppState(), None, [state], cancels=[respond])
 demo.launch()

requirements.txt CHANGED Viewed

@@ -11,7 +11,7 @@ streamlit==1.37.1
 pydub==0.25.1
 onnxruntime==1.19.0
 # numpy==1.26.3
-https://gradio-builds.s3.amazonaws.com/5.0-dev/e2157efe20cdec2454b0b5d312fad00b2b5bfe1c/gradio-5.0.0b1-py3-none-any.whl
 fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3

 pydub==0.25.1
 onnxruntime==1.19.0
 # numpy==1.26.3
+https://gradio-builds.s3.amazonaws.com/e3011b3b19ee8f7b7fc2dbba848d56a0b30b6cdb/gradio-5.0.0b1-py3-none-any.whl
 fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3