Spaces:

gradio
/

omni-mini

Running on T4

App Files Files Community

freddyaboulton HF staff committed on Sep 18

Commit

d531709

•

1 Parent(s): 556b4ae

first response

Browse files

Files changed (1) hide show

app.py +93 -45

app.py CHANGED Viewed

@@ -1,13 +1,19 @@
 import gradio as gr
 from huggingface_hub import snapshot_download
 from threading import Thread
-import os
 import time
-import gradio as gr
 import base64
 import numpy as np
 import requests
 import traceback
 from server import serve
@@ -78,58 +84,100 @@ def warm_up():
 warm_up()
-def determine_pause(stream: bytes, start_talking: bool) -> tuple[bytes, bool]:
     """Take in the stream, determine if a pause happened"""
     temp_audio = stream
     if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
-        dur_vad, vad_audio_bytes, time_vad = run_vad(temp_audio, IN_RATE)
         print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
         if dur_vad > 0.2 and not start_talking:
-            if last_temp_audio is not None:
-                st.session_state.frames.append(last_temp_audio)
             start_talking = True
-        if start_talking:
-            st.session_state.frames.append(temp_audio)
         if dur_vad < 0.1 and start_talking:
-            st.session_state.recording = False
-            print(f"speech end detected. excit")
-        last_temp_audio = temp_audio
-        temp_audio = b""
-def process_audio(audio):
-    filepath = audio
-    print(f"filepath: {filepath}")
-    if filepath is None:
-        return
-    cnt = 0
-    with open(filepath, "rb") as f:
-        data = f.read()
-        base64_encoded = str(base64.b64encode(data), encoding="utf-8")
-        files = {"audio": base64_encoded}
-        tik = time.time()
-        with requests.post(API_URL, json=files, stream=True) as response:
-            try:
-                for chunk in response.iter_content(chunk_size=OUT_CHUNK):
-                    if chunk:
-                        # Convert chunk to numpy array
-                        if cnt == 0:
-                            print(f"first chunk time cost: {time.time() - tik:.3f}")
-                        cnt += 1
-                        audio_data = np.frombuffer(chunk, dtype=np.int16)
-                        audio_data = audio_data.reshape(-1, OUT_CHANNELS)
-                        yield OUT_RATE, audio_data.astype(np.int16)
-            except Exception as e:
-                print(f"error: {e}")
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()

 import gradio as gr
 from huggingface_hub import snapshot_download
 from threading import Thread
 import time
 import base64
 import numpy as np
 import requests
 import traceback
+from dataclasses import dataclass
+from pathlib import Path
+import io
+import wave
+import tempfile
+import librosa
+from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
 from server import serve
 warm_up()
+def determine_pause(stream: bytes, start_talking: bool) -> tuple[bool, bool]:
     """Take in the stream, determine if a pause happened"""
+    # Returns (pause, start_talking). `pause` is True only when speech had
+    # already started and the VAD-measured duration is below 0.1 s;
+    # `start_talking` is the (possibly updated) "speech has begun" flag.
     temp_audio = stream
+    # VAD only runs once the buffer exceeds this byte count
+    # (presumably VAD_STRIDE seconds of PCM -- TODO confirm VAD_STRIDE units).
     if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
+        # run_vad is defined elsewhere; only its first and third results are
+        # used here, and both are logged as seconds.
+        dur_vad, _, time_vad = run_vad(temp_audio, IN_RATE)
         print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
         if dur_vad > 0.2 and not start_talking:
+            # More than 0.2 s of detected speech: mark that talking has begun.
             start_talking = True
+            pause = False
+            return pause, start_talking
         if dur_vad < 0.1 and start_talking:
+            # Talking had begun and almost no speech is detected: report a pause.
+            print("pause detected")
+            return True, start_talking
+        return False, start_talking
+    # Not enough audio buffered yet to run VAD.
+    return False, start_talking
+def speaking(total_frames: bytes):
+    audio_buffer = io.BytesIO()
+    wf = wave.open(audio_buffer, "wb")
+    wf.setnchannels(IN_CHANNELS)
+    wf.setsampwidth(IN_SAMPLE_WIDTH)
+    wf.setframerate(IN_RATE)
+    dur = len(total_frames) / (IN_RATE * IN_CHANNELS * IN_SAMPLE_WIDTH)
+    print(f"Speaking... recorded audio duration: {dur:.3f} s")
+    wf.writeframes(total_frames)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+        with open(tmpfile.name, "wb") as f:
+            f.write(audio_buffer.getvalue())
+    audio_bytes = audio_buffer.getvalue()
+    base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
+    files = {"audio": base64_encoded}
+    with requests.post(API_URL, json=files, stream=True) as response:
+        try:
+            for chunk in response.iter_content(chunk_size=OUT_CHUNK):
+                if chunk:
+                    yield chunk
+                    # # Convert chunk to numpy array
+                    # output_audio_bytes += chunk
+                    # audio_data = np.frombuffer(chunk, dtype=np.int8)
+                    # # Play audio
+                    # stream.write(audio_data)
+        except Exception as e:
+            raise gr.Error(f"Error during audio streaming: {e}")
+    wf.close()
+@dataclass
+class AppState:
+    """State object passed into and returned from process_audio via gr.State."""
+    # Second value returned by the latest determine_pause call.
+    start_talking: bool = False
+    # Bytes of every input chunk read so far, concatenated.
+    stream: bytes = b""
+    # First value returned by the latest determine_pause call.
+    pause_detected: bool = False
+def process_audio(audio: str, state: AppState):
+    state.stream += Path(audio).read_bytes()
+    pause_detected, start_talking = determine_pause(state.stream, state.pause_detected)
+    state.pause_detected = pause_detected
+    state.start_talking = start_talking
+    if not state.pause_detected:
+        yield None, state
+    for out_bytes in speaking(state.stream):
+        yield out_bytes, state
+    state = AppState()
+    yield None, state
+with gr.Blocks() as demo:
+    with gr.Row():
+        input_audio = gr.Audio(label="Input Audio")
+    with gr.Row():
+        output_audio = gr.Audio(label="Output Audio")
+    state = gr.State(value=AppState())
+    input_audio.stream(process_audio, [input_audio, state], [output_audio, state],
+                       stream_every=0.5, time_limit=30)
 demo.launch()