Spaces:

gradio
/

omni-mini

Running on T4

freddyaboulton HF staff committed on Sep 18

Commit

556b4ae

•

1 Parent(s): 41d06ba

pause

Files changed (1) hide show

app.py CHANGED Viewed

@@ -78,34 +78,27 @@ def warm_up():
 warm_up()
-def determine_pause():
-    temp_audio = b""
-    vad_audio = b""
-    start_talking = False
-    last_temp_audio = None
-    while st.session_state.recording:
-        status.success("Listening...")
-        audio_bytes = stream.read(IN_CHUNK)
-        temp_audio += audio_bytes
-        if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
-            dur_vad, vad_audio_bytes, time_vad = run_vad(temp_audio, IN_RATE)
-            print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-            if dur_vad > 0.2 and not start_talking:
-                if last_temp_audio is not None:
-                    st.session_state.frames.append(last_temp_audio)
-                start_talking = True
-            if start_talking:
-                st.session_state.frames.append(temp_audio)
-            if dur_vad < 0.1 and start_talking:
-                st.session_state.recording = False
-                print(f"speech end detected. excit")
-            last_temp_audio = temp_audio
-            temp_audio = b""
 def process_audio(audio):

 warm_up()
+def determine_pause(stream: bytes, start_talking: bool) -> tuple[bytes, bool]:
+    """Run VAD over the buffered audio and update the speech/pause state.
+
+    Args:
+        stream: Audio bytes buffered so far.
+        start_talking: Whether speech onset was already detected by an
+            earlier call.
+
+    Returns:
+        A ``(remaining_audio, start_talking)`` tuple. ``remaining_audio`` is
+        ``stream`` unchanged while the buffer is still too short to analyse
+        and ``b""`` once it has been analysed; ``start_talking`` is the
+        updated onset flag.
+        NOTE(review): the return contract is inferred from the annotation;
+        the committed version had no ``return`` at all - confirm with callers.
+
+    Side effects:
+        Appends the analysed buffer to ``st.session_state.frames`` once speech
+        has started, and sets ``st.session_state.recording`` to ``False`` when
+        the end of speech is detected.
+    """
+    # Presumably VAD_STRIDE seconds' worth of PCM bytes - TODO confirm units.
+    min_vad_bytes = IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE
+    if len(stream) <= min_vad_bytes:
+        # Not enough audio yet: hand the buffer back untouched.
+        return stream, start_talking
+
+    dur_vad, _vad_audio_bytes, time_vad = run_vad(stream, IN_RATE)
+    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+
+    if dur_vad > 0.2 and not start_talking:
+        # NOTE(review): the loop-based version also prepended the previous
+        # chunk (`last_temp_audio`) here. That local cannot survive between
+        # calls of this per-call function, and reading it raised
+        # UnboundLocalError, so the pre-roll is dropped. If it is still
+        # wanted, the caller has to keep the previous chunk and pass it in.
+        start_talking = True
+    if start_talking:
+        st.session_state.frames.append(stream)
+    if dur_vad < 0.1 and start_talking:
+        # Almost no voiced audio after speech had started: treat as a pause.
+        st.session_state.recording = False
+        print("speech end detected. excit")
+
+    # The analysed buffer has been consumed; the caller starts a fresh one.
+    return b"", start_talking
 def process_audio(audio):