Commit • 891e37e
1 Parent(s): 0fcc790

first draft
app.py CHANGED

@@ -21,10 +21,94 @@ thread.start()
 
 API_URL = "http://0.0.0.0:60808/chat"
 
+# recording parameters
+IN_FORMAT = pyaudio.paInt16
+IN_CHANNELS = 1
+IN_RATE = 24000
+IN_CHUNK = 1024
+IN_SAMPLE_WIDTH = 2
+VAD_STRIDE = 0.5
+
+# playing parameters
+OUT_FORMAT = pyaudio.paInt16
+OUT_CHANNELS = 1
+OUT_RATE = 24000
+OUT_SAMPLE_WIDTH = 2
+OUT_CHUNK = 5760
+
+
+
 OUT_CHUNK = 4096
 OUT_RATE = 24000
 OUT_CHANNELS = 1
 
+def run_vad(ori_audio, sr):
+    _st = time.time()
+    try:
+        audio = np.frombuffer(ori_audio, dtype=np.int16)
+        audio = audio.astype(np.float32) / 32768.0
+        sampling_rate = 16000
+        if sr != sampling_rate:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+
+        vad_parameters = {}
+        vad_parameters = VadOptions(**vad_parameters)
+        speech_chunks = get_speech_timestamps(audio, vad_parameters)
+        audio = collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
+
+        if sr != sampling_rate:
+            # resample to original sampling rate
+            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
+        else:
+            vad_audio = audio
+        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
+        vad_audio_bytes = vad_audio.tobytes()
+
+        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
+    except Exception as e:
+        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
+        print(msg)
+        return -1, ori_audio, round(time.time() - _st, 4)
+
+
+def warm_up():
+    frames = b"\x00\x00" * 1024 * 2  # 1024 frames of 2 bytes each
+    dur, frames, tcost = run_vad(frames, 16000)
+    print(f"warm up done, time_cost: {tcost:.3f} s")
+
+warm_up()
+
+def determine_pause():
+    temp_audio = b""
+    vad_audio = b""
+
+    start_talking = False
+    last_temp_audio = None
+
+    while st.session_state.recording:
+        status.success("Listening...")
+        audio_bytes = stream.read(IN_CHUNK)
+        temp_audio += audio_bytes
+
+        if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
+            dur_vad, vad_audio_bytes, time_vad = run_vad(temp_audio, IN_RATE)
+
+            print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+
+            if dur_vad > 0.2 and not start_talking:
+                if last_temp_audio is not None:
+                    st.session_state.frames.append(last_temp_audio)
+                start_talking = True
+            if start_talking:
+                st.session_state.frames.append(temp_audio)
+            if dur_vad < 0.1 and start_talking:
+                st.session_state.recording = False
+                print(f"speech end detected. excit")
+            last_temp_audio = temp_audio
+            temp_audio = b""
+
+
 def process_audio(audio):
     filepath = audio
     print(f"filepath: {filepath}")
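
For context on the new helper: run_vad() takes raw 16-bit mono PCM bytes and their sample rate, resamples to 16 kHz, keeps only the segments the VAD flags as speech, and returns (speech_duration_s, filtered_pcm_bytes, elapsed_s), falling back to (-1, original_bytes, elapsed_s) on error. The following smoke test is a minimal sketch, not part of the commit; it assumes run_vad() and app.py's imports (numpy, librosa, and the VAD helpers VadOptions, get_speech_timestamps, collect_chunks) are available in the same session, and a pure tone is only a stand-in that the underlying VAD may or may not treat as speech.

    # Hypothetical smoke test for run_vad(); assumes it is importable from app.py.
    import numpy as np

    IN_RATE = 24000  # same rate as the recording parameters added in this commit

    # 1 s of silence followed by 1 s of a 440 Hz tone, encoded as int16 PCM bytes.
    t = np.arange(IN_RATE) / IN_RATE
    tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    silence = np.zeros(IN_RATE, dtype=np.int16)
    pcm_bytes = np.concatenate([silence, tone]).tobytes()

    dur_vad, vad_bytes, t_cost = run_vad(pcm_bytes, IN_RATE)
    print(f"kept {dur_vad:.2f} s of 2.00 s, vad took {t_cost:.3f} s, {len(vad_bytes)} bytes out")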
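
A note on the pause-detection cadence (arithmetic on the constants added here, not text from the commit): determine_pause() only calls run_vad() once the buffered audio exceeds IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE = 2 * 24000 * 1 * 0.5 = 24,000 bytes, i.e. about 0.5 s of 16-bit mono audio at 24 kHz. So the VAD, the 0.2 s start-of-speech threshold, and the 0.1 s end-of-speech threshold are evaluated roughly twice per second while st.session_state.recording is set.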