SpicyqSama007 committed on
Commit
7a0182a
1 Parent(s): f9b6bed

Streaming agent

Browse files
Files changed (1) hide show
  1. app.py +20 -15
app.py CHANGED
@@ -2,6 +2,8 @@ import re
2
  import gradio as gr
3
  import numpy as np
4
  import os
 
 
5
  import threading
6
  import subprocess
7
  import sys
@@ -52,6 +54,17 @@ class ChatState:
52
  def clear_fn():
53
  return [], ChatState(), None, None, None
54
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  async def process_audio_input(
57
  sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
@@ -72,11 +85,9 @@ async def process_audio_input(
72
 
73
  if isinstance(sys_audio_input, tuple):
74
  sr, sys_audio_data = sys_audio_input
75
- elif text_input:
76
  sr = 44100
77
  sys_audio_data = None
78
- else:
79
- raise gr.Error("Invalid audio format")
80
 
81
  def append_to_chat_ctx(
82
  part: ServeTextPart | ServeVQPart, role: str = "assistant"
@@ -106,22 +117,16 @@ async def process_audio_input(
106
  ):
107
  if event.type == FishE2EEventType.USER_CODES:
108
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
 
109
  elif event.type == FishE2EEventType.SPEECH_SEGMENT:
110
- result_audio += event.frame.data
111
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
112
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
113
-
114
- yield state.get_history(), (44100, np_audio), None, None
115
  elif event.type == FishE2EEventType.TEXT_SEGMENT:
116
  append_to_chat_ctx(ServeTextPart(text=event.text))
117
- if result_audio:
118
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
119
- yield state.get_history(), (44100, np_audio), None, None
120
- else:
121
- yield state.get_history(), None, None, None
122
-
123
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
124
- yield state.get_history(), (44100, np_audio), None, None
125
 
126
 
127
  async def process_text_input(
 
2
  import gradio as gr
3
  import numpy as np
4
  import os
5
+ import io
6
+ import wave
7
  import threading
8
  import subprocess
9
  import sys
 
54
  def clear_fn():
55
  return [], ChatState(), None, None, None
56
 
57
+ def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
58
+ buffer = io.BytesIO()
59
+
60
+ with wave.open(buffer, "wb") as wav_file:
61
+ wav_file.setnchannels(channels)
62
+ wav_file.setsampwidth(bit_depth // 8)
63
+ wav_file.setframerate(sample_rate)
64
+
65
+ wav_header_bytes = buffer.getvalue()
66
+ buffer.close()
67
+ return wav_header_bytes
68
 
69
  async def process_audio_input(
70
  sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
 
85
 
86
  if isinstance(sys_audio_input, tuple):
87
  sr, sys_audio_data = sys_audio_input
88
+ else:
89
  sr = 44100
90
  sys_audio_data = None
 
 
91
 
92
  def append_to_chat_ctx(
93
  part: ServeTextPart | ServeVQPart, role: str = "assistant"
 
117
  ):
118
  if event.type == FishE2EEventType.USER_CODES:
119
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
120
+
121
  elif event.type == FishE2EEventType.SPEECH_SEGMENT:
 
 
122
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
123
+ yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
124
+
125
  elif event.type == FishE2EEventType.TEXT_SEGMENT:
126
  append_to_chat_ctx(ServeTextPart(text=event.text))
127
+ yield state.get_history(), None, None, None
128
+
129
+ yield state.get_history(), None, None, None
 
 
 
 
 
130
 
131
 
132
  async def process_text_input(