# Source: Hugging Face Space "gradio/omni-mini" (running on T4).
# File size: 4,132 bytes.

import gradio as gr
from huggingface_hub import snapshot_download
from threading import Thread
import os
import time
import gradio as gr
import base64
import numpy as np
import requests
import traceback

from server import serve

# Hugging Face Hub repository to fetch.
repo_id = "gpt-omni/mini-omni"
# Download the "main" revision of the repository into ./checkpoint.
# This runs at import time.
snapshot_download(repo_id, local_dir="./checkpoint", revision="main")

# Host and port of the chat endpoint.
# NOTE(review): `serve` is started without arguments, so it is assumed to
# listen on this host/port by default -- confirm against server.serve.
IP = '0.0.0.0'
PORT = 60808

# Run the server in a daemon thread so it does not keep the process alive
# once the main thread exits.
thread = Thread(target=serve, daemon=True)
thread.start()

# Built from IP/PORT so the endpoint has a single source of truth
# (evaluates to "http://0.0.0.0:60808/chat").
API_URL = f"http://{IP}:{PORT}/chat"

# Recording (input) parameters.
# IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE is used below as the
# byte length of one VAD window.
IN_CHANNELS = 1
IN_RATE = 24000
IN_CHUNK = 1024
IN_SAMPLE_WIDTH = 2
VAD_STRIDE = 0.5

# Playback (output) parameters.
# Each name is defined exactly once. The file previously assigned
# OUT_CHUNK = 5760 and then overrode it with 4096 a few lines later; only
# the effective value is kept.
OUT_CHANNELS = 1
OUT_RATE = 24000
OUT_SAMPLE_WIDTH = 2
OUT_CHUNK = 4096  # chunk size (bytes) requested when streaming the reply

def run_vad(ori_audio, sr):
    """Keep only the speech portions of a buffer of 16-bit PCM audio.

    The bytes are decoded as int16 samples, scaled to float32, resampled to
    16 kHz when ``sr`` differs, reduced to the chunks reported by the speech
    detector, and finally converted back to int16 bytes at the original rate.

    NOTE(review): ``librosa``, ``VadOptions``, ``get_speech_timestamps`` and
    ``collect_chunks`` are not imported in the visible part of this file. If
    they are not in scope, every call ends up on the error path below.

    Args:
        ori_audio: Raw audio bytes, interpreted as int16 samples.
        sr: Sampling rate of ``ori_audio`` in Hz.

    Returns:
        A tuple ``(duration_after_vad, vad_audio_bytes, elapsed)``:
        the seconds of audio kept (measured at 16 kHz), the kept audio as
        int16 bytes at rate ``sr``, and the wall-clock seconds spent, rounded
        to 4 decimals. On any failure the tuple is ``(-1, ori_audio, elapsed)``
        and the traceback is printed.
    """
    _st = time.time()
    try:
        audio = np.frombuffer(ori_audio, dtype=np.int16)
        audio = audio.astype(np.float32) / 32768.0
        sampling_rate = 16000
        if sr != sampling_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)

        # Default detector options.
        speech_chunks = get_speech_timestamps(audio, VadOptions())
        audio = collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        if sr != sampling_rate:
            # resample to original sampling rate
            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
        else:
            vad_audio = audio
        # Resampling can overshoot full scale; clip before the int16 cast so
        # peaks saturate instead of wrapping around to the opposite sign.
        vad_audio = np.clip(np.round(vad_audio * 32768.0), -32768, 32767)
        vad_audio_bytes = vad_audio.astype(np.int16).tobytes()

        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
    except Exception:
        # The handler itself must not raise: a zero/invalid ``sr`` or an
        # unsized ``ori_audio`` would otherwise escape as a second error.
        try:
            audio_len = len(ori_audio) / (sr * 2)  # 2 bytes per int16 sample
        except (TypeError, ZeroDivisionError):
            audio_len = float("nan")
        msg = f"[asr vad error] audio_len: {audio_len:.3f} s, trace: {traceback.format_exc()}"
        print(msg)
        return -1, ori_audio, round(time.time() - _st, 4)


def warm_up():
    """Call ``run_vad`` once on a buffer of silence and print the time it took.

    Presumably this pays any one-time initialisation cost of the VAD path
    before real audio arrives.
    """
    # 4096 zero bytes, i.e. 2048 silent int16 samples.
    silence = bytes(2 * 1024 * 2)
    _, _, tcost = run_vad(silence, 16000)
    print(f"warm up done, time_cost: {tcost:.3f} s")

# Exercise the VAD path once at import time, before the UI is built.
warm_up()

def determine_pause(stream: bytes, start_talking: bool) -> tuple[bytes, bool]:
    """Run VAD over a buffered audio stream to spot the start and end of speech.

    Nothing is done unless ``stream`` holds more than ``VAD_STRIDE`` seconds
    of input audio. Otherwise the buffer is passed to ``run_vad`` and the
    detected speech duration drives three checks: more than 0.2 s marks the
    start of talking, buffers are recorded while talking, and less than
    0.1 s while talking marks the end of speech.

    NOTE(review): this function is not called anywhere in the visible file
    and cannot work as written:
      * ``last_temp_audio`` is assigned only at the end of the body, so it is
        a local that is still unbound where it is read (UnboundLocalError).
      * ``st`` is not defined or imported in the visible file (it looks like
        Streamlit session state -- confirm).
      * No value is returned, although the signature promises
        ``tuple[bytes, bool]``; the updated ``start_talking`` and the cleared
        buffer never reach the caller.
    """

    temp_audio = stream

    # Threshold: byte length of VAD_STRIDE seconds of input audio.
    if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
        # run_vad reports a duration of -1 when it fails, which the checks
        # below treat like "no speech".
        dur_vad, vad_audio_bytes, time_vad = run_vad(temp_audio, IN_RATE)

        print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")

        # Speech start: more than 0.2 s of speech while not yet talking.
        if dur_vad > 0.2 and not start_talking:
            # NOTE(review): last_temp_audio is unbound here (see docstring).
            if last_temp_audio is not None:
                st.session_state.frames.append(last_temp_audio)
            start_talking = True
        # While talking, keep the whole (un-trimmed) buffer.
        if start_talking:
            st.session_state.frames.append(temp_audio)
        # Speech end: less than 0.1 s of speech while talking.
        if dur_vad < 0.1 and start_talking:
            st.session_state.recording = False
            print(f"speech end detected. excit")
        # NOTE(review): both assignments only rebind locals and are lost when
        # the function returns.
        last_temp_audio = temp_audio
        temp_audio = b""


def process_audio(audio):
    """POST a recorded audio file to the chat endpoint and stream back the reply.

    The file is read, base64-encoded and sent as JSON (``{"audio": ...}``) to
    ``API_URL``. The streamed response body is interpreted as int16 PCM and
    yielded piece by piece.

    Args:
        audio: Path of the audio file to send, or ``None`` (in which case
            nothing is yielded).

    Yields:
        ``(OUT_RATE, samples)`` tuples, where ``samples`` is an int16 array
        of shape ``(n, OUT_CHANNELS)``.

    Errors raised while reading the response (including a non-2xx HTTP
    status) are printed and end the stream; they are not propagated.
    """
    filepath = audio
    print(f"filepath: {filepath}")
    if filepath is None:
        return

    # Read everything up front so the file handle is not kept open for the
    # whole duration of the streaming request.
    with open(filepath, "rb") as f:
        data = f.read()
    base64_encoded = str(base64.b64encode(data), encoding="utf-8")
    files = {"audio": base64_encoded}

    # Bytes in one frame (one int16 sample per channel).
    frame_bytes = np.dtype(np.int16).itemsize * OUT_CHANNELS
    # Bytes received so far that do not yet form a whole frame.
    pending = b""
    cnt = 0
    tik = time.time()
    with requests.post(API_URL, json=files, stream=True) as response:
        try:
            # Do not decode an HTTP error body as if it were PCM audio.
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=OUT_CHUNK):
                if not chunk:
                    continue
                if cnt == 0:
                    print(f"first chunk time cost: {time.time() - tik:.3f}")
                cnt += 1

                # Streamed chunks are not guaranteed to end on a frame
                # boundary; carry the remainder over to the next chunk
                # instead of failing in np.frombuffer.
                pending += chunk
                usable = len(pending) - len(pending) % frame_bytes
                if usable == 0:
                    continue
                audio_data = np.frombuffer(pending[:usable], dtype=np.int16)
                pending = pending[usable:]
                audio_data = audio_data.reshape(-1, OUT_CHANNELS)
                # astype copies, so the yielded array is writable (frombuffer
                # returns a read-only view).
                yield OUT_RATE, audio_data.astype(np.int16)
            # Any trailing bytes that never completed a frame are dropped.

        except Exception as e:
            print(f"error: {e}")

def greet(name):
    """Return a greeting of the form ``"Hello <name>!!"`` for the string *name*."""
    greeting = "Hello " + name
    greeting += "!!"
    return greeting

# Text-in / text-out interface around `greet`.
# NOTE(review): `process_audio` is defined in this file but is not wired
# into this interface.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
demo.launch()