from llama_cpp import Llama
import whisper
from TTS.api import TTS
import numpy as np
import gradio as gr
from gradio_unifiedaudio import UnifiedAudio
from scipy.io import wavfile
from collections import deque

# Load the models once at start-up: Whisper for speech-to-text, a small
# quantised Qwen2 GGUF for responses, and Tacotron2 for text-to-speech.
whisper_model = whisper.load_model("base")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)


def detect_pause(instream, energy_threshold=800, pause_duration=2.0):
    """Return True once `instream` contains `pause_duration` seconds of
    consecutive samples that all stay below `energy_threshold`."""
    # `instream` is a Gradio-style (sample_rate, samples) tuple; use the
    # stream's own rate rather than assuming 16 kHz, since browsers often
    # record at 44.1 or 48 kHz.
    sample_rate, samples = instream
    pause_samples = int(pause_duration * sample_rate)
    window = deque(maxlen=pause_samples)
    for e in np.abs(samples):
        window.append(e < energy_threshold)
        if len(window) == pause_samples and all(window):
            return True
    return False


def add_to_stream(audio, instream, pause_detected):
    # Append the new chunk to the accumulated buffer held in gr.State.
    if instream is None:
        ret = audio
    else:
        ret = (audio[0], np.concatenate((instream[1], audio[1])))
    # Check the *updated* buffer: `instream` is None on the first chunk and
    # is always one chunk behind the audio we just received.
    if detect_pause(ret):
        pause_detected = True
        # Called for its side effects only (the spoken reply is written to
        # bot_output.wav); the component it returns is discarded here.
        stop_recording(ret)
    return audio, ret, pause_detected


def stop_recording(audio):
    # Persist the recording, transcribe it, and synthesise a spoken reply.
    wavfile.write("user_output.wav", audio[0], audio[1])
    text = whisper_model.transcribe("user_output.wav")['text']
    print(f"You said: {text}")
    # Whisper output usually carries surrounding whitespace and punctuation,
    # so normalise it before matching the exit keywords.
    if text.strip().lower().rstrip('.!?') in ["exit", "quit", "stop"]:
        print("Voice Assistant is shutting down.")
        return UnifiedAudio(value=None, streaming=False)
    response = generate_response(text)
    print(f"Assistant: {response}")
    return UnifiedAudio(value=speak_text(response), streaming=False)


def stop_playing():
    # Once playback of the reply ends, reset the component to streaming mode
    # and clear the accumulated buffer and pause flag.
    pause_detected = False
    return UnifiedAudio(value=None, streaming=True), None, pause_detected


def transcribe_audio(audio_path):
    # Standalone helper: transcribe a saved recording from its file path.
    return whisper_model.transcribe(audio_path, language='en')['text']


def generate_response(prompt):
    # llama-cpp-python caps completions at 16 tokens by default; raise the
    # limit so replies are not truncated mid-sentence.
    response = llm(prompt=prompt, max_tokens=128)
    return response['choices'][0]['text'].strip()


def speak_text(text):
    # Synthesise the reply to a WAV file and hand its path to UnifiedAudio.
    tts.tts_to_file(text=text.strip(), file_path="bot_output.wav")
    return "bot_output.wav"


with gr.Blocks() as demo:
    mic = UnifiedAudio(sources=["microphone"], streaming=True)
    stream = gr.State()
    pause_detected = gr.State(False)
    mic.stop_recording(stop_recording, stream, mic)
    mic.end(stop_playing, None, [mic, stream, pause_detected])
    mic.stream(add_to_stream, [mic, stream, pause_detected], [mic, stream, pause_detected])
    # @gr.render(inputs=[mic, stream, pause_detected])
    # def recording_paused(microphone, stream, pause_detected):
    #     if pause_detected:
    #         stop_recording(stream)

if __name__ == '__main__':
    demo.launch()
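
# A minimal REPL sketch (not part of the app) showing how detect_pause is
# expected to behave on synthetic int16 audio; the durations and amplitudes
# below are illustrative assumptions, not values from a real recording:
#
#   import numpy as np
#   loud  = np.random.randint(-5000, 5000, 16000, dtype=np.int16)  # ~1 s of speech-level noise
#   quiet = np.random.randint(-50, 50, 48000, dtype=np.int16)      # ~3 s of near-silence
#   detect_pause((16000, np.concatenate([loud, quiet])))  # -> True: contains a full 2 s silent window
#   detect_pause((16000, loud))                           # -> False: buffer shorter than the 2 s window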