dawood (HF staff) committed
Commit 9088133
1 Parent(s): a66f187

Create app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
from llama_cpp import Llama
import whisper
from TTS.api import TTS
import numpy as np
import gradio as gr
from gradio_unifiedaudio import UnifiedAudio
from pathlib import Path
import torch
from scipy.io import wavfile
from collections import deque

# Load all three models once at startup: Whisper for speech-to-text, a small
# quantized Qwen2 instruct model for responses, and Tacotron2 for text-to-speech.
whisper_model = whisper.load_model("base")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
dir_ = Path(__file__).parent
instream = None

def detect_pause(instream, energy_threshold=800, pause_duration=2.0, sample_rate=16000):
    # The stream counts as paused once any run of `pause_duration` seconds of
    # consecutive samples stays below `energy_threshold`. Note that the whole
    # buffer is rescanned on every call.
    pause_samples = int(pause_duration * sample_rate)
    energy = np.abs(instream[1])

    window = deque(maxlen=pause_samples)
    for e in energy:
        window.append(e < energy_threshold)
        if len(window) == pause_samples and all(window):
            return True
    return False
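
# A quick sanity check for detect_pause (a minimal sketch, assuming 16 kHz
# int16 audio as in the defaults above): a second of noise followed by a
# second of silence should trip a 0.25 s detector, while noise alone should not.
#
#   sr = 16000
#   noise = np.random.randint(-5000, 5000, sr).astype(np.int16)
#   silence = np.zeros(sr, dtype=np.int16)
#   assert detect_pause((sr, np.concatenate([noise, silence])), pause_duration=0.25)
#   assert not detect_pause((sr, noise), pause_duration=0.25)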

def add_to_stream(audio, instream, pause_detected):
    # Each streamed chunk arrives as a (sample_rate, samples) tuple; append the
    # new samples to the running buffer so the full utterance stays together.
    if instream is None:
        ret = audio
    else:
        ret = (audio[0], np.concatenate((instream[1], audio[1])))
    # Check the accumulated buffer for a long pause (the buffer, not the
    # incoming state, which is None on the first chunk).
    if detect_pause(ret):
        pause_detected = True
        stop_recording(ret)
    return audio, ret, pause_detected

def stop_recording(audio):
    # Persist the buffered utterance, transcribe it, and reply with TTS audio.
    wavfile.write("user_output.wav", audio[0], audio[1])
    text = whisper_model.transcribe("user_output.wav")['text']
    print(f"You said: {text}")

    if text.strip().lower().strip('.!?') in ["exit", "quit", "stop"]:
        print("Voice Assistant is shutting down.")
        return UnifiedAudio(value=None, streaming=False)

    response = generate_response(text)
    print(f"Assistant: {response}")
    return UnifiedAudio(value=speak_text(response), streaming=False)

def stop_playing():
    # Once the spoken reply finishes, clear the buffer and re-arm the mic.
    pause_detected = False
    return UnifiedAudio(value=None, streaming=True), None, pause_detected

def transcribe_audio(audio_data):
    # Transcribes the last saved recording rather than `audio_data`.
    return whisper_model.transcribe("user_output.wav", language='en')['text']

def generate_response(prompt):
    # Raw text completion: the model simply continues the transcribed prompt.
    response = llm(prompt=prompt)
    return response['choices'][0]['text'].strip()
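
# Qwen2-0.5B-Instruct is chat-tuned, so a plain completion may ramble. A
# minimal alternative sketch using llama-cpp-python's chat API, assuming the
# GGUF ships a usable chat template:
#
#   def generate_response_chat(prompt):
#       out = llm.create_chat_completion(
#           messages=[{"role": "user", "content": prompt}],
#           max_tokens=128,
#       )
#       return out['choices'][0]['message']['content'].strip()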

def speak_text(text):
    # Synthesize the reply to a WAV file and hand back its path for playback.
    tts.tts_to_file(text=text.strip(), file_path="bot_output.wav")
    return "bot_output.wav"

with gr.Blocks() as demo:
    mic = UnifiedAudio(sources=["microphone"], streaming=True)
    stream = gr.State()              # accumulated (sample_rate, samples) buffer
    pause_detected = gr.State(False)
    # Manual stop: transcribe the buffer and play the spoken reply.
    mic.stop_recording(stop_recording, stream, mic)
    # Reply finished playing: reset state and re-arm the microphone.
    mic.end(stop_playing, None, [mic, stream, pause_detected])
    # Live chunks: grow the buffer and watch for a hands-free pause.
    mic.stream(add_to_stream, [mic, stream, pause_detected], [mic, stream, pause_detected])

    # @gr.render(inputs=[mic, stream, pause_detected])
    # def recording_paused(microphone, stream, pause_detected):
    #     if pause_detected:
    #         stop_recording(stream)

if __name__ == '__main__':
    demo.launch()
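
# To expose the demo beyond localhost, gradio's documented launch flags can be
# passed instead (for example):
#
#   demo.launch(server_name="0.0.0.0", server_port=7860)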