whisper-resume / app.py
chan4lk's picture
with voice clone
4cd2ebc
raw
history blame
2.88 kB
import gradio as gr
import os
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
import torch
import soundfile as sf
from pdfminer.high_level import extract_text
from llama_cpp import Llama
# Select the torch device once at import time. Preference order:
# Apple MPS, then CUDA, then plain CPU. The choice is echoed to stdout.
if not torch.backends.mps.is_available():
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"MPS not available, using {device}")
else:
    device = torch.device("mps")
    print("Using MPS device")
# Speech-recognition pipeline shared by every toText() call; built lazily so
# the model is loaded once instead of on each request.
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared ASR pipeline, creating it on first use."""
    global _asr_pipeline
    if _asr_pipeline is None:
        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            chunk_length_s=30,
            device=device,
        )
    return _asr_pipeline


def toText(audio):
    """Transcribe ``audio`` with the ``openai/whisper-tiny.en`` model.

    Args:
        audio: Input handed unchanged to the transformers
            ``automatic-speech-recognition`` pipeline (``clone`` passes the
            value it receives from the audio input).

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    transcription = _get_asr_pipeline()(audio, batch_size=8)
    return transcription["text"]
# Model file used when the LLAMA_MODEL_PATH environment variable is not set.
_DEFAULT_LLAMA_MODEL_PATH = "/Users/chandima/.cache/lm-studio/models/lmstudio-community/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q3_K_L.gguf"

# Llama instance shared by every extract_answer() call; loaded lazily so the
# GGUF file is read once instead of on each request.
_llama_model = None


def _get_llama():
    """Return the shared Llama model, loading it on first use."""
    global _llama_model
    if _llama_model is None:
        _llama_model = Llama(
            model_path=os.environ.get("LLAMA_MODEL_PATH", _DEFAULT_LLAMA_MODEL_PATH),
            n_gpu_layers=-1,  # Use all available layers for GPU acceleration
            n_ctx=2048,  # Adjust context size as needed
            verbose=True,  # Optional: for debugging
            use_mlock=True,  # Optional: for better memory management
            n_threads=6,  # Adjust based on your CPU
            use_mmap=True,  # Optional: for faster loading
        )
    return _llama_model


def extract_answer(question, text):
    """Ask the local Llama model to answer ``question`` from the resume ``text``.

    The model file defaults to the original hard-coded path and can be
    overridden with the ``LLAMA_MODEL_PATH`` environment variable.

    Args:
        question: The question to answer; interpolated into the prompt.
        text: The resume text; interpolated into the prompt in full.

    Returns:
        The first completion choice's text with surrounding whitespace
        stripped. The answer is also printed to stdout.
    """
    # NOTE(review): n_ctx is 2048 and the whole resume is pasted into the
    # prompt; a long resume presumably overflows the context window --
    # confirm how llama-cpp-python reports that and truncate if needed.
    prompt = (
        "\n"
        "Answer the question based on the Resume.\n"
        "Question:\n"
        f"{question}:\n"
        "Resume:\n"
        f"{text}\n"
        "Answer:\n"
    )
    # Generation stops at the first blank line or at a "Human:" marker.
    response = _get_llama()(prompt, max_tokens=800, stop=["Human:", "\n\n"])
    answer = response["choices"][0]["text"].strip()
    print(answer)
    return answer
# Text-to-speech pipeline and speaker embedding shared by every toAudio()
# call; both are independent of the input text, so they are built only once.
_tts_synthesiser = None
_tts_speaker_embedding = None


def _get_tts_resources():
    """Return ``(synthesiser, speaker_embedding)``, creating them on first use."""
    global _tts_synthesiser, _tts_speaker_embedding
    if _tts_synthesiser is None:
        _tts_synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
    if _tts_speaker_embedding is None:
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Row 7306 of the validation split supplies the voice; unsqueeze adds
        # a leading dimension of size 1.
        _tts_speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    return _tts_synthesiser, _tts_speaker_embedding


def toAudio(text):
    """Synthesise speech for ``text`` with ``microsoft/speecht5_tts``.

    Args:
        text: The text to speak.

    Returns:
        The pipeline output; ``clone`` reads its ``"audio"`` and
        ``"sampling_rate"`` entries.
    """
    synthesiser, speaker_embedding = _get_tts_resources()
    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    return speech
def clone(audio, file):
    """Answer a spoken question about an uploaded resume and speak the answer.

    Steps: transcribe ``audio`` to a question, extract the text of the resume
    PDF, ask the language model for an answer, synthesise the answer as
    speech and write it to ``speech.wav``.

    Args:
        audio: The recorded/uploaded question, as delivered by the audio input.
        file: The uploaded resume; either a path string or an object whose
            ``name`` attribute is the path.

    Returns:
        The path ``"./speech.wav"`` of the generated audio file.

    Raises:
        gr.Error: If either input is missing.
    """
    # Submitting the form without an upload delivers None; fail with a
    # readable message instead of an opaque error further down.
    if audio is None:
        raise gr.Error("Please provide an audio question.")
    if file is None:
        raise gr.Error("Please upload a resume file.")

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    resume_path = getattr(file, "name", file)

    question = toText(audio=audio)
    text = extract_text(resume_path)
    res = extract_answer(question, text)
    print(res)
    speech = toAudio(res)
    sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
    return "./speech.wav"
# Gradio front end: an audio input and a file input feed `clone`, whose
# returned file path is rendered by an audio output. The app is launched as
# soon as this module runs.
iface = gr.Interface(
    fn=clone,
    inputs=[
        gr.Audio(type="filepath", label="Voice reference audio file"),
        gr.File(label="Resume"),
    ],
    outputs=gr.Audio(label="Says"),
    title="Voice Clone",
    description="\nwhisper\n",
    theme=gr.themes.Base(
        primary_hue="teal",
        secondary_hue="teal",
        neutral_hue="slate",
    ),
)
iface.launch()