Spaces:

bistecglobal
/

whisper-resume

Sleeping

App Files Community

whisper-resume / app.py

chan4lk

with voice clone

4cd2ebc about 2 months ago

raw

history blame

2.88 kB

	import gradio as gr
	import os
	from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
	from datasets import load_dataset
	import torch
	import soundfile as sf
	from pdfminer.high_level import extract_text
	from llama_cpp import Llama


	# Pick the compute device: Apple MPS first, then CUDA, then CPU.
	# `torch.backends.mps` is looked up defensively because PyTorch builds
	# that predate the MPS backend do not expose that submodule, and a direct
	# attribute access would raise AttributeError before any fallback could run.
	_mps_backend = getattr(torch.backends, "mps", None)
	if _mps_backend is not None and _mps_backend.is_available():
	    device = torch.device("mps")
	    print("Using MPS device")
	else:
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    print(f"MPS not available, using {device}")
	# Speech-recognition pipeline, built on first use and then shared by every
	# call so the Whisper checkpoint is not reloaded for each request.
	_ASR_PIPELINE = None


	def toText(audio):
	    """Transcribe an audio input to text with the Whisper tiny (English) model.

	    Args:
	        audio: Audio input forwarded unchanged to the speech-recognition
	            pipeline (the UI in this file supplies a file path).

	    Returns:
	        The ``"text"`` field of the pipeline's result.
	    """
	    global _ASR_PIPELINE
	    if _ASR_PIPELINE is None:
	        # Two simultaneous first calls may both build a pipeline; the last
	        # assignment wins, which is wasteful but harmless.
	        _ASR_PIPELINE = pipeline(
	            "automatic-speech-recognition",
	            model="openai/whisper-tiny.en",
	            chunk_length_s=30,
	            device=device,
	        )
	    return _ASR_PIPELINE(audio, batch_size=8)["text"]


	# Model file used when the LLAMA_MODEL_PATH environment variable is unset.
	# This is a path on the original author's machine; deployments elsewhere
	# should point LLAMA_MODEL_PATH at their own GGUF file.
	_DEFAULT_LLAMA_MODEL_PATH = "/Users/chandima/.cache/lm-studio/models/lmstudio-community/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q3_K_L.gguf"

	# Loaded models keyed by path, so each GGUF file is loaded once per process
	# instead of once per question.
	_LLAMA_CACHE = {}


	def _load_llama(model_path):
	    """Return the cached ``Llama`` instance for *model_path*, loading it on first use."""
	    llm = _LLAMA_CACHE.get(model_path)
	    if llm is None:
	        llm = Llama(
	            model_path=model_path,
	            n_gpu_layers=-1,  # Use all available layers for GPU acceleration
	            n_ctx=2048,  # Adjust context size as needed
	            verbose=True,  # Optional: for debugging
	            use_mlock=True,  # Optional: for better memory management
	            n_threads=6,  # Adjust based on your CPU
	            use_mmap=True,  # Optional: for faster loading
	        )
	        _LLAMA_CACHE[model_path] = llm
	    return llm


	def extract_answer(question, text):
	    """Ask the local LLaMA model to answer *question* using the resume *text*.

	    The model path comes from the ``LLAMA_MODEL_PATH`` environment variable,
	    falling back to the built-in default path when it is not set.

	    Args:
	        question: The question to answer; interpolated into the prompt.
	        text: The resume text; interpolated into the prompt.

	    Returns:
	        The generated completion with surrounding whitespace stripped.
	        Generation stops at ``"Human:"`` or at the first blank line, and is
	        capped at 800 tokens.
	    """
	    model_path = os.environ.get("LLAMA_MODEL_PATH", _DEFAULT_LLAMA_MODEL_PATH)
	    llm = _load_llama(model_path)

	    # NOTE(review): question + resume + 800 completion tokens must fit in the
	    # 2048-token context configured above; a long resume may not -- confirm
	    # how that case should be handled.
	    prompt = f"""
	Answer the question based on the Resume.

	Question:
	{question}:

	Resume:
	{text}

	Answer:
	"""

	    response = llm(prompt, max_tokens=800, stop=["Human:", "\n\n"])
	    answer = response['choices'][0]['text'].strip()
	    print(answer)
	    return answer

	# Holds the text-to-speech pipeline and the speaker embedding once built, so
	# the model and the x-vector dataset are not reloaded on every call.
	_TTS_STATE = {}


	def toAudio(text):
	    """Synthesise speech for *text* with SpeechT5 and a fixed speaker embedding.

	    Args:
	        text: The text to speak.

	    Returns:
	        The text-to-speech pipeline's result; the caller in this file reads
	        its ``"audio"`` and ``"sampling_rate"`` entries.
	    """
	    if not _TTS_STATE:
	        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
	        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	        # Entry 7306 of the x-vector set selects the voice; unsqueeze adds
	        # the leading dimension in front of the vector.
	        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
	        # Publish both values together so a failure part-way through loading
	        # never leaves a half-initialised cache behind.
	        _TTS_STATE.update(synthesiser=synthesiser, speaker_embedding=speaker_embedding)

	    return _TTS_STATE["synthesiser"](
	        text,
	        forward_params={"speaker_embeddings": _TTS_STATE["speaker_embedding"]},
	    )

	def clone(audio, file):
	    """Answer a spoken question about an uploaded resume and reply as speech.

	    Pipeline: transcribe *audio* to a question, extract the text of the
	    resume, ask the language model, then synthesise the answer to a WAV file.

	    Args:
	        audio: The recorded question, passed on to ``toText``.
	        file: The uploaded resume; either a path string or an object whose
	            ``name`` attribute is the path.

	    Returns:
	        Path of the WAV file containing the spoken answer.

	    Raises:
	        gr.Error: If either input is missing.
	    """
	    import tempfile  # local import: only this function needs it

	    # Fail with a readable UI message instead of an AttributeError on None.
	    if audio is None:
	        raise gr.Error("Please provide an audio recording of your question.")
	    if file is None:
	        raise gr.Error("Please upload a resume file.")

	    question = toText(audio=audio)
	    # Depending on the Gradio version/configuration the upload may arrive as
	    # a plain path string or as an object exposing `.name`; accept both.
	    resume_path = getattr(file, "name", file)
	    text = extract_text(resume_path)
	    res = extract_answer(question, text)
	    print(res)
	    speech = toAudio(res)

	    # Write to a unique file per request: a fixed "speech.wav" would be
	    # overwritten when two requests are processed at the same time.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
	        output_path = out.name
	    sf.write(output_path, speech["audio"], samplerate=speech["sampling_rate"])
	    return output_path

	# Build the Gradio UI: an audio input and a resume upload are passed to
	# `clone`, and the audio file path it returns is played in the output widget.
	# NOTE(review): the audio input is labelled as a voice reference, but `clone`
	# transcribes it and uses the text as the question -- confirm the label.
	question_audio = gr.Audio(type='filepath', label='Voice reference audio file')
	resume_upload = gr.File(label="Resume")
	spoken_answer = gr.Audio(label='Says')
	app_description = """
	whisper
	"""
	app_theme = gr.themes.Base(
	    primary_hue="teal",
	    secondary_hue="teal",
	    neutral_hue="slate",
	)

	iface = gr.Interface(
	    fn=clone,
	    inputs=[question_audio, resume_upload],
	    outputs=spoken_answer,
	    title='Voice Clone',
	    description=app_description,
	    theme=app_theme,
	)
	iface.launch()