Spaces:

SeyedAli
/

Persian-Speech-Transcription

Running

App Files Files Community

Persian-Speech-Transcription / app.py

SeyedAli

Update app.py

8aac9d1 about 1 year ago

raw

history blame contribute delete

1.87 kB

	import tempfile ,os
	import gradio as gr
	from transformers import AutoProcessor, AutoModelForCTC,pipeline
	import torch
	import numpy as np
	import torchaudio
	import numpy as np
	import re
	import string

	# Hugging Face Hub id of the Persian Wav2Vec2 checkpoint used by this app.
	MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"

	# Input widget: passes the recording to the callback as a path on disk.
	audio_input = gr.Audio(type="filepath", label="صوت گفتار فارسی")

	# Output widget: right-aligned, right-to-left text area for the transcript.
	text_output = gr.TextArea(
		type="text",
		rtl=True,
		text_align="right",
		label="متن فارسی",
	)

	# Load the processor and the CTC model once at import time so that ASR can
	# reuse the same objects on every call.
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	model = AutoModelForCTC.from_pretrained(MODEL_ID)

	def ASR(audio):
		"""Transcribe a speech recording to text with the module-level CTC model.

		The file at ``audio`` is loaded, downmixed to a single channel, resampled
		to 16 kHz, converted to model inputs by the module-level ``processor``,
		run through the module-level ``model`` and decoded greedily (arg-max
		over the vocabulary at every frame).

		Args:
			audio: Path of the audio file to transcribe. ``None`` or an empty
				string means that no recording was supplied.

		Returns:
			The decoded transcription, or an empty string when no audio was
			supplied.
		"""
		# Nothing was recorded or uploaded: return an empty transcript instead
		# of failing while trying to open a missing path.
		if not audio:
			return ""

		# Sampling rate passed to the processor.
		target_rate = 16_000

		# Read the file straight from its path; no intermediate copy is needed
		# and the original file extension is preserved for format detection.
		waveform, sample_rate = torchaudio.load(audio)

		# torchaudio returns (channels, frames) by default. Average the channels
		# so multi-channel recordings become one mono signal rather than being
		# interpreted by the processor as a batch of separate utterances.
		waveform = waveform.mean(dim=0, keepdim=True)

		if sample_rate != target_rate:
			resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
			waveform = resampler(waveform)

		# Run the processor once and reuse its output for every model input.
		inputs = processor(
			waveform.squeeze(0).numpy(),
			sampling_rate=target_rate,
			return_tensors="pt",
		)
		# The attention mask is only present when the feature extractor is
		# configured to return it; the model accepts None otherwise.
		attention_mask = inputs.get("attention_mask")

		# Inference only: skip gradient tracking.
		with torch.no_grad():
			logits = model(inputs.input_values, attention_mask=attention_mask).logits

		# Greedy CTC decoding of the single utterance in the batch.
		predicted_ids = torch.argmax(logits[0], dim=-1)
		return processor.decode(predicted_ids)
	# Bind the transcription callback to its input and output widgets, then
	# start the web UI without requesting a public share link.
	iface = gr.Interface(
		fn=ASR,
		inputs=audio_input,
		outputs=text_output,
	)
	iface.launch(share=False)