"""Gradio app: transcribe audio with Wav2Vec2 and summarize the text with BART."""

import gradio as gr
import librosa
import torch
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# Checkpoints used by the two pipeline stages.
SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
ASR_CHECKPOINT = "facebook/wav2vec2-base-960h"

# Audio is resampled to 16 kHz on load, the rate used for Wav2Vec2 here.
TARGET_SAMPLE_RATE = 16000

# The summarizer input is truncated to this many tokens.
MAX_SUMMARY_INPUT_TOKENS = 1024

# Load BART tokenizer and model for summarization.
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)
summarizer = BartForConditionalGeneration.from_pretrained(SUMMARIZER_CHECKPOINT)

# Load Wav2Vec2 processor and model for transcription.
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

# Run both models on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
summarizer.to(device)


def _transcribe(audio, sampling_rate):
    """Return the greedy CTC transcription of a mono waveform."""
    values = processor(
        audio, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_values
    values = values.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(values).logits

    # Greedy decoding: pick the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]


def _summarize(text):
    """Return a beam-search BART summary of ``text``."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SUMMARY_INPUT_TOKENS,
    )
    inputs = inputs.to(device)

    result = summarizer.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        min_length=10,
        max_length=256,
        no_repeat_ngram_size=2,
        encoder_no_repeat_ngram_size=2,
        repetition_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(result[0], skip_special_tokens=True)


def transcribe_and_summarize(audioFile):
    """Transcribe an audio file and summarize the transcription.

    Args:
        audioFile: Path to the audio file, as supplied by the
            ``gr.Audio(type="filepath")`` input. May be ``None`` when the
            form is submitted without audio.

    Returns:
        A ``(transcription, summary)`` tuple of strings. Both are empty when
        no audio (or empty audio) is supplied; the summary is empty when the
        transcription contains no text.
    """
    # Nothing was uploaded or recorded: return empty outputs instead of
    # failing inside librosa.
    if audioFile is None:
        return "", ""

    # Load audio as a float array, resampled to the target rate.
    audio, sampling_rate = librosa.load(audioFile, sr=TARGET_SAMPLE_RATE)
    if audio.size == 0:
        return "", ""

    transcription = _transcribe(audio, sampling_rate)

    # Summarizing an empty transcript would only produce unrelated text.
    if not transcription.strip():
        return transcription, ""

    summary = _summarize(transcription)
    return transcription, summary


# Gradio interface
iface = gr.Interface(
    fn=transcribe_and_summarize,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
    title="Audio Transcription and Summarization",
    description="Transcribe and summarize audio using Wav2Vec2 and BART.",
)

# Start the web server only when executed as a script, so importing this
# module (e.g. to reuse transcribe_and_summarize) does not block on launch().
if __name__ == "__main__":
    iface.launch()