JuanjoJ55 committed on
Commit
e2374dd
1 Parent(s): 39cf578

doc: removed comments

Browse files
Files changed (1) hide show
  1. app.py +7 -13
app.py CHANGED
@@ -1,17 +1,17 @@
1
  import gradio as gr
2
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, BartForConditionalGeneration
3
  import torch
4
- import torchaudio # Replace librosa for faster audio processing
5
 
6
- # Load BART tokenizer and model for summarization
7
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
8
  summarizer = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
9
 
10
- # Load Wav2Vec2 processor and model for transcription
11
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
12
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
13
 
14
- # Check if CUDA is available
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model.to(device)
17
  summarizer.to(device)
@@ -21,30 +21,25 @@ summarizer = torch.quantization.quantize_dynamic(summarizer, {torch.nn.Linear},
21
 
22
 
23
  def transcribe_and_summarize(audioFile):
24
- # Load audio using torchaudio
25
  audio, sampling_rate = torchaudio.load(audioFile)
26
 
27
- # Resample audio to 16kHz if necessary
28
  if sampling_rate != 16000:
29
  resample_transform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
30
  audio = resample_transform(audio)
31
  audio = audio.squeeze()
32
 
33
- # Process audio in chunks for large files
34
- chunk_size = int(16000 * 30) # 30-second chunks
35
  transcription = ""
36
 
37
  for i in range(0, len(audio), chunk_size):
38
  chunk = audio[i:i+chunk_size].numpy()
39
  inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_values.to(device)
40
 
41
- # Transcription
42
  with torch.no_grad():
43
  logits = model(inputs).logits
44
  predicted_ids = torch.argmax(logits, dim=-1)
45
  transcription += processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + " "
46
 
47
- # Summarization
48
  inputs = tokenizer(transcription, return_tensors="pt", truncation=True, max_length=1024).to(device)
49
 
50
  result = summarizer.generate(
@@ -54,20 +49,19 @@ def transcribe_and_summarize(audioFile):
54
  no_repeat_ngram_size=2,
55
  encoder_no_repeat_ngram_size=2,
56
  repetition_penalty=2.0,
57
- num_beams=2, # Reduced beams for faster inference
58
  early_stopping=True,
59
  )
60
  summary = tokenizer.decode(result[0], skip_special_tokens=True)
61
 
62
  return transcription.strip(), summary.strip()
63
 
64
- # Gradio interface
65
  iface = gr.Interface(
66
  fn=transcribe_and_summarize,
67
  inputs=gr.Audio(type="filepath", label="Upload Audio"),
68
  outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
69
  title="Audio Transcription and Summarization",
70
- description="Transcribe and summarize audio using Wav2Vec2 and BART.",
71
  )
72
 
73
  iface.launch()
 
1
  import gradio as gr
2
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, BartForConditionalGeneration
3
  import torch
4
+ import torchaudio
5
 
6
+ # Load BART
7
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
8
  summarizer = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
9
 
10
+ # Load Wav2Vec2
11
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
12
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
13
 
14
+ # Check for CUDA
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model.to(device)
17
  summarizer.to(device)
 
21
 
22
 
23
  def transcribe_and_summarize(audioFile):
 
24
  audio, sampling_rate = torchaudio.load(audioFile)
25
 
 
26
  if sampling_rate != 16000:
27
  resample_transform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
28
  audio = resample_transform(audio)
29
  audio = audio.squeeze()
30
 
31
+ chunk_size = int(16000 * 30)
 
32
  transcription = ""
33
 
34
  for i in range(0, len(audio), chunk_size):
35
  chunk = audio[i:i+chunk_size].numpy()
36
  inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_values.to(device)
37
 
 
38
  with torch.no_grad():
39
  logits = model(inputs).logits
40
  predicted_ids = torch.argmax(logits, dim=-1)
41
  transcription += processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + " "
42
 
 
43
  inputs = tokenizer(transcription, return_tensors="pt", truncation=True, max_length=1024).to(device)
44
 
45
  result = summarizer.generate(
 
49
  no_repeat_ngram_size=2,
50
  encoder_no_repeat_ngram_size=2,
51
  repetition_penalty=2.0,
52
+ num_beams=2,
53
  early_stopping=True,
54
  )
55
  summary = tokenizer.decode(result[0], skip_special_tokens=True)
56
 
57
  return transcription.strip(), summary.strip()
58
 
 
59
  iface = gr.Interface(
60
  fn=transcribe_and_summarize,
61
  inputs=gr.Audio(type="filepath", label="Upload Audio"),
62
  outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
63
  title="Audio Transcription and Summarization",
64
+ description="Transcribe and summarize audio using Audio Summarizer.",
65
  )
66
 
67
  iface.launch()