Amr453 committed
Commit f018e8b
1 parent: eca3d88

Upload app.py

Files changed (1)
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ import whisper
+ import gradio as gr
+ import datetime
+
+ import subprocess
+
+ import torch
+ import pyannote.audio
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+
+ from pyannote.audio import Audio
+ from pyannote.core import Segment
+
+ import wave
+ import contextlib
+
+ from sklearn.cluster import AgglomerativeClustering
+ import numpy as np
+
+ model = whisper.load_model("large-v2")
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ )
+
+ def transcribe(audio, num_speakers):
+     path, error = convert_to_wav(audio)
+     if error is not None:
+         return error
+
+     duration = get_duration(path)
+     if duration > 4 * 60 * 60:
+         return "Audio duration too long"
+
+     result = model.transcribe(path)
+     segments = result["segments"]
+
+     num_speakers = min(max(round(num_speakers), 1), len(segments))
+     if len(segments) == 1:
+         segments[0]['speaker'] = 'SPEAKER 1'
+     else:
+         embeddings = make_embeddings(path, segments, duration)
+         add_speaker_labels(segments, embeddings, num_speakers)
+     output = get_output(segments)
+     return output
+
+ def convert_to_wav(path):
+     if path[-3:] != 'wav':
+         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
+         try:
+             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
+         except:
+             return path, 'Error: Could not convert file to .wav'
+         path = new_path
+     return path, None
+
+ def get_duration(path):
+     with contextlib.closing(wave.open(path, 'r')) as f:
+         frames = f.getnframes()
+         rate = f.getframerate()
+         return frames / float(rate)
+
+ def make_embeddings(path, segments, duration):
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(path, segment, duration)
+     return np.nan_to_num(embeddings)
+
+ audio = Audio()
+
+ def segment_embedding(path, segment, duration):
+     start = segment["start"]
+     # Whisper overshoots the end timestamp in the last segment
+     end = min(duration, segment["end"])
+     clip = Segment(start, end)
+     waveform, sample_rate = audio.crop(path, clip)
+     return embedding_model(waveform[None])
+
+ def add_speaker_labels(segments, embeddings, num_speakers):
+     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+     labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+ def time(secs):
+     return datetime.timedelta(seconds=round(secs))
+
+ def get_output(segments):
+     output = ''
+     for (i, segment) in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             if i != 0:
+                 output += '\n\n'
+             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+         output += segment["text"][1:] + ' '
+     return output
+
+ gr.Interface(
+     title='Whisper with Speaker Recognition',
+     fn=transcribe,
+     inputs=[
+         gr.inputs.Audio(source="upload", type="filepath"),
+         gr.inputs.Number(default=2, label="Number of Speakers")
+
+     ],
+     outputs=[
+         gr.outputs.Textbox(label='Transcript')
+     ]
+ ).launch()
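For local testing, the diarized transcription can also be produced without the Gradio UI by calling transcribe() directly. The sketch below is illustrative only and is not part of the commit: it assumes the gr.Interface(...).launch() call is moved under an `if __name__ == "__main__":` guard so that importing app.py does not start the server, and "meeting.wav" is a hypothetical local WAV recording.

# Hypothetical direct use of transcribe() from app.py (assumptions noted above)
from app import transcribe

transcript = transcribe("meeting.wav", num_speakers=2)  # plain text with 'SPEAKER n <timestamp>' headers
print(transcript)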