File size: 6,237 Bytes
ee54bd3 ca203fb 644c545 ee54bd3 c3a36b3 ee54bd3 c3a36b3 c94ca86 ee54bd3 7a5f23d a1204a0 ee54bd3 a1204a0 ee54bd3 2a6fdc6 ee54bd3 c3a36b3 ee54bd3 c3a36b3 ee54bd3 c3a36b3 ee54bd3 c3a36b3 a1204a0 c3a36b3 a1204a0 c3a36b3 ee54bd3 c3a36b3 ee54bd3 c3a36b3 f5fe020 c3a36b3 644c545 c3a36b3 ee54bd3 a1204a0 ee54bd3 a1204a0 ee54bd3 a1204a0 c3a36b3 ee54bd3 c3a36b3 ee54bd3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from pytubefix import YouTube
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import whisper
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
# Tokenizer data required by sent_tokenize.
nltk.download('punkt')
import gradio as gr
import ast
from IPython.display import Audio, display
import requests
# Newer NLTK releases look up the tokenizer tables under this resource name.
nltk.download('punkt_tab')
# Whisper model shared by every transcription request; loaded once at import.
model = whisper.load_model("base")
def extract_yt_audio(it, video_url, video_file):
    """Load audio from a URL or an uploaded file and transcribe it.

    Args:
        it: Input-type selector. ``'URL'`` means ``video_url`` is used; any
            other value means ``video_file`` is used.
        video_url: Address of the video. YouTube addresses are fetched through
            ``pytubefix``; every other address is downloaded with ``requests``.
        video_file: Uploaded file, passed straight to
            ``AudioSegment.from_file``.

    Returns:
        A 4-tuple of: an update showing the transcript text, an update that
        makes a component visible, the list of transcription segments, and an
        update pointing an audio component at the exported WAV file.

    Raises:
        requests.HTTPError: If a direct (non-YouTube) download returns an
            error status.
    """
    # Local import: BytesIO is not imported at module level in this file.
    from io import BytesIO

    if it == 'URL':
        # NOTE(review): the host substrings were lost from the source;
        # "youtube.com" / "youtu.be" are the assumed originals - confirm.
        if "youtube.com" in video_url or "youtu.be" in video_url:
            yt = YouTube(video_url, use_oauth=True)
            audio_stream = yt.streams.filter(only_audio=True).first()
            audio_file = audio_stream.download()
            sample = AudioSegment.from_file(audio_file, format="mp4")
        else:
            response = requests.get(video_url, timeout=60)
            # Fail with a clear HTTP error instead of trying to decode an
            # error page as media.
            response.raise_for_status()
            video_data = BytesIO(response.content)
            sample = AudioSegment.from_file(video_data, format="mp4")
    else:
        sample = AudioSegment.from_file(video_file)

    # Whisper reads from disk, so the audio is exported to a fixed WAV path.
    audio_path = 'audio.wav'
    sample.export(audio_path, format="wav")

    print("Transcription started \nTranscript:\n")
    result = model.transcribe(audio_path)
    print(result['text'], '\n')

    return (
        gr.update(visible=True, value=result['text']),
        gr.update(visible=True),
        result['segments'],
        gr.update(visible=True, value=audio_path),
    )
def _build_chunk_record(chunk_id, sentences, duration, start_time, end_time):
    """Return one output row describing a finished chunk."""
    return {
        'chunk_id': chunk_id,
        'chunk_length (secs)': duration,
        'semantic_chunk': ' '.join(sentences),
        'start_time (secs)': start_time,
        'end_time (secs)': end_time,
    }


def semantic_chunks(segs, max_chunk_length=15.0):
    """Group transcribed segments into sentence-based chunks of bounded length.

    Each segment's text is split into sentences, and sentences are accumulated
    into a chunk until adding the next one would exceed ``max_chunk_length``.

    Args:
        segs: Transcription segments, each a mapping with ``'start'``,
            ``'end'`` and ``'text'`` keys. May be the list itself or its
            Python-literal string form.
        max_chunk_length: Maximum chunk duration in seconds.

    Returns:
        An update that makes a component visible and sets its value to a
        DataFrame with one row per chunk.
    """
    print("Trying to get symantically chunked segments:")
    if isinstance(segs, str):
        # literal_eval only parses literals, so it cannot execute code.
        segs = ast.literal_eval(segs)

    chunks = []
    current_chunk = []
    chunk_start_time = None
    chunk_end_time = None
    chunk_duration = 0

    for segment in segs:
        start = segment['start']
        end = segment['end']
        # Sentence-tokenize each segment to capture more semantic context.
        sentences = sent_tokenize(segment['text'])
        if not sentences:
            continue
        # Sentences carry no timestamps of their own, so the segment's
        # duration is shared equally among them.
        sentence_duration = (end - start) / len(sentences)

        for sentence in sentences:
            if chunk_duration + sentence_duration <= max_chunk_length:
                if not current_chunk:
                    chunk_start_time = start
                current_chunk.append(sentence)
                chunk_duration += sentence_duration
                chunk_end_time = end
                continue

            # The chunk would become too long: finalize it. The guard avoids
            # emitting an empty row when a single sentence alone exceeds the
            # limit while nothing has been accumulated yet.
            if current_chunk:
                chunks.append(_build_chunk_record(
                    len(chunks) + 1, current_chunk, chunk_duration,
                    chunk_start_time, chunk_end_time,
                ))
            # Start a new chunk with the current sentence.
            current_chunk = [sentence]
            chunk_start_time = start
            chunk_end_time = end
            chunk_duration = sentence_duration

    # Finalize the last chunk if it exists.
    if current_chunk:
        chunks.append(_build_chunk_record(
            len(chunks) + 1, current_chunk, chunk_duration,
            chunk_start_time, chunk_end_time,
        ))

    return gr.update(visible=True, value=pd.DataFrame(chunks))
def toggle_input_fields(input_type):
    """Return visibility updates for the selected input type.

    Args:
        input_type: The selected option; ``"URL"`` selects the first
            component, any other value selects the second.

    Returns:
        Four updates: the first is visible only for ``"URL"``, the second only
        otherwise, and the last two are always made visible.
    """
    url_selected = input_type == "URL"
    return (
        gr.update(visible=url_selected),
        gr.update(visible=not url_selected),
        gr.update(visible=True),
        gr.update(visible=True),
    )
def clear_all():
    """Return the updates that reset the interface.

    Returns:
        Six updates: the first two are made visible with an empty-string
        value, the remaining four are hidden.
    """
    # NOTE(review): if the second target is a file-upload component, None may
    # be a more appropriate cleared value than "" - confirm against the wiring.
    return (
        gr.update(visible=True, value=""),
        gr.update(visible=True, value=""),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
    )
with gr.Blocks() as demo:
    # Extract audio from video, get the transcript and then get the semantic chunk information.

    # Radio button to choose between URL or upload
    input_type = gr.Radio(choices=["URL", "Upload"], label="Select Video Input Type", value="URL")
    input_url = gr.Textbox(label="Enter Video URL", visible=True, value='sample.mp4')
    video_file = gr.File(label="Upload Video", visible=False)

    # Hidden textbox that carries the transcription segments from the first
    # step to the chunking step.
    segments = gr.Textbox(visible=False)

    submit_btn_1 = gr.Button("Get the Transcript", visible=True)
    audio = gr.Audio(visible=False, type="filepath", label='Play Audio')
    transcript = gr.Textbox(visible=False, label='Transcript')
    submit_btn_2 = gr.Button("Get the semantically Chuncked Segments", visible=False)
    chunks = gr.Dataframe(visible=False, label='Semantic Chunks')
    clear_btn = gr.Button("Clear")

    # Event wiring.
    # NOTE(review): the `.click(fn=...)` prefixes were lost from the source;
    # handlers are matched to the surviving inputs/outputs lists by arity.
    input_type.change(
        fn=toggle_input_fields,
        inputs=input_type,
        outputs=[input_url, video_file, audio, transcript],
    )
    submit_btn_1.click(
        fn=extract_yt_audio,
        inputs=[input_type, input_url, video_file],
        outputs=[transcript, submit_btn_2, segments, audio],
    )
    submit_btn_2.click(
        fn=semantic_chunks,
        inputs=[segments],
        outputs=[chunks],
    )
    clear_btn.click(
        fn=clear_all,
        outputs=[input_url, video_file, transcript, submit_btn_2, chunks, audio],
    )

demo.launch(debug=True)