from pytubefix import YouTube
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import whisper
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import gradio as gr
import ast
from IPython.display import Audio, display

# Sentence-tokenizer data. Newer NLTK releases look up 'punkt_tab' instead of
# 'punkt' for sent_tokenize; requesting both keeps either version working.
nltk.download('punkt')
nltk.download('punkt_tab')

# Whisper model shared by every transcription request.
model = whisper.load_model("base")

# Column order of the chunk table returned by semantic_chunks().
_CHUNK_COLUMNS = [
    'chunk_id',
    'chunk_length (secs)',
    'semantic_chunk',
    'start_time (secs)',
    'end_time (secs)',
]


def extract_yt_audio(video_url):
    """Fetch the audio for ``video_url`` and transcribe it with Whisper.

    Args:
        video_url: A YouTube URL (``youtube.com`` / ``youtu.be``) or a path to
            a local media file readable by pydub.

    Returns:
        A 4-tuple matching the Gradio outputs wired to this callback:
        an update showing the transcript text, an update revealing the
        "chunk" button, the list of Whisper segments, and an update pointing
        the audio player at the exported WAV file.

    Raises:
        ValueError: If ``video_url`` is empty or the YouTube video exposes no
            audio-only stream.
    """
    video_url = (video_url or "").strip()
    if not video_url:
        raise ValueError("Please provide a YouTube URL or a path to a media file.")

    if "youtube.com" in video_url or "youtu.be" in video_url:
        yt = YouTube(video_url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            raise ValueError(f"No audio-only stream found for {video_url!r}.")
        audio_file = audio_stream.download()
        sample = AudioSegment.from_file(audio_file)
    else:
        sample = AudioSegment.from_file(video_url)

    audio_path = 'audio.wav'
    # Export BEFORE displaying: the notebook player reads the file from disk,
    # so it must already contain the audio for this request.
    sample.export(audio_path, format="wav")
    display(Audio(audio_path))

    result = model.transcribe(audio_path)
    print("Transcription started \nTranscript:\n")
    print(result['text'], '\n')

    return (
        gr.update(visible=True, value=result['text']),
        gr.update(visible=True),
        result['segments'],
        gr.update(visible=True, value=audio_path),
    )


def _parse_segments(segs):
    """Return the Whisper segments as a list, decoding a stringified list if needed."""
    if segs is None:
        return []
    if isinstance(segs, str):
        # The segments travel through a hidden Textbox, so they arrive as the
        # textual form of a list of dicts.
        if not segs.strip():
            return []
        try:
            segs = ast.literal_eval(segs)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                "Could not parse the transcript segments; "
                "please run the transcription again."
            ) from err
    return list(segs)


def _make_chunk(chunk_id, sentences, duration, start_time, end_time):
    """Build one row of the chunk table."""
    return {
        'chunk_id': chunk_id,
        'chunk_length (secs)': duration,
        'semantic_chunk': ' '.join(sentences),
        'start_time (secs)': start_time,
        'end_time (secs)': end_time,
    }


def _build_chunks(segments, max_chunk_length):
    """Group the sentences of ``segments`` into chunks of at most ``max_chunk_length`` seconds."""
    chunks = []
    current_sentences = []
    chunk_start_time = None
    chunk_end_time = None
    chunk_duration = 0.0

    for segment in segments:
        seg_start = segment['start']
        seg_end = segment['end']

        # Sentence-tokenize each segment to capture more semantic context.
        sentences = sent_tokenize(segment['text'])
        if not sentences:
            continue

        # Whisper only timestamps whole segments, so each sentence is assumed
        # to take an equal share of its segment's duration.
        sentence_duration = (seg_end - seg_start) / len(sentences)
        last_index = len(sentences) - 1

        for index, sentence in enumerate(sentences):
            sentence_start = seg_start + index * sentence_duration
            # Use the exact segment end for the final sentence to avoid
            # floating-point drift.
            if index == last_index:
                sentence_end = seg_end
            else:
                sentence_end = sentence_start + sentence_duration

            # Close the running chunk if this sentence would overflow it.
            # An empty chunk is never flushed, so a single over-long sentence
            # simply becomes its own chunk.
            if current_sentences and chunk_duration + sentence_duration > max_chunk_length:
                chunks.append(_make_chunk(
                    len(chunks) + 1,
                    current_sentences,
                    chunk_duration,
                    chunk_start_time,
                    chunk_end_time,
                ))
                current_sentences = []
                chunk_duration = 0.0

            if not current_sentences:
                chunk_start_time = sentence_start
            current_sentences.append(sentence)
            chunk_duration += sentence_duration
            chunk_end_time = sentence_end

    # Finalize the last chunk if it exists.
    if current_sentences:
        chunks.append(_make_chunk(
            len(chunks) + 1,
            current_sentences,
            chunk_duration,
            chunk_start_time,
            chunk_end_time,
        ))

    return chunks


def semantic_chunks(segs, max_chunk_length=15.0):
    """Split transcribed segments into sentence-aligned chunks.

    Args:
        segs: Whisper segments, either as a list of dicts with ``start``,
            ``end`` and ``text`` keys, or as the string form of such a list
            (which is what the hidden Gradio Textbox delivers).
        max_chunk_length: Maximum chunk duration in seconds. A single sentence
            longer than this still forms a chunk of its own.

    Returns:
        A Gradio update that makes the chunk table visible and fills it with a
        DataFrame holding one row per chunk (id, length, text, start, end).

    Raises:
        ValueError: If ``segs`` is a string that cannot be parsed.
    """
    segments = _parse_segments(segs)
    chunks = _build_chunks(segments, max_chunk_length)
    return gr.update(visible=True, value=pd.DataFrame(chunks, columns=_CHUNK_COLUMNS))


def clear_all():
    """Reset the UI: empty the URL and transcript boxes and hide the rest.

    Returns:
        Updates for, in order: the URL box, the transcript box, the chunk
        button, the chunk table and the audio player.
    """
    return (
        gr.update(visible=True, value=""),
        gr.update(visible=True, value=""),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
    )


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Extract audio from video, get the transcript and then get the semantic chunk information.
        """
    )
    input_url = gr.Textbox(
        label="Type-in the URL or File Location of the Video",
        value='https://www.youtube.com/watch?v=ug5e4JfC3oo',
    )
    # Hidden carrier that hands the Whisper segments from the transcription
    # step to the chunking step.
    segments = gr.Textbox(visible=False)
    submit_btn_1 = gr.Button("Get the Transcript", visible=True)
    audio = gr.Audio(visible=True, type="filepath", label='Play Audio')
    transcript = gr.Textbox(visible=True, label='Transcript')
    submit_btn_2 = gr.Button("Get the semantically Chuncked Segments", visible=False)
    chunks = gr.Dataframe(visible=False, label='semantic Chunks')
    clear_btn = gr.Button("Clear")

    submit_btn_1.click(
        fn=extract_yt_audio,
        inputs=[input_url],
        outputs=[transcript, submit_btn_2, segments, audio],
    )
    submit_btn_2.click(fn=semantic_chunks, inputs=[segments], outputs=[chunks])
    clear_btn.click(
        fn=clear_all,
        outputs=[input_url, transcript, submit_btn_2, chunks, audio],
    )

demo.launch(debug=True)