Spaces:

vsrinivas
/

Transcribe_the_audio_and_get_semantic_chunks

Running

Transcribe_the_audio_and_get_semantic_chunks

File size: 4,987 Bytes

from pytubefix import YouTube
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import whisper
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import gradio as gr
import ast
from IPython.display import Audio, display

# Load the Whisper "base" model once at import time; extract_yt_audio reuses
# this module-level instance for every transcription request.
model = whisper.load_model("base")

def extract_yt_audio(video_url):
    """
    Transcribe the audio of a YouTube video or of a local media file.

    Args:
        video_url: Either a YouTube URL (anything containing "youtube.com" or
            "youtu.be") or the location of a local audio/video file.

    Returns:
        A 4-tuple matching the Gradio outputs wired to this callback:
        an update showing the transcript text, an update revealing the
        chunking button, the ``segments`` entry of the Whisper result, and an
        update pointing the audio player at the exported ``audio.wav``.

    Raises:
        gr.Error: If no input was supplied, or the YouTube video exposes no
            audio-only stream.
    """
    # Pasted URLs frequently carry stray whitespace; an empty value would only
    # fail later with an opaque decoder error.
    video_url = (video_url or "").strip()
    if not video_url:
        raise gr.Error("Please provide a YouTube URL or a file location.")

    if "youtube.com" in video_url or "youtu.be" in video_url:
        yt = YouTube(video_url, use_oauth=True)
        # Only the audio track is needed for transcription.
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            raise gr.Error("No audio stream is available for this video.")
        audio_file = audio_stream.download()
        sample = AudioSegment.from_file(audio_file)
    else:
        sample = AudioSegment.from_file(video_url)

    # Re-encode to WAV: this single file is both transcribed and played back.
    audio_path = 'audio.wav'
    sample.export(audio_path, format="wav")

    print("Transcription started")
    result = model.transcribe(audio_path)
    print("Transcript:\n")
    print(result['text'], '\n')

    return (
        gr.update(visible=True, value=result['text']),
        gr.update(visible=True),
        result['segments'],
        gr.update(visible=True, value=audio_path),
    )

    
def _parse_segments(segs):
    """Return the segments as a list, decoding the string form if needed."""
    if isinstance(segs, str):
        if not segs.strip():
            return []
        # literal_eval only accepts Python literals, so text coming back from
        # the UI cannot execute code.
        return ast.literal_eval(segs)
    return list(segs) if segs else []


def _make_chunk(chunk_id, sentences, duration, start_time, end_time):
    """Build one row of the chunk table."""
    return {
        'chunk_id': chunk_id,
        'chunk_length (secs)': duration,
        'semantic_chunk': ' '.join(sentences),
        'start_time (secs)': start_time,
        'end_time (secs)': end_time,
    }


def semantic_chunks(segs, max_chunk_length=15.0):
    """
    Group transcribed segments into sentence-aligned chunks of bounded length.

    Each segment's text is split into sentences, and consecutive sentences are
    packed into a chunk until adding the next one would push the chunk past
    ``max_chunk_length`` seconds. A single sentence longer than the limit
    becomes a chunk on its own.

    Args:
        segs: The transcription segments, either as a list of dicts with
            'start', 'end' and 'text' keys, or as the string representation
            of such a list (the form delivered by the hidden textbox).
        max_chunk_length: Maximum chunk duration in seconds.

    Returns:
        A Gradio update that makes the target component visible and sets its
        value to a DataFrame with one row per chunk (chunk id, length, text,
        start time and end time).
    """
    chunks = []
    current_chunk = []
    chunk_start_time = None
    chunk_end_time = None
    chunk_duration = 0.0

    for segment in _parse_segments(segs):
        start = segment['start']
        end = segment['end']

        # Sentence-tokenize each segment so chunk borders fall on sentence ends.
        sentences = sent_tokenize(segment['text'])
        if not sentences:
            continue

        # Segments carry no per-sentence timing, so the segment's duration is
        # shared equally among its sentences.
        sentence_duration = (end - start) / len(sentences)
        last_index = len(sentences) - 1

        for index, sentence in enumerate(sentences):
            # Give every sentence its own slice of the segment; stamping the
            # whole segment's range on each sentence would make neighbouring
            # chunks overlap whenever a segment is split between them.
            sentence_start = start + index * sentence_duration
            # Use the exact segment end for the last sentence to avoid
            # floating-point drift.
            sentence_end = end if index == last_index else sentence_start + sentence_duration

            # Close the running chunk when this sentence would overflow it.
            # An empty running chunk is never emitted.
            if current_chunk and chunk_duration + sentence_duration > max_chunk_length:
                chunks.append(_make_chunk(
                    len(chunks) + 1, current_chunk, chunk_duration,
                    chunk_start_time, chunk_end_time,
                ))
                current_chunk = []
                chunk_duration = 0.0

            if not current_chunk:
                chunk_start_time = sentence_start
            current_chunk.append(sentence)
            chunk_duration += sentence_duration
            chunk_end_time = sentence_end

    # Flush whatever is left after the last segment.
    if current_chunk:
        chunks.append(_make_chunk(
            len(chunks) + 1, current_chunk, chunk_duration,
            chunk_start_time, chunk_end_time,
        ))

    return gr.update(visible=True, value=pd.DataFrame(chunks))


def clear_all():
    """
    Reset the interface.

    Returns five Gradio updates, in order: two that keep a component visible
    while blanking its value, followed by three that hide a component.
    """
    blank_text = {"visible": True, "value": ""}
    hidden = {"visible": False}
    update_specs = (blank_text, blank_text, hidden, hidden, hidden)
    return tuple(gr.update(**spec) for spec in update_specs)


# Build the UI. Components are rendered in the order they are created below.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Extract audio from video, get the transcript and then get the semantic chunk information.
    """)
    # Example YouTube input: https://www.youtube.com/watch?v=ug5e4JfC3oo
    input_url = gr.Textbox(label="Type-in the URL or File Location of the Video", value='sample.wav')
    # Hidden textbox that carries the transcription segments from the first
    # callback to the second; semantic_chunks decodes its text content.
    segments = gr.Textbox(visible=False)
    submit_btn_1 = gr.Button("Get the Transcript", visible=True)
    audio = gr.Audio(visible=True, type="filepath", label='Play Audio')
    transcript = gr.Textbox(visible=True, label='Transcript')
    # Stays hidden until extract_yt_audio returns an update that reveals it.
    submit_btn_2 = gr.Button("Get the semantically Chunked Segments", visible=False)
    chunks = gr.Dataframe(visible=False, label = 'semantic Chunks')
    clear_btn = gr.Button("Clear")

    # Step 1: transcribe, fill the transcript, reveal step 2, store segments, load the audio player.
    submit_btn_1.click(fn=extract_yt_audio, inputs=[input_url], outputs=[transcript, submit_btn_2, segments, audio])
    # Step 2: turn the stored segments into the chunk table.
    submit_btn_2.click(fn=semantic_chunks, inputs=[segments], outputs=[chunks])
    # Reset: `segments` is not among the outputs, so its stored text is left as-is.
    clear_btn.click(fn=clear_all, outputs=[input_url, transcript, submit_btn_2, chunks, audio])
demo.launch(debug=True)