Dy committed on
Commit 4b1428d
1 Parent(s): c08f21e

Create app.py

Files changed (1)
  1. app.py +358 -0
app.py ADDED
@@ -0,0 +1,358 @@
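+ """Auto Narrator: a Gradio app that writes and voices a narration for an
+ uploaded video. Frames are sampled and sent to GPT-4 Vision to draft a
+ script, ElevenLabs converts the script to speech, and moviepy mixes the
+ voiceover back into the video."""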
+ import openai
+ import requests
+ import os
+ from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
+ import cv2  # We're using OpenCV to read video frames
+ import base64
+ import gradio as gr
+
+ # Assumes the API keys are provided as environment variables with these
+ # exact names (e.g. as deployment secrets).
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ ELEVEN_LABS_API = os.environ.get("ELEVEN_LABS_API")
+
+ # Set your OpenAI API key here
+ openai.api_key = OPENAI_API_KEY
+
+
+ def video_to_frames(video_file_path):
+     """Read a video and return its frames as base64 JPEGs, plus the filename and duration."""
+     # Gradio may pass either a filepath string or a file-like object
+     if isinstance(video_file_path, str):
+         video_filename = video_file_path
+     else:
+         video_filename = video_file_path.name
+
+     video_duration = VideoFileClip(video_filename).duration
+
+     video = cv2.VideoCapture(video_filename)
+     base64Frames = []
+
+     frame_count = 0
+     while video.isOpened():
+         success, frame = video.read()
+         if not success:
+             break
+         # Encode each frame as a base64 JPEG string for the vision model
+         _, buffer = cv2.imencode(".jpg", frame)
+         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+         frame_count += 1
+         if frame_count % 30 == 0:
+             print(f"{frame_count} frames added.")
+
+     video.release()
+     print(len(base64Frames), "frames read.")
+     return base64Frames, video_filename, video_duration
+
+
+ def text_to_speech(text, video_filename, voice_type="feminine-american", API_KEY=ELEVEN_LABS_API):
+     """Call the ElevenLabs API to turn the narration script into an MP3 file."""
+     MODEL_ID = "eleven_monolingual_v1"
+
+     # Map each voice option to its ElevenLabs voice ID and stability setting
+     VOICES = {
+         "feminine-american": ("21m00Tcm4TlvDq8ikWAM", 0.3),
+         "masculine-american": ("VR6AewLTigWG4xSOukaG", 0.5),
+         "feminine-british": ("ThT5KcBeYPX3keUQqHPh", 0.5),
+         "masculine-british": ("Yko7PKHZNXotIFUBG7I9", 0.5),
+     }
+     voice_id, stability = VOICES.get(voice_type, VOICES["feminine-american"])
+     BASE_URL = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
+
+     headers = {
+         "Accept": "audio/mpeg",
+         "Content-Type": "application/json",
+         "xi-api-key": API_KEY,
+     }
+     data = {
+         "text": text,
+         "model_id": MODEL_ID,
+         "voice_settings": {
+             "stability": stability,
+             "similarity_boost": 0.5,
+         },
+     }
+
+     # Send the POST request to the API
+     response = requests.post(BASE_URL, json=data, headers=headers)
+
+     audio_filename = 'testing_file.mp3'
+     # Check if the response is OK
+     if response.status_code == 200:
+         # Stream the returned MP3 bytes to the file
+         with open(audio_filename, 'wb') as file:
+             for chunk in response.iter_content(chunk_size=1024 * 1024):
+                 file.write(chunk)
+         print(f'Saved {audio_filename}')
+     else:
+         print(f'Error: Received response code {response.status_code}')
+
+     return audio_filename
+
+
+ def frames_to_story(base64Frames, prompt, video_duration):
+     """Ask GPT-4 Vision to write a narration script from a sample of the frames."""
+     # Sample roughly one frame per second of video so the request stays small
+     fps = max(1, int(len(base64Frames) / video_duration))
+     frame_cut_thres = fps
+     print("Cutting at", frame_cut_thres)
+
+     list_of_dictionaries = [
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{frame}",
+                 "detail": "low",
+             },
+         }
+         for frame in base64Frames[::frame_cut_thres]
+     ]
+
+     PROMPT_MESSAGES = [
+         {
+             "role": "user",
+             "content": [
+                 prompt,
+                 *list_of_dictionaries,
+             ],
+         },
+     ]
+     params = {
+         "model": "gpt-4-vision-preview",
+         "messages": PROMPT_MESSAGES,
+         "max_tokens": 500,
+     }
+
+     result = openai.chat.completions.create(**params)
+     print(result.choices[0].message.content)
+     return result.choices[0].message.content
+
+
+ def prompt_type(prompt_user, prompt_input, video_duration):
+     """Assemble the final prompt from the chosen narration style and a word-count target."""
+     prompt_documentary = '''
+     You are a world-class documentary narration script writer.
+     Based on the frames in the video, write a captivating voiceover for it.
+     Write it with close observation of each frame.
+     Observe the sudden changes in movement between frames and narrate them.
+     '''
+
+     prompt_how_to = '''
+     You are an expert narrator that specializes in writing narration scripts for "how-to" videos.
+     Your goal is to write a script so that the audience can follow the instructions in the video.
+     Pay attention to where the mouse and tap cursor are and navigate based on the sequence of frames.
+     Remember to narrate something useful, something the audience can understand and act on.
+     '''
+
+     prompt_sports_commentator = '''
+     You are a professional sports commentator who can comment on all kinds of sports, including e-sports.
+     Your goal is to write a script that is exciting and makes the audience's heart beat fast.
+     Pay attention to what the players (or their characters) are doing in each frame and narrate their actions.
+     Remember to narrate something exciting and nail-biting. Keep the audience on their toes and wanting to know more.
+     Add lots of exclamation marks and emotion to the voiceover script.
+     '''
+
+     # Each style pairs a prompt with a words-per-second pacing factor
+     if prompt_input == "how-to":
+         prompt_input = prompt_how_to
+         mul_factor = 1.5
+     elif prompt_input == "documentary":
+         prompt_input = prompt_documentary
+         mul_factor = 2
+     elif prompt_input == "sports-commentator":
+         prompt_input = prompt_sports_commentator
+         mul_factor = 1.5
+     elif prompt_input == "custom-prompt":
+         prompt_input = prompt_user
+         mul_factor = 2
+     else:
+         prompt_input = ""
+         mul_factor = 2
+
+     # Target word count: roughly mul_factor words per second of video
+     est_word_count = int(video_duration * mul_factor)
+
+     word_lim_prompt = f'''This video is EXACTLY {video_duration} seconds long,
+     so make sure the voiceover narration script is EXACTLY {est_word_count} words.
+     Do not go over {est_word_count} words for the output script.
+     '''
+
+     initial_prompt = '''
+     These are a sequence of frames from a short video.
+     You are an expert voiceover script writer. The voiceover is to help the audience and viewer.
+     Write a voiceover for the video by carefully analyzing each frame.
+     Make sure there is coherence between frames.
+     '''
+     final_prompt = word_lim_prompt + initial_prompt + prompt_user + prompt_input + "\n" + word_lim_prompt
+
+     return final_prompt
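+
+ # Example of the pacing heuristic above: a 30-second "documentary" video
+ # (mul_factor = 2) targets int(30 * 2) = 60 words of narration.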
+
+
+ def merge_audio_video(video_filename, audio_filename, output_filename, original_audio_volume=0.3):
+     """Mix the voiceover with the (quieted) original audio and render the final video."""
+     print("Merging audio and video...")
+     print("Video filename:", video_filename)
+     print("Audio filename:", audio_filename)
+
+     # Load the video file and the generated voiceover
+     video_clip = VideoFileClip(video_filename)
+     new_audio_clip = AudioFileClip(audio_filename)
+
+     try:
+         # Reduce the volume of the original audio and mix in the voiceover
+         original_audio = video_clip.audio.volumex(original_audio_volume)
+         mixed_audio = CompositeAudioClip([original_audio, new_audio_clip])
+     except AttributeError:
+         # The video has no audio track, so use the voiceover on its own
+         print("No original audio track; using the voiceover only.")
+         mixed_audio = new_audio_clip
+
+     # Set the mixed audio as the audio of the video clip
+     final_clip = video_clip.set_audio(mixed_audio)
+
+     # Write the result to a file
+     final_clip.write_videofile(output_filename, codec='libx264', audio_codec='aac')
+
+     # Close the clips
+     video_clip.close()
+     new_audio_clip.close()
+
+     # Return the path to the new video file
+     return output_filename
+
+
+ def process_video(uploaded_file, prompt_user, prompt_input, voice_type="feminine-american"):
+     """Full pipeline: frames -> script -> voiceover -> merged video."""
+     # Gradio may pass either a filepath string or a file-like object
+     if isinstance(uploaded_file, str):
+         video_filename = uploaded_file
+     else:
+         video_filename = uploaded_file.name
+     print("video", video_filename)
+
+     base64Frames, video_filename, video_duration = video_to_frames(video_filename)
+
+     # Build the prompt, generate the script, then voice it
+     final_prompt = prompt_type(prompt_user, prompt_input, video_duration)
+     print(final_prompt)
+     text = frames_to_story(base64Frames, final_prompt, video_duration)
+
+     audio_filename = text_to_speech(text, video_filename, voice_type)
+     print("audio", audio_filename)
+
+     # Merge audio and video
+     output_video_filename = os.path.splitext(video_filename)[0] + '_output.mp4'
+     final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
+     print("final", final_video_filename)
+
+     # Clean up the temporary upload and the intermediate audio file
+     if not isinstance(uploaded_file, str):
+         os.unlink(video_filename)
+         os.unlink(audio_filename)
+
+     return final_video_filename, text
+
+
+ def regenerate(uploaded_file, edited_script):
+     """Re-voice an edited script and re-merge it with the original video."""
+     # Recover the video path from the upload, mirroring process_video
+     if isinstance(uploaded_file, str):
+         video_filename = uploaded_file
+     else:
+         video_filename = uploaded_file.name
+
+     # Generate audio from the edited text
+     audio_filename = text_to_speech(edited_script, video_filename)
+     print("audio", audio_filename)
+
+     # Merge audio and video
+     output_video_filename = os.path.splitext(video_filename)[0] + '_output.mp4'
+     final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
+     print("final", final_video_filename)
+
+     if not isinstance(uploaded_file, str):
+         os.unlink(video_filename)
+         os.unlink(audio_filename)
+
+     return final_video_filename, edited_script
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Auto Narrator
+         Upload a video and provide a prompt to generate a narration.
+         """)
+     with gr.Row():
+         with gr.Column():
+             video_input = gr.Video(label="Upload Video")
+             prompt_user = gr.Textbox(label="Enter your prompt")
+             prompt_input = gr.Dropdown(['how-to', 'documentary', 'sports-commentator', 'custom-prompt'], label="Choose Your Narration")
+             voice_type = gr.Dropdown(['masculine-american', 'masculine-british', 'feminine-american', 'feminine-british'], label="Choose Your Voice")
+             generate_btn = gr.Button(value="Generate")
+         with gr.Column():
+             output_file = gr.Video(label="Output video file")
+             output_voiceover = gr.Textbox(label="Generated Text")
+             regenerate_btn = gr.Button(value="Re-generate")
+
+     # Wire the buttons to the processing functions
+     generate_btn.click(process_video, inputs=[video_input, prompt_user, prompt_input, voice_type], outputs=[output_file, output_voiceover])
+     regenerate_btn.click(regenerate, inputs=[video_input, output_voiceover], outputs=[output_file, output_voiceover])
+
+ demo.launch()
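+
+ # Deployment note: when hosting this app (e.g. as a Hugging Face Space),
+ # supply OPENAI_API_KEY and ELEVEN_LABS_API as secrets so the environment
+ # lookups at the top of the file succeed.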