Dy committed on
Commit 4b1428d
1 Parent(s): c08f21e

Create app.py

Files changed (1)
  1. app.py +358 -0
app.py ADDED
@@ -0,0 +1,358 @@
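+ """Auto Narrator: a Gradio app that writes and voices a narration for an
+ uploaded video. Frames are sampled and sent to GPT-4 Vision to draft a
+ script, ElevenLabs converts the script to speech, and moviepy mixes the
+ voiceover back into the video."""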
+ import openai
+ import requests
+ import os
+ from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
+ import cv2  # We're using OpenCV to read video frames
+ import base64
+ import gradio as gr
+
+ # Assumes the API keys are provided as environment variables with these
+ # exact names (e.g. as deployment secrets).
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ ELEVEN_LABS_API = os.environ.get("ELEVEN_LABS_API")
+
+ # Set your OpenAI API key here
+ openai.api_key = OPENAI_API_KEY
+
+
+ def video_to_frames(video_file_path):
+     """Read a video and return its frames as base64 JPEGs, plus the filename and duration."""
+     # Gradio may pass either a filepath string or a file-like object
+     if isinstance(video_file_path, str):
+         video_filename = video_file_path
+     else:
+         video_filename = video_file_path.name
+
+     video_duration = VideoFileClip(video_filename).duration
+
+     video = cv2.VideoCapture(video_filename)
+     base64Frames = []
+
+     frame_count = 0
+     while video.isOpened():
+         success, frame = video.read()
+         if not success:
+             break
+         # Encode each frame as a base64 JPEG string for the vision model
+         _, buffer = cv2.imencode(".jpg", frame)
+         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+         frame_count += 1
+         if frame_count % 30 == 0:
+             print(f"{frame_count} frames added.")
+
+     video.release()
+     print(len(base64Frames), "frames read.")
+     return base64Frames, video_filename, video_duration
+
+
+ def text_to_speech(text, video_filename, voice_type="feminine-american", API_KEY=ELEVEN_LABS_API):
+     """Call the ElevenLabs API to turn the narration script into an MP3 file."""
+     MODEL_ID = "eleven_monolingual_v1"
+
+     # Map each voice option to its ElevenLabs voice ID and stability setting
+     VOICES = {
+         "feminine-american": ("21m00Tcm4TlvDq8ikWAM", 0.3),
+         "masculine-american": ("VR6AewLTigWG4xSOukaG", 0.5),
+         "feminine-british": ("ThT5KcBeYPX3keUQqHPh", 0.5),
+         "masculine-british": ("Yko7PKHZNXotIFUBG7I9", 0.5),
+     }
+     voice_id, stability = VOICES.get(voice_type, VOICES["feminine-american"])
+     BASE_URL = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
+
+     headers = {
+         "Accept": "audio/mpeg",
+         "Content-Type": "application/json",
+         "xi-api-key": API_KEY,
+     }
+     data = {
+         "text": text,
+         "model_id": MODEL_ID,
+         "voice_settings": {
+             "stability": stability,
+             "similarity_boost": 0.5,
+         },
+     }
+
+     # Send the POST request to the API
+     response = requests.post(BASE_URL, json=data, headers=headers)
+
+     audio_filename = 'testing_file.mp3'
+     # Check if the response is OK
+     if response.status_code == 200:
+         # Stream the returned MP3 bytes to the file
+         with open(audio_filename, 'wb') as file:
+             for chunk in response.iter_content(chunk_size=1024 * 1024):
+                 file.write(chunk)
+         print(f'Saved {audio_filename}')
+     else:
+         print(f'Error: Received response code {response.status_code}')
+
+     return audio_filename
+
+
+ def frames_to_story(base64Frames, prompt, video_duration):
+     """Ask GPT-4 Vision to write a narration script from a sample of the frames."""
+     # Sample roughly one frame per second of video so the request stays small
+     fps = max(1, int(len(base64Frames) / video_duration))
+     frame_cut_thres = fps
+     print("Cutting at", frame_cut_thres)
+
+     list_of_dictionaries = [
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{frame}",
+                 "detail": "low",
+             },
+         }
+         for frame in base64Frames[::frame_cut_thres]
+     ]
+
+     PROMPT_MESSAGES = [
+         {
+             "role": "user",
+             "content": [
+                 prompt,
+                 *list_of_dictionaries,
+             ],
+         },
+     ]
+     params = {
+         "model": "gpt-4-vision-preview",
+         "messages": PROMPT_MESSAGES,
+         "max_tokens": 500,
+     }
+
+     result = openai.chat.completions.create(**params)
+     print(result.choices[0].message.content)
+     return result.choices[0].message.content
+
+
+ def prompt_type(prompt_user, prompt_input, video_duration):
+     """Assemble the final prompt from the chosen narration style and a word-count target."""
+     prompt_documentary = '''
+     You are a world-class documentary narration script writer.
+     Based on the frames in the video, write a captivating voiceover for it.
+     Write it with close observation of each frame.
+     Observe the sudden changes in movement between frames and narrate them.
+     '''
+
+     prompt_how_to = '''
+     You are an expert narrator that specializes in writing narration scripts for "how-to" videos.
+     Your goal is to write a script so that the audience can follow the instructions in the video.
+     Pay attention to where the mouse and tap cursor are and navigate based on the sequence of frames.
+     Remember to narrate something useful, something the audience can understand and act on.
+     '''
+
+     prompt_sports_commentator = '''
+     You are a professional sports commentator who can comment on all kinds of sports, including e-sports.
+     Your goal is to write a script that is exciting and makes the audience's heart beat fast.
+     Pay attention to what the players (or their characters) are doing in each frame and narrate their actions.
+     Remember to narrate something exciting and nail-biting. Keep the audience on their toes and wanting to know more.
+     Add lots of exclamation marks and emotion to the voiceover script.
+     '''
+
+     # Each style pairs a prompt with a words-per-second pacing factor
+     if prompt_input == "how-to":
+         prompt_input = prompt_how_to
+         mul_factor = 1.5
+     elif prompt_input == "documentary":
+         prompt_input = prompt_documentary
+         mul_factor = 2
+     elif prompt_input == "sports-commentator":
+         prompt_input = prompt_sports_commentator
+         mul_factor = 1.5
+     elif prompt_input == "custom-prompt":
+         prompt_input = prompt_user
+         mul_factor = 2
+     else:
+         prompt_input = ""
+         mul_factor = 2
+
+     # Target word count: roughly mul_factor words per second of video
+     est_word_count = int(video_duration * mul_factor)
+
+     word_lim_prompt = f'''This video is EXACTLY {video_duration} seconds long,
+     so make sure the voiceover narration script is EXACTLY {est_word_count} words.
+     Do not go over {est_word_count} words for the output script.
+     '''
+
+     initial_prompt = '''
+     These are a sequence of frames from a short video.
+     You are an expert voiceover script writer. The voiceover is to help the audience and viewer.
+     Write a voiceover for the video by carefully analyzing each frame.
+     Make sure there is coherence between frames.
+     '''
+     final_prompt = word_lim_prompt + initial_prompt + prompt_user + prompt_input + "\n" + word_lim_prompt
+
+     return final_prompt
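+
+ # Example of the pacing heuristic above: a 30-second "documentary" video
+ # (mul_factor = 2) targets int(30 * 2) = 60 words of narration.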
+
+
+ def merge_audio_video(video_filename, audio_filename, output_filename, original_audio_volume=0.3):
+     """Mix the voiceover with the (quieted) original audio and render the final video."""
+     print("Merging audio and video...")
+     print("Video filename:", video_filename)
+     print("Audio filename:", audio_filename)
+
+     # Load the video file and the generated voiceover
+     video_clip = VideoFileClip(video_filename)
+     new_audio_clip = AudioFileClip(audio_filename)
+
+     try:
+         # Reduce the volume of the original audio and mix in the voiceover
+         original_audio = video_clip.audio.volumex(original_audio_volume)
+         mixed_audio = CompositeAudioClip([original_audio, new_audio_clip])
+     except AttributeError:
+         # The video has no audio track, so use the voiceover on its own
+         print("No original audio track; using the voiceover only.")
+         mixed_audio = new_audio_clip
+
+     # Set the mixed audio as the audio of the video clip
+     final_clip = video_clip.set_audio(mixed_audio)
+
+     # Write the result to a file
+     final_clip.write_videofile(output_filename, codec='libx264', audio_codec='aac')
+
+     # Close the clips
+     video_clip.close()
+     new_audio_clip.close()
+
+     # Return the path to the new video file
+     return output_filename
+
+
+ def process_video(uploaded_file, prompt_user, prompt_input, voice_type="feminine-american"):
+     """Full pipeline: frames -> script -> voiceover -> merged video."""
+     # Gradio may pass either a filepath string or a file-like object
+     if isinstance(uploaded_file, str):
+         video_filename = uploaded_file
+     else:
+         video_filename = uploaded_file.name
+     print("video", video_filename)
+
+     base64Frames, video_filename, video_duration = video_to_frames(video_filename)
+
+     # Build the prompt, generate the script, then voice it
+     final_prompt = prompt_type(prompt_user, prompt_input, video_duration)
+     print(final_prompt)
+     text = frames_to_story(base64Frames, final_prompt, video_duration)
+
+     audio_filename = text_to_speech(text, video_filename, voice_type)
+     print("audio", audio_filename)
+
+     # Merge audio and video
+     output_video_filename = os.path.splitext(video_filename)[0] + '_output.mp4'
+     final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
+     print("final", final_video_filename)
+
+     # Clean up the temporary upload and the intermediate audio file
+     if not isinstance(uploaded_file, str):
+         os.unlink(video_filename)
+         os.unlink(audio_filename)
+
+     return final_video_filename, text
+
+
+ def regenerate(uploaded_file, edited_script):
+     """Re-voice an edited script and re-merge it with the original video."""
+     # Recover the video path from the upload, mirroring process_video
+     if isinstance(uploaded_file, str):
+         video_filename = uploaded_file
+     else:
+         video_filename = uploaded_file.name
+
+     # Generate audio from the edited text
+     audio_filename = text_to_speech(edited_script, video_filename)
+     print("audio", audio_filename)
+
+     # Merge audio and video
+     output_video_filename = os.path.splitext(video_filename)[0] + '_output.mp4'
+     final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
+     print("final", final_video_filename)
+
+     if not isinstance(uploaded_file, str):
+         os.unlink(video_filename)
+         os.unlink(audio_filename)
+
+     return final_video_filename, edited_script
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Auto Narrator
+         Upload a video and provide a prompt to generate a narration.
+         """)
+     with gr.Row():
+         with gr.Column():
+             video_input = gr.Video(label="Upload Video")
+             prompt_user = gr.Textbox(label="Enter your prompt")
+             prompt_input = gr.Dropdown(['how-to', 'documentary', 'sports-commentator', 'custom-prompt'], label="Choose Your Narration")
+             voice_type = gr.Dropdown(['masculine-american', 'masculine-british', 'feminine-american', 'feminine-british'], label="Choose Your Voice")
+             generate_btn = gr.Button(value="Generate")
+         with gr.Column():
+             output_file = gr.Video(label="Output video file")
+             output_voiceover = gr.Textbox(label="Generated Text")
+             regenerate_btn = gr.Button(value="Re-generate")
+
+     # Wire the buttons to the processing functions
+     generate_btn.click(process_video, inputs=[video_input, prompt_user, prompt_input, voice_type], outputs=[output_file, output_voiceover])
+     regenerate_btn.click(regenerate, inputs=[video_input, output_voiceover], outputs=[output_file, output_voiceover])
+
+ demo.launch()
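+
+ # Deployment note: when hosting this app (e.g. as a Hugging Face Space),
+ # supply OPENAI_API_KEY and ELEVEN_LABS_API as secrets so the environment
+ # lookups at the top of the file succeed.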