Ruslan Magana Vsevolodovna committed on
Commit
b080b2f
•
1 Parent(s): a0b536e

First version

Browse files

First version of Text to Video Dalle

Files changed (2) hide show
  1. app.py +236 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Step 2 - Importing Libraries
2
+ from moviepy.editor import *
3
+ from PIL import Image
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
5
+ import requests
6
+ import gradio as gr
7
+ import torch
8
+ import re
9
+ import os
10
+ import sys
11
+ from huggingface_hub import snapshot_download
12
+ import base64
13
+ import io
14
+ import cv2
15
+ import argparse
16
+ import os
17
+ from PIL import Image
18
+ from min_dalle import MinDalle
19
+ import torch
20
+ from PIL import Image, ImageDraw, ImageFont
21
+ import textwrap
22
+ from mutagen.mp3 import MP3
23
+ # to speech conversion
24
+ from gtts import gTTS
25
+ from IPython.display import Audio
26
+ from IPython.display import display
27
+ from pydub import AudioSegment
28
+ from os import getcwd
29
+ import glob
30
+ import nltk
31
+ from IPython.display import HTML
32
+ from base64 import b64encode
33
# Sentence-tokenizer data used by nltk's sent_tokenize. Older NLTK releases
# load the "punkt" resource, newer ones look for "punkt_tab"; NLTK is not
# pinned for this app, so fetch both to work with either.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

# Human-readable metadata for the app.
description = " Video Story Generator with Audio \n PS: Generation of video by using Artifical Intellingence by dalle-mini and distilbart and gtss "
title = "Video Story Generator with Audio by using dalle-mini and distilbart and gtss "

# Summarization model: tokenizer and seq2seq weights come from the same
# checkpoint, so the identifier is spelled once.
_SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
39
# Font used for the burned-in subtitles (Debian/Ubuntu location).
_FONT_PATH = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
# Pause inserted after every narrated sentence, in milliseconds. The same
# value extends each still frame so audio and video stay aligned.
_PAUSE_MS = 500


def _summary_sentences(text):
    """Summarize *text* and return the summary split at periods.

    Uses the module-level ``tokenizer`` and ``model``. Blank fragments (for
    example the empty string after a trailing period) are dropped, so a
    summary that does not end with a period keeps its last sentence.
    """
    inputs = tokenizer(text,
                       max_length=1024,
                       truncation=True,
                       return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    summary = tokenizer.batch_decode(summary_ids,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
    return [piece for piece in summary[0].split('.') if piece.strip()]


def _first_sentence(fragment):
    """Return the first sentence nltk finds in *fragment* (the subtitle text)."""
    from nltk import tokenize
    found = tokenize.sent_tokenize(fragment)
    # Fall back to the raw fragment rather than indexing an empty list.
    return found[0] if found else fragment.strip()


def _generate_images(prompts):
    """Generate one image per prompt with min-dalle and return them as a list."""
    # Build the generator once for all prompts instead of once per sentence.
    # NOTE(review): float16 mirrors the original call (fp16 was always
    # truthy); confirm it is what you want on CPU-only hosts.
    dalle = MinDalle(
        is_mega=True,
        models_root='pretrained',
        is_reusable=False,
        is_verbose=True,
        dtype=torch.float16,
    )
    # Positional arguments after the prompt are seed=1 and grid_size=1.
    return [
        dalle.generate_image(prompt, 1, 1, top_k=256, is_verbose=True)
        for prompt in prompts
    ]


def _load_subtitle_font(fontsize=13):
    """Load the subtitle font, falling back to Pillow's built-in font."""
    try:
        return ImageFont.truetype(_FONT_PATH, fontsize)
    except OSError:
        # The Liberation font is not installed on this host.
        return ImageFont.load_default()


def _draw_multiple_line_text(image, text, font, text_color, text_start_height):
    """Draw *text* on *image*, wrapped at 40 characters and centred per line."""
    draw = ImageDraw.Draw(image)
    image_width, _ = image.size
    y_text = text_start_height
    for line in textwrap.wrap(text, width=40):
        if hasattr(font, "getbbox"):
            # font.getsize() is gone from current Pillow; the right/bottom
            # edges of the bounding box are the values it used to return.
            _, _, line_width, line_height = font.getbbox(line)
        else:
            line_width, line_height = font.getsize(line)
        draw.text(((image_width - line_width) / 2, y_text),
                  line, font=font, fill=text_color)
        y_text += line_height


def _subtitle_images(images, subtitles):
    """Return copies of *images* with the matching subtitle drawn on each."""
    font = _load_subtitle_font()
    text_color = (255, 255, 0)
    text_start_height = 200
    captioned = []
    for picture, caption in zip(images, subtitles):
        canvas = picture.copy()  # leave the generated image untouched
        _draw_multiple_line_text(canvas, caption, font, text_color,
                                 text_start_height)
        captioned.append(canvas)
    return captioned


def _synthesize_speech(lines):
    """Write one gTTS mp3 per line; return (file names, durations in seconds)."""
    mp3_names = []
    mp3_lengths = []
    for index, line in enumerate(lines):
        print(line)
        f_name = 'audio_' + str(index) + '.mp3'
        # slow=False asks gTTS for normal-speed speech.
        gTTS(text=line, lang='en', slow=False).save(f_name)
        duration = MP3(f_name).info.length
        print(duration)
        mp3_names.append(f_name)
        mp3_lengths.append(duration)
    return mp3_names, mp3_lengths


def _merge_audio(mp3_names, export_path):
    """Concatenate the mp3 files, each followed by a pause, into *export_path*."""
    silence = AudioSegment.silent(duration=_PAUSE_MS)
    full_audio = AudioSegment.empty()
    for n, mp3_file in enumerate(mp3_names):
        print(n, mp3_file)
        full_audio += AudioSegment.from_mp3(mp3_file) + silence
        print('Merging ', n)
    full_audio.export(export_path, format='mp3')
    print('\ndone!')


def _render_silent_video(frames, durations, movie_name):
    """Save the frames as jpg files and join them into a silent video."""
    file_names = []
    for index, frame in enumerate(frames):
        f_name = 'img_' + str(index) + '.jpg'
        frame.save(f_name)
        file_names.append(f_name)
    print(file_names)
    clips = []
    for index, (f_name, duration) in enumerate(zip(file_names, durations)):
        print(index, duration)
        # Each frame lasts as long as its narration plus the trailing pause.
        clips.append(ImageClip(f_name).set_duration(duration + _PAUSE_MS / 1000))
    concat_clip = concatenate_videoclips(clips, method="compose")
    concat_clip.write_videofile(movie_name, fps=24)


def _combine_audio(vidname, audname, outname, fps=60):
    """Attach the audio file to the video file and write the result to *outname*."""
    import moviepy.editor as mpe
    my_clip = mpe.VideoFileClip(vidname)
    try:
        audio_background = mpe.AudioFileClip(audname)
        try:
            final_clip = my_clip.set_audio(audio_background)
            final_clip.write_videofile(outname, fps=fps)
        finally:
            audio_background.close()
    finally:
        # Release the reader held on the input file.
        my_clip.close()


def get_output_video(text):
    """Turn a story into a narrated, subtitled slideshow video.

    The story is summarized, one image is generated per summary sentence,
    the sentence is drawn on its image and spoken with gTTS, and the frames
    and narration are merged into a single mp4.

    Intermediate files (``audio_N.mp3``, ``img_N.jpg``, ``result.mp3``,
    ``result_new.mp4``) are written to the current working directory.

    Args:
        text: The story to convert.

    Returns:
        The path of the final video, ``'result_final.mp4'``.

    Raises:
        ValueError: If the summary yields no sentence to illustrate.
    """
    sentences = _summary_sentences(text)
    if not sentences:
        raise ValueError("The summary contains no sentences to illustrate.")

    generated_images = _generate_images(sentences)
    subtitles = [_first_sentence(sentence) for sentence in sentences]
    generated_images_sub = _subtitle_images(generated_images, subtitles)

    mp3_names, mp3_lengths = _synthesize_speech(subtitles)

    movie_name = 'result_new.mp4'
    export_path = 'result.mp3'
    movie_final = 'result_final.mp4'

    _merge_audio(mp3_names, export_path)
    _render_silent_video(generated_images_sub, mp3_lengths, movie_name)
    _combine_audio(movie_name, export_path, movie_final)
    return movie_final
217
# Sample story pre-filled in the input box.
text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'

# Gradio UI: story input and button on the left, generated video on the right.
# NOTE(review): nesting below is reconstructed from the layout comments;
# confirm against the original file.
with gr.Blocks() as demo:
    gr.Markdown("# Video Generator from long stories with Artificial Intelligence")
    gr.Markdown("A story can be input by user. The story is summarized using DistillBART model. Then, then it is generated the images by using Dalle-mini and created the subtitles and audio gtts. These are generated as a video.")
    with gr.Row():
        # Left column (inputs)
        with gr.Column():
            input_start_text = gr.Textbox(
                value=text,
                label="Type your story here, for now a sample story is added already!",
            )
            with gr.Row():
                button_gen_video = gr.Button("Generate Video")
        # Right column (outputs)
        with gr.Column():
            output_interpolation = gr.Video(label="Generated Video")
    gr.Markdown("<h3>Future Works </h3>")
    gr.Markdown("This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
    # Clicking the button runs the whole pipeline on the textbox content.
    button_gen_video.click(
        fn=get_output_video,
        inputs=input_start_text,
        outputs=output_interpolation,
    )

demo.launch(debug=False)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ min-dalle
3
+ transformers
4
+ torch
5
+ requests
6
+ moviepy
7
+ huggingface_hub
8
+ opencv-python
9
+ imageio-ffmpeg
10
+ imageio==2.4.1
11
+ imagemagick
12
+ gTTS
13
+ mutagen