artificialguybr committed on
Commit
6d490a9
1 Parent(s): 23be978

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -14
app.py CHANGED
@@ -7,7 +7,7 @@ import uuid
7
  from googletrans import Translator
8
  from TTS.api import TTS
9
  import ffmpeg
10
- from faster_whisper import WhisperModel
11
  from scipy.signal import wiener
12
  import soundfile as sf
13
  from pydub import AudioSegment
@@ -21,6 +21,7 @@ import torchvision
21
  from tqdm import tqdm
22
  from numba import jit
23
  from huggingface_hub import HfApi
 
24
 
25
  HF_TOKEN = os.environ.get("HF_TOKEN")
26
  os.environ["COQUI_TOS_AGREED"] = "1"
@@ -30,9 +31,16 @@ ZipFile("ffmpeg.zip").extractall()
30
  st = os.stat('ffmpeg')
31
  os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
32
 
33
- #Whisper
34
- model_size = "small"
35
- model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
 
 
 
 
 
 
36
 
37
  def check_for_faces(video_path):
38
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
@@ -51,6 +59,68 @@ def check_for_faces(video_path):
51
 
52
  return False
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  @spaces.GPU
55
  def process_video(radio, video, target_language, has_closeup_face):
56
  try:
@@ -80,20 +150,16 @@ def process_video(radio, video, target_language, has_closeup_face):
80
 
81
  print("Attempting to transcribe with Whisper...")
82
  try:
83
- segments, info = model.transcribe(f"{run_uuid}_output_audio_final.wav", beam_size=5)
84
- whisper_text = " ".join(segment.text for segment in segments)
85
- whisper_language = info.language
86
  print(f"Transcription successful: {whisper_text}")
87
- except RuntimeError as e:
88
- print(f"RuntimeError encountered: {str(e)}")
89
- if "CUDA failed with error device-side assert triggered" in str(e):
90
- gr.Warning("Error. Space need to restart. Please retry in a minute")
91
- api.restart_space(repo_id=repo_id)
92
 
93
  language_mapping = {'English': 'en', 'Spanish': 'es', 'French': 'fr', 'German': 'de', 'Italian': 'it', 'Portuguese': 'pt', 'Polish': 'pl', 'Turkish': 'tr', 'Russian': 'ru', 'Dutch': 'nl', 'Czech': 'cs', 'Arabic': 'ar', 'Chinese (Simplified)': 'zh-cn'}
94
  target_language_code = language_mapping[target_language]
95
  translator = Translator()
96
- translated_text = translator.translate(whisper_text, src=whisper_language, dest=target_language_code).text
97
  print(translated_text)
98
 
99
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
@@ -173,8 +239,8 @@ iface = gr.Interface(
173
  title="AI Video Dubbing",
174
  description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Thanks [@yeswondwer](https://twitter.com/@yeswondwerr) for original code. Test the [Video Transcription and Translate](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) space!""",
175
  allow_flagging=False
176
-
177
  )
 
178
  with gr.Blocks() as demo:
179
  iface.render()
180
  radio.change(swap, inputs=[radio], outputs=video)
@@ -188,5 +254,7 @@ with gr.Blocks() as demo:
188
  - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
189
  - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
190
  """)
 
 
191
  demo.queue()
192
  demo.launch()
 
7
  from googletrans import Translator
8
  from TTS.api import TTS
9
  import ffmpeg
10
+ import json
11
  from scipy.signal import wiener
12
  import soundfile as sf
13
  from pydub import AudioSegment
 
21
  from tqdm import tqdm
22
  from numba import jit
23
  from huggingface_hub import HfApi
24
+ import moviepy.editor as mp
25
 
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
  os.environ["COQUI_TOS_AGREED"] = "1"
 
31
  st = os.stat('ffmpeg')
32
  os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
33
 
34
+ print("Starting the program...")
35
+
36
def generate_unique_filename(extension):
    """Return a fresh random file name that ends with *extension*.

    The name is a newly generated UUID4 rendered as text, immediately
    followed by ``extension`` (which is appended verbatim, so the caller
    supplies the leading dot, e.g. ``".wav"``).
    """
    token = uuid.uuid4()
    return "{}{}".format(token, extension)
38
+
39
def cleanup_files(*files):
    """Delete the given files on a best-effort basis.

    Args:
        *files: Paths to remove. Falsy entries (``None``, ``""``) are
            skipped, so callers can pass optional temp-file variables
            without checking them first.

    A path that does not exist is ignored silently. Any other ``OSError``
    raised by ``os.remove`` (for example the path is a directory, or
    permission is denied) is printed and the loop continues, so a failed
    cleanup never masks the result of the work that preceded it.
    """
    for file in files:
        if not file:
            continue
        try:
            # Try the removal directly instead of checking os.path.exists
            # first: the file can disappear between the check and the call.
            os.remove(file)
        except FileNotFoundError:
            continue
        except OSError as e:
            print(f"Could not remove file {file}: {e}")
            continue
        print(f"Removed file: {file}")
44
 
45
  def check_for_faces(video_path):
46
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
 
59
 
60
  return False
61
 
62
@spaces.GPU(duration=90)
def transcribe_audio(file_path):
    """Transcribe an audio or video file by running ``insanely-fast-whisper``.

    If ``file_path`` ends with a known video extension (``.mp4``, ``.avi``,
    ``.mov``, ``.flv``; compared case-insensitively) the audio track is
    first written to a temporary WAV file, which is then transcribed.
    The CLI writes its result to a temporary JSON file, which is read back
    and reduced to a single string.

    Args:
        file_path: Path to the audio or video file to transcribe.

    Returns:
        The value stored under ``"text"`` in the transcript JSON when that
        key is present; otherwise the ``"text"`` of every entry under
        ``"chunks"`` joined with single spaces.

    Raises:
        ValueError: The video file has no audio track.
        subprocess.CalledProcessError: The CLI exited with a non-zero status.
        json.JSONDecodeError: The transcript file is not valid JSON.
        Exception: Any error raised while extracting audio is logged and
            re-raised unchanged.

    Temporary files created here are removed on both the success and the
    failure path.
    """
    print(f"Starting transcription of file: {file_path}")
    temp_audio = None
    output_file = None
    try:
        if file_path.lower().endswith(('.mp4', '.avi', '.mov', '.flv')):
            print("Video file detected. Extracting audio...")
            try:
                video = mp.VideoFileClip(file_path)
                try:
                    if video.audio is None:
                        raise ValueError(f"No audio track found in video: {file_path}")
                    # Record the name before writing so a partially written
                    # file is still removed by the outer ``finally``.
                    temp_audio = generate_unique_filename(".wav")
                    video.audio.write_audiofile(temp_audio)
                finally:
                    # Release the clip's reader resources even on failure.
                    video.close()
                file_path = temp_audio
            except Exception as e:
                print(f"Error extracting audio from video: {e}")
                raise

        file_exists = os.path.exists(file_path)
        print(f"Does the file exist? {file_exists}")
        print(f"File size: {os.path.getsize(file_path) if file_exists else 'N/A'} bytes")

        output_file = generate_unique_filename(".json")
        command = [
            "insanely-fast-whisper",
            "--file-name", file_path,
            "--device-id", "0",
            "--model-name", "openai/whisper-large-v3",
            "--task", "transcribe",
            "--timestamp", "chunk",
            "--transcript-path", output_file,
        ]
        print(f"Executing command: {' '.join(command)}")
        try:
            completed = subprocess.run(command, check=True, capture_output=True, text=True)
            print(f"Standard output: {completed.stdout}")
            print(f"Error output: {completed.stderr}")
        except subprocess.CalledProcessError as e:
            print(f"Error running insanely-fast-whisper: {e}")
            print(f"Standard output: {e.stdout}")
            print(f"Error output: {e.stderr}")
            raise

        print(f"Reading transcription file: {output_file}")
        # Read the file once so the raw text is available for the error
        # message without opening (and leaking) a second handle.
        with open(output_file, "r", encoding="utf-8") as f:
            raw_transcript = f.read()
        try:
            transcription = json.loads(raw_transcript)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            print(f"File content: {raw_transcript}")
            raise

        if "text" in transcription:
            text = transcription["text"]
        else:
            text = " ".join(chunk["text"] for chunk in transcription.get("chunks", []))

        print("Transcription completed.")
        return text
    finally:
        # Cleanup runs on every exit path; unset (None) entries are skipped
        # by cleanup_files.
        cleanup_files(output_file, temp_audio)
123
+
124
  @spaces.GPU
125
  def process_video(radio, video, target_language, has_closeup_face):
126
  try:
 
150
 
151
  print("Attempting to transcribe with Whisper...")
152
  try:
153
+ whisper_text = transcribe_audio(f"{run_uuid}_output_audio_final.wav")
 
 
154
  print(f"Transcription successful: {whisper_text}")
155
+ except Exception as e:
156
+ print(f"Error encountered during transcription: {str(e)}")
157
+ raise
 
 
158
 
159
  language_mapping = {'English': 'en', 'Spanish': 'es', 'French': 'fr', 'German': 'de', 'Italian': 'it', 'Portuguese': 'pt', 'Polish': 'pl', 'Turkish': 'tr', 'Russian': 'ru', 'Dutch': 'nl', 'Czech': 'cs', 'Arabic': 'ar', 'Chinese (Simplified)': 'zh-cn'}
160
  target_language_code = language_mapping[target_language]
161
  translator = Translator()
162
+ translated_text = translator.translate(whisper_text, dest=target_language_code).text
163
  print(translated_text)
164
 
165
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
 
239
  title="AI Video Dubbing",
240
  description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Thanks [@yeswondwer](https://twitter.com/@yeswondwerr) for original code. Test the [Video Transcription and Translate](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) space!""",
241
  allow_flagging=False
 
242
  )
243
+
244
  with gr.Blocks() as demo:
245
  iface.render()
246
  radio.change(swap, inputs=[radio], outputs=video)
 
254
  - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
255
  - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
256
  """)
257
+
258
+ print("Launching Gradio interface...")
259
  demo.queue()
260
  demo.launch()