KarthickAdopleAI committed
Commit 28482bd
1 Parent(s): 4bfb1ad

Update app.py

Files changed (1)
  app.py  +110 -30
app.py CHANGED
@@ -14,6 +14,8 @@ from pytube import YouTube
 import requests
 import logging
 import os
+from pydub import AudioSegment, silence
+import speech_recognition as sr
 nltk.download('punkt')
 nltk.download('stopwords')
 
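The two imports added here pull in pydub and the SpeechRecognition package. As a quick aside (not part of the commit): both install via pip as pydub and SpeechRecognition, and pydub additionally needs an ffmpeg binary on the PATH to decode MP3 files. A minimal sanity check, assuming those packages are installed:

import speech_recognition as sr
from pydub import AudioSegment

print(sr.__version__)           # SpeechRecognition version
print(AudioSegment.converter)   # ffmpeg/avconv binary pydub will shell out to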
 
@@ -27,7 +29,6 @@ class VideoAnalytics:
     def __init__(self):
         """
         Initialize the VideoAnalytics object.
-
         Args:
             hf_token (str): Hugging Face API token.
         """
@@ -39,16 +40,7 @@ class VideoAnalytics:
         # Initialize transcribed text variable
         self.transcribed_text = ""
 
-        # API URL for accessing the Hugging Face model
-        self.API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
-
-
-        hf_token = os.getenv('HF_TOKEN')
-        # Placeholder for Hugging Face API token
-        self.hf_token = hf_token  # Replace this with the actual Hugging Face API token
-
-        # Set headers for API requests with Hugging Face token
-        self.headers = {"Authorization": f"Bearer {self.hf_token}"}
+        self.r = sr.Recognizer()
 
         # Initialize english text variable
         self.english_text = ""
@@ -61,13 +53,86 @@ class VideoAnalytics:
         # Configure logging settings
         logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
+    def mp3_to_wav(self, mp3_file: str, wav_file: str) -> str:
+        """
+        Convert an MP3 audio file to WAV format.
+
+        Args:
+            mp3_file (str): The path to the input MP3 file.
+            wav_file (str): The path to save the output WAV file.
+
+        Returns:
+            str: The filename of the converted WAV file.
+
+        Raises:
+            Exception: If there's an error during the conversion process.
+        """
+        try:
+            # Load the MP3 file
+            audio = AudioSegment.from_mp3(mp3_file)
+
+            # Export the audio to WAV format
+            audio.export(wav_file, format="wav")
+
+            logging.info(f"MP3 file '{mp3_file}' converted to WAV successfully: {wav_file}")
+
+            return wav_file
+        except Exception as e:
+            # Log the exception and raise it further
+            logging.error(f"Error occurred while converting MP3 to WAV: {e}")
+            raise e
+
+    # Function to recognize speech in the audio file
+    def transcribe_audio(self, path):
+        """Transcribe speech from an audio file."""
+        try:
+            with sr.AudioFile(path) as source:
+                audio_listened = self.r.record(source)
+                text = self.r.recognize_google(audio_listened)
+            return text
+        except sr.UnknownValueError as e:
+            logging.error(f"Speech recognition could not understand audio: {e}")
+            return ""
+        except sr.RequestError as e:
+            logging.error(f"Could not request results from Google Speech Recognition service: {e}")
+            return ""
+
+    # Function to split the audio file into chunks on silence and apply speech recognition
+    def get_large_audio_transcription_on_silence(self, path):
+        """Split the large audio file into chunks and apply speech recognition on each chunk."""
+        try:
+            sound = AudioSegment.from_file(path)
+            chunks = silence.split_on_silence(sound, min_silence_len=500, silence_thresh=sound.dBFS-14, keep_silence=500)
+            folder_name = "audio-chunks"
+
+            if not os.path.isdir(folder_name):
+                os.mkdir(folder_name)
+
+            whole_text = ""
+
+            for i, audio_chunk in enumerate(chunks, start=1):
+                chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
+                audio_chunk.export(chunk_filename, format="wav")
+
+                text = self.transcribe_audio(chunk_filename)
+
+                if text:
+                    text = f"{text.capitalize()}. "
+                    logging.info(f"Transcribed {chunk_filename}: {text}")
+                    whole_text += text
+                else:
+                    logging.warning(f"No speech recognized in {chunk_filename}")
+
+            return whole_text
+        except Exception as e:
+            logging.error(f"Error processing audio: {e}")
+            return ""
+
     def transcribe_video(self, vid: str) -> str:
         """
         Transcribe the audio of the video.
-
         Args:
             vid (str): Path to the video file.
-
         Returns:
             str: Transcribed text.
         """
@@ -78,28 +143,23 @@ class VideoAnalytics:
 
             # Write audio to a temporary file
             audio.write_audiofile("output_audio.mp3")
-            audio_file = open("output_audio.mp3", "rb")
 
-            # Define a helper function to query the Hugging Face model
-            def query(data):
-                response = requests.post(self.API_URL, headers=self.headers, data=data)
-                return response.json()
 
-            # Send audio data to the Hugging Face model for transcription
-            output = query(audio_file)
+            # Convert the temporary MP3 output to WAV for speech recognition
+            audio_filename = self.mp3_to_wav("output_audio.mp3", 'output.wav')
 
-            print(output)
+            text = self.get_large_audio_transcription_on_silence(audio_filename)
             # Update the transcribed_text attribute with the transcription result
-            self.transcribed_text = output["text"]
+            self.transcribed_text = text
             # Update the translation text into english_text
             self.english_text = self.translation()
             # Return the transcribed text
-            return output["text"]
+            return text
 
         except Exception as e:
             logging.error(f"Error transcribing video: {e}")
             return ""
-
+
     def generate_video_summary(self) -> str:
         """
         Generate a summary of the transcribed video.
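Taken together, transcribe_video now goes moviepy audio dump, then MP3-to-WAV conversion, then chunked Google transcription, instead of posting the MP3 to the Whisper inference API. A rough usage sketch, assuming a VideoAnalytics instance can be constructed as in app.py (the sample.mp4 path is made up):

from moviepy.editor import VideoFileClip

analytics = VideoAnalytics()

clip = VideoFileClip("sample.mp4")              # hypothetical local video
clip.audio.write_audiofile("output_audio.mp3")  # same temporary file the method uses
clip.close()

wav_path = analytics.mp3_to_wav("output_audio.mp3", "output.wav")
text = analytics.get_large_audio_transcription_on_silence(wav_path)
print(text)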
@@ -365,10 +425,30 @@ class VideoAnalytics:
             # Log any errors that occur during initialization of YouTube object
             logging.error(f"Error downloading video: {e}")
             return ""
-    def save_audio_with_gtts(self,text, filename):
-        tts = gTTS(text=text, lang='en')
-        tts.save(filename)
-        return filename
+
+    def save_audio_with_gtts(self, text: str, filename: str) -> str:
+        """
+        Generate an audio file from the given text using gTTS and save it.
+
+        Args:
+            text (str): The text to be converted into speech.
+            filename (str): The filename (including path) to save the audio file.
+
+        Returns:
+            str: The filename of the saved audio file.
+
+        Raises:
+            Exception: If there's an error during the conversion or saving process.
+        """
+        try:
+            tts = gTTS(text=text, lang='en')
+            tts.save(filename)
+            logging.info(f"Audio file saved successfully: {filename}")
+            return filename
+        except Exception as e:
+            # Log the exception and raise it further
+            logging.error(f"Error occurred while saving audio: {e}")
+            raise e
 
     def main(self, video: str = None, input_path: str = None) -> tuple:
         """
@@ -386,7 +466,7 @@ class VideoAnalytics:
             video_ = VideoFileClip(input_path)
             duration = video_.duration
             video_.close()
-            if round(duration) <= 600:
+            if round(duration) <= 6*600:
                 text = self.transcribe_video(input_path)
             else:
                 return "Video Duration Above 10 Minutes,Try Below 10 Minutes Video","","",None,None,None
@@ -394,7 +474,7 @@ class VideoAnalytics:
             video_ = VideoFileClip(video)
             duration = video_.duration
             video_.close()
-            if round(duration) <= 600:
+            if round(duration) <= 6*600:
                 text = self.transcribe_video(video)
                 input_path = video
             else:
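Both duration gates now compare against 6*600 = 3600 seconds, so clips up to 60 minutes pass the check. A minimal sketch of that gate as a standalone helper (hypothetical function, not in app.py):

from moviepy.editor import VideoFileClip

MAX_DURATION_SECONDS = 6 * 600  # 3600 s = 60 minutes

def within_duration_limit(path: str) -> bool:
    clip = VideoFileClip(path)
    duration = clip.duration
    clip.close()
    return round(duration) <= MAX_DURATION_SECONDS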
 