capradeepgujaran committed on
Commit a2433fb
1 Parent(s): ca72d14

Update app.py

Files changed (1): app.py (+106 -87)
app.py CHANGED
@@ -11,9 +11,8 @@ import os
 import shutil
 from tqdm.auto import tqdm
 from pathlib import Path
-from typing import List, Dict, Tuple
-import time
-from huggingface_hub import snapshot_download
+from typing import List, Dict, Tuple, Optional
+import gc
 import warnings
 warnings.filterwarnings("ignore")
 
@@ -22,13 +21,15 @@ os.environ["TRANSFORMERS_CACHE"] = "./model_cache"
 os.environ["HF_HOME"] = "./model_cache"
 os.makedirs("./model_cache", exist_ok=True)
 
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 class VideoProcessor:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"Using device: {self.device}")
+        logging.info(f"Using device: {self.device}")
 
         # Load models with optimizations
-        self.load_models()
+        self._load_models()
 
         # Processing settings
         self.frame_interval = 30  # Process 1 frame every 30 frames
@@ -36,48 +37,57 @@ class VideoProcessor:
         self.target_size = (224, 224)
         self.batch_size = 4 if torch.cuda.is_available() else 2
 
-    def load_models(self):
+    def _load_models(self):
         """Load models with optimizations and proper configurations"""
-        print("Loading CLIP model...")
-        self.clip_model = CLIPModel.from_pretrained(
-            "openai/clip-vit-base-patch32",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            cache_dir="./model_cache"
-        ).to(self.device)
-        self.clip_processor = CLIPProcessor.from_pretrained(
-            "openai/clip-vit-base-patch32",
-            cache_dir="./model_cache"
-        )
-
-        print("Loading BLIP2 model...")
-        model_name = "Salesforce/blip2-opt-2.7b"
-
-        # Initialize BLIP2 processor without config modifications
-        self.blip_processor = Blip2Processor.from_pretrained(
-            model_name,
-            cache_dir="./model_cache"
-        )
-
-        # Load BLIP2 model with optimizations
-        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            cache_dir="./model_cache",
-            low_cpu_mem_usage=True
-        ).to(self.device)
-
-        # Set models to evaluation mode
-        self.clip_model.eval()
-        self.blip_model.eval()
-        print("Models loaded successfully!")
+        try:
+            logging.info("Loading CLIP model...")
+            self.clip_model = CLIPModel.from_pretrained(
+                "openai/clip-vit-base-patch32",
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                cache_dir="./model_cache"
+            ).to(self.device)
+            self.clip_processor = CLIPProcessor.from_pretrained(
+                "openai/clip-vit-base-patch32",
+                cache_dir="./model_cache"
+            )
+
+            logging.info("Loading BLIP2 model...")
+            model_name = "Salesforce/blip2-opt-2.7b"
+
+            # Initialize BLIP2 with minimal configuration
+            self.blip_processor = Blip2Processor.from_pretrained(
+                model_name,
+                cache_dir="./model_cache"
+            )
+
+            self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto" if torch.cuda.is_available() else None,
+                cache_dir="./model_cache",
+                low_cpu_mem_usage=True
+            ).to(self.device)
+
+            # Set models to evaluation mode
+            self.clip_model.eval()
+            self.blip_model.eval()
+
+            logging.info("Models loaded successfully!")
+        except Exception as e:
+            logging.error(f"Error loading models: {str(e)}")
+            raise
+
+    def _preprocess_frame(self, frame: np.ndarray) -> Image.Image:
+        """Preprocess a single frame"""
+        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        return Image.fromarray(rgb_frame).resize(self.target_size, Image.LANCZOS)
 
     @torch.no_grad()
-    def process_frame_batch(self, frames):
+    def process_frame_batch(self, frames: List[np.ndarray]) -> Tuple[Optional[np.ndarray], Optional[List[str]]]:
         """Process a batch of frames efficiently"""
         try:
             # Convert frames to PIL Images
-            pil_frames = [Image.fromarray(cv2.cvtColor(f, cv2.COLOR_BGR2RGB)).resize(self.target_size) for f in frames]
+            pil_frames = [self._preprocess_frame(f) for f in frames]
 
             # Get CLIP features
             clip_inputs = self.clip_processor(
@@ -90,7 +100,7 @@ class VideoProcessor:
                 clip_inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in clip_inputs.items()}
             features = self.clip_model.get_image_features(**clip_inputs)
 
-            # Get BLIP captions with updated processing
+            # Get BLIP captions
             blip_inputs = self.blip_processor(
                 images=pil_frames,
                 return_tensors="pt",
@@ -100,7 +110,7 @@ class VideoProcessor:
             if self.device.type == "cuda":
                 blip_inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in blip_inputs.items()}
 
-            # Generate captions with better parameters
+            # Generate captions
            captions = self.blip_model.generate(
                 **blip_inputs,
                 max_length=30,
@@ -113,38 +123,44 @@ class VideoProcessor:
 
             captions = [self.blip_processor.decode(c, skip_special_tokens=True) for c in captions]
 
+            # Clear GPU memory if needed
+            if self.device.type == "cuda":
+                torch.cuda.empty_cache()
+
             return features.cpu().numpy(), captions
+
         except Exception as e:
-            print(f"Error in batch processing: {str(e)}")
+            logging.error(f"Error in batch processing: {str(e)}")
             return None, None
 
-    def process_video(self, video_path: str, progress=gr.Progress()):
+    def process_video(self, video_path: str, progress: gr.Progress) -> Tuple[Optional[faiss.Index], Optional[List[Dict]], str]:
         """Process video with batching and progress updates"""
-        cap = cv2.VideoCapture(video_path)
-        if not cap.isOpened():
-            raise ValueError("Could not open video file")
-
-        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        fps = cap.get(cv2.CAP_PROP_FPS)
-
-        # Calculate frames to process
-        frames_to_process = min(self.max_frames, total_frames // self.frame_interval)
-        progress(0, desc="Initializing video processing...")
-
-        features_list = []
-        frame_data = []
-        current_batch = []
-        batch_positions = []
-
+        cap = None
         try:
+            cap = cv2.VideoCapture(video_path)
+            if not cap.isOpened():
+                raise ValueError(f"Could not open video file: {video_path}")
+
+            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            fps = cap.get(cv2.CAP_PROP_FPS)
+
+            # Calculate frames to process
+            frames_to_process = min(self.max_frames, total_frames // self.frame_interval)
+            progress(0, desc="Initializing video processing...")
+
+            features_list = []
+            frame_data = []
+            current_batch = []
+            batch_positions = []
+
             frame_count = 0
             processed_count = 0
 
-            while cap.isOpened() and processed_count < frames_to_process:
+            while processed_count < frames_to_process:
                 ret, frame = cap.read()
                 if not ret:
                     break
-
+
                 if frame_count % self.frame_interval == 0:
                     current_batch.append(frame)
                     batch_positions.append(frame_count)
@@ -157,12 +173,12 @@ class VideoProcessor:
                     features, captions = self.process_frame_batch(current_batch)
 
                     if features is not None and captions is not None:
-                        for i, (feat, cap) in enumerate(zip(features, captions)):
+                        for i, (feat, cap_text) in enumerate(zip(features, captions)):
                             features_list.append(feat)
                             frame_data.append({
                                 'frame_number': batch_positions[i],
                                 'timestamp': batch_positions[i] / fps,
-                                'caption': cap
+                                'caption': cap_text
                             })
 
                     processed_count += len(current_batch)
@@ -171,21 +187,25 @@ class VideoProcessor:
 
                 frame_count += 1
 
-            cap.release()
-
             # Create FAISS index
             if features_list:
                 features_array = np.vstack(features_list)
                 frame_index = faiss.IndexFlatL2(features_array.shape[1])
                 frame_index.add(features_array)
-
                 return frame_index, frame_data, "Video processed successfully!"
             else:
                 return None, None, "No frames were processed successfully."
-
+
         except Exception as e:
-            cap.release()
-            raise e
+            logging.error(f"Error processing video: {str(e)}")
+            return None, None, f"Error processing video: {str(e)}"
+
+        finally:
+            if cap is not None:
+                cap.release()
+            gc.collect()
+            if self.device.type == "cuda":
+                torch.cuda.empty_cache()
 
 class VideoQAInterface:
     def __init__(self):
@@ -195,28 +215,28 @@ class VideoQAInterface:
         self.processed = False
         self.current_video_path = None
         self.temp_dir = tempfile.mkdtemp()
-        print(f"Initialized temp directory: {self.temp_dir}")
+        logging.info(f"Initialized temp directory: {self.temp_dir}")
 
     def __del__(self):
         """Cleanup temporary files"""
-        if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
-            try:
+        try:
+            if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
                 shutil.rmtree(self.temp_dir)
-                print(f"Cleaned up temp directory: {self.temp_dir}")
-            except Exception as e:
-                print(f"Error cleaning up temp directory: {str(e)}")
+                logging.info(f"Cleaned up temp directory: {self.temp_dir}")
+        except Exception as e:
+            logging.error(f"Error cleaning up temp directory: {str(e)}")
 
     def process_video(self, video_file, progress=gr.Progress()):
         """Process video with progress tracking"""
-        try:
-            if video_file is None:
-                return "Please upload a video first."
+        if video_file is None:
+            return "Please upload a video first."
 
+        try:
             # Save uploaded video to temp directory
             temp_video_path = os.path.join(self.temp_dir, "input_video.mp4")
             shutil.copy2(video_file.name, temp_video_path)
             self.current_video_path = temp_video_path
-            print(f"Saved video to: {self.current_video_path}")
+            logging.info(f"Saved video to: {self.current_video_path}")
 
             progress(0, desc="Starting video processing...")
             self.frame_index, self.frame_data, message = self.processor.process_video(
@@ -232,6 +252,7 @@ class VideoQAInterface:
 
         except Exception as e:
             self.processed = False
+            logging.error(f"Error processing video: {str(e)}")
             return f"Error processing video: {str(e)}"
 
     @torch.no_grad()
@@ -259,7 +280,6 @@ class VideoQAInterface:
             descriptions = []
             frames = []
 
-            # Use cv2.VideoCapture to read frames
             cap = cv2.VideoCapture(self.current_video_path)
             try:
                 for result in results:
@@ -288,6 +308,7 @@ class VideoQAInterface:
                 return frames, combined_desc
 
         except Exception as e:
+            logging.error(f"Error answering question: {str(e)}")
            return None, f"Error answering question: {str(e)}"
 
    def create_interface(self):
@@ -341,14 +362,12 @@ class VideoQAInterface:
        return interface
 
 # Create and launch the app
-
 app = VideoQAInterface()
 interface = app.create_interface()
 
 if __name__ == "__main__":
     interface.launch(
-        server_name="0.0.0.0",  # Allow external connections
-        share=False,  # Set to True for public URL
-        show_error=True,  # Show detailed error messages
-        quiet=False  # Show server logs
+        server_name="0.0.0.0",
+        share=False,
+        show_error=True
    )
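
For reference, the retrieval pattern this file builds on: `process_video` stores CLIP image features in a `faiss.IndexFlatL2`, so a question is answered by embedding the query text with the same CLIP model and running a nearest-neighbor search. The commit does not show the internals of `answer_question`, so the following is a minimal sketch only: `search_frames` is a hypothetical helper, and everything beyond the `CLIPModel`/`CLIPProcessor` calls and the `frame_index`/`frame_data` structures visible in the diff is an assumption.

```python
# Illustrative sketch (not part of this commit): querying the IndexFlatL2
# built in process_video() with a CLIP text embedding.
# Assumes the clip_model/clip_processor loaded in _load_models() and the
# frame_index/frame_data returned by process_video(); search_frames itself
# is a hypothetical helper, not code from app.py.
import torch

@torch.no_grad()
def search_frames(question, clip_model, clip_processor, frame_index, frame_data, k=4):
    """Embed the question with CLIP and return metadata for the k nearest frames."""
    inputs = clip_processor(text=[question], return_tensors="pt", padding=True)
    inputs = {key: v.to(clip_model.device) for key, v in inputs.items()}
    text_features = clip_model.get_text_features(**inputs)
    query = text_features.float().cpu().numpy()        # FAISS expects float32
    distances, indices = frame_index.search(query, k)  # L2 search, matching IndexFlatL2
    return [dict(frame_data[i], distance=float(d))
            for i, d in zip(indices[0], distances[0]) if i >= 0]
```

Each returned dict would then carry the `frame_number`, `timestamp`, and `caption` recorded during processing, which is enough to seek back into the video with `cv2.VideoCapture` the way `answer_question` does.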