capradeepgujaran committed
Commit 5f52218 • 1 Parent(s): 8ad7e0c
Update app.py

app.py CHANGED
@@ -4,7 +4,6 @@ from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditi
 import torch
 from PIL import Image
 import faiss
-import pickle
 from typing import List, Dict, Tuple
 import logging
 import gradio as gr
@@ -12,16 +11,18 @@ import tempfile
 import os
 import shutil
 from tqdm import tqdm
-import torch.nn as nn
 import math
-from concurrent.futures import ThreadPoolExecutor
-import numpy as np
 
 class VideoRAGTool:
     def __init__(self, clip_model_name: str = "openai/clip-vit-base-patch32",
                  blip_model_name: str = "Salesforce/blip-image-captioning-base"):
         """Initialize with performance optimizations."""
+        # Setup logger first to avoid the attribute error
+        self.logger = self.setup_logger()
+
+        self.logger.info("Initializing VideoRAGTool...")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.logger.info(f"Using device: {self.device}")
 
         # Initialize models with optimization flags
         self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(self.device)
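The new "# Setup logger first to avoid the attribute error" comment refers to an ordering bug: anything in __init__ that logs through self.logger before the attribute is assigned raises AttributeError. A minimal sketch of the pattern, with a hypothetical class that is not part of app.py:

import logging

class Loader:
    def __init__(self) -> None:
        # Create the logger before any step that logs through it;
        # if self._load() ran first, self.logger would not exist yet
        # and Python would raise AttributeError.
        self.logger = logging.getLogger("Loader")
        self._load()

    def _load(self) -> None:
        self.logger.info("loading models...")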
@@ -34,18 +35,37 @@
         self.blip_model.eval()
 
         # Batch processing settings
-        self.batch_size =
+        self.batch_size = 4  # Reduced batch size for better memory management
 
         self.frame_index = None
         self.frame_data = []
-        self.logger = self._setup_logger()
 
-
+    def setup_logger(self) -> logging.Logger:
+        """Set up logging configuration."""
+        logger = logging.getLogger('VideoRAGTool')
+
+        # Clear any existing handlers
+        if logger.handlers:
+            logger.handlers.clear()
+
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        return logger
+
+    @torch.no_grad()
     def generate_caption(self, image: Image.Image) -> str:
         """Optimized caption generation."""
-
-
-
+        try:
+            inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
+            out = self.blip_model.generate(**inputs, max_length=30, num_beams=2)
+            caption = self.blip_processor.decode(out[0], skip_special_tokens=True)
+            return caption
+        except Exception as e:
+            self.logger.error(f"Error generating caption: {str(e)}")
+            return "Caption generation failed"
 
     def get_video_info(self, video_path: str) -> Tuple[int, float]:
         """Get video frame count and FPS."""
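For reference, the captioning path added above (BLIP processor, generate, decode, with max_length=30 and num_beams=2) can be exercised on its own. This is a sketch assuming the same Salesforce/blip-image-captioning-base checkpoint and a hypothetical local image frame.jpg:

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device).eval()

with torch.no_grad():
    image = Image.open("frame.jpg").convert("RGB")  # hypothetical test frame
    inputs = processor(image, return_tensors="pt").to(device)
    out = model.generate(**inputs, max_length=30, num_beams=2)  # same settings as the diff
    print(processor.decode(out[0], skip_special_tokens=True))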
@@ -64,110 +84,119 @@
     @torch.no_grad()
     def process_batch(self, frames: List[Image.Image]) -> Tuple[np.ndarray, List[str]]:
         """Process a batch of frames efficiently."""
-[… 14 removed lines not recoverable from this view …]
+        try:
+            # CLIP processing
+            clip_inputs = self.clip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
+            image_features = self.clip_model.get_image_features(**clip_inputs)
+
+            # BLIP processing
+            captions = []
+            for frame in frames:
+                caption = self.generate_caption(frame)
+                captions.append(caption)
+
+            return image_features.cpu().numpy(), captions
+        except Exception as e:
+            self.logger.error(f"Error processing batch: {str(e)}")
+            raise
 
     def process_video(self, video_path: str, frame_interval: int = 30) -> None:
         """Optimized video processing with batching and progress tracking."""
         self.logger.info(f"Processing video: {video_path}")
 
-[… 17 removed lines not recoverable from this view …]
-            if frame_count % frame_interval == 0:
-                # Preprocess frame
-                processed_frame = self.preprocess_frame(frame)
-                current_batch.append(processed_frame)
+        try:
+            total_frames, fps = self.get_video_info(video_path)
+            cap = cv2.VideoCapture(video_path)
+
+            # Calculate total batches for progress bar
+            frames_to_process = total_frames // frame_interval
+            total_batches = math.ceil(frames_to_process / self.batch_size)
+
+            current_batch = []
+            features_list = []
+            frame_count = 0
+
+            with tqdm(total=frames_to_process, desc="Processing frames") as pbar:
+                while cap.isOpened():
+                    ret, frame = cap.read()
+                    if not ret:
+                        break
 
-[… 4 removed lines not recoverable from this view …]
-                        # Store results
-                        for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
-                            batch_frame_number = frame_count - (self.batch_size - i - 1) * frame_interval
-                            self.frame_data.append({
-                                'frame_number': batch_frame_number,
-                                'timestamp': batch_frame_number / fps,
-                                'caption': caption
-                            })
-                            features_list.append(features)
+                    if frame_count % frame_interval == 0:
+                        # Preprocess frame
+                        processed_frame = self.preprocess_frame(frame)
+                        current_batch.append(processed_frame)
 
-
-
+                        # Process batch when it reaches batch_size
+                        if len(current_batch) == self.batch_size:
+                            batch_features, batch_captions = self.process_batch(current_batch)
+
+                            # Store results
+                            for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
+                                batch_frame_number = frame_count - (self.batch_size - i - 1) * frame_interval
+                                self.frame_data.append({
+                                    'frame_number': batch_frame_number,
+                                    'timestamp': batch_frame_number / fps,
+                                    'caption': caption
+                                })
+                                features_list.append(features)
+
+                            current_batch = []
+                            pbar.update(self.batch_size)
+
+                    frame_count += 1
 
-
+            # Process remaining frames
+            if current_batch:
+                batch_features, batch_captions = self.process_batch(current_batch)
+                for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
+                    batch_frame_number = frame_count - (len(current_batch) - i - 1) * frame_interval
+                    self.frame_data.append({
+                        'frame_number': batch_frame_number,
+                        'timestamp': batch_frame_number / fps,
+                        'caption': caption
+                    })
+                    features_list.append(features)
 
-[… 15 removed lines not recoverable from this view …]
-            raise ValueError("No frames were processed from the video")
-
-        # Create FAISS index
-        features_array = np.vstack(features_list)
-        self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
-        self.frame_index.add(features_array)
-
-        self.logger.info(f"Processed {len(self.frame_data)} frames from video")
-
+            cap.release()
+
+            if not features_list:
+                raise ValueError("No frames were processed from the video")
+
+            # Create FAISS index
+            features_array = np.vstack(features_list)
+            self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
+            self.frame_index.add(features_array)
+
+            self.logger.info(f"Processed {len(self.frame_data)} frames from video")
+
+        except Exception as e:
+            self.logger.error(f"Error processing video: {str(e)}")
+            raise
 
     def query_video(self, query_text: str, k: int = 5) -> List[Dict]:
         """Query the video using natural language and return relevant frames."""
         self.logger.info(f"Processing query: {query_text}")
 
-[… 8 removed lines not recoverable from this view …]
-        results = []
-        for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
-            frame_info = self.frame_data[idx].copy()
-            frame_info['relevance_score'] = float(1 / (1 + distance))
-            results.append(frame_info)
+        try:
+            inputs = self.clip_processor(text=[query_text], return_tensors="pt").to(self.device)
+            text_features = self.clip_model.get_text_features(**inputs)
+
+            distances, indices = self.frame_index.search(
+                text_features.cpu().detach().numpy(),
+                k
+            )
 
-
+            results = []
+            for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
+                frame_info = self.frame_data[idx].copy()
+                frame_info['relevance_score'] = float(1 / (1 + distance))
+                results.append(frame_info)
+
+            return results
+        except Exception as e:
+            self.logger.error(f"Error querying video: {str(e)}")
+            raise
 
 class VideoRAGApp:
     def __init__(self):
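Taken together, the rewritten methods are used roughly as below. This is a hypothetical sketch, not code from app.py: the video path and query are placeholders, and it assumes cv2, numpy (still referenced as np.ndarray and np.vstack even though this commit drops import numpy as np), and the preprocess_frame helper are provided by the parts of app.py not shown in this diff.

# Build the index once, then query it with free-form text.
tool = VideoRAGTool()
tool.process_video("sample.mp4", frame_interval=30)  # captions every 30th frame and builds the FAISS index
results = tool.query_video("a person riding a bicycle", k=5)

for hit in results:
    # relevance_score = 1 / (1 + L2 distance), so smaller distances map to scores closer to 1
    print(f"{hit['timestamp']:.1f}s  score={hit['relevance_score']:.3f}  {hit['caption']}")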