capradeepgujaran committed
Commit 17e6c9d
1 Parent(s): 17991a3

Update app.py

Files changed (1)
  1. app.py +85 -24
app.py CHANGED
@@ -13,9 +13,19 @@ from tqdm.auto import tqdm
 from pathlib import Path
 from typing import List, Dict, Tuple
 import time
+from huggingface_hub import snapshot_download
+import warnings
+warnings.filterwarnings("ignore")
+
+# Configure model caching and environment
+os.environ["TRANSFORMERS_CACHE"] = "./model_cache"
+os.environ["HF_HOME"] = "./model_cache"
+os.makedirs("./model_cache", exist_ok=True)
+
 class VideoProcessor:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"Using device: {self.device}")
 
         # Load models with optimizations
         self.load_models()
@@ -27,25 +37,42 @@ class VideoProcessor:
         self.batch_size = 4 if torch.cuda.is_available() else 2
 
     def load_models(self):
-        """Load models with optimizations"""
-        # Load CLIP
+        """Load models with optimizations and proper configurations"""
+        print("Loading CLIP model...")
         self.clip_model = CLIPModel.from_pretrained(
             "openai/clip-vit-base-patch32",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            cache_dir="./model_cache"
         ).to(self.device)
-        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        self.clip_processor = CLIPProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32",
+            cache_dir="./model_cache"
+        )
+
+        print("Loading BLIP2 model...")
+        model_name = "Salesforce/blip2-opt-2.7b"
+
+        # Initialize BLIP2 processor with updated configuration
+        self.blip_processor = Blip2Processor.from_pretrained(
+            model_name,
+            cache_dir="./model_cache"
+        )
+        self.blip_processor.config.use_fast_tokenizer = True
+        self.blip_processor.config.processor_class = "Blip2Processor"
 
-        # Load BLIP2 with reduced size
+        # Load BLIP2 model with optimizations
         self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
-            "Salesforce/blip2-opt-2.7b",
+            model_name,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None
+            device_map="auto" if torch.cuda.is_available() else None,
+            cache_dir="./model_cache",
+            low_cpu_mem_usage=True
         ).to(self.device)
-        self.blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
 
         # Set models to evaluation mode
         self.clip_model.eval()
         self.blip_model.eval()
+        print("Models loaded successfully!")
 
     @torch.no_grad()
     def process_frame_batch(self, frames):
@@ -55,16 +82,37 @@ class VideoProcessor:
         pil_frames = [Image.fromarray(cv2.cvtColor(f, cv2.COLOR_BGR2RGB)).resize(self.target_size) for f in frames]
 
         # Get CLIP features
-        clip_inputs = self.clip_processor(images=pil_frames, return_tensors="pt", padding=True).to(self.device)
+        clip_inputs = self.clip_processor(
+            images=pil_frames,
+            return_tensors="pt",
+            padding=True
+        ).to(self.device)
+
         if self.device.type == "cuda":
             clip_inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in clip_inputs.items()}
         features = self.clip_model.get_image_features(**clip_inputs)
 
-        # Get BLIP captions
-        blip_inputs = self.blip_processor(images=pil_frames, return_tensors="pt", padding=True).to(self.device)
+        # Get BLIP captions with updated processing
+        blip_inputs = self.blip_processor(
+            images=pil_frames,
+            return_tensors="pt",
+            padding=True
+        ).to(self.device)
+
         if self.device.type == "cuda":
             blip_inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in blip_inputs.items()}
-        captions = self.blip_model.generate(**blip_inputs, max_length=30)
+
+        # Generate captions with better parameters
+        captions = self.blip_model.generate(
+            **blip_inputs,
+            max_length=30,
+            min_length=10,
+            num_beams=5,
+            length_penalty=1.0,
+            temperature=0.7,
+            do_sample=False
+        )
+
         captions = [self.blip_processor.decode(c, skip_special_tokens=True) for c in captions]
 
         return features.cpu().numpy(), captions
@@ -75,12 +123,15 @@ class VideoProcessor:
     def process_video(self, video_path: str, progress=gr.Progress()):
         """Process video with batching and progress updates"""
         cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            raise ValueError("Could not open video file")
+
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         fps = cap.get(cv2.CAP_PROP_FPS)
 
         # Calculate frames to process
         frames_to_process = min(self.max_frames, total_frames // self.frame_interval)
-        progress(0, desc="Initializing...")
+        progress(0, desc="Initializing video processing...")
 
         features_list = []
         frame_data = []
@@ -102,6 +153,9 @@ class VideoProcessor:
 
                 # Process batch when full
                 if len(current_batch) == self.batch_size or frame_count == total_frames - 1:
+                    progress(processed_count / frames_to_process,
+                             desc=f"Processing frames... {processed_count}/{frames_to_process}")
+
                     features, captions = self.process_frame_batch(current_batch)
 
                     if features is not None and captions is not None:
@@ -116,13 +170,9 @@ class VideoProcessor:
                     processed_count += len(current_batch)
                     current_batch = []
                     batch_positions = []
-
-                    # Update progress
-                    progress(processed_count / frames_to_process,
-                             desc=f"Processing frames... {processed_count}/{frames_to_process}")
 
                 frame_count += 1
-
+
             cap.release()
 
             # Create FAISS index
@@ -137,7 +187,7 @@ class VideoProcessor:
 
         except Exception as e:
             cap.release()
-            return None, None, f"Error processing video: {str(e)}"
+            raise e
 
 class VideoQAInterface:
     def __init__(self):
@@ -145,13 +195,18 @@ class VideoQAInterface:
         self.frame_index = None
         self.frame_data = None
         self.processed = False
-        self.current_video_path = None  # Store the video path
+        self.current_video_path = None
         self.temp_dir = tempfile.mkdtemp()
+        print(f"Initialized temp directory: {self.temp_dir}")
 
     def __del__(self):
         """Cleanup temporary files"""
         if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
-            shutil.rmtree(self.temp_dir, ignore_errors=True)
+            try:
+                shutil.rmtree(self.temp_dir)
+                print(f"Cleaned up temp directory: {self.temp_dir}")
+            except Exception as e:
+                print(f"Error cleaning up temp directory: {str(e)}")
 
     def process_video(self, video_file, progress=gr.Progress()):
         """Process video with progress tracking"""
@@ -163,6 +218,7 @@ class VideoQAInterface:
             temp_video_path = os.path.join(self.temp_dir, "input_video.mp4")
             shutil.copy2(video_file.name, temp_video_path)
             self.current_video_path = temp_video_path
+            print(f"Saved video to: {self.current_video_path}")
 
             progress(0, desc="Starting video processing...")
             self.frame_index, self.frame_data, message = self.processor.process_video(
@@ -178,7 +234,7 @@ class VideoQAInterface:
 
         except Exception as e:
            self.processed = False
-            return f"Error: {str(e)}"
+            return f"Error processing video: {str(e)}"
 
     @torch.no_grad()
     def answer_question(self, query):
@@ -222,7 +278,7 @@ class VideoQAInterface:
                 desc += f"Relevance Score: {result['relevance']:.2f}"
                 descriptions.append(desc)
         finally:
-            cap.release()  # Ensure video capture is released
+            cap.release()
 
         if not frames:
             return None, "No relevant frames found."
@@ -291,4 +347,9 @@ app = VideoQAInterface()
 interface = app.create_interface()
 
 if __name__ == "__main__":
-    interface.launch()
+    interface.launch(
+        server_name="0.0.0.0",
+        share=False,  # Set to True if you want to create a public link
+        cache_examples=True,
+        max_threads=4
+    )