capradeepgujaran committed on
Commit 007d795
1 Parent(s): e25cab4

Update app.py

Files changed (1)
  1. app.py +159 -209
app.py CHANGED
@@ -1,11 +1,6 @@
 import cv2
 import numpy as np
-from transformers import (
-    CLIPProcessor, CLIPModel,
-    BlipProcessor, BlipForConditionalGeneration,
-    Blip2Processor, Blip2ForConditionalGeneration,
-    AutoProcessor, AutoModelForObjectDetection
-)
 import torch
 from PIL import Image
 import faiss
@@ -16,43 +11,37 @@ import tempfile
 import os
 import shutil
 from tqdm import tqdm

-class EnhancedVideoAnalyzer:
     def __init__(self):
         self.logger = self.setup_logger()
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.logger.info(f"Using device: {self.device}")

-        # Initialize CLIP for general scene understanding
-        self.logger.info("Loading CLIP model...")
-        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
-        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
-
-        # Initialize BLIP-2 for detailed scene description
-        self.logger.info("Loading BLIP-2 model...")
-        self.blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-        self.blip2_model = Blip2ForConditionalGeneration.from_pretrained(
-            "Salesforce/blip2-opt-2.7b",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
         ).to(self.device)

-        # Initialize Object Detection model
-        self.logger.info("Loading object detection model...")
-        self.obj_processor = AutoProcessor.from_pretrained("microsoft/table-transformer-detection")
-        self.obj_model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection").to(self.device)
-
         self.frame_index = None
         self.frame_data = []
-        self.target_size = (384, 384)  # Increased size for better detail recognition
-        self.batch_size = 4

-        # Set all models to evaluation mode
-        self.clip_model.eval()
-        self.blip2_model.eval()
-        self.obj_model.eval()

     def setup_logger(self) -> logging.Logger:
-        logger = logging.getLogger('EnhancedVideoAnalyzer')
         if logger.handlers:
             logger.handlers.clear()
         logger.setLevel(logging.INFO)
@@ -62,256 +51,218 @@ class EnhancedVideoAnalyzer:
         logger.addHandler(handler)
         return logger

     @torch.no_grad()
     def analyze_frame(self, image: Image.Image) -> Dict:
         """Comprehensive frame analysis"""
         try:
-            # 1. Generate detailed caption using BLIP-2
-            inputs = self.blip2_processor(image, return_tensors="pt").to(self.device, torch.float16)
-            caption = self.blip2_model.generate(**inputs, max_new_tokens=50)
-            caption_text = self.blip2_processor.decode(caption[0], skip_special_tokens=True)
-
-            # 2. Detect objects
-            obj_inputs = self.obj_processor(images=image, return_tensors="pt").to(self.device)
-            obj_outputs = self.obj_model(**obj_inputs)
-
-            # Process object detection results
-            target_sizes = torch.tensor([image.size[::-1]])
-            results = self.obj_processor.post_process_object_detection(
-                obj_outputs, threshold=0.5, target_sizes=target_sizes
-            )[0]
-
-            detected_objects = []
-            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-                detected_objects.append({
-                    "label": self.obj_processor.model.config.id2label[label.item()],
-                    "confidence": score.item()
-                })

             return {
                 "caption": caption_text,
-                "objects": detected_objects
             }
-
         except Exception as e:
-            self.logger.error(f"Error in frame analysis: {str(e)}")
-            return {"caption": "Error analyzing frame", "objects": []}

-    def extract_keyframes(self, video_path: str, max_frames: int = 15) -> List[Tuple[int, np.ndarray]]:
-        """Extract key frames using scene detection"""
         cap = cv2.VideoCapture(video_path)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        fps = cap.get(cv2.CAP_PROP_FPS)
-
-        # Calculate frame interval to get approximately max_frames
-        frame_interval = max(1, total_frames // max_frames)
-
-        frames = []
-        frame_positions = []
-        prev_gray = None
-
-        with tqdm(total=total_frames, desc="Extracting frames") as pbar:
-            while cap.isOpened() and len(frames) < max_frames:
                 ret, frame = cap.read()
                 if not ret:
                     break

-                # Convert to grayscale for scene detection
-                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-
-                if prev_gray is not None:
-                    # Calculate frame difference
-                    diff = cv2.absdiff(gray, prev_gray)
-                    mean_diff = np.mean(diff)

-                    # If significant change or first/last frame
-                    if mean_diff > 30 or len(frames) == 0:
-                        frames.append(frame)
-                        frame_positions.append(cap.get(cv2.CAP_PROP_POS_FRAMES))
-
-                prev_gray = gray
-                pbar.update(1)
-
         cap.release()
-        return list(zip(frame_positions, frames))

-    @torch.no_grad()
-    def process_video(self, video_path: str) -> None:
-        """Process video with comprehensive analysis"""
         self.logger.info(f"Processing video: {video_path}")
-        self.frame_data = []
-        features_list = []

         try:
-            # Extract key frames
-            keyframes = self.extract_keyframes(video_path)
-            self.logger.info(f"Extracted {len(keyframes)} key frames")
-
-            # Process frames with progress bar
-            with tqdm(total=len(keyframes), desc="Analyzing frames") as pbar:
-                for frame_pos, frame in keyframes:
-                    # Convert frame to PIL Image
-                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    image = Image.fromarray(frame_rgb).resize(self.target_size, Image.LANCZOS)
-
-                    # Analyze frame
-                    analysis = self.analyze_frame(image)
-
-                    # Get CLIP features
-                    clip_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
-                    image_features = self.clip_model.get_image_features(**clip_inputs)
-
-                    # Store results
-                    self.frame_data.append({
-                        'frame_number': int(frame_pos),
-                        'timestamp': frame_pos / 30.0,  # Approximate timestamp
-                        'caption': analysis['caption'],
-                        'objects': analysis['objects']
-                    })
-
-                    features_list.append(image_features.cpu().numpy())
-                    pbar.update(1)
-
-            # Create FAISS index
-            if features_list:
-                features_array = np.vstack(features_list)
-                self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
-                self.frame_index.add(features_array)

-            self.logger.info("Video processing completed successfully")

         except Exception as e:
-            self.logger.error(f"Error processing video: {str(e)}")
-            raise

     @torch.no_grad()
-    def query_video(self, query_text: str, k: int = 4) -> List[Dict]:
-        """Enhanced query processing"""
         try:
-            # Process query with CLIP
-            text_inputs = self.clip_processor(text=[query_text], return_tensors="pt").to(self.device)
-            text_features = self.clip_model.get_text_features(**text_inputs)
-
-            # Search for relevant frames
             distances, indices = self.frame_index.search(
-                text_features.cpu().numpy(),
                 k
             )
-
-            # Prepare results with enhanced information
             results = []
             for distance, idx in zip(distances[0], indices[0]):
                 frame_info = self.frame_data[idx].copy()
-
-                # Add relevance score
-                frame_info['relevance_score'] = float(1 / (1 + distance))
-
-                # Add object summary
-                obj_summary = ", ".join(obj["label"] for obj in frame_info['objects'][:3])
-                if obj_summary:
-                    frame_info['object_summary'] = f"Objects detected: {obj_summary}"
-
                 results.append(frame_info)
-
             return results
-
         except Exception as e:
-            self.logger.error(f"Error querying video: {str(e)}")
-            raise

-class VideoQAApp:
     def __init__(self):
-        self.analyzer = EnhancedVideoAnalyzer()
-        self.current_video_path = None
         self.processed = False
-        self.temp_dir = tempfile.mkdtemp()
-
-    def __del__(self):
-        if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
-            shutil.rmtree(self.temp_dir, ignore_errors=True)

     def process_video(self, video_file):
-        """Process video with progress updates"""
         try:
             if video_file is None:
                 return "Please upload a video first.", gr.Progress(0)

-            video_path = video_file.name
-            temp_video_path = os.path.join(self.temp_dir, "current_video.mp4")
-            shutil.copy2(video_path, temp_video_path)
-
-            self.current_video_path = temp_video_path
-            self.analyzer.process_video(self.current_video_path)
-            self.processed = True
-
-            return "Video processed successfully! You can now ask questions about the video.", gr.Progress(100)

         except Exception as e:
             self.processed = False
-            return f"Error processing video: {str(e)}", gr.Progress(0)

-    def query_video(self, query_text):
-        """Query video with comprehensive results"""
         if not self.processed:
             return None, "Please process a video first."
-
         try:
-            results = self.analyzer.query_video(query_text)
             frames = []
             descriptions = []
-
-            cap = cv2.VideoCapture(self.current_video_path)
-
             for result in results:
-                frame_number = result['frame_number']
-                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
-                ret, frame = cap.read()

-                if ret:
-                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    frames.append(Image.fromarray(frame_rgb))
-
-                    description = f"Timestamp: {result['timestamp']:.2f}s\n"
-                    description += f"Scene Description: {result['caption']}\n"
-                    if 'object_summary' in result:
-                        description += f"{result['object_summary']}\n"
-                    description += f"Relevance Score: {result['relevance_score']:.2f}"
-                    descriptions.append(description)
-
-            cap.release()
-
-            combined_description = "\n\nScene Analysis:\n\n"
             for i, desc in enumerate(descriptions, 1):
-                combined_description += f"Frame {i}:\n{desc}\n\n"
-
-            return frames, combined_description
-
         except Exception as e:
-            return None, f"Error querying video: {str(e)}"

     def create_interface(self):
         """Create Gradio interface"""
-        with gr.Blocks(title="Video Question Answering") as interface:
             gr.Markdown("# Advanced Video Question Answering")
             gr.Markdown("Upload a video and ask questions about any aspect of its content!")

             with gr.Row():
                 video_input = gr.File(
-                    label="Upload Video (Recommended: 30 seconds to 5 minutes)",
                     file_types=["video"],
                 )
                 process_button = gr.Button("Process Video")

-            with gr.Row():
-                status_output = gr.Textbox(
-                    label="Status",
-                    interactive=False
-                )
-                progress = gr.Progress()

             with gr.Row():
                 query_input = gr.Textbox(
-                    label="Ask anything about the video",
                     placeholder="What's happening in the video?"
                 )
                 query_button = gr.Button("Search")
@@ -319,7 +270,6 @@ class VideoQAApp:
                 gallery = gr.Gallery(
                     label="Retrieved Frames",
                     show_label=True,
-                    elem_id="gallery",
                     columns=[2],
                     rows=[2],
                     height="auto"
@@ -334,11 +284,11 @@ class VideoQAApp:
             process_button.click(
                 fn=self.process_video,
                 inputs=[video_input],
-                outputs=[status_output, progress]
             )

             query_button.click(
-                fn=self.query_video,
                 inputs=[query_input],
                 outputs=[gallery, descriptions]
             )
@@ -346,7 +296,7 @@ class VideoQAApp:
         return interface

 # Initialize and create the interface
-app = VideoQAApp()
 interface = app.create_interface()

 # Launch the app

 import cv2
 import numpy as np
+from transformers import CLIPProcessor, CLIPModel, Blip2Processor, Blip2ForConditionalGeneration
 import torch
 from PIL import Image
 import faiss

 import os
 import shutil
 from tqdm import tqdm
+from pathlib import Path
+from moviepy.video.io.VideoFileClip import VideoFileClip

+class VideoRAGSystem:
     def __init__(self):
         self.logger = self.setup_logger()
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.logger.info(f"Using device: {self.device}")

+        # Initialize models
+        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
+        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        self.blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
+            "Salesforce/blip2-opt-2.7b",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
         ).to(self.device)

+        # Vector store setup
         self.frame_index = None
         self.frame_data = []
+        self.target_size = (224, 224)

+        # Create directories for storing processed data
+        self.temp_dir = tempfile.mkdtemp()
+        self.frames_dir = os.path.join(self.temp_dir, "frames")
+        os.makedirs(self.frames_dir, exist_ok=True)

     def setup_logger(self) -> logging.Logger:
+        logger = logging.getLogger('VideoRAGSystem')
         if logger.handlers:
             logger.handlers.clear()
         logger.setLevel(logging.INFO)

         logger.addHandler(handler)
         return logger

+    def split_video(self, video_path: str, timestamp_ms: int, context_seconds: int = 3) -> str:
+        """Extract a clip around the specified timestamp"""
+        timestamp_sec = timestamp_ms / 1000
+        output_path = os.path.join(self.temp_dir, "clip.mp4")
+
+        with VideoFileClip(video_path) as video:
+            duration = video.duration
+            start_time = max(timestamp_sec - context_seconds, 0)
+            end_time = min(timestamp_sec + context_seconds, duration)
+            clip = video.subclip(start_time, end_time)
+            clip.write_videofile(output_path, audio_codec='aac')
+
+        return output_path
+
     @torch.no_grad()
     def analyze_frame(self, image: Image.Image) -> Dict:
         """Comprehensive frame analysis"""
         try:
+            # Generate caption
+            inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
+            if self.device.type == "cuda":
+                inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
+            caption = self.blip_model.generate(**inputs, max_length=50)
+            caption_text = self.blip_processor.decode(caption[0], skip_special_tokens=True)
+
+            # Get visual features
+            clip_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
+            if self.device.type == "cuda":
+                clip_inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in clip_inputs.items()}
+            features = self.clip_model.get_image_features(**clip_inputs)

             return {
                 "caption": caption_text,
+                "features": features.cpu().numpy()
             }
         except Exception as e:
+            self.logger.error(f"Frame analysis error: {str(e)}")
+            return None

+    def extract_keyframes(self, video_path: str, max_frames: int = 15) -> List[Dict]:
+        """Extract and analyze key frames"""
         cap = cv2.VideoCapture(video_path)
+        frames_info = []
+        frame_count = 0
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        interval = max(1, total_frames // max_frames)
+
+        with tqdm(total=max_frames, desc="Analyzing frames") as pbar:
+            while len(frames_info) < max_frames and cap.isOpened():
                 ret, frame = cap.read()
                 if not ret:
                     break
+
+                if frame_count % interval == 0:
+                    # Save frame
+                    frame_path = os.path.join(self.frames_dir, f"frame_{frame_count}.jpg")
+                    cv2.imwrite(frame_path, frame)

+                    # Analyze frame
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    image = Image.fromarray(frame_rgb).resize(self.target_size, Image.LANCZOS)
+                    analysis = self.analyze_frame(image)

+                    if analysis is not None:
+                        frames_info.append({
+                            "frame_number": frame_count,
+                            "timestamp": frame_count / cap.get(cv2.CAP_PROP_FPS),
+                            "path": frame_path,
+                            "caption": analysis["caption"],
+                            "features": analysis["features"]
+                        })
+                        pbar.update(1)
+
+                frame_count += 1
+
         cap.release()
+        return frames_info

+    def process_video(self, video_path: str):
+        """Process video and build search index"""
         self.logger.info(f"Processing video: {video_path}")

         try:
+            # Extract and analyze frames
+            frames_info = self.extract_keyframes(video_path)
+            self.frame_data = frames_info
+
+            # Build FAISS index
+            if frames_info:
+                features = np.vstack([frame["features"] for frame in frames_info])
+                self.frame_index = faiss.IndexFlatL2(features.shape[1])
+                self.frame_index.add(features)

+            self.logger.info(f"Processed {len(frames_info)} frames successfully")
+            return True

         except Exception as e:
+            self.logger.error(f"Video processing error: {str(e)}")
+            return False

     @torch.no_grad()
+    def search_frames(self, query: str, k: int = 4) -> List[Dict]:
+        """Search for relevant frames based on the query"""
         try:
+            # Process query
+            inputs = self.clip_processor(text=[query], return_tensors="pt").to(self.device)
+            if self.device.type == "cuda":
+                inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
+            query_features = self.clip_model.get_text_features(**inputs)
+
+            # Search
             distances, indices = self.frame_index.search(
+                query_features.cpu().numpy(),
                 k
             )
+
+            # Prepare results
             results = []
             for distance, idx in zip(distances[0], indices[0]):
                 frame_info = self.frame_data[idx].copy()
+                frame_info["relevance"] = float(1 / (1 + distance))
                 results.append(frame_info)
+
             return results
+
         except Exception as e:
+            self.logger.error(f"Search error: {str(e)}")
+            return []

+class VideoQAInterface:
     def __init__(self):
+        self.rag_system = VideoRAGSystem()
+        self.current_video = None
         self.processed = False

     def process_video(self, video_file):
+        """Handle video upload and processing"""
         try:
             if video_file is None:
                 return "Please upload a video first.", gr.Progress(0)

+            self.current_video = video_file.name
+            success = self.rag_system.process_video(self.current_video)

+            if success:
+                self.processed = True
+                return "Video processed successfully! You can now ask questions.", gr.Progress(100)
+            else:
+                return "Error processing video. Please try again.", gr.Progress(0)
+
         except Exception as e:
             self.processed = False
+            return f"Error: {str(e)}", gr.Progress(0)

+    def answer_question(self, query):
+        """Handle question answering"""
         if not self.processed:
             return None, "Please process a video first."
+
         try:
+            # Search for relevant frames
+            results = self.rag_system.search_frames(query)
+
+            if not results:
+                return None, "No relevant frames found."
+
+            # Prepare output
             frames = []
             descriptions = []
+
             for result in results:
+                # Load frame
+                frame = Image.open(result["path"])
+                frames.append(frame)

+                # Prepare description
+                desc = f"Timestamp: {result['timestamp']:.2f}s\n"
+                desc += f"Scene Description: {result['caption']}\n"
+                desc += f"Relevance Score: {result['relevance']:.2f}"
+                descriptions.append(desc)
+
+            # Combine descriptions
+            combined_desc = "\n\nFrame Analysis:\n\n"
             for i, desc in enumerate(descriptions, 1):
+                combined_desc += f"Frame {i}:\n{desc}\n\n"
+
+            return frames, combined_desc
+
         except Exception as e:
+            return None, f"Error: {str(e)}"

     def create_interface(self):
         """Create Gradio interface"""
+        with gr.Blocks(title="Advanced Video Question Answering") as interface:
             gr.Markdown("# Advanced Video Question Answering")
             gr.Markdown("Upload a video and ask questions about any aspect of its content!")

             with gr.Row():
                 video_input = gr.File(
+                    label="Upload Video",
                     file_types=["video"],
                 )
                 process_button = gr.Button("Process Video")

+            status_output = gr.Textbox(
+                label="Status",
+                interactive=False
+            )

             with gr.Row():
                 query_input = gr.Textbox(
+                    label="Ask about the video",
                     placeholder="What's happening in the video?"
                 )
                 query_button = gr.Button("Search")

                 gallery = gr.Gallery(
                     label="Retrieved Frames",
                     show_label=True,
                     columns=[2],
                     rows=[2],
                     height="auto"

             process_button.click(
                 fn=self.process_video,
                 inputs=[video_input],
+                outputs=[status_output]
             )

             query_button.click(
+                fn=self.answer_question,
                 inputs=[query_input],
                 outputs=[gallery, descriptions]
             )

         return interface

 # Initialize and create the interface
+app = VideoQAInterface()
 interface = app.create_interface()

 # Launch the app
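
For reference, a minimal headless sketch of how the VideoRAGSystem class introduced in this commit could be exercised outside the Gradio UI. This is not part of the commit: the video path "sample.mp4" and the query string are placeholders, and it assumes the class definition above is in scope (for example, run at the bottom of app.py before the launch call).

    from PIL import Image

    rag = VideoRAGSystem()                     # loads CLIP (ViT-B/32) and BLIP-2 once
    if rag.process_video("sample.mp4"):        # samples up to 15 keyframes, captions them, builds the FAISS index
        for hit in rag.search_frames("a person riding a bicycle", k=4):
            print(f"{hit['timestamp']:.2f}s  relevance={hit['relevance']:.2f}  {hit['caption']}")
            frame = Image.open(hit["path"])    # keyframe image saved to disk by extract_keyframes

Each result dict carries the frame's timestamp, BLIP-2 caption, saved image path, and a relevance score computed as 1 / (1 + L2 distance) between the CLIP text embedding of the query and the CLIP image embedding of the frame.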