KB-VQA-E

Running

App Files Files Community

m7mdal7aj commited on May 13

Commit

484e080

•

1 Parent(s): 7854237

Create dataset_processor.py

Browse files

Files changed (1) hide show

my_model/dataset/dataset_processor.py +176 -0

my_model/dataset/dataset_processor.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import json
+from collections import Counter
+import contractions
+import csv
+from typing import Tuple, List, Optional
+from my_model.config import dataset_config as config
+class OKVQADatasetProcessor:
+    """
+    Processes the OKVQA dataset by loading, processing, and merging question and annotation data.
+    Attributes:
+        questions_file_path (str): Path to the questions JSON file.
+        annotations_file_path (str): Path to the annotations JSON file.
+        questions (List[dict]): Extracted list of question entries from the JSON file.
+        annotations (List[dict]): Extracted list of annotation entries from the JSON file.
+        df_questions (DataFrame): DataFrame holding the questions.
+        df_answers (DataFrame): DataFrame holding the annotations.
+        merged_df (Optional[DataFrame]): DataFrame resulting from merging questions and answers, initialized as None.
+    """
+    def __init__(self, questions_file_path: str, annotations_file_path: str) -> None:
+        """
+        Initializes the dataset processor with file paths and loads the data into DataFrames.
+        Parameters:
+            questions_file_path (str): The file path for the questions JSON file.
+            annotations_file_path (str): The file path for the annotations JSON file.
+        """
+        self.questions_file_path = questions_file_path
+        self.annotations_file_path = annotations_file_path
+        self.questions, self.annotations = self.load_data_files()
+        self.df_questions = pd.DataFrame(self.questions)
+        self.df_answers = pd.DataFrame(self.annotations)
+        self.merged_df = None
+    def load_data_files(self) -> Tuple[List[dict], List[dict]]:
+        """
+        Loads the question and annotation data from JSON files.
+        Returns:
+            Tuple[List[dict], List[dict]]: A tuple containing lists of questions and annotations.
+        """
+        with open(self.questions_file_path, 'r') as file:
+            data = json.load(file)
+            questions = data['questions']
+        with open(self.annotations_file_path, 'r') as file:
+            data = json.load(file)
+            annotations = data['annotations']
+        return questions, annotations
+    @staticmethod
+    def find_most_frequent(my_list: List[str]) -> Optional[str]:
+        """
+        Determines the most frequent item in a list.
+        Parameters:
+            my_list (List[str]): The list from which to find the most frequent item.
+        Returns:
+            Optional[str]: The most frequent item or None if the list is empty.
+        """
+        if not my_list:
+            return None
+        counter = Counter(my_list)
+        most_common = counter.most_common(1)
+        return most_common[0][0]
+    def merge_data(self) -> None:
+        """
+        Merges the question and answer DataFrames on a common key.
+        This method sets the 'merged_df' attribute to the resulting DataFrame after merging
+        'df_questions' and 'df_answers' on the 'question_id' field, which is assumed to be
+        present in both DataFrames.
+        """
+        self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])
+    def join_words_with_hyphen(self, sentence):
+        return '-'.join(sentence.split())
+    def process_answers(self) -> None:
+        """
+        Processes answers from merged DataFrame by extracting and identifying the most frequent answers.
+        """
+        if self.merged_df is not None:
+            self.merged_df['raw_answers'] = self.merged_df['answers'].apply(lambda x: [ans['raw_answer'] for ans in x])
+            self.merged_df['processed_answers'] = self.merged_df['answers'].apply(
+                lambda x: [ans['answer'] for ans in x])
+            self.merged_df['most_frequent_raw_answer'] = self.merged_df['raw_answers'].apply(self.find_most_frequent)
+            self.merged_df['most_frequent_processed_answer'] = self.merged_df['processed_answers'].apply(
+                self.find_most_frequent)
+            self.merged_df.drop(columns=['answers'], inplace=True)
+        else:
+            print("DataFrames have not been merged yet.")
+        # Apply the function to the 'most_frequent_processed_answer' column
+        self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
+            self.join_words_with_hyphen)
+    def get_processed_data(self) -> Optional[pd.DataFrame]:
+        """
+        Retrieves the processed DataFrame.
+        Returns:
+            Optional[pd.DataFrame]: The processed DataFrame or None if it is not available.
+        """
+        if self.merged_df is not None:
+            return self.merged_df
+        else:
+            print("DataFrame is empty or not processed yet.")
+            return None
+    def save_to_csv(self, df: pd.DataFrame, saved_file_name: Optional[str]) -> None:
+        """
+        Saves the DataFrame to a CSV file.
+        Parameters:
+            df (pd.DataFrame): The DataFrame to save.
+            saved_file_name (Optional[str]): The target file name or path.
+        """
+        if saved_file_name is not None:
+            if ".csv" not in saved_file_name:
+                df.to_csv(os.path.join(saved_file_name, ".csv"), index=None)
+            else:
+                df.to_csv(saved_file_name, index=None)
+        else:
+            df.to_csv("data.csv", index=None)
+    def display_dataframe(self) -> None:
+        """
+        Displays the processed DataFrame.
+        """
+        if self.merged_df is not None:
+            print(self.merged_df)
+        else:
+            print("DataFrame is empty.")
+def process_okvqa_dataset(questions_file_path: str, annotations_file_path: str, save_to_csv: bool = False,
+                          saved_file_name: Optional[str] = None) -> Optional[pd.DataFrame]:
+    """
+    Orchestrates the processing of the OK-VQA dataset using specified JSON file paths for questions and annotations.
+    Parameters:
+        questions_file_path (str): Path to the questions JSON file.
+        annotations_file_path (str): Path to the annotations JSON file.
+        save_to_csv (bool): Flag to determine if the processed data should be saved to CSV.
+        saved_file_name (Optional[str]): Filename or path to save the CSV file. If None, defaults to 'data.csv'.
+    Returns:
+        Optional[pd.DataFrame]: The processed DataFrame containing merged and processed VQA data or None if empty.
+    """
+    # Initialize the dataset processor
+    processor = OKVQADatasetProcessor(questions_file_path, annotations_file_path)
+    # Merge question and answer data and process answers
+    processor.merge_data()
+    processor.process_answers()
+    # Retrieve the processed DataFrame
+    processed_data = processor.get_processed_data()
+    # Optionally save the processed DataFrame to a CSV file
+    if save_to_csv and processed_data is not None:
+        processor.save_to_csv(processed_data, saved_file_name)
+    return processed_data