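"""
Evaluation utilities for Knowledge-Based Visual Question Answering (KB-VQA).

Provides KBVQAEvaluator, which computes syntactic scores (exact match and VQA
accuracy, optionally with fuzzy matching) and GPT-4-based semantic ratings for
several model configurations, plus a run_evaluation() helper that can save the
results to an Excel workbook.
"""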
import pandas as pd
import openai
from fuzzywuzzy import fuzz
from collections import Counter
from nltk.stem import PorterStemmer
from ast import literal_eval
from typing import Union, List
import streamlit as st
from my_model.config import evaluation_config as config

class KBVQAEvaluator:
    """
    A class to evaluate Knowledge-Based Visual Question Answering (KB-VQA) models.
    
    This class provides methods for syntactic and semantic evaluation of the KB-VQA model,
    using both exact match and VQA scores. The evaluation results can be saved to an 
    Excel file for further analysis.

    Attributes:
        data_path (str): Path to the evaluation data.
        use_fuzzy (bool): Flag to determine if fuzzy matching should be used.
        stemmer (PorterStemmer): Instance of PorterStemmer for stemming answers.
        scores_df (pd.DataFrame): DataFrame containing scores.
        df (pd.DataFrame): Main DataFrame containing evaluation data.
        vqa_scores (Dict[str, float]): Dictionary to store VQA scores for different model configurations.
        exact_match_scores (Dict[str, float]): Dictionary to store exact match scores for different model configurations.
        fuzzy_threshold (int): Threshold for fuzzy matching score.
        openai_api_key (str): API key for OpenAI GPT-4.
        model_names (List[str]): List of model names to be evaluated.
        model_configurations (List[str]): List of model configurations to be evaluated.
        gpt4_seed (int): Seed for GPT-4 evaluation.
        gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
        gpt4_temperature (float): Temperature setting for GPT-4 responses.
    """
    
    def __init__(self) -> None:
        """
        Initialize the KBVQAEvaluator with the dataset and configuration settings.
        
        Reads data from the specified paths in the configuration and initializes
        various attributes required for evaluation.
        """

        self.data_path = config.EVALUATION_DATA_PATH
        self.use_fuzzy = config.USE_FUZZY
        self.stemmer = PorterStemmer()
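        # The evaluation workbook is expected to contain two sheets:
        # 'Main Data' (per-question answers) and 'Scores' (aggregate scores).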
        self.scores_df = pd.read_excel(self.data_path, sheet_name="Scores")
        self.df = pd.read_excel(self.data_path, sheet_name="Main Data")
        self.vqa_scores = {}
        self.exact_match_scores = {}
        self.fuzzy_threshold = config.FUZZY_SCORE
        self.openai_api_key = config.OPENAI_API_KEY
        self.model_names = config.MODEL_NAMES
        self.model_configurations = config.MODEL_CONFIGURATIONS  # ['caption+detic', 'caption+yolov5', 'only_caption', 'only_detic', 'only_yolov5']
        self.gpt4_seed = config.GPT4_SEED
        self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
        self.gpt4_temperature = config.GPT4_TEMPERATURE

    def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
        """
        Apply Porter Stemmer to either a single string or a list of strings.

        Args:
            answers (Union[str, List[str]]): A single answer string or a list of answer strings.

        Returns:
            Union[str, List[str]]: Stemmed version of the input string or list of strings.
        """
        
        if isinstance(answers, list):
            return [" ".join(self.stemmer.stem(word.strip()) for word in answer.split()) for answer in answers]
        else:
            words = answers.split()
            return " ".join(self.stemmer.stem(word.strip()) for word in words)

    def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
        """
        Calculate VQA score based on the number of matching answers, with optional fuzzy matching.

        Args:
            ground_truths (List[str]): List of ground truth answers.
            model_answer (str): Model's answer to be evaluated.

        Returns:
            float: VQA score based on the number of matches.
        """
       
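        # Standard VQA accuracy: an answer counts as fully correct when it matches
        # at least 3 of the ground-truth annotations, hence min(n / 3, 1).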
        if self.use_fuzzy:
            fuzzy_matches = sum(fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold for gt in ground_truths)
            return min(fuzzy_matches / 3, 1)
        else:
            count = Counter(ground_truths)
            return min(count.get(model_answer, 0) / 3, 1)

    def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
        """
        Calculate Exact Match score, with optional fuzzy matching.

        Args:
            ground_truths (List[str]): List of ground truth answers.
            model_answer (str): Model's answer to be evaluated.

        Returns:
            int: Exact match score (1 if there is a match, 0 otherwise).
        """
        
        if self.use_fuzzy:
            return int(any(fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold for gt in ground_truths))
        else:
            return int(model_answer in ground_truths)

    def syntactic_evaluation(self) -> None:
        """
        Stem the answers, compute exact match and VQA scores for every model configuration, and store the aggregated results.
        """
        
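        # 'raw_answers' is stored as a stringified list, so convert it back with
        # literal_eval before stemming each ground-truth answer.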
        self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
        
        for name in self.model_names:
            for model_config in self.model_configurations:
                full_config = f'{name}_{model_config}'
                self.df[f'{full_config}_stemmed'] = self.df[full_config].apply(self.stem_answers)

                # Row-wise scores against the stemmed ground-truth answers.
                self.df[f'vqa_score_{full_config}'] = self.df.apply(
                    lambda x: self.calculate_vqa_score(x['raw_answers_stemmed'], x[f'{full_config}_stemmed']), axis=1)
                self.df[f'exact_match_score_{full_config}'] = self.df.apply(
                    lambda x: self.calculate_exact_match_score(x['raw_answers_stemmed'], x[f'{full_config}_stemmed']), axis=1)

                # Aggregate per-configuration scores as percentages.
                self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean() * 100, 2)
                self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean() * 100, 2)

    def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
        """
        Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.

        Args:
            question (str): The question being evaluated.
            ground_truths (List[str]): List of ground truth answers.
            model_answer (str): Model's answer to be evaluated.

        Returns:
            List[dict]: Messages formatted for GPT-4 API call.
        """
        
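        # The system prompt pins GPT-4 to a fixed "Rating: {1/2/3}" reply format
        # so the rating can be parsed deterministically in semantic_evaluation().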
        system_message = {
            "role": "system",
            "content": """You are an AI trained to evaluate the equivalence of AI-generated answers to a set of ground truth answers for a given question. Upon reviewing a model's answer, determine if it matches the ground truths. Use the following rating system: 1 if you find that the model answer matches more than 25% of the ground truth answers, 2 if you find that the model answer matches only less than 25% of the ground truth answers, and 3 if the model answer is incorrect. Respond in the format below for easy parsing:
                        Rating: {1/2/3}
                       """
                     }
    
        user_message = {
            "role": "user", 
            "content": f"Question : {question}\nGround Truth: {ground_truths}\nModel's Response: {model_answer}"
        }

        return [system_message, user_message]
        

    def semantic_evaluation(self) -> None:
        """
        Perform semantic evaluation using GPT-4 for each model configuration.
        """
        openai.api_key = self.openai_api_key
        model_configurations_for_semantic_evaluation = self.model_configurations[:2] # considering only main model configs ['caption+detic', 'caption+yolov5'] without ablation, due to the cost involved.
        for name in self.model_names:
            for model_config in model_configurations_for_semantic_evaluation:
                full_config = f'{name}_{model_config}'
                # Iterate over rows and send requests
                for index, row in self.df.iterrows():
                    # 'raw_answers' is a stringified list; [1:-1] strips the surrounding brackets.
                    messages = self.create_GPT4_messages_template(row['question'], row['raw_answers'][1:-1], row[full_config])
                    response = openai.ChatCompletion.create(model="gpt-4",
                                                            messages=messages,
                                                            max_tokens=self.gpt4_max_tokens,
                                                            temperature=self.gpt4_temperature,
                                                            seed=self.gpt4_seed)
                    evaluation = response["choices"][0]["message"]["content"]
                    # The prompt requests "Rating: {1/2/3}" on the first line; parse the integer after the colon.
                    rating = int(evaluation.split('\n')[0].split(":")[1].strip())
                    self.df.at[index, f'gpt4_rating_{full_config}'] = rating
                
    def save_results(self, save_filename: str) -> None:
        """
        Save the evaluation results to an Excel file.

        Args:
            save_filename (str): The filename to save the results.
        """
         
        # Create a DataFrame for the scores
        scores_data = {
            'Model Configuration': list(self.vqa_scores.keys()),
            'VQA Score': list(self.vqa_scores.values()),
            'Exact Match Score': list(self.exact_match_scores.values())
        }
        scores_df = pd.DataFrame(scores_data)

        # Saving the scores DataFrame to an Excel file
        with pd.ExcelWriter(save_filename+'.xlsx', engine='openpyxl', mode='w') as writer:
            self.df.to_excel(writer, sheet_name='Main Data', index=False)
            scores_df.to_excel(writer, sheet_name='Scores', index=False)

def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
    """
    Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.

    Args:
        save (bool): Whether to save the results to an Excel file. Defaults to False.
        save_filename (str): The filename to save the results if save is True. Defaults to "results".
    """
    
    # Instantiate the evaluator
    evaluator = KBVQAEvaluator()
    
    # Run syntactic evaluation
    evaluator.syntactic_evaluation()
    
    # Run semantic evaluation (uses the GPT-4 API and can be cost-intensive)
    evaluator.semantic_evaluation()
    
    if save:
        # Save results
        evaluator.save_results(save_filename)

# Call run_evaluation() to execute the evaluation process
if __name__ == "__main__":
    #run_evaluation(save=True, save_filename="results")
    pass