KB-VQA-E

Running

File size: 7,266 Bytes

8b0c813
 
 
 
 
 
 
dbca0c8
8b0c813
 
 
 
 
 
dbca0c8
 
8b0c813
 
 
 
 
dbca0c8
 
 
 
 
 
 
8b0c813
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbca0c8
8b0c813
 
 
 
 
 
 
 
 
 
dbca0c8
8b0c813
 
 
dbca0c8
8b0c813
 
 
 
 
dbca0c8
 
8b0c813
 
 
 
 
 
 
 
dbca0c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b0c813
dbca0c8
8b0c813
 
 
 
 
 
 
 
 
f3dc719
dbca0c8
8b0c813
 
dbca0c8

import pandas as pd
from fuzzywuzzy import fuzz
from collections import Counter
from nltk.stem import PorterStemmer
from ast import literal_eval
from typing import Union, List
import streamlit as st
from my_model.config import evaluation_config as config 

class KBVQAEvaluator:
    """
    Evaluate KB-VQA model answers against human ground-truth answers.

    Two kinds of evaluation are offered:

    * syntactic: stemmed answers are compared (exactly or with fuzzy matching)
      to produce a per-row VQA score and exact-match score, plus a mean
      percentage per model configuration;
    * semantic: GPT-4 is asked to rate each answer against the ground truths.

    The evaluation data is an Excel workbook with a "Main Data" sheet and a
    "Scores" sheet; all settings come from ``evaluation_config``.
    """

    def __init__(self):
        """
        Initialize the evaluator with the dataset and configuration settings.

        Reads both sheets of the workbook at ``config.EVALUATION_DATA_PATH``,
        so constructing an instance performs file I/O.
        """
        self.data_path = config.EVALUATION_DATA_PATH
        self.use_fuzzy = config.USE_FUZZY
        self.stemmer = PorterStemmer()
        self.scores_df = pd.read_excel(self.data_path, sheet_name="Scores")
        self.df = pd.read_excel(self.data_path, sheet_name="Main Data")
        # Mean scores (as percentages) keyed by '<model name>_<configuration>'.
        self.vqa_scores = {}
        self.exact_match_scores = {}
        self.fuzzy_threshold = config.FUZZY_SCORE
        self.openai_api_key = config.OPENAI_API_KEY
        self.model_names = config.MODEL_NAMES
        self.model_configurations = config.MODEL_CONFIGURATIONS  # ['caption+detic', 'caption+yolov5', 'only_caption', 'only_detic', 'only_yolov5']
        self.gpt4_seed = config.GPT4_SEED
        self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
        self.gpt4_temperature = config.GPT4_TEMPERATURE

    def _stem_text(self, text: str) -> str:
        """
        Stem every whitespace-separated word of ``text`` and re-join with single spaces.
        """
        return " ".join(self.stemmer.stem(word) for word in text.split())

    def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
        """
        Apply Porter Stemmer to either a single string or a list of strings.

        Args:
            answers: One answer, or a list of answers.

        Returns:
            The stemmed answer, or a list of stemmed answers in the same order.
        """
        if isinstance(answers, list):
            return [self._stem_text(answer) for answer in answers]
        return self._stem_text(answers)

    def calculate_vqa_score(self, ground_truths, model_answer):
        """
        Calculate VQA score based on the number of matching answers, with optional fuzzy matching.

        The score is ``min(matches / 3, 1)``: three or more matching ground
        truths give full credit.

        Args:
            ground_truths: List of (stemmed) ground-truth answers.
            model_answer: The (stemmed) model answer.

        Returns:
            A score between 0 and 1.
        """
        if self.use_fuzzy:
            matches = sum(
                fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold
                for gt in ground_truths
            )
        else:
            matches = Counter(ground_truths).get(model_answer, 0)
        return min(matches / 3, 1)

    def calculate_exact_match_score(self, ground_truths, model_answer):
        """
        Calculate Exact Match score, with optional fuzzy matching.

        Args:
            ground_truths: List of (stemmed) ground-truth answers.
            model_answer: The (stemmed) model answer.

        Returns:
            1 if the model answer matches at least one ground truth, else 0.
        """
        if self.use_fuzzy:
            return int(any(fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold for gt in ground_truths))
        return int(model_answer in ground_truths)

    def syntactic_evaluation(self):
        """
        Process the DataFrame: stem answers, calculate scores, and store results.

        For every '<model name>_<configuration>' column this adds a stemmed
        column, a per-row VQA score column and a per-row exact-match column to
        ``self.df``, and records the mean scores (as percentages rounded to two
        decimals) in ``self.vqa_scores`` and ``self.exact_match_scores``.
        """
        # 'raw_answers' holds the textual form of a Python list, hence literal_eval.
        self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)

        for name in self.model_names:
            # Named 'configuration' so the module-level 'config' import is not shadowed.
            for configuration in self.model_configurations:
                full_config = f'{name}_{configuration}'
                stemmed_column = f'{full_config}_stemmed'
                vqa_column = f'vqa_score_{full_config}'
                exact_match_column = f'exact_match_score_{full_config}'

                self.df[stemmed_column] = self.df[full_config].apply(self.stem_answers)

                self.df[vqa_column] = self.df.apply(
                    lambda row: self.calculate_vqa_score(row['raw_answers_stemmed'], row[stemmed_column]),
                    axis=1,
                )
                self.df[exact_match_column] = self.df.apply(
                    lambda row: self.calculate_exact_match_score(row['raw_answers_stemmed'], row[stemmed_column]),
                    axis=1,
                )

                self.vqa_scores[full_config] = round(self.df[vqa_column].mean()*100, 2)
                self.exact_match_scores[full_config] = round(self.df[exact_match_column].mean()*100, 2)

    def create_GPT4_messages_template(self, question, ground_truths, model_answer):
        """
        Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.

        Args:
            question: The question text.
            ground_truths: The ground-truth answers, already formatted as text.
            model_answer: The answer produced by the model under evaluation.

        Returns:
            A two-element list: the system message holding the rating
            instructions, followed by the user message holding the data.
        """
        system_message = {
            "role": "system",
            "content": """You are an AI trained to evaluate the equivalence of AI-generated answers to a set of ground truth answers for a given question. Upon reviewing a model's answer, determine if it matches the ground truths. Use the following rating system: 1 if you find that the model answer matches more than 25% of the ground truth answers, 2 if you find that the model answer matches only less than 25% of the ground truth answers, and 3 if the model answer is incorrect. Respond in the format below for easy parsing:
                        Rating: {1/2/3}
                       """
        }

        user_message = {
            "role": "user",
            "content": f"Question : {question}\nGround Truth: {ground_truths}\nModel's Response: {model_answer}"
        }

        return [system_message, user_message]

    @staticmethod
    def _parse_rating(evaluation):
        """
        Extract the 1/2/3 rating from a GPT-4 reply of the form 'Rating: <n>'.

        The first line whose label is 'Rating' (case-insensitive) is used, and
        decorations such as braces around the digit are tolerated.

        Returns:
            The rating as an int, or None when no valid rating can be found.
        """
        for line in evaluation.splitlines():
            label, separator, value = line.partition(":")
            if separator and label.strip().lower() == "rating":
                candidate = value.strip().strip("{}[]().* ")
                if candidate in ("1", "2", "3"):
                    return int(candidate)
        return None

    def semantic_evaluation(self):
        """
        Perform semantic evaluation using GPT-4 for each model configuration.

        Ratings are written to ``self.df`` in one column per model and
        configuration, named 'gpt4_rating_<model name>_<configuration>', so
        that several models do not overwrite each other's ratings. Rows whose
        GPT-4 reply cannot be parsed keep a missing value (NaN).
        """
        # Imported here because the module does not import it at top level and
        # it is only needed for this paid evaluation step.
        import openai

        openai.api_key = self.openai_api_key
        model_configurations_for_semantic_evaluation = self.model_configurations[:2] # considering only main model configs ['caption+detic', 'caption+yolov5'] without ablation, due to the cost involved.
        for name in self.model_names:
            # Named 'configuration' so the module-level 'config' import is not shadowed.
            for configuration in model_configurations_for_semantic_evaluation:
                full_config = f'{name}_{configuration}'
                rating_column = f'gpt4_rating_{full_config}'
                # Create the column up front so unparsable replies are left as NaN.
                self.df[rating_column] = float("nan")
                # Iterate over rows and send requests
                for index, row in self.df.iterrows():
                    # [1:-1] drops the surrounding brackets of the textual list.
                    messages = self.create_GPT4_messages_template(row['question'], row['raw_answers'][1:-1], row[full_config])
                    response = openai.ChatCompletion.create(model="gpt-4", messages=messages, max_tokens=self.gpt4_max_tokens, temperature=self.gpt4_temperature, seed=self.gpt4_seed)
                    evaluation = response["choices"][0]["message"]["content"]
                    rating = self._parse_rating(evaluation)
                    if rating is not None:
                        self.df.at[index, rating_column] = rating

    def save_results(self, save_filename):
        """
        Save the evaluated data and the aggregated scores to an Excel file.

        Args:
            save_filename: Output path without extension; '.xlsx' is appended.
        """
        # Create a DataFrame for the scores
        scores_df = pd.DataFrame({
            'Model Configuration': list(self.vqa_scores.keys()),
            'VQA Score': list(self.vqa_scores.values()),
            'Exact Match Score': list(self.exact_match_scores.values()),
        })

        # Saving the scores DataFrame to an Excel file
        with pd.ExcelWriter(save_filename+'.xlsx', engine='openpyxl', mode='w') as writer:
            self.df.to_excel(writer, sheet_name='Main Data', index=False)
            scores_df.to_excel(writer, sheet_name='Scores', index=False)

def run_evaluation(save=False, save_filename="results", run_semantic=True):
    """
    Run the full evaluation process using KBVQAEvaluator and optionally save the results to an Excel file.

    Args:
        save: When True, write the results with ``KBVQAEvaluator.save_results``.
        save_filename: Output path without extension, passed to ``save_results``.
        run_semantic: When True (the default, matching the previous behavior),
            also run the GPT-4 based semantic evaluation, which can be
            cost-intensive. Pass False to run the syntactic evaluation only.

    Returns:
        The evaluator instance, so the computed scores remain accessible even
        when ``save`` is False.
    """
    # Instantiate the evaluator
    evaluator = KBVQAEvaluator()

    # Run syntactic evaluation
    evaluator.syntactic_evaluation()

    # Optionally, run semantic evaluation if required (can be cost-intensive)
    if run_semantic:
        evaluator.semantic_evaluation()

    if save:
        # Save results
        evaluator.save_results(save_filename)

    return evaluator

# Script entry point. The run_evaluation() call below is commented out, so
# running this module directly currently does nothing; uncomment it to execute
# the evaluation process and save the results.
if __name__ == "__main__":
    #run_evaluation(save=True, save_filename="results")
    pass