File size: 7,266 Bytes
8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 dbca0c8 8b0c813 f3dc719 dbca0c8 8b0c813 dbca0c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import pandas as pd
from fuzzywuzzy import fuzz
from collections import Counter
from nltk.stem import PorterStemmer
from ast import literal_eval
from typing import Union, List
import streamlit as st
from my_model.config import evaluation_config as config
class KBVQAEvaluator:
    """
    Evaluate model answers against ground-truth answers.

    Two evaluations are offered:

    * syntactic: stem the answers, then compute a VQA score and an exact-match
      score per ``{model_name}_{model_configuration}`` column (optionally with
      fuzzy string matching);
    * semantic: ask GPT-4 to rate every model answer against the ground truths.

    Data is read from the Excel workbook at ``config.EVALUATION_DATA_PATH``
    (sheets "Main Data" and "Scores").
    """

    # Number of matching ground-truth answers required for a full VQA score:
    # the score is min(matches / 3, 1).
    _VQA_AGREEMENT_DIVISOR = 3

    def __init__(self):
        """
        Load the evaluation workbook and copy the settings from the
        evaluation configuration module onto the instance.
        """
        self.data_path = config.EVALUATION_DATA_PATH
        self.use_fuzzy = config.USE_FUZZY
        self.stemmer = PorterStemmer()
        self.scores_df = pd.read_excel(self.data_path, sheet_name="Scores")
        self.df = pd.read_excel(self.data_path, sheet_name="Main Data")
        # Aggregated percentage scores, keyed by '{model_name}_{model_configuration}'.
        self.vqa_scores = {}
        self.exact_match_scores = {}
        self.fuzzy_threshold = config.FUZZY_SCORE
        self.openai_api_key = config.OPENAI_API_KEY
        self.model_names = config.MODEL_NAMES
        self.model_configurations = config.MODEL_CONFIGURATIONS  # ['caption+detic', 'caption+yolov5', 'only_caption', 'only_detic', 'only_yolov5']
        self.gpt4_seed = config.GPT4_SEED
        self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
        self.gpt4_temperature = config.GPT4_TEMPERATURE

    def _stem_text(self, text) -> str:
        """
        Stem every whitespace-separated word of a single answer.

        Cells read from Excel may be empty (NaN) or numeric rather than text,
        so missing values become an empty string and anything else is
        converted with ``str`` before stemming.
        """
        # `text != text` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        return " ".join(self.stemmer.stem(word) for word in str(text).split())

    def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
        """
        Apply the Porter stemmer to either a single answer or a list of answers.

        Returns a stemmed string for a single answer, or a list of stemmed
        strings (same order) for a list of answers.
        """
        if isinstance(answers, list):
            return [self._stem_text(answer) for answer in answers]
        return self._stem_text(answers)

    def calculate_vqa_score(self, ground_truths, model_answer):
        """
        Calculate the VQA score: min(number of matching ground truths / 3, 1).

        With fuzzy matching enabled, a ground truth matches when its
        ``fuzz.partial_ratio`` with the model answer reaches the configured
        threshold; otherwise only identical strings match.
        """
        if self.use_fuzzy:
            matches = sum(
                fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold
                for gt in ground_truths
            )
        else:
            matches = Counter(ground_truths).get(model_answer, 0)
        return min(matches / self._VQA_AGREEMENT_DIVISOR, 1)

    def calculate_exact_match_score(self, ground_truths, model_answer):
        """
        Calculate the exact-match score: 1 if the model answer matches at
        least one ground truth (fuzzily when fuzzy matching is enabled),
        otherwise 0.
        """
        if self.use_fuzzy:
            return int(any(
                fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold
                for gt in ground_truths
            ))
        return int(model_answer in ground_truths)

    def syntactic_evaluation(self):
        """
        Stem the answers, score every model/configuration column and store
        the results.

        Adds the columns 'raw_answers_stemmed', '{full_config}_stemmed',
        'vqa_score_{full_config}' and 'exact_match_score_{full_config}' to
        ``self.df`` and fills ``self.vqa_scores`` / ``self.exact_match_scores``
        with the mean scores as percentages rounded to two decimals.
        """
        # 'raw_answers' holds the string form of a Python list; parse it first.
        self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)

        for name in self.model_names:
            # Named `model_config` so the imported `config` module is not shadowed.
            for model_config in self.model_configurations:
                full_config = f'{name}_{model_config}'
                stemmed_col = f'{full_config}_stemmed'
                vqa_col = f'vqa_score_{full_config}'
                exact_col = f'exact_match_score_{full_config}'

                self.df[stemmed_col] = self.df[full_config].apply(self.stem_answers)
                self.df[vqa_col] = self.df.apply(
                    lambda row: self.calculate_vqa_score(row['raw_answers_stemmed'], row[stemmed_col]),
                    axis=1,
                )
                self.df[exact_col] = self.df.apply(
                    lambda row: self.calculate_exact_match_score(row['raw_answers_stemmed'], row[stemmed_col]),
                    axis=1,
                )

                self.vqa_scores[full_config] = round(self.df[vqa_col].mean()*100, 2)
                self.exact_match_scores[full_config] = round(self.df[exact_col].mean()*100, 2)

    def create_GPT4_messages_template(self, question, ground_truths, model_answer):
        """
        Create the message list for the GPT-4 API call.

        Returns ``[system_message, user_message]``: the system message holds
        the rating instructions, the user message holds the question, the
        ground truths and the model answer.
        """
        system_message = {
            "role": "system",
            "content": (
                "You are an AI trained to evaluate the equivalence of AI-generated answers to a set of ground truth answers for a given question. "
                "Upon reviewing a model's answer, determine if it matches the ground truths. "
                "Use the following rating system: 1 if you find that the model answer matches more than 25% of the ground truth answers, "
                "2 if you find that the model answer matches only less than 25% of the ground truth answers, "
                "and 3 if the model answer is incorrect. "
                "Respond in the format below for easy parsing:\n"
                "Rating: {1/2/3}\n"
            ),
        }
        user_message = {
            "role": "user",
            "content": f"Question : {question}\nGround Truth: {ground_truths}\nModel's Response: {model_answer}"
        }
        return [system_message, user_message]

    @staticmethod
    def _parse_gpt4_rating(evaluation) -> Union[int, None]:
        """
        Extract the integer following 'Rating:' from a GPT-4 reply.

        Tolerates leading blank lines, braces around the number (the prompt
        shows the format as 'Rating: {1/2/3}') and trailing text. Returns
        None when no rating can be found, so that one malformed reply does
        not abort the whole evaluation run.
        """
        import re  # local import: `re` is not imported at file level

        found = re.search(r"rating\s*:\s*\{?\s*([0-9]+)", str(evaluation), flags=re.IGNORECASE)
        return int(found.group(1)) if found else None

    def semantic_evaluation(self):
        """
        Perform semantic evaluation using GPT-4.

        Every row of ``self.df`` is rated for each model name combined with
        the first two model configurations. Ratings are stored in the column
        'gpt4_rating_{name}_{configuration}' so that ratings of different
        models do not overwrite each other; replies without a parseable
        rating are stored as NaN.
        """
        # Imported lazily: the file does not import openai at module level,
        # and syntactic evaluation should not require the package.
        import openai

        openai.api_key = self.openai_api_key
        # considering only main model configs ['caption+detic', 'caption+yolov5'] without ablation, due to the cost involved.
        model_configurations_for_semantic_evaluation = self.model_configurations[:2]

        for name in self.model_names:
            for model_config in model_configurations_for_semantic_evaluation:
                full_config = f'{name}_{model_config}'
                rating_column = f'gpt4_rating_{full_config}'
                # Iterate over rows and send requests
                for index, row in self.df.iterrows():
                    # [1:-1] drops the surrounding brackets of the list's string form.
                    messages = self.create_GPT4_messages_template(row['question'], row['raw_answers'][1:-1], row[full_config])
                    response = openai.ChatCompletion.create(
                        model="gpt-4",
                        messages=messages,
                        max_tokens=self.gpt4_max_tokens,
                        temperature=self.gpt4_temperature,
                        seed=self.gpt4_seed,
                    )
                    evaluation = response["choices"][0]["message"]["content"]
                    rating = self._parse_gpt4_rating(evaluation)
                    self.df.at[index, rating_column] = float("nan") if rating is None else rating

    def save_results(self, save_filename):
        """
        Write the evaluation results to '<save_filename>.xlsx'.

        The workbook gets two sheets: 'Main Data' (``self.df`` with all
        columns added during evaluation) and 'Scores' (one row per model
        configuration with its VQA and exact-match score).
        """
        # Create a DataFrame for the scores
        scores_data = {
            'Model Configuration': list(self.vqa_scores.keys()),
            'VQA Score': list(self.vqa_scores.values()),
            'Exact Match Score': list(self.exact_match_scores.values())
        }
        scores_df = pd.DataFrame(scores_data)

        # Saving the scores DataFrame to an Excel file
        with pd.ExcelWriter(save_filename+'.xlsx', engine='openpyxl', mode='w') as writer:
            self.df.to_excel(writer, sheet_name='Main Data', index=False)
            scores_df.to_excel(writer, sheet_name='Scores', index=False)
def run_evaluation(save=False, save_filename="results", run_semantic=True):
    """
    Run the evaluation process using KBVQAEvaluator and optionally save the
    results to an Excel file.

    Args:
        save: When true, write the results to '<save_filename>.xlsx'.
        save_filename: Output file name without the '.xlsx' extension.
        run_semantic: When true (the default), also run the GPT-4 based
            semantic evaluation. It sends one API request per row and model
            configuration and can be cost-intensive, so pass False to run the
            syntactic evaluation only.
    """
    # Instantiate the evaluator
    evaluator = KBVQAEvaluator()

    # Run syntactic evaluation
    evaluator.syntactic_evaluation()

    # Semantic evaluation is optional because of the API cost involved
    if run_semantic:
        evaluator.semantic_evaluation()

    if save:
        # Save results
        evaluator.save_results(save_filename)
# Running this file directly currently does nothing: the evaluation call is
# disabled. Call run_evaluation(save=True, save_filename="results") to execute
# the evaluation process.
if __name__ == "__main__":
    pass