naveensp committed 28 days ago

Commit

2ce0406

•

1 Parent(s): 3f472e0

all the files required for inference

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

OLMo_Bitnet_1B/__pycache__/__init__.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/aliases.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/beam_search.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/config.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/configuration_olmo.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/exceptions.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/initialization.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/model.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/modeling_olmo.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/optim.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/safetensors_util.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/torch_util.cpython-311.pyc +0 -0
OLMo_Bitnet_1B/__pycache__/util.cpython-311.pyc +0 -0
checkpoints/llava-LlavaOLMoBitnet1B-Run3-finetune/pytorch_model.bin +3 -0
llava/__init__.py +1 -0
llava/__pycache__/__init__.cpython-310.pyc +0 -0
llava/__pycache__/__init__.cpython-311.pyc +0 -0
llava/__pycache__/__init__.cpython-312.pyc +0 -0
llava/__pycache__/__init__.pypy39.pyc +0 -0
llava/__pycache__/constants.cpython-310.pyc +0 -0
llava/__pycache__/constants.cpython-311.pyc +0 -0
llava/__pycache__/constants.cpython-312.pyc +0 -0
llava/__pycache__/conversation.cpython-310.pyc +0 -0
llava/__pycache__/conversation.cpython-311.pyc +0 -0
llava/__pycache__/conversation.cpython-312.pyc +0 -0
llava/__pycache__/mm_utils.cpython-310.pyc +0 -0
llava/__pycache__/mm_utils.cpython-311.pyc +0 -0
llava/__pycache__/mm_utils.cpython-312.pyc +0 -0
llava/__pycache__/utils.cpython-311.pyc +0 -0
llava/__pycache__/utils.cpython-312.pyc +0 -0
llava/config.json +66 -0
llava/constants.py +13 -0
llava/conversation.py +396 -0
llava/eval/__pycache__/eval_textvqa.cpython-311.pyc +0 -0
llava/eval/__pycache__/eval_textvqa.cpython-312.pyc +0 -0
llava/eval/__pycache__/m4c_evaluator.cpython-311.pyc +0 -0
llava/eval/__pycache__/m4c_evaluator.cpython-312.pyc +0 -0
llava/eval/__pycache__/model_vqa.cpython-311.pyc +0 -0
llava/eval/__pycache__/model_vqa_loader.cpython-311.pyc +0 -0
llava/eval/__pycache__/model_vqa_loader.cpython-312.pyc +0 -0
llava/eval/eval_gpt_review.py +113 -0
llava/eval/eval_gpt_review_bench.py +121 -0
llava/eval/eval_gpt_review_visual.py +118 -0
llava/eval/eval_pope.py +81 -0
llava/eval/eval_science_qa.py +114 -0
llava/eval/eval_science_qa_gpt4.py +104 -0
llava/eval/eval_science_qa_gpt4_requery.py +149 -0
llava/eval/eval_textvqa.py +65 -0
llava/eval/generate_webpage_data_from_table.py +111 -0
llava/eval/m4c_evaluator.py +334 -0

OLMo_Bitnet_1B/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/__init__.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/__init__.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/aliases.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/aliases.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/aliases.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/beam_search.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/beam_search.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/beam_search.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/config.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/config.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/config.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/configuration_olmo.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/configuration_olmo.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/configuration_olmo.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/exceptions.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/exceptions.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/exceptions.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/initialization.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/initialization.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/initialization.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/model.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/model.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/model.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/modeling_olmo.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/modeling_olmo.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/modeling_olmo.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/optim.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/optim.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/optim.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/safetensors_util.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/safetensors_util.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/safetensors_util.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/torch_util.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/torch_util.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/torch_util.cpython-311.pyc differ

OLMo_Bitnet_1B/__pycache__/util.cpython-311.pyc CHANGED Viewed

Binary files a/OLMo_Bitnet_1B/__pycache__/util.cpython-311.pyc and b/OLMo_Bitnet_1B/__pycache__/util.cpython-311.pyc differ

checkpoints/llava-LlavaOLMoBitnet1B-Run3-finetune/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0838b7dc466915522b3661e1c3882ea090083d1339363a375b6442de273b858a
+size 3179392898

llava/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .model.language_model.llava_llama import LlavaLlamaForCausalLM

llava/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (188 Bytes). View file

llava/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (254 Bytes). View file

llava/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (222 Bytes). View file

llava/__pycache__/__init__.pypy39.pyc ADDED Viewed

Binary file (207 Bytes). View file

llava/__pycache__/constants.cpython-310.pyc ADDED Viewed

Binary file (496 Bytes). View file

llava/__pycache__/constants.cpython-311.pyc ADDED Viewed

Binary file (559 Bytes). View file

llava/__pycache__/constants.cpython-312.pyc ADDED Viewed

Binary file (542 Bytes). View file

llava/__pycache__/conversation.cpython-310.pyc ADDED Viewed

Binary file (10.5 kB). View file

llava/__pycache__/conversation.cpython-311.pyc ADDED Viewed

Binary file (16.9 kB). View file

llava/__pycache__/conversation.cpython-312.pyc ADDED Viewed

Binary file (15.9 kB). View file

llava/__pycache__/mm_utils.cpython-310.pyc ADDED Viewed

Binary file (8.77 kB). View file

llava/__pycache__/mm_utils.cpython-311.pyc ADDED Viewed

Binary file (14.6 kB). View file

llava/__pycache__/mm_utils.cpython-312.pyc ADDED Viewed

Binary file (13 kB). View file

llava/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (6.94 kB). View file

llava/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (6.63 kB). View file

llava/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "activation_type": "swiglu",
+  "alibi": false,
+  "alibi_bias_max": 8.0,
+  "architectures": [
+    "LlavaOLMoBitnet1BForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attention_layer_norm": false,
+  "attention_layer_norm_with_affine": false,
+  "bias_for_layer_norm": false,
+  "block_group_size": 1,
+  "block_type": "sequential",
+  "clip_qkv": null,
+  "d_model": 2048,
+  "embedding_dropout": 0.0,
+  "embedding_size": 50304,
+  "eos_token_id": 50279,
+  "bos_token_id": 50279,
+  "flash_attention": true,
+  "include_bias": false,
+  "init_cutoff_factor": null,
+  "init_device": "cpu",
+  "init_fn": "mitchell",
+  "init_std": 0.02,
+  "layer_norm_type": "rms",
+  "layer_norm_with_affine": true,
+  "max_sequence_length": 2048,
+  "mlp_hidden_size": null,
+  "mlp_ratio": 8,
+  "model_type": "llava",
+  "multi_query_attention": false,
+  "n_heads": 16,
+  "n_layers": 16,
+  "pad_token_id": 1,
+  "precision": "amp_bf16",
+  "residual_dropout": 0.0,
+  "rope": true,
+  "rope_full_precision": true,
+  "scale_logits": false,
+  "ternary": true,
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 50280,
+  "inference_mode":false,
+  "weight_tying": true,
+  "auto_map": {
+    "AutoConfig": "configuration_olmo.OLMoConfig",
+    "AutoModelForCausalLM": "modeling_olmo.OLMoForCausalLM"
+  },
+  "freeze_mm_vision_resampler": false,
+  "mm_hidden_size": 1024,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_resampler_type": null,
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "openai/clip-vit-large-patch14-336",
+  "tune_mm_mlp_adapter": false,
+  "tune_mm_vision_resampler": false,
+  "unfreeze_mm_vision_tower": false,
+  "use_mm_proj": true,
+  "image_aspect_ratio": "pad"
+}

llava/constants.py ADDED Viewed

	@@ -0,0 +1,13 @@

+CONTROLLER_HEART_BEAT_EXPIRATION = 30  # units not stated here; presumably seconds — TODO confirm
+WORKER_HEART_BEAT_INTERVAL = 15  # units not stated here; presumably seconds — TODO confirm
+LOGDIR = "."  # relative path "."
+# Model Constants
+IGNORE_INDEX = -100  # presumably the label id excluded from the loss — TODO confirm against training code
+IMAGE_TOKEN_INDEX = -200  # presumably a placeholder token id marking image positions — TODO confirm
+DEFAULT_IMAGE_TOKEN = "<image>"  # same literal that conversation.py's get_prompt strips and re-inserts
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"

llava/conversation.py ADDED Viewed

	@@ -0,0 +1,396 @@

+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+import base64
+from io import BytesIO
+from PIL import Image
+class SeparatorStyle(Enum):  # each member selects one formatting branch of Conversation.get_prompt
+    """Different separator style."""
+    SINGLE = auto()  # "role: message" turns, each followed by sep
+    TWO = auto()  # like SINGLE, but alternates sep / sep2 after successive turns
+    MPT = auto()  # role string directly followed by the message, then sep
+    PLAIN = auto()  # message text only (no role text), alternating sep / sep2
+    LLAMA_2 = auto()  # "[INST] ... [/INST]" wrapping with an optional "<<SYS>>" block
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+    skip_next: bool = False
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            if 'mmtag' in self.version:
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            else:
+                messages[0] = (init_role, "<image>\n" + init_msg)
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+            for i, (role, message) in enumerate(messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    if i == 0: message = wrap_sys(self.system) + message
+                    if i % 2 == 0:
+                        message = wrap_inst(message)
+                        ret += self.sep + message
+                    else:
+                        ret += " " + message + " " + self.sep2
+                else:
+                    ret += ""
+            ret = ret.lstrip(self.sep)
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        return ret
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
+        if image_process_mode == "Pad":
+            def expand2square(pil_img, background_color=(122, 116, 104)):
+                width, height = pil_img.size
+                if width == height:
+                    return pil_img
+                elif width > height:
+                    result = Image.new(pil_img.mode, (width, width), background_color)
+                    result.paste(pil_img, (0, (width - height) // 2))
+                    return result
+                else:
+                    result = Image.new(pil_img.mode, (height, height), background_color)
+                    result.paste(pil_img, ((height - width) // 2, 0))
+                    return result
+            image = expand2square(image)
+        elif image_process_mode in ["Default", "Crop"]:
+            pass
+        elif image_process_mode == "Resize":
+            image = image.resize((336, 336))
+        else:
+            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+        if max(image.size) > max_len:
+            max_hw, min_hw = max(image.size), min(image.size)
+            aspect_ratio = max_hw / min_hw
+            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+            longest_edge = int(shortest_edge * aspect_ratio)
+            W, H = image.size
+            if H > W:
+                H, W = longest_edge, shortest_edge
+            else:
+                H, W = shortest_edge, longest_edge
+            image = image.resize((W, H))
+        if return_pil:
+            return image
+        else:
+            buffered = BytesIO()
+            image.save(buffered, format=image_format)
+            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+            return img_b64_str
+    def get_images(self, return_pil=False):
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
+                    images.append(image)
+        return images
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    img_b64_str = self.process_image(
+                        image, "Default", return_pil=False,
+                        image_format='JPEG')
+                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = img_str + msg.replace('<image>', '').strip()
+                    ret.append([msg, None])
+                else:
+                    ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+    def dict(self):
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+conv_vicuna_v0 = Conversation(  # SINGLE style with "###"; ships one example exchange that offset=2 skips
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+        ("Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+conv_vicuna_v1 = Conversation(  # TWO style: " " after even-indexed turns, "</s>" after odd-indexed turns
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+conv_llama_2 = Conversation(  # LLAMA_2 style with a long safety-oriented system prompt
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+conv_llava_llama_2 = Conversation(  # LLAMA_2 style with a language-and-vision system prompt
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+conv_mpt = Conversation(  # MPT style: "<|im_start|>" role headers, "<|im_end|>" separator
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+conv_llava_plain = Conversation(  # PLAIN style; NOTE(review): sep2 is unset, so a non-empty odd-indexed turn would append None in get_prompt — confirm usage
+    system="",
+    roles=("", ""),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+conv_llava_v0 = Conversation(  # same system prompt and SINGLE style as conv_vicuna_v0, without the example exchange
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+conv_llava_v0_mmtag = Conversation(  # SINGLE style; version contains 'mmtag', so get_prompt sends the image as its own exchange
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+    version="v0_mmtag",
+)
+conv_llava_v1 = Conversation(  # TWO style with " " / "</s>" separators
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+conv_llava_v1_mmtag = Conversation(  # TWO style; version contains 'mmtag', so get_prompt sends the image as its own exchange
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+    version="v1_mmtag",
+)
+conv_mistral_instruct = Conversation(  # LLAMA_2 style with an empty system prompt and an empty first separator
+    system="",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="",
+    sep2="</s>",
+)
+conv_chatml_direct = Conversation(  # MPT style with a minimal "Answer the questions." system prompt
+    system="""<|im_start|>system
+Answer the questions.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+default_conversation = conv_vicuna_v1  # NOTE(review): conv_templates["default"] below is conv_vicuna_v0 — confirm the mismatch is intended
+conv_templates = {  # template name -> Conversation instance; several names alias the same object
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "llama_2": conv_llama_2,
+    "mistral_instruct": conv_mistral_instruct,
+    "chatml_direct": conv_chatml_direct,
+    "mistral_direct": conv_chatml_direct,
+    "plain": conv_llava_plain,
+    "v0_plain": conv_llava_plain,
+    "llava_v0": conv_llava_v0,
+    "v0_mmtag": conv_llava_v0_mmtag,
+    "llava_v1": conv_llava_v1,
+    "v1_mmtag": conv_llava_v1_mmtag,
+    "llava_llama_2": conv_llava_llama_2,
+    "mpt": conv_mpt,
+}
+if __name__ == "__main__":  # when run as a script, print the default template's rendered prompt
+    print(default_conversation.get_prompt())

llava/eval/__pycache__/eval_textvqa.cpython-311.pyc ADDED Viewed

Binary file (4.62 kB). View file

llava/eval/__pycache__/eval_textvqa.cpython-312.pyc ADDED Viewed

Binary file (4 kB). View file

llava/eval/__pycache__/m4c_evaluator.cpython-311.pyc ADDED Viewed

Binary file (16.4 kB). View file

llava/eval/__pycache__/m4c_evaluator.cpython-312.pyc ADDED Viewed

Binary file (14.2 kB). View file

llava/eval/__pycache__/model_vqa.cpython-311.pyc ADDED Viewed

Binary file (7.37 kB). View file

llava/eval/__pycache__/model_vqa_loader.cpython-311.pyc ADDED Viewed

Binary file (10.3 kB). View file

llava/eval/__pycache__/model_vqa_loader.cpython-312.pyc ADDED Viewed

Binary file (8.97 kB). View file

llava/eval/eval_gpt_review.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import argparse
+import json
+import os
+import openai
+import tqdm
+import ray
+import time
+NUM_SECONDS_TO_SLEEP = 3
+@ray.remote(num_cpus=4)
+def get_eval(content: str, max_tokens: int):
+    while True:
+        try:
+            response = openai.ChatCompletion.create(
+                model='gpt-4',
+                messages=[{
+                    'role': 'system',
+                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
+                }, {
+                    'role': 'user',
+                    'content': content,
+                }],
+                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
+                max_tokens=max_tokens,
+            )
+            break
+        except openai.error.RateLimitError:
+            pass
+        except Exception as e:
+            print(e)
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+    print('success!')
+    return response['choices'][0]['message']['content']
+def parse_score(review):
+    try:
+        score_pair = review.split('\n')[0]
+        score_pair = score_pair.replace(',', ' ')
+        sp = score_pair.split(' ')
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+        else:
+            print('error', review)
+            return [-1, -1]
+    except Exception as e:
+        print(e)
+        print('error', review)
+        return [-1, -1]
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    parser.add_argument('-q', '--question')
+    # parser.add_argument('-a', '--answer')
+    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
+    parser.add_argument('-r', '--rule')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+    ray.init()
+    f_q = open(os.path.expanduser(args.question))
+    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
+    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
+    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
+    review_file = open(f'{args.output}', 'w')
+    js_list = []
+    handles = []
+    idx = 0
+    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+        # if idx == 1:
+        #     break
+        ques = json.loads(ques_js)
+        ans1 = json.loads(ans1_js)
+        ans2 = json.loads(ans2_js)
+        category = json.loads(ques_js)['category']
+        if category in rule_dict:
+            rule = rule_dict[category]
+        else:
+            rule = rule_dict['default']
+        prompt = rule['prompt']
+        role = rule['role']
+        content = (f'[Question]\n{ques["text"]}\n\n'
+                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                   f'[System]\n{prompt}\n\n')
+        js_list.append({
+            'id': idx+1,
+            'question_id': ques['question_id'],
+            'answer1_id': ans1['answer_id'],
+            'answer2_id': ans2['answer_id'],
+            'category': category})
+        idx += 1
+        handles.append(get_eval.remote(content, args.max_tokens))
+        # To avoid the rate limit set by OpenAI
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+    reviews = ray.get(handles)
+    for idx, review in enumerate(reviews):
+        scores = parse_score(review)
+        js_list[idx]['content'] = review
+        js_list[idx]['tuple'] = scores
+        review_file.write(json.dumps(js_list[idx]) + '\n')
+    review_file.close()

llava/eval/eval_gpt_review_bench.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import argparse
+import json
+import os
+import openai
+import time
+NUM_SECONDS_TO_SLEEP = 0.5
+def get_eval(content: str, max_tokens: int):
+    while True:
+        try:
+            response = openai.ChatCompletion.create(
+                model='gpt-4-0314',
+                messages=[{
+                    'role': 'system',
+                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
+                }, {
+                    'role': 'user',
+                    'content': content,
+                }],
+                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
+                max_tokens=max_tokens,
+            )
+            break
+        except openai.error.RateLimitError:
+            pass
+        except Exception as e:
+            print(e)
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+    return response['choices'][0]['message']['content']
+def parse_score(review):
+    try:
+        score_pair = review.split('\n')[0]
+        score_pair = score_pair.replace(',', ' ')
+        sp = score_pair.split(' ')
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+        else:
+            print('error', review)
+            return [-1, -1]
+    except Exception as e:
+        print(e)
+        print('error', review)
+        return [-1, -1]
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    parser.add_argument('-q', '--question')
+    parser.add_argument('-c', '--context')
+    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
+    parser.add_argument('-r', '--rule')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+    f_q = open(os.path.expanduser(args.question))
+    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
+    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
+    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
+    if os.path.isfile(os.path.expanduser(args.output)):
+        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
+    else:
+        cur_reviews = []
+    review_file = open(f'{args.output}', 'a')
+    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
+    image_to_context = {context['image']: context for context in context_list}
+    handles = []
+    idx = 0
+    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+        ques = json.loads(ques_js)
+        ans1 = json.loads(ans1_js)
+        ans2 = json.loads(ans2_js)
+        inst = image_to_context[ques['image']]
+        if isinstance(inst['caption'], list):
+            cap_str = '\n'.join(inst['caption'])
+        else:
+            cap_str = inst['caption']
+        category = 'llava_bench_' + json.loads(ques_js)['category']
+        if category in rule_dict:
+            rule = rule_dict[category]
+        else:
+            assert False, f"Visual QA category not found in rule file: {category}."
+        prompt = rule['prompt']
+        role = rule['role']
+        content = (f'[Context]\n{cap_str}\n\n'
+                   f'[Question]\n{ques["text"]}\n\n'
+                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                   f'[System]\n{prompt}\n\n')
+        cur_js = {
+            'id': idx+1,
+            'question_id': ques['question_id'],
+            'answer1_id': ans1.get('answer_id', ans1['question_id']),
+            'answer2_id': ans2.get('answer_id', ans2['answer_id']),
+            'category': category
+        }
+        if idx >= len(cur_reviews):
+            review = get_eval(content, args.max_tokens)
+            scores = parse_score(review)
+            cur_js['content'] = review
+            cur_js['tuple'] = scores
+            review_file.write(json.dumps(cur_js) + '\n')
+            review_file.flush()
+        else:
+            print(f'Skipping {idx} as we already have it.')
+        idx += 1
+        print(idx)
+    review_file.close()

llava/eval/eval_gpt_review_visual.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import argparse
+import json
+import os
+import openai
+import time
+NUM_SECONDS_TO_SLEEP = 0.5
+def get_eval(content: str, max_tokens: int):
+    while True:
+        try:
+            response = openai.ChatCompletion.create(
+                model='gpt-4-0314',
+                messages=[{
+                    'role': 'system',
+                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
+                }, {
+                    'role': 'user',
+                    'content': content,
+                }],
+                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
+                max_tokens=max_tokens,
+            )
+            break
+        except openai.error.RateLimitError:
+            pass
+        except Exception as e:
+            print(e)
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+    return response['choices'][0]['message']['content']
+def parse_score(review):
+    try:
+        score_pair = review.split('\n')[0]
+        score_pair = score_pair.replace(',', ' ')
+        sp = score_pair.split(' ')
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+        else:
+            print('error', review)
+            return [-1, -1]
+    except Exception as e:
+        print(e)
+        print('error', review)
+        return [-1, -1]
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    parser.add_argument('-q', '--question')
+    parser.add_argument('-c', '--context')
+    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
+    parser.add_argument('-r', '--rule')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+    f_q = open(os.path.expanduser(args.question))
+    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
+    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
+    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
+    if os.path.isfile(os.path.expanduser(args.output)):
+        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
+    else:
+        cur_reviews = []
+    review_file = open(f'{args.output}', 'a')
+    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
+    image_to_context = {context['image']: context for context in context_list}
+    handles = []
+    idx = 0
+    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+        ques = json.loads(ques_js)
+        ans1 = json.loads(ans1_js)
+        ans2 = json.loads(ans2_js)
+        inst = image_to_context[ques['image']]
+        cap_str = '\n'.join(inst['captions'])
+        box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
+        category = json.loads(ques_js)['category']
+        if category in rule_dict:
+            rule = rule_dict[category]
+        else:
+            assert False, f"Visual QA category not found in rule file: {category}."
+        prompt = rule['prompt']
+        role = rule['role']
+        content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
+                   f'[Question]\n{ques["text"]}\n\n'
+                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                   f'[System]\n{prompt}\n\n')
+        cur_js = {
+            'id': idx+1,
+            'question_id': ques['question_id'],
+            'answer1_id': ans1.get('answer_id', ans1['question_id']),
+            'answer2_id': ans2.get('answer_id', ans2['answer_id']),
+            'category': category
+        }
+        if idx >= len(cur_reviews):
+            review = get_eval(content, args.max_tokens)
+            scores = parse_score(review)
+            cur_js['content'] = review
+            cur_js['tuple'] = scores
+            review_file.write(json.dumps(cur_js) + '\n')
+            review_file.flush()
+        else:
+            print(f'Skipping {idx} as we already have it.')
+        idx += 1
+        print(idx)
+    review_file.close()

llava/eval/eval_pope.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import os
+import json
+import argparse
+def eval_pope(answers, label_file):
+    label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
+    for answer in answers:
+        text = answer['text']
+        # Only keep the first sentence
+        if text.find('.') != -1:
+            text = text.split('.')[0]
+        text = text.replace(',', '')
+        words = text.split(' ')
+        if 'No' in words or 'not' in words or 'no' in words:
+            answer['text'] = 'no'
+        else:
+            answer['text'] = 'yes'
+    for i in range(len(label_list)):
+        if label_list[i] == 'no':
+            label_list[i] = 0
+        else:
+            label_list[i] = 1
+    pred_list = []
+    for answer in answers:
+        if answer['text'] == 'no':
+            pred_list.append(0)
+        else:
+            pred_list.append(1)
+    pos = 1
+    neg = 0
+    yes_ratio = pred_list.count(1) / len(pred_list)
+    TP, TN, FP, FN = 0, 0, 0, 0
+    for pred, label in zip(pred_list, label_list):
+        if pred == pos and label == pos:
+            TP += 1
+        elif pred == pos and label == neg:
+            FP += 1
+        elif pred == neg and label == neg:
+            TN += 1
+        elif pred == neg and label == pos:
+            FN += 1
+    print('TP\tFP\tTN\tFN\t')
+    print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))
+    precision = float(TP) / float(TP + FP)
+    recall = float(TP) / float(TP + FN)
+    f1 = 2*precision*recall / (precision + recall)
+    acc = (TP + TN) / (TP + TN + FP + FN)
+    print('Accuracy: {}'.format(acc))
+    print('Precision: {}'.format(precision))
+    print('Recall: {}'.format(recall))
+    print('F1 score: {}'.format(f1))
+    print('Yes ratio: {}'.format(yes_ratio))
+    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--annotation-dir", type=str)
+    parser.add_argument("--question-file", type=str)
+    parser.add_argument("--result-file", type=str)
+    args = parser.parse_args()
+    questions = [json.loads(line) for line in open(args.question_file)]
+    questions = {question['question_id']: question for question in questions}
+    answers = [json.loads(q) for q in open(args.result_file)]
+    for file in os.listdir(args.annotation_dir):
+        assert file.startswith('coco_pope_')
+        assert file.endswith('.json')
+        category = file[10:-5]
+        cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
+        print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
+        eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
+        print("====================================")

llava/eval/eval_science_qa.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import argparse
+import json
+import os
+import re
+import random
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--base-dir', type=str)
+    parser.add_argument('--result-file', type=str)
+    parser.add_argument('--output-file', type=str)
+    parser.add_argument('--output-result', type=str)
+    parser.add_argument('--split', type=str, default='test')
+    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
+    return parser.parse_args()
+def convert_caps(results):
+    fakecaps = []
+    for result in results:
+        image_id = result['question_id']
+        caption = result['text']
+        fakecaps.append({"image_id": int(image_id), "caption": caption})
+    return fakecaps
+def get_pred_idx(prediction, choices, options):
+    """
+    Get the index (e.g. 2) from the prediction (e.g. 'C')
+    """
+    if prediction in options[:len(choices)]:
+        return options.index(prediction)
+    else:
+        return -1
+        return random.choice(range(len(choices)))
+if __name__ == "__main__":
+    args = get_args()
+    base_dir = args.base_dir
+    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
+    problems = json.load(open(os.path.join(base_dir, "problems.json")))
+    predictions = [json.loads(line) for line in open(args.result_file)]
+    predictions = {pred['question_id']: pred for pred in predictions}
+    split_problems = {idx: problems[idx] for idx in split_indices}
+    results = {'correct': [], 'incorrect': []}
+    sqa_results = {}
+    sqa_results['acc'] = None
+    sqa_results['correct'] = None
+    sqa_results['count'] = None
+    sqa_results['results'] = {}
+    sqa_results['outputs'] = {}
+    for prob_id, prob in split_problems.items():
+        if prob_id not in predictions:
+            pred = {'text': 'FAILED', 'prompt': 'Unknown'}
+            pred_text = 'FAILED'
+        else:
+            pred = predictions[prob_id]
+            pred_text = pred['text']
+        if pred_text in args.options:
+            answer = pred_text
+        elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
+            answer = pred_text[0]
+        else:
+            pattern = re.compile(r'The answer is ([A-Z]).')
+            res = pattern.findall(pred_text)
+            if len(res) == 1:
+                answer = res[0]  # 'A', 'B', ...
+            else:
+                answer = "FAILED"
+        pred_idx = get_pred_idx(answer, prob['choices'], args.options)
+        analysis = {
+            'question_id': prob_id,
+            'parsed_ans': answer,
+            'ground_truth': args.options[prob['answer']],
+            'question': pred['prompt'],
+            'pred': pred_text,
+            'is_multimodal': '<image>' in pred['prompt'],
+        }
+        sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
+        sqa_results['outputs'][prob_id] = pred_text
+        if pred_idx == prob['answer']:
+            results['correct'].append(analysis)
+        else:
+            results['incorrect'].append(analysis)
+    correct = len(results['correct'])
+    total = len(results['correct']) + len(results['incorrect'])
+    ###### IMG ######
+    multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
+    multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
+    multimodal_total = multimodal_correct + multimodal_incorrect
+    ###### IMG ######
+    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%')
+    sqa_results['acc'] = correct / total * 100
+    sqa_results['correct'] = correct
+    sqa_results['count'] = total
+    with open(args.output_file, 'w') as f:
+        json.dump(results, f, indent=2)
+    with open(args.output_result, 'w') as f:
+        json.dump(sqa_results, f, indent=2)

llava/eval/eval_science_qa_gpt4.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import argparse
+import json
+import os
+import re
+import random
+from collections import defaultdict
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--base-dir', type=str)
+    parser.add_argument('--gpt4-result', type=str)
+    parser.add_argument('--our-result', type=str)
+    parser.add_argument('--split', type=str, default='test')
+    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
+    return parser.parse_args()
+def convert_caps(results):
+    fakecaps = []
+    for result in results:
+        image_id = result['question_id']
+        caption = result['text']
+        fakecaps.append({"image_id": int(image_id), "caption": caption})
+    return fakecaps
+def get_pred_idx(prediction, choices, options):
+    """
+    Get the index (e.g. 2) from the prediction (e.g. 'C')
+    """
+    if prediction in options[:len(choices)]:
+        return options.index(prediction)
+    else:
+        return random.choice(range(len(choices)))
+if __name__ == "__main__":
+    args = get_args()
+    base_dir = args.base_dir
+    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
+    problems = json.load(open(os.path.join(base_dir, "problems.json")))
+    our_predictions = [json.loads(line) for line in open(args.our_result)]
+    our_predictions = {pred['question_id']: pred for pred in our_predictions}
+    split_problems = {idx: problems[idx] for idx in split_indices}
+    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
+    results = defaultdict(lambda: 0)
+    for prob_id, prob in split_problems.items():
+        if prob_id not in our_predictions:
+            continue
+        if prob_id not in gpt4_predictions:
+            continue
+        our_pred = our_predictions[prob_id]['text']
+        gpt4_pred = gpt4_predictions[prob_id]
+        pattern = re.compile(r'The answer is ([A-Z]).')
+        our_res = pattern.findall(our_pred)
+        if len(our_res) == 1:
+            our_answer = our_res[0]  # 'A', 'B', ...
+        else:
+            our_answer = "FAILED"
+        gpt4_res = pattern.findall(gpt4_pred)
+        if len(gpt4_res) == 1:
+            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
+        else:
+            gpt4_answer = "FAILED"
+        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
+        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
+        if gpt4_answer == 'FAILED':
+            results['gpt4_failed'] += 1
+            # continue
+            gpt4_pred_idx = our_pred_idx
+            # if our_pred_idx != prob['answer']:
+            #     print(our_predictions[prob_id]['prompt'])
+            #     print('-----------------')
+            #     print(f'LECTURE: {prob["lecture"]}')
+            #     print(f'SOLUTION: {prob["solution"]}')
+            #     print('=====================')
+        else:
+            # continue
+            pass
+        # gpt4_pred_idx = our_pred_idx
+        if gpt4_pred_idx == prob['answer']:
+            results['correct'] += 1
+        else:
+            results['incorrect'] += 1
+        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
+            results['correct_upperbound'] += 1
+    correct = results['correct']
+    total = results['correct'] + results['incorrect']
+    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
+    print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')

llava/eval/eval_science_qa_gpt4_requery.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import argparse
+import json
+import os
+import re
+import random
+from collections import defaultdict
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--base-dir', type=str)
+    parser.add_argument('--gpt4-result', type=str)
+    parser.add_argument('--requery-result', type=str)
+    parser.add_argument('--our-result', type=str)
+    parser.add_argument('--output-result', type=str)
+    parser.add_argument('--split', type=str, default='test')
+    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
+    return parser.parse_args()
+def convert_caps(results):
+    fakecaps = []
+    for result in results:
+        image_id = result['question_id']
+        caption = result['text']
+        fakecaps.append({"image_id": int(image_id), "caption": caption})
+    return fakecaps
+def get_pred_idx(prediction, choices, options):
+    """
+    Get the index (e.g. 2) from the prediction (e.g. 'C')
+    """
+    if prediction in options[:len(choices)]:
+        return options.index(prediction)
+    else:
+        return random.choice(range(len(choices)))
+if __name__ == "__main__":
+    args = get_args()
+    base_dir = args.base_dir
+    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
+    problems = json.load(open(os.path.join(base_dir, "problems.json")))
+    our_predictions = [json.loads(line) for line in open(args.our_result)]
+    our_predictions = {pred['question_id']: pred for pred in our_predictions}
+    split_problems = {idx: problems[idx] for idx in split_indices}
+    requery_predictions = [json.loads(line) for line in open(args.requery_result)]
+    requery_predictions = {pred['question_id']: pred for pred in requery_predictions}
+    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
+    results = defaultdict(lambda: 0)
+    sqa_results = {}
+    sqa_results['acc'] = None
+    sqa_results['correct'] = None
+    sqa_results['count'] = None
+    sqa_results['results'] = {}
+    sqa_results['outputs'] = {}
+    for prob_id, prob in split_problems.items():
+        if prob_id not in our_predictions:
+            assert False
+        if prob_id not in gpt4_predictions:
+            assert False
+        our_pred = our_predictions[prob_id]['text']
+        gpt4_pred = gpt4_predictions[prob_id]
+        if prob_id not in requery_predictions:
+            results['missing_requery'] += 1
+            requery_pred = "MISSING"
+        else:
+            requery_pred = requery_predictions[prob_id]['text']
+        pattern = re.compile(r'The answer is ([A-Z]).')
+        our_res = pattern.findall(our_pred)
+        if len(our_res) == 1:
+            our_answer = our_res[0]  # 'A', 'B', ...
+        else:
+            our_answer = "FAILED"
+        requery_res = pattern.findall(requery_pred)
+        if len(requery_res) == 1:
+            requery_answer = requery_res[0]  # 'A', 'B', ...
+        else:
+            requery_answer = "FAILED"
+        gpt4_res = pattern.findall(gpt4_pred)
+        if len(gpt4_res) == 1:
+            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
+        else:
+            gpt4_answer = "FAILED"
+        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
+        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
+        requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)
+        results['total'] += 1
+        if gpt4_answer == 'FAILED':
+            results['gpt4_failed'] += 1
+            if gpt4_pred_idx == prob['answer']:
+                results['gpt4_correct'] += 1
+            if our_pred_idx == prob['answer']:
+                results['gpt4_ourvisual_correct'] += 1
+        elif gpt4_pred_idx == prob['answer']:
+            results['gpt4_correct'] += 1
+            results['gpt4_ourvisual_correct'] += 1
+        if our_pred_idx == prob['answer']:
+            results['our_correct'] += 1
+        if requery_answer == 'FAILED':
+            sqa_results['results'][prob_id] = our_pred_idx
+            if our_pred_idx == prob['answer']:
+                results['requery_correct'] += 1
+        else:
+            sqa_results['results'][prob_id] = requery_pred_idx
+            if requery_pred_idx == prob['answer']:
+                results['requery_correct'] += 1
+            else:
+                print(f"""
+Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
+Our ({our_answer}): {our_pred}
+GPT-4 ({gpt4_answer}): {gpt4_pred}
+Requery ({requery_answer}): {requery_pred}
+print("=====================================")
+""")
+        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
+            results['correct_upperbound'] += 1
+    total = results['total']
+    print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
+    sqa_results['acc'] = results["requery_correct"] / total * 100
+    sqa_results['correct'] = results["requery_correct"]
+    sqa_results['count'] = total
+    with open(args.output_result, 'w') as f:
+        json.dump(sqa_results, f, indent=2)

llava/eval/eval_textvqa.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import os
+import argparse
+import json
+import re
+from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--annotation-file', type=str)
+    parser.add_argument('--result-file', type=str)
+    parser.add_argument('--result-dir', type=str)
+    return parser.parse_args()
+def prompt_processor(prompt):
+    if prompt.startswith('OCR tokens: '):
+        pattern = r"Question: (.*?) Short answer:"
+        match = re.search(pattern, prompt, re.DOTALL)
+        question = match.group(1)
+    elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
+        if prompt.startswith('Reference OCR token:'):
+            question = prompt.split('\n')[1]
+        else:
+            question = prompt.split('\n')[0]
+    elif len(prompt.split('\n')) == 2:
+        question = prompt.split('\n')[0]
+    else:
+        assert False
+    return question.lower()
+def eval_single(annotation_file, result_file):
+    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
+    print(experiment_name)
+    annotations = json.load(open(annotation_file))['data']
+    annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
+    results = [json.loads(line) for line in open(result_file)]
+    pred_list = []
+    for result in results:
+        annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
+        pred_list.append({
+            "pred_answer": result['text'],
+            "gt_answers": annotation['answers'],
+        })
+    evaluator = TextVQAAccuracyEvaluator()
+    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
+if __name__ == "__main__":
+    args = get_args()
+    if args.result_file is not None:
+        eval_single(args.annotation_file, args.result_file)
+    if args.result_dir is not None:
+        for result_file in sorted(os.listdir(args.result_dir)):
+            if not result_file.endswith('.jsonl'):
+                print(f'Skipping {result_file}')
+                continue
+            eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))

llava/eval/generate_webpage_data_from_table.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""Generate json file for webpage."""
+import json
+import os
+import re
+# Model names written under the 'models' key of webpage/data.json.
+# models = ['llama', 'alpaca', 'gpt35', 'bard']
+models = ['vicuna']
+def read_jsonl(path: str, key: str=None):
+    data = []
+    with open(os.path.expanduser(path)) as f:
+        for line in f:
+            if not line:
+                continue
+            data.append(json.loads(line))
+    if key is not None:
+        data.sort(key=lambda x: x[key])
+        data = {item[key]: item for item in data}
+    return data
+def trim_hanging_lines(s: str, n: int) -> str:
+    s = s.strip()
+    for _ in range(n):
+        s = s.split('\n', 1)[1].strip()
+    return s
+if __name__ == '__main__':
+    # Build webpage/data.json: one record per question holding the question,
+    # the answers of each model, the review text and the review scores.
+    # Every table is loaded keyed by 'question_id' (see read_jsonl).
+    questions = read_jsonl('table/question.jsonl', key='question_id')
+    # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
+    # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
+    # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
+    # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
+    vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
+    ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')
+    review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
+    # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
+    # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
+    # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
+    # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')
+    records = []
+    # A question id missing from any of the answer/review tables raises
+    # KeyError in the lookups below.
+    for qid in questions.keys():
+        r = {
+            'id': qid,
+            'category': questions[qid]['category'],
+            'question': questions[qid]['text'],
+            'answers': {
+                # 'alpaca': alpaca_answers[qid]['text'],
+                # 'llama': llama_answers[qid]['text'],
+                # 'bard': bard_answers[qid]['text'],
+                # 'gpt35': gpt35_answers[qid]['text'],
+                'vicuna': vicuna_answers[qid]['text'],
+                'ours': ours_answers[qid]['text'],
+            },
+            'evaluations': {
+                # 'alpaca': review_alpaca[qid]['text'],
+                # 'llama': review_llama[qid]['text'],
+                # 'bard': review_bard[qid]['text'],
+                'vicuna': review_vicuna[qid]['content'],
+                # 'gpt35': review_gpt35[qid]['text'],
+            },
+            'scores': {
+                'vicuna': review_vicuna[qid]['tuple'],
+                # 'alpaca': review_alpaca[qid]['score'],
+                # 'llama': review_llama[qid]['score'],
+                # 'bard': review_bard[qid]['score'],
+                # 'gpt35': review_gpt35[qid]['score'],
+            },
+        }
+        # cleanup data
+        cleaned_evals = {}
+        for k, v in r['evaluations'].items():
+            v = v.strip()
+            lines = v.split('\n')
+            # trim the first line if it's a pair of numbers
+            if re.match(r'\d+[, ]+\d+', lines[0]):
+                lines = lines[1:]
+            v = '\n'.join(lines)
+            # Wrap the assistant names in ** markers (Markdown-style bold).
+            cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')
+        r['evaluations'] = cleaned_evals
+        records.append(r)
+    # Reorder the records, this is optional
+    # The ids are remapped in three successive passes; each pass works on the
+    # ids produced by the previous one, so the order of the loops matters.
+    # Pass 1: ids up to 20 move up by 60, all others move down by 20.
+    for r in records:
+        if r['id'] <= 20:
+            r['id'] += 60
+        else:
+            r['id'] -= 20
+    # Pass 2: ids up to 50 move up by 10, ids 51..60 move down by 50.
+    for r in records:
+        if r['id'] <= 50:
+            r['id'] += 10
+        elif 50 < r['id'] <= 60:
+            r['id'] -= 50
+    # Pass 3: id 7 becomes 1 and ids below 7 shift up by one.
+    for r in records:
+        if r['id'] == 7:
+            r['id'] = 1
+        elif r['id'] < 7:
+            r['id'] += 1
+    records.sort(key=lambda x: x['id'])
+    # Write to file
+    with open('webpage/data.json', 'w') as f:
+        json.dump({'questions': records, 'models': models}, f, indent=2)

llava/eval/m4c_evaluator.py ADDED Viewed

	@@ -0,0 +1,334 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import re
+from tqdm import tqdm
+class EvalAIAnswerProcessor:
+    """
+    Processes an answer similar to Eval AI
+        copied from
+        https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
+    """
+    CONTRACTIONS = {
+        "aint": "ain't",
+        "arent": "aren't",
+        "cant": "can't",
+        "couldve": "could've",
+        "couldnt": "couldn't",
+        "couldn'tve": "couldn't've",
+        "couldnt've": "couldn't've",
+        "didnt": "didn't",
+        "doesnt": "doesn't",
+        "dont": "don't",
+        "hadnt": "hadn't",
+        "hadnt've": "hadn't've",
+        "hadn'tve": "hadn't've",
+        "hasnt": "hasn't",
+        "havent": "haven't",
+        "hed": "he'd",
+        "hed've": "he'd've",
+        "he'dve": "he'd've",
+        "hes": "he's",
+        "howd": "how'd",
+        "howll": "how'll",
+        "hows": "how's",
+        "Id've": "I'd've",
+        "I'dve": "I'd've",
+        "Im": "I'm",
+        "Ive": "I've",
+        "isnt": "isn't",
+        "itd": "it'd",
+        "itd've": "it'd've",
+        "it'dve": "it'd've",
+        "itll": "it'll",
+        "let's": "let's",
+        "maam": "ma'am",
+        "mightnt": "mightn't",
+        "mightnt've": "mightn't've",
+        "mightn'tve": "mightn't've",
+        "mightve": "might've",
+        "mustnt": "mustn't",
+        "mustve": "must've",
+        "neednt": "needn't",
+        "notve": "not've",
+        "oclock": "o'clock",
+        "oughtnt": "oughtn't",
+        "ow's'at": "'ow's'at",
+        "'ows'at": "'ow's'at",
+        "'ow'sat": "'ow's'at",
+        "shant": "shan't",
+        "shed've": "she'd've",
+        "she'dve": "she'd've",
+        "she's": "she's",
+        "shouldve": "should've",
+        "shouldnt": "shouldn't",
+        "shouldnt've": "shouldn't've",
+        "shouldn'tve": "shouldn't've",
+        "somebody'd": "somebodyd",
+        "somebodyd've": "somebody'd've",
+        "somebody'dve": "somebody'd've",
+        "somebodyll": "somebody'll",
+        "somebodys": "somebody's",
+        "someoned": "someone'd",
+        "someoned've": "someone'd've",
+        "someone'dve": "someone'd've",
+        "someonell": "someone'll",
+        "someones": "someone's",
+        "somethingd": "something'd",
+        "somethingd've": "something'd've",
+        "something'dve": "something'd've",
+        "somethingll": "something'll",
+        "thats": "that's",
+        "thered": "there'd",
+        "thered've": "there'd've",
+        "there'dve": "there'd've",
+        "therere": "there're",
+        "theres": "there's",
+        "theyd": "they'd",
+        "theyd've": "they'd've",
+        "they'dve": "they'd've",
+        "theyll": "they'll",
+        "theyre": "they're",
+        "theyve": "they've",
+        "twas": "'twas",
+        "wasnt": "wasn't",
+        "wed've": "we'd've",
+        "we'dve": "we'd've",
+        "weve": "we've",
+        "werent": "weren't",
+        "whatll": "what'll",
+        "whatre": "what're",
+        "whats": "what's",
+        "whatve": "what've",
+        "whens": "when's",
+        "whered": "where'd",
+        "wheres": "where's",
+        "whereve": "where've",
+        "whod": "who'd",
+        "whod've": "who'd've",
+        "who'dve": "who'd've",
+        "wholl": "who'll",
+        "whos": "who's",
+        "whove": "who've",
+        "whyll": "why'll",
+        "whyre": "why're",
+        "whys": "why's",
+        "wont": "won't",
+        "wouldve": "would've",
+        "wouldnt": "wouldn't",
+        "wouldnt've": "wouldn't've",
+        "wouldn'tve": "wouldn't've",
+        "yall": "y'all",
+        "yall'll": "y'all'll",
+        "y'allll": "y'all'll",
+        "yall'd've": "y'all'd've",
+        "y'alld've": "y'all'd've",
+        "y'all'dve": "y'all'd've",
+        "youd": "you'd",
+        "youd've": "you'd've",
+        "you'dve": "you'd've",
+        "youll": "you'll",
+        "youre": "you're",
+        "youve": "you've",
+    }
+    # Spelled-out numbers (plus "none") mapped to digit strings; process_digit_article
+    # looks each lower-cased word up in this table.
+    NUMBER_MAP = {
+        "none": "0",
+        "zero": "0",
+        "one": "1",
+        "two": "2",
+        "three": "3",
+        "four": "4",
+        "five": "5",
+        "six": "6",
+        "seven": "7",
+        "eight": "8",
+        "nine": "9",
+        "ten": "10",
+    }
+    # Words that process_digit_article drops from the answer.
+    ARTICLES = ["a", "an", "the"]
+    # NOTE(review): "(?!<=\d)" is a negative lookahead for the literal text "<=" followed by
+    # a digit, not a lookbehind, so this pattern effectively matches any "." that is not
+    # followed by a digit. A lookbehind "(?<!\d)" was presumably intended; changing it would
+    # alter normalized answers, so confirm against the reference evaluator before touching it.
+    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
+    # One or more commas sitting between two digits (e.g. the "," in "1,000").
+    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
+    # Characters that process_punctuation either deletes or replaces with a space.
+    PUNCTUATIONS = [
+        ";",
+        r"/",
+        "[",
+        "]",
+        '"',
+        "{",
+        "}",
+        "(",
+        ")",
+        "=",
+        "+",
+        "\\",
+        "_",
+        "-",
+        ">",
+        "<",
+        "@",
+        "`",
+        ",",
+        "?",
+        "!",
+    ]
+    def __init__(self, *args, **kwargs):
+        pass
+    def word_tokenize(self, word):
+        word = word.lower()
+        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
+        return word.strip()
+    def process_punctuation(self, in_text):
+        out_text = in_text
+        for p in self.PUNCTUATIONS:
+            if (p + " " in in_text or " " + p in in_text) or (
+                re.search(self.COMMA_STRIP, in_text) is not None
+            ):
+                out_text = out_text.replace(p, "")
+            else:
+                out_text = out_text.replace(p, " ")
+        out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
+        return out_text
+    def process_digit_article(self, in_text):
+        out_text = []
+        temp_text = in_text.lower().split()
+        for word in temp_text:
+            word = self.NUMBER_MAP.setdefault(word, word)
+            if word not in self.ARTICLES:
+                out_text.append(word)
+            else:
+                pass
+        for word_id, word in enumerate(out_text):
+            if word in self.CONTRACTIONS:
+                out_text[word_id] = self.CONTRACTIONS[word]
+        out_text = " ".join(out_text)
+        return out_text
+    def __call__(self, item):
+        item = self.word_tokenize(item)
+        item = item.replace("\n", " ").replace("\t", " ").strip()
+        item = self.process_punctuation(item)
+        item = self.process_digit_article(item)
+        return item
+class TextVQAAccuracyEvaluator:
+    def __init__(self):
+        self.answer_processor = EvalAIAnswerProcessor()
+    def _compute_answer_scores(self, raw_answers):
+        """
+        compute the accuracy (soft score) of human answers
+        """
+        answers = [self.answer_processor(a) for a in raw_answers]
+        assert len(answers) == 10
+        gt_answers = list(enumerate(answers))
+        unique_answers = set(answers)
+        unique_answer_scores = {}
+        for unique_answer in unique_answers:
+            accs = []
+            for gt_answer in gt_answers:
+                other_answers = [item for item in gt_answers if item != gt_answer]
+                matching_answers = [
+                    item for item in other_answers if item[1] == unique_answer
+                ]
+                acc = min(1, float(len(matching_answers)) / 3)
+                accs.append(acc)
+            unique_answer_scores[unique_answer] = sum(accs) / len(accs)
+        return unique_answer_scores
+    def eval_pred_list(self, pred_list):
+        pred_scores = []
+        for entry in tqdm(pred_list):
+            pred_answer = self.answer_processor(entry["pred_answer"])
+            unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
+            score = unique_answer_scores.get(pred_answer, 0.0)
+            pred_scores.append(score)
+        accuracy = sum(pred_scores) / len(pred_scores)
+        return accuracy
+class STVQAAccuracyEvaluator:
+    def __init__(self):
+        self.answer_processor = EvalAIAnswerProcessor()
+    def eval_pred_list(self, pred_list):
+        pred_scores = []
+        for entry in pred_list:
+            pred_answer = self.answer_processor(entry["pred_answer"])
+            gts = [self.answer_processor(a) for a in entry["gt_answers"]]
+            score = 1.0 if pred_answer in gts else 0.0
+            pred_scores.append(score)
+        accuracy = sum(pred_scores) / len(pred_scores)
+        return accuracy
+class STVQAANLSEvaluator:
+    def __init__(self):
+        import editdistance  # install with `pip install editdistance`
+        self.get_edit_distance = editdistance.eval
+    def get_anls(self, s1, s2):
+        s1 = s1.lower().strip()
+        s2 = s2.lower().strip()
+        iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
+        anls = iou if iou >= 0.5 else 0.0
+        return anls
+    def eval_pred_list(self, pred_list):
+        pred_scores = []
+        for entry in pred_list:
+            anls = max(
+                self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
+            )
+            pred_scores.append(anls)
+        accuracy = sum(pred_scores) / len(pred_scores)
+        return accuracy
+class TextCapsBleu4Evaluator:
+    def __init__(self):
+        # The following script requires Java 1.8.0 and pycocotools installed.
+        # The pycocoevalcap can be installed with pip as
+        # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
+        # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
+        # but has no python3 support yet.
+        try:
+            from pycocoevalcap.bleu.bleu import Bleu
+            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+        except ModuleNotFoundError:
+            print(
+                "Please install pycocoevalcap module using "
+                "pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
+            )
+            raise
+        self.tokenizer = PTBTokenizer()
+        self.scorer = Bleu(4)
+    def eval_pred_list(self, pred_list):
+        # Create reference and hypotheses captions.
+        gts = {}
+        res = {}
+        for idx, entry in enumerate(pred_list):
+            gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
+            res[idx] = [{"caption": entry["pred_answer"]}]
+        gts = self.tokenizer.tokenize(gts)
+        res = self.tokenizer.tokenize(res)
+        score, _ = self.scorer.compute_score(gts, res)
+        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
+        return bleu4