Spaces: Runtime error
zhanghaoji committed
Commit • eb0678a
1 Parent(s): 378cd97
init
Browse files
- app.py +172 -62
- app_old.py +63 -0
- flash_vstream/__init__.py +1 -0
- flash_vstream/constants.py +15 -0
- flash_vstream/conversation.py +337 -0
- flash_vstream/eval_video/eval_activitynet_qa.py +296 -0
- flash_vstream/eval_video/eval_any_dataset_features.py +340 -0
- flash_vstream/eval_video/model_msvd_qa.py +157 -0
- flash_vstream/eval_video/model_msvd_qa_featuresloader.py +179 -0
- flash_vstream/mm_utils.py +106 -0
- flash_vstream/model/__init__.py +1 -0
- flash_vstream/model/builder.py +139 -0
- flash_vstream/model/compress_functions.py +277 -0
- flash_vstream/model/language_model/vstream_llama.py +129 -0
- flash_vstream/model/multimodal_encoder/builder.py +13 -0
- flash_vstream/model/multimodal_encoder/clip_encoder.py +80 -0
- flash_vstream/model/multimodal_projector/builder.py +51 -0
- flash_vstream/model/vstream_arch.py +742 -0
- flash_vstream/serve/cli_video_stream.py +351 -0
- flash_vstream/serve/demo.py +144 -0
- flash_vstream/train/llama_flash_attn_monkey_patch.py +117 -0
- flash_vstream/train/llama_xformers_attn_monkey_patch.py +131 -0
- flash_vstream/train/train.py +1069 -0
- flash_vstream/train/train_mem.py +14 -0
- flash_vstream/train/train_xformers.py +15 -0
- flash_vstream/train/vstream_trainer.py +248 -0
- flash_vstream/utils.py +128 -0
- requirements.txt +1 -1
app.py
CHANGED
@@ -1,63 +1,173 @@
[Previous 63-line app.py replaced in full. New contents (173 lines):]

import torch
import gradio as gr
from flash_vstream.serve.demo import Chat, title_markdown, block_css
from flash_vstream.constants import *
from flash_vstream.conversation import conv_templates, Conversation
import os
from PIL import Image
import tempfile
import imageio
import shutil


model_path = "IVGSZ/Flash-VStream-7b"
load_8bit = False
load_4bit = False

def save_image_to_local(image):
    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
    image = Image.open(image)
    image.save(filename)
    return filename


def save_video_to_local(video_path):
    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4')
    shutil.copyfile(video_path, filename)
    return filename


def generate(video, textbox_in, first_run, state, state_, images_tensor):

    flag = 1
    if not textbox_in:
        if len(state_.messages) > 0:
            textbox_in = state_.messages[-1][1]
            state_.messages.pop(-1)
            flag = 0
        else:
            return "Please enter instruction"

    video = video if video else "none"

    if type(state) is not Conversation:
        state = conv_templates[conv_mode].copy()
        state_ = conv_templates[conv_mode].copy()
        images_tensor = []

    first_run = False if len(state.messages) > 0 else True

    text_en_in = textbox_in.replace("picture", "image")

    image_processor = handler.image_processor

    if os.path.exists(video):
        video_tensor = handler._get_rawvideo_dec(video, image_processor, max_frames=MAX_IMAGE_LENGTH)
        for img in video_tensor:
            images_tensor.append(image_processor(img, return_tensors='pt')['pixel_values'][0].to(handler.model.device, dtype=torch.float16))

    if os.path.exists(video):
        text_en_in = DEFAULT_IMAGE_TOKEN * len(video_tensor) + '\n' + text_en_in

    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
    state_.messages[-1] = (state_.roles[1], text_en_out)

    text_en_out = text_en_out.split('#')[0]
    textbox_out = text_en_out

    show_images = ""
    if os.path.exists(video):
        filename = save_video_to_local(video)
        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={filename}"></video>'

    if flag:
        state.append_message(state.roles[0], textbox_in + "\n" + show_images)
    state.append_message(state.roles[1], textbox_out)

    return (state, state_, state.to_gradio_chatbot(), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=None, interactive=True))


def regenerate(state, state_):
    state.messages.pop(-1)
    state_.messages.pop(-1)
    if len(state.messages) > 0:
        return state, state_, state.to_gradio_chatbot(), False
    return (state, state_, state.to_gradio_chatbot(), True)


def clear_history(state, state_):
    state = conv_templates[conv_mode].copy()
    state_ = conv_templates[conv_mode].copy()
    return (gr.update(value=None, interactive=True), \
        gr.update(value=None, interactive=True),\
        True, state, state_, state.to_gradio_chatbot(), [])


conv_mode = "simple"
handler = Chat(model_path, conv_mode=conv_mode, load_4bit=load_4bit, load_8bit=load_8bit)
if not os.path.exists("temp"):
    os.makedirs("temp")

print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())

with gr.Blocks(title='Flash-VStream', theme=gr.themes.Soft(), css=block_css) as demo:
    gr.Markdown(title_markdown)
    state = gr.State()
    state_ = gr.State()
    first_run = gr.State()
    images_tensor = gr.State()

    with gr.Row():
        with gr.Column(scale=3):
            video = gr.Video(label="Input Video")

        with gr.Column(scale=7):
            chatbot = gr.Chatbot(label="Flash-VStream", bubble_full_width=True).style(height=700)
            with gr.Row():
                with gr.Column(scale=8):
                    textbox = gr.Textbox(show_label=False,
                                         placeholder="Enter text and press Send",
                                         container=False)
                with gr.Column(scale=2, min_width=50):
                    submit_btn = gr.Button(value="Send", variant="primary", interactive=True)

    with gr.Row(visible=True) as button_row:
        flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
        regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
        clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

    cur_dir = os.path.dirname(os.path.abspath(__file__))

    with gr.Row():
        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/video2.mp4",
                    "Describe the video briefly.",
                ]
            ],
            inputs=[video, textbox],
        )

        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/video4.mp4",
                    "What is the boy doing?",
                ]
            ],
            inputs=[video, textbox],
        )

        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/video5.mp4",
                    "Why is this video funny?",
                ]
            ],
            inputs=[video, textbox],
        )

    submit_btn.click(generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])

    regenerate_btn.click(regenerate, [state, state_], [state, state_, chatbot, first_run]).then(
        generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])

    clear_btn.click(clear_history, [state, state_],
                    [video, textbox, first_run, state, state_, chatbot, images_tensor])


# app = gr.mount_gradio_app(app, demo, path="/")
demo.launch()
app_old.py
ADDED
@@ -0,0 +1,63 @@
import gradio as gr
from huggingface_hub import InferenceClient

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content

        response += token
        yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()
flash_vstream/__init__.py
ADDED
@@ -0,0 +1 @@
from flash_vstream.model import VStreamLlamaForCausalLM
flash_vstream/constants.py
ADDED
@@ -0,0 +1,15 @@
# Based on https://github.com/haotian-liu/LLaVA.

CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
flash_vstream/conversation.py
ADDED
@@ -0,0 +1,337 @@
# Based on https://github.com/haotian-liu/LLaVA.

import dataclasses
from enum import auto, Enum
from typing import List, Tuple


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":
                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(pil_img.mode, (width, width), background_color)
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(pil_img.mode, (height, height), background_color)
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result
                        image = expand2square(image)
                    elif image_process_mode in ["Default", "Crop"]:
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((336, 336))
                    else:
                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if longest_edge != max(image.size):
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="PNG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    msg, image, image_process_mode = msg
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
         "Renewable energy sources are those that can be replenished naturally in a relatively "
         "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
         "Non-renewable energy sources, on the other hand, are finite and will eventually be "
         "depleted, such as coal, oil, and natural gas. Here are some key differences between "
         "renewable and non-renewable energy sources:\n"
         "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
         "energy sources are finite and will eventually run out.\n"
         "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
         "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
         "and other negative effects.\n"
         "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
         "have lower operational costs than non-renewable sources.\n"
         "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
         "locations than non-renewable sources.\n"
         "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
         "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
         "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
         "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_vicuna_v1_mcq = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions. "
           "The assistant should give the number of correct answer.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_tiny = Conversation(
    system="""<|system|>
A conversation between a user and an AI assistant. The assistant gives short and honest answers.""",
    roles=("<|user|>\n", "<|assistant|>\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="</s>",
)

conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)


default_conversation = conv_vicuna_v1
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "plain": conv_plain,
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())
flash_vstream/eval_video/eval_activitynet_qa.py
ADDED
@@ -0,0 +1,296 @@
# Based on https://github.com/haotian-liu/LLaVA.

import os
import ast
import json
import openai
import argparse
from tqdm import tqdm
from time import sleep
from collections import defaultdict
from multiprocessing.pool import Pool

def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    parser.add_argument("--num_chunks", default=1, type=int, help="Result splits")
    parser.add_argument("--api_key", required=True, type=str, help="OpenAI API key")
    parser.add_argument("--api_type", default=None, type=str, help="OpenAI API type")
    parser.add_argument("--api_version", default=None, type=str, help="OpenAI API version")
    parser.add_argument("--api_base", default=None, type=str, help="OpenAI API base")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3
    Returns a score for correctness.
    """
    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the correctness score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Evaluate the correctness of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
                            "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
                    }
                ],
                temperature=0.002
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)
            sleep(0.5)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
            sleep(1)


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    if args.num_chunks > 1:
        pred_contents = []
        for _idx in range(args.num_chunks):
            file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
            pred_contents += [json.loads(line) for line in open(file)]

    else:
        file = os.path.join(args.pred_path, f"pred.json")
        pred_contents = [json.loads(line) for line in open(file)]

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['id']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['id'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['id'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['id']
        question = sample['question']
        answer = sample['answer']
        pred = sample['pred']
        qa_set = {"q": question, "a": answer, "pred": pred, "a_type": sample['answer_type'] if 'answer_type' in sample else None}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key  # Your API key here
    if args.api_type:
        openai.api_type = args.api_type
    if args.api_version:
        openai.api_version = args.api_version
    if args.api_base:
        openai.api_base = args.api_base  # Your API base here
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    incomplete_lengths = []
    for _ in range(100):
        try:
            # Files that have not been processed yet.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")
            incomplete_lengths.append(len(incomplete_files))
            if len(incomplete_lengths) > 5 and len(set(incomplete_lengths[-5:])) <= 1:
                print(f"incomplete_lengths: {incomplete_lengths}")
                print(f"incomplete_files: {incomplete_files}")
                print(f"completed_files: {completed_files}")
                print(f"failed for 5 times, break")
                break

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                assert 'pred' in content[0], f"Error: {file_name} don't has key=pred"
                assert 'score' in content[0], f"Error: {file_name} don't has key=score"
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    class ScoreMeter:
        def __init__(self):
            self.score_sum = 0
            self.count = 0
            self.yes_count = 0
            self.no_count = 0
            self.score_dict = {'yes': defaultdict(int), 'no': defaultdict(int)}

        def add_score(self, score, pred):
            self.score_sum += score
            self.count += 1
            pred_lower = pred.lower()
            if 'yes' in pred_lower:
                self.yes_count += 1
                self.score_dict['yes'][score] += 1
            elif 'no' in pred_lower:
                self.no_count += 1
                self.score_dict['no'][score] += 1

        def get_average_score(self):
            res = (self.score_sum / self.count) if self.count else 0
            return f"{res:.6f}"

        def get_accuracy(self, response_type):
            if response_type == 'yes':
                res = (self.yes_count / self.count) if self.count else 0
            elif response_type == 'no':
                res = (self.no_count / self.count) if self.count else 0
            else:
                res = 0
            return f"{res:.6f}"

    meter_dic = {'total': ScoreMeter()}
    for key, result in combined_contents.items():
        # Computing score
        score_match = result[0]['score']
        score = int(score_match)
        pred = result[0]['pred']

        meter_dic["total"].add_score(score, pred)
        if 'a_type' in result[1] and result[1]['a_type'] is not None:
            typ = str(result[1]['a_type'])
            if typ not in meter_dic:
                meter_dic[typ] = ScoreMeter()
            meter_dic[typ].add_score(score, pred)

            if 'next' in args.output_dir:
                typ = typ[0]
                if typ not in meter_dic:
                    meter_dic[typ] = ScoreMeter()
                meter_dic[typ].add_score(score, pred)

    csv_dic = {'acc': meter_dic["total"].get_accuracy('yes'), 'score': meter_dic["total"].get_average_score()}

    output = ""
    output += "Yes count: " + str(meter_dic["total"].yes_count) + "\n"
    output += "No count: " + str(meter_dic["total"].no_count) + "\n"
    output += "Accuracy: " + str(meter_dic["total"].get_accuracy('yes')) + "\n"
    output += "Average score: " + str(meter_dic["total"].get_average_score()) + "\n"
    output += "\n"
    output += "Total Score Yes/No distribution:\n"
    for key, value in meter_dic["total"].score_dict.items():
        output += f"{key}:\n"
        for k in range(0, 6):
            v = value[k]
            output += f"{k}: {v}\n"
    output += "\n"
    output += "Answer Type Score distribution:\n"
    output += 'Type, Accuracy, Avg_score\n'
    key_list = sorted([k for k in meter_dic.keys()])
    for key in key_list:
        output += f"{key}, {meter_dic[key].get_accuracy('yes')}, {meter_dic[key].get_average_score()}\n"
        csv_dic[key] = meter_dic[key].get_accuracy('yes')

    output += "\n"
    for k in csv_dic.keys():
        output += f"{k}, "
    output = output.rstrip(', ')  # Remove the trailing comma and space
    output += "\n"

    for k in csv_dic.keys():
        output += str(csv_dic[k]) + ", "
    output = output.rstrip(', ')  # Remove the trailing comma and space
    output += "\n"

    print(output)
    args.output_csv = args.output_json.replace(".json", ".csv")
    with open(args.output_csv, 'w') as f:
        f.write(output)

if __name__ == "__main__":
    main()
flash_vstream/eval_video/eval_any_dataset_features.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 Flash-VStream Authors
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import os
|
16 |
+
import argparse
|
17 |
+
import subprocess
|
18 |
+
import multiprocessing
|
19 |
+
|
20 |
+
def exec(cmd, sub=False, device=None):
|
21 |
+
print(f'exec: {cmd}')
|
22 |
+
if not sub:
|
23 |
+
if isinstance(cmd, list):
|
24 |
+
cmd = ' '.join(cmd)
|
25 |
+
os.system(cmd)
|
26 |
+
else:
|
27 |
+
my_env = os.environ.copy()
|
28 |
+
my_env["CUDA_VISIBLE_DEVICES"] = device
|
29 |
+
subprocess.run(cmd, env=my_env)
|
30 |
+
|
31 |
+
# multi gpu, feature
|
32 |
+
def eval_msvd(args):
|
33 |
+
model_path = args.model_path
|
34 |
+
num_chunks = args.num_chunks
|
35 |
+
if not args.only_eval:
|
36 |
+
processes = []
|
37 |
+
for idx in range(0, num_chunks):
|
38 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
39 |
+
"--model-path", model_path,
|
40 |
+
"--video_dir", "./data/eval_video/MSVD-QA/video_features",
|
41 |
+
"--gt_file", "./data/eval_video/MSVD-QA/test_qa.json",
|
42 |
+
"--output_dir", os.path.join(model_path, "evaluation", "msvd"),
|
43 |
+
"--output_name", "pred",
|
44 |
+
"--num-chunks", str(num_chunks),
|
45 |
+
"--chunk-idx", str(idx),
|
46 |
+
"--conv-mode", "vicuna_v1"]
|
47 |
+
p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
|
48 |
+
processes.append(p)
|
49 |
+
p.start() # 启动子进程
|
50 |
+
for p in processes:
|
51 |
+
p.join()
|
52 |
+
cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
|
53 |
+
"--pred_path", os.path.join(model_path, "evaluation", "msvd"),
|
54 |
+
"--output_dir", os.path.join(model_path, "evaluation", "msvd", "results"),
|
55 |
+
"--output_json", os.path.join(model_path, "evaluation", "msvd", "results.json"),
|
56 |
+
"--num_chunks", str(num_chunks),
|
57 |
+
"--num_tasks", "16",
|
58 |
+
"--api_key", args.api_key,
|
59 |
+
"--api_base", args.api_base,
|
60 |
+
"--api_type", args.api_type,
|
61 |
+
"--api_version", args.api_version,
|
62 |
+
]
|
63 |
+
exec(cmd)
|
64 |
+
|
65 |
+
# multi gpu, feature
|
66 |
+
def eval_msrvtt(args):
|
67 |
+
model_path = args.model_path
|
68 |
+
num_chunks = args.num_chunks
|
69 |
+
if not args.only_eval:
|
70 |
+
processes = []
|
71 |
+
for idx in range(0, num_chunks):
|
72 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
73 |
+
"--model-path", model_path,
|
74 |
+
"--video_dir", "./data/eval_video/MSRVTT-QA/video_features",
|
75 |
+
"--gt_file", "./data/eval_video/MSRVTT-QA/test_qa.json",
|
76 |
+
"--output_dir", os.path.join(model_path, "evaluation", "msrvtt"),
|
77 |
+
"--output_name", "pred",
|
78 |
+
"--num-chunks", str(num_chunks),
|
79 |
+
"--chunk-idx", str(idx),
|
80 |
+
"--conv-mode", "vicuna_v1"]
|
81 |
+
p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
|
82 |
+
processes.append(p)
|
83 |
+
p.start() # 启动子进程
|
84 |
+
for p in processes:
|
85 |
+
p.join()
|
86 |
+
cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
|
87 |
+
"--pred_path", os.path.join(model_path, "evaluation", "msrvtt"),
|
88 |
+
"--output_dir", os.path.join(model_path, "evaluation", "msrvtt", "results"),
|
89 |
+
"--output_json", os.path.join(model_path, "evaluation", "msrvtt", "results.json"),
|
90 |
+
"--num_chunks", str(num_chunks),
|
91 |
+
"--num_tasks", "16",
|
92 |
+
"--api_key", args.api_key,
|
93 |
+
"--api_base", args.api_base,
|
94 |
+
"--api_type", args.api_type,
|
95 |
+
"--api_version", args.api_version,
|
96 |
+
]
|
97 |
+
exec(cmd)
|
98 |
+
|
99 |
+
# multi gpu, feature
|
100 |
+
def eval_actnet(args):
|
101 |
+
model_path = args.model_path
|
102 |
+
num_chunks = args.num_chunks
|
103 |
+
if not args.only_eval:
|
104 |
+
processes = []
|
105 |
+
for idx in range(0, num_chunks):
|
106 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
107 |
+
"--model-path", model_path,
|
108 |
+
"--video_dir", "./data/eval_video/ActivityNet-QA/video_features",
|
109 |
+
"--gt_file", "./data/eval_video/ActivityNet-QA/test_qa.json",
|
110 |
+
"--output_dir", os.path.join(model_path, "evaluation", "actnet"),
|
111 |
+
"--output_name", "pred",
|
112 |
+
"--num-chunks", str(num_chunks),
|
113 |
+
"--chunk-idx", str(idx),
|
114 |
+
"--conv-mode", "vicuna_v1",
|
115 |
+
]
|
116 |
+
|
117 |
+
p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
|
118 |
+
processes.append(p)
|
119 |
+
p.start() # 启动子进程
|
120 |
+
for p in processes:
|
121 |
+
p.join()
|
122 |
+
cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
|
123 |
+
"--pred_path", os.path.join(model_path, "evaluation", "actnet"),
|
124 |
+
"--output_dir", os.path.join(model_path, "evaluation", "actnet", "results"),
|
125 |
+
"--output_json", os.path.join(model_path, "evaluation", "actnet", "results.json"),
|
126 |
+
"--num_chunks", str(num_chunks),
|
127 |
+
"--num_tasks", "16",
|
128 |
+
"--api_key", args.api_key,
|
129 |
+
"--api_base", args.api_base,
|
130 |
+
"--api_type", args.api_type,
|
131 |
+
"--api_version", args.api_version,
|
132 |
+
]
|
133 |
+
exec(cmd)
|
134 |
+
|
135 |
+
# multi gpu, feature
|
136 |
+
def eval_nextoe(args): # follow msvd format, OE follow actnet
|
137 |
+
model_path = args.model_path
|
138 |
+
num_chunks = args.num_chunks
|
139 |
+
if not args.only_eval:
|
140 |
+
processes = []
|
141 |
+
for idx in range(0, num_chunks):
|
142 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
143 |
+
"--model-path", model_path,
|
144 |
+
"--video_dir", "./data/eval_video/nextoe/video_features",
|
145 |
+
"--gt_file", "./data/eval_video/nextoe/test_qa.json",
|
146 |
+
"--output_dir", os.path.join(model_path, "evaluation", "nextoe"),
|
147 |
+
"--output_name", "pred",
|
148 |
+
"--num-chunks", str(num_chunks),
|
149 |
+
"--chunk-idx", str(idx),
|
150 |
+
"--conv-mode", "vicuna_v1",
|
151 |
+
]
|
152 |
+
|
153 |
+
p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
|
154 |
+
processes.append(p)
|
155 |
+
p.start() # 启动子进程
|
156 |
+
for p in processes:
|
157 |
+
p.join()
|
158 |
+
cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
|
159 |
+
"--pred_path", os.path.join(model_path, "evaluation", "nextoe"),
|
160 |
+
"--output_dir", os.path.join(model_path, "evaluation", "nextoe", "results"),
|
161 |
+
"--output_json", os.path.join(model_path, "evaluation", "nextoe", "results.json"),
|
162 |
+
"--num_chunks", str(num_chunks),
|
163 |
+
"--num_tasks", "16",
|
164 |
+
"--api_key", args.api_key,
|
165 |
+
"--api_base", args.api_base,
|
166 |
+
"--api_type", args.api_type,
|
167 |
+
"--api_version", args.api_version,
|
168 |
+
]
|
169 |
+
exec(cmd)
|
170 |
+
|
171 |
+
# multi gpu, feature
|
172 |
+
def eval_vsmovienet(args): # follow msvd format
|
173 |
+
model_path = args.model_path
|
174 |
+
num_chunks = args.num_chunks
|
175 |
+
if not args.only_eval:
|
176 |
+
processes = []
|
177 |
+
for idx in range(0, num_chunks):
|
178 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
179 |
+
"--model-path", model_path,
|
180 |
+
"--video_dir", "./data/eval_video/vstream/movienet_video_features",
|
181 |
+
"--gt_file", "./data/eval_video/vstream/test_qa_movienet.json",
|
182 |
+
"--output_dir", os.path.join(model_path, "evaluation", "vsmovienet"),
|
183 |
+
"--output_name", "pred",
|
184 |
+
"--num-chunks", str(num_chunks),
|
185 |
+
"--chunk-idx", str(idx),
|
186 |
+
"--conv-mode", "vicuna_v1",
|
187 |
+
]
|
188 |
+
|
189 |
+
p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
|
190 |
+
processes.append(p)
|
191 |
+
p.start() # 启动子进程
|
192 |
+
for p in processes:
|
193 |
+
p.join()
|
194 |
+
cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
|
195 |
+
"--pred_path", os.path.join(model_path, "evaluation", "vsmovienet"),
|
196 |
+
"--output_dir", os.path.join(model_path, "evaluation", "vsmovienet", "results"),
|
197 |
+
"--output_json", os.path.join(model_path, "evaluation", "vsmovienet", "results.json"),
|
198 |
+
"--num_chunks", str(num_chunks),
|
199 |
+
"--num_tasks", "16",
|
200 |
+
"--api_key", args.api_key,
|
201 |
+
"--api_base", args.api_base,
|
202 |
+
"--api_type", args.api_type,
|
203 |
+
"--api_version", args.api_version,
|
204 |
+
]
|
205 |
+
exec(cmd)
|
206 |
+
|
207 |
+
# multi gpu, feature
|
208 |
+
def eval_vsego4d(args): # follow msvd format
|
209 |
+
model_path = args.model_path
|
210 |
+
num_chunks = args.num_chunks
|
211 |
+
if not args.only_eval:
|
212 |
+
processes = []
|
213 |
+
for idx in range(0, num_chunks):
|
214 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
215 |
+
"--model-path", model_path,
|
216 |
+
"--video_dir", "./data/eval_video/vstream/ego4d_video_features",
|
217 |
+
"--gt_file", "./data/eval_video/vstream/test_qa_ego4d.json",
|
218 |
+
"--output_dir", os.path.join(model_path, "evaluation", "vsego4d"),
|
219 |
+
"--output_name", "pred",
|
220 |
+
"--num-chunks", str(num_chunks),
|
221 |
+
"--chunk-idx", str(idx),
|
222 |
+
"--conv-mode", "vicuna_v1",
|
223 |
+
]
|
224 |
+
|
225 |
+
p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
|
226 |
+
processes.append(p)
|
227 |
+
p.start() # 启动子进程
|
228 |
+
for p in processes:
|
229 |
+
p.join()
|
230 |
+
cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
|
231 |
+
"--pred_path", os.path.join(model_path, "evaluation", "vsego4d"),
|
232 |
+
"--output_dir", os.path.join(model_path, "evaluation", "vsego4d", "results"),
|
233 |
+
"--output_json", os.path.join(model_path, "evaluation", "vsego4d", "results.json"),
|
234 |
+
"--num_chunks", str(num_chunks),
|
235 |
+
"--num_tasks", "16",
|
236 |
+
"--api_key", args.api_key,
|
237 |
+
"--api_base", args.api_base,
|
238 |
+
"--api_type", args.api_type,
|
239 |
+
"--api_version", args.api_version,
|
240 |
+
]
|
241 |
+
exec(cmd)
|
242 |
+
|
243 |
+
# multi gpu, feature
|
244 |
+
def eval_realtime_vsmovienet(args): # follow msvd format
|
245 |
+
model_path = args.model_path
|
246 |
+
num_chunks = args.num_chunks
|
247 |
+
if not args.only_eval:
|
248 |
+
processes = []
|
249 |
+
for idx in range(0, num_chunks):
|
250 |
+
cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
|
251 |
+
"--model-path", model_path,
|
252 |
+
"--video_dir", "./data/eval_video/vstream-realtime/movienet_video_features",
|
253 |
+
"--gt_file", "./data/eval_video/vstream-realtime/test_qa_movienet.json",
|
254 |
+
"--output_dir", os.path.join(model_path, "evaluation", "realtime_vsmovienet"),
|
255 |
+
"--output_name", "pred",
|
256 |
+
"--num-chunks", str(num_chunks),
|
257 |
+
"--chunk-idx", str(idx),
|
258 |
+
"--conv-mode", "vicuna_v1",
|
259 |
+
]
|
260 |
+
|
261 |
+
+            p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
+            processes.append(p)
+            p.start()  # start the subprocess
+        for p in processes:
+            p.join()
+    cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
+           "--pred_path", os.path.join(model_path, "evaluation", "realtime_vsmovienet"),
+           "--output_dir", os.path.join(model_path, "evaluation", "realtime_vsmovienet", "results"),
+           "--output_json", os.path.join(model_path, "evaluation", "realtime_vsmovienet", "results.json"),
+           "--num_chunks", str(num_chunks),
+           "--num_tasks", "16",
+           "--api_key", args.api_key,
+           "--api_base", args.api_base,
+           "--api_type", args.api_type,
+           "--api_version", args.api_version,
+           ]
+    exec(cmd)
+
+# multi gpu, feature
+def eval_realtime_vsego4d(args): # follow msvd format
+    model_path = args.model_path
+    num_chunks = args.num_chunks
+    if not args.only_eval:
+        processes = []
+        for idx in range(0, num_chunks):
+            cmd = ["python", "llama_vstream/eval_video/model_msvd_qa_featuresloader.py",
+                   "--model-path", model_path,
+                   "--video_dir", "./data/eval_video/vstream-realtime/ego4d_video_features",
+                   "--gt_file", "./data/eval_video/vstream-realtime/test_qa_ego4d.json",
+                   "--output_dir", os.path.join(model_path, "evaluation", "realtime_vsego4d"),
+                   "--output_name", "pred",
+                   "--num-chunks", str(num_chunks),
+                   "--chunk-idx", str(idx),
+                   "--conv-mode", "vicuna_v1",
+                   ]
+
+            p = multiprocessing.Process(target=exec, args=(cmd, True, str(idx)))
+            processes.append(p)
+            p.start()  # start the subprocess
+        for p in processes:
+            p.join()
+    cmd = ["python", "llama_vstream/eval_video/eval_activitynet_qa.py",
+           "--pred_path", os.path.join(model_path, "evaluation", "realtime_vsego4d"),
+           "--output_dir", os.path.join(model_path, "evaluation", "realtime_vsego4d", "results"),
+           "--output_json", os.path.join(model_path, "evaluation", "realtime_vsego4d", "results.json"),
+           "--num_chunks", str(num_chunks),
+           "--num_tasks", "16",
+           "--api_key", args.api_key,
+           "--api_base", args.api_base,
+           "--api_type", args.api_type,
+           "--api_version", args.api_version,
+           ]
+    exec(cmd)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--dataset", type=str, default=None)
+    parser.add_argument("--api_key", type=str, default=None)
+    parser.add_argument("--api_base", type=str, default=None)
+    parser.add_argument("--api_type", type=str, default=None)
+    parser.add_argument("--api_version", type=str, default=None)
+    parser.add_argument("--num_chunks", type=int, default=1)
+    parser.add_argument("--only_eval", action="store_true")
+    parser.add_argument("--vizlen", type=int, default=0)
+    parser.add_argument("--use_speech", action="store_true", default=False)
+    args = parser.parse_args()
+    func_dic = {'msvd': eval_msvd,
+                'msrvtt': eval_msrvtt,
+                'actnet': eval_actnet,
+                'nextoe': eval_nextoe,
+                'vsmovienet': eval_vsmovienet,
+                'vsego4d': eval_vsego4d,
+                'realtime_vsmovienet': eval_realtime_vsmovienet,
+                'realtime_vsego4d': eval_realtime_vsego4d,
+                }
+    if args.dataset in func_dic:
+        print(f'Execute {args.dataset} evaluation')
+        func_dic[args.dataset](args)
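Note: each eval_* entry point above fans out one model_*_qa*.py worker per chunk through the exec helper used earlier in this file, waits for all chunks to finish, and then scores the merged predictions with eval_activitynet_qa.py. A typical multi-GPU run is therefore launched as, for example, python flash_vstream/eval_video/eval_any_dataset_features.py --model-path <checkpoint dir> --dataset realtime_vsego4d --num_chunks 8 (the chunk count and checkpoint path here are illustrative). Also note that the worker commands reference llama_vstream/eval_video/... while this commit ships those scripts under flash_vstream/eval_video/..., so the paths may need to be kept in sync.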
flash_vstream/eval_video/model_msvd_qa.py
ADDED
@@ -0,0 +1,157 @@
1 |
+
# Based on https://github.com/haotian-liu/LLaVA.
|
2 |
+
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import math
|
6 |
+
import torch
|
7 |
+
import argparse
|
8 |
+
from tqdm import tqdm
|
9 |
+
from decord import VideoReader, cpu
|
10 |
+
|
11 |
+
from llama_vstream.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
|
12 |
+
from llama_vstream.conversation import conv_templates, SeparatorStyle
|
13 |
+
from llama_vstream.model.builder import load_pretrained_model
|
14 |
+
from llama_vstream.utils import disable_torch_init
|
15 |
+
from llama_vstream.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
|
16 |
+
|
17 |
+
|
18 |
+
def split_list(lst, n):
|
19 |
+
"""Split a list into n (roughly) equal-sized chunks"""
|
20 |
+
chunk_size = math.ceil(len(lst) / n) # integer division
|
21 |
+
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
|
22 |
+
|
23 |
+
|
24 |
+
def get_chunk(lst, n, k):
|
25 |
+
chunks = split_list(lst, n)
|
26 |
+
return chunks[k]
|
27 |
+
|
28 |
+
|
29 |
+
def parse_args():
|
30 |
+
"""
|
31 |
+
Parse command-line arguments.
|
32 |
+
"""
|
33 |
+
parser = argparse.ArgumentParser()
|
34 |
+
|
35 |
+
# Define the command-line arguments
|
36 |
+
parser.add_argument('--video_dir', help='Directory containing video files.', required=True)
|
37 |
+
parser.add_argument('--gt_file', help='Path to the ground truth file containing question.', required=True)
|
38 |
+
parser.add_argument('--output_dir', help='Directory to save the model results JSON.', required=True)
|
39 |
+
parser.add_argument('--output_name', help='Name of the file for storing results JSON.', required=True)
|
40 |
+
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
|
41 |
+
parser.add_argument("--model-base", type=str, default=None)
|
42 |
+
parser.add_argument("--conv-mode", type=str, default=None)
|
43 |
+
parser.add_argument("--num-chunks", type=int, default=1)
|
44 |
+
parser.add_argument("--chunk-idx", type=int, default=0)
|
45 |
+
parser.add_argument("--model-max-length", type=int, default=None)
|
46 |
+
|
47 |
+
return parser.parse_args()
|
48 |
+
|
49 |
+
|
50 |
+
def load_video(video_path):
|
51 |
+
vr = VideoReader(video_path, ctx=cpu(0))
|
52 |
+
total_frame_num = len(vr)
|
53 |
+
fps = round(vr.get_avg_fps())
|
54 |
+
frame_idx = [i for i in range(0, len(vr), fps)]
|
55 |
+
spare_frames = vr.get_batch(frame_idx).asnumpy()
|
56 |
+
return spare_frames
|
57 |
+
|
58 |
+
|
59 |
+
def run_inference(args):
|
60 |
+
"""
|
61 |
+
Run inference on ActivityNet QA DataSet using the Video-ChatGPT model.
|
62 |
+
|
63 |
+
Args:
|
64 |
+
args: Command-line arguments.
|
65 |
+
"""
|
66 |
+
# Initialize the model
|
67 |
+
model_name = get_model_name_from_path(args.model_path)
|
68 |
+
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.model_max_length)
|
69 |
+
|
70 |
+
# Load both ground truth file containing questions and answers
|
71 |
+
with open(args.gt_file) as file:
|
72 |
+
gt_questions = json.load(file)
|
73 |
+
gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
|
74 |
+
|
75 |
+
# Create the output directory if it doesn't exist
|
76 |
+
if not os.path.exists(args.output_dir):
|
77 |
+
try:
|
78 |
+
os.makedirs(args.output_dir)
|
79 |
+
except Exception as e:
|
80 |
+
print(f'mkdir Except: {e}')
|
81 |
+
|
82 |
+
video_formats = ['.mp4', '.avi', '.mov', '.mkv']
|
83 |
+
if args.num_chunks > 1:
|
84 |
+
output_name = f"{args.num_chunks}_{args.chunk_idx}"
|
85 |
+
else:
|
86 |
+
output_name = args.output_name
|
87 |
+
answers_file = os.path.join(args.output_dir, f"{output_name}.json")
|
88 |
+
ans_file = open(answers_file, "w")
|
89 |
+
|
90 |
+
for sample in tqdm(gt_questions, desc=f"cuda:{args.chunk_idx} "):
|
91 |
+
video_name = sample['video_id']
|
92 |
+
question = sample['question']
|
93 |
+
id = sample['id']
|
94 |
+
answer = sample['answer']
|
95 |
+
|
96 |
+
sample_set = {'id': id, 'question': question, 'answer': answer}
|
97 |
+
|
98 |
+
# Load the video file
|
99 |
+
for fmt in video_formats: # Added this line
|
100 |
+
temp_path = os.path.join(args.video_dir, f"{video_name}{fmt}")
|
101 |
+
if os.path.exists(temp_path):
|
102 |
+
video_path = temp_path
|
103 |
+
break
|
104 |
+
|
105 |
+
# Check if the video exists
|
106 |
+
if os.path.exists(video_path):
|
107 |
+
video = load_video(video_path)
|
108 |
+
video = image_processor.preprocess(video, return_tensors='pt')['pixel_values'].half().cuda()
|
109 |
+
video = [video]
|
110 |
+
|
111 |
+
qs = question
|
112 |
+
if model.config.mm_use_im_start_end:
|
113 |
+
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
|
114 |
+
else:
|
115 |
+
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
|
116 |
+
|
117 |
+
conv = conv_templates[args.conv_mode].copy()
|
118 |
+
conv.append_message(conv.roles[0], qs)
|
119 |
+
conv.append_message(conv.roles[1], None)
|
120 |
+
prompt = conv.get_prompt()
|
121 |
+
|
122 |
+
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
|
123 |
+
|
124 |
+
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
|
125 |
+
keywords = [stop_str]
|
126 |
+
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
|
127 |
+
|
128 |
+
with torch.inference_mode():
|
129 |
+
output_ids = model.generate(
|
130 |
+
input_ids,
|
131 |
+
images=video,
|
132 |
+
do_sample=True,
|
133 |
+
temperature=0.002,
|
134 |
+
max_new_tokens=1024,
|
135 |
+
use_cache=True,
|
136 |
+
stopping_criteria=[stopping_criteria])
|
137 |
+
|
138 |
+
input_token_len = input_ids.shape[1]
|
139 |
+
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
|
140 |
+
if n_diff_input_output > 0:
|
141 |
+
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
|
142 |
+
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
|
143 |
+
outputs = outputs.strip()
|
144 |
+
if outputs.endswith(stop_str):
|
145 |
+
outputs = outputs[:-len(stop_str)]
|
146 |
+
outputs = outputs.strip()
|
147 |
+
|
148 |
+
sample_set['pred'] = outputs
|
149 |
+
ans_file.write(json.dumps(sample_set) + "\n")
|
150 |
+
ans_file.flush()
|
151 |
+
|
152 |
+
ans_file.close()
|
153 |
+
|
154 |
+
|
155 |
+
if __name__ == "__main__":
|
156 |
+
args = parse_args()
|
157 |
+
run_inference(args)
|
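A small self-contained sketch of the sampling rule implemented by load_video above: it keeps roughly one frame per second by stepping through the frame indices with a stride equal to the rounded average FPS. The helper name below is illustrative, not part of the repository, and the max(1, ...) guard is added here for robustness only.

def one_frame_per_second_indices(total_frames, fps):
    stride = max(1, round(fps))           # one index per (approximate) second of video
    return list(range(0, total_frames, stride))

# e.g. a 10-second clip at 30 FPS (300 frames) keeps indices 0, 30, ..., 270
assert one_frame_per_second_indices(300, 30) == list(range(0, 300, 30))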
flash_vstream/eval_video/model_msvd_qa_featuresloader.py
ADDED
@@ -0,0 +1,179 @@
1 |
+
# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
|
2 |
+
# Based on https://github.com/haotian-liu/LLaVA.
|
3 |
+
|
4 |
+
import os
|
5 |
+
import json
|
6 |
+
import math
|
7 |
+
import torch
|
8 |
+
import random
|
9 |
+
import argparse
|
10 |
+
from tqdm import tqdm
|
11 |
+
from torch.utils.data import Dataset, DataLoader
|
12 |
+
from safetensors.torch import load_file
|
13 |
+
|
14 |
+
from llama_vstream.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
|
15 |
+
from llama_vstream.conversation import conv_templates, SeparatorStyle
|
16 |
+
from llama_vstream.model.builder import load_pretrained_model
|
17 |
+
from llama_vstream.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
|
18 |
+
|
19 |
+
|
20 |
+
def split_list(lst, n):
|
21 |
+
"""Split a list into n (roughly) equal-sized chunks"""
|
22 |
+
chunk_size = math.ceil(len(lst) / n) # integer division
|
23 |
+
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
|
24 |
+
|
25 |
+
|
26 |
+
def get_chunk(lst, n, k):
|
27 |
+
chunks = split_list(lst, n)
|
28 |
+
return chunks[k]
|
29 |
+
|
30 |
+
|
31 |
+
def parse_args():
|
32 |
+
"""
|
33 |
+
Parse command-line arguments.
|
34 |
+
"""
|
35 |
+
parser = argparse.ArgumentParser()
|
36 |
+
|
37 |
+
# Define the command-line arguments
|
38 |
+
parser.add_argument('--video_dir', help='Directory containing video files.', required=True)
|
39 |
+
parser.add_argument('--gt_file', help='Path to the ground truth file containing question.', required=True)
|
40 |
+
parser.add_argument('--output_dir', help='Directory to save the model results JSON.', required=True)
|
41 |
+
parser.add_argument('--output_name', help='Name of the file for storing results JSON.', required=True)
|
42 |
+
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
|
43 |
+
parser.add_argument("--model-base", type=str, default=None)
|
44 |
+
parser.add_argument("--conv-mode", type=str, default=None)
|
45 |
+
parser.add_argument("--num-chunks", type=int, default=1)
|
46 |
+
parser.add_argument("--chunk-idx", type=int, default=0)
|
47 |
+
parser.add_argument("--model-max-length", type=int, default=None)
|
48 |
+
return parser.parse_args()
|
49 |
+
|
50 |
+
|
51 |
+
class CustomDataset(Dataset):
|
52 |
+
def __init__(self, questions, video_dir, tokenizer, image_processor, model_config):
|
53 |
+
self.questions = questions
|
54 |
+
self.video_dir = video_dir
|
55 |
+
self.tokenizer = tokenizer
|
56 |
+
self.image_processor = image_processor
|
57 |
+
self.model_config = model_config
|
58 |
+
|
59 |
+
def __getitem__(self, index):
|
60 |
+
sample = self.questions[index]
|
61 |
+
video_name = sample['video_id']
|
62 |
+
try:
|
63 |
+
video_path = os.path.join(self.video_dir, video_name + '.safetensors')
|
64 |
+
video_tensor = load_file(video_path)['feature']
|
65 |
+
except Exception as e:
|
66 |
+
print(f'Dataset Exception: {e}, randomly choose one.')
|
67 |
+
idx = random.randint(0, len(self.questions) - 1)
|
68 |
+
return self.__getitem__(idx)
|
69 |
+
qs = sample['question']
|
70 |
+
if self.model_config.mm_use_im_start_end:
|
71 |
+
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
|
72 |
+
else:
|
73 |
+
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
|
74 |
+
conv = conv_templates[args.conv_mode].copy()
|
75 |
+
if 'system' in sample:
|
76 |
+
conv.system = conv.system + ' ' + sample['system']
|
77 |
+
conv.append_message(conv.roles[0], qs)
|
78 |
+
conv.append_message(conv.roles[1], None)
|
79 |
+
prompt = conv.get_prompt()
|
80 |
+
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
|
81 |
+
return input_ids, video_tensor
|
82 |
+
|
83 |
+
def __len__(self):
|
84 |
+
return len(self.questions)
|
85 |
+
|
86 |
+
|
87 |
+
def create_data_loader(questions, video_dir, tokenizer, image_processor, model_config, batch_size=1, num_workers=2):
|
88 |
+
assert batch_size == 1, "batch_size must be 1"
|
89 |
+
dataset = CustomDataset(questions, video_dir, tokenizer, image_processor, model_config)
|
90 |
+
data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
|
91 |
+
return data_loader
|
92 |
+
|
93 |
+
|
94 |
+
def run_inference(args):
|
95 |
+
"""
|
96 |
+
Run inference on ActivityNet QA DataSet using the Video-ChatGPT model.
|
97 |
+
|
98 |
+
Args:
|
99 |
+
args: Command-line arguments.
|
100 |
+
"""
|
101 |
+
# Initialize the model
|
102 |
+
model_name = get_model_name_from_path(args.model_path)
|
103 |
+
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.model_max_length)
|
104 |
+
|
105 |
+
# Load both ground truth file containing questions and answers
|
106 |
+
with open(args.gt_file) as file:
|
107 |
+
gt_questions = json.load(file)
|
108 |
+
gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
|
109 |
+
|
110 |
+
# Create the output directory if it doesn't exist
|
111 |
+
if not os.path.exists(args.output_dir):
|
112 |
+
try:
|
113 |
+
os.makedirs(args.output_dir)
|
114 |
+
except Exception as e:
|
115 |
+
print(f'mkdir Except: {e}')
|
116 |
+
|
117 |
+
video_formats = ['.mp4', '.avi', '.mov', '.mkv']
|
118 |
+
if args.num_chunks > 1:
|
119 |
+
output_name = f"{args.num_chunks}_{args.chunk_idx}"
|
120 |
+
else:
|
121 |
+
output_name = args.output_name
|
122 |
+
answers_file = os.path.join(args.output_dir, f"{output_name}.json")
|
123 |
+
# resume from old exp
|
124 |
+
exist_id_set = set()
|
125 |
+
if os.path.exists(answers_file):
|
126 |
+
with open(answers_file) as f:
|
127 |
+
exist_pred_contents = [json.loads(line) for line in f]
|
128 |
+
exist_id_set = set([x['id'] for x in exist_pred_contents])
|
129 |
+
|
130 |
+
new_gt_questions = []
|
131 |
+
for sample in tqdm(gt_questions):
|
132 |
+
if not sample['id'] in exist_id_set:
|
133 |
+
new_gt_questions.append(sample)
|
134 |
+
gt_questions = new_gt_questions
|
135 |
+
|
136 |
+
data_loader = create_data_loader(gt_questions, args.video_dir, tokenizer, image_processor, model.config)
|
137 |
+
|
138 |
+
conv = conv_templates[args.conv_mode].copy()
|
139 |
+
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
|
140 |
+
keywords = [stop_str]
|
141 |
+
|
142 |
+
with open(answers_file, "a") as ans_file:
|
143 |
+
for data, sample in tqdm(zip(data_loader, gt_questions), desc=f"cuda:{args.chunk_idx} ", total=len(gt_questions)):
|
144 |
+
input_ids, video_tensors = data
|
145 |
+
input_ids = input_ids.to(device='cuda', non_blocking=True)
|
146 |
+
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
|
147 |
+
with torch.inference_mode():
|
148 |
+
output_ids = model.generate(
|
149 |
+
input_ids,
|
150 |
+
features=video_tensors.to(dtype=torch.float16, device='cuda', non_blocking=True),
|
151 |
+
do_sample=True,
|
152 |
+
temperature=0.002,
|
153 |
+
max_new_tokens=1024,
|
154 |
+
use_cache=True,
|
155 |
+
stopping_criteria=[stopping_criteria],
|
156 |
+
)
|
157 |
+
input_token_len = input_ids.shape[1]
|
158 |
+
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
|
159 |
+
if n_diff_input_output > 0:
|
160 |
+
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
|
161 |
+
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
|
162 |
+
outputs = outputs.strip()
|
163 |
+
if outputs.endswith(stop_str):
|
164 |
+
outputs = outputs[:-len(stop_str)]
|
165 |
+
outputs = outputs.strip()
|
166 |
+
sample_set = {
|
167 |
+
'id': sample['id'],
|
168 |
+
'question': sample['question'],
|
169 |
+
'answer': sample['answer'],
|
170 |
+
'answer_type': sample['answer_type'] if 'answer_type' in sample else None,
|
171 |
+
'pred': outputs
|
172 |
+
}
|
173 |
+
ans_file.write(json.dumps(sample_set) + "\n")
|
174 |
+
ans_file.flush()
|
175 |
+
|
176 |
+
|
177 |
+
if __name__ == "__main__":
|
178 |
+
args = parse_args()
|
179 |
+
run_inference(args)
|
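For reference, the features consumed by CustomDataset above are plain safetensors files, one per video id, holding a single tensor under the "feature" key. A hedged sketch of producing such a file; the shape and file name are illustrative assumptions, not values taken from the repository.

import torch
from safetensors.torch import save_file

# e.g. 120 frames x 256 patches x 1024-dim vision features for a video id "video0001"
dummy_feature = torch.randn(120, 256, 1024, dtype=torch.float16)
save_file({"feature": dummy_feature}, "video0001.safetensors")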
flash_vstream/mm_utils.py
ADDED
@@ -0,0 +1,106 @@
1 |
+
# Based on https://github.com/haotian-liu/LLaVA.
|
2 |
+
|
3 |
+
from PIL import Image
|
4 |
+
from io import BytesIO
|
5 |
+
import base64
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from transformers import StoppingCriteria
|
9 |
+
from flash_vstream.constants import IMAGE_TOKEN_INDEX
|
10 |
+
|
11 |
+
|
12 |
+
def load_image_from_base64(image):
|
13 |
+
return Image.open(BytesIO(base64.b64decode(image)))
|
14 |
+
|
15 |
+
|
16 |
+
def expand2square(pil_img, background_color):
|
17 |
+
width, height = pil_img.size
|
18 |
+
if width == height:
|
19 |
+
return pil_img
|
20 |
+
elif width > height:
|
21 |
+
result = Image.new(pil_img.mode, (width, width), background_color)
|
22 |
+
result.paste(pil_img, (0, (width - height) // 2))
|
23 |
+
return result
|
24 |
+
else:
|
25 |
+
result = Image.new(pil_img.mode, (height, height), background_color)
|
26 |
+
result.paste(pil_img, ((height - width) // 2, 0))
|
27 |
+
return result
|
28 |
+
|
29 |
+
|
30 |
+
def process_images(images, image_processor, model_cfg):
|
31 |
+
image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
|
32 |
+
new_images = []
|
33 |
+
if image_aspect_ratio == 'pad':
|
34 |
+
for image in images:
|
35 |
+
image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
|
36 |
+
image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
|
37 |
+
new_images.append(image)
|
38 |
+
else:
|
39 |
+
return image_processor(images, return_tensors='pt')['pixel_values']
|
40 |
+
if all(x.shape == new_images[0].shape for x in new_images):
|
41 |
+
new_images = torch.stack(new_images, dim=0)
|
42 |
+
return new_images
|
43 |
+
|
44 |
+
|
45 |
+
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
|
46 |
+
prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
|
47 |
+
|
48 |
+
def insert_separator(X, sep):
|
49 |
+
return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
|
50 |
+
|
51 |
+
input_ids = []
|
52 |
+
offset = 0
|
53 |
+
if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
|
54 |
+
offset = 1
|
55 |
+
input_ids.append(prompt_chunks[0][0])
|
56 |
+
|
57 |
+
for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
|
58 |
+
input_ids.extend(x[offset:])
|
59 |
+
|
60 |
+
if return_tensors is not None:
|
61 |
+
if return_tensors == 'pt':
|
62 |
+
return torch.tensor(input_ids, dtype=torch.long)
|
63 |
+
raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
64 |
+
return input_ids
|
65 |
+
|
66 |
+
|
67 |
+
def get_model_name_from_path(model_path):
|
68 |
+
model_path = model_path.strip("/")
|
69 |
+
model_paths = model_path.split("/")
|
70 |
+
if model_paths[-1].startswith('checkpoint-'):
|
71 |
+
return model_paths[-2] + "_" + model_paths[-1]
|
72 |
+
else:
|
73 |
+
return model_paths[-1]
|
74 |
+
|
75 |
+
class KeywordsStoppingCriteria(StoppingCriteria):
|
76 |
+
def __init__(self, keywords, tokenizer, input_ids):
|
77 |
+
self.keywords = keywords
|
78 |
+
self.keyword_ids = []
|
79 |
+
self.max_keyword_len = 0
|
80 |
+
for keyword in keywords:
|
81 |
+
cur_keyword_ids = tokenizer(keyword).input_ids
|
82 |
+
if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
|
83 |
+
cur_keyword_ids = cur_keyword_ids[1:]
|
84 |
+
if len(cur_keyword_ids) > self.max_keyword_len:
|
85 |
+
self.max_keyword_len = len(cur_keyword_ids)
|
86 |
+
self.keyword_ids.append(torch.tensor(cur_keyword_ids))
|
87 |
+
self.tokenizer = tokenizer
|
88 |
+
self.start_len = input_ids.shape[1]
|
89 |
+
|
90 |
+
def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
|
91 |
+
offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
|
92 |
+
self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
|
93 |
+
for keyword_id in self.keyword_ids:
|
94 |
+
if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
|
95 |
+
return True
|
96 |
+
outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
|
97 |
+
for keyword in self.keywords:
|
98 |
+
if keyword in outputs:
|
99 |
+
return True
|
100 |
+
return False
|
101 |
+
|
102 |
+
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
|
103 |
+
outputs = []
|
104 |
+
for i in range(output_ids.shape[0]):
|
105 |
+
outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
|
106 |
+
return all(outputs)
|
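A minimal sketch of what tokenizer_image_token does: it tokenizes the text around each <image> placeholder separately and splices the image token index into the resulting id sequence. The tiny stand-in tokenizer below exists only to keep the example self-contained; a real LLaMA tokenizer is used in practice, and the -200 placeholder value is passed explicitly here as an assumption.

from types import SimpleNamespace
from flash_vstream.mm_utils import tokenizer_image_token

class TinyTokenizer:
    bos_token_id = 1
    def __call__(self, text):
        # BOS followed by one fake id per whitespace-separated word
        return SimpleNamespace(input_ids=[self.bos_token_id] + [10 + i for i in range(len(text.split()))])

ids = tokenizer_image_token("USER: <image> what is shown?", TinyTokenizer(), image_token_index=-200)
print(ids)  # [1, 10, -200, 10, 11, 12]: BOS, text ids, the image slot, text ids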
flash_vstream/model/__init__.py
ADDED
@@ -0,0 +1 @@
+from .language_model.vstream_llama import VStreamLlamaForCausalLM, VStreamConfig
flash_vstream/model/builder.py
ADDED
@@ -0,0 +1,139 @@
1 |
+
# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
|
2 |
+
# ------------------------------------------------------------------------
|
3 |
+
# Based on https://github.com/haotian-liu/LLaVA. Below is the original copyright:
|
4 |
+
# Copyright 2023 Haotian Liu
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
+
|
19 |
+
import os
|
20 |
+
import warnings
|
21 |
+
import shutil
|
22 |
+
|
23 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
|
24 |
+
import torch
|
25 |
+
from flash_vstream.model import VStreamLlamaForCausalLM, VStreamConfig
|
26 |
+
from flash_vstream.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
|
27 |
+
|
28 |
+
|
29 |
+
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", **kwargs):
|
30 |
+
kwargs = {"device_map": device_map, **kwargs}
|
31 |
+
|
32 |
+
if device != "cuda":
|
33 |
+
kwargs['device_map'] = {"": device}
|
34 |
+
|
35 |
+
if load_8bit:
|
36 |
+
kwargs['load_in_8bit'] = True
|
37 |
+
elif load_4bit:
|
38 |
+
kwargs['load_in_4bit'] = True
|
39 |
+
kwargs['quantization_config'] = BitsAndBytesConfig(
|
40 |
+
load_in_4bit=True,
|
41 |
+
bnb_4bit_compute_dtype=torch.float16,
|
42 |
+
bnb_4bit_use_double_quant=True,
|
43 |
+
bnb_4bit_quant_type='nf4'
|
44 |
+
)
|
45 |
+
else:
|
46 |
+
kwargs['torch_dtype'] = torch.float16
|
47 |
+
|
48 |
+
if 'vstream' in model_name.lower():
|
49 |
+
# Load LLaMA-VStream model
|
50 |
+
if 'lora' in model_name.lower() and model_base is None:
|
51 |
+
warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
|
52 |
+
if 'lora' in model_name.lower() and model_base is not None:
|
53 |
+
lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
|
54 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
|
55 |
+
print('(LoRA) Loading LLaMA-VStream from base model...')
|
56 |
+
model = VStreamLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
|
57 |
+
token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
|
58 |
+
if model.lm_head.weight.shape[0] != token_num:
|
59 |
+
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
|
60 |
+
model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
|
61 |
+
|
62 |
+
print('(LoRA) Loading additional LLaMA-VStream weights...')
|
63 |
+
if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
|
64 |
+
non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
|
65 |
+
else:
|
66 |
+
# this is probably from HF Hub
|
67 |
+
from huggingface_hub import hf_hub_download
|
68 |
+
def load_from_hf(repo_id, filename, subfolder=None):
|
69 |
+
cache_file = hf_hub_download(
|
70 |
+
repo_id=repo_id,
|
71 |
+
filename=filename,
|
72 |
+
subfolder=subfolder)
|
73 |
+
return torch.load(cache_file, map_location='cpu')
|
74 |
+
non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
|
75 |
+
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
|
76 |
+
if any(k.startswith('model.model.') for k in non_lora_trainables):
|
77 |
+
non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
|
78 |
+
model.load_state_dict(non_lora_trainables, strict=False)
|
79 |
+
|
80 |
+
from peft import PeftModel
|
81 |
+
print('Loading LoRA weights...')
|
82 |
+
model = PeftModel.from_pretrained(model, model_path)
|
83 |
+
print('Merging LoRA weights...')
|
84 |
+
model = model.merge_and_unload()
|
85 |
+
print('Model is loaded...')
|
86 |
+
elif model_base is not None:
|
87 |
+
# this may be mm projector only
|
88 |
+
print('Loading LLaMA-VStream from base model...')
|
89 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
|
90 |
+
cfg_pretrained = AutoConfig.from_pretrained(model_path)
|
91 |
+
model = VStreamLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
|
92 |
+
|
93 |
+
mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
|
94 |
+
mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
|
95 |
+
model.load_state_dict(mm_projector_weights, strict=False)
|
96 |
+
else:
|
97 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
|
98 |
+
model = VStreamLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
|
99 |
+
else:
|
100 |
+
# Load language model
|
101 |
+
if model_base is not None:
|
102 |
+
# PEFT model
|
103 |
+
from peft import PeftModel
|
104 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
|
105 |
+
model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
|
106 |
+
print(f"Loading LoRA weights from {model_path}")
|
107 |
+
model = PeftModel.from_pretrained(model, model_path)
|
108 |
+
print(f"Merging weights")
|
109 |
+
model = model.merge_and_unload()
|
110 |
+
print('Convert to FP16...')
|
111 |
+
model.to(torch.float16)
|
112 |
+
else:
|
113 |
+
use_fast = False
|
114 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
|
115 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
|
116 |
+
|
117 |
+
image_processor = None
|
118 |
+
|
119 |
+
if 'vstream' in model_name.lower():
|
120 |
+
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
|
121 |
+
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
|
122 |
+
if mm_use_im_patch_token:
|
123 |
+
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
|
124 |
+
if mm_use_im_start_end:
|
125 |
+
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
|
126 |
+
model.resize_token_embeddings(len(tokenizer))
|
127 |
+
|
128 |
+
vision_tower = model.get_vision_tower()
|
129 |
+
if not vision_tower.is_loaded:
|
130 |
+
vision_tower.load_model()
|
131 |
+
vision_tower.to(device=device, dtype=torch.float16)
|
132 |
+
image_processor = vision_tower.image_processor
|
133 |
+
|
134 |
+
if hasattr(model.config, "max_sequence_length"):
|
135 |
+
context_len = model.config.max_sequence_length
|
136 |
+
else:
|
137 |
+
context_len = 2048
|
138 |
+
|
139 |
+
return tokenizer, model, image_processor, context_len
|
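A hedged usage sketch for the loader above; the checkpoint directory is illustrative. Because the derived model name contains "vstream", this call also loads the CLIP vision tower and returns its image processor alongside the tokenizer and model.

from flash_vstream.mm_utils import get_model_name_from_path
from flash_vstream.model.builder import load_pretrained_model

model_path = "checkpoints/flash-vstream-7b"   # illustrative path, not a real checkpoint
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)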
flash_vstream/model/compress_functions.py
ADDED
@@ -0,0 +1,277 @@
1 |
+
# Copyright 2024 Flash-VStream Authors
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import random
|
16 |
+
import torch
|
17 |
+
import torch.nn as nn
|
18 |
+
import torch.nn.functional as F
|
19 |
+
|
20 |
+
def drop_feature(img_feature, video_max_frames, img_similarity=None):
|
21 |
+
T, P, D = img_feature.shape
|
22 |
+
indices = [[i] for i in range(T)]
|
23 |
+
T0 = video_max_frames
|
24 |
+
if T <= T0:
|
25 |
+
return img_feature, img_similarity, [indices]
|
26 |
+
cur_feature = img_feature[:T0] # [T0, P, D]
|
27 |
+
if img_similarity is not None:
|
28 |
+
cur_sim = img_similarity[:T0 - 1]
|
29 |
+
else:
|
30 |
+
cur_sim = F.cosine_similarity(cur_feature[:-1].view(T0 - 1, P * D), cur_feature[1:].view(T0 - 1, P * D)) # [T0 - 1]
|
31 |
+
cur_indices = indices[:T0]
|
32 |
+
step_indices = [cur_indices]
|
33 |
+
for i in range(T0, T):
|
34 |
+
new_feature = img_feature[i]
|
35 |
+
new_sim = F.cosine_similarity(cur_feature[-1].view(-1), new_feature.view(-1), dim=0)
|
36 |
+
all_feature = torch.cat([cur_feature, new_feature.unsqueeze(0)], dim=0)
|
37 |
+
all_indices = cur_indices + [[i]]
|
38 |
+
all_sim = torch.cat([cur_sim, new_sim.unsqueeze(0)], dim=0)
|
39 |
+
idx = torch.argmax(all_sim)
|
40 |
+
if random.randint(0, 1) > 0:
|
41 |
+
idx = idx + 1
|
42 |
+
cur_feature = torch.cat([all_feature[:idx], all_feature[idx + 1:]])
|
43 |
+
if idx + 1 == T0 + 1:
|
44 |
+
cur_sim = all_sim[:T0 - 1]
|
45 |
+
cur_indices = all_indices[:-1]
|
46 |
+
elif idx == 0:
|
47 |
+
cur_sim = all_sim[1:]
|
48 |
+
cur_indices = all_indices[1:]
|
49 |
+
else:
|
50 |
+
cur_sim = torch.cat([all_sim[:idx], all_sim[idx + 1:]])
|
51 |
+
cur_sim[idx - 1] = F.cosine_similarity(all_feature[idx - 1].view(-1), all_feature[idx + 1].view(-1), dim=0)
|
52 |
+
cur_indices = all_indices[:idx] + all_indices[idx + 1:]
|
53 |
+
step_indices.append(cur_indices)
|
54 |
+
# print(f'Note: perform drop feature {img_feature.shape} to {cur_feature.shape}')
|
55 |
+
return cur_feature, cur_sim, step_indices
|
56 |
+
|
57 |
+
|
58 |
+
def merge_feature(img_feature, video_max_frames, img_similarity=None):
|
59 |
+
T, P, D = img_feature.shape
|
60 |
+
indices = [[i] for i in range(T)]
|
61 |
+
T0 = video_max_frames
|
62 |
+
if T <= T0:
|
63 |
+
return img_feature, img_similarity, [indices]
|
64 |
+
cur_feature = img_feature[:T0] # [T0, P, D]
|
65 |
+
cur_indices = indices[:T0]
|
66 |
+
step_indices = [cur_indices]
|
67 |
+
if img_similarity is not None:
|
68 |
+
cur_sim = img_similarity[:T0 - 1]
|
69 |
+
else:
|
70 |
+
cur_sim = F.cosine_similarity(cur_feature[:-1].view(T0 - 1, P * D), cur_feature[1:].view(T0 - 1, P * D)) # [T0 - 1]
|
71 |
+
for i in range(T0, T):
|
72 |
+
new_feature = img_feature[i]
|
73 |
+
new_sim = F.cosine_similarity(cur_feature[-1].view(-1), new_feature.view(-1), dim=0)
|
74 |
+
all_feature = torch.cat([cur_feature, new_feature.unsqueeze(0)], dim=0)
|
75 |
+
all_sim = torch.cat([cur_sim, new_sim.unsqueeze(0)], dim=0)
|
76 |
+
all_indices = cur_indices + [[i]]
|
77 |
+
idx = torch.argmax(all_sim)
|
78 |
+
all_feature[idx + 1] = (all_feature[idx] + all_feature[idx + 1]) / 2.0
|
79 |
+
all_indices[idx + 1] = all_indices[idx] + all_indices[idx + 1]
|
80 |
+
cur_feature = torch.cat([all_feature[:idx], all_feature[idx + 1:]])
|
81 |
+
cur_sim = torch.cat([all_sim[:idx], all_sim[idx + 1:]])
|
82 |
+
cur_indices = all_indices[:idx] + all_indices[idx + 1:]
|
83 |
+
if idx > 0:
|
84 |
+
cur_sim[idx - 1] = F.cosine_similarity(all_feature[idx - 1].view(-1), all_feature[idx + 1].view(-1), dim=0)
|
85 |
+
if idx + 1 < T0:
|
86 |
+
cur_sim[idx] = F.cosine_similarity(all_feature[idx + 1].view(-1), all_feature[idx + 2].view(-1), dim=0)
|
87 |
+
step_indices.append(cur_indices)
|
88 |
+
# print(f'Note: perform merge feature {img_feature.shape} to {cur_feature.shape}')
|
89 |
+
return cur_feature, cur_sim, step_indices
|
90 |
+
|
91 |
+
|
92 |
+
def kmeans_feature(img_feature, video_max_frames, img_similarity=None):
|
93 |
+
def kmeans_torch(X, num_clusters, distance='euclidean', tol=1e-4, max_iter=10):
|
94 |
+
indices = torch.randperm(X.size(0))[:num_clusters]
|
95 |
+
centroids = X[indices]
|
96 |
+
for i in range(max_iter):
|
97 |
+
if distance == 'euclidean':
|
98 |
+
dists = torch.cdist(X, centroids, p=2)
|
99 |
+
else:
|
100 |
+
raise NotImplementedError("Only Euclidean distance is supported yet")
|
101 |
+
labels = torch.argmin(dists, dim=1)
|
102 |
+
new_centroids = []
|
103 |
+
for j in range(num_clusters):
|
104 |
+
cluster_points = X[labels == j]
|
105 |
+
if len(cluster_points) > 0:
|
106 |
+
new_centroid = cluster_points.mean(0)
|
107 |
+
else: # fix nan centroids
|
108 |
+
new_centroid = X[random.randint(0, X.size(0) - 1)]
|
109 |
+
new_centroids.append(new_centroid)
|
110 |
+
new_centroids = torch.stack(new_centroids)
|
111 |
+
diff = torch.norm(centroids - new_centroids, dim=1).sum()
|
112 |
+
if diff < tol:
|
113 |
+
break
|
114 |
+
centroids = new_centroids
|
115 |
+
return centroids, labels, i
|
116 |
+
T, P, D = img_feature.shape
|
117 |
+
T0 = video_max_frames
|
118 |
+
if T <= T0:
|
119 |
+
return img_feature, img_similarity, [[[i] for i in range(T)]]
|
120 |
+
X = img_feature.view(T, -1) # [T, P, D]
|
121 |
+
centroids, labels, exit_step = kmeans_torch(X, T0)
|
122 |
+
reduced_feature = centroids.view(T0, P, D)
|
123 |
+
# print(f'Note: perform kmeans feature {img_feature.shape} to {reduced_feature.shape}, exit at step={exit_step}') # actually, K=T0
|
124 |
+
step_indices = [[] for _ in range(T0)]
|
125 |
+
for i in range(T0):
|
126 |
+
step_indices[i] = [j for j in range(T) if labels[j] == i]
|
127 |
+
return reduced_feature, img_similarity, [step_indices]
|
128 |
+
|
129 |
+
|
130 |
+
def weighted_kmeans_feature(img_feature, video_max_frames, weights=None):
|
131 |
+
if weights is None:
|
132 |
+
weights = torch.ones(img_feature.size(0), dtype=img_feature.dtype, device=img_feature.device)
|
133 |
+
def weighted_kmeans_torch(X, num_clusters, weights=None, distance='euclidean', tol=1e-4, max_iter=10):
|
134 |
+
indices = torch.randperm(X.size(0), device=X.device)[:num_clusters]
|
135 |
+
centroids = X[indices]
|
136 |
+
for i in range(max_iter):
|
137 |
+
if distance == 'euclidean':
|
138 |
+
dists = ((X.unsqueeze(1) - centroids.unsqueeze(0)) ** 2).sum(dim=2).sqrt()
|
139 |
+
else:
|
140 |
+
raise NotImplementedError("Only Euclidean distance is supported yet")
|
141 |
+
labels = torch.argmin(dists, dim=1)
|
142 |
+
weighted_sum = torch.zeros_like(centroids)
|
143 |
+
weights_sum = torch.zeros(num_clusters, dtype=X.dtype, device=X.device)
|
144 |
+
for j in range(num_clusters):
|
145 |
+
cluster_mask = labels == j
|
146 |
+
weighted_sum[j] = torch.sum(weights[cluster_mask, None] * X[cluster_mask], dim=0)
|
147 |
+
weights_sum[j] = torch.sum(weights[cluster_mask])
|
148 |
+
mask = weights_sum > 0
|
149 |
+
new_centroids = torch.zeros_like(weighted_sum)
|
150 |
+
new_centroids[mask] = weighted_sum[mask] / weights_sum[mask, None]
|
151 |
+
if mask.sum() < num_clusters: # fix nan centroids
|
152 |
+
new_centroids[~mask] = torch.stack([X[random.randint(0, X.size(0) - 1)] for _ in range(num_clusters - mask.sum())])
|
153 |
+
diff = torch.norm(centroids - new_centroids, dim=1).sum()
|
154 |
+
if diff < tol:
|
155 |
+
break
|
156 |
+
centroids = new_centroids
|
157 |
+
return centroids, labels, weights_sum, i
|
158 |
+
T, P, D = img_feature.shape
|
159 |
+
T0 = video_max_frames
|
160 |
+
if T <= T0:
|
161 |
+
return img_feature, weights, [[[i] for i in range(T)]]
|
162 |
+
X = img_feature.view(T, -1) # [T, P, D]
|
163 |
+
centroids, labels, weights, exit_step = weighted_kmeans_torch(X, T0, weights)
|
164 |
+
reduced_feature = centroids.view(T0, P, D)
|
165 |
+
# print(f'Note: perform weighted kmeans feature {img_feature.shape} to {reduced_feature.shape}, exit at step={exit_step}') # actually, K=T0
|
166 |
+
step_indices = [[] for _ in range(T0)]
|
167 |
+
for i in range(T0):
|
168 |
+
step_indices[i] = [j for j in range(T) if labels[j] == i]
|
169 |
+
return reduced_feature, weights, [step_indices]
|
170 |
+
|
171 |
+
|
172 |
+
def k_drop_feature(img_feature, video_max_frames, img_similarity=None):
|
173 |
+
T, P, D = img_feature.shape
|
174 |
+
indices = [[i] for i in range(T)]
|
175 |
+
T0 = video_max_frames
|
176 |
+
if T <= T0:
|
177 |
+
return img_feature, img_similarity, [indices]
|
178 |
+
cur_feature = img_feature[:T0] # [T0, P, D]
|
179 |
+
normed_cur_features = F.normalize(cur_feature.view(T0, P * D), p=2, dim=1)
|
180 |
+
cur_sim = torch.mm(normed_cur_features, normed_cur_features.T) # [T0, T0]
|
181 |
+
cur_sim.fill_diagonal_(-100.0)
|
182 |
+
cur_indices = indices[:T0]
|
183 |
+
step_indices = [cur_indices]
|
184 |
+
for i in range(T0, T):
|
185 |
+
# get new feature
|
186 |
+
new_feature = img_feature[i]
|
187 |
+
normed_new_feature = F.normalize(new_feature.view(1, P * D), p=2, dim=1)
|
188 |
+
new_sim = torch.mm(normed_cur_features, normed_new_feature.T) # [T0, 1]
|
189 |
+
all_feature = torch.cat([cur_feature, new_feature.unsqueeze(0)], dim=0)
|
190 |
+
normed_all_features = torch.cat([normed_cur_features, normed_new_feature], dim=0)
|
191 |
+
all_indices = cur_indices + [[i]]
|
192 |
+
# get new similarity
|
193 |
+
all_sim_1 = torch.cat([cur_sim, new_sim], dim=1) # [T0, T0 + 1]
|
194 |
+
all_sim = torch.cat([all_sim_1, torch.ones_like(all_sim_1[-1:]) * -100.0], dim=0) # [T0 + 1, T0 + 1]
|
195 |
+
all_sim[-1, :-1] = new_sim.T
|
196 |
+
# choose compression position
|
197 |
+
idx = torch.argmax(all_sim)
|
198 |
+
left, right = idx // (T0 + 1), idx % (T0 + 1)
|
199 |
+
if random.randint(0, 1) > 0:
|
200 |
+
idx = left
|
201 |
+
else:
|
202 |
+
idx = right
|
203 |
+
assert all_sim[left, right] == torch.max(all_sim)
|
204 |
+
# get compressed feature and similarity
|
205 |
+
cur_feature = torch.cat([all_feature[:idx], all_feature[idx + 1:]])
|
206 |
+
normed_cur_features = torch.cat([normed_all_features[:idx], normed_all_features[idx + 1:]])
|
207 |
+
cur_indices = all_indices[:idx] + all_indices[idx + 1:]
|
208 |
+
cur_sim_1 = torch.cat([all_sim[:idx], all_sim[idx + 1:]], dim=0) # [T0, T0 + 1]
|
209 |
+
cur_sim = torch.cat([cur_sim_1[:, :idx], cur_sim_1[:, idx + 1:]], dim=1) # [T0, T0]
|
210 |
+
step_indices.append(cur_indices)
|
211 |
+
# print(f'Note: perform k-drop feature {img_feature.shape} to {cur_feature.shape}')
|
212 |
+
return cur_feature, None, step_indices
|
213 |
+
|
214 |
+
|
215 |
+
def k_merge_feature(img_feature, video_max_frames, img_similarity=None):
|
216 |
+
T, P, D = img_feature.shape
|
217 |
+
indices = [[i] for i in range(T)]
|
218 |
+
T0 = video_max_frames
|
219 |
+
if T <= T0:
|
220 |
+
return img_feature, img_similarity, [indices]
|
221 |
+
cur_feature = img_feature[:T0] # [T0, P, D]
|
222 |
+
normed_cur_features = F.normalize(cur_feature.view(T0, P * D), p=2, dim=1)
|
223 |
+
cur_sim = torch.mm(normed_cur_features, normed_cur_features.T) # [T0, T0]
|
224 |
+
cur_sim.fill_diagonal_(-100.0)
|
225 |
+
cur_indices = indices[:T0]
|
226 |
+
step_indices = [cur_indices]
|
227 |
+
for i in range(T0, T):
|
228 |
+
# get new feature
|
229 |
+
new_feature = img_feature[i]
|
230 |
+
normed_new_feature = F.normalize(new_feature.view(1, P * D), p=2, dim=1)
|
231 |
+
new_sim = torch.mm(normed_cur_features, normed_new_feature.T) # [T0, 1]
|
232 |
+
all_feature = torch.cat([cur_feature, new_feature.unsqueeze(0)], dim=0)
|
233 |
+
normed_all_features = torch.cat([normed_cur_features, normed_new_feature], dim=0)
|
234 |
+
all_indices = cur_indices + [[i]]
|
235 |
+
# get new similarity
|
236 |
+
all_sim_1 = torch.cat([cur_sim, new_sim], dim=1) # [T0, T0 + 1]
|
237 |
+
all_sim = torch.cat([all_sim_1, torch.ones_like(all_sim_1[-1:]) * -100.0], dim=0) # [T0 + 1, T0 + 1]
|
238 |
+
all_sim[-1, :-1] = new_sim.T
|
239 |
+
# choose compression position
|
240 |
+
idx = torch.argmax(all_sim)
|
241 |
+
left, right = idx // (T0 + 1), idx % (T0 + 1)
|
242 |
+
assert all_sim[left, right] == torch.max(all_sim)
|
243 |
+
# update feature
|
244 |
+
all_feature[right] = (all_feature[left] + all_feature[right]) / 2.0
|
245 |
+
normed_all_features[right] = F.normalize(all_feature[right].view(1, P * D), p=2, dim=1)
|
246 |
+
all_indices[right] = all_indices[left] + all_indices[right]
|
247 |
+
# update similarity
|
248 |
+
new_sim = torch.mm(normed_all_features, normed_all_features[right:right+1].T) # [T0 + 1, 1]
|
249 |
+
all_sim[right, :] = new_sim.T
|
250 |
+
all_sim[:, right:right+1] = new_sim
|
251 |
+
all_sim[right, right] = -100.0
|
252 |
+
# get compressed feature and similarity
|
253 |
+
cur_feature = torch.cat([all_feature[:left], all_feature[left + 1:]])
|
254 |
+
normed_cur_features = torch.cat([normed_all_features[:left], normed_all_features[left + 1:]])
|
255 |
+
cur_indices = all_indices[:left] + all_indices[left + 1:]
|
256 |
+
cur_sim_1 = torch.cat([all_sim[:left], all_sim[left + 1:]], dim=0) # [T0, T0 + 1]
|
257 |
+
cur_sim = torch.cat([cur_sim_1[:, :left], cur_sim_1[:, left + 1:]], dim=1) # [T0, T0]
|
258 |
+
step_indices.append(cur_indices)
|
259 |
+
# print(f'Note: perform k-merge feature {img_feature.shape} to {cur_feature.shape}')
|
260 |
+
return cur_feature, cur_sim, step_indices
|
261 |
+
|
262 |
+
|
263 |
+
def attention_feature(img_feature, video_max_frames, attention_fn=None, update_ratio=0.2):
|
264 |
+
T, P, D = img_feature.shape
|
265 |
+
T0 = video_max_frames
|
266 |
+
if T <= T0:
|
267 |
+
return img_feature, None
|
268 |
+
cur_feature = img_feature[:T0] # [T0, P, D]
|
269 |
+
turing_memory = cur_feature.reshape(T0*P, D) # [T0*P, D]
|
270 |
+
for i in range(T0, T, T0):
|
271 |
+
j = min(i + T0, T)
|
272 |
+
new_feature = img_feature[i:j] # [P, D]
|
273 |
+
new_feature = new_feature.reshape(-1, D) # [n*P, D]
|
274 |
+
turing_memory = attention_fn(turing_memory, new_feature, update_ratio=update_ratio) # [T0*P, n*P]
|
275 |
+
cur_feature = turing_memory.reshape(T0, P, D)
|
276 |
+
# print(f'Note: perform {attention_fn.__name__} feature {img_feature.shape} to {cur_feature.shape}')
|
277 |
+
return cur_feature, None
|
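A hedged sketch of exercising these compression routines on dummy features; the shapes (40 frames, 16 patches per frame, 64-dim features, a memory budget of 8 frames) are illustrative only.

import torch
from flash_vstream.model.compress_functions import merge_feature, weighted_kmeans_feature

feats = torch.randn(40, 16, 64)                      # [T, P, D] frame features
merged, sim, steps = merge_feature(feats, video_max_frames=8)
clustered, weights, clusters = weighted_kmeans_feature(feats, video_max_frames=8)
print(merged.shape, clustered.shape)                 # both torch.Size([8, 16, 64])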
flash_vstream/model/language_model/vstream_llama.py
ADDED
@@ -0,0 +1,129 @@
+# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
+# ------------------------------------------------------------------------
+# Based on https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import List, Optional, Tuple, Union
+from transformers import AutoConfig, AutoModelForCausalLM, \
+    LlamaConfig, LlamaModel, LlamaForCausalLM
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from flash_vstream.model.vstream_arch import VStreamMetaModel, VStreamMetaForCausalLM
+
+
+class VStreamConfig(LlamaConfig):
+    model_type = "vstream"
+
+
+class VStreamLlamaModel(VStreamMetaModel, LlamaModel):
+    config_class = VStreamConfig
+
+    def __init__(self, config: LlamaConfig):
+        super(VStreamLlamaModel, self).__init__(config)
+
+
+class VStreamLlamaForCausalLM(VStreamMetaForCausalLM, LlamaForCausalLM):
+    config_class = VStreamConfig
+
+    def __init__(self, config):
+        super(VStreamLlamaForCausalLM, self).__init__(config)
+        self.model = VStreamLlamaModel(config)
+        self.pretraining_tp = config.pretraining_tp
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = True,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        features: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        if inputs_embeds is None:
+            if self.use_video_streaming_mode:
+                (
+                    input_ids,
+                    position_ids,
+                    attention_mask,
+                    past_key_values,
+                    inputs_embeds,
+                    labels
+                ) = self.prepare_inputs_labels_for_multimodal_streaming(
+                    input_ids,
+                    position_ids,
+                    attention_mask,
+                    past_key_values,
+                    labels,
+                )
+            else:
+                (
+                    input_ids,
+                    position_ids,
+                    attention_mask,
+                    past_key_values,
+                    inputs_embeds,
+                    labels
+                ) = self.prepare_inputs_labels_for_multimodal(
+                    input_ids,
+                    position_ids,
+                    attention_mask,
+                    past_key_values,
+                    labels,
+                    images,
+                    features,
+                )
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        features = kwargs.pop("features", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        if features is not None:
+            _inputs['features'] = features
+        return _inputs
+
+AutoConfig.register("vstream", VStreamConfig)
+AutoModelForCausalLM.register(VStreamConfig, VStreamLlamaForCausalLM)
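A small check of what the two register() calls at the bottom do, assuming the package and its dependencies import cleanly: any configuration whose model_type is "vstream" now resolves to VStreamConfig through the transformers Auto classes.

from transformers import AutoConfig
from flash_vstream.model.language_model.vstream_llama import VStreamConfig

cfg = AutoConfig.for_model("vstream")   # served by the registration above
assert isinstance(cfg, VStreamConfig)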
flash_vstream/model/multimodal_encoder/builder.py
ADDED
@@ -0,0 +1,13 @@
+# Based on https://github.com/haotian-liu/LLaVA.
+
+import os
+from .clip_encoder import CLIPVisionTower
+
+
+def build_vision_tower(vision_tower_cfg, **kwargs):
+    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+    is_absolute_path_exists = os.path.exists(vision_tower)
+    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
+        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+
+    raise ValueError(f'Unknown vision tower: {vision_tower}')
flash_vstream/model/multimodal_encoder/clip_encoder.py
ADDED
@@ -0,0 +1,80 @@
+# Based on https://github.com/haotian-liu/LLaVA.
+
+import torch
+import torch.nn as nn
+
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+
+
+class CLIPVisionTower(nn.Module):
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__()
+
+        self.is_loaded = False
+
+        self.vision_tower_name = vision_tower
+        self.select_layer = args.mm_vision_select_layer
+        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+        if not delay_load:
+            self.load_model()
+        else:
+            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+    def load_model(self):
+        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+        self.vision_tower.requires_grad_(False)
+
+        self.is_loaded = True
+
+    def feature_select(self, image_forward_outs):
+        image_features = image_forward_outs.hidden_states[self.select_layer]
+        if self.select_feature == 'patch':
+            image_features = image_features[:, 1:]
+        elif self.select_feature == 'cls_patch':
+            image_features = image_features
+        else:
+            raise ValueError(f'Unexpected select feature: {self.select_feature}')
+        return image_features
+
+    @torch.no_grad()
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+                image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                image_features.append(image_feature)
+        else:
+            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+            image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+        return image_features
+
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        return self.vision_tower.dtype
+
+    @property
+    def device(self):
+        return self.vision_tower.device
+
+    @property
+    def config(self):
+        if self.is_loaded:
+            return self.vision_tower.config
+        else:
+            return self.cfg_only
+
+    @property
+    def hidden_size(self):
+        return self.config.hidden_size
+
+    @property
+    def num_patches(self):
+        return (self.config.image_size // self.config.patch_size) ** 2
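A hedged construction sketch for the tower above. The checkpoint name follows the usual CLIP ViT-L/14-336 setup, which is an assumption here (this file does not pin a checkpoint), the args object only needs the two attributes the tower reads, and delay_load=True means only the config is fetched.

from types import SimpleNamespace
from flash_vstream.model.multimodal_encoder.clip_encoder import CLIPVisionTower

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature='patch')
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args, delay_load=True)
print(tower.hidden_size, tower.num_patches)  # 1024 and 576 for this particular tower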
flash_vstream/model/multimodal_projector/builder.py
ADDED
@@ -0,0 +1,51 @@
1 |
+
# Based on https://github.com/haotian-liu/LLaVA.
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
class IdentityMap(nn.Module):
|
9 |
+
def __init__(self):
|
10 |
+
super().__init__()
|
11 |
+
|
12 |
+
def forward(self, x, *args, **kwargs):
|
13 |
+
return x
|
14 |
+
|
15 |
+
@property
|
16 |
+
def config(self):
|
17 |
+
return {"mm_projector_type": 'identity'}
|
18 |
+
|
19 |
+
|
20 |
+
class SimpleResBlock(nn.Module):
|
21 |
+
def __init__(self, channels):
|
22 |
+
super().__init__()
|
23 |
+
self.pre_norm = nn.LayerNorm(channels)
|
24 |
+
|
25 |
+
self.proj = nn.Sequential(
|
26 |
+
nn.Linear(channels, channels),
|
27 |
+
nn.GELU(),
|
28 |
+
nn.Linear(channels, channels)
|
29 |
+
)
|
30 |
+
def forward(self, x):
|
31 |
+
x = self.pre_norm(x)
|
32 |
+
return x + self.proj(x)
|
33 |
+
|
34 |
+
|
35 |
+
def build_vision_projector(config, input_dim, delay_load=False, **kwargs):
|
36 |
+
projector_type = getattr(config, 'mm_projector_type', 'linear')
|
37 |
+
|
38 |
+
if projector_type == 'linear':
|
39 |
+
return nn.Linear(input_dim, config.hidden_size)
|
40 |
+
mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
|
41 |
+
if mlp_gelu_match:
|
42 |
+
mlp_depth = int(mlp_gelu_match.group(1))
|
43 |
+
modules = [nn.Linear(input_dim, config.hidden_size)]
|
44 |
+
for _ in range(1, mlp_depth):
|
45 |
+
modules.append(nn.GELU())
|
46 |
+
modules.append(nn.Linear(config.hidden_size, config.hidden_size))
|
47 |
+
return nn.Sequential(*modules)
|
48 |
+
if projector_type == 'identity':
|
49 |
+
return IdentityMap()
|
50 |
+
|
51 |
+
raise ValueError(f'Unknown projector type: {projector_type}')
|
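
A minimal usage sketch of `build_vision_projector` (illustrative only; the `mlp2x_gelu` type and the dimensions are assumptions for the example, not the released training configuration):

from types import SimpleNamespace
import torch
from flash_vstream.model.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(mm_projector_type='mlp2x_gelu', hidden_size=4096)
projector = build_vision_projector(cfg, input_dim=1024)   # Linear(1024->4096) -> GELU -> Linear(4096->4096)
vision_feats = torch.randn(2, 576, 1024)                  # [batch, vision tokens, vision dim]
print(projector(vision_feats).shape)                      # torch.Size([2, 576, 4096])
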
flash_vstream/model/vstream_arch.py
ADDED
@@ -0,0 +1,742 @@
# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
# ------------------------------------------------------------------------
# Based on https://github.com/haotian-liu/LLaVA. Below is the original copyright:
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import math
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.multiprocessing import Lock, Manager

from abc import ABC, abstractmethod
from flash_vstream.model.multimodal_encoder.builder import build_vision_tower
from flash_vstream.model.multimodal_projector.builder import build_vision_projector
from flash_vstream.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

from flash_vstream.model.compress_functions import drop_feature, merge_feature, kmeans_feature, weighted_kmeans_feature, k_drop_feature, k_merge_feature, attention_feature


class NeuralTuringMachine(nn.Module):
    def __init__(self, input_dim=1024, output_dim=1024, attention_dropout=0.1):
        super(NeuralTuringMachine, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.q_proj = nn.Linear(input_dim, output_dim)
        self.k_proj = nn.Linear(input_dim, output_dim)
        self.v_proj = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(attention_dropout)
        self.out_proj = nn.Linear(output_dim, input_dim)
        self.out_dropout = nn.Dropout(attention_dropout)
        self.out_ln = nn.LayerNorm(input_dim, eps=1e-12)

    def get_weight(self, x, y):
        query = self.q_proj(x)
        key = self.k_proj(y)
        scores = torch.matmul(query, key.transpose(0, 1)) / math.sqrt(self.output_dim)
        weight = F.softmax(scores, dim=-1)
        return weight

    def forward(self, x, y):
        query = self.q_proj(x)
        key = self.k_proj(y)
        scores = torch.matmul(query, key.transpose(0, 1)) / math.sqrt(self.output_dim)
        weight = F.softmax(scores, dim=-1)
        attn = self.dropout(weight)
        value = self.v_proj(y)
        output = torch.matmul(attn, value)
        output = self.out_proj(output)
        output = self.out_dropout(output)
        output = self.out_ln(output.unsqueeze(0)).squeeze(0)
        return output


class VStreamMetaModel:

    def __init__(self, config):
        super(VStreamMetaModel, self).__init__(config)

        self.mm_input_dim = config.mm_hidden_size
        if getattr(config, 'mm_use_4_vision_tokens', False):
            self.mm_input_dim = self.mm_input_dim * 4

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config, self.mm_input_dim)

        compress_Turing_hidden_dim = getattr(self.config, "compress_Turing_hidden_dim", 32)
        self.attention_model = NeuralTuringMachine(self.mm_input_dim, compress_Turing_hidden_dim)

    def get_vision_tower(self):
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower

        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)

            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if fsdp is not None and len(fsdp) > 0:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        self.config.compress_type = getattr(model_args, "compress_type", None)
        self.config.compress_size = getattr(model_args, "compress_size", 1)
        self.config.compress_long_memory_size = getattr(model_args, "compress_long_memory_size", 1)
        self.config.compress_Turing_memory_size = getattr(model_args, "compress_Turing_memory_size", 1)
        self.config.compress_Turing_update_ratio = getattr(model_args, "compress_Turing_update_ratio", 0.2)
        self.config.video_max_frames = getattr(model_args, "video_max_frames", 50)
        self.config.video_long_memory_length = getattr(model_args, "video_long_memory_length", 10)
        self.config.video_Turing_memory_length = getattr(model_args, "video_Turing_memory_length", 10)
        self.config.video_short_memory_length = getattr(model_args, "video_short_memory_length", 10)
        self.config.video_current_memory_length = getattr(model_args, "video_current_memory_length", 1)
        self.config.video_sample_type = getattr(model_args, "video_sample_type", "center")

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            def get_w(weights, keyword):
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))

class VStreamMetaForCausalLM(ABC):

    def __init__(self, config):
        super(VStreamMetaForCausalLM, self).__init__(config)
        # support video streaming mode
        self.use_video_streaming_mode = False
        self.video_embedding_memory = None  # set to torch.multiprocessing.Manager.list() when launching
        self.video_embedding_mem_lock = Lock()

    @abstractmethod
    def get_model(self):
        pass

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_images(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        return image_features

    def reshape_2x2_image_features(self, image_features):
        B, P, D = image_features.shape
        patch_size = round(math.sqrt(P))
        assert patch_size % 2 == 0, "Patch size must be divisible by 2."
        image_features = image_features.reshape(B, patch_size, patch_size, D)
        image_features_2x2 = image_features.reshape(B, patch_size // 2, 2, patch_size // 2, 2, D)
        image_features_2x2 = image_features_2x2.permute(0, 1, 3, 2, 4, 5)
        image_features_2x2 = image_features_2x2.reshape(B, patch_size // 2, patch_size // 2, 4 * D)  # concat 2x2 neighbor patches
        image_features = image_features_2x2.reshape(B, (patch_size // 2) ** 2, 4 * D)
        return image_features

    def attention(self, turing_memory, new_feature, update_ratio=0.2):
        T1, D1 = turing_memory.shape
        T2, D2 = new_feature.shape
        assert D1 == D2, f"dimension mismatch, {D1} != {D2}"
        model = self.get_model().attention_model
        weight = model.get_weight(turing_memory, new_feature)
        weight = weight * update_ratio  # [T1, T2]
        decay = weight.sum(dim=1, keepdim=True)  # [T0*P, 1], similarity between the current NTM memory and the incoming features
        turing_memory = turing_memory * (1 - decay) + torch.mm(weight, new_feature)
        return turing_memory

    def attention2(self, turing_memory, new_feature, update_ratio=0.2):  # deprecated
        T1, D1 = turing_memory.shape
        T2, D2 = new_feature.shape
        assert D1 == D2, f"dimension mismatch, {D1} != {D2}"
        model = self.get_model().attention_model
        turing_memory = model.forward(turing_memory, new_feature)
        return turing_memory

    def compress_spatial_features(self, image_features, compress_size=1):
        compress_type = getattr(self.config, "compress_type", None)
        patch_size = round(math.sqrt(image_features.shape[1]))
        assert patch_size * patch_size == image_features.shape[1], f"For ViT feature map, {patch_size}*{patch_size}={patch_size**2} != {image_features.shape[1]}"
        if patch_size == compress_size:
            return image_features
        elif compress_type is not None:
            if 'mean' in self.config.compress_type:
                # TODO: currently use 1 token per frame (or image), direct pooling
                if compress_size == 1:
                    image_features = image_features.mean(dim=1, keepdim=True)
                else:
                    image_features = image_features.view(-1, patch_size, patch_size, image_features.shape[-1])
                    image_features = image_features.permute(0, 3, 1, 2)  # [B*T, D, P, P]
                    pooled_features = F.avg_pool2d(image_features, (patch_size // compress_size, patch_size // compress_size))
                    pooled_features = pooled_features.permute(0, 2, 3, 1)  # [B*T, P, P, D]
                    image_features = pooled_features.view(-1, compress_size * compress_size, pooled_features.shape[-1])
            else:
                raise NotImplementedError(f"`compress_type` {self.config.compress_type} is not supported yet.")
        return image_features

    def compress_temporal_features(self, image_features):
        video_long_memory_length = getattr(self.config, "video_long_memory_length", 10)
        video_Turing_memory_length = getattr(self.config, "video_Turing_memory_length", 10)
        video_short_memory_length = getattr(self.config, "video_short_memory_length", 10)  # not used
        video_current_memory_length = getattr(self.config, "video_current_memory_length", 1)
        compress_long_memory_size = getattr(self.config, "compress_long_memory_size", 1)
        compress_Turing_memory_size = getattr(self.config, "compress_Turing_memory_size", 1)
        compress_Turing_update_ratio = getattr(self.config, "compress_Turing_update_ratio", 0.2)
        compress_fn_dic = {
            'drop': drop_feature,
            'merge': merge_feature,
            'kmeans': kmeans_feature,
            'weighted_kmeans': weighted_kmeans_feature,
            'kdrop': k_drop_feature,
            'kmerge': k_merge_feature,
            'attention': attention_feature,
        }
        compress_type = self.config.video_sample_type
        if compress_type in compress_fn_dic:
            compress_fn = compress_fn_dic[compress_type]
        else:
            raise NotImplementedError(f'max_length = {self.config.video_max_frames},'
                                      f'while video_sample_type = {compress_type} is not supported yet.')
        new_image_features = []
        step_indices = []
        step_features = []
        for img_feature in image_features:  # [T, P*P, D]
            cur_start = min(video_current_memory_length, img_feature.shape[0])
            ### Calc Spatial Memory
            if cur_start == 0:
                cur_memory = img_feature[:0]
                long_memory = img_feature
                Turing_memory = img_feature
            else:
                cur_memory = img_feature[-cur_start:]  # [C, P*P, D]
                long_memory = img_feature[:-cur_start]  # [L, P*P, D]
                Turing_memory = img_feature[:-cur_start]  # [L, P*P, D]
            if compress_long_memory_size * compress_long_memory_size != long_memory.shape[1]:
                long_memory = self.compress_spatial_features(long_memory, compress_long_memory_size)  # [L, P'*P', D]
            if compress_Turing_memory_size * compress_Turing_memory_size != Turing_memory.shape[1]:
                Turing_memory = self.compress_spatial_features(Turing_memory, compress_Turing_memory_size)  # [L, P'*P', D]
            ### Calc Temporal Memory
            if video_long_memory_length == 0 or long_memory.shape[0] == 0:
                long_memory_compreesed = long_memory[:0]
            else:
                long_memory_compreesed, weight, step_long_indices = compress_fn(long_memory, video_long_memory_length)  # [L_long, P'*P', D], [L_long]
                ### Calc Retrieved Memory
                sorted_indices = torch.argsort(weight, descending=True)  # [L_long]
                key_centroids = long_memory[sorted_indices]  # [L_long, P'*P', D]
                key_length = 3
                if key_centroids.shape[0] > key_length:
                    key_centroids = key_centroids[:key_length]
                dists = ((long_memory.unsqueeze(1) - key_centroids.unsqueeze(0)) ** 2).sum(dim=3).sum(dim=2).sqrt()  # [L_long, k_L]
                min_indices = torch.argmin(dists, dim=0)  # [k_L]
                key_memory = img_feature[min_indices]
                cur_memory = torch.cat([key_memory, cur_memory], dim=0)
            ### Calc Abstract Memory
            if video_Turing_memory_length == 0 or Turing_memory.shape[0] == 0:
                Turing_memory_compreesed = Turing_memory[:0]
            else:
                Turing_memory_compreesed, _ = attention_feature(Turing_memory, video_Turing_memory_length, self.attention, update_ratio=compress_Turing_update_ratio)
            memory_feature = torch.cat([Turing_memory_compreesed.flatten(0, 1), long_memory_compreesed.flatten(0, 1), cur_memory.flatten(0, 1)], dim=0)
            new_image_features.append(memory_feature)
        return new_image_features

    def cat_proj(self, all_features):  # concatenate features and project them together
        feature_split_size = [x.shape[0] for x in all_features]
        feature_embed = torch.cat(all_features, dim=0)
        feature_proj = self.get_model().mm_projector(feature_embed)
        feature_proj = torch.split(feature_proj, feature_split_size, dim=0)
        return feature_proj

    def prepare_inputs_labels_for_multimodal(
        self,
        input_ids,
        position_ids,
        attention_mask,
        past_key_values,
        labels,
        images,
        features
    ):
        vision_tower = self.get_vision_tower()
        if vision_tower is None or (images is None and features is None) or input_ids.shape[1] == 1:
            if past_key_values is not None and vision_tower is not None and ((images is not None) or (features is not None)) and input_ids.shape[1] == 1:
                target_shape = past_key_values[-1][-1].shape[-2] + 1
                if target_shape - attention_mask.shape[1] >= 0:
                    attention_mask = torch.cat((attention_mask, torch.ones(
                        (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
                        dtype=attention_mask.dtype,
                        device=attention_mask.device
                    )), dim=1)
                elif target_shape - attention_mask.shape[1] < 0:
                    attention_mask = attention_mask[:, :target_shape]
                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
            return input_ids, position_ids, attention_mask, past_key_values, None, labels

        if (features is not None) or (type(images) is list) or (images.ndim == 5):
            compress_size = getattr(self.config, "compress_size", 1)
            if images is not None:
                images = [image if len(image.shape) == 4 else image.unsqueeze(0) for image in images]  # [B, T, C, H, W]
                concat_images = torch.cat([image for image in images], dim=0)  # [B*T, C, H, W]
                image_features = self.encode_images(concat_images)  # [B*T, P, D]
                if getattr(self.config, 'mm_use_4_vision_tokens', False):
                    image_features = self.reshape_2x2_image_features(image_features)  # [B*T, P/4, 4*D]
                image_features = self.compress_spatial_features(image_features, compress_size)  # [B*T, P', D]
                split_sizes = [image.shape[0] for image in images]
                image_features = torch.split(image_features, split_sizes, dim=0)  # [B, T, P, D]
            else:
                image_features = [feat if len(feat.shape) == 3 else feat.unsqueeze(0) for feat in features]
                origin_img_features = image_features
                if getattr(self.config, 'mm_use_4_vision_tokens', False):
                    image_features = [self.reshape_2x2_image_features(img_feature) for img_feature in image_features]  # [B*T, P/4, 4*D]
                image_features = [self.compress_spatial_features(image_feature, compress_size) for image_feature in image_features]  # [B*T, P', D]
            # perform memory consolidation
            image_features = self.compress_temporal_features(image_features)  # [B, TP, D]
            image_features = [x.to(self.device) for x in image_features]  # [B, TP, D]
            image_features = self.cat_proj(image_features)
        else:
            image_features = self.encode_images(images).to(self.device)  # [B, 576, 2048]
            if getattr(self.config, 'mm_use_4_vision_tokens', False):
                image_features = self.reshape_2x2_image_features(image_features)  # [B*T, P/4, 4*D]
            image_features = self.get_model().mm_projector(image_features)

        # TODO: image start / end is not implemented here to support pretraining.
        if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
            raise NotImplementedError

        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
        if labels is None:
            labels = torch.full_like(input_ids, IGNORE_INDEX)

        # remove the padding using attention_mask -- TODO: double check
        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
        new_input_embeds = []
        new_labels = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                new_input_embeds.append(cur_input_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]  # only input first image_token
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
            cur_new_input_embeds = []
            cur_new_labels = []

            for i in range(num_images + 1):
                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    cur_image_features = image_features[cur_image_idx]
                    cur_image_idx += 1
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_input_embeds.append(cur_new_input_embeds)
            new_labels.append(cur_new_labels)
            assert cur_image_idx == batch_idx + 1

        # Truncate sequences to max length as image embeddings can make the sequence longer
        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
        if tokenizer_model_max_length is not None:
            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

        # Combine them
        max_len = max(x.shape[0] for x in new_input_embeds)
        batch_size = len(new_input_embeds)

        new_input_embeds_padded = []
        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)

        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
            cur_len = cur_new_embed.shape[0]
            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
                new_input_embeds_padded.append(torch.cat((
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
                    cur_new_embed
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, -cur_len:] = cur_new_labels
                    attention_mask[i, -cur_len:] = True
                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
            else:
                new_input_embeds_padded.append(torch.cat((
                    cur_new_embed,
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, :cur_len] = cur_new_labels
                    attention_mask[i, :cur_len] = True
                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)

        if _labels is None:
            new_labels = None
        else:
            new_labels = new_labels_padded

        if _attention_mask is None:
            attention_mask = None
        else:
            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

        if _position_ids is None:
            position_ids = None
        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

    def prepare_inputs_labels_for_multimodal_streaming(  # Asynchronous encoding with a SemLock, only for videos, batch_size=1
        self,
        input_ids,
        position_ids,
        attention_mask,
        past_key_values,
        labels
    ):
        assert self.use_video_streaming_mode
        logger = logging.getLogger(__name__)
        vision_tower = self.get_vision_tower()
        if vision_tower is None or input_ids.shape[1] == 1:
            if past_key_values is not None and vision_tower is not None and input_ids.shape[1] == 1:
                target_shape = past_key_values[-1][-1].shape[-2] + 1
                if target_shape - attention_mask.shape[1] >= 0:
                    attention_mask = torch.cat((attention_mask, torch.ones(
                        (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
                        dtype=attention_mask.dtype,
                        device=attention_mask.device
                    )), dim=1)
                elif target_shape - attention_mask.shape[1] < 0:
                    attention_mask = attention_mask[:, :target_shape]
                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
            return input_ids, position_ids, attention_mask, past_key_values, None, labels
        # Have some tries to avoid deadlock
        attempt_times = 0
        while attempt_times < 300:
            try:
                with self.video_embedding_mem_lock:
                    cur_memory, long_memory_compreesed, Turing_memory_compreesed, _ = self.video_embedding_memory
                    logger.info(f'Read cur_memory={cur_memory.shape} {cur_memory.dtype}, long_memory_compreesed={long_memory_compreesed.shape} {long_memory_compreesed.dtype}, Turing_memory_compreesed={Turing_memory_compreesed.shape} {Turing_memory_compreesed.dtype}')
                image_feature = torch.cat([Turing_memory_compreesed.flatten(0, 1), long_memory_compreesed.flatten(0, 1), cur_memory.flatten(0, 1)], dim=0)
                image_features = [image_feature.to(self.device)]
                break

            except Exception as e:
                logger.error(f'Attempt:{attempt_times} Failed to get video features, Error: {e}')
                image_features = []
                time.sleep(0.1)
                attempt_times += 1

        image_features = [x.to(self.device) for x in image_features]  # [B, TP, D]
        image_features = self.cat_proj(image_features)

        # TODO: image start / end is not implemented here to support pretraining.
        if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
            raise NotImplementedError

        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
        if labels is None:
            labels = torch.full_like(input_ids, IGNORE_INDEX)

        # remove the padding using attention_mask -- TODO: double check
        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]

        new_input_embeds = []
        new_labels = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                new_input_embeds.append(cur_input_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]  # only input first image_token
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
            cur_new_input_embeds = []
            cur_new_labels = []

            for i in range(num_images + 1):
                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    cur_image_features = image_features[cur_image_idx]
                    cur_image_idx += 1
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_input_embeds.append(cur_new_input_embeds)
            new_labels.append(cur_new_labels)
            assert cur_image_idx == batch_idx + 1

        # Truncate sequences to max length as image embeddings can make the sequence longer
        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
        if tokenizer_model_max_length is not None:
            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

        # Combine them
        max_len = max(x.shape[0] for x in new_input_embeds)
        batch_size = len(new_input_embeds)

        new_input_embeds_padded = []
        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)

        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
            cur_len = cur_new_embed.shape[0]
            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
                new_input_embeds_padded.append(torch.cat((
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
                    cur_new_embed
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, -cur_len:] = cur_new_labels
                    attention_mask[i, -cur_len:] = True
                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
            else:
                new_input_embeds_padded.append(torch.cat((
                    cur_new_embed,
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, :cur_len] = cur_new_labels
                    attention_mask[i, :cur_len] = True
                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)

        if _labels is None:
            new_labels = None
        else:
            new_labels = new_labels_padded

        if _attention_mask is None:
            attention_mask = None
        else:
            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

        if _position_ids is None:
            position_ids = None
        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

    def embed_video_streaming(  # Asynchronous encoding with a SemLock, only for videos, batch_size=1
        self,
        images
    ):
        assert self.use_video_streaming_mode
        logger = logging.getLogger(__name__)

        compress_size = getattr(self.config, "compress_size", 1)
        video_long_memory_length = getattr(self.config, "video_long_memory_length", 10)
        video_Turing_memory_length = getattr(self.config, "video_Turing_memory_length", 10)
        video_short_memory_length = getattr(self.config, "video_short_memory_length", 10)  # not used
        video_current_memory_length = getattr(self.config, "video_current_memory_length", 1)
        compress_long_memory_size = getattr(self.config, "compress_long_memory_size", 1)
        compress_Turing_memory_size = getattr(self.config, "compress_Turing_memory_size", 1)
        compress_Turing_update_ratio = getattr(self.config, "compress_Turing_update_ratio", 0.2)
        compress_fn_dic = {
            'drop': drop_feature,
            'merge': merge_feature,
            'kmeans': kmeans_feature,
            'weighted_kmeans': weighted_kmeans_feature,
            'kdrop': k_drop_feature,
            'kmerge': k_merge_feature,
            'uni_kmerge': k_merge_feature,
            'both_kmerge': k_merge_feature,
            'split_kmerge': k_merge_feature,
            'attention': attention_feature,
        }

        if type(images) is list or images.ndim == 5:
            assert len(images) == 1
            images = [image if len(image.shape) == 4 else image.unsqueeze(0) for image in images]  # [B, T, C, H, W]
            concat_images = torch.cat([image for image in images], dim=0)  # [B*T, C, H, W]
            image_features = self.encode_images(concat_images)  # [B*T, P, D]
            image_features = self.compress_spatial_features(image_features, compress_size)  # [B*T, P', D]
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)  # [B, T, P, D]
        else:
            raise NotImplementedError('Should input video frames, not a single image')
        image_feature = image_features[0].detach().to(torch.float16).to(self.device)  # [T, P, D]
        img_feature_buffer = image_feature.cpu()

        cur_start = min(video_current_memory_length, image_feature.shape[0])
        if cur_start == 0:
            cur_memory = image_feature[:0]
        else:
            cur_memory = image_feature[-cur_start:]  # [L_c, P*P, D]
        long_memory = image_feature
        Turing_memory = image_feature
        if compress_long_memory_size * compress_long_memory_size != long_memory.shape[1]:
            long_memory = self.compress_spatial_features(long_memory, compress_long_memory_size)  # [L_l, P'*P', D]
        if compress_Turing_memory_size * compress_Turing_memory_size != Turing_memory.shape[1]:
            Turing_memory = self.compress_spatial_features(Turing_memory, compress_Turing_memory_size)  # [L_t, P'*P', D]
        compress_type = self.config.video_sample_type
        if compress_type in compress_fn_dic:
            compress_fn = compress_fn_dic[compress_type]
        else:
            raise NotImplementedError(f'max_length = {self.config.video_max_frames},'
                                      f'while video_sample_type = {compress_type} is not supported yet.')
        long_memory_compreesed = long_memory
        Turing_memory_compreesed = Turing_memory
        # Read old memory from shared memory, do not need an I/O lock
        if self.video_embedding_memory is not None and len(self.video_embedding_memory) > 0:
            old_cur_memory, old_long_memory_compreesed, old_Turing_memory_compreesed, old_img_feature_buffer = self.video_embedding_memory
            old_long_memory_compreesed = old_long_memory_compreesed.to(self.device)
            old_Turing_memory_compreesed = old_Turing_memory_compreesed.to(self.device)
            img_feature_buffer = torch.cat([old_img_feature_buffer, image_feature.cpu()], dim=0)
            assert isinstance(old_long_memory_compreesed, torch.Tensor) and old_long_memory_compreesed.shape[1:] == long_memory_compreesed.shape[1:]
            long_memory = torch.cat((old_long_memory_compreesed, long_memory_compreesed), dim=0)
            long_memory_compreesed, weight, step_long_indices = compress_fn(long_memory, video_long_memory_length)
            # Retrieve key frames
            sorted_indices = torch.argsort(weight, descending=True)  # [L_long]
            key_centroids = long_memory[sorted_indices]  # [L_long, P'*P', D]
            key_length = 3
            if key_centroids.shape[0] > key_length:
                key_centroids = key_centroids[:key_length]
            dists = ((long_memory.unsqueeze(1) - key_centroids.unsqueeze(0)) ** 2).sum(dim=3).sum(dim=2).sqrt()  # [L_long, k_L]
            min_indices = torch.argmin(dists, dim=0)  # [k_L]
            key_memory = img_feature_buffer[min_indices.cpu()].to(self.device)
            cur_memory = torch.cat([key_memory, cur_memory], dim=0)
            Turing_memory = torch.cat((old_Turing_memory_compreesed, Turing_memory_compreesed), dim=0)
            Turing_memory_compreesed, _ = attention_feature(Turing_memory, video_Turing_memory_length, self.attention, update_ratio=compress_Turing_update_ratio)
        # Write to shared memory, need an I/O lock
        with self.video_embedding_mem_lock:
            self.video_embedding_memory[:] = [cur_memory.cpu(), long_memory_compreesed.cpu(), Turing_memory_compreesed.cpu(), img_feature_buffer]  # Only change content
            logger.info(f'Write cur_memory={cur_memory.shape} {cur_memory.dtype}, long_memory_compreesed={long_memory_compreesed.shape} {long_memory_compreesed.dtype}, Turing_memory_compreesed={Turing_memory_compreesed.shape} {Turing_memory_compreesed.dtype}')

        return []


    def initialize_vision_tokenizer(self, model_args, tokenizer):
        if model_args.mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 2
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False
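
For reference, a self-contained sketch of the abstract-memory write performed by `attention` above: similarities are softmax-normalized, scaled by `update_ratio`, and blended into the fixed-size memory with a matching decay. The plain dot-product scores here stand in for the learned `q_proj`/`k_proj`, and the sizes are illustrative only.

import math
import torch
import torch.nn.functional as F

T_mem, T_new, D = 25, 4, 1024
memory = torch.randn(T_mem, D)                    # flattened Turing memory, [T_mem, D]
new_feat = torch.randn(T_new, D)                  # incoming frame features, [T_new, D]

scores = memory @ new_feat.T / math.sqrt(D)       # stand-in for the q_proj/k_proj attention scores
w = F.softmax(scores, dim=-1) * 0.2               # update_ratio = 0.2
decay = w.sum(dim=1, keepdim=True)                # how strongly each memory slot gets overwritten
memory = memory * (1 - decay) + w @ new_feat      # same blend as turing_memory * (1 - decay) + mm(weight, new_feature)
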
flash_vstream/serve/cli_video_stream.py
ADDED
@@ -0,0 +1,351 @@
# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
# Based on https://github.com/haotian-liu/LLaVA.
"""
This file demonstrates an implementation of a multiprocess real-time long video understanding system, with a multiprocess logging module.
main process: CLI server I/O, LLM inference
process-1: logger listener
process-2: frame generator
process-3: frame memory manager
Author: Haoji Zhang, Haotian Liu
(This code is based on https://github.com/haotian-liu/LLaVA)
"""
import argparse
import requests
import logging
import torch
import numpy as np
import time
import os

from flash_vstream.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from flash_vstream.conversation import conv_templates, SeparatorStyle
from flash_vstream.model.builder import load_pretrained_model
from flash_vstream.utils import disable_torch_init
from flash_vstream.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from torch.multiprocessing import Process, Queue, Manager
from transformers import TextStreamer
from decord import VideoReader
from datetime import datetime
from PIL import Image
from io import BytesIO

class _Metric:
    def __init__(self):
        self._latest_value = None
        self._sum = 0.0
        self._max = 0.0
        self._count = 0

    @property
    def val(self):
        return self._latest_value

    @property
    def max(self):
        return self._max

    @property
    def avg(self):
        if self._count == 0:
            return float('nan')
        return self._sum / self._count

    def add(self, value):
        self._latest_value = value
        self._sum += value
        self._count += 1
        if value > self._max:
            self._max = value

    def __str__(self):
        latest_formatted = f"{self.val:.6f}" if self.val is not None else "None"
        average_formatted = f"{self.avg:.6f}"
        max_formatted = f"{self.max:.6f}"
        return f"{latest_formatted} ({average_formatted}, {max_formatted})"


class MetricMeter:
    def __init__(self):
        self._metrics = {}

    def add(self, key, value):
        if key not in self._metrics:
            self._metrics[key] = _Metric()
        self._metrics[key].add(value)

    def val(self, key):
        metric = self._metrics.get(key)
        if metric is None or metric.val is None:
            raise ValueError(f"No values have been added for key '{key}'.")
        return metric.val

    def avg(self, key):
        metric = self._metrics.get(key)
        if metric is None:
            raise ValueError(f"No values have been added for key '{key}'.")
        return metric.avg

    def max(self, key):
        metric = self._metrics.get(key)
        if metric is None:
            raise ValueError(f"No values have been added for key '{key}'.")
        return metric.max

    def __getitem__(self, key):
        metric = self._metrics.get(key)
        if metric is None:
            raise KeyError(f"The key '{key}' does not exist.")
        return str(metric)

def load_image(image_file):
    if image_file.startswith('http://') or image_file.startswith('https://'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    return image

def listener(queue, filename):
    ############## Start sub process-1: Listener #############
    import sys, traceback
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    # h = logging.StreamHandler(sys.stdout)
    h = logging.FileHandler(filename)
    f = logging.Formatter('%(asctime)s %(processName)-10s %(name)s %(levelname)-8s %(message)s')
    h.setFormatter(f)
    root.addHandler(h)
    while True:
        try:
            record = queue.get()
            if record is None:  # None is a signal to finish
                break
            logger = logging.getLogger(record.name)
            logger.handle(record)  # No level or filter logic applied - just do it!
        except Exception:
            import sys, traceback
            print('Whoops! Problem:', file=sys.stderr)
            traceback.print_exc(file=sys.stderr)

def worker_configurer(queue):
    h = logging.handlers.QueueHandler(queue)  # Just the one handler needed
    root = logging.getLogger()
    root.addHandler(h)
    root.setLevel(logging.DEBUG)

def video_stream_similator(video_file, frame_queue, log_queue, video_fps=1.0, play_speed=1.0):
    ############## Start sub process-2: Simulator #############
    worker_configurer(log_queue)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    vr = VideoReader(video_file)
    sample_fps = round(vr.get_avg_fps() / video_fps)
    frame_idx = [i for i in range(0, len(vr), sample_fps)]
    video = vr.get_batch(frame_idx).asnumpy()
    video = np.repeat(video, 6, axis=0)
    length = video.shape[0]
    sleep_time = 1 / video_fps / play_speed
    time_meter = MetricMeter()
    logger.info(f'Simulator Process: start, length = {length}')
    try:
        for start in range(0, length):
            start_time = time.perf_counter()
            end = min(start + 1, length)
            video_clip = video[start:end]
            frame_queue.put(video_clip)
            if start > 0:
                time_meter.add('real_sleep', start_time - last_start)
                logger.info(f'Simulator: write {end - start} frames,\t{start} to {end},\treal_sleep={time_meter["real_sleep"]}')
            if end < length:
                time.sleep(sleep_time)
            last_start = start_time
        frame_queue.put(None)
    except Exception as e:
        print(f'Simulator Exception: {e}')
        time.sleep(0.1)
    logger.info(f'Simulator Process: end')

def frame_memory_manager(model, image_processor, frame_queue, log_queue):
    ############## Start sub process-3: Memory Manager #############
    worker_configurer(log_queue)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    time_meter = MetricMeter()
    logger.info(f'MemManager Process: start')
    frame_cnt = 0
    while True:
        try:
            video_clip = frame_queue.get()
            start_time = time.perf_counter()
            if video_clip is None:
                logger.info(f'MemManager: Ooops, get None')
                break
            logger.info(f'MemManager: get {video_clip.shape[0]} frames from queue')
            image = image_processor.preprocess(video_clip, return_tensors='pt')['pixel_values']
            image = image.unsqueeze(0)
            image_tensor = image.to(model.device, dtype=torch.float16)
            # time_2 = time.perf_counter()
            logger.info(f'MemManager: Start embedding')
            with torch.inference_mode():
                model.embed_video_streaming(image_tensor)
            logger.info(f'MemManager: End embedding')
            end_time = time.perf_counter()
            if frame_cnt > 0:
                time_meter.add('memory_latency', end_time - start_time)
                logger.info(f'MemManager: embedded {video_clip.shape[0]} frames,\tidx={frame_cnt},\tmemory_latency={time_meter["memory_latency"]}')
            else:
                logger.info(f'MemManager: embedded {video_clip.shape[0]} frames,\tidx={frame_cnt},\tmemory_latency={end_time - start_time:.6f}, not logged')
            frame_cnt += video_clip.shape[0]
        except Exception as e:
            print(f'MemManager Exception: {e}')
            time.sleep(0.1)
    logger.info(f'MemManager Process: end')

def main(args):
    # torch.multiprocessing.log_to_stderr(logging.DEBUG)
    torch.multiprocessing.set_start_method('spawn', force=True)
    disable_torch_init()

    log_queue = Queue()
    frame_queue = Queue(maxsize=10)
    processes = []

    ############## Start listener process #############
    p1 = Process(target=listener, args=(log_queue, args.log_file))
    processes.append(p1)
    p1.start()

    ############## Start main process #############
    worker_configurer(log_queue)
    logger = logging.getLogger(__name__)

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)

    logger.info(f'Using conv_mode={args.conv_mode}')

    conv = conv_templates[args.conv_mode].copy()
    if "mpt" in model_name.lower():
        roles = ('user', 'assistant')
    else:
        roles = conv.roles

    with Manager() as manager:
        image_tensor = None
        model.use_video_streaming_mode = True
        model.video_embedding_memory = manager.list()
        if args.video_max_frames is not None:
            model.config.video_max_frames = args.video_max_frames
            logger.info(f'Important: set model.config.video_max_frames = {model.config.video_max_frames}')

        logger.info(f'Important: set video_fps = {args.video_fps}')
        logger.info(f'Important: set play_speed = {args.play_speed}')

        ############## Start simulator process #############
        p2 = Process(target=video_stream_similator,
                     args=(args.video_file, frame_queue, log_queue, args.video_fps, args.play_speed))
        processes.append(p2)
        p2.start()

        ############## Start memory manager process #############
        p3 = Process(target=frame_memory_manager,
                     args=(model, image_processor, frame_queue, log_queue))
        processes.append(p3)
        p3.start()

        # start QA server
        start_time = datetime.now()
        time_meter = MetricMeter()
        conv_cnt = 0
        while True:
            time.sleep(5)
            try:
                # inp = input(f"{roles[0]}: ")
                inp = "what is in the video?"
            except EOFError:
                inp = ""
            if not inp:
                print("exit...")
                break

            # Get the current time
            now = datetime.now()
            conv_start_time = time.perf_counter()
            # Format the current time as a string
            current_time = now.strftime("%H:%M:%S")
            duration = now.timestamp() - start_time.timestamp()

            # Print the current time
            print("\nCurrent Time:", current_time, "Run for:", duration)
            print(f"{roles[0]}: {inp}", end="\n")
            print(f"{roles[1]}: ", end="")
            # every conversation is a new conversation
            conv = conv_templates[args.conv_mode].copy()
            inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
            conv.append_message(conv.roles[0], inp)

            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
            keywords = [stop_str]
            stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
            streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

            llm_start_time = time.perf_counter()
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor,
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    max_new_tokens=args.max_new_tokens,
                    streamer=streamer,
                    use_cache=True,
                    stopping_criteria=[stopping_criteria]
                )
            llm_end_time = time.perf_counter()

            outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
            conv.messages[-1][-1] = outputs
            conv_end_time = time.perf_counter()
            if conv_cnt > 0:
                time_meter.add('conv_latency', conv_end_time - conv_start_time)
                time_meter.add('llm_latency', llm_end_time - llm_start_time)
                time_meter.add('real_sleep', conv_start_time - last_conv_start_time)
                logger.info(f'CliServer: idx={conv_cnt},\treal_sleep={time_meter["real_sleep"]},\tconv_latency={time_meter["conv_latency"]},\tllm_latency={time_meter["llm_latency"]}')
            else:
                logger.info(f'CliServer: idx={conv_cnt},\tconv_latency={conv_end_time - conv_start_time},\tllm_latency={llm_end_time - llm_start_time}')
            conv_cnt += 1
            last_conv_start_time = conv_start_time

        for p in processes:
            p.terminate()
        print("All processes finished.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-file", type=str, default=None)
    parser.add_argument("--video-file", type=str, default=None)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--conv-mode", type=str, default="vicuna_v1")
|
339 |
+
parser.add_argument("--temperature", type=float, default=0.2)
|
340 |
+
parser.add_argument("--max-new-tokens", type=int, default=512)
|
341 |
+
parser.add_argument("--load-8bit", action="store_true")
|
342 |
+
parser.add_argument("--load-4bit", action="store_true")
|
343 |
+
parser.add_argument("--debug", action="store_true")
|
344 |
+
|
345 |
+
parser.add_argument("--log-file", type=str, default="tmp_cli.log")
|
346 |
+
parser.add_argument("--use_1process", action="store_true")
|
347 |
+
parser.add_argument("--video_max_frames", type=int, default=None)
|
348 |
+
parser.add_argument("--video_fps", type=float, default=1.0)
|
349 |
+
parser.add_argument("--play_speed", type=float, default=1.0)
|
350 |
+
args = parser.parse_args()
|
351 |
+
main(args)
|
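Illustrative sketch, not part of the committed files: the two worker processes above communicate through a bounded frame queue, with the stream simulator as producer, the memory manager as consumer, and None as the end-of-stream sentinel. The snippet below mirrors that hand-off in isolation, using the standard multiprocessing module and dummy NumPy clips in place of the real decoder, image processor, and model call.

import time
import numpy as np
from multiprocessing import Process, Queue

def simulator(frame_queue):
    # Producer: pretend to decode a video and push clips of 4 frames each.
    for _ in range(5):
        fake_clip = np.zeros((4, 224, 224, 3), dtype=np.uint8)  # placeholder frames
        frame_queue.put(fake_clip)
        time.sleep(0.1)          # stands in for real-time playback pacing
    frame_queue.put(None)        # sentinel, same convention as above

def memory_manager(frame_queue):
    # Consumer: drain the queue until the sentinel arrives.
    frame_cnt = 0
    while True:
        clip = frame_queue.get()
        if clip is None:
            break
        # the real process would run image_processor + model.embed_video_streaming here
        frame_cnt += clip.shape[0]
        print(f"embedded {clip.shape[0]} frames, total {frame_cnt}")

if __name__ == "__main__":
    q = Queue(maxsize=10)        # bounded, like frame_queue above
    producer = Process(target=simulator, args=(q,))
    consumer = Process(target=memory_manager, args=(q,))
    producer.start(); consumer.start()
    producer.join(); consumer.join()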
flash_vstream/serve/demo.py
ADDED
@@ -0,0 +1,144 @@
import torch
from ..constants import *
from ..conversation import conv_templates, SeparatorStyle
from ..model.builder import load_pretrained_model
from ..utils import disable_torch_init
from ..mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
from PIL import Image
import os
from decord import VideoReader, cpu
import numpy as np


class Chat:
    def __init__(self, model_path, conv_mode="simple", load_8bit=False, load_4bit=False):
        disable_torch_init()
        self.tokenizer, self.model, self.image_processor, context_len = load_pretrained_model(model_path, None, model_name="ChatUniVi", load_8bit=load_8bit, load_4bit=load_4bit)

        mm_use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(self.model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            self.tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            self.tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        self.model.resize_token_embeddings(len(self.tokenizer))

        vision_tower = self.model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()

        self.image_processor = vision_tower.image_processor
        self.conv_mode = conv_mode
        print(self.model)

    def get_prompt(self, qs, state):
        state.append_message(state.roles[0], qs)
        state.append_message(state.roles[1], None)
        return state

    def _get_rawvideo_dec(self, video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224,
                          video_framerate=1, s=None, e=None):
        if s is None:
            start_time, end_time = None, None
        else:
            start_time = int(s)
            end_time = int(e)
            start_time = start_time if start_time >= 0. else 0.
            end_time = end_time if end_time >= 0. else 0.
            if start_time > end_time:
                start_time, end_time = end_time, start_time
            elif start_time == end_time:
                end_time = start_time + 1

        if os.path.exists(video_path):
            vreader = VideoReader(video_path, ctx=cpu(0))
        else:
            print(video_path)
            raise FileNotFoundError

        fps = vreader.get_avg_fps()
        f_start = 0 if start_time is None else int(start_time * fps)
        f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
        num_frames = f_end - f_start + 1
        if num_frames > 0:
            sample_fps = int(video_framerate)
            t_stride = int(round(float(fps) / sample_fps))

            all_pos = list(range(f_start, f_end + 1, t_stride))
            if len(all_pos) > max_frames:
                sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
            else:
                sample_pos = all_pos

            patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
            return patch_images

    @torch.inference_mode()
    def generate(self, images_tensor: list, prompt: str, first_run: bool, state):
        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor

        state = self.get_prompt(prompt, state)
        prompt = state.get_prompt()
        print(prompt)

        images_tensor = torch.stack(images_tensor, dim=0)
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        temperature = 0.2
        max_new_tokens = 1024

        stop_str = conv_templates[self.conv_mode].copy().sep if conv_templates[self.conv_mode].copy().sep_style != SeparatorStyle.TWO else \
            conv_templates[self.conv_mode].copy().sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images_tensor,
                do_sample=True,
                temperature=temperature,
                num_beams=1,
                max_new_tokens=max_new_tokens,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()

        print('response', outputs)
        return outputs, state



title_markdown = ("""
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
  <a href="https://github.com/PKU-YuanGroup/Chat-UniVi" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
    <img src="https://z1.ax1x.com/2023/11/22/pidlXh4.jpg" alt="Chat-UniVi🚀" style="max-width: 120px; height: auto;">
  </a>
  <div>
    <h1 >Chat-UniVi: Unified Visual Representation Empowers Large Language Models with Image and Video Understanding</h1>
    <h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
  </div>
</div>
<div align="center">
  <div style="display:flex; gap: 0.25rem;" align="center">
    <a href='https://github.com/PKU-YuanGroup/Chat-UniVi'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
    <a href="https://arxiv.org/pdf/2311.08046.pdf"><img src="https://img.shields.io/badge/Arxiv-2311.08046-red"></a>
    <a href='https://github.com/PKU-YuanGroup/Chat-UniVi/stargazers'><img src='https://img.shields.io/github/stars/PKU-YuanGroup/Chat-UniVi.svg?style=social'></a>
  </div>
</div>
""")

block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""
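Illustrative sketch, not part of the committed files: the Chat helper above is driven by app.py inside Gradio, but a rough standalone usage might look like the following. The model path, video path, prompt, conv_mode "vicuna_v1", and the half-precision/CUDA preprocessing are all assumptions (borrowed from the streaming CLI defaults), not anything this file guarantees.

import torch
from flash_vstream.serve.demo import Chat
from flash_vstream.conversation import conv_templates
from flash_vstream.constants import DEFAULT_IMAGE_TOKEN

# Hypothetical paths and flags.
chat = Chat("IVGSZ/Flash-VStream-7b", conv_mode="vicuna_v1", load_4bit=True)
state = conv_templates["vicuna_v1"].copy()

# Decode a local video into PIL frames, then preprocess them for the vision tower.
frames = chat._get_rawvideo_dec("example.mp4", chat.image_processor, video_framerate=1)
images_tensor = [
    chat.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][0].half().cuda()
    for f in frames
]

reply, state = chat.generate(images_tensor, DEFAULT_IMAGE_TOKEN + "\nWhat is happening in this video?", True, state)
print(reply)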
flash_vstream/train/llama_flash_attn_monkey_patch.py
ADDED
@@ -0,0 +1,117 @@
# Based on https://github.com/haotian-liu/LLaVA.

from typing import Optional, Tuple
import warnings

import torch

import transformers
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

try:
    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
except ImportError:
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import unpad_input, pad_input


def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        .transpose(1, 2)
    )  # shape: (b, num_heads, s, head_dim)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )

    if past_key_value is not None:
        # reuse k, v
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    # Transform the data into the format required by flash attention
    qkv = torch.stack([query_states, key_states, value_states], dim=2)
    qkv = qkv.transpose(1, 3)  # shape: [b, s, 3, num_heads, head_dim]
    key_padding_mask = attention_mask

    if key_padding_mask is None:
        qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim)
        cu_q_lens = torch.arange(
            0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
        )
        max_s = q_len
        output = flash_attn_unpadded_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = output.view(bsz, q_len, -1)
    else:
        qkv = qkv.reshape(bsz, q_len, -1)
        qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask)
        qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
        output_unpad = flash_attn_unpadded_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
        output = pad_input(output_unpad, indices, bsz, q_len)

    return self.o_proj(output), None, past_key_value


# Disable the transformation of the attention mask in LlamaModel as the flash attention
# requires the attention mask to be the same as the key_padding_mask
def _prepare_decoder_attention_mask(
    self, attention_mask, input_shape, inputs_embeds, past_key_values_length
):
    # [bsz, seq_len]
    return attention_mask


def replace_llama_attn_with_flash_attn():
    cuda_major, cuda_minor = torch.cuda.get_device_capability()
    if cuda_major < 8:
        warnings.warn(
            "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
            "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
        )
    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
        _prepare_decoder_attention_mask
    )
    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
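Illustrative sketch, not part of the committed files: monkey patches like this one have to run before the LLaMA model is instantiated, so that the patched forward is already in place when weights are loaded (the usual pattern in LLaVA-style training scripts). The checkpoint name below is a placeholder, and a working flash-attn build plus an Ampere-class GPU are assumed.

import torch
import transformers

from flash_vstream.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn

# Patch first, so LlamaAttention.forward is replaced before the model is built.
replace_llama_attn_with_flash_attn()

model = transformers.AutoModelForCausalLM.from_pretrained(
    "lmsys/vicuna-7b-v1.5",      # placeholder checkpoint
    torch_dtype=torch.float16,
).cuda()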
flash_vstream/train/llama_xformers_attn_monkey_patch.py
ADDED
@@ -0,0 +1,131 @@
# Based on https://github.com/haotian-liu/LLaVA.

"""
Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
"""

import logging
import math
from typing import Optional, Tuple

import torch
import transformers.models.llama.modeling_llama
from torch import nn

try:
    import xformers.ops
except ImportError:
    logging.error("xformers not found! Please install it before trying to use it.")


def replace_llama_attn_with_xformers_attn():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward


def xformers_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    (
        query_states,
        key_states,
    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # We only apply xformers optimizations if we don't need to output the whole attention matrix
    if not output_attentions:
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
        # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
        if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
            # input and output should be of form (bsz, q_len, num_heads, head_dim)
            attn_output = xformers.ops.memory_efficient_attention(
                query_states, key_states, value_states, attn_bias=None
            )
        else:
            # input and output should be of form (bsz, q_len, num_heads, head_dim)
            attn_output = xformers.ops.memory_efficient_attention(
                query_states,
                key_states,
                value_states,
                attn_bias=xformers.ops.LowerTriangularMask(),
            )
        attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
            )

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2)

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)
    return attn_output, attn_weights, past_key_value
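Illustrative sketch, not part of the committed files: the xformers patch serves as a memory-efficient fallback for GPUs where flash-attention is unavailable. A hypothetical helper (the repo itself keeps the two choices in separate entry points rather than switching at runtime) could select between the two monkey patches by compute capability.

import torch

def patch_llama_attention():
    # Hypothetical selector: prefer flash-attention on compute capability >= 8.0
    # (A100/H100-class GPUs), otherwise fall back to the xformers kernel.
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        from flash_vstream.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
        replace_llama_attn_with_flash_attn()
    else:
        from flash_vstream.train.llama_xformers_attn_monkey_patch import replace_llama_attn_with_xformers_attn
        replace_llama_attn_with_xformers_attn()

patch_llama_attention()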
flash_vstream/train/train.py
ADDED
@@ -0,0 +1,1069 @@
# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
# ------------------------------------------------------------------------
# Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import copy
import json
import torch
import random
import logging
import pathlib
import transformers
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, List


from flash_vstream.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from torch.utils.data import Dataset
from flash_vstream.train.vstream_trainer import VStreamTrainer

from flash_vstream import conversation as conversation_lib
from flash_vstream.model import VStreamLlamaForCausalLM, VStreamConfig
from flash_vstream.mm_utils import tokenizer_image_token

from PIL import Image
from decord import VideoReader
from safetensors.torch import load_file, save_file


local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    version: Optional[str] = field(default="v0")
    freeze_backbone: bool = field(default=False)
    tune_mm_mlp_adapter: bool = field(default=False)
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)  # default to the last layer
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    mm_projector_type: Optional[str] = field(default='linear')
    mm_use_im_start_end: bool = field(default=False)
    mm_use_im_patch_token: bool = field(default=True)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    mm_use_4_vision_tokens: bool = field(default=False)
    compress_type: Optional[str] = field(default=None)
    compress_size: int = field(default=4)
    compress_long_memory_size: int = field(default=1)
    compress_Turing_memory_size: int = field(default=1)
    compress_Turing_hidden_dim: int = field(default=32)
    compress_Turing_update_ratio: float = field(default=0.2)


@dataclass
class DataArguments:
    data_path: str = field(default=None,
                           metadata={"help": "Path to the training data."})
    lazy_preprocess: bool = False
    is_multimodal: bool = False
    image_folder: Optional[str] = field(default=None)
    video_folder: Optional[str] = field(default=None)
    video_fps: Optional[int] = field(default=1)
    video_token: Optional[int] = field(default=2)
    video_max_frames: Optional[int] = field(default=50)
    video_long_memory_length: Optional[int] = field(default=10)
    video_Turing_memory_length: Optional[int] = field(default=10)
    video_short_memory_length: Optional[int] = field(default=10)
    video_current_memory_length: Optional[int] = field(default=1)
    video_sample_type: Optional[str] = field(default='center')  # center, uniform, drop, merge
    image_aspect_ratio: str = 'square'


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    remove_unused_columns: bool = field(default=False)
    freeze_mm_mlp_adapter: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    mm_projector_lr: Optional[float] = None
    group_by_modality_length: bool = field(default=False)

+
|
129 |
+
def maybe_zero_3(param, ignore_status=False, name=None):
|
130 |
+
from deepspeed import zero
|
131 |
+
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
|
132 |
+
if hasattr(param, "ds_id"):
|
133 |
+
if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
|
134 |
+
if not ignore_status:
|
135 |
+
logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
|
136 |
+
with zero.GatheredParameters([param]):
|
137 |
+
param = param.data.detach().cpu().clone()
|
138 |
+
else:
|
139 |
+
param = param.detach().cpu().clone()
|
140 |
+
return param
|
141 |
+
|
142 |
+
|
143 |
+
# Borrowed from peft.utils.get_peft_model_state_dict
|
144 |
+
def get_peft_state_maybe_zero_3(named_params, bias):
|
145 |
+
if bias == "none":
|
146 |
+
to_return = {k: t for k, t in named_params if "lora_" in k}
|
147 |
+
elif bias == "all":
|
148 |
+
to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
|
149 |
+
elif bias == "lora_only":
|
150 |
+
to_return = {}
|
151 |
+
maybe_lora_bias = {}
|
152 |
+
lora_bias_names = set()
|
153 |
+
for k, t in named_params:
|
154 |
+
if "lora_" in k:
|
155 |
+
to_return[k] = t
|
156 |
+
bias_name = k.split("lora_")[0] + "bias"
|
157 |
+
lora_bias_names.add(bias_name)
|
158 |
+
elif "bias" in k:
|
159 |
+
maybe_lora_bias[k] = t
|
160 |
+
for k, t in maybe_lora_bias:
|
161 |
+
if bias_name in lora_bias_names:
|
162 |
+
to_return[bias_name] = t
|
163 |
+
else:
|
164 |
+
raise NotImplementedError
|
165 |
+
to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
|
166 |
+
return to_return
|
167 |
+
|
168 |
+
|
169 |
+
def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
|
170 |
+
to_return = {k: t for k, t in named_params if "lora_" not in k}
|
171 |
+
if require_grad_only:
|
172 |
+
to_return = {k: t for k, t in to_return.items() if t.requires_grad}
|
173 |
+
to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
|
174 |
+
return to_return
|
175 |
+
|
176 |
+
|
177 |
+
def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
|
178 |
+
to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
|
179 |
+
to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
|
180 |
+
return to_return
|
181 |
+
|
182 |
+
|
183 |
+
def find_all_linear_names(model):
|
184 |
+
cls = torch.nn.Linear
|
185 |
+
lora_module_names = set()
|
186 |
+
multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
|
187 |
+
for name, module in model.named_modules():
|
188 |
+
if any(mm_keyword in name for mm_keyword in multimodal_keywords):
|
189 |
+
continue
|
190 |
+
if isinstance(module, cls):
|
191 |
+
names = name.split('.')
|
192 |
+
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
|
193 |
+
|
194 |
+
if 'lm_head' in lora_module_names: # needed for 16-bit
|
195 |
+
lora_module_names.remove('lm_head')
|
196 |
+
return list(lora_module_names)
|
197 |
+
|
198 |
+
|
199 |
+
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
|
200 |
+
output_dir: str):
|
201 |
+
"""Collects the state dict and dump to disk."""
|
202 |
+
|
203 |
+
if getattr(trainer.args, "tune_mm_mlp_adapter", False):
|
204 |
+
# Only save Adapter
|
205 |
+
keys_to_match = ['mm_projector']
|
206 |
+
if getattr(trainer.args, "use_im_start_end", False):
|
207 |
+
keys_to_match.extend(['embed_tokens', 'embed_in'])
|
208 |
+
|
209 |
+
weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
|
210 |
+
trainer.model.config.save_pretrained(output_dir)
|
211 |
+
|
212 |
+
current_folder = output_dir.split('/')[-1]
|
213 |
+
parent_folder = os.path.dirname(output_dir)
|
214 |
+
if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
|
215 |
+
if current_folder.startswith('checkpoint-'):
|
216 |
+
mm_projector_folder = os.path.join(parent_folder, "mm_projector")
|
217 |
+
os.makedirs(mm_projector_folder, exist_ok=True)
|
218 |
+
torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
|
219 |
+
else:
|
220 |
+
torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
|
221 |
+
return
|
222 |
+
|
223 |
+
if trainer.deepspeed:
|
224 |
+
torch.cuda.synchronize()
|
225 |
+
trainer.save_model(output_dir)
|
226 |
+
return
|
227 |
+
|
228 |
+
state_dict = trainer.model.state_dict()
|
229 |
+
if trainer.args.should_save:
|
230 |
+
cpu_state_dict = {
|
231 |
+
key: value.cpu()
|
232 |
+
for key, value in state_dict.items()
|
233 |
+
}
|
234 |
+
del state_dict
|
235 |
+
trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
|
236 |
+
|
237 |
+
|
238 |
+
def smart_tokenizer_and_embedding_resize(
|
239 |
+
special_tokens_dict: Dict,
|
240 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
241 |
+
model: transformers.PreTrainedModel,
|
242 |
+
):
|
243 |
+
"""Resize tokenizer and embedding.
|
244 |
+
|
245 |
+
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
|
246 |
+
"""
|
247 |
+
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
|
248 |
+
model.resize_token_embeddings(len(tokenizer))
|
249 |
+
|
250 |
+
if num_new_tokens > 0:
|
251 |
+
input_embeddings = model.get_input_embeddings().weight.data
|
252 |
+
output_embeddings = model.get_output_embeddings().weight.data
|
253 |
+
|
254 |
+
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
|
255 |
+
dim=0, keepdim=True)
|
256 |
+
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
|
257 |
+
dim=0, keepdim=True)
|
258 |
+
|
259 |
+
input_embeddings[-num_new_tokens:] = input_embeddings_avg
|
260 |
+
output_embeddings[-num_new_tokens:] = output_embeddings_avg
|
261 |
+
|
262 |
+
|
263 |
+
def _tokenize_fn(strings: Sequence[str],
|
264 |
+
tokenizer: transformers.PreTrainedTokenizer) -> Dict:
|
265 |
+
"""Tokenize a list of strings."""
|
266 |
+
tokenized_list = [
|
267 |
+
tokenizer(
|
268 |
+
text,
|
269 |
+
return_tensors="pt",
|
270 |
+
padding="longest",
|
271 |
+
max_length=tokenizer.model_max_length,
|
272 |
+
truncation=True,
|
273 |
+
) for text in strings
|
274 |
+
]
|
275 |
+
input_ids = labels = [
|
276 |
+
tokenized.input_ids[0] for tokenized in tokenized_list
|
277 |
+
]
|
278 |
+
input_ids_lens = labels_lens = [
|
279 |
+
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
|
280 |
+
for tokenized in tokenized_list
|
281 |
+
]
|
282 |
+
return dict(
|
283 |
+
input_ids=input_ids,
|
284 |
+
labels=labels,
|
285 |
+
input_ids_lens=input_ids_lens,
|
286 |
+
labels_lens=labels_lens,
|
287 |
+
)
|
288 |
+
|
289 |
+
|
290 |
+
def _mask_targets(target, tokenized_lens, speakers):
|
291 |
+
# cur_idx = 0
|
292 |
+
cur_idx = tokenized_lens[0]
|
293 |
+
tokenized_lens = tokenized_lens[1:]
|
294 |
+
target[:cur_idx] = IGNORE_INDEX
|
295 |
+
for tokenized_len, speaker in zip(tokenized_lens, speakers):
|
296 |
+
if speaker == "human":
|
297 |
+
target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX
|
298 |
+
cur_idx += tokenized_len
|
299 |
+
|
300 |
+
|
301 |
+
def _add_speaker_and_signal(header, source, get_conversation=True):
|
302 |
+
"""Add speaker and start/end signal on each round."""
|
303 |
+
BEGIN_SIGNAL = "### "
|
304 |
+
END_SIGNAL = "\n"
|
305 |
+
conversation = header
|
306 |
+
for sentence in source:
|
307 |
+
from_str = sentence["from"]
|
308 |
+
if from_str.lower() == "human":
|
309 |
+
from_str = conversation_lib.default_conversation.roles[0]
|
310 |
+
elif from_str.lower() == "gpt":
|
311 |
+
from_str = conversation_lib.default_conversation.roles[1]
|
312 |
+
else:
|
313 |
+
from_str = 'unknown'
|
314 |
+
sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
|
315 |
+
sentence["value"] + END_SIGNAL)
|
316 |
+
if get_conversation:
|
317 |
+
conversation += sentence["value"]
|
318 |
+
conversation += BEGIN_SIGNAL
|
319 |
+
return conversation
|
320 |
+
|
321 |
+
|
322 |
+
def preprocess_multimodal(
|
323 |
+
sources: Sequence[str],
|
324 |
+
data_args: DataArguments
|
325 |
+
) -> Dict:
|
326 |
+
is_multimodal = data_args.is_multimodal
|
327 |
+
if not is_multimodal:
|
328 |
+
return sources
|
329 |
+
|
330 |
+
for source in sources:
|
331 |
+
for sentence in source:
|
332 |
+
if DEFAULT_IMAGE_TOKEN in sentence['value']:
|
333 |
+
sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
|
334 |
+
sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value']
|
335 |
+
sentence['value'] = sentence['value'].strip()
|
336 |
+
if "mmtag" in conversation_lib.default_conversation.version:
|
337 |
+
sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
|
338 |
+
replace_token = DEFAULT_IMAGE_TOKEN
|
339 |
+
if data_args.mm_use_im_start_end:
|
340 |
+
replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
|
341 |
+
sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
|
342 |
+
|
343 |
+
return sources
|
344 |
+
|
345 |
+
|
346 |
+
def preprocess_llama_2(
|
347 |
+
sources,
|
348 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
349 |
+
has_image: bool = False
|
350 |
+
) -> Dict:
|
351 |
+
conv = conversation_lib.default_conversation.copy()
|
352 |
+
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
|
353 |
+
|
354 |
+
# Apply prompt templates
|
355 |
+
conversations = []
|
356 |
+
for i, source in enumerate(sources):
|
357 |
+
if roles[source[0]["from"]] != conv.roles[0]:
|
358 |
+
# Skip the first one if it is not from human
|
359 |
+
source = source[1:]
|
360 |
+
|
361 |
+
conv.messages = []
|
362 |
+
for j, sentence in enumerate(source):
|
363 |
+
role = roles[sentence["from"]]
|
364 |
+
assert role == conv.roles[j % 2], f"{i}"
|
365 |
+
conv.append_message(role, sentence["value"])
|
366 |
+
conversations.append(conv.get_prompt())
|
367 |
+
|
368 |
+
# Tokenize conversations
|
369 |
+
|
370 |
+
if has_image:
|
371 |
+
input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
|
372 |
+
else:
|
373 |
+
input_ids = tokenizer(
|
374 |
+
conversations,
|
375 |
+
return_tensors="pt",
|
376 |
+
padding="longest",
|
377 |
+
max_length=tokenizer.model_max_length,
|
378 |
+
truncation=True,
|
379 |
+
).input_ids
|
380 |
+
|
381 |
+
targets = input_ids.clone()
|
382 |
+
|
383 |
+
assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
|
384 |
+
|
385 |
+
# Mask targets
|
386 |
+
sep = "[/INST] "
|
387 |
+
for conversation, target in zip(conversations, targets):
|
388 |
+
total_len = int(target.ne(tokenizer.pad_token_id).sum())
|
389 |
+
|
390 |
+
rounds = conversation.split(conv.sep2)
|
391 |
+
cur_len = 1
|
392 |
+
target[:cur_len] = IGNORE_INDEX
|
393 |
+
for i, rou in enumerate(rounds):
|
394 |
+
if rou == "":
|
395 |
+
break
|
396 |
+
|
397 |
+
parts = rou.split(sep)
|
398 |
+
if len(parts) != 2:
|
399 |
+
break
|
400 |
+
parts[0] += sep
|
401 |
+
|
402 |
+
if has_image:
|
403 |
+
round_len = len(tokenizer_image_token(rou, tokenizer))
|
404 |
+
instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
|
405 |
+
else:
|
406 |
+
round_len = len(tokenizer(rou).input_ids)
|
407 |
+
instruction_len = len(tokenizer(parts[0]).input_ids) - 2
|
408 |
+
|
409 |
+
target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
|
410 |
+
|
411 |
+
cur_len += round_len
|
412 |
+
target[cur_len:] = IGNORE_INDEX
|
413 |
+
|
414 |
+
if cur_len < tokenizer.model_max_length:
|
415 |
+
if cur_len != total_len:
|
416 |
+
target[:] = IGNORE_INDEX
|
417 |
+
print(
|
418 |
+
f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
|
419 |
+
f" (ignored)"
|
420 |
+
)
|
421 |
+
|
422 |
+
return dict(
|
423 |
+
input_ids=input_ids,
|
424 |
+
labels=targets,
|
425 |
+
)
|
426 |
+
|
427 |
+
|
428 |
+
def preprocess_v1(
|
429 |
+
sources,
|
430 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
431 |
+
has_image: bool = False
|
432 |
+
) -> Dict:
|
433 |
+
conv = conversation_lib.default_conversation.copy()
|
434 |
+
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
|
435 |
+
|
436 |
+
# Apply prompt templates
|
437 |
+
conversations = []
|
438 |
+
for i, source in enumerate(sources):
|
439 |
+
if roles[source[0]["from"]] != conv.roles[0]:
|
440 |
+
# Skip the first one if it is not from human
|
441 |
+
source = source[1:]
|
442 |
+
|
443 |
+
conv.messages = []
|
444 |
+
for j, sentence in enumerate(source):
|
445 |
+
role = roles[sentence["from"]]
|
446 |
+
assert role == conv.roles[j % 2], f"{i}"
|
447 |
+
conv.append_message(role, sentence["value"])
|
448 |
+
conversations.append(conv.get_prompt())
|
449 |
+
|
450 |
+
# Tokenize conversations
|
451 |
+
|
452 |
+
if has_image:
|
453 |
+
input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
|
454 |
+
else:
|
455 |
+
input_ids = tokenizer(
|
456 |
+
conversations,
|
457 |
+
return_tensors="pt",
|
458 |
+
padding="longest",
|
459 |
+
max_length=tokenizer.model_max_length,
|
460 |
+
truncation=True,
|
461 |
+
).input_ids
|
462 |
+
|
463 |
+
targets = input_ids.clone()
|
464 |
+
|
465 |
+
assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
|
466 |
+
|
467 |
+
# Mask targets
|
468 |
+
sep = conv.sep + conv.roles[1] + ": "
|
469 |
+
for conversation, target in zip(conversations, targets):
|
470 |
+
total_len = int(target.ne(tokenizer.pad_token_id).sum())
|
471 |
+
|
472 |
+
rounds = conversation.split(conv.sep2)
|
473 |
+
cur_len = 1
|
474 |
+
target[:cur_len] = IGNORE_INDEX
|
475 |
+
for i, rou in enumerate(rounds):
|
476 |
+
if rou == "":
|
477 |
+
break
|
478 |
+
|
479 |
+
parts = rou.split(sep)
|
480 |
+
if len(parts) != 2:
|
481 |
+
break
|
482 |
+
parts[0] += sep
|
483 |
+
|
484 |
+
if has_image:
|
485 |
+
round_len = len(tokenizer_image_token(rou, tokenizer))
|
486 |
+
instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
|
487 |
+
else:
|
488 |
+
round_len = len(tokenizer(rou).input_ids)
|
489 |
+
instruction_len = len(tokenizer(parts[0]).input_ids) - 2
|
490 |
+
|
491 |
+
target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
|
492 |
+
|
493 |
+
cur_len += round_len
|
494 |
+
target[cur_len:] = IGNORE_INDEX
|
495 |
+
|
496 |
+
if cur_len < tokenizer.model_max_length:
|
497 |
+
if cur_len != total_len:
|
498 |
+
target[:] = IGNORE_INDEX
|
499 |
+
print(
|
500 |
+
f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
|
501 |
+
f" (ignored)"
|
502 |
+
)
|
503 |
+
|
504 |
+
return dict(
|
505 |
+
input_ids=input_ids,
|
506 |
+
labels=targets,
|
507 |
+
)
|
508 |
+
|
509 |
+
|
510 |
+
def preprocess_mpt(
|
511 |
+
sources,
|
512 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
513 |
+
) -> Dict:
|
514 |
+
conv = conversation_lib.default_conversation.copy()
|
515 |
+
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
|
516 |
+
|
517 |
+
# Apply prompt templates
|
518 |
+
conversations = []
|
519 |
+
for i, source in enumerate(sources):
|
520 |
+
if roles[source[0]["from"]] != conv.roles[0]:
|
521 |
+
# Skip the first one if it is not from human
|
522 |
+
source = source[1:]
|
523 |
+
|
524 |
+
conv.messages = []
|
525 |
+
for j, sentence in enumerate(source):
|
526 |
+
role = roles[sentence["from"]]
|
527 |
+
assert role == conv.roles[j % 2], f"{i}"
|
528 |
+
conv.append_message(role, sentence["value"])
|
529 |
+
conversations.append(conv.get_prompt())
|
530 |
+
|
531 |
+
# Tokenize conversations
|
532 |
+
input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
|
533 |
+
targets = input_ids.clone()
|
534 |
+
assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
|
535 |
+
|
536 |
+
# Mask targets
|
537 |
+
sep = conv.sep + conv.roles[1]
|
538 |
+
for conversation, target in zip(conversations, targets):
|
539 |
+
total_len = int(target.ne(tokenizer.pad_token_id).sum())
|
540 |
+
|
541 |
+
rounds = conversation.split(conv.sep)
|
542 |
+
re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
|
543 |
+
for conv_idx in range(3, len(rounds), 2):
|
544 |
+
re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt
|
545 |
+
cur_len = 0
|
546 |
+
target[:cur_len] = IGNORE_INDEX
|
547 |
+
for i, rou in enumerate(re_rounds):
|
548 |
+
if rou == "":
|
549 |
+
break
|
550 |
+
|
551 |
+
parts = rou.split(sep)
|
552 |
+
if len(parts) != 2:
|
553 |
+
break
|
554 |
+
parts[0] += sep
|
555 |
+
round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer))
|
556 |
+
instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
|
557 |
+
target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
|
558 |
+
|
559 |
+
cur_len += round_len
|
560 |
+
target[cur_len:] = IGNORE_INDEX
|
561 |
+
|
562 |
+
if cur_len < tokenizer.model_max_length:
|
563 |
+
if cur_len != total_len:
|
564 |
+
target[:] = IGNORE_INDEX
|
565 |
+
print(
|
566 |
+
f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
|
567 |
+
f" (ignored)"
|
568 |
+
)
|
569 |
+
|
570 |
+
return dict(
|
571 |
+
input_ids=input_ids,
|
572 |
+
labels=targets,
|
573 |
+
)
|
574 |
+
|
575 |
+
|
576 |
+
def preprocess_plain(
|
577 |
+
sources: Sequence[str],
|
578 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
579 |
+
) -> Dict:
|
580 |
+
# add end signal and concatenate together
|
581 |
+
conversations = []
|
582 |
+
for source in sources:
|
583 |
+
assert len(source) == 2
|
584 |
+
assert DEFAULT_IMAGE_TOKEN in source[0]['value']
|
585 |
+
source[0]['value'] = DEFAULT_IMAGE_TOKEN
|
586 |
+
conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
|
587 |
+
conversations.append(conversation)
|
588 |
+
# tokenize conversations
|
589 |
+
input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
|
590 |
+
targets = copy.deepcopy(input_ids)
|
591 |
+
for target, source in zip(targets, sources):
|
592 |
+
tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer))
|
593 |
+
target[:tokenized_len] = IGNORE_INDEX
|
594 |
+
|
595 |
+
return dict(input_ids=input_ids, labels=targets)
|
596 |
+
|
597 |
+
|
598 |
+
def preprocess(
|
599 |
+
sources: Sequence[str],
|
600 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
601 |
+
has_image: bool = False
|
602 |
+
) -> Dict:
|
603 |
+
"""
|
604 |
+
Given a list of sources, each is a conversation list. This transform:
|
605 |
+
1. Add signal '### ' at the beginning each sentence, with end signal '\n';
|
606 |
+
2. Concatenate conversations together;
|
607 |
+
3. Tokenize the concatenated conversation;
|
608 |
+
4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
|
609 |
+
"""
|
610 |
+
if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
|
611 |
+
return preprocess_plain(sources, tokenizer)
|
612 |
+
if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
|
613 |
+
return preprocess_llama_2(sources, tokenizer, has_image=has_image)
|
614 |
+
if conversation_lib.default_conversation.version.startswith("v1"):
|
615 |
+
return preprocess_v1(sources, tokenizer, has_image=has_image)
|
616 |
+
if conversation_lib.default_conversation.version == "mpt":
|
617 |
+
return preprocess_mpt(sources, tokenizer)
|
618 |
+
# add end signal and concatenate together
|
619 |
+
conversations = []
|
620 |
+
for source in sources:
|
621 |
+
header = f"{conversation_lib.default_conversation.system}\n\n"
|
622 |
+
conversation = _add_speaker_and_signal(header, source)
|
623 |
+
conversations.append(conversation)
|
624 |
+
# tokenize conversations
|
625 |
+
def get_tokenize_len(prompts):
|
626 |
+
return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts]
|
627 |
+
|
628 |
+
if has_image:
|
629 |
+
input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
|
630 |
+
else:
|
631 |
+
conversations_tokenized = _tokenize_fn(conversations, tokenizer)
|
632 |
+
input_ids = conversations_tokenized["input_ids"]
|
633 |
+
targets = copy.deepcopy(input_ids)
|
634 |
+
for target, source in zip(targets, sources):
|
635 |
+
if has_image:
|
636 |
+
tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source])
|
637 |
+
else:
|
638 |
+
tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"]
|
639 |
+
speakers = [sentence["from"] for sentence in source]
|
640 |
+
_mask_targets(target, tokenized_lens, speakers)
|
641 |
+
|
642 |
+
return dict(input_ids=input_ids, labels=targets)
|
643 |
+
|
644 |
+
|
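Illustrative sketch, not part of the committed files: a hypothetical call showing the masking contract of preprocess. It assumes a LLaMA tokenizer has been loaded and conversation_lib.default_conversation has been set to the vicuna v1 template; under those assumptions the human turn is masked with IGNORE_INDEX, so the loss only covers the assistant answer.

# (inside the namespace of flash_vstream/train/train.py, after the definitions above)
sources = [[
    {"from": "human", "value": DEFAULT_IMAGE_TOKEN + "\nWhat is shown in the video?"},
    {"from": "gpt", "value": "A person is cooking in a kitchen."},
]]
batch = preprocess(sources, tokenizer, has_image=True)   # tokenizer is assumed to be loaded
print(batch["input_ids"].shape)                          # (1, sequence_length) for the v1 path
n_masked = (batch["labels"][0] == IGNORE_INDEX).sum().item()
print(n_masked, "prompt tokens are excluded from the loss")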
645 |
+
class LazySupervisedDataset(Dataset):
|
646 |
+
"""Dataset for supervised fine-tuning."""
|
647 |
+
|
648 |
+
def __init__(self, data_path: str,
|
649 |
+
tokenizer: transformers.PreTrainedTokenizer,
|
650 |
+
data_args: DataArguments):
|
651 |
+
super(LazySupervisedDataset, self).__init__()
|
652 |
+
list_data_dict = json.load(open(data_path, "r"))
|
653 |
+
|
654 |
+
rank0_print("Formatting inputs...Skip in lazy mode")
|
655 |
+
self.tokenizer = tokenizer
|
656 |
+
self.list_data_dict = list_data_dict
|
657 |
+
self.data_args = data_args
|
658 |
+
|
659 |
+
def __len__(self):
|
660 |
+
return len(self.list_data_dict)
|
661 |
+
|
662 |
+
@property
|
663 |
+
def lengths(self):
|
664 |
+
length_list = []
|
665 |
+
for sample in self.list_data_dict:
|
666 |
+
img_tokens = 128 if 'image' in sample else 0
|
667 |
+
length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
|
668 |
+
return length_list
|
669 |
+
|
670 |
+
@property
|
671 |
+
def modality_lengths(self):
|
672 |
+
length_list = []
|
673 |
+
for sample in self.list_data_dict:
|
674 |
+
cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
|
675 |
+
cur_len = cur_len if ('image' in sample) or ('video' in sample) else -cur_len
|
676 |
+
length_list.append(cur_len)
|
677 |
+
return length_list
|
678 |
+
|
679 |
+
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
|
680 |
+
attempt, max_attempt = 0, 10
|
681 |
+
while attempt < max_attempt:
|
682 |
+
try:
|
683 |
+
sources = self.list_data_dict[i]
|
684 |
+
if isinstance(i, int):
|
685 |
+
sources = [sources]
|
686 |
+
assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
|
687 |
+
feature = None
|
688 |
+
if 'image' in sources[0]:
|
689 |
+
image_file = self.list_data_dict[i]['image']
|
690 |
+
image_folder = self.data_args.image_folder
|
691 |
+
image_file = os.path.join(image_folder, image_file)
|
692 |
+
suffix = image_file.split('.')[-1]
|
693 |
+
|
694 |
+
if 'features' in image_folder:
|
695 |
+
# TODO: load video feature, not supported yet
|
696 |
+
image_file = image_file.replace(suffix, 'safetensors')
|
697 |
+
if not os.path.exists(image_file):
|
698 |
+
print('Image file {} not exist!'.format(image_file))
|
699 |
+
feature = load_file(image_file)['feature'].unsqueeze(0)
|
700 |
+
sources = preprocess_multimodal(
|
701 |
+
copy.deepcopy([e["conversations"] for e in sources]),
|
702 |
+
self.data_args)
|
703 |
+
|
704 |
+
else:
|
705 |
+
processor = self.data_args.image_processor
|
706 |
+
image = Image.open().convert('RGB')
|
707 |
+
if self.data_args.image_aspect_ratio == 'pad':
|
708 |
+
def expand2square(pil_img, background_color):
|
709 |
+
width, height = pil_img.size
|
710 |
+
if width == height:
|
711 |
+
return pil_img
|
712 |
+
elif width > height:
|
713 |
+
result = Image.new(pil_img.mode, (width, width), background_color)
|
714 |
+
result.paste(pil_img, (0, (width - height) // 2))
|
715 |
+
return result
|
716 |
+
else:
|
717 |
+
result = Image.new(pil_img.mode, (height, height), background_color)
|
718 |
+
result.paste(pil_img, ((height - width) // 2, 0))
|
719 |
+
return result
|
720 |
+
image = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
|
721 |
+
image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
|
722 |
+
else:
|
723 |
+
image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
|
724 |
+
sources = preprocess_multimodal(
|
725 |
+
copy.deepcopy([e["conversations"] for e in sources]),
|
726 |
+
self.data_args)
|
727 |
+
|
728 |
+
elif 'video' in sources[0]:
|
729 |
+
video_file = self.list_data_dict[i]['video']
|
730 |
+
video_folder = self.data_args.video_folder
|
731 |
+
video_file = os.path.join(video_folder, video_file)
|
732 |
+
suffix = video_file.split('.')[-1]
|
733 |
+
|
734 |
+
if 'features' in video_folder:
|
735 |
+
# TODO: load video feature, not supported yet
|
736 |
+
video_file = video_file.replace(suffix, 'safetensors')
|
737 |
+
if not os.path.exists(video_file):
|
738 |
+
print('Video file {} not exist!'.format(video_file))
|
739 |
+
feature = load_file(video_file)['feature']
|
740 |
+
if 'time' in self.list_data_dict[i]: # breakpoint mode
|
741 |
+
if 'time_9dense' in self.list_data_dict[i]:
|
742 |
+
tim = self.list_data_dict[i]['time_9dense'] // 4
|
743 |
+
start = max(tim - 6 * 9, 0)
|
744 |
+
end = min(tim + 6 * 9, feature.shape[0])
|
745 |
+
feature = feature[start:end]
|
746 |
+
else:
|
747 |
+
expansion = 15
|
748 |
+
if 'time_9' in self.list_data_dict[i]:
|
749 |
+
expansion = 9
|
750 |
+
tim = self.list_data_dict[i]['time']
|
751 |
+
start = max(tim - expansion, 0)
|
752 |
+
end = min(tim + expansion, feature.shape[0])
|
753 |
+
feature = feature[start:end]
|
754 |
+
elif 'time_9dense' in self.list_data_dict[i]:
|
755 |
+
feature = feature[::6]
|
756 |
+
|
757 |
+
sources = preprocess_multimodal(
|
758 |
+
copy.deepcopy([e["conversations"] for e in sources]),
|
759 |
+
self.data_args)
|
760 |
+
else:
|
761 |
+
# directly load video file
|
762 |
+
if not os.path.exists(video_file):
|
763 |
+
print('File {} not exist!'.format(video_file))
|
764 |
+
vr = VideoReader(video_file, num_threads=4)
|
765 |
+
sample_fps = round(vr.get_avg_fps()/self.data_args.video_fps)
|
766 |
+
frame_idx = [i for i in range(0, len(vr), sample_fps)]
|
767 |
+
if len(frame_idx) > self.data_args.video_max_frames:
|
768 |
+
if self.data_args.video_sample_type == 'center':
|
769 |
+
# select middle frames
|
770 |
+
start_pos = (len(frame_idx) - self.data_args.video_max_frames) // 2
|
771 |
+
frame_idx = frame_idx[start_pos:start_pos + self.data_args.video_max_frames]
|
772 |
+
elif self.data_args.video_sample_type == 'uniform':
|
773 |
+
scale = 1.0 * len(frame_idx) / self.data_args.video_max_frames
|
774 |
+
uniform_idx = [round((i + 1) * scale - 1) for i in range(self.data_args.video_max_frames)]
|
775 |
+
frame_idx = [frame_idx[i] for i in uniform_idx]
|
776 |
+
elif len(frame_idx) > 18000:
|
777 |
+
scale = 1.0 * len(frame_idx) / 180
|
778 |
+
uniform_idx = [round((i + 1) * scale - 1) for i in range(180)]
|
779 |
+
frame_idx = [frame_idx[i] for i in uniform_idx]
|
780 |
+
video = vr.get_batch(frame_idx).asnumpy()
|
781 |
+
processor = self.data_args.image_processor
|
782 |
+
image = processor.preprocess(video, return_tensors='pt')['pixel_values']
|
783 |
+
sources = preprocess_multimodal(
|
784 |
+
copy.deepcopy([e["conversations"] for e in sources]),
|
785 |
+
self.data_args)
|
786 |
+
|
787 |
+
else:
|
788 |
+
sources = copy.deepcopy([e["conversations"] for e in sources])
|
789 |
+
break
|
790 |
+
except Exception as e:
|
791 |
+
attempt += 1
|
792 |
+
print(f"Error in loading id:{i} sample, retrying {attempt} time... Error={e}")
|
793 |
+
i = random.randint(0, len(self.list_data_dict)-1)
|
794 |
+
|
795 |
+
has_image = ('image' in self.list_data_dict[i]) or ('video' in self.list_data_dict[i])
|
796 |
+
data_dict = preprocess(
|
797 |
+
sources,
|
798 |
+
self.tokenizer,
|
799 |
+
has_image=has_image)
|
800 |
+
if isinstance(i, int):
|
801 |
+
data_dict = dict(input_ids=data_dict["input_ids"][0],
|
802 |
+
labels=data_dict["labels"][0])
|
803 |
+
|
804 |
+
# image exist in the data
|
805 |
+
if 'image' in self.list_data_dict[i] or 'video' in self.list_data_dict[i]:
|
806 |
+
if feature is not None:
|
807 |
+
data_dict['feature'] = feature
|
808 |
+
else:
|
809 |
+
data_dict['image'] = image
|
810 |
+
elif self.data_args.is_multimodal:
|
811 |
+
# image does not exist in the data, but the model is multimodal
|
812 |
+
crop_size = self.data_args.image_processor.crop_size
|
813 |
+
patch_size = 14
|
814 |
+
data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
|
815 |
+
data_dict['feature'] = torch.zeros((crop_size['height'] // patch_size) * (crop_size['width'] // patch_size), self.data_args.mm_hidden_size)
|
816 |
+
return data_dict
|
817 |
+
|
818 |
+
|
819 |
+
@dataclass
|
820 |
+
class DataCollatorForSupervisedDataset(object):
|
821 |
+
"""Collate examples for supervised fine-tuning."""
|
822 |
+
|
823 |
+
tokenizer: transformers.PreTrainedTokenizer
|
824 |
+
|
825 |
+
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
|
826 |
+
input_ids, labels = tuple([instance[key] for instance in instances]
|
827 |
+
for key in ("input_ids", "labels"))
|
828 |
+
input_ids = torch.nn.utils.rnn.pad_sequence(
|
829 |
+
input_ids,
|
830 |
+
batch_first=True,
|
831 |
+
padding_value=self.tokenizer.pad_token_id)
|
832 |
+
labels = torch.nn.utils.rnn.pad_sequence(labels,
|
833 |
+
batch_first=True,
|
834 |
+
padding_value=IGNORE_INDEX)
|
835 |
+
input_ids = input_ids[:, :self.tokenizer.model_max_length]
|
836 |
+
labels = labels[:, :self.tokenizer.model_max_length]
|
837 |
+
batch = dict(
|
838 |
+
input_ids=input_ids,
|
839 |
+
labels=labels,
|
840 |
+
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
|
841 |
+
)
|
842 |
+
|
843 |
+
if 'feature' in instances[0]:
|
844 |
+
batch['features'] = [instance['feature'] for instance in instances]
|
845 |
+
elif 'image' in instances[0]:
|
846 |
+
images = [instance['image'] for instance in instances]
|
847 |
+
if all(x is not None and x.shape == images[0].shape for x in images):
|
848 |
+
batch['images'] = torch.stack(images)
|
849 |
+
else:
|
850 |
+
batch['images'] = images
|
851 |
+
|
852 |
+
|
853 |
+
return batch
|
854 |
+
|
855 |
+
|
856 |
+
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
|
857 |
+
data_args) -> Dict:
|
858 |
+
"""Make dataset and collator for supervised fine-tuning."""
|
859 |
+
train_dataset = LazySupervisedDataset(tokenizer=tokenizer,
|
860 |
+
data_path=data_args.data_path,
|
861 |
+
data_args=data_args)
|
862 |
+
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
|
863 |
+
return dict(train_dataset=train_dataset,
|
864 |
+
eval_dataset=None,
|
865 |
+
data_collator=data_collator)
|
866 |
+
|
867 |
+
|
868 |
+
def train():
|
869 |
+
global local_rank
|
870 |
+
|
871 |
+
parser = transformers.HfArgumentParser(
|
872 |
+
(ModelArguments, DataArguments, TrainingArguments))
|
873 |
+
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
874 |
+
local_rank = training_args.local_rank
|
875 |
+
compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
|
876 |
+
|
877 |
+
bnb_model_from_pretrained_args = {}
|
878 |
+
if training_args.bits in [4, 8]:
|
879 |
+
from transformers import BitsAndBytesConfig
|
880 |
+
bnb_model_from_pretrained_args.update(dict(
|
881 |
+
device_map={"": training_args.device},
|
882 |
+
load_in_4bit=training_args.bits == 4,
|
883 |
+
load_in_8bit=training_args.bits == 8,
|
884 |
+
quantization_config=BitsAndBytesConfig(
|
885 |
+
load_in_4bit=training_args.bits == 4,
|
886 |
+
load_in_8bit=training_args.bits == 8,
|
887 |
+
llm_int8_skip_modules=["mm_projector"],
|
888 |
+
llm_int8_threshold=6.0,
|
889 |
+
llm_int8_has_fp16_weight=False,
|
890 |
+
bnb_4bit_compute_dtype=compute_dtype,
|
891 |
+
bnb_4bit_use_double_quant=training_args.double_quant,
|
892 |
+
bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
|
893 |
+
)
|
894 |
+
))
|
895 |
+
|
896 |
+
if model_args.vision_tower is not None:
|
897 |
+
model = VStreamLlamaForCausalLM.from_pretrained(
|
898 |
+
model_args.model_name_or_path,
|
899 |
+
cache_dir=training_args.cache_dir,
|
900 |
+
**bnb_model_from_pretrained_args
|
901 |
+
)
|
902 |
+
else:
|
903 |
+
model = transformers.LlamaForCausalLM.from_pretrained(
|
904 |
+
model_args.model_name_or_path,
|
905 |
+
cache_dir=training_args.cache_dir,
|
906 |
+
**bnb_model_from_pretrained_args
|
907 |
+
)
|
908 |
+
model.config.use_cache = False
|
909 |
+
|
910 |
+
if model_args.freeze_backbone:
|
911 |
+
model.model.requires_grad_(False)
|
912 |
+
|
913 |
+
if training_args.bits in [4, 8]:
|
914 |
+
from peft import prepare_model_for_kbit_training
|
915 |
+
model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
|
916 |
+
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
|
917 |
+
|
918 |
+
if training_args.gradient_checkpointing:
|
919 |
+
if hasattr(model, "enable_input_require_grads"):
|
920 |
+
model.enable_input_require_grads()
|
921 |
+
else:
|
922 |
+
def make_inputs_require_grad(module, input, output):
|
923 |
+
output.requires_grad_(True)
|
924 |
+
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
|
925 |
+
|
926 |
+
if training_args.lora_enable:
|
927 |
+
from peft import LoraConfig, get_peft_model
|
928 |
+
lora_config = LoraConfig(
|
929 |
+
r=training_args.lora_r,
|
930 |
+
lora_alpha=training_args.lora_alpha,
|
931 |
+
target_modules=find_all_linear_names(model),
|
932 |
+
lora_dropout=training_args.lora_dropout,
|
933 |
+
bias=training_args.lora_bias,
|
934 |
+
task_type="CAUSAL_LM",
|
935 |
+
)
|
936 |
+
if training_args.bits == 16:
|
937 |
+
if training_args.bf16:
|
938 |
+
model.to(torch.bfloat16)
|
939 |
+
if training_args.fp16:
|
940 |
+
model.to(torch.float16)
|
941 |
+
rank0_print("Adding LoRA adapters...")
|
942 |
+
model = get_peft_model(model, lora_config)
|
943 |
+
|
944 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
945 |
+
model_args.model_name_or_path,
|
946 |
+
cache_dir=training_args.cache_dir,
|
947 |
+
model_max_length=training_args.model_max_length,
|
948 |
+
padding_side="right",
|
949 |
+
use_fast=False,
|
950 |
+
)
|
951 |
+
|
952 |
+
if model_args.version == "v0":
|
953 |
+
if tokenizer.pad_token is None:
|
954 |
+
smart_tokenizer_and_embedding_resize(
|
955 |
+
special_tokens_dict=dict(pad_token="[PAD]"),
|
956 |
+
tokenizer=tokenizer,
|
957 |
+
model=model,
|
958 |
+
)
|
959 |
+
elif model_args.version == "v0.5":
|
960 |
+
tokenizer.pad_token = tokenizer.unk_token
|
961 |
+
else:
|
962 |
+
tokenizer.pad_token = tokenizer.unk_token
|
963 |
+
if model_args.version in conversation_lib.conv_templates:
|
964 |
+
conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
|
965 |
+
else:
|
966 |
+
conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]
|
967 |
+
|
968 |
+
if model_args.vision_tower is not None:
|
969 |
+
model_args.video_sample_type = data_args.video_sample_type
|
970 |
+
model_args.video_max_frames = data_args.video_max_frames
|
971 |
+
model_args.video_long_memory_length = data_args.video_long_memory_length
|
972 |
+
model_args.video_Turing_memory_length = data_args.video_Turing_memory_length
|
973 |
+
model_args.video_short_memory_length = data_args.video_short_memory_length
|
974 |
+
model_args.video_current_memory_length = data_args.video_current_memory_length
|
975 |
+
model.get_model().initialize_vision_modules(
|
976 |
+
model_args=model_args,
|
977 |
+
fsdp=training_args.fsdp
|
978 |
+
)
|
979 |
+
|
980 |
+
vision_tower = model.get_vision_tower()
|
981 |
+
vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
|
982 |
+
|
983 |
+
data_args.image_processor = vision_tower.image_processor
|
984 |
+
data_args.is_multimodal = True
|
985 |
+
|
986 |
+
model.config.image_aspect_ratio = data_args.image_aspect_ratio
|
987 |
+
model.config.tokenizer_padding_side = tokenizer.padding_side
|
988 |
+
model.config.tokenizer_model_max_length = tokenizer.model_max_length
|
989 |
+
|
990 |
+
model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
|
991 |
+
if model_args.tune_mm_mlp_adapter:
|
992 |
+
model.requires_grad_(False)
|
993 |
+
for p in model.get_model().mm_projector.parameters():
|
994 |
+
p.requires_grad = True
|
995 |
+
for p in model.get_model().attention_model.parameters():
|
996 |
+
p.requires_grad = True
|
997 |
+
|
998 |
+
model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
|
999 |
+
if training_args.freeze_mm_mlp_adapter:
|
1000 |
+
for p in model.get_model().mm_projector.parameters():
|
1001 |
+
p.requires_grad = False
|
1002 |
+
for p in model.get_model().attention_model.parameters():
|
1003 |
+
p.requires_grad = False
|
1004 |
+
|
1005 |
+
if training_args.bits in [4, 8]:
|
1006 |
+
model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
|
1007 |
+
|
1008 |
+
model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
|
1009 |
+
model.config.mm_projector_lr = training_args.mm_projector_lr
|
1010 |
+
training_args.use_im_start_end = model_args.mm_use_im_start_end
|
1011 |
+
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
|
1012 |
+
model.config.mm_use_4_vision_tokens = model_args.mm_use_4_vision_tokens
|
1013 |
+
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)
|
1014 |
+
|
1015 |
+
if training_args.bits in [4, 8]:
|
1016 |
+
from peft.tuners.lora import LoraLayer
|
1017 |
+
for name, module in model.named_modules():
|
1018 |
+
if isinstance(module, LoraLayer):
|
1019 |
+
if training_args.bf16:
|
1020 |
+
module = module.to(torch.bfloat16)
|
1021 |
+
if 'norm' in name:
|
1022 |
+
module = module.to(torch.float32)
|
1023 |
+
if 'lm_head' in name or 'embed_tokens' in name:
|
1024 |
+
if hasattr(module, 'weight'):
|
1025 |
+
if training_args.bf16 and module.weight.dtype == torch.float32:
|
1026 |
+
module = module.to(torch.bfloat16)
|
1027 |
+
|
1028 |
+
data_args.mm_hidden_size = model.get_vision_tower().hidden_size
|
1029 |
+
data_module = make_supervised_data_module(tokenizer=tokenizer,
|
1030 |
+
data_args=data_args)
|
1031 |
+
trainer = VStreamTrainer(model=model,
|
1032 |
+
tokenizer=tokenizer,
|
1033 |
+
args=training_args,
|
1034 |
+
**data_module)
|
1035 |
+
|
1036 |
+
if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
|
1037 |
+
trainer.train(resume_from_checkpoint=True)
|
1038 |
+
else:
|
1039 |
+
trainer.train()
|
1040 |
+
trainer.save_state()
|
1041 |
+
|
1042 |
+
model.config.use_cache = True
|
1043 |
+
|
1044 |
+
if training_args.lora_enable:
|
1045 |
+
state_dict = get_peft_state_maybe_zero_3(
|
1046 |
+
model.named_parameters(), training_args.lora_bias
|
1047 |
+
)
|
1048 |
+
non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
|
1049 |
+
model.named_parameters()
|
1050 |
+
)
|
1051 |
+
if training_args.local_rank == 0 or training_args.local_rank == -1:
|
1052 |
+
model.config.save_pretrained(training_args.output_dir)
|
1053 |
+
model.save_pretrained(training_args.output_dir, state_dict=state_dict)
|
1054 |
+
torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
|
1055 |
+
else:
|
1056 |
+
safe_save_model_for_hf_trainer(trainer=trainer,
|
1057 |
+
output_dir=training_args.output_dir)
|
1058 |
+
|
1059 |
+
|
1060 |
+
if __name__ == "__main__":
|
1061 |
+
# random.seed(42)
|
1062 |
+
# np.random.seed(42)
|
1063 |
+
# torch.manual_seed(42)
|
1064 |
+
# torch.cuda.manual_seed(42)
|
1065 |
+
# torch.cuda.manual_seed_all(42)
|
1066 |
+
# torch.backends.cudnn.deterministic = True
|
1067 |
+
# torch.backends.cudnn.benchmark = False
|
1068 |
+
|
1069 |
+
train()
|
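A minimal sketch of how the collator above pads a batch, assuming the repo's training dependencies are installed so that flash_vstream.train.train is importable. The stand-in tokenizer object and the toy token ids are hypothetical, chosen only to illustrate the padding and attention-mask behaviour; a real run would pass a transformers tokenizer.

# Illustration only: a stand-in "tokenizer" exposing just the attributes the collator reads.
import torch
from types import SimpleNamespace
from flash_vstream.train.train import DataCollatorForSupervisedDataset, IGNORE_INDEX

fake_tokenizer = SimpleNamespace(pad_token_id=0, model_max_length=16)
collator = DataCollatorForSupervisedDataset(tokenizer=fake_tokenizer)

batch = collator([
    {"input_ids": torch.tensor([5, 6, 7]), "labels": torch.tensor([IGNORE_INDEX, 6, 7])},
    {"input_ids": torch.tensor([5, 6]),    "labels": torch.tensor([IGNORE_INDEX, 6])},
])
print(batch["input_ids"].shape)   # padded to the longest sequence in the batch: (2, 3)
print(batch["attention_mask"])    # False wherever pad_token_id was inserted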
flash_vstream/train/train_mem.py
ADDED
@@ -0,0 +1,14 @@
# Adopted from https://github.com/haotian-liu/LLaVA.
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.

# Need to call this before importing transformers.
from flash_vstream.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn

replace_llama_attn_with_flash_attn()

from flash_vstream.train.train import train

if __name__ == "__main__":
    train()
flash_vstream/train/train_xformers.py
ADDED
@@ -0,0 +1,15 @@
# Based on https://github.com/haotian-liu/LLaVA.

# Make it more memory efficient by monkey patching the LLaMA model with xformers attention.

# Need to call this before importing transformers.
from flash_vstream.train.llama_xformers_attn_monkey_patch import (
    replace_llama_attn_with_xformers_attn,
)

replace_llama_attn_with_xformers_attn()

from flash_vstream.train.train import train

if __name__ == "__main__":
    train()
flash_vstream/train/vstream_trainer.py
ADDED
@@ -0,0 +1,248 @@
# This file may have been modified by Flash-VStream Authors ("Flash-VStream Modifications"). All Flash-VStream Modifications are Copyright 2024 Flash-VStream Authors.
# ------------------------------------------------------------------------
# Based on https://github.com/haotian-liu/LLaVA. Below is the original copyright:
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
import torch.nn as nn

from torch.utils.data import Sampler

from transformers import Trainer
from transformers.trainer import (
    is_sagemaker_mp_enabled,
    get_parameter_names,
    has_length,
    ALL_LAYERNORM_LAYERS,
    ShardedDDPOption,
    logger,
)
from typing import List, Optional


def maybe_zero_3(param, ignore_status=False, name=None):
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
    if hasattr(param, "ds_id"):
        if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
            if not ignore_status:
                print(name, 'no ignore status')
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param


def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
    to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()}
    return to_return


def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `chunks` chunks of roughly equal lengths.
    """

    if len(indices) % num_chunks != 0:
        return [indices[i::num_chunks] for i in range(num_chunks)]

    num_indices_per_chunk = len(indices) // num_chunks

    chunks = [[] for _ in range(num_chunks)]
    chunks_lengths = [0 for _ in range(num_chunks)]
    for index in indices:
        shortest_chunk = chunks_lengths.index(min(chunks_lengths))
        chunks[shortest_chunk].append(index)
        chunks_lengths[shortest_chunk] += lengths[index]
        if len(chunks[shortest_chunk]) == num_indices_per_chunk:
            chunks_lengths[shortest_chunk] = float("inf")

    return chunks


def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
        # all samples are in the same modality
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
    mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
    lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    last_mm = mm_megabatches[-1]
    last_lang = lang_megabatches[-1]
    additional_batch = last_mm + last_lang
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    megabatch_indices = torch.randperm(len(megabatches), generator=generator)
    megabatches = [megabatches[i] for i in megabatch_indices]

    if len(additional_batch) > 0:
        megabatches.append(sorted(additional_batch))

    return [i for megabatch in megabatches for i in megabatch]


def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    indices = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size
    megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
    megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches]

    return [i for megabatch in megabatches for batch in megabatch for i in batch]


class LengthGroupedSampler(Sampler):
    r"""
    Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size = batch_size
        self.world_size = world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        return len(self.lengths)

    def __iter__(self):
        if self.group_by_modality:
            indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        else:
            indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        return iter(indices)


class VStreamTrainer(Trainer):

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        if self.args.group_by_modality_length:
            lengths = self.train_dataset.modality_lengths
            return LengthGroupedSampler(
                self.args.train_batch_size,
                world_size=self.args.world_size * self.args.gradient_accumulation_steps,
                lengths=lengths,
                group_by_modality=True,
            )
        else:
            return super()._get_train_sampler()

    def create_optimizer(self):
        """
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()
        if self.sharded_ddp == ShardedDDPOption.SIMPLE:
            return super().create_optimizer()

        opt_model = self.model

        if self.optimizer is None:
            decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
            decay_parameters = [name for name in decay_parameters if "bias" not in name]
            if self.args.mm_projector_lr is not None:
                projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name]
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                        "lr": self.args.mm_projector_lr,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                        "lr": self.args.mm_projector_lr,
                    },
                ]
            else:
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                    },
                ]

            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                        logger.info(f"skipped {module}: {skipped/2**20}M params")
                        manager.register_module_override(module, "weight", {"optim_bits": 32})
                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
                logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer
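A minimal sketch of the length-grouped sampling above, assuming flash_vstream is importable from a local install. The toy lengths are hypothetical values, positive for image/video samples and negative for text-only samples, mirroring what LazySupervisedDataset.modality_lengths produces.

# Illustration only: not taken from the Flash-VStream training data.
import torch
from flash_vstream.train.vstream_trainer import (
    get_length_grouped_indices,
    get_modality_length_grouped_indices,
)

lengths = [120, 95, -40, 300, -12, 88, -77, 150]  # hypothetical sample lengths
g = torch.Generator().manual_seed(0)

# Group indices so that each per-device chunk holds samples of similar length.
print(get_length_grouped_indices(lengths, batch_size=2, world_size=2, generator=g))

# Additionally keep multimodal and text-only samples in separate megabatches.
print(get_modality_length_grouped_indices(lengths, batch_size=2, world_size=2, generator=g))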
flash_vstream/utils.py
ADDED
@@ -0,0 +1,128 @@
# Based on https://github.com/haotian-liu/LLaVA.

import datetime
import logging
import logging.handlers
import os
import sys

import requests

from flash_vstream.constants import LOGDIR

server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."

handler = None


def build_logger(logger_name, logger_filename):
    global handler

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    if not logging.getLogger().handlers:
        logging.basicConfig(level=logging.INFO)
    logging.getLogger().handlers[0].setFormatter(formatter)

    # Redirect stdout and stderr to loggers
    stdout_logger = logging.getLogger("stdout")
    stdout_logger.setLevel(logging.INFO)
    sl = StreamToLogger(stdout_logger, logging.INFO)
    sys.stdout = sl

    stderr_logger = logging.getLogger("stderr")
    stderr_logger.setLevel(logging.ERROR)
    sl = StreamToLogger(stderr_logger, logging.ERROR)
    sys.stderr = sl

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Add a file handler for all loggers
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        filename = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            filename, when='D', utc=True, encoding='UTF-8')
        handler.setFormatter(formatter)

        for name, item in logging.root.manager.loggerDict.items():
            if isinstance(item, logging.Logger):
                item.addHandler(handler)

    return logger


class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        return getattr(self.terminal, attr)

    def write(self, buf):
        temp_linebuf = self.linebuf + buf
        self.linebuf = ''
        for line in temp_linebuf.splitlines(True):
            # From the io.TextIOWrapper docs:
            #   On output, if newline is None, any '\n' characters written
            #   are translated to the system default line separator.
            # By default sys.stdout.write() expects '\n' newlines and then
            # translates them so this is still cross platform.
            if line[-1] == '\n':
                self.logger.log(self.log_level, line.rstrip())
            else:
                self.linebuf += line

    def flush(self):
        if self.linebuf != '':
            self.logger.log(self.log_level, self.linebuf.rstrip())
        self.linebuf = ''


def disable_torch_init():
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    import torch
    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)


def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.
    """
    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    data = "{" + '"input": ' + f'"{text}"' + "}"
    data = data.encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException as e:
        flagged = False
    except KeyError as e:
        flagged = False

    return flagged


def pretty_print_semaphore(semaphore):
    if semaphore is None:
        return "None"
    return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
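A minimal usage sketch for the helpers above, assuming flash_vstream is importable; the logger name and log filename are hypothetical choices for illustration, and OPENAI_API_KEY must be set for violates_moderation to do anything useful.

from flash_vstream.utils import build_logger, disable_torch_init

# Create a rotating file logger under LOGDIR; stdout/stderr are redirected into it.
logger = build_logger("demo_server", "demo_server.log")
logger.info("Logger initialized.")

# Skip redundant default weight initialization before loading a pretrained checkpoint.
disable_torch_init()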
requirements.txt
CHANGED
@@ -1 +1 @@
-huggingface_hub==0.22.2
+huggingface_hub==0.22.2