zR
committed on
Commit
•
90e901a
1
Parent(s):
98be0af
update
Browse files- modeling_cogvlm.py +5 -4
modeling_cogvlm.py
CHANGED
@@ -837,7 +837,7 @@ class CogVLMVideoForCausalLM(CogVLMPreTrainedModel):
|
|
837 |
text = _history_to_prompt(template_version, history, query)
|
838 |
input_ids = [tokenizer.bos_token_id]
|
839 |
token_type_ids = [LANGUAGE_TOKEN_TYPE]
|
840 |
-
add_time_indices = False
|
841 |
if images is not None and len(images) == 1:
|
842 |
# vision
|
843 |
transform = transforms.Compose(
|
@@ -853,15 +853,16 @@ class CogVLMVideoForCausalLM(CogVLMPreTrainedModel):
|
|
853 |
images = [transform(images[0]).transpose(0, 1)] # (T, C, H, W)
|
854 |
num_eois = len(images[0])
|
855 |
tokenizer.pad_token_id = 128002
|
856 |
-
vision_token_num = (64 + 2) * num_eois
|
857 |
if not add_time_indices:
|
|
|
858 |
input_ids += [tokenizer.pad_token_id] * vision_token_num # add spetial token
|
859 |
token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num
|
860 |
else:
|
861 |
video_ids, video_type_ids = [], []
|
|
|
862 |
for _time_idx in range(num_eois):
|
863 |
-
video_ids += [tokenizer.pad_token_id] *
|
864 |
-
video_type_ids += [VISION_TOKEN_TYPE] *
|
865 |
# add time indices
|
866 |
time_indices = tokenizer.encode(str(_time_idx), add_special_tokens=False)
|
867 |
video_ids += time_indices
|
|
|
837 |
text = _history_to_prompt(template_version, history, query)
|
838 |
input_ids = [tokenizer.bos_token_id]
|
839 |
token_type_ids = [LANGUAGE_TOKEN_TYPE]
|
840 |
+
add_time_indices = True if template_version == 'chat' else False
|
841 |
if images is not None and len(images) == 1:
|
842 |
# vision
|
843 |
transform = transforms.Compose(
|
|
|
853 |
images = [transform(images[0]).transpose(0, 1)] # (T, C, H, W)
|
854 |
num_eois = len(images[0])
|
855 |
tokenizer.pad_token_id = 128002
|
|
|
856 |
if not add_time_indices:
|
857 |
+
vision_token_num = (64 + 2) * num_eois
|
858 |
input_ids += [tokenizer.pad_token_id] * vision_token_num # add spetial token
|
859 |
token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num
|
860 |
else:
|
861 |
video_ids, video_type_ids = [], []
|
862 |
+
sing_vision_token_num = (64 + 2)
|
863 |
for _time_idx in range(num_eois):
|
864 |
+
video_ids += [tokenizer.pad_token_id] * sing_vision_token_num
|
865 |
+
video_type_ids += [VISION_TOKEN_TYPE] * sing_vision_token_num
|
866 |
# add time indices
|
867 |
time_indices = tokenizer.encode(str(_time_idx), add_special_tokens=False)
|
868 |
video_ids += time_indices
|