zR committed on
Commit
90e901a
1 Parent(s): 98be0af
Files changed (1) hide show
  1. modeling_cogvlm.py +5 -4
modeling_cogvlm.py CHANGED
@@ -837,7 +837,7 @@ class CogVLMVideoForCausalLM(CogVLMPreTrainedModel):
837
  text = _history_to_prompt(template_version, history, query)
838
  input_ids = [tokenizer.bos_token_id]
839
  token_type_ids = [LANGUAGE_TOKEN_TYPE]
840
- add_time_indices = False
841
  if images is not None and len(images) == 1:
842
  # vision
843
  transform = transforms.Compose(
@@ -853,15 +853,16 @@ class CogVLMVideoForCausalLM(CogVLMPreTrainedModel):
853
  images = [transform(images[0]).transpose(0, 1)] # (T, C, H, W)
854
  num_eois = len(images[0])
855
  tokenizer.pad_token_id = 128002
856
- vision_token_num = (64 + 2) * num_eois
857
  if not add_time_indices:
 
858
  input_ids += [tokenizer.pad_token_id] * vision_token_num # add spetial token
859
  token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num
860
  else:
861
  video_ids, video_type_ids = [], []
 
862
  for _time_idx in range(num_eois):
863
- video_ids += [tokenizer.pad_token_id] * vision_token_num
864
- video_type_ids += [VISION_TOKEN_TYPE] * vision_token_num
865
  # add time indices
866
  time_indices = tokenizer.encode(str(_time_idx), add_special_tokens=False)
867
  video_ids += time_indices
 
837
  text = _history_to_prompt(template_version, history, query)
838
  input_ids = [tokenizer.bos_token_id]
839
  token_type_ids = [LANGUAGE_TOKEN_TYPE]
840
+ add_time_indices = True if template_version == 'chat' else False
841
  if images is not None and len(images) == 1:
842
  # vision
843
  transform = transforms.Compose(
 
853
  images = [transform(images[0]).transpose(0, 1)] # (T, C, H, W)
854
  num_eois = len(images[0])
855
  tokenizer.pad_token_id = 128002
 
856
  if not add_time_indices:
857
+ vision_token_num = (64 + 2) * num_eois
858
  input_ids += [tokenizer.pad_token_id] * vision_token_num # add spetial token
859
  token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num
860
  else:
861
  video_ids, video_type_ids = [], []
862
+ sing_vision_token_num = (64 + 2)
863
  for _time_idx in range(num_eois):
864
+ video_ids += [tokenizer.pad_token_id] * sing_vision_token_num
865
+ video_type_ids += [VISION_TOKEN_TYPE] * sing_vision_token_num
866
  # add time indices
867
  time_indices = tokenizer.encode(str(_time_idx), add_special_tokens=False)
868
  video_ids += time_indices