czczup committed
Commit 144479b
1 Parent(s): e2f4cc0

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +20 -3
  2. config.json +2 -55
  3. modeling_internvl_chat.py +28 -10
  4. preprocessor_config.json +19 -0
README.md CHANGED
@@ -74,8 +74,10 @@ We provide an example code to run InternVL2-4B using `transformers`.
  > Please use transformers==4.37.2 to ensure the model works normally.
  
  ```python
+ import numpy as np
  import torch
  import torchvision.transforms as T
+ from decord import VideoReader, cpu
  from PIL import Image
  from torchvision.transforms.functional import InterpolationMode
  from transformers import AutoModel, AutoTokenizer
@@ -204,7 +206,22 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
  print(f'User: {question}')
  print(f'Assistant: {response}')
  
- # multi-image multi-round conversation (多图多轮对话)
+ # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
+ pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
+ 
+ question = '<image>\nDescribe the two images in detail.'
+ response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                history=None, return_history=True)
+ 
+ question = 'What are the similarities and differences between these two images.'
+ response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                history=history, return_history=True)
+ print(f'User: {question}')
+ print(f'Assistant: {response}')
+ 
+ # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
  pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -286,7 +303,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
  print(f'User: {question}')
  print(f'Assistant: {response}')
  
- question = 'Describe this video in detail.'
+ question = 'Describe this video in detail. Don\'t repeat.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list,
                                 history=history, return_history=True)
@@ -416,4 +433,4 @@ InternVL 2.0 是一个多模态大语言模型系列,包含各种规模的模
  journal={arXiv preprint arXiv:2404.16821},
  year={2024}
  }
- ```
+ ```
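Editor's note: the new `numpy` and `decord` imports, together with the `num_patches_list` argument used in the video prompt above, imply a frame-sampling helper that this excerpt does not show. Below is a minimal sketch of what such a helper could look like; the name `load_video`, its parameters, and the reuse of `build_transform`/`dynamic_preprocess` from the README example are assumptions, not the README's verbatim code.

```python
# Illustrative sketch only -- not the repository's verbatim helper.
# Samples `num_segments` evenly spaced frames with decord, tiles each frame
# with the README's build_transform/dynamic_preprocess helpers (assumed to be
# defined earlier in the example), and returns pixel_values plus the
# per-frame tile counts expected by model.chat(..., num_patches_list=...).
import numpy as np
import torch
from decord import VideoReader, cpu
from PIL import Image


def load_video(video_path, input_size=448, num_segments=8, max_num=1):
    vr = VideoReader(video_path, ctx=cpu(0))
    frame_indices = np.linspace(0, len(vr) - 1, num_segments).astype(int)
    transform = build_transform(input_size=input_size)  # assumed README helper
    pixel_values_list, num_patches_list = [], []
    for idx in frame_indices:
        # decord frame -> numpy array -> PIL image
        frame = Image.fromarray(vr.get_batch([int(idx)]).asnumpy()[0])
        tiles = dynamic_preprocess(frame, image_size=input_size,
                                   use_thumbnail=True, max_num=max_num)
        pv = torch.stack([transform(tile) for tile in tiles])
        num_patches_list.append(pv.shape[0])
        pixel_values_list.append(pv)
    return torch.cat(pixel_values_list), num_patches_list
```

The returned `num_patches_list` has one entry per sampled frame, so the prompt typically carries one `<image>` placeholder per frame and `chat` maps each placeholder to the corresponding number of tiles.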
config.json CHANGED
@@ -12,11 +12,12 @@
    "dynamic_image_size": true,
    "force_image_size": 448,
    "llm_config": {
-     "_name_or_path": "./pretrained/Phi-3-mini-128k-instruct",
+     "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
      "add_cross_attention": false,
      "architectures": [
        "Phi3ForCausalLM"
      ],
+     "attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "auto_map": {
        "AutoConfig": "configuration_phi3.Phi3Config",
@@ -212,86 +213,32 @@
    "use_llm_lora": 0,
    "use_thumbnail": true,
    "vision_config": {
-     "_name_or_path": "",
-     "add_cross_attention": false,
      "architectures": [
        "InternVisionModel"
      ],
      "attention_dropout": 0.0,
-     "bad_words_ids": null,
-     "begin_suppress_tokens": null,
-     "bos_token_id": null,
-     "chunk_size_feed_forward": 0,
-     "cross_attention_hidden_size": null,
-     "decoder_start_token_id": null,
-     "diversity_penalty": 0.0,
-     "do_sample": false,
      "drop_path_rate": 0.0,
      "dropout": 0.0,
-     "early_stopping": false,
-     "encoder_no_repeat_ngram_size": 0,
-     "eos_token_id": null,
-     "exponential_decay_length_penalty": null,
-     "finetuning_task": null,
-     "forced_bos_token_id": null,
-     "forced_eos_token_id": null,
      "hidden_act": "gelu",
      "hidden_size": 1024,
-     "id2label": {
-       "0": "LABEL_0",
-       "1": "LABEL_1"
-     },
      "image_size": 448,
      "initializer_factor": 1.0,
      "initializer_range": 0.02,
      "intermediate_size": 4096,
-     "is_decoder": false,
-     "is_encoder_decoder": false,
-     "label2id": {
-       "LABEL_0": 0,
-       "LABEL_1": 1
-     },
      "layer_norm_eps": 1e-06,
-     "length_penalty": 1.0,
-     "max_length": 20,
-     "min_length": 0,
      "model_type": "intern_vit_6b",
-     "no_repeat_ngram_size": 0,
      "norm_type": "layer_norm",
      "num_attention_heads": 16,
-     "num_beam_groups": 1,
-     "num_beams": 1,
      "num_channels": 3,
      "num_hidden_layers": 24,
-     "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
-     "output_scores": false,
-     "pad_token_id": null,
      "patch_size": 14,
-     "prefix": null,
-     "problem_type": null,
-     "pruned_heads": {},
      "qk_normalization": false,
      "qkv_bias": true,
-     "remove_invalid_values": false,
-     "repetition_penalty": 1.0,
      "return_dict": true,
-     "return_dict_in_generate": false,
-     "sep_token_id": null,
-     "suppress_tokens": null,
-     "task_specific_params": null,
-     "temperature": 1.0,
-     "tf_legacy_loss": false,
-     "tie_encoder_decoder": false,
-     "tie_word_embeddings": true,
-     "tokenizer_class": null,
-     "top_k": 50,
-     "top_p": null,
      "torch_dtype": "bfloat16",
-     "torchscript": false,
      "transformers_version": "4.37.2",
-     "typical_p": 1.0,
      "use_bfloat16": true,
      "use_flash_attn": true
    }
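Editor's note: beyond stripping generic generation defaults from `vision_config`, the substantive changes are `_name_or_path` now pointing at the Hub ID `microsoft/Phi-3-mini-128k-instruct` and the new `attn_implementation: flash_attention_2` hint for the LLM. A quick way to sanity-check the nested config after this commit (sketch; the repo ID `OpenGVLab/InternVL2-4B` is an assumption and may differ from your copy):

```python
# Sketch: inspect the nested InternVLChatConfig after this commit.
# The repo ID below is an assumption; substitute your own local or Hub path.
from transformers import AutoConfig

config = AutoConfig.from_pretrained('OpenGVLab/InternVL2-4B', trust_remote_code=True)

# Values expected from the diff above
print(config.llm_config._name_or_path)      # microsoft/Phi-3-mini-128k-instruct
print(config.vision_config.model_type)      # intern_vit_6b
print(config.vision_config.image_size)      # 448
print(config.vision_config.patch_size)      # 14
print(config.vision_config.use_flash_attn)  # True
```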
modeling_internvl_chat.py CHANGED
@@ -7,6 +7,7 @@ import warnings
  from typing import Any, List, Optional, Tuple, Union
  
  import torch.utils.checkpoint
+ import transformers
  from torch import nn
  from torch.nn import CrossEntropyLoss
  from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
@@ -23,6 +24,14 @@ from .modeling_phi3 import Phi3ForCausalLM
  logger = logging.get_logger(__name__)
  
  
+ def version_cmp(v1, v2, op='eq'):
+     import operator
+ 
+     from packaging import version
+     op_func = getattr(operator, op)
+     return op_func(version.parse(v1), version.parse(v2))
+ 
+ 
  class InternVLChatModel(PreTrainedModel):
      config_class = InternVLChatConfig
      main_input_name = 'pixel_values'
@@ -31,6 +40,7 @@ class InternVLChatModel(PreTrainedModel):
      def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
          super().__init__(config)
  
+         assert version_cmp(transformers.__version__, '4.36.2', 'ge')
          image_size = config.force_image_size or config.vision_config.image_size
          patch_size = config.vision_config.patch_size
          self.patch_size = patch_size
@@ -183,36 +193,44 @@ class InternVLChatModel(PreTrainedModel):
          vit_embeds = self.mlp1(vit_embeds)
          return vit_embeds
  
-     def batch_chat(self, tokenizer, pixel_values, num_patches_list, questions, generation_config, history=None,
-                    return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
-                    IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False):
+     def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                    history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                    IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
          if history is not None or return_history:
              print('Now multi-turn chat is not supported in batch_chat.')
              raise NotImplementedError
+ 
+         if image_counts is not None:
+             num_patches_list = image_counts
+             print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+ 
          img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
          self.img_context_token_id = img_context_token_id
  
-         from .conversation import get_conv_template
+         if verbose and pixel_values is not None:
+             image_bs = pixel_values.shape[0]
+             print(f'dynamic ViT batch size: {image_bs}')
  
          queries = []
-         if verbose:
-             image_bs = pixel_values.shape[0]
-             print(f'dynamic ViT batch size: {image_bs}, num_patches_list: {num_patches_list}')
          for idx, num_patches in enumerate(num_patches_list):
-             image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-             question = image_token + '\n' + questions[idx]
+             question = questions[idx]
+             if pixel_values is not None and '<image>' not in question:
+                 question = '<image>\n' + question
              template = get_conv_template(self.template)
              template.append_message(template.roles[0], question)
              template.append_message(template.roles[1], None)
              query = template.get_prompt()
+ 
+             image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+             query = query.replace('<image>', image_tokens, 1)
              queries.append(query)
+ 
          tokenizer.padding_side = 'left'
          model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
          input_ids = model_inputs['input_ids'].cuda()
          attention_mask = model_inputs['attention_mask'].cuda()
          eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
          generation_config['eos_token_id'] = eos_token_id
- 
          generation_output = self.generate(
              pixel_values=pixel_values,
              input_ids=input_ids,
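Editor's note: the reworked `batch_chat` moves `questions` ahead of `num_patches_list` (now a keyword argument), auto-inserts an `<image>` placeholder when a question lacks one, and keeps the old `image_counts` name only as a deprecated alias. A hedged usage sketch against the new signature follows; it assumes `model`, `tokenizer`, and `load_image` from the README example, and the paths and generation settings are illustrative.

```python
# Usage sketch for the updated batch_chat signature (illustrative settings;
# `model`, `tokenizer`, and `load_image` come from the README example).
import torch

pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# one entry per image: how many tiles load_image produced for it
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
generation_config = dict(max_new_tokens=1024, do_sample=False)

# `num_patches_list` is now a keyword argument; passing `image_counts=` still
# works but only triggers the deprecation warning added in this commit.
responses = model.batch_chat(tokenizer, pixel_values,
                             questions=questions,
                             generation_config=generation_config,
                             num_patches_list=num_patches_list)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')
```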
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "crop_size": 448,
+   "do_center_crop": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.485,
+     0.456,
+     0.406
+   ],
+   "image_std": [
+     0.229,
+     0.224,
+     0.225
+   ],
+   "resample": 3,
+   "size": 448
+ }
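Editor's note: the new `preprocessor_config.json` pins a 448×448 input pipeline with ImageNet mean/std and bicubic resampling (`"resample": 3` is PIL's `Image.BICUBIC`). For reference, a roughly equivalent torchvision transform is sketched below; this is only an illustration of the config values, the README's own `build_transform` remains the preprocessing actually used in the example code.

```python
# Torchvision sketch of the preprocessing described by preprocessor_config.json.
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)   # image_mean
IMAGENET_STD = (0.229, 0.224, 0.225)    # image_std

transform = T.Compose([
    T.Resize(448, interpolation=InterpolationMode.BICUBIC),  # do_resize, size=448, resample=3
    T.CenterCrop(448),                                        # do_center_crop, crop_size=448
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),        # do_normalize
])
```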