DylanJHJ committed
Commit fe717f0 (1 parent: b950b6a)

Add the conversion code and converted model files to this hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "BlipForQuestionAnswering"
+   ],
+   "image_text_hidden_size": 256,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "blip",
+   "projection_dim": 512,
+   "text_config": {
+     "initializer_factor": 1.0,
+     "model_type": "blip_text_model",
+     "num_attention_heads": 12
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.1",
+   "vision_config": {
+     "dropout": 0.0,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "model_type": "blip_vision_model",
+     "num_channels": 3
+   }
+ }
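For reference, the config above can be consumed directly by transformers. The following is a minimal sketch, not part of the commit, assuming this config.json sits in the current working directory; it builds the BlipForQuestionAnswering architecture declared in the file (with randomly initialized weights, since the real weights live in pytorch_model.bin below).

# Minimal sketch (assumption: ./config.json is the file shown above).
from transformers import BlipConfig, BlipForQuestionAnswering

config = BlipConfig.from_pretrained(".")         # reads ./config.json
model = BlipForQuestionAnswering(config).eval()  # matches the "architectures" entry
print(config.projection_dim)                     # 512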
convert_blip_original_pytorch_to_hf.py ADDED
@@ -0,0 +1,195 @@
+ # coding=utf-8
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import re
+
+ import requests
+ import torch
+
+ # git clone https://github.com/salesforce/BLIP.git
+ from models.blip import blip_decoder
+ from models.blip_itm import blip_itm
+ from models.blip_vqa import blip_vqa
+ from PIL import Image
+ from torchvision import transforms
+ from torchvision.transforms.functional import InterpolationMode
+
+ from transformers import (
+     BertTokenizer,
+     BlipConfig,
+     BlipForConditionalGeneration,
+     BlipForImageTextRetrieval,
+     BlipForQuestionAnswering,
+ )
+
+
+ def load_demo_image(image_size, device):
+     img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
+     raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+
+     transform = transforms.Compose(
+         [
+             transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
+             transforms.ToTensor(),
+             transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+         ]
+     )
+     image = transform(raw_image).unsqueeze(0).to(device)
+     return image
+
+
+ def rename_key(key):
+     if "visual_encoder" in key:
+         key = re.sub("visual_encoder*", "vision_model.encoder", key)
+     if "blocks" in key:
+         key = re.sub(r"blocks", "layers", key)
+     if "attn" in key:
+         key = re.sub(r"attn", "self_attn", key)
+     if "norm1" in key:
+         key = re.sub(r"norm1", "layer_norm1", key)
+     if "norm2" in key:
+         key = re.sub(r"norm2", "layer_norm2", key)
+     if "encoder.norm" in key:
+         key = re.sub(r"encoder.norm", "post_layernorm", key)
+     if "encoder.patch_embed.proj" in key:
+         key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key)
+
+     if "encoder.pos_embed" in key:
+         key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key)
+     if "encoder.cls_token" in key:
+         key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key)
+
+     if "self_attn" in key:
+         key = re.sub(r"self_attn.proj", "self_attn.projection", key)
+
+     return key
+
+
+ @torch.no_grad()
+ def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None):
+     """
+     Copy/paste/tweak model's weights to transformers design.
+     """
+     if config_path is not None:
+         config = BlipConfig.from_pretrained(config_path)
+     else:
+         config = BlipConfig(projection_dim=512, text_config={}, vision_config={})
+
+     hf_model = BlipForConditionalGeneration(config).eval()
+
+     model_url = "model_base_capfilt_large.pth"
+
+     # pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base")
+     # pt_model = pt_model.eval()
+
+     # modified_state_dict = pt_model.state_dict()
+     # for key in modified_state_dict.copy():
+     #     value = modified_state_dict.pop(key)
+     #     renamed_key = rename_key(key)
+     #     modified_state_dict[renamed_key] = value
+     #
+     # hf_model.load_state_dict(modified_state_dict)
+     #
+     image_size = 384
+     image = load_demo_image(image_size=image_size, device="cpu")
+     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+     # input_ids = tokenizer(["a picture of"]).input_ids
+     #
+     # out = hf_model.generate(image, input_ids)
+     #
+     # assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
+     #
+     # out = hf_model.generate(image)
+     #
+     # assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
+     #
+     # if pytorch_dump_folder_path is not None:
+     #     hf_model.save_pretrained(pytorch_dump_folder_path)
+     #
+     # # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth'
+     # model_url = (
+     #     # "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
+     #     # )
+
+     vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base")
+     vqa_model.eval()
+
+     modified_state_dict = vqa_model.state_dict()
+     for key in modified_state_dict.copy():
+         value = modified_state_dict.pop(key)
+         renamed_key = rename_key(key)
+         modified_state_dict[renamed_key] = value
+
+     hf_vqa_model = BlipForQuestionAnswering(config)
+     offset_keys = [i for i in modified_state_dict.keys() if i not in hf_vqa_model.state_dict().keys()]
+     print(len([i for i in hf_vqa_model.state_dict().keys() if i in modified_state_dict.keys()]))
+     for key in offset_keys:
+         modified_state_dict.pop(key)
+
+     hf_vqa_model.load_state_dict(modified_state_dict)
+
+     question = ["How many dogs are in this image?"]
+     question_input_ids = tokenizer(question, return_tensors="pt").input_ids
+
+     answer = hf_vqa_model.generate(question_input_ids, image)
+     print(tokenizer.decode(answer[0]))
+
+     # assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]"
+     if pytorch_dump_folder_path is not None:
+         hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa")
+
+     # model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth"
+     #
+     # itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base")
+     # itm_model.eval()
+     #
+     # modified_state_dict = itm_model.state_dict()
+     # for key in modified_state_dict.copy():
+     #     value = modified_state_dict.pop(key)
+     #     renamed_key = rename_key(key)
+     #     modified_state_dict[renamed_key] = value
+     #
+     # hf_itm_model = BlipForImageTextRetrieval(config)
+     #
+     # question = ["A picture of a woman with a dog sitting in a beach"]
+     # question_input_ids = tokenizer(
+     #     question,
+     #     return_tensors="pt",
+     #     padding="max_length",
+     #     truncation=True,
+     #     max_length=35,
+     # ).input_ids
+     #
+     # hf_itm_model.load_state_dict(modified_state_dict)
+     # hf_itm_model.eval()
+     #
+     # out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True)
+     # out = hf_itm_model(question_input_ids, image, use_itm_head=False)
+     #
+     # assert out[0].item() == 0.2110687494277954
+     # assert torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127
+     #
+     # if pytorch_dump_folder_path is not None:
+     #     hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+     parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+     args = parser.parse_args()
+
+     convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path)
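As committed, the script converts only the VQA branch: it expects the Salesforce BLIP repository to be importable (for models.blip_vqa), a local checkpoint at the path stored in model_url ("model_base_capfilt_large.pth"), and it writes the converted model to "<pytorch_dump_folder_path>_vqa". A minimal, hypothetical driver is sketched below; the folder name "converted" is only illustrative.

# Minimal sketch of driving the conversion from Python rather than the CLI.
# Assumptions: the Salesforce BLIP repo is on sys.path and the checkpoint
# "model_base_capfilt_large.pth" referenced inside the script exists locally.
from convert_blip_original_pytorch_to_hf import convert_blip_checkpoint

# The script appends "_vqa", so this writes the HF model to ./converted_vqa.
convert_blip_checkpoint(pytorch_dump_folder_path="converted", config_path=None)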
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 30522,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.33.1"
+ }
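These ids become the defaults that generate() falls back to once the model is loaded. A minimal sketch, assuming the files in this commit are checked out locally:

# Minimal sketch (assumption: the repository files are checked out locally in ".").
from transformers import BlipForQuestionAnswering

model = BlipForQuestionAnswering.from_pretrained(".")  # also picks up generation_config.json
print(model.generation_config.bos_token_id)  # 30522
print(model.generation_config.eos_token_id)  # 2
print(model.generation_config.pad_token_id)  # 0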
preprocessor_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "do_normalize": true,
+   "do_pad": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "BlipImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "BlipProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 384,
+     "width": 384
+   },
+   "size_divisor": 32
+ }
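The settings above describe BLIP's standard preprocessing: a 384x384 bicubic resize ("resample": 3), rescaling by 1/255, and normalization with the CLIP mean/std. A minimal sketch of applying them through the processor, assuming this repository's files are available locally and Pillow/requests are installed:

# Minimal sketch (assumption: this repository is checked out locally in ".").
import requests
from PIL import Image
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained(".")  # image processor + tokenizer from this commit
url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(images=image, text="How many dogs are in this image?", return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 384, 384]) per the "size" entry above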
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1f1430f0112631bd4b5d0409c2ead0a942eecd32b96de5b561ea117e6483b63
+ size 1538958234
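This file is only a Git LFS pointer (the .gitattributes rule added above is what routes it through LFS); the roughly 1.5 GB payload is fetched by git lfs pull or by the Hub on download. A small sketch, assuming the real pytorch_model.bin has already been pulled next to this file, that verifies the download against the pointer's oid and size:

# Minimal sketch: check a pulled pytorch_model.bin against the LFS pointer above.
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "b1f1430f0112631bd4b5d0409c2ead0a942eecd32b96de5b561ea117e6483b63"
EXPECTED_SIZE = 1538958234

path = Path("pytorch_model.bin")
digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert path.stat().st_size == EXPECTED_SIZE, "size does not match the LFS pointer"
assert digest.hexdigest() == EXPECTED_SHA256, "sha256 does not match the LFS pointer"
print("pytorch_model.bin matches the LFS pointer")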
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 512,
+   "name_or_path": "ybelkada/blip-image-captioning-base",
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "processor_class": "BlipProcessor",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ]
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
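Taken together, the tokenizer files make the text side a standard uncased BertTokenizer with a 512-token limit and the usual [CLS]/[SEP]/[PAD]/[UNK]/[MASK] specials. Note that "model_input_names" appears twice in tokenizer_config.json as committed; both occurrences are identical, so JSON parsers that keep the last one see no difference. A minimal sketch of loading it, again assuming a local checkout:

# Minimal sketch (assumption: tokenizer_config.json, vocab.txt, etc. are in ".").
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(".")
enc = tokenizer("How many dogs are in this image?", return_tensors="pt")
print(enc["input_ids"])            # wrapped in [CLS] ... [SEP], lowercased first
print(tokenizer.model_max_length)  # 512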