Use the conversion code and add the converted model to this hub
- .gitattributes +1 -0
- config.json +25 -0
- convert_blip_original_pytorch_to_hf.py +195 -0
- generation_config.json +7 -0
- preprocessor_config.json +25 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +25 -0
- vocab.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "architectures": [
+    "BlipForQuestionAnswering"
+  ],
+  "image_text_hidden_size": 256,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "blip",
+  "projection_dim": 512,
+  "text_config": {
+    "initializer_factor": 1.0,
+    "model_type": "blip_text_model",
+    "num_attention_heads": 12
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.33.1",
+  "vision_config": {
+    "dropout": 0.0,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "model_type": "blip_vision_model",
+    "num_channels": 3
+  }
+}
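For a quick sanity check, the architecture declared in this config can be instantiated directly. A minimal sketch, assuming all fields not listed above fall back to the BLIP defaults shipped with transformers:

from transformers import BlipConfig, BlipForQuestionAnswering

# Rebuild the configuration above; unspecified fields use transformers' BLIP defaults.
config = BlipConfig(
    projection_dim=512,
    image_text_hidden_size=256,
    logit_scale_init_value=2.6592,
    text_config={"num_attention_heads": 12},
    vision_config={"num_channels": 3},
)

# Randomly initialized skeleton with the same layout as the converted checkpoint.
model = BlipForQuestionAnswering(config)
print(sum(p.numel() for p in model.parameters()))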
convert_blip_original_pytorch_to_hf.py
ADDED
@@ -0,0 +1,195 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import re
+
+import requests
+import torch
+
+# git clone https://github.com/salesforce/BLIP.git
+from models.blip import blip_decoder
+from models.blip_itm import blip_itm
+from models.blip_vqa import blip_vqa
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+
+from transformers import (
+    BertTokenizer,
+    BlipConfig,
+    BlipForConditionalGeneration,
+    BlipForImageTextRetrieval,
+    BlipForQuestionAnswering,
+)
+
+
+def load_demo_image(image_size, device):
+    img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
+    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+
+    transform = transforms.Compose(
+        [
+            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+        ]
+    )
+    image = transform(raw_image).unsqueeze(0).to(device)
+    return image
+
+
+def rename_key(key):
+    if "visual_encoder" in key:
+        key = re.sub("visual_encoder*", "vision_model.encoder", key)
+    if "blocks" in key:
+        key = re.sub(r"blocks", "layers", key)
+    if "attn" in key:
+        key = re.sub(r"attn", "self_attn", key)
+    if "norm1" in key:
+        key = re.sub(r"norm1", "layer_norm1", key)
+    if "norm2" in key:
+        key = re.sub(r"norm2", "layer_norm2", key)
+    if "encoder.norm" in key:
+        key = re.sub(r"encoder.norm", "post_layernorm", key)
+    if "encoder.patch_embed.proj" in key:
+        key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key)
+
+    if "encoder.pos_embed" in key:
+        key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key)
+    if "encoder.cls_token" in key:
+        key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key)
+
+    if "self_attn" in key:
+        key = re.sub(r"self_attn.proj", "self_attn.projection", key)
+
+    return key
+
+
+@torch.no_grad()
+def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None):
+    """
+    Copy/paste/tweak model's weights to transformers design.
+    """
+    if config_path is not None:
+        config = BlipConfig.from_pretrained(config_path)
+    else:
+        config = BlipConfig(projection_dim=512, text_config={}, vision_config={})
+
+    hf_model = BlipForConditionalGeneration(config).eval()
+
+    model_url = "model_base_capfilt_large.pth"
+
+    # pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base")
+    # pt_model = pt_model.eval()
+
+    # modified_state_dict = pt_model.state_dict()
+    # for key in modified_state_dict.copy():
+    #     value = modified_state_dict.pop(key)
+    #     renamed_key = rename_key(key)
+    #     modified_state_dict[renamed_key] = value
+    #
+    # hf_model.load_state_dict(modified_state_dict)
+    #
+    image_size = 384
+    image = load_demo_image(image_size=image_size, device="cpu")
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    # input_ids = tokenizer(["a picture of"]).input_ids
+    #
+    # out = hf_model.generate(image, input_ids)
+    #
+    # assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
+    #
+    # out = hf_model.generate(image)
+    #
+    # assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
+    #
+    # if pytorch_dump_folder_path is not None:
+    #     hf_model.save_pretrained(pytorch_dump_folder_path)
+    #
+    # # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth'
+    # model_url = (
+    #     # "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
+    #     # )
+
+    vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base")
+    vqa_model.eval()
+
+    modified_state_dict = vqa_model.state_dict()
+    for key in modified_state_dict.copy():
+        value = modified_state_dict.pop(key)
+        renamed_key = rename_key(key)
+        modified_state_dict[renamed_key] = value
+
+    hf_vqa_model = BlipForQuestionAnswering(config)
+    offset_keys = [i for i in modified_state_dict.keys() if i not in hf_vqa_model.state_dict().keys()]
+    print(len([i for i in hf_vqa_model.state_dict().keys() if i in modified_state_dict.keys()]))
+    for key in offset_keys:
+        modified_state_dict.pop(key)
+
+    hf_vqa_model.load_state_dict(modified_state_dict)
+
+    question = ["How many dogs are in this image?"]
+    question_input_ids = tokenizer(question, return_tensors="pt").input_ids
+
+    answer = hf_vqa_model.generate(question_input_ids, image)
+    print(tokenizer.decode(answer[0]))
+
+    # assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]"
+    if pytorch_dump_folder_path is not None:
+        hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa")
+
+    # model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth"
+    #
+    # itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base")
+    # itm_model.eval()
+    #
+    # modified_state_dict = itm_model.state_dict()
+    # for key in modified_state_dict.copy():
+    #     value = modified_state_dict.pop(key)
+    #     renamed_key = rename_key(key)
+    #     modified_state_dict[renamed_key] = value
+    #
+    # hf_itm_model = BlipForImageTextRetrieval(config)
+    #
+    # question = ["A picture of a woman with a dog sitting in a beach"]
+    # question_input_ids = tokenizer(
+    #     question,
+    #     return_tensors="pt",
+    #     padding="max_length",
+    #     truncation=True,
+    #     max_length=35,
+    # ).input_ids
+    #
+    # hf_itm_model.load_state_dict(modified_state_dict)
+    # hf_itm_model.eval()
+    #
+    # out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True)
+    # out = hf_itm_model(question_input_ids, image, use_itm_head=False)
+    #
+    # assert out[0].item() == 0.2110687494277954
+    # assert torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127
+    #
+    # if pytorch_dump_folder_path is not None:
+    #     hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+    args = parser.parse_args()
+
+    convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path)
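This script is the conversion entry point referenced by the commit message: it is run as python convert_blip_original_pytorch_to_hf.py --pytorch_dump_folder_path <folder> (with model_base_capfilt_large.pth downloaded locally from the BLIP release) and saves the converted VQA model to <folder>_vqa. Below is a minimal sketch of using such a converted checkpoint afterwards; the dump folder name is hypothetical, and the processor is borrowed from a stock BLIP-VQA repo since its files match the preprocessor and tokenizer configs committed here.

import requests
import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor

# Hypothetical path: the convert script appends "_vqa" to --pytorch_dump_folder_path.
model = BlipForQuestionAnswering.from_pretrained("./blip-vqa_vqa").eval()
# Stand-in source for the processor files (same BLIP image/text preprocessing).
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

inputs = processor(raw_image, "How many dogs are in this image?", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))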
generation_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 30522,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.33.1"
+}
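These decoding defaults are picked up automatically by generate() when the checkpoint is loaded; a small sketch reconstructing them explicitly, in case they need to be inspected or overridden:

from transformers import GenerationConfig

# Same defaults as generation_config.json above; generate() applies them unless overridden.
gen_config = GenerationConfig(bos_token_id=30522, eos_token_id=2, pad_token_id=0)
print(gen_config)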
preprocessor_config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "BlipImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "processor_class": "BlipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  },
+  "size_divisor": 32
+}
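The image pipeline encoded above (bicubic resize to 384x384, rescale by 1/255, CLIP mean/std normalization) can be reproduced directly with BlipImageProcessor. A minimal sketch using the values from this file:

import requests
from PIL import Image
from transformers import BlipImageProcessor

# Mirrors preprocessor_config.json: resize to 384x384, rescale, then normalize.
image_processor = BlipImageProcessor(
    size={"height": 384, "width": 384},
    image_mean=[0.48145466, 0.4578275, 0.40821073],
    image_std=[0.26862954, 0.26130258, 0.27577711],
)

img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
pixel_values = image_processor(raw_image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # expected: torch.Size([1, 3, 384, 384])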
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f1430f0112631bd4b5d0409c2ead0a942eecd32b96de5b561ea117e6483b63
+size 1538958234
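Since the weights are stored via Git LFS, a downloaded pytorch_model.bin can be checked against the pointer above. A small sketch, assuming the file sits in the current directory:

import hashlib
import os

# Values copied from the LFS pointer above.
path = "pytorch_model.bin"
expected_oid = "b1f1430f0112631bd4b5d0409c2ead0a942eecd32b96de5b561ea117e6483b63"
expected_size = 1538958234

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha256.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")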
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 512,
+  "name_or_path": "ybelkada/blip-image-captioning-base",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "processor_class": "BlipProcessor",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ]
+}
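This is a plain lowercased BERT tokenizer configuration, matching the bert-base-uncased tokenizer the convert script instantiates. A minimal sketch, assuming a hypothetical local clone of this repo at ./blip-vqa-hub containing the tokenizer files committed here:

from transformers import BertTokenizer

# Hypothetical local clone; vocab.txt, tokenizer.json and the configs above are sufficient.
tokenizer = BertTokenizer.from_pretrained("./blip-vqa-hub")
print(tokenizer("How many dogs are in this image?").input_ids)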
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.