import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "../MiniCPM-V-2_6/"
DEVICE_MAP = "cpu"
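
# Load the full multimodal checkpoint on CPU in eval mode. 'eager' attention
# keeps the attention math as plain tensor ops, which tends to trace to ONNX
# more reliably than fused SDPA kernels.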
origin_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, trust_remote_code=True, attn_implementation='eager',
    device_map=DEVICE_MAP).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
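
# Export is inference-only; freeze the weights so tracing does not carry
# autograd state.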
for param in origin_model.parameters():
    param.requires_grad = False
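

# VisionTransformer wraps the two vision-side submodules of MiniCPM-V-2.6:
# vpm, the image encoder, and the resampler, which compresses the encoder's
# patch tokens into a fixed-length sequence of query tokens for the LLM.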
class VisionTransformer(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.vpm = origin_model.vpm
        self.resampler = origin_model.resampler
        # Patch grid for a 448x448 input: 448 / 14 = 32 patches per side,
        # assuming the encoder's 14x14 patch size.
        self.tgt_sizes = torch.tensor([[32, 32]], dtype=torch.int32)

    def forward(self, pixel_values):
        # Encode the image into patch embeddings, then resample them down to
        # the fixed-length sequence consumed by the language model.
        vit_embeds = self.vpm(pixel_values).last_hidden_state
        vit_embeds = self.resampler(vit_embeds, self.tgt_sizes)
        return vit_embeds


def convert_vision_transformer():
    model = VisionTransformer()
    IMAGE_SIZE = 448
    pixel_values = torch.randn((1, 3, IMAGE_SIZE, IMAGE_SIZE))
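
    # Smoke test on a random image: the resampler should emit 64 query tokens
    # with hidden size 3584, the language model's embedding width.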
    vit_embeds = model(pixel_values)
    print(vit_embeds.shape)
    if vit_embeds.shape != (1, 64, 3584):
        raise ValueError("vit_embeds shape is not correct, something is wrong")
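
    # Note: height/width and seq_len are declared dynamic below, but tgt_sizes
    # is fixed at [[32, 32]], so the exported graph is effectively only valid
    # for 448x448 inputs and a 64-token output.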
    torch.onnx.export(model, pixel_values,
                      'vision_transformer.onnx',
                      verbose=False,
                      input_names=['pixel_values'],
                      output_names=['vit_embeds'],
                      dynamic_axes={'pixel_values': {0: 'batch_size', 2: 'height', 3: 'width'},
                                    'vit_embeds': {0: 'batch_size', 1: 'seq_len'}},
                      do_constant_folding=True,
                      opset_version=17)
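
    # Optional sanity check (a sketch, not part of the original flow): compare
    # the ONNX graph's output against PyTorch. Uncomment if `onnxruntime` is
    # installed.
    # import numpy as np
    # import onnxruntime as ort
    # sess = ort.InferenceSession('vision_transformer.onnx')
    # onnx_out = sess.run(['vit_embeds'],
    #                     {'pixel_values': pixel_values.numpy()})[0]
    # assert np.allclose(onnx_out, vit_embeds.detach().numpy(), atol=1e-3)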


if __name__ == "__main__":
    convert_vision_transformer()