chrisc36 committed
Commit: 130d2f2
1 Parent(s): 2070ed4

Add files using upload-large-folder tool

added_tokens.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "<im_col>": 100281,
+   "<im_end>": 100279,
+   "<im_patch>": 100280,
+   "<im_start>": 100278,
+   "<|image|>": 100282
+ }
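
These five entries extend the base tokenizer vocabulary with the image special tokens used by the image preprocessor further down. As a quick sanity check, the ids can be confirmed once the tokenizer is loaded; a minimal sketch, assuming a local checkout of this repository and `trust_remote_code=True`:

    from transformers import AutoTokenizer

    repo = "."  # local path to this repository (placeholder)
    tok = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    for t in ("<im_start>", "<im_end>", "<im_patch>", "<im_col>", "<|image|>"):
        # ids should match added_tokens.json: 100278, 100279, 100280, 100281, 100282
        print(t, tok.convert_tokens_to_ids(t))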
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "architectures": [
+     "MolmoForCausalLM"
+   ],
+   "attention_layer_norm": true,
+   "auto_map": {
+     "AutoConfig": "config_molmo.MolmoConfig",
+     "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
+   },
+   "clip_qkv": null,
+   "embedding_size": 100352,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 22016,
+   "layer_norm_eps": 1e-06,
+   "layer_norm_type": "rms",
+   "max_position_embeddings": 4096,
+   "model_type": "molmo",
+   "norm_after": true,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": null,
+   "qkv_bias": false,
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.43.3",
+   "use_cache": true,
+   "use_position_ids": true,
+   "vocab_size": 100278,
+   "weight_tying": false
+ }
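
The `auto_map` block routes `AutoConfig` and `AutoModelForCausalLM` to the custom `config_molmo.MolmoConfig` and `modeling_molmo.MolmoForCausalLM` classes shipped in this repo, so loading requires `trust_remote_code=True`. A minimal loading sketch (the local path and dtype choice are illustrative, not prescribed by the config):

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    repo = "."  # local path to this repository (placeholder)
    config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        trust_remote_code=True,     # required because of the auto_map above
        torch_dtype=torch.float32,  # matches "torch_dtype" in config.json
    )
    print(config.hidden_size, config.num_hidden_layers)  # 4096, 32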
config_molmo.py ADDED
@@ -0,0 +1,60 @@
+ from typing import List
+
+ from transformers import PretrainedConfig, AutoTokenizer
+
+
+ class MolmoConfig(PretrainedConfig):
+     model_type = "molmo"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=50304,
+         embedding_size=50304,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         use_cache=True,
+         layer_norm_eps: float = 1e-5,
+         rope_theta=10000.0,
+         clip_qkv=None,
+         qkv_bias: bool = False,
+         weight_tying: bool = False,
+         use_position_ids: bool = True,
+         tie_word_embeddings: bool = True,
+         attention_layer_norm: bool = False,
+         norm_after: bool = False,
+         layer_norm_type: str = "rms",
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.embedding_size = embedding_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.layer_norm_eps = layer_norm_eps
+         self.weight_tying = weight_tying
+         self.use_position_ids = use_position_ids
+         self.attention_layer_norm = attention_layer_norm
+         self.num_key_value_heads = num_key_value_heads
+         self.initializer_range = initializer_range
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.clip_qkv = clip_qkv
+         self.qkv_bias = qkv_bias
+         self.norm_after = norm_after
+         self.tie_word_embeddings = tie_word_embeddings
+         self.layer_norm_type = layer_norm_type
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+ MolmoConfig.register_for_auto_class()
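
For reference, constructing `MolmoConfig` directly with the values recorded in config.json above looks roughly like this (a sketch; any argument not passed falls back to the defaults in `__init__`):

    from config_molmo import MolmoConfig

    config = MolmoConfig(
        vocab_size=100278,
        embedding_size=100352,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        max_position_embeddings=4096,
        layer_norm_eps=1e-6,
        layer_norm_type="rms",
        rope_theta=500000.0,
        attention_layer_norm=True,
        norm_after=True,
        qkv_bias=False,
        weight_tying=False,
        tie_word_embeddings=False,
    )
    assert config.model_type == "molmo"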
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.43.3"
+ }
image_preprocessing_molmo.py ADDED
@@ -0,0 +1,569 @@
1
+ """Image processor class for Molmo"""
2
+ from typing import List, Optional, Union, Mapping
3
+
4
+ import numpy as np
5
+ import einops
6
+ import torch
7
+ import torchvision.transforms
8
+ from torchvision.transforms import InterpolationMode
9
+ from torchvision.transforms.functional import convert_image_dtype
10
+
11
+ from transformers.image_utils import (
12
+ OPENAI_CLIP_MEAN,
13
+ OPENAI_CLIP_STD,
14
+ ImageInput,
15
+ is_valid_image,
16
+ )
17
+ from transformers.processing_utils import ImagesKwargs
18
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
19
+ from transformers.utils import TensorType, is_vision_available, logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def make_batched_images(images) -> List[List[ImageInput]]:
26
+ """
27
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
28
+
29
+ Args:
30
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
31
+ The input image.
32
+
33
+ Returns:
34
+ list: A list of images.
35
+ """
36
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
37
+ return [img for img_list in images for img in img_list]
38
+
39
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
40
+ return images
41
+
42
+ elif is_valid_image(images):
43
+ return [images]
44
+
45
+ raise ValueError(f"Could not make batched images from {images}")
46
+
47
+
48
+ def pad_to_bounding_box(
49
+ image, offset_height, offset_width, target_height,
50
+ target_width, value=0
51
+ ):
52
+ height, width = image.shape[:2]
53
+ after_padding_width = target_width - offset_width - width
54
+ after_padding_height = target_height - offset_height - height
55
+ return np.pad(image, [
56
+ [offset_height, after_padding_height],
57
+ [offset_width, after_padding_width],
58
+ [0, 0]
59
+ ], constant_values=value)
60
+
61
+
62
+ def normalize_image(image, offset, scale):
63
+ image -= np.array(offset, dtype=np.float32)[None, None, :]
64
+ image /= np.array(scale, dtype=np.float32)[None, None, :]
65
+ return image
66
+
67
+
68
+ def resize_and_pad(
69
+ image,
70
+ desired_output_size,
71
+ resize_method=InterpolationMode.BILINEAR,
72
+ pad_value=0,
73
+ normalize=True,
74
+ image_mean=OPENAI_CLIP_MEAN,
75
+ image_std=OPENAI_CLIP_STD,
76
+ ):
77
+ desired_height, desired_width = desired_output_size
78
+ height, width = image.shape[:2]
79
+
80
+ # Cast into float32 since the training code did this in float32 and it (very rarely) affects
81
+ # the results after rounding.
82
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
83
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
84
+ image_scale = min(image_scale_x, image_scale_y)
85
+ scaled_height = int(np.array(height, np.float32) * image_scale)
86
+ scaled_width = int(np.array(width, np.float32) * image_scale)
87
+
88
+ # if resize_method == "tensorflow":
89
+ # FIXME remove
90
+ import tensorflow as tf
91
+ image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
92
+ image = tf.image.resize(
93
+ image,
94
+ [scaled_height, scaled_width],
95
+ method=tf.image.ResizeMethod.BILINEAR,
96
+ antialias=True,
97
+ )
98
+ image = tf.clip_by_value(image, 0.0, 1.0)
99
+ image = image.numpy()
100
+ # else:
101
+ # image = torch.permute(torch.from_numpy(image), [2, 0, 1])
102
+ # image = convert_image_dtype(image) # resize in float32
103
+ # image = torchvision.transforms.Resize(
104
+ # [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
105
+ # )(image)
106
+ # image = torch.clip(image, 0.0, 1.0)
107
+ # image = torch.permute(image, [1, 2, 0]).numpy()
108
+
109
+ top_pad = (desired_height - scaled_height) // 2
110
+ left_pad = (desired_width - scaled_width) // 2
111
+ padding = [
112
+ [top_pad, desired_height - scaled_height - top_pad],
113
+ [left_pad, desired_width - scaled_width - left_pad],
114
+ [0, 0]
115
+ ]
116
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
117
+ image = np.pad(image, padding, constant_values=pad_value)
118
+ if normalize:
119
+ image = normalize_image(image, offset=image_mean, scale=image_std)
120
+ return image, image_mask
121
+
122
+
123
+ def select_tiling(h, w, patch_size, max_num_patches):
124
+ """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
125
+ original_size = np.stack([h, w]) # [1, 2]
126
+ original_res = h * w
127
+ tilings = []
128
+ for i in range(1, max_num_patches+1):
129
+ for j in range(1, max_num_patches+1):
130
+ if i*j <= max_num_patches:
131
+ tilings.append((i, j))
132
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
133
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
134
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
135
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
136
+
137
+ # How much we would need to scale the image to fit exactly in each tiling
138
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
139
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size
140
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
141
+ if np.all(required_scale < 1):
142
+ # We are forced to downscale, so try to minimize the amount of downscaling
143
+ ix = np.argmax(required_scale)
144
+ else:
145
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
146
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
147
+ ix = np.argmin(required_scale)
148
+ return candidate_tilings[ix]
149
+
150
+
151
+ class MolmoImagesKwargs(ImagesKwargs, total=False):
152
+ max_crops: Optional[int]
153
+ overlap_margins: Optional[List[int]]
154
+ base_image_input_size: Optional[List[int]]
155
+ image_token_length_w: Optional[int]
156
+ image_token_length_h: Optional[int]
157
+ image_patch_size: Optional[int]
158
+ image_padding_mask: Optional[bool]
159
+
160
+
161
+ class MolmoImageProcessor(BaseImageProcessor):
162
+ """Preprocess images and multi-model inputs"""
163
+
164
+ def __init__(
165
+ self,
166
+ max_crops: int = 12,
167
+ overlap_margins: List[int] = (4, 4),
168
+ base_image_input_size: List[int] = (336, 336),
169
+ image_token_length_w: int = 12,
170
+ image_token_length_h: int = 12,
171
+ image_patch_size: int = 14,
172
+ image_padding_mask: bool = True,
173
+ do_normalize: bool = True,
174
+ image_mean: Optional[Union[float, List[float]]] = None,
175
+ image_std: Optional[Union[float, List[float]]] = None,
176
+ **kwargs,
177
+ ):
178
+ super().__init__(**kwargs)
179
+ self.max_crops = max_crops
180
+ self.overlap_margins = overlap_margins
181
+ self.base_image_input_size = base_image_input_size
182
+ self.image_token_length_w = image_token_length_w
183
+ self.image_token_length_h = image_token_length_h
184
+ self.image_patch_size = image_patch_size
185
+ self.image_padding_mask = image_padding_mask
186
+ self.do_normalize = do_normalize
187
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
188
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
189
+
190
+ def image_to_patches_and_tokens(
191
+ self,
192
+ image: ImageInput,
193
+ image_patch_token_id: int,
194
+ image_col_token_id: int,
195
+ image_start_token_id: int,
196
+ image_end_token_id: int,
197
+ max_crops: Optional[int] = None,
198
+ overlap_margins: Optional[List[int]] = None,
199
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
200
+ image_token_length_w: Optional[int] = None,
201
+ image_token_length_h: Optional[int] = None,
202
+ image_patch_size: Optional[int] = None,
203
+ ):
204
+ """Preprocesses an image
205
+
206
+ Returns:
207
+ crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
208
+ change between images but the other dimensions are fixed
209
+ tokens: (n_tokens,) int32 tokens, pad tokens indicating where to insert the
210
+ patch features, might include other special tokens as well
211
+ patch_ordering: (n_crops, n_tokens_per_crop) order image features should be inserted
212
+ into the `tokens`; negative values indicate patch features to exclude
213
+ padding_mask: (n_crops, n_patches) what percent of each crop is padding; will be None
214
+ if the image mask is not being used.
215
+ """
216
+ if isinstance(base_image_input_size, int):
217
+ base_image_input_size = (base_image_input_size, base_image_input_size)
218
+
219
+ base_image_input_d = image_patch_size
220
+ tokens_per_image = image_token_length_w * image_token_length_h
221
+ image_base_patch_w = base_image_input_size[1] // base_image_input_d
222
+ image_base_patch_h = base_image_input_size[0] // base_image_input_d
223
+
224
+ original_image_h, original_image_w = image.shape[:2]
225
+ crop_size = base_image_input_size[0]
226
+
227
+ # Discard this many patches from the (left/top, right/bottom) of crops
228
+ left_margin, right_margin = overlap_margins
229
+ # left_margin, right_margin = 2, 2
230
+ assert left_margin % 2 == 0 # Required for compatibility with 2x2 pooling
231
+ total_margin_pixels = base_image_input_d*(right_margin + left_margin) # pixels removed per dim
232
+ crop_patches = base_image_input_size[0] // base_image_input_d # patches per crop dim
233
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
234
+ crop_window_size = crop_window_patches * base_image_input_d
235
+ tiling = select_tiling(
236
+ original_image_h - total_margin_pixels,
237
+ original_image_w - total_margin_pixels,
238
+ crop_window_size,
239
+ max_crops
240
+ )
241
+ src, img_mask = resize_and_pad(
242
+ image,
243
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels]
244
+ )
245
+
246
+ # Now we have to split the image into crops, while keeping track of how each patch in
247
+ # each crop should be ordered in the global image; this requires a lot of tricky bookkeeping
248
+ n_crops = tiling[0] * tiling[1]
249
+ patches_arr = []
250
+ mask_arr = []
251
+ patch_ordering_arr = []
252
+
253
+ # We assume 2x2 pooling, but can allow padding the right/bottom with extra
254
+ # patches if the number of patches per side is not even
255
+ assert (crop_patches+1)//2 == image_token_length_h
256
+ assert (crop_patches+1)//2 == image_token_length_w
257
+ on = 0
258
+ on_patch = 0
259
+ for i in range(tiling[0]):
260
+ y0 = i*crop_window_size
261
+ if i == 0:
262
+ crop_y0 = 0
263
+ else:
264
+ crop_y0 = left_margin // 2
265
+
266
+ crop_h = image_base_patch_h - (right_margin + left_margin)
267
+ if i == 0:
268
+ crop_h += left_margin
269
+ if i == (tiling[0]-1):
270
+ crop_h += right_margin
271
+ for j in range(tiling[1]):
272
+ x0 = j*crop_window_size
273
+ if j == 0:
274
+ crop_x0 = 0
275
+ else:
276
+ crop_x0 = left_margin // 2
277
+
278
+ crop_w = image_base_patch_w - (right_margin + left_margin)
279
+ if j == 0:
280
+ crop_w += left_margin
281
+ if j == (tiling[1]-1):
282
+ crop_w += right_margin
283
+
284
+ pooled_w = (crop_w + 1) // 2
285
+ pooled_h = (crop_h + 1) // 2
286
+ patch_ordering_arr.append(
287
+ pad_to_bounding_box(
288
+ np.reshape(np.arange(on, on+pooled_h*pooled_w, dtype=np.int32), (pooled_h, pooled_w, 1)),
289
+ crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
290
+ )[:, :, 0]
291
+ )
292
+ patches_arr.append(src[y0:y0+crop_size, x0:x0+crop_size])
293
+ mask_arr.append(img_mask[y0:y0+crop_size, x0:x0+crop_size])
294
+
295
+ on += pooled_h*pooled_w
296
+ on_patch += 1
297
+ patches = np.stack(patches_arr)
298
+ patch_ordering = np.stack(patch_ordering_arr)
299
+ img_mask = np.stack(mask_arr)
300
+
301
+ # Switch to [n_crops, n_patches, pixels_per_patch] format
302
+ image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
303
+ patches = einops.rearrange(
304
+ patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
305
+ dh=base_image_input_d,
306
+ dw=base_image_input_d,
307
+ h=image_base_patch_h,
308
+ w=image_base_patch_w
309
+ )
310
+ img_mask = einops.rearrange(
311
+ img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
312
+ dh=base_image_input_d,
313
+ dw=base_image_input_d,
314
+ h=image_base_patch_h,
315
+ w=image_base_patch_w
316
+ )
317
+
318
+ img_mask = img_mask.astype(np.float32).mean(axis=-1)
319
+ patch_ordering = np.reshape(patch_ordering, [-1])
320
+ valid = patch_ordering >= 0
321
+
322
+ # Transpose order, to get left-to-right order instead of crop-by-crop order
323
+ patch_ordering_rh = np.reshape(
324
+ patch_ordering,
325
+ [tiling[0], tiling[1], image_token_length_h, image_token_length_w]
326
+ )
327
+ patch_ordering_rh = np.transpose(patch_ordering_rh, [0, 2, 1, 3])
328
+ patch_ordering_rh = np.reshape(patch_ordering_rh, [-1])
329
+
330
+ # The transpose will screw up which patches are masked, project the
331
+ # new order into sparse structure of `patch_ordering` to fix this
332
+ patch_ordering[valid] = patch_ordering_rh[patch_ordering_rh >= 0]
333
+
334
+ # Now build the output tokens
335
+ h = tiling[0] * crop_window_patches + (right_margin+left_margin)
336
+ w = tiling[1] * crop_window_patches + (right_margin+left_margin)
337
+ per_row = np.full(
338
+ ((w+1)//2,),
339
+ image_patch_token_id,
340
+ )
341
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
342
+
343
+ joint = np.tile(per_row, [(h+1)//2])
344
+ joint = [
345
+ [image_start_token_id],
346
+ joint,
347
+ [image_end_token_id]
348
+ ]
349
+
350
+ # Finally do the same for the global image
351
+ resized, _ = resize_and_pad(image, base_image_input_size)
352
+ resized = einops.rearrange(
353
+ resized, '(h dh) (w dw) c -> (h w) (dh dw c)',
354
+ dh=base_image_input_d,
355
+ dw=base_image_input_d,
356
+ h=image_base_patch_h,
357
+ w=image_base_patch_w
358
+ )
359
+ patches = np.concatenate([np.expand_dims(resized, 0), patches], 0)
360
+
361
+ # Global image goes first, so the order of patches in previous crops gets increased
362
+ patch_ordering = np.where(
363
+ patch_ordering >= 0,
364
+ patch_ordering + tokens_per_image,
365
+ -1
366
+ )
367
+ patch_ordering = np.concatenate([np.arange(0, tokens_per_image), patch_ordering], 0)
368
+ per_row = np.full(
369
+ (image_token_length_w,),
370
+ image_patch_token_id,
371
+ )
372
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
373
+ extra_tokens = np.tile(per_row, [image_token_length_h])
374
+ joint = [
375
+ [image_start_token_id],
376
+ extra_tokens,
377
+ [image_end_token_id],
378
+ ] + joint
379
+
380
+ joint = np.concatenate(joint, 0)
381
+ img_mask = np.pad(img_mask, [[0, 1], [0, 0]], constant_values=-1)
382
+ return patches, joint, patch_ordering, img_mask
383
+
384
+ def build_image_input_idx(
385
+ self,
386
+ image_tokens: np.ndarray,
387
+ patch_order: np.ndarray,
388
+ image_patch_token_id: int,
389
+ no_image: Optional[bool] = None,
390
+ image_token_length_w: Optional[int] = None,
391
+ image_token_length_h: Optional[int] = None,
392
+ ):
393
+ """Converts `patch_order` into a mapping of token_id -> patch_id"""
394
+
395
+ tokens_per_image = image_token_length_w * image_token_length_h
396
+ if no_image is not None and no_image:
397
+ return np.zeros((0, tokens_per_image), np.int32)
398
+
399
+ # Indices to insert the patches
400
+ image_input_idx = image_tokens == image_patch_token_id
401
+ image_input_idx = np.nonzero(image_input_idx)[0].astype(np.int32)
402
+
403
+ if patch_order is not None:
404
+ n_tokens = image_input_idx.shape[0]
405
+ patch_order = np.reshape(patch_order, [-1])
406
+ n_patches = patch_order.shape[0]
407
+
408
+ valid = patch_order >= 0
409
+ n_valid_patches = valid.sum()
410
+ assert len(image_input_idx) == n_valid_patches
411
+
412
+ sorted_patch_ixs = np.zeros([n_tokens], np.int32)
413
+ sorted_patch_ixs[patch_order[valid]] = np.arange(n_valid_patches, dtype=np.int32)
414
+
415
+ # Project the inverted mapping into same sparse structure
416
+ sorted_patch_ixs_ex = np.full(np.shape(patch_order), -1)
417
+ sorted_patch_ixs_ex[valid] = sorted_patch_ixs
418
+
419
+ # Do the gather and then re-mask outputs that were masked in `sorted_patch_ixs`
420
+ valid = (sorted_patch_ixs_ex >= 0).astype(np.int32)
421
+ image_input_idx = image_input_idx[sorted_patch_ixs_ex*valid]
422
+ image_input_idx = image_input_idx*valid - 100*(1 - valid)
423
+ image_input_idx = np.reshape(image_input_idx, [-1, tokens_per_image])
424
+ return image_input_idx
425
+
426
+ def preprocess(
427
+ self,
428
+ image: np.ndarray,
429
+ image_patch_token_id: int,
430
+ image_col_token_id: int,
431
+ image_start_token_id: int,
432
+ image_end_token_id: int,
433
+ max_crops: Optional[int] = None,
434
+ overlap_margins: Optional[List[int]] = None,
435
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
436
+ image_token_length_w: Optional[int] = None,
437
+ image_token_length_h: Optional[int] = None,
438
+ image_patch_size: Optional[int] = None,
439
+ **kwargs,
440
+ ):
441
+ """Preprocesses a single image"""
442
+
443
+ max_crops = max_crops or self.max_crops
444
+ overlap_margins = overlap_margins or self.overlap_margins
445
+ base_image_input_size = base_image_input_size or self.base_image_input_size
446
+ image_token_length_w = image_token_length_w or self.image_token_length_w
447
+ image_token_length_h = image_token_length_h or self.image_token_length_h
448
+ image_patch_size = image_patch_size or self.image_patch_size
449
+
450
+ crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(
451
+ image,
452
+ image_patch_token_id,
453
+ image_col_token_id,
454
+ image_start_token_id,
455
+ image_end_token_id,
456
+ max_crops,
457
+ overlap_margins,
458
+ base_image_input_size,
459
+ image_token_length_w,
460
+ image_token_length_h,
461
+ image_patch_size,
462
+ )
463
+ patch_idx = self.build_image_input_idx(
464
+ image_tokens,
465
+ patch_ordering,
466
+ image_patch_token_id,
467
+ image_token_length_w=image_token_length_w,
468
+ image_token_length_h=image_token_length_h,
469
+ )
470
+ return crops, image_tokens, patch_idx, img_mask
471
+
472
+ def multimodal_preprocess(
473
+ self,
474
+ images: np.ndarray,
475
+ tokens: List[int],
476
+ image_idx: np.ndarray,
477
+ sequence_length: int,
478
+ image_patch_token_id: int,
479
+ image_col_token_id: int,
480
+ image_start_token_id: int,
481
+ image_end_token_id: int,
482
+ **kwargs,
483
+ ):
484
+ """Merge images and text tokens into multi-modal features for the model
485
+
486
+ :param images: images to use as input
487
+ :param tokens: input text tokens
488
+ :param image_idx: where to insert the images into `tokens`
489
+ :param image_patch_token_id: id of the tokens that will contain image features
490
+ :param image_col_token_id: token id for image column special tokens
491
+ :param image_start_token_id: token id for image start special tokens
492
+ :param image_end_token_id: token id for image end special tokens
493
+ :param kwargs: override preprocessor default args
494
+ """
495
+ max_total_crops = kwargs.get("max_crops") or self.max_crops
496
+ image_token_length_w = kwargs.get("image_token_length_w") or self.image_token_length_w
497
+ image_token_length_h = kwargs.get("image_token_length_h") or self.image_token_length_h
498
+ image_patch_size = kwargs.get("image_patch_size") or self.image_patch_size
499
+ base_image_input_size = kwargs.get("base_image_input_size") or self.base_image_input_size
500
+ image_num_patch = (
501
+ base_image_input_size[0] // image_patch_size,
502
+ base_image_input_size[1] // image_patch_size,
503
+ )
504
+ image_padding_mask = kwargs.get("image_padding_mask") or self.image_padding_mask
505
+
506
+ tokens_per_image = image_token_length_w * image_token_length_h
507
+ n_pixels = image_patch_size * image_patch_size * 3
508
+ n_patches = image_num_patch[0] * image_num_patch[1]
509
+
510
+ if images is None:
511
+ return {
512
+ "input_ids": tokens,
513
+ "images": None,
514
+ "image_input_idx": None
515
+ }
516
+ else:
517
+ n = len(images)
518
+ all_crops = []
519
+ all_image_idx = []
520
+ out_tokens = []
521
+ all_crop_masks = []
522
+
523
+ for ix in range(n):
524
+ token_ix = image_idx[ix]
525
+ crops, image_tokens, patch_idx, img_mask = self.preprocess(
526
+ images[ix],
527
+ image_patch_token_id,
528
+ image_col_token_id,
529
+ image_start_token_id,
530
+ image_end_token_id,
531
+ **kwargs,
532
+ )
533
+
534
+ if token_ix == -1: # -1 is an image inserted at the very start
535
+ start = 0
536
+ token_ix = 0
537
+ end = 0
538
+ else:
539
+ start = 0 if ix == 0 else image_idx[ix-1] + 1
540
+ end = token_ix + 1
541
+
542
+ all_image_idx.append(patch_idx + token_ix)
543
+ all_crops.append(crops)
544
+ out_tokens.append(tokens[start:token_ix])
545
+ out_tokens.append(image_tokens)
546
+ if ix == (n - 1):
547
+ out_tokens.append(tokens[end:])
548
+ if image_padding_mask:
549
+ all_crop_masks.append(img_mask)
550
+
551
+ input_ids = np.concatenate(out_tokens, 0)
552
+ images = np.concatenate(all_crops, 0)
553
+ image_input_idx = np.concatenate(all_image_idx, 0)
554
+ if image_padding_mask:
555
+ image_masks = np.concatenate(all_crop_masks, 0)
556
+ else:
557
+ image_masks = None
558
+
559
+ out = {
560
+ "input_ids": input_ids,
561
+ "images": images,
562
+ "image_input_idx": image_input_idx
563
+ }
564
+ if image_masks is not None:
565
+ out["image_masks"] = image_masks
566
+ return out
567
+
568
+
569
+ MolmoImageProcessor.register_for_auto_class()
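
Taken together, `image_to_patches_and_tokens` produces fixed-size crops plus a token template whose `<im_patch>` slots are later filled with vision features, and `multimodal_preprocess` splices those image tokens into the text at the positions given by `image_idx`. The sketch below shows one way to call it directly; the token ids come from added_tokens.json, the image and prompt tokens are dummies, and it needs the same dependencies the module imports (including TensorFlow for the resize path flagged with FIXME above):

    import numpy as np
    from image_preprocessing_molmo import MolmoImageProcessor

    # Special-token ids from added_tokens.json
    IM_START, IM_END, IM_PATCH, IM_COL = 100278, 100279, 100280, 100281

    processor = MolmoImageProcessor()  # defaults: 336x336 base crops, 14px patches, up to 12 crops
    image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # dummy RGB image
    prompt = np.array([1, 2, 3], dtype=np.int32)                      # dummy text token ids

    out = processor.multimodal_preprocess(
        images=[image],
        tokens=prompt,
        image_idx=np.array([-1]),   # -1 inserts the image at the very start of the sequence
        sequence_length=1536,       # accepted but not used by the code shown here
        image_patch_token_id=IM_PATCH,
        image_col_token_id=IM_COL,
        image_start_token_id=IM_START,
        image_end_token_id=IM_END,
    )
    # crops: (n_crops + 1, 576, 588) = (crops incl. global image, 24*24 patches, 14*14*3 pixels)
    print(out["images"].shape, out["input_ids"].shape, out["image_input_idx"].shape)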
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f62462f2b44ec6cf5e9e463be3d628f63b2a8685519d9d58920311a00a79d74d
+ size 4951691600
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27fa5b456885e5541ca10c08b6768ffe4bfdfd376199e58d0b57ce7ad1dea9eb
+ size 4857402880
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:569422af4369e75a7d2243484aa25715d3aa8d0a47500001b79c18826b078654
+ size 4857402936
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6f1b3639387e4a063ae0c29477684018cd6719be3123a21dc5640172dfdd2ed
+ size 4857402936
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67c52ed3b2d8e75f07e31eb9a7ab663518f7b843ed08e7571581e154538afe5b
+ size 4857402936
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9bd0ebb5ea43f3167015279b60351ed5480af76067d1bf8330336e9e1426a4f
+ size 4988051816
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5a6e64e4b75e2268eeb34b0f07520a0a24092a6fd548991664a9170c5acac45
+ size 1290861152
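
The seven `model-*.safetensors` entries above are Git LFS pointers; the actual shards hold roughly 30.7 GB of weights and are tied together by model.safetensors.index.json below, which `from_pretrained` consults automatically. To inspect the sharding by hand, a sketch that assumes the real files have been pulled via LFS into the working directory:

    import json
    from collections import Counter

    with open("model.safetensors.index.json") as f:
        index = json.load(f)

    print(index["metadata"]["total_size"])              # 30660128768 bytes of tensors
    per_shard = Counter(index["weight_map"].values())   # how many tensors live in each shard
    for shard, n_tensors in sorted(per_shard.items()):
        print(shard, n_tensors)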
model.safetensors.index.json ADDED
@@ -0,0 +1,652 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 30660128768
4
+ },
5
+ "weight_map": {
6
+ "model.transformer.blocks.0.att_proj.weight": "model-00001-of-00007.safetensors",
7
+ "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00007.safetensors",
8
+ "model.transformer.blocks.0.attn_out.weight": "model-00001-of-00007.safetensors",
9
+ "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00007.safetensors",
10
+ "model.transformer.blocks.0.ff_out.weight": "model-00001-of-00007.safetensors",
11
+ "model.transformer.blocks.0.ff_proj.weight": "model-00001-of-00007.safetensors",
12
+ "model.transformer.blocks.0.k_norm.weight": "model-00001-of-00007.safetensors",
13
+ "model.transformer.blocks.0.q_norm.weight": "model-00001-of-00007.safetensors",
14
+ "model.transformer.blocks.1.att_proj.weight": "model-00001-of-00007.safetensors",
15
+ "model.transformer.blocks.1.attn_norm.weight": "model-00001-of-00007.safetensors",
16
+ "model.transformer.blocks.1.attn_out.weight": "model-00001-of-00007.safetensors",
17
+ "model.transformer.blocks.1.ff_norm.weight": "model-00001-of-00007.safetensors",
18
+ "model.transformer.blocks.1.ff_out.weight": "model-00001-of-00007.safetensors",
19
+ "model.transformer.blocks.1.ff_proj.weight": "model-00001-of-00007.safetensors",
20
+ "model.transformer.blocks.1.k_norm.weight": "model-00001-of-00007.safetensors",
21
+ "model.transformer.blocks.1.q_norm.weight": "model-00001-of-00007.safetensors",
22
+ "model.transformer.blocks.10.att_proj.weight": "model-00003-of-00007.safetensors",
23
+ "model.transformer.blocks.10.attn_norm.weight": "model-00003-of-00007.safetensors",
24
+ "model.transformer.blocks.10.attn_out.weight": "model-00002-of-00007.safetensors",
25
+ "model.transformer.blocks.10.ff_norm.weight": "model-00003-of-00007.safetensors",
26
+ "model.transformer.blocks.10.ff_out.weight": "model-00003-of-00007.safetensors",
27
+ "model.transformer.blocks.10.ff_proj.weight": "model-00003-of-00007.safetensors",
28
+ "model.transformer.blocks.10.k_norm.weight": "model-00002-of-00007.safetensors",
29
+ "model.transformer.blocks.10.q_norm.weight": "model-00002-of-00007.safetensors",
30
+ "model.transformer.blocks.11.att_proj.weight": "model-00003-of-00007.safetensors",
31
+ "model.transformer.blocks.11.attn_norm.weight": "model-00003-of-00007.safetensors",
32
+ "model.transformer.blocks.11.attn_out.weight": "model-00003-of-00007.safetensors",
33
+ "model.transformer.blocks.11.ff_norm.weight": "model-00003-of-00007.safetensors",
34
+ "model.transformer.blocks.11.ff_out.weight": "model-00003-of-00007.safetensors",
35
+ "model.transformer.blocks.11.ff_proj.weight": "model-00003-of-00007.safetensors",
36
+ "model.transformer.blocks.11.k_norm.weight": "model-00003-of-00007.safetensors",
37
+ "model.transformer.blocks.11.q_norm.weight": "model-00003-of-00007.safetensors",
38
+ "model.transformer.blocks.12.att_proj.weight": "model-00003-of-00007.safetensors",
39
+ "model.transformer.blocks.12.attn_norm.weight": "model-00003-of-00007.safetensors",
40
+ "model.transformer.blocks.12.attn_out.weight": "model-00003-of-00007.safetensors",
41
+ "model.transformer.blocks.12.ff_norm.weight": "model-00003-of-00007.safetensors",
42
+ "model.transformer.blocks.12.ff_out.weight": "model-00003-of-00007.safetensors",
43
+ "model.transformer.blocks.12.ff_proj.weight": "model-00003-of-00007.safetensors",
44
+ "model.transformer.blocks.12.k_norm.weight": "model-00003-of-00007.safetensors",
45
+ "model.transformer.blocks.12.q_norm.weight": "model-00003-of-00007.safetensors",
46
+ "model.transformer.blocks.13.att_proj.weight": "model-00003-of-00007.safetensors",
47
+ "model.transformer.blocks.13.attn_norm.weight": "model-00003-of-00007.safetensors",
48
+ "model.transformer.blocks.13.attn_out.weight": "model-00003-of-00007.safetensors",
49
+ "model.transformer.blocks.13.ff_norm.weight": "model-00003-of-00007.safetensors",
50
+ "model.transformer.blocks.13.ff_out.weight": "model-00003-of-00007.safetensors",
51
+ "model.transformer.blocks.13.ff_proj.weight": "model-00003-of-00007.safetensors",
52
+ "model.transformer.blocks.13.k_norm.weight": "model-00003-of-00007.safetensors",
53
+ "model.transformer.blocks.13.q_norm.weight": "model-00003-of-00007.safetensors",
54
+ "model.transformer.blocks.14.att_proj.weight": "model-00003-of-00007.safetensors",
55
+ "model.transformer.blocks.14.attn_norm.weight": "model-00003-of-00007.safetensors",
56
+ "model.transformer.blocks.14.attn_out.weight": "model-00003-of-00007.safetensors",
57
+ "model.transformer.blocks.14.ff_norm.weight": "model-00003-of-00007.safetensors",
58
+ "model.transformer.blocks.14.ff_out.weight": "model-00003-of-00007.safetensors",
59
+ "model.transformer.blocks.14.ff_proj.weight": "model-00003-of-00007.safetensors",
60
+ "model.transformer.blocks.14.k_norm.weight": "model-00003-of-00007.safetensors",
61
+ "model.transformer.blocks.14.q_norm.weight": "model-00003-of-00007.safetensors",
62
+ "model.transformer.blocks.15.att_proj.weight": "model-00003-of-00007.safetensors",
63
+ "model.transformer.blocks.15.attn_norm.weight": "model-00003-of-00007.safetensors",
64
+ "model.transformer.blocks.15.attn_out.weight": "model-00003-of-00007.safetensors",
65
+ "model.transformer.blocks.15.ff_norm.weight": "model-00003-of-00007.safetensors",
66
+ "model.transformer.blocks.15.ff_out.weight": "model-00003-of-00007.safetensors",
67
+ "model.transformer.blocks.15.ff_proj.weight": "model-00003-of-00007.safetensors",
68
+ "model.transformer.blocks.15.k_norm.weight": "model-00003-of-00007.safetensors",
69
+ "model.transformer.blocks.15.q_norm.weight": "model-00003-of-00007.safetensors",
70
+ "model.transformer.blocks.16.att_proj.weight": "model-00004-of-00007.safetensors",
71
+ "model.transformer.blocks.16.attn_norm.weight": "model-00004-of-00007.safetensors",
72
+ "model.transformer.blocks.16.attn_out.weight": "model-00003-of-00007.safetensors",
73
+ "model.transformer.blocks.16.ff_norm.weight": "model-00004-of-00007.safetensors",
74
+ "model.transformer.blocks.16.ff_out.weight": "model-00004-of-00007.safetensors",
75
+ "model.transformer.blocks.16.ff_proj.weight": "model-00004-of-00007.safetensors",
76
+ "model.transformer.blocks.16.k_norm.weight": "model-00003-of-00007.safetensors",
77
+ "model.transformer.blocks.16.q_norm.weight": "model-00003-of-00007.safetensors",
78
+ "model.transformer.blocks.17.att_proj.weight": "model-00004-of-00007.safetensors",
79
+ "model.transformer.blocks.17.attn_norm.weight": "model-00004-of-00007.safetensors",
80
+ "model.transformer.blocks.17.attn_out.weight": "model-00004-of-00007.safetensors",
81
+ "model.transformer.blocks.17.ff_norm.weight": "model-00004-of-00007.safetensors",
82
+ "model.transformer.blocks.17.ff_out.weight": "model-00004-of-00007.safetensors",
83
+ "model.transformer.blocks.17.ff_proj.weight": "model-00004-of-00007.safetensors",
84
+ "model.transformer.blocks.17.k_norm.weight": "model-00004-of-00007.safetensors",
85
+ "model.transformer.blocks.17.q_norm.weight": "model-00004-of-00007.safetensors",
86
+ "model.transformer.blocks.18.att_proj.weight": "model-00004-of-00007.safetensors",
87
+ "model.transformer.blocks.18.attn_norm.weight": "model-00004-of-00007.safetensors",
88
+ "model.transformer.blocks.18.attn_out.weight": "model-00004-of-00007.safetensors",
89
+ "model.transformer.blocks.18.ff_norm.weight": "model-00004-of-00007.safetensors",
90
+ "model.transformer.blocks.18.ff_out.weight": "model-00004-of-00007.safetensors",
91
+ "model.transformer.blocks.18.ff_proj.weight": "model-00004-of-00007.safetensors",
92
+ "model.transformer.blocks.18.k_norm.weight": "model-00004-of-00007.safetensors",
93
+ "model.transformer.blocks.18.q_norm.weight": "model-00004-of-00007.safetensors",
94
+ "model.transformer.blocks.19.att_proj.weight": "model-00004-of-00007.safetensors",
95
+ "model.transformer.blocks.19.attn_norm.weight": "model-00004-of-00007.safetensors",
96
+ "model.transformer.blocks.19.attn_out.weight": "model-00004-of-00007.safetensors",
97
+ "model.transformer.blocks.19.ff_norm.weight": "model-00004-of-00007.safetensors",
98
+ "model.transformer.blocks.19.ff_out.weight": "model-00004-of-00007.safetensors",
99
+ "model.transformer.blocks.19.ff_proj.weight": "model-00004-of-00007.safetensors",
100
+ "model.transformer.blocks.19.k_norm.weight": "model-00004-of-00007.safetensors",
101
+ "model.transformer.blocks.19.q_norm.weight": "model-00004-of-00007.safetensors",
102
+ "model.transformer.blocks.2.att_proj.weight": "model-00001-of-00007.safetensors",
103
+ "model.transformer.blocks.2.attn_norm.weight": "model-00001-of-00007.safetensors",
104
+ "model.transformer.blocks.2.attn_out.weight": "model-00001-of-00007.safetensors",
105
+ "model.transformer.blocks.2.ff_norm.weight": "model-00001-of-00007.safetensors",
106
+ "model.transformer.blocks.2.ff_out.weight": "model-00001-of-00007.safetensors",
107
+ "model.transformer.blocks.2.ff_proj.weight": "model-00001-of-00007.safetensors",
108
+ "model.transformer.blocks.2.k_norm.weight": "model-00001-of-00007.safetensors",
109
+ "model.transformer.blocks.2.q_norm.weight": "model-00001-of-00007.safetensors",
110
+ "model.transformer.blocks.20.att_proj.weight": "model-00004-of-00007.safetensors",
111
+ "model.transformer.blocks.20.attn_norm.weight": "model-00004-of-00007.safetensors",
112
+ "model.transformer.blocks.20.attn_out.weight": "model-00004-of-00007.safetensors",
113
+ "model.transformer.blocks.20.ff_norm.weight": "model-00004-of-00007.safetensors",
114
+ "model.transformer.blocks.20.ff_out.weight": "model-00004-of-00007.safetensors",
115
+ "model.transformer.blocks.20.ff_proj.weight": "model-00004-of-00007.safetensors",
116
+ "model.transformer.blocks.20.k_norm.weight": "model-00004-of-00007.safetensors",
117
+ "model.transformer.blocks.20.q_norm.weight": "model-00004-of-00007.safetensors",
118
+ "model.transformer.blocks.21.att_proj.weight": "model-00004-of-00007.safetensors",
119
+ "model.transformer.blocks.21.attn_norm.weight": "model-00004-of-00007.safetensors",
120
+ "model.transformer.blocks.21.attn_out.weight": "model-00004-of-00007.safetensors",
121
+ "model.transformer.blocks.21.ff_norm.weight": "model-00004-of-00007.safetensors",
122
+ "model.transformer.blocks.21.ff_out.weight": "model-00004-of-00007.safetensors",
123
+ "model.transformer.blocks.21.ff_proj.weight": "model-00004-of-00007.safetensors",
124
+ "model.transformer.blocks.21.k_norm.weight": "model-00004-of-00007.safetensors",
125
+ "model.transformer.blocks.21.q_norm.weight": "model-00004-of-00007.safetensors",
126
+ "model.transformer.blocks.22.att_proj.weight": "model-00005-of-00007.safetensors",
127
+ "model.transformer.blocks.22.attn_norm.weight": "model-00005-of-00007.safetensors",
128
+ "model.transformer.blocks.22.attn_out.weight": "model-00004-of-00007.safetensors",
129
+ "model.transformer.blocks.22.ff_norm.weight": "model-00005-of-00007.safetensors",
130
+ "model.transformer.blocks.22.ff_out.weight": "model-00005-of-00007.safetensors",
131
+ "model.transformer.blocks.22.ff_proj.weight": "model-00005-of-00007.safetensors",
132
+ "model.transformer.blocks.22.k_norm.weight": "model-00004-of-00007.safetensors",
133
+ "model.transformer.blocks.22.q_norm.weight": "model-00004-of-00007.safetensors",
134
+ "model.transformer.blocks.23.att_proj.weight": "model-00005-of-00007.safetensors",
135
+ "model.transformer.blocks.23.attn_norm.weight": "model-00005-of-00007.safetensors",
136
+ "model.transformer.blocks.23.attn_out.weight": "model-00005-of-00007.safetensors",
137
+ "model.transformer.blocks.23.ff_norm.weight": "model-00005-of-00007.safetensors",
138
+ "model.transformer.blocks.23.ff_out.weight": "model-00005-of-00007.safetensors",
139
+ "model.transformer.blocks.23.ff_proj.weight": "model-00005-of-00007.safetensors",
140
+ "model.transformer.blocks.23.k_norm.weight": "model-00005-of-00007.safetensors",
141
+ "model.transformer.blocks.23.q_norm.weight": "model-00005-of-00007.safetensors",
142
+ "model.transformer.blocks.24.att_proj.weight": "model-00005-of-00007.safetensors",
143
+ "model.transformer.blocks.24.attn_norm.weight": "model-00005-of-00007.safetensors",
144
+ "model.transformer.blocks.24.attn_out.weight": "model-00005-of-00007.safetensors",
145
+ "model.transformer.blocks.24.ff_norm.weight": "model-00005-of-00007.safetensors",
146
+ "model.transformer.blocks.24.ff_out.weight": "model-00005-of-00007.safetensors",
147
+ "model.transformer.blocks.24.ff_proj.weight": "model-00005-of-00007.safetensors",
148
+ "model.transformer.blocks.24.k_norm.weight": "model-00005-of-00007.safetensors",
149
+ "model.transformer.blocks.24.q_norm.weight": "model-00005-of-00007.safetensors",
150
+ "model.transformer.blocks.25.att_proj.weight": "model-00005-of-00007.safetensors",
151
+ "model.transformer.blocks.25.attn_norm.weight": "model-00005-of-00007.safetensors",
152
+ "model.transformer.blocks.25.attn_out.weight": "model-00005-of-00007.safetensors",
153
+ "model.transformer.blocks.25.ff_norm.weight": "model-00005-of-00007.safetensors",
154
+ "model.transformer.blocks.25.ff_out.weight": "model-00005-of-00007.safetensors",
155
+ "model.transformer.blocks.25.ff_proj.weight": "model-00005-of-00007.safetensors",
156
+ "model.transformer.blocks.25.k_norm.weight": "model-00005-of-00007.safetensors",
157
+ "model.transformer.blocks.25.q_norm.weight": "model-00005-of-00007.safetensors",
158
+ "model.transformer.blocks.26.att_proj.weight": "model-00005-of-00007.safetensors",
159
+ "model.transformer.blocks.26.attn_norm.weight": "model-00005-of-00007.safetensors",
160
+ "model.transformer.blocks.26.attn_out.weight": "model-00005-of-00007.safetensors",
161
+ "model.transformer.blocks.26.ff_norm.weight": "model-00005-of-00007.safetensors",
162
+ "model.transformer.blocks.26.ff_out.weight": "model-00005-of-00007.safetensors",
163
+ "model.transformer.blocks.26.ff_proj.weight": "model-00005-of-00007.safetensors",
164
+ "model.transformer.blocks.26.k_norm.weight": "model-00005-of-00007.safetensors",
165
+ "model.transformer.blocks.26.q_norm.weight": "model-00005-of-00007.safetensors",
166
+ "model.transformer.blocks.27.att_proj.weight": "model-00005-of-00007.safetensors",
167
+ "model.transformer.blocks.27.attn_norm.weight": "model-00005-of-00007.safetensors",
168
+ "model.transformer.blocks.27.attn_out.weight": "model-00005-of-00007.safetensors",
169
+ "model.transformer.blocks.27.ff_norm.weight": "model-00005-of-00007.safetensors",
170
+ "model.transformer.blocks.27.ff_out.weight": "model-00005-of-00007.safetensors",
171
+ "model.transformer.blocks.27.ff_proj.weight": "model-00005-of-00007.safetensors",
172
+ "model.transformer.blocks.27.k_norm.weight": "model-00005-of-00007.safetensors",
173
+ "model.transformer.blocks.27.q_norm.weight": "model-00005-of-00007.safetensors",
174
+ "model.transformer.blocks.28.att_proj.weight": "model-00006-of-00007.safetensors",
175
+ "model.transformer.blocks.28.attn_norm.weight": "model-00006-of-00007.safetensors",
176
+ "model.transformer.blocks.28.attn_out.weight": "model-00005-of-00007.safetensors",
177
+ "model.transformer.blocks.28.ff_norm.weight": "model-00006-of-00007.safetensors",
178
+ "model.transformer.blocks.28.ff_out.weight": "model-00006-of-00007.safetensors",
179
+ "model.transformer.blocks.28.ff_proj.weight": "model-00006-of-00007.safetensors",
180
+ "model.transformer.blocks.28.k_norm.weight": "model-00005-of-00007.safetensors",
181
+ "model.transformer.blocks.28.q_norm.weight": "model-00005-of-00007.safetensors",
182
+ "model.transformer.blocks.29.att_proj.weight": "model-00006-of-00007.safetensors",
183
+ "model.transformer.blocks.29.attn_norm.weight": "model-00006-of-00007.safetensors",
184
+ "model.transformer.blocks.29.attn_out.weight": "model-00006-of-00007.safetensors",
185
+ "model.transformer.blocks.29.ff_norm.weight": "model-00006-of-00007.safetensors",
186
+ "model.transformer.blocks.29.ff_out.weight": "model-00006-of-00007.safetensors",
187
+ "model.transformer.blocks.29.ff_proj.weight": "model-00006-of-00007.safetensors",
188
+ "model.transformer.blocks.29.k_norm.weight": "model-00006-of-00007.safetensors",
189
+ "model.transformer.blocks.29.q_norm.weight": "model-00006-of-00007.safetensors",
190
+ "model.transformer.blocks.3.att_proj.weight": "model-00001-of-00007.safetensors",
191
+ "model.transformer.blocks.3.attn_norm.weight": "model-00001-of-00007.safetensors",
192
+ "model.transformer.blocks.3.attn_out.weight": "model-00001-of-00007.safetensors",
193
+ "model.transformer.blocks.3.ff_norm.weight": "model-00001-of-00007.safetensors",
194
+ "model.transformer.blocks.3.ff_out.weight": "model-00001-of-00007.safetensors",
195
+ "model.transformer.blocks.3.ff_proj.weight": "model-00001-of-00007.safetensors",
196
+ "model.transformer.blocks.3.k_norm.weight": "model-00001-of-00007.safetensors",
197
+ "model.transformer.blocks.3.q_norm.weight": "model-00001-of-00007.safetensors",
198
+ "model.transformer.blocks.30.att_proj.weight": "model-00006-of-00007.safetensors",
199
+ "model.transformer.blocks.30.attn_norm.weight": "model-00006-of-00007.safetensors",
200
+ "model.transformer.blocks.30.attn_out.weight": "model-00006-of-00007.safetensors",
201
+ "model.transformer.blocks.30.ff_norm.weight": "model-00006-of-00007.safetensors",
202
+ "model.transformer.blocks.30.ff_out.weight": "model-00006-of-00007.safetensors",
203
+ "model.transformer.blocks.30.ff_proj.weight": "model-00006-of-00007.safetensors",
204
+ "model.transformer.blocks.30.k_norm.weight": "model-00006-of-00007.safetensors",
205
+ "model.transformer.blocks.30.q_norm.weight": "model-00006-of-00007.safetensors",
206
+ "model.transformer.blocks.31.att_proj.weight": "model-00006-of-00007.safetensors",
207
+ "model.transformer.blocks.31.attn_norm.weight": "model-00006-of-00007.safetensors",
208
+ "model.transformer.blocks.31.attn_out.weight": "model-00006-of-00007.safetensors",
209
+ "model.transformer.blocks.31.ff_norm.weight": "model-00006-of-00007.safetensors",
210
+ "model.transformer.blocks.31.ff_out.weight": "model-00006-of-00007.safetensors",
211
+ "model.transformer.blocks.31.ff_proj.weight": "model-00006-of-00007.safetensors",
212
+ "model.transformer.blocks.31.k_norm.weight": "model-00006-of-00007.safetensors",
213
+ "model.transformer.blocks.31.q_norm.weight": "model-00006-of-00007.safetensors",
214
+ "model.transformer.blocks.4.att_proj.weight": "model-00002-of-00007.safetensors",
215
+ "model.transformer.blocks.4.attn_norm.weight": "model-00002-of-00007.safetensors",
216
+ "model.transformer.blocks.4.attn_out.weight": "model-00001-of-00007.safetensors",
217
+ "model.transformer.blocks.4.ff_norm.weight": "model-00002-of-00007.safetensors",
218
+ "model.transformer.blocks.4.ff_out.weight": "model-00002-of-00007.safetensors",
219
+ "model.transformer.blocks.4.ff_proj.weight": "model-00002-of-00007.safetensors",
220
+ "model.transformer.blocks.4.k_norm.weight": "model-00001-of-00007.safetensors",
221
+ "model.transformer.blocks.4.q_norm.weight": "model-00001-of-00007.safetensors",
222
+ "model.transformer.blocks.5.att_proj.weight": "model-00002-of-00007.safetensors",
223
+ "model.transformer.blocks.5.attn_norm.weight": "model-00002-of-00007.safetensors",
224
+ "model.transformer.blocks.5.attn_out.weight": "model-00002-of-00007.safetensors",
225
+ "model.transformer.blocks.5.ff_norm.weight": "model-00002-of-00007.safetensors",
226
+ "model.transformer.blocks.5.ff_out.weight": "model-00002-of-00007.safetensors",
227
+ "model.transformer.blocks.5.ff_proj.weight": "model-00002-of-00007.safetensors",
228
+ "model.transformer.blocks.5.k_norm.weight": "model-00002-of-00007.safetensors",
229
+ "model.transformer.blocks.5.q_norm.weight": "model-00002-of-00007.safetensors",
230
+ "model.transformer.blocks.6.att_proj.weight": "model-00002-of-00007.safetensors",
231
+ "model.transformer.blocks.6.attn_norm.weight": "model-00002-of-00007.safetensors",
232
+ "model.transformer.blocks.6.attn_out.weight": "model-00002-of-00007.safetensors",
233
+ "model.transformer.blocks.6.ff_norm.weight": "model-00002-of-00007.safetensors",
234
+ "model.transformer.blocks.6.ff_out.weight": "model-00002-of-00007.safetensors",
235
+ "model.transformer.blocks.6.ff_proj.weight": "model-00002-of-00007.safetensors",
236
+ "model.transformer.blocks.6.k_norm.weight": "model-00002-of-00007.safetensors",
237
+ "model.transformer.blocks.6.q_norm.weight": "model-00002-of-00007.safetensors",
238
+ "model.transformer.blocks.7.att_proj.weight": "model-00002-of-00007.safetensors",
239
+ "model.transformer.blocks.7.attn_norm.weight": "model-00002-of-00007.safetensors",
240
+ "model.transformer.blocks.7.attn_out.weight": "model-00002-of-00007.safetensors",
241
+ "model.transformer.blocks.7.ff_norm.weight": "model-00002-of-00007.safetensors",
242
+ "model.transformer.blocks.7.ff_out.weight": "model-00002-of-00007.safetensors",
243
+ "model.transformer.blocks.7.ff_proj.weight": "model-00002-of-00007.safetensors",
244
+ "model.transformer.blocks.7.k_norm.weight": "model-00002-of-00007.safetensors",
245
+ "model.transformer.blocks.7.q_norm.weight": "model-00002-of-00007.safetensors",
246
+ "model.transformer.blocks.8.att_proj.weight": "model-00002-of-00007.safetensors",
247
+ "model.transformer.blocks.8.attn_norm.weight": "model-00002-of-00007.safetensors",
248
+ "model.transformer.blocks.8.attn_out.weight": "model-00002-of-00007.safetensors",
249
+ "model.transformer.blocks.8.ff_norm.weight": "model-00002-of-00007.safetensors",
250
+ "model.transformer.blocks.8.ff_out.weight": "model-00002-of-00007.safetensors",
251
+ "model.transformer.blocks.8.ff_proj.weight": "model-00002-of-00007.safetensors",
252
+ "model.transformer.blocks.8.k_norm.weight": "model-00002-of-00007.safetensors",
253
+ "model.transformer.blocks.8.q_norm.weight": "model-00002-of-00007.safetensors",
254
+ "model.transformer.blocks.9.att_proj.weight": "model-00002-of-00007.safetensors",
255
+ "model.transformer.blocks.9.attn_norm.weight": "model-00002-of-00007.safetensors",
256
+ "model.transformer.blocks.9.attn_out.weight": "model-00002-of-00007.safetensors",
257
+ "model.transformer.blocks.9.ff_norm.weight": "model-00002-of-00007.safetensors",
258
+ "model.transformer.blocks.9.ff_out.weight": "model-00002-of-00007.safetensors",
259
+ "model.transformer.blocks.9.ff_proj.weight": "model-00002-of-00007.safetensors",
260
+ "model.transformer.blocks.9.k_norm.weight": "model-00002-of-00007.safetensors",
261
+ "model.transformer.blocks.9.q_norm.weight": "model-00002-of-00007.safetensors",
262
+ "model.transformer.ff_out.weight": "model-00006-of-00007.safetensors",
263
+ "model.transformer.ln_f.weight": "model-00001-of-00007.safetensors",
264
+ "model.transformer.wte.embedding": "model-00001-of-00007.safetensors",
265
+ "model.transformer.wte.new_embedding": "model-00001-of-00007.safetensors",
266
+ "model.vision_backbone.image_pooling_2d.wk.bias": "model-00007-of-00007.safetensors",
267
+ "model.vision_backbone.image_pooling_2d.wk.weight": "model-00007-of-00007.safetensors",
268
+ "model.vision_backbone.image_pooling_2d.wo.bias": "model-00007-of-00007.safetensors",
269
+ "model.vision_backbone.image_pooling_2d.wo.weight": "model-00007-of-00007.safetensors",
270
+ "model.vision_backbone.image_pooling_2d.wq.bias": "model-00007-of-00007.safetensors",
271
+ "model.vision_backbone.image_pooling_2d.wq.weight": "model-00007-of-00007.safetensors",
272
+ "model.vision_backbone.image_pooling_2d.wv.bias": "model-00007-of-00007.safetensors",
273
+ "model.vision_backbone.image_pooling_2d.wv.weight": "model-00007-of-00007.safetensors",
274
+ "model.vision_backbone.image_projector.w1.weight": "model-00007-of-00007.safetensors",
275
+ "model.vision_backbone.image_projector.w2.weight": "model-00007-of-00007.safetensors",
276
+ "model.vision_backbone.image_projector.w3.weight": "model-00007-of-00007.safetensors",
277
+ "model.vision_backbone.image_vit.class_embedding": "model-00006-of-00007.safetensors",
278
+ "model.vision_backbone.image_vit.patch_embedding.weight": "model-00006-of-00007.safetensors",
279
+ "model.vision_backbone.image_vit.positional_embedding": "model-00006-of-00007.safetensors",
280
+ "model.vision_backbone.image_vit.pre_ln.bias": "model-00006-of-00007.safetensors",
281
+ "model.vision_backbone.image_vit.pre_ln.weight": "model-00006-of-00007.safetensors",
282
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.bias": "model-00006-of-00007.safetensors",
283
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.weight": "model-00006-of-00007.safetensors",
284
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.bias": "model-00006-of-00007.safetensors",
285
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.weight": "model-00006-of-00007.safetensors",
286
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.bias": "model-00006-of-00007.safetensors",
287
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.weight": "model-00006-of-00007.safetensors",
288
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.bias": "model-00006-of-00007.safetensors",
289
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.weight": "model-00006-of-00007.safetensors",
290
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.bias": "model-00006-of-00007.safetensors",
291
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.weight": "model-00006-of-00007.safetensors",
292
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.bias": "model-00006-of-00007.safetensors",
293
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.weight": "model-00006-of-00007.safetensors",
294
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.bias": "model-00006-of-00007.safetensors",
295
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.weight": "model-00006-of-00007.safetensors",
296
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.bias": "model-00006-of-00007.safetensors",
297
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.weight": "model-00006-of-00007.safetensors",
298
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.bias": "model-00006-of-00007.safetensors",
299
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.weight": "model-00006-of-00007.safetensors",
300
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.bias": "model-00006-of-00007.safetensors",
301
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.weight": "model-00006-of-00007.safetensors",
302
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.bias": "model-00006-of-00007.safetensors",
303
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.weight": "model-00006-of-00007.safetensors",
304
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.bias": "model-00006-of-00007.safetensors",
305
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.weight": "model-00006-of-00007.safetensors",
306
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.bias": "model-00006-of-00007.safetensors",
307
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.weight": "model-00006-of-00007.safetensors",
308
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.bias": "model-00006-of-00007.safetensors",
309
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.weight": "model-00006-of-00007.safetensors",
310
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.bias": "model-00006-of-00007.safetensors",
311
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.weight": "model-00006-of-00007.safetensors",
312
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.bias": "model-00006-of-00007.safetensors",
313
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.weight": "model-00006-of-00007.safetensors",
314
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.bias": "model-00007-of-00007.safetensors",
315
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.weight": "model-00007-of-00007.safetensors",
316
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.bias": "model-00007-of-00007.safetensors",
317
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.weight": "model-00007-of-00007.safetensors",
318
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.bias": "model-00007-of-00007.safetensors",
319
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.weight": "model-00007-of-00007.safetensors",
320
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.bias": "model-00007-of-00007.safetensors",
321
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.weight": "model-00007-of-00007.safetensors",
322
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.bias": "model-00007-of-00007.safetensors",
323
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.weight": "model-00007-of-00007.safetensors",
324
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
325
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
326
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
327
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
328
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.bias": "model-00007-of-00007.safetensors",
329
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.weight": "model-00007-of-00007.safetensors",
330
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.bias": "model-00007-of-00007.safetensors",
331
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.weight": "model-00007-of-00007.safetensors",
332
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.bias": "model-00007-of-00007.safetensors",
333
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.weight": "model-00007-of-00007.safetensors",
334
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.bias": "model-00007-of-00007.safetensors",
335
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.weight": "model-00007-of-00007.safetensors",
336
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.bias": "model-00007-of-00007.safetensors",
337
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.weight": "model-00007-of-00007.safetensors",
338
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.bias": "model-00007-of-00007.safetensors",
339
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.weight": "model-00007-of-00007.safetensors",
340
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
341
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
342
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
343
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
344
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.bias": "model-00007-of-00007.safetensors",
345
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.weight": "model-00007-of-00007.safetensors",
346
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.bias": "model-00007-of-00007.safetensors",
347
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.weight": "model-00007-of-00007.safetensors",
348
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.bias": "model-00007-of-00007.safetensors",
349
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.weight": "model-00007-of-00007.safetensors",
350
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.bias": "model-00007-of-00007.safetensors",
351
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.weight": "model-00007-of-00007.safetensors",
352
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.bias": "model-00007-of-00007.safetensors",
353
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.weight": "model-00007-of-00007.safetensors",
354
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.bias": "model-00007-of-00007.safetensors",
355
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.weight": "model-00007-of-00007.safetensors",
356
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
357
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
358
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
359
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
360
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.bias": "model-00007-of-00007.safetensors",
361
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.weight": "model-00007-of-00007.safetensors",
362
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.bias": "model-00007-of-00007.safetensors",
363
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.weight": "model-00007-of-00007.safetensors",
364
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.bias": "model-00007-of-00007.safetensors",
365
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.weight": "model-00007-of-00007.safetensors",
366
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.bias": "model-00007-of-00007.safetensors",
367
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.weight": "model-00007-of-00007.safetensors",
368
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.bias": "model-00007-of-00007.safetensors",
369
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.weight": "model-00007-of-00007.safetensors",
370
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.bias": "model-00007-of-00007.safetensors",
371
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.weight": "model-00007-of-00007.safetensors",
372
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
373
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
374
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
375
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
376
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.bias": "model-00007-of-00007.safetensors",
377
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.weight": "model-00007-of-00007.safetensors",
378
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.bias": "model-00007-of-00007.safetensors",
379
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.weight": "model-00007-of-00007.safetensors",
380
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.bias": "model-00007-of-00007.safetensors",
381
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.weight": "model-00007-of-00007.safetensors",
382
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.bias": "model-00007-of-00007.safetensors",
383
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.weight": "model-00007-of-00007.safetensors",
384
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.bias": "model-00007-of-00007.safetensors",
385
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.weight": "model-00007-of-00007.safetensors",
386
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.bias": "model-00007-of-00007.safetensors",
387
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.weight": "model-00007-of-00007.safetensors",
388
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
389
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
390
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
391
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
392
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.bias": "model-00007-of-00007.safetensors",
393
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.weight": "model-00007-of-00007.safetensors",
394
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.bias": "model-00007-of-00007.safetensors",
395
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.weight": "model-00007-of-00007.safetensors",
396
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.bias": "model-00007-of-00007.safetensors",
397
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.weight": "model-00007-of-00007.safetensors",
398
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.bias": "model-00007-of-00007.safetensors",
399
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.weight": "model-00007-of-00007.safetensors",
400
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.bias": "model-00007-of-00007.safetensors",
401
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.weight": "model-00007-of-00007.safetensors",
402
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.bias": "model-00007-of-00007.safetensors",
403
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.weight": "model-00007-of-00007.safetensors",
404
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
405
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
406
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
407
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
408
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.bias": "model-00007-of-00007.safetensors",
409
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.weight": "model-00007-of-00007.safetensors",
410
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.bias": "model-00007-of-00007.safetensors",
411
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.weight": "model-00007-of-00007.safetensors",
412
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.bias": "model-00007-of-00007.safetensors",
413
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.weight": "model-00007-of-00007.safetensors",
414
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.bias": "model-00007-of-00007.safetensors",
415
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.weight": "model-00007-of-00007.safetensors",
416
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.bias": "model-00007-of-00007.safetensors",
417
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.weight": "model-00007-of-00007.safetensors",
418
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.bias": "model-00007-of-00007.safetensors",
419
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.weight": "model-00007-of-00007.safetensors",
420
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
421
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
422
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
423
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
424
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.bias": "model-00007-of-00007.safetensors",
425
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.weight": "model-00007-of-00007.safetensors",
426
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.bias": "model-00007-of-00007.safetensors",
427
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.weight": "model-00007-of-00007.safetensors",
428
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.bias": "model-00007-of-00007.safetensors",
429
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.weight": "model-00007-of-00007.safetensors",
430
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.bias": "model-00007-of-00007.safetensors",
431
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.weight": "model-00007-of-00007.safetensors",
432
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.bias": "model-00007-of-00007.safetensors",
433
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.weight": "model-00007-of-00007.safetensors",
434
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.bias": "model-00007-of-00007.safetensors",
435
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.weight": "model-00007-of-00007.safetensors",
436
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
437
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
438
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
439
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
440
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.bias": "model-00007-of-00007.safetensors",
441
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.weight": "model-00007-of-00007.safetensors",
442
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.bias": "model-00007-of-00007.safetensors",
443
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.weight": "model-00007-of-00007.safetensors",
444
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.bias": "model-00007-of-00007.safetensors",
445
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.weight": "model-00007-of-00007.safetensors",
446
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.bias": "model-00007-of-00007.safetensors",
447
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.weight": "model-00007-of-00007.safetensors",
448
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.bias": "model-00007-of-00007.safetensors",
449
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.weight": "model-00007-of-00007.safetensors",
450
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.bias": "model-00007-of-00007.safetensors",
451
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.weight": "model-00007-of-00007.safetensors",
452
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
453
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
454
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
455
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
456
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.bias": "model-00007-of-00007.safetensors",
457
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.weight": "model-00007-of-00007.safetensors",
458
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.bias": "model-00007-of-00007.safetensors",
459
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.weight": "model-00007-of-00007.safetensors",
460
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.bias": "model-00007-of-00007.safetensors",
461
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.weight": "model-00007-of-00007.safetensors",
462
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.bias": "model-00007-of-00007.safetensors",
463
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.weight": "model-00007-of-00007.safetensors",
464
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.bias": "model-00007-of-00007.safetensors",
465
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.weight": "model-00007-of-00007.safetensors",
466
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.bias": "model-00007-of-00007.safetensors",
467
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.weight": "model-00007-of-00007.safetensors",
468
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
469
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
470
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
471
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
472
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.bias": "model-00007-of-00007.safetensors",
473
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.weight": "model-00007-of-00007.safetensors",
474
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.bias": "model-00006-of-00007.safetensors",
475
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.weight": "model-00006-of-00007.safetensors",
476
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.bias": "model-00006-of-00007.safetensors",
477
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.weight": "model-00006-of-00007.safetensors",
478
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.bias": "model-00006-of-00007.safetensors",
479
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.weight": "model-00006-of-00007.safetensors",
480
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.bias": "model-00006-of-00007.safetensors",
481
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.weight": "model-00006-of-00007.safetensors",
482
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.bias": "model-00006-of-00007.safetensors",
483
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.weight": "model-00006-of-00007.safetensors",
484
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.bias": "model-00006-of-00007.safetensors",
485
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.weight": "model-00006-of-00007.safetensors",
486
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.bias": "model-00006-of-00007.safetensors",
487
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.weight": "model-00006-of-00007.safetensors",
488
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.bias": "model-00006-of-00007.safetensors",
489
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.weight": "model-00006-of-00007.safetensors",
490
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.bias": "model-00007-of-00007.safetensors",
491
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.weight": "model-00007-of-00007.safetensors",
492
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.bias": "model-00007-of-00007.safetensors",
493
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.weight": "model-00007-of-00007.safetensors",
494
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.bias": "model-00007-of-00007.safetensors",
495
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.weight": "model-00007-of-00007.safetensors",
496
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.bias": "model-00007-of-00007.safetensors",
497
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.weight": "model-00007-of-00007.safetensors",
498
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.bias": "model-00007-of-00007.safetensors",
499
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.weight": "model-00007-of-00007.safetensors",
500
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
501
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
502
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
503
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
504
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.bias": "model-00007-of-00007.safetensors",
505
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.weight": "model-00007-of-00007.safetensors",
506
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.bias": "model-00007-of-00007.safetensors",
507
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.weight": "model-00007-of-00007.safetensors",
508
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.bias": "model-00007-of-00007.safetensors",
509
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.weight": "model-00007-of-00007.safetensors",
510
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.bias": "model-00007-of-00007.safetensors",
511
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.weight": "model-00007-of-00007.safetensors",
512
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.bias": "model-00007-of-00007.safetensors",
513
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.weight": "model-00007-of-00007.safetensors",
514
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.bias": "model-00007-of-00007.safetensors",
515
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.weight": "model-00007-of-00007.safetensors",
516
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
517
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
518
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
519
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
520
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.bias": "model-00007-of-00007.safetensors",
521
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.weight": "model-00007-of-00007.safetensors",
522
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wk.bias": "model-00007-of-00007.safetensors",
523
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wk.weight": "model-00007-of-00007.safetensors",
524
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wo.bias": "model-00007-of-00007.safetensors",
525
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wo.weight": "model-00007-of-00007.safetensors",
526
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wq.bias": "model-00007-of-00007.safetensors",
527
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wq.weight": "model-00007-of-00007.safetensors",
528
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wv.bias": "model-00007-of-00007.safetensors",
529
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wv.weight": "model-00007-of-00007.safetensors",
530
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention_norm.bias": "model-00007-of-00007.safetensors",
531
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention_norm.weight": "model-00007-of-00007.safetensors",
532
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
533
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
534
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
535
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
536
+ "model.vision_backbone.image_vit.transformer.resblocks.22.ffn_norm.bias": "model-00007-of-00007.safetensors",
537
+ "model.vision_backbone.image_vit.transformer.resblocks.22.ffn_norm.weight": "model-00007-of-00007.safetensors",
538
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.bias": "model-00006-of-00007.safetensors",
539
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.weight": "model-00006-of-00007.safetensors",
540
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.bias": "model-00006-of-00007.safetensors",
541
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.weight": "model-00006-of-00007.safetensors",
542
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.bias": "model-00006-of-00007.safetensors",
543
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.weight": "model-00006-of-00007.safetensors",
544
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.bias": "model-00006-of-00007.safetensors",
545
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.weight": "model-00006-of-00007.safetensors",
546
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.bias": "model-00007-of-00007.safetensors",
547
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.weight": "model-00007-of-00007.safetensors",
548
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
549
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
550
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
551
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
552
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.bias": "model-00007-of-00007.safetensors",
553
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.weight": "model-00007-of-00007.safetensors",
554
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.bias": "model-00007-of-00007.safetensors",
555
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.weight": "model-00007-of-00007.safetensors",
556
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.bias": "model-00007-of-00007.safetensors",
557
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.weight": "model-00007-of-00007.safetensors",
558
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.bias": "model-00007-of-00007.safetensors",
559
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.weight": "model-00007-of-00007.safetensors",
560
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.bias": "model-00007-of-00007.safetensors",
561
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.weight": "model-00007-of-00007.safetensors",
562
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.bias": "model-00007-of-00007.safetensors",
563
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.weight": "model-00007-of-00007.safetensors",
564
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
565
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
566
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
567
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
568
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.bias": "model-00007-of-00007.safetensors",
569
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.weight": "model-00007-of-00007.safetensors",
570
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.bias": "model-00007-of-00007.safetensors",
571
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.weight": "model-00007-of-00007.safetensors",
572
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.bias": "model-00007-of-00007.safetensors",
573
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.weight": "model-00007-of-00007.safetensors",
574
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.bias": "model-00007-of-00007.safetensors",
575
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.weight": "model-00007-of-00007.safetensors",
576
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.bias": "model-00007-of-00007.safetensors",
577
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.weight": "model-00007-of-00007.safetensors",
578
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.bias": "model-00007-of-00007.safetensors",
579
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.weight": "model-00007-of-00007.safetensors",
580
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
581
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
582
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
583
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
584
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.bias": "model-00007-of-00007.safetensors",
585
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.weight": "model-00007-of-00007.safetensors",
586
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.bias": "model-00007-of-00007.safetensors",
587
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.weight": "model-00007-of-00007.safetensors",
588
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.bias": "model-00007-of-00007.safetensors",
589
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.weight": "model-00007-of-00007.safetensors",
590
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.bias": "model-00007-of-00007.safetensors",
591
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.weight": "model-00007-of-00007.safetensors",
592
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.bias": "model-00007-of-00007.safetensors",
593
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.weight": "model-00007-of-00007.safetensors",
594
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.bias": "model-00007-of-00007.safetensors",
595
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.weight": "model-00007-of-00007.safetensors",
596
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
597
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
598
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
599
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
600
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.bias": "model-00007-of-00007.safetensors",
601
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.weight": "model-00007-of-00007.safetensors",
602
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.bias": "model-00007-of-00007.safetensors",
603
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.weight": "model-00007-of-00007.safetensors",
604
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.bias": "model-00007-of-00007.safetensors",
605
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.weight": "model-00007-of-00007.safetensors",
606
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.bias": "model-00007-of-00007.safetensors",
607
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.weight": "model-00007-of-00007.safetensors",
608
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.bias": "model-00007-of-00007.safetensors",
609
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.weight": "model-00007-of-00007.safetensors",
610
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.bias": "model-00007-of-00007.safetensors",
611
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.weight": "model-00007-of-00007.safetensors",
612
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
613
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
614
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
615
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
616
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.bias": "model-00007-of-00007.safetensors",
617
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.weight": "model-00007-of-00007.safetensors",
618
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.bias": "model-00007-of-00007.safetensors",
619
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.weight": "model-00007-of-00007.safetensors",
620
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.bias": "model-00007-of-00007.safetensors",
621
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.weight": "model-00007-of-00007.safetensors",
622
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.bias": "model-00007-of-00007.safetensors",
623
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.weight": "model-00007-of-00007.safetensors",
624
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.bias": "model-00007-of-00007.safetensors",
625
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.weight": "model-00007-of-00007.safetensors",
626
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.bias": "model-00007-of-00007.safetensors",
627
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.weight": "model-00007-of-00007.safetensors",
628
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
629
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
630
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
631
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
632
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.bias": "model-00007-of-00007.safetensors",
633
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.weight": "model-00007-of-00007.safetensors",
634
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.bias": "model-00007-of-00007.safetensors",
635
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.weight": "model-00007-of-00007.safetensors",
636
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.bias": "model-00007-of-00007.safetensors",
637
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.weight": "model-00007-of-00007.safetensors",
638
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.bias": "model-00007-of-00007.safetensors",
639
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.weight": "model-00007-of-00007.safetensors",
640
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.bias": "model-00007-of-00007.safetensors",
641
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.weight": "model-00007-of-00007.safetensors",
642
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.bias": "model-00007-of-00007.safetensors",
643
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.weight": "model-00007-of-00007.safetensors",
644
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.bias": "model-00007-of-00007.safetensors",
645
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.weight": "model-00007-of-00007.safetensors",
646
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
647
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
648
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00007.safetensors",
649
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors",
650
+ "model.vision_backbone.pad_embed": "model-00006-of-00007.safetensors"
651
+ }
652
+ }
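The weight map above completes model.safetensors.index.json: every parameter name is keyed to the shard that stores it, with the vision backbone split across model-00006 and model-00007. A minimal sketch of using that index to pull one tensor without materializing the whole model; the checkpoint directory is a placeholder path, and the safetensors package is assumed to be installed:

```python
import json
from pathlib import Path

from safetensors import safe_open

# Placeholder: a local clone of this repository.
ckpt_dir = Path("./molmo-checkpoint")

# The index maps each parameter name to the shard file that holds it.
with open(ckpt_dir / "model.safetensors.index.json") as f:
    weight_map = json.load(f)["weight_map"]

name = "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.weight"
shard = weight_map[name]  # e.g. "model-00007-of-00007.safetensors"

# Open only that shard and read the single tensor lazily.
with safe_open(str(ckpt_dir / shard), framework="pt", device="cpu") as f:
    tensor = f.get_tensor(name)
print(name, tuple(tensor.shape), "from", shard)
```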
modeling_molmo.py ADDED
The diff for this file is too large to render. See raw diff
 
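The modeling_molmo.py diff is too large to render, but it supplies the MolmoForCausalLM class that this checkpoint loads through remote code. A minimal loading sketch; the repository path is a placeholder, and it assumes the model config registers MolmoForCausalLM under auto_map, mirroring how preprocessor_config.json below registers the processor:

```python
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "./molmo-checkpoint"  # placeholder: local clone or hub id of this repo

# trust_remote_code is required because MolmoForCausalLM and MolmoProcessor
# live in modeling_molmo.py / preprocessing_molmo.py inside the repo,
# not in the transformers library itself.
model = AutoModelForCausalLM.from_pretrained(
    repo,
    trust_remote_code=True,
    torch_dtype="auto",
)
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
```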
preprocessing_molmo.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Processor class for Molmo.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ try:
8
+ from typing import Unpack
9
+ except ImportError:
10
+ from typing_extensions import Unpack
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+ from transformers.image_utils import ImageInput
16
+ from transformers.processing_utils import (
17
+ TextKwargs,
18
+ ProcessingKwargs,
19
+ ProcessorMixin,
20
+ )
21
+
22
+ from transformers.tokenization_utils_base import TextInput
23
+ from transformers.utils import logging
24
+
25
+ from transformers import AutoTokenizer
26
+ from .image_preprocessing_molmo import MolmoImagesKwargs, make_batched_images, MolmoImageProcessor
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
+ DEFAULT_IMAGE_PATCH_TOKEN = f"<im_patch>"
33
+ DEFAULT_IM_START_TOKEN = f"<im_start>"
34
+ DEFAULT_IM_END_TOKEN = f"<im_end>"
35
+ DEFAULT_IM_COL_TOKEN = f"<im_col>"
36
+ IMAGE_PROMPT = "<|image|>"
37
+
38
+ EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)
39
+
40
+
41
+ def get_special_token_ids(tokenizer):
42
+ ids = tokenizer.encode("".join(EXTRA_TOKENS), add_special_tokens=False)
43
+ assert len(ids) == len(EXTRA_TOKENS)
44
+ return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
45
+
46
+
47
+ class MolmoTextKwargs(TextKwargs, total=False):
48
+ style: Optional[str]
49
+ system_prompt: Optional[str]
50
+ message_format: Optional[str]
51
+ always_start_with_space: Optional[bool]
52
+ sequence_length: Optional[int]
53
+
54
+
55
+ class MolmoProcessorKwargs(ProcessingKwargs, total=False):
56
+ text_kwargs: MolmoTextKwargs
57
+ images_kwargs: MolmoImagesKwargs
58
+ _defaults = {
59
+ "images_kwargs": {
60
+ "max_crops": 12,
61
+ "overlap_margins": [4, 4],
62
+ "base_image_input_size": [336, 336],
63
+ "image_token_length_w": 12,
64
+ "image_token_length_h": 12,
65
+ "image_patch_size": 14,
66
+ "image_padding_mask": True,
67
+ },
68
+ "text_kwargs": {
69
+ "style": "long_caption",
70
+ "system_prompt": "none",
71
+ "message_format": "role",
72
+ "always_start_with_space": True,
73
+ "sequence_length": 1536,
74
+ "padding": False,
75
+ },
76
+ }
77
+
78
+
79
+ class MolmoProcessor(ProcessorMixin):
80
+ attributes = ["image_processor", "tokenizer"]
81
+ image_processor_class = "AutoImageProcessor"
82
+ tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
83
+
84
+ def __init__(self, image_processor: MolmoImageProcessor = None, tokenizer: AutoTokenizer = None, **kwargs):
85
+ # self.image_processor = image_processor
86
+ # self.tokenizer = tokenizer
87
+ super().__init__(image_processor, tokenizer)
88
+ self._special_tokens = None
89
+
90
+ @property
91
+ def special_token_ids(self):
92
+ if self._special_tokens is None:
93
+ self._special_tokens = get_special_token_ids(self.tokenizer)
94
+ return self._special_tokens
95
+
96
+ def get_tokens_input(self, prompt, message_format, always_start_with_space):
97
+ if message_format == "none" or message_format is None:
98
+ pass
99
+ elif message_format == "role":
100
+ prompt = "User: " + prompt + " Assistant:"
101
+ else:
102
+ raise NotImplementedError(f"Message format {message_format} not implemented")
103
+
104
+ if always_start_with_space:
105
+ prompt = " " + prompt
106
+
107
+ tokens = self.tokenizer.encode(prompt, add_special_tokens=False)
108
+
109
+ return tokens
110
+
111
+ def process(
112
+ self,
113
+ text: TextInput = None,
114
+ images: ImageInput = None,
115
+ **kwargs: Unpack[MolmoProcessorKwargs],
116
+ ):
117
+ output_kwargs = self._merge_kwargs(
118
+ MolmoProcessorKwargs,
119
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
120
+ **kwargs,
121
+ )
122
+
123
+ tokens = self.get_tokens_input(
124
+ text,
125
+ output_kwargs["text_kwargs"]["message_format"],
126
+ output_kwargs["text_kwargs"]["always_start_with_space"],
127
+ )
128
+
129
+ image_token_id = self.special_token_ids[IMAGE_PROMPT]
130
+
131
+ if images is not None:
132
+ images = make_batched_images(images)
133
+ images = [np.array(image).astype(np.uint8) for image in images]
134
+ # For now only support inserting images at the start
135
+ image_idx = [-1]*len(images)
136
+ else:
137
+ image_idx = None
138
+
139
+ sequence_length = output_kwargs["text_kwargs"]["sequence_length"]
140
+
141
+ image_patch_token_id = self.special_token_ids[DEFAULT_IMAGE_PATCH_TOKEN]
142
+ image_col_token_id = self.special_token_ids[DEFAULT_IM_COL_TOKEN]
143
+ image_start_token_id = self.special_token_ids[DEFAULT_IM_START_TOKEN]
144
+ image_end_token_id = self.special_token_ids[DEFAULT_IM_END_TOKEN]
145
+ out = self.image_processor.multimodal_preprocess(
146
+ images=images,
147
+ image_idx=image_idx,
148
+ tokens=np.asarray(tokens).astype(np.int32),
149
+ sequence_length=sequence_length,
150
+ image_patch_token_id=image_patch_token_id,
151
+ image_col_token_id=image_col_token_id,
152
+ image_start_token_id=image_start_token_id,
153
+ image_end_token_id=image_end_token_id,
154
+ **output_kwargs["images_kwargs"]
155
+ )
156
+
157
+ # Prepend BOS
158
+ # qwen2 and olmo do not have a BOS, and instead use EOS as a generic separator token.
159
+ bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
160
+ decoder_input_tokens = np.pad(out["input_ids"], [[1, 0]], constant_values=bos)
161
+ out["input_ids"] = decoder_input_tokens
162
+ if "image_input_idx" in out:
163
+ # Shift patch mapping up by one since we added BOS
164
+ image_input_idx = out["image_input_idx"]
165
+ out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
166
+
167
+ for k, v in out.items():
168
+ out[k] = torch.from_numpy(v)
169
+
170
+ return out
171
+
172
+
173
+ MolmoProcessor.register_for_auto_class()
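Since preprocessing_molmo.py ends by registering MolmoProcessor for auto classes, it is normally reached through AutoProcessor. A rough usage sketch (paths are placeholders): process() wraps the prompt as "User: ... Assistant:" by default, runs the image through multimodal_preprocess, prepends a BOS (or EOS) token, and returns unbatched torch tensors, so a batch dimension has to be added before generation:

```python
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("./molmo-checkpoint", trust_remote_code=True)

image = Image.open("example.jpg")  # placeholder image file
inputs = processor.process(text="Describe this image.", images=[image])

# process() returns unbatched tensors; add a batch dimension for the model.
inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}
print({k: tuple(v.shape) for k, v in inputs.items()})
# expect input_ids (with BOS prepended) plus image tensors such as image_input_idx
```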
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_preprocessing_molmo.MolmoImageProcessor",
4
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
5
+ },
6
+ "base_image_input_size": [
7
+ 336,
8
+ 336
9
+ ],
10
+ "do_normalize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_padding_mask": true,
17
+ "image_patch_size": 14,
18
+ "image_processor_type": "MolmoImageProcessor",
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "image_token_length_h": 12,
25
+ "image_token_length_w": 12,
26
+ "max_crops": 12,
27
+ "overlap_margins": [
28
+ 4,
29
+ 4
30
+ ],
31
+ "processor_class": "MolmoProcessor"
32
+ }
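The sizes in preprocessor_config.json fit together: a 336-pixel crop split into 14-pixel patches gives 24 patches per side, which the backbone reduces to the 12 x 12 grid given by image_token_length_w/h, i.e. 144 <im_patch> tokens per crop before the <im_start>/<im_col>/<im_end> markers, with at most max_crops = 12 crops per image. A back-of-the-envelope check; the 2x2 pooling factor is inferred from 24 vs. 12 rather than stated in this file:

```python
# Token accounting implied by preprocessor_config.json.
base_size = 336        # base_image_input_size
patch_size = 14        # image_patch_size
tok_side = 12          # image_token_length_w == image_token_length_h
max_crops = 12

patches_per_side = base_size // patch_size   # 24 raw ViT patches per side
pooling = patches_per_side // tok_side       # 2 (inferred, not stated in the config)
tokens_per_crop = tok_side * tok_side        # 144 <im_patch> tokens per crop
print(patches_per_side, pooling, tokens_per_crop, tokens_per_crop * max_crops)
```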
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
4
+ },
5
+ "processor_class": "MolmoProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<im_start>",
4
+ "<im_end>",
5
+ "<im_patch>",
6
+ "<im_col>",
7
+ "<|image|>"
8
+ ],
9
+ "bos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "<|pad|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
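These additional special tokens are the ones preprocessing_molmo.py resolves with get_special_token_ids(); a quick sanity check (repository path is a placeholder) that the tokenizer keeps each of them atomic:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./molmo-checkpoint", trust_remote_code=True)
for token in ["<im_start>", "<im_end>", "<im_patch>", "<im_col>", "<|image|>"]:
    ids = tok.encode(token, add_special_tokens=False)
    print(token, ids)  # each special token should encode to exactly one id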
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "100256": {
5
+ "content": "<|extra_id_0|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": false
11
+ },
12
+ "100257": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "100258": {
21
+ "content": "<|fim_prefix|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "100259": {
29
+ "content": "<|fim_middle|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "100260": {
37
+ "content": "<|fim_suffix|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "100261": {
45
+ "content": "|||PHONE_NUMBER|||",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "100262": {
53
+ "content": "|||EMAIL_ADDRESS|||",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ },
60
+ "100263": {
61
+ "content": "|||IP_ADDRESS|||",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": false
67
+ },
68
+ "100264": {
69
+ "content": "<|im_start|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "100265": {
77
+ "content": "<|im_end|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "100266": {
85
+ "content": "<|extra_id_1|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "100267": {
93
+ "content": "<|extra_id_2|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": false
99
+ },
100
+ "100268": {
101
+ "content": "<|extra_id_3|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": false
107
+ },
108
+ "100269": {
109
+ "content": "<|extra_id_4|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": false
115
+ },
116
+ "100270": {
117
+ "content": "<|extra_id_5|>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "100271": {
125
+ "content": "<|extra_id_6|>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "100272": {
133
+ "content": "<|extra_id_7|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "100273": {
141
+ "content": "<|extra_id_8|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "100274": {
149
+ "content": "<|extra_id_9|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "100275": {
157
+ "content": "<|extra_id_10|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "100276": {
165
+ "content": "<|endofprompt|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "100277": {
173
+ "content": "<|pad|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "100278": {
181
+ "content": "<im_start>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "100279": {
189
+ "content": "<im_end>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "100280": {
197
+ "content": "<im_patch>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "100281": {
205
+ "content": "<im_col>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "100282": {
213
+ "content": "<|image|>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ }
220
+ },
221
+ "additional_special_tokens": [
222
+ "<im_start>",
223
+ "<im_end>",
224
+ "<im_patch>",
225
+ "<im_col>",
226
+ "<|image|>"
227
+ ],
228
+ "auto_map": {
229
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
230
+ },
231
+ "bos_token": "<|endoftext|>",
232
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
233
+ "clean_up_tokenization_spaces": false,
234
+ "eos_token": "<|endoftext|>",
235
+ "model_max_length": 8192,
236
+ "pad_token": "<|pad|>",
237
+ "processor_class": "MolmoProcessor",
238
+ "tokenizer_class": "GPT2Tokenizer",
239
+ "unk_token": "<|endoftext|>"
240
+ }
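tokenizer_config.json also ships a chat_template that wraps each turn in <|im_start|>role ... <|im_end|>; note this is separate from the "User: ... Assistant:" wrapper that MolmoProcessor.process() builds itself. A minimal sketch of applying the template (path is a placeholder):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./molmo-checkpoint")
messages = [{"role": "user", "content": "Point to the dog."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# "<|im_start|>user\nPoint to the dog.<|im_end|>\n<|im_start|>assistant\n"
```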
vocab.json ADDED
The diff for this file is too large to render. See raw diff