stevengrove committed
Commit e413c25
Parent: aa7c8e9

Create yolo_world_xl_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py

configs/pretrain/yolo_world_xl_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ADDED
@@ -0,0 +1,192 @@
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_x_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4  # matches the '2e-4' tag in the file name
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
deepen_factor = 1.0
widen_factor = 1.5

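# Note: '4x8gpus' in the file name reads as 4 nodes x 8 GPUs = 32 GPUs, so
# the effective batch size is 32 * train_batch_size_per_gpu = 32 * 16 = 512
# images per iteration. num_classes = 1203 is the LVIS v1 vocabulary used
# at test time; during training each image is paired with at most
# num_training_classes = 80 text prompts.
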
# model settings
image_backbone = _base_.model.backbone
image_backbone.update(
    deepen_factor=deepen_factor,
    widen_factor=widen_factor
)
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model=image_backbone,
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              deepen_factor=deepen_factor,
              widen_factor=widen_factor,
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              num_csp_blocks=2),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    widen_factor=widen_factor,
                                    embed_dims=text_channels,
                                    use_bn_head=True,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

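# Note: the detector pairs the YOLOv8-X image backbone (widened to 1.5x)
# with a frozen CLIP ViT-B/32 text encoder. The text embeddings guide the
# MaxSigmoid cross-attention blocks in the PAFPN neck and a BatchNorm-based
# contrastive head (use_bn_head=True), which the 't2i' and 'bn' tags in the
# file name appear to reference.
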
# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
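# Note: RandomLoadText draws negative class names from the full 1203-class
# vocabulary and pads/truncates each image's text list to exactly
# max_num_samples = 80 entries (padding_value='') so text batches stack
# into fixed-size tensors.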
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
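# Note: the '[:-1]' slices drop the base config's trailing PackDetInputs so
# that text_transform re-packs inputs with the extra 'texts' meta key;
# train_pipeline_stage2 is the mosaic-free variant switched in for the last
# close_mosaic_epochs epochs by the PipelineSwitchHook below.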
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/captions/obj365v1_class_captions.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='gqa/images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[
            obj365v1_train_dataset,
            flickr_train_dataset,
            mg_train_dataset
        ],
        ignore_keys=['classes', 'palette']))

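# Note: pretraining concatenates detection data (Objects365 v1) with the
# GoldG grounding data (Flickr30k Entities plus the GQA-based mixed
# grounding split, whose 'no_coco' annotations exclude COCO images);
# ignore_keys lets ConcatDataset merge datasets whose 'classes' and
# 'palette' metainfo differ.
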
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/lvis/',
        test_mode=True,
        ann_file='annotations/'
        'lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/captions/lvis_v1_class_captions.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/lvis/annotations/'
    'lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

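# Note: despite its name, coco_val_dataset wraps the LVIS v1 minival split.
# Evaluation is zero-shot: the model is prompted with the 1203 LVIS class
# texts via LoadText and scored with the LVIS bbox metric.
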
# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals,
                    rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
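# Note: EMAHook maintains an exponential moving average of the weights
# (momentum 0.0001), PipelineSwitchHook turns mosaic off at epoch
# max_epochs - close_mosaic_epochs = 98, and dynamic_intervals tightens
# validation from every 10 epochs to _base_.val_interval_stage2 once
# mosaic is closed.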
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
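# Note: frozen_modules=['all'] already stops gradients in the text encoder,
# and lr_mult=0.01 further damps any text parameters left trainable;
# logit_scale (the CLIP temperature) is excluded from weight decay.
# batch_size_per_gpu is consumed by the optimizer constructor, which in
# mmyolo's YOLOv5 convention rescales weight decay with the total batch
# size.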
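
A quick way to sanity-check the assembled config is to load it with
mmengine. The sketch below is not part of the committed file and assumes
YOLO-World and its third_party/mmyolo checkout are importable so that
custom_imports resolves:

from mmengine.config import Config

cfg = Config.fromfile(
    'configs/pretrain/yolo_world_xl_t2i_bn_2e-4_100e_4x8gpus_'
    'obj365v1_goldg_train_lvis_minival.py')
# Inspect a few merged values from _base_ plus the overrides above.
print(cfg.model.backbone.type)          # MultiModalYOLOBackbone
print(cfg.train_dataloader.batch_size)  # 16 (per GPU)
print(cfg.optim_wrapper.optimizer.lr)   # 0.0002

Training itself would typically launch through mmyolo's distributed
script, e.g. "bash tools/dist_train.sh <config> 8 --amp" on each node.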