freddyaboulton HF staff commited on
Commit
8cfdd9d
1 Parent(s): c530e94
MobileNetSSD_deploy.caffemodel DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:761c86fbae3d8361dd454f7c740a964f62975ed32f4324b8b85994edec30f6af
3
- size 23147564
 
 
 
 
MobileNetSSD_deploy.prototxt.txt DELETED
@@ -1,1912 +0,0 @@
1
- name: "MobileNet-SSD"
2
- input: "data"
3
- input_shape {
4
- dim: 1
5
- dim: 3
6
- dim: 300
7
- dim: 300
8
- }
9
- layer {
10
- name: "conv0"
11
- type: "Convolution"
12
- bottom: "data"
13
- top: "conv0"
14
- param {
15
- lr_mult: 1.0
16
- decay_mult: 1.0
17
- }
18
- param {
19
- lr_mult: 2.0
20
- decay_mult: 0.0
21
- }
22
- convolution_param {
23
- num_output: 32
24
- pad: 1
25
- kernel_size: 3
26
- stride: 2
27
- weight_filler {
28
- type: "msra"
29
- }
30
- bias_filler {
31
- type: "constant"
32
- value: 0.0
33
- }
34
- }
35
- }
36
- layer {
37
- name: "conv0/relu"
38
- type: "ReLU"
39
- bottom: "conv0"
40
- top: "conv0"
41
- }
42
- layer {
43
- name: "conv1/dw"
44
- type: "Convolution"
45
- bottom: "conv0"
46
- top: "conv1/dw"
47
- param {
48
- lr_mult: 1.0
49
- decay_mult: 1.0
50
- }
51
- param {
52
- lr_mult: 2.0
53
- decay_mult: 0.0
54
- }
55
- convolution_param {
56
- num_output: 32
57
- pad: 1
58
- kernel_size: 3
59
- group: 32
60
- engine: CAFFE
61
- weight_filler {
62
- type: "msra"
63
- }
64
- bias_filler {
65
- type: "constant"
66
- value: 0.0
67
- }
68
- }
69
- }
70
- layer {
71
- name: "conv1/dw/relu"
72
- type: "ReLU"
73
- bottom: "conv1/dw"
74
- top: "conv1/dw"
75
- }
76
- layer {
77
- name: "conv1"
78
- type: "Convolution"
79
- bottom: "conv1/dw"
80
- top: "conv1"
81
- param {
82
- lr_mult: 1.0
83
- decay_mult: 1.0
84
- }
85
- param {
86
- lr_mult: 2.0
87
- decay_mult: 0.0
88
- }
89
- convolution_param {
90
- num_output: 64
91
- kernel_size: 1
92
- weight_filler {
93
- type: "msra"
94
- }
95
- bias_filler {
96
- type: "constant"
97
- value: 0.0
98
- }
99
- }
100
- }
101
- layer {
102
- name: "conv1/relu"
103
- type: "ReLU"
104
- bottom: "conv1"
105
- top: "conv1"
106
- }
107
- layer {
108
- name: "conv2/dw"
109
- type: "Convolution"
110
- bottom: "conv1"
111
- top: "conv2/dw"
112
- param {
113
- lr_mult: 1.0
114
- decay_mult: 1.0
115
- }
116
- param {
117
- lr_mult: 2.0
118
- decay_mult: 0.0
119
- }
120
- convolution_param {
121
- num_output: 64
122
- pad: 1
123
- kernel_size: 3
124
- stride: 2
125
- group: 64
126
- engine: CAFFE
127
- weight_filler {
128
- type: "msra"
129
- }
130
- bias_filler {
131
- type: "constant"
132
- value: 0.0
133
- }
134
- }
135
- }
136
- layer {
137
- name: "conv2/dw/relu"
138
- type: "ReLU"
139
- bottom: "conv2/dw"
140
- top: "conv2/dw"
141
- }
142
- layer {
143
- name: "conv2"
144
- type: "Convolution"
145
- bottom: "conv2/dw"
146
- top: "conv2"
147
- param {
148
- lr_mult: 1.0
149
- decay_mult: 1.0
150
- }
151
- param {
152
- lr_mult: 2.0
153
- decay_mult: 0.0
154
- }
155
- convolution_param {
156
- num_output: 128
157
- kernel_size: 1
158
- weight_filler {
159
- type: "msra"
160
- }
161
- bias_filler {
162
- type: "constant"
163
- value: 0.0
164
- }
165
- }
166
- }
167
- layer {
168
- name: "conv2/relu"
169
- type: "ReLU"
170
- bottom: "conv2"
171
- top: "conv2"
172
- }
173
- layer {
174
- name: "conv3/dw"
175
- type: "Convolution"
176
- bottom: "conv2"
177
- top: "conv3/dw"
178
- param {
179
- lr_mult: 1.0
180
- decay_mult: 1.0
181
- }
182
- param {
183
- lr_mult: 2.0
184
- decay_mult: 0.0
185
- }
186
- convolution_param {
187
- num_output: 128
188
- pad: 1
189
- kernel_size: 3
190
- group: 128
191
- engine: CAFFE
192
- weight_filler {
193
- type: "msra"
194
- }
195
- bias_filler {
196
- type: "constant"
197
- value: 0.0
198
- }
199
- }
200
- }
201
- layer {
202
- name: "conv3/dw/relu"
203
- type: "ReLU"
204
- bottom: "conv3/dw"
205
- top: "conv3/dw"
206
- }
207
- layer {
208
- name: "conv3"
209
- type: "Convolution"
210
- bottom: "conv3/dw"
211
- top: "conv3"
212
- param {
213
- lr_mult: 1.0
214
- decay_mult: 1.0
215
- }
216
- param {
217
- lr_mult: 2.0
218
- decay_mult: 0.0
219
- }
220
- convolution_param {
221
- num_output: 128
222
- kernel_size: 1
223
- weight_filler {
224
- type: "msra"
225
- }
226
- bias_filler {
227
- type: "constant"
228
- value: 0.0
229
- }
230
- }
231
- }
232
- layer {
233
- name: "conv3/relu"
234
- type: "ReLU"
235
- bottom: "conv3"
236
- top: "conv3"
237
- }
238
- layer {
239
- name: "conv4/dw"
240
- type: "Convolution"
241
- bottom: "conv3"
242
- top: "conv4/dw"
243
- param {
244
- lr_mult: 1.0
245
- decay_mult: 1.0
246
- }
247
- param {
248
- lr_mult: 2.0
249
- decay_mult: 0.0
250
- }
251
- convolution_param {
252
- num_output: 128
253
- pad: 1
254
- kernel_size: 3
255
- stride: 2
256
- group: 128
257
- engine: CAFFE
258
- weight_filler {
259
- type: "msra"
260
- }
261
- bias_filler {
262
- type: "constant"
263
- value: 0.0
264
- }
265
- }
266
- }
267
- layer {
268
- name: "conv4/dw/relu"
269
- type: "ReLU"
270
- bottom: "conv4/dw"
271
- top: "conv4/dw"
272
- }
273
- layer {
274
- name: "conv4"
275
- type: "Convolution"
276
- bottom: "conv4/dw"
277
- top: "conv4"
278
- param {
279
- lr_mult: 1.0
280
- decay_mult: 1.0
281
- }
282
- param {
283
- lr_mult: 2.0
284
- decay_mult: 0.0
285
- }
286
- convolution_param {
287
- num_output: 256
288
- kernel_size: 1
289
- weight_filler {
290
- type: "msra"
291
- }
292
- bias_filler {
293
- type: "constant"
294
- value: 0.0
295
- }
296
- }
297
- }
298
- layer {
299
- name: "conv4/relu"
300
- type: "ReLU"
301
- bottom: "conv4"
302
- top: "conv4"
303
- }
304
- layer {
305
- name: "conv5/dw"
306
- type: "Convolution"
307
- bottom: "conv4"
308
- top: "conv5/dw"
309
- param {
310
- lr_mult: 1.0
311
- decay_mult: 1.0
312
- }
313
- param {
314
- lr_mult: 2.0
315
- decay_mult: 0.0
316
- }
317
- convolution_param {
318
- num_output: 256
319
- pad: 1
320
- kernel_size: 3
321
- group: 256
322
- engine: CAFFE
323
- weight_filler {
324
- type: "msra"
325
- }
326
- bias_filler {
327
- type: "constant"
328
- value: 0.0
329
- }
330
- }
331
- }
332
- layer {
333
- name: "conv5/dw/relu"
334
- type: "ReLU"
335
- bottom: "conv5/dw"
336
- top: "conv5/dw"
337
- }
338
- layer {
339
- name: "conv5"
340
- type: "Convolution"
341
- bottom: "conv5/dw"
342
- top: "conv5"
343
- param {
344
- lr_mult: 1.0
345
- decay_mult: 1.0
346
- }
347
- param {
348
- lr_mult: 2.0
349
- decay_mult: 0.0
350
- }
351
- convolution_param {
352
- num_output: 256
353
- kernel_size: 1
354
- weight_filler {
355
- type: "msra"
356
- }
357
- bias_filler {
358
- type: "constant"
359
- value: 0.0
360
- }
361
- }
362
- }
363
- layer {
364
- name: "conv5/relu"
365
- type: "ReLU"
366
- bottom: "conv5"
367
- top: "conv5"
368
- }
369
- layer {
370
- name: "conv6/dw"
371
- type: "Convolution"
372
- bottom: "conv5"
373
- top: "conv6/dw"
374
- param {
375
- lr_mult: 1.0
376
- decay_mult: 1.0
377
- }
378
- param {
379
- lr_mult: 2.0
380
- decay_mult: 0.0
381
- }
382
- convolution_param {
383
- num_output: 256
384
- pad: 1
385
- kernel_size: 3
386
- stride: 2
387
- group: 256
388
- engine: CAFFE
389
- weight_filler {
390
- type: "msra"
391
- }
392
- bias_filler {
393
- type: "constant"
394
- value: 0.0
395
- }
396
- }
397
- }
398
- layer {
399
- name: "conv6/dw/relu"
400
- type: "ReLU"
401
- bottom: "conv6/dw"
402
- top: "conv6/dw"
403
- }
404
- layer {
405
- name: "conv6"
406
- type: "Convolution"
407
- bottom: "conv6/dw"
408
- top: "conv6"
409
- param {
410
- lr_mult: 1.0
411
- decay_mult: 1.0
412
- }
413
- param {
414
- lr_mult: 2.0
415
- decay_mult: 0.0
416
- }
417
- convolution_param {
418
- num_output: 512
419
- kernel_size: 1
420
- weight_filler {
421
- type: "msra"
422
- }
423
- bias_filler {
424
- type: "constant"
425
- value: 0.0
426
- }
427
- }
428
- }
429
- layer {
430
- name: "conv6/relu"
431
- type: "ReLU"
432
- bottom: "conv6"
433
- top: "conv6"
434
- }
435
- layer {
436
- name: "conv7/dw"
437
- type: "Convolution"
438
- bottom: "conv6"
439
- top: "conv7/dw"
440
- param {
441
- lr_mult: 1.0
442
- decay_mult: 1.0
443
- }
444
- param {
445
- lr_mult: 2.0
446
- decay_mult: 0.0
447
- }
448
- convolution_param {
449
- num_output: 512
450
- pad: 1
451
- kernel_size: 3
452
- group: 512
453
- engine: CAFFE
454
- weight_filler {
455
- type: "msra"
456
- }
457
- bias_filler {
458
- type: "constant"
459
- value: 0.0
460
- }
461
- }
462
- }
463
- layer {
464
- name: "conv7/dw/relu"
465
- type: "ReLU"
466
- bottom: "conv7/dw"
467
- top: "conv7/dw"
468
- }
469
- layer {
470
- name: "conv7"
471
- type: "Convolution"
472
- bottom: "conv7/dw"
473
- top: "conv7"
474
- param {
475
- lr_mult: 1.0
476
- decay_mult: 1.0
477
- }
478
- param {
479
- lr_mult: 2.0
480
- decay_mult: 0.0
481
- }
482
- convolution_param {
483
- num_output: 512
484
- kernel_size: 1
485
- weight_filler {
486
- type: "msra"
487
- }
488
- bias_filler {
489
- type: "constant"
490
- value: 0.0
491
- }
492
- }
493
- }
494
- layer {
495
- name: "conv7/relu"
496
- type: "ReLU"
497
- bottom: "conv7"
498
- top: "conv7"
499
- }
500
- layer {
501
- name: "conv8/dw"
502
- type: "Convolution"
503
- bottom: "conv7"
504
- top: "conv8/dw"
505
- param {
506
- lr_mult: 1.0
507
- decay_mult: 1.0
508
- }
509
- param {
510
- lr_mult: 2.0
511
- decay_mult: 0.0
512
- }
513
- convolution_param {
514
- num_output: 512
515
- pad: 1
516
- kernel_size: 3
517
- group: 512
518
- engine: CAFFE
519
- weight_filler {
520
- type: "msra"
521
- }
522
- bias_filler {
523
- type: "constant"
524
- value: 0.0
525
- }
526
- }
527
- }
528
- layer {
529
- name: "conv8/dw/relu"
530
- type: "ReLU"
531
- bottom: "conv8/dw"
532
- top: "conv8/dw"
533
- }
534
- layer {
535
- name: "conv8"
536
- type: "Convolution"
537
- bottom: "conv8/dw"
538
- top: "conv8"
539
- param {
540
- lr_mult: 1.0
541
- decay_mult: 1.0
542
- }
543
- param {
544
- lr_mult: 2.0
545
- decay_mult: 0.0
546
- }
547
- convolution_param {
548
- num_output: 512
549
- kernel_size: 1
550
- weight_filler {
551
- type: "msra"
552
- }
553
- bias_filler {
554
- type: "constant"
555
- value: 0.0
556
- }
557
- }
558
- }
559
- layer {
560
- name: "conv8/relu"
561
- type: "ReLU"
562
- bottom: "conv8"
563
- top: "conv8"
564
- }
565
- layer {
566
- name: "conv9/dw"
567
- type: "Convolution"
568
- bottom: "conv8"
569
- top: "conv9/dw"
570
- param {
571
- lr_mult: 1.0
572
- decay_mult: 1.0
573
- }
574
- param {
575
- lr_mult: 2.0
576
- decay_mult: 0.0
577
- }
578
- convolution_param {
579
- num_output: 512
580
- pad: 1
581
- kernel_size: 3
582
- group: 512
583
- engine: CAFFE
584
- weight_filler {
585
- type: "msra"
586
- }
587
- bias_filler {
588
- type: "constant"
589
- value: 0.0
590
- }
591
- }
592
- }
593
- layer {
594
- name: "conv9/dw/relu"
595
- type: "ReLU"
596
- bottom: "conv9/dw"
597
- top: "conv9/dw"
598
- }
599
- layer {
600
- name: "conv9"
601
- type: "Convolution"
602
- bottom: "conv9/dw"
603
- top: "conv9"
604
- param {
605
- lr_mult: 1.0
606
- decay_mult: 1.0
607
- }
608
- param {
609
- lr_mult: 2.0
610
- decay_mult: 0.0
611
- }
612
- convolution_param {
613
- num_output: 512
614
- kernel_size: 1
615
- weight_filler {
616
- type: "msra"
617
- }
618
- bias_filler {
619
- type: "constant"
620
- value: 0.0
621
- }
622
- }
623
- }
624
- layer {
625
- name: "conv9/relu"
626
- type: "ReLU"
627
- bottom: "conv9"
628
- top: "conv9"
629
- }
630
- layer {
631
- name: "conv10/dw"
632
- type: "Convolution"
633
- bottom: "conv9"
634
- top: "conv10/dw"
635
- param {
636
- lr_mult: 1.0
637
- decay_mult: 1.0
638
- }
639
- param {
640
- lr_mult: 2.0
641
- decay_mult: 0.0
642
- }
643
- convolution_param {
644
- num_output: 512
645
- pad: 1
646
- kernel_size: 3
647
- group: 512
648
- engine: CAFFE
649
- weight_filler {
650
- type: "msra"
651
- }
652
- bias_filler {
653
- type: "constant"
654
- value: 0.0
655
- }
656
- }
657
- }
658
- layer {
659
- name: "conv10/dw/relu"
660
- type: "ReLU"
661
- bottom: "conv10/dw"
662
- top: "conv10/dw"
663
- }
664
- layer {
665
- name: "conv10"
666
- type: "Convolution"
667
- bottom: "conv10/dw"
668
- top: "conv10"
669
- param {
670
- lr_mult: 1.0
671
- decay_mult: 1.0
672
- }
673
- param {
674
- lr_mult: 2.0
675
- decay_mult: 0.0
676
- }
677
- convolution_param {
678
- num_output: 512
679
- kernel_size: 1
680
- weight_filler {
681
- type: "msra"
682
- }
683
- bias_filler {
684
- type: "constant"
685
- value: 0.0
686
- }
687
- }
688
- }
689
- layer {
690
- name: "conv10/relu"
691
- type: "ReLU"
692
- bottom: "conv10"
693
- top: "conv10"
694
- }
695
- layer {
696
- name: "conv11/dw"
697
- type: "Convolution"
698
- bottom: "conv10"
699
- top: "conv11/dw"
700
- param {
701
- lr_mult: 1.0
702
- decay_mult: 1.0
703
- }
704
- param {
705
- lr_mult: 2.0
706
- decay_mult: 0.0
707
- }
708
- convolution_param {
709
- num_output: 512
710
- pad: 1
711
- kernel_size: 3
712
- group: 512
713
- engine: CAFFE
714
- weight_filler {
715
- type: "msra"
716
- }
717
- bias_filler {
718
- type: "constant"
719
- value: 0.0
720
- }
721
- }
722
- }
723
- layer {
724
- name: "conv11/dw/relu"
725
- type: "ReLU"
726
- bottom: "conv11/dw"
727
- top: "conv11/dw"
728
- }
729
- layer {
730
- name: "conv11"
731
- type: "Convolution"
732
- bottom: "conv11/dw"
733
- top: "conv11"
734
- param {
735
- lr_mult: 1.0
736
- decay_mult: 1.0
737
- }
738
- param {
739
- lr_mult: 2.0
740
- decay_mult: 0.0
741
- }
742
- convolution_param {
743
- num_output: 512
744
- kernel_size: 1
745
- weight_filler {
746
- type: "msra"
747
- }
748
- bias_filler {
749
- type: "constant"
750
- value: 0.0
751
- }
752
- }
753
- }
754
- layer {
755
- name: "conv11/relu"
756
- type: "ReLU"
757
- bottom: "conv11"
758
- top: "conv11"
759
- }
760
- layer {
761
- name: "conv12/dw"
762
- type: "Convolution"
763
- bottom: "conv11"
764
- top: "conv12/dw"
765
- param {
766
- lr_mult: 1.0
767
- decay_mult: 1.0
768
- }
769
- param {
770
- lr_mult: 2.0
771
- decay_mult: 0.0
772
- }
773
- convolution_param {
774
- num_output: 512
775
- pad: 1
776
- kernel_size: 3
777
- stride: 2
778
- group: 512
779
- engine: CAFFE
780
- weight_filler {
781
- type: "msra"
782
- }
783
- bias_filler {
784
- type: "constant"
785
- value: 0.0
786
- }
787
- }
788
- }
789
- layer {
790
- name: "conv12/dw/relu"
791
- type: "ReLU"
792
- bottom: "conv12/dw"
793
- top: "conv12/dw"
794
- }
795
- layer {
796
- name: "conv12"
797
- type: "Convolution"
798
- bottom: "conv12/dw"
799
- top: "conv12"
800
- param {
801
- lr_mult: 1.0
802
- decay_mult: 1.0
803
- }
804
- param {
805
- lr_mult: 2.0
806
- decay_mult: 0.0
807
- }
808
- convolution_param {
809
- num_output: 1024
810
- kernel_size: 1
811
- weight_filler {
812
- type: "msra"
813
- }
814
- bias_filler {
815
- type: "constant"
816
- value: 0.0
817
- }
818
- }
819
- }
820
- layer {
821
- name: "conv12/relu"
822
- type: "ReLU"
823
- bottom: "conv12"
824
- top: "conv12"
825
- }
826
- layer {
827
- name: "conv13/dw"
828
- type: "Convolution"
829
- bottom: "conv12"
830
- top: "conv13/dw"
831
- param {
832
- lr_mult: 1.0
833
- decay_mult: 1.0
834
- }
835
- param {
836
- lr_mult: 2.0
837
- decay_mult: 0.0
838
- }
839
- convolution_param {
840
- num_output: 1024
841
- pad: 1
842
- kernel_size: 3
843
- group: 1024
844
- engine: CAFFE
845
- weight_filler {
846
- type: "msra"
847
- }
848
- bias_filler {
849
- type: "constant"
850
- value: 0.0
851
- }
852
- }
853
- }
854
- layer {
855
- name: "conv13/dw/relu"
856
- type: "ReLU"
857
- bottom: "conv13/dw"
858
- top: "conv13/dw"
859
- }
860
- layer {
861
- name: "conv13"
862
- type: "Convolution"
863
- bottom: "conv13/dw"
864
- top: "conv13"
865
- param {
866
- lr_mult: 1.0
867
- decay_mult: 1.0
868
- }
869
- param {
870
- lr_mult: 2.0
871
- decay_mult: 0.0
872
- }
873
- convolution_param {
874
- num_output: 1024
875
- kernel_size: 1
876
- weight_filler {
877
- type: "msra"
878
- }
879
- bias_filler {
880
- type: "constant"
881
- value: 0.0
882
- }
883
- }
884
- }
885
- layer {
886
- name: "conv13/relu"
887
- type: "ReLU"
888
- bottom: "conv13"
889
- top: "conv13"
890
- }
891
- layer {
892
- name: "conv14_1"
893
- type: "Convolution"
894
- bottom: "conv13"
895
- top: "conv14_1"
896
- param {
897
- lr_mult: 1.0
898
- decay_mult: 1.0
899
- }
900
- param {
901
- lr_mult: 2.0
902
- decay_mult: 0.0
903
- }
904
- convolution_param {
905
- num_output: 256
906
- kernel_size: 1
907
- weight_filler {
908
- type: "msra"
909
- }
910
- bias_filler {
911
- type: "constant"
912
- value: 0.0
913
- }
914
- }
915
- }
916
- layer {
917
- name: "conv14_1/relu"
918
- type: "ReLU"
919
- bottom: "conv14_1"
920
- top: "conv14_1"
921
- }
922
- layer {
923
- name: "conv14_2"
924
- type: "Convolution"
925
- bottom: "conv14_1"
926
- top: "conv14_2"
927
- param {
928
- lr_mult: 1.0
929
- decay_mult: 1.0
930
- }
931
- param {
932
- lr_mult: 2.0
933
- decay_mult: 0.0
934
- }
935
- convolution_param {
936
- num_output: 512
937
- pad: 1
938
- kernel_size: 3
939
- stride: 2
940
- weight_filler {
941
- type: "msra"
942
- }
943
- bias_filler {
944
- type: "constant"
945
- value: 0.0
946
- }
947
- }
948
- }
949
- layer {
950
- name: "conv14_2/relu"
951
- type: "ReLU"
952
- bottom: "conv14_2"
953
- top: "conv14_2"
954
- }
955
- layer {
956
- name: "conv15_1"
957
- type: "Convolution"
958
- bottom: "conv14_2"
959
- top: "conv15_1"
960
- param {
961
- lr_mult: 1.0
962
- decay_mult: 1.0
963
- }
964
- param {
965
- lr_mult: 2.0
966
- decay_mult: 0.0
967
- }
968
- convolution_param {
969
- num_output: 128
970
- kernel_size: 1
971
- weight_filler {
972
- type: "msra"
973
- }
974
- bias_filler {
975
- type: "constant"
976
- value: 0.0
977
- }
978
- }
979
- }
980
- layer {
981
- name: "conv15_1/relu"
982
- type: "ReLU"
983
- bottom: "conv15_1"
984
- top: "conv15_1"
985
- }
986
- layer {
987
- name: "conv15_2"
988
- type: "Convolution"
989
- bottom: "conv15_1"
990
- top: "conv15_2"
991
- param {
992
- lr_mult: 1.0
993
- decay_mult: 1.0
994
- }
995
- param {
996
- lr_mult: 2.0
997
- decay_mult: 0.0
998
- }
999
- convolution_param {
1000
- num_output: 256
1001
- pad: 1
1002
- kernel_size: 3
1003
- stride: 2
1004
- weight_filler {
1005
- type: "msra"
1006
- }
1007
- bias_filler {
1008
- type: "constant"
1009
- value: 0.0
1010
- }
1011
- }
1012
- }
1013
- layer {
1014
- name: "conv15_2/relu"
1015
- type: "ReLU"
1016
- bottom: "conv15_2"
1017
- top: "conv15_2"
1018
- }
1019
- layer {
1020
- name: "conv16_1"
1021
- type: "Convolution"
1022
- bottom: "conv15_2"
1023
- top: "conv16_1"
1024
- param {
1025
- lr_mult: 1.0
1026
- decay_mult: 1.0
1027
- }
1028
- param {
1029
- lr_mult: 2.0
1030
- decay_mult: 0.0
1031
- }
1032
- convolution_param {
1033
- num_output: 128
1034
- kernel_size: 1
1035
- weight_filler {
1036
- type: "msra"
1037
- }
1038
- bias_filler {
1039
- type: "constant"
1040
- value: 0.0
1041
- }
1042
- }
1043
- }
1044
- layer {
1045
- name: "conv16_1/relu"
1046
- type: "ReLU"
1047
- bottom: "conv16_1"
1048
- top: "conv16_1"
1049
- }
1050
- layer {
1051
- name: "conv16_2"
1052
- type: "Convolution"
1053
- bottom: "conv16_1"
1054
- top: "conv16_2"
1055
- param {
1056
- lr_mult: 1.0
1057
- decay_mult: 1.0
1058
- }
1059
- param {
1060
- lr_mult: 2.0
1061
- decay_mult: 0.0
1062
- }
1063
- convolution_param {
1064
- num_output: 256
1065
- pad: 1
1066
- kernel_size: 3
1067
- stride: 2
1068
- weight_filler {
1069
- type: "msra"
1070
- }
1071
- bias_filler {
1072
- type: "constant"
1073
- value: 0.0
1074
- }
1075
- }
1076
- }
1077
- layer {
1078
- name: "conv16_2/relu"
1079
- type: "ReLU"
1080
- bottom: "conv16_2"
1081
- top: "conv16_2"
1082
- }
1083
- layer {
1084
- name: "conv17_1"
1085
- type: "Convolution"
1086
- bottom: "conv16_2"
1087
- top: "conv17_1"
1088
- param {
1089
- lr_mult: 1.0
1090
- decay_mult: 1.0
1091
- }
1092
- param {
1093
- lr_mult: 2.0
1094
- decay_mult: 0.0
1095
- }
1096
- convolution_param {
1097
- num_output: 64
1098
- kernel_size: 1
1099
- weight_filler {
1100
- type: "msra"
1101
- }
1102
- bias_filler {
1103
- type: "constant"
1104
- value: 0.0
1105
- }
1106
- }
1107
- }
1108
- layer {
1109
- name: "conv17_1/relu"
1110
- type: "ReLU"
1111
- bottom: "conv17_1"
1112
- top: "conv17_1"
1113
- }
1114
- layer {
1115
- name: "conv17_2"
1116
- type: "Convolution"
1117
- bottom: "conv17_1"
1118
- top: "conv17_2"
1119
- param {
1120
- lr_mult: 1.0
1121
- decay_mult: 1.0
1122
- }
1123
- param {
1124
- lr_mult: 2.0
1125
- decay_mult: 0.0
1126
- }
1127
- convolution_param {
1128
- num_output: 128
1129
- pad: 1
1130
- kernel_size: 3
1131
- stride: 2
1132
- weight_filler {
1133
- type: "msra"
1134
- }
1135
- bias_filler {
1136
- type: "constant"
1137
- value: 0.0
1138
- }
1139
- }
1140
- }
1141
- layer {
1142
- name: "conv17_2/relu"
1143
- type: "ReLU"
1144
- bottom: "conv17_2"
1145
- top: "conv17_2"
1146
- }
1147
- layer {
1148
- name: "conv11_mbox_loc"
1149
- type: "Convolution"
1150
- bottom: "conv11"
1151
- top: "conv11_mbox_loc"
1152
- param {
1153
- lr_mult: 1.0
1154
- decay_mult: 1.0
1155
- }
1156
- param {
1157
- lr_mult: 2.0
1158
- decay_mult: 0.0
1159
- }
1160
- convolution_param {
1161
- num_output: 12
1162
- kernel_size: 1
1163
- weight_filler {
1164
- type: "msra"
1165
- }
1166
- bias_filler {
1167
- type: "constant"
1168
- value: 0.0
1169
- }
1170
- }
1171
- }
1172
- layer {
1173
- name: "conv11_mbox_loc_perm"
1174
- type: "Permute"
1175
- bottom: "conv11_mbox_loc"
1176
- top: "conv11_mbox_loc_perm"
1177
- permute_param {
1178
- order: 0
1179
- order: 2
1180
- order: 3
1181
- order: 1
1182
- }
1183
- }
1184
- layer {
1185
- name: "conv11_mbox_loc_flat"
1186
- type: "Flatten"
1187
- bottom: "conv11_mbox_loc_perm"
1188
- top: "conv11_mbox_loc_flat"
1189
- flatten_param {
1190
- axis: 1
1191
- }
1192
- }
1193
- layer {
1194
- name: "conv11_mbox_conf"
1195
- type: "Convolution"
1196
- bottom: "conv11"
1197
- top: "conv11_mbox_conf"
1198
- param {
1199
- lr_mult: 1.0
1200
- decay_mult: 1.0
1201
- }
1202
- param {
1203
- lr_mult: 2.0
1204
- decay_mult: 0.0
1205
- }
1206
- convolution_param {
1207
- num_output: 63
1208
- kernel_size: 1
1209
- weight_filler {
1210
- type: "msra"
1211
- }
1212
- bias_filler {
1213
- type: "constant"
1214
- value: 0.0
1215
- }
1216
- }
1217
- }
1218
- layer {
1219
- name: "conv11_mbox_conf_perm"
1220
- type: "Permute"
1221
- bottom: "conv11_mbox_conf"
1222
- top: "conv11_mbox_conf_perm"
1223
- permute_param {
1224
- order: 0
1225
- order: 2
1226
- order: 3
1227
- order: 1
1228
- }
1229
- }
1230
- layer {
1231
- name: "conv11_mbox_conf_flat"
1232
- type: "Flatten"
1233
- bottom: "conv11_mbox_conf_perm"
1234
- top: "conv11_mbox_conf_flat"
1235
- flatten_param {
1236
- axis: 1
1237
- }
1238
- }
1239
- layer {
1240
- name: "conv11_mbox_priorbox"
1241
- type: "PriorBox"
1242
- bottom: "conv11"
1243
- bottom: "data"
1244
- top: "conv11_mbox_priorbox"
1245
- prior_box_param {
1246
- min_size: 60.0
1247
- aspect_ratio: 2.0
1248
- flip: true
1249
- clip: false
1250
- variance: 0.1
1251
- variance: 0.1
1252
- variance: 0.2
1253
- variance: 0.2
1254
- offset: 0.5
1255
- }
1256
- }
1257
- layer {
1258
- name: "conv13_mbox_loc"
1259
- type: "Convolution"
1260
- bottom: "conv13"
1261
- top: "conv13_mbox_loc"
1262
- param {
1263
- lr_mult: 1.0
1264
- decay_mult: 1.0
1265
- }
1266
- param {
1267
- lr_mult: 2.0
1268
- decay_mult: 0.0
1269
- }
1270
- convolution_param {
1271
- num_output: 24
1272
- kernel_size: 1
1273
- weight_filler {
1274
- type: "msra"
1275
- }
1276
- bias_filler {
1277
- type: "constant"
1278
- value: 0.0
1279
- }
1280
- }
1281
- }
1282
- layer {
1283
- name: "conv13_mbox_loc_perm"
1284
- type: "Permute"
1285
- bottom: "conv13_mbox_loc"
1286
- top: "conv13_mbox_loc_perm"
1287
- permute_param {
1288
- order: 0
1289
- order: 2
1290
- order: 3
1291
- order: 1
1292
- }
1293
- }
1294
- layer {
1295
- name: "conv13_mbox_loc_flat"
1296
- type: "Flatten"
1297
- bottom: "conv13_mbox_loc_perm"
1298
- top: "conv13_mbox_loc_flat"
1299
- flatten_param {
1300
- axis: 1
1301
- }
1302
- }
1303
- layer {
1304
- name: "conv13_mbox_conf"
1305
- type: "Convolution"
1306
- bottom: "conv13"
1307
- top: "conv13_mbox_conf"
1308
- param {
1309
- lr_mult: 1.0
1310
- decay_mult: 1.0
1311
- }
1312
- param {
1313
- lr_mult: 2.0
1314
- decay_mult: 0.0
1315
- }
1316
- convolution_param {
1317
- num_output: 126
1318
- kernel_size: 1
1319
- weight_filler {
1320
- type: "msra"
1321
- }
1322
- bias_filler {
1323
- type: "constant"
1324
- value: 0.0
1325
- }
1326
- }
1327
- }
1328
- layer {
1329
- name: "conv13_mbox_conf_perm"
1330
- type: "Permute"
1331
- bottom: "conv13_mbox_conf"
1332
- top: "conv13_mbox_conf_perm"
1333
- permute_param {
1334
- order: 0
1335
- order: 2
1336
- order: 3
1337
- order: 1
1338
- }
1339
- }
1340
- layer {
1341
- name: "conv13_mbox_conf_flat"
1342
- type: "Flatten"
1343
- bottom: "conv13_mbox_conf_perm"
1344
- top: "conv13_mbox_conf_flat"
1345
- flatten_param {
1346
- axis: 1
1347
- }
1348
- }
1349
- layer {
1350
- name: "conv13_mbox_priorbox"
1351
- type: "PriorBox"
1352
- bottom: "conv13"
1353
- bottom: "data"
1354
- top: "conv13_mbox_priorbox"
1355
- prior_box_param {
1356
- min_size: 105.0
1357
- max_size: 150.0
1358
- aspect_ratio: 2.0
1359
- aspect_ratio: 3.0
1360
- flip: true
1361
- clip: false
1362
- variance: 0.1
1363
- variance: 0.1
1364
- variance: 0.2
1365
- variance: 0.2
1366
- offset: 0.5
1367
- }
1368
- }
1369
- layer {
1370
- name: "conv14_2_mbox_loc"
1371
- type: "Convolution"
1372
- bottom: "conv14_2"
1373
- top: "conv14_2_mbox_loc"
1374
- param {
1375
- lr_mult: 1.0
1376
- decay_mult: 1.0
1377
- }
1378
- param {
1379
- lr_mult: 2.0
1380
- decay_mult: 0.0
1381
- }
1382
- convolution_param {
1383
- num_output: 24
1384
- kernel_size: 1
1385
- weight_filler {
1386
- type: "msra"
1387
- }
1388
- bias_filler {
1389
- type: "constant"
1390
- value: 0.0
1391
- }
1392
- }
1393
- }
1394
- layer {
1395
- name: "conv14_2_mbox_loc_perm"
1396
- type: "Permute"
1397
- bottom: "conv14_2_mbox_loc"
1398
- top: "conv14_2_mbox_loc_perm"
1399
- permute_param {
1400
- order: 0
1401
- order: 2
1402
- order: 3
1403
- order: 1
1404
- }
1405
- }
1406
- layer {
1407
- name: "conv14_2_mbox_loc_flat"
1408
- type: "Flatten"
1409
- bottom: "conv14_2_mbox_loc_perm"
1410
- top: "conv14_2_mbox_loc_flat"
1411
- flatten_param {
1412
- axis: 1
1413
- }
1414
- }
1415
- layer {
1416
- name: "conv14_2_mbox_conf"
1417
- type: "Convolution"
1418
- bottom: "conv14_2"
1419
- top: "conv14_2_mbox_conf"
1420
- param {
1421
- lr_mult: 1.0
1422
- decay_mult: 1.0
1423
- }
1424
- param {
1425
- lr_mult: 2.0
1426
- decay_mult: 0.0
1427
- }
1428
- convolution_param {
1429
- num_output: 126
1430
- kernel_size: 1
1431
- weight_filler {
1432
- type: "msra"
1433
- }
1434
- bias_filler {
1435
- type: "constant"
1436
- value: 0.0
1437
- }
1438
- }
1439
- }
1440
- layer {
1441
- name: "conv14_2_mbox_conf_perm"
1442
- type: "Permute"
1443
- bottom: "conv14_2_mbox_conf"
1444
- top: "conv14_2_mbox_conf_perm"
1445
- permute_param {
1446
- order: 0
1447
- order: 2
1448
- order: 3
1449
- order: 1
1450
- }
1451
- }
1452
- layer {
1453
- name: "conv14_2_mbox_conf_flat"
1454
- type: "Flatten"
1455
- bottom: "conv14_2_mbox_conf_perm"
1456
- top: "conv14_2_mbox_conf_flat"
1457
- flatten_param {
1458
- axis: 1
1459
- }
1460
- }
1461
- layer {
1462
- name: "conv14_2_mbox_priorbox"
1463
- type: "PriorBox"
1464
- bottom: "conv14_2"
1465
- bottom: "data"
1466
- top: "conv14_2_mbox_priorbox"
1467
- prior_box_param {
1468
- min_size: 150.0
1469
- max_size: 195.0
1470
- aspect_ratio: 2.0
1471
- aspect_ratio: 3.0
1472
- flip: true
1473
- clip: false
1474
- variance: 0.1
1475
- variance: 0.1
1476
- variance: 0.2
1477
- variance: 0.2
1478
- offset: 0.5
1479
- }
1480
- }
1481
- layer {
1482
- name: "conv15_2_mbox_loc"
1483
- type: "Convolution"
1484
- bottom: "conv15_2"
1485
- top: "conv15_2_mbox_loc"
1486
- param {
1487
- lr_mult: 1.0
1488
- decay_mult: 1.0
1489
- }
1490
- param {
1491
- lr_mult: 2.0
1492
- decay_mult: 0.0
1493
- }
1494
- convolution_param {
1495
- num_output: 24
1496
- kernel_size: 1
1497
- weight_filler {
1498
- type: "msra"
1499
- }
1500
- bias_filler {
1501
- type: "constant"
1502
- value: 0.0
1503
- }
1504
- }
1505
- }
1506
- layer {
1507
- name: "conv15_2_mbox_loc_perm"
1508
- type: "Permute"
1509
- bottom: "conv15_2_mbox_loc"
1510
- top: "conv15_2_mbox_loc_perm"
1511
- permute_param {
1512
- order: 0
1513
- order: 2
1514
- order: 3
1515
- order: 1
1516
- }
1517
- }
1518
- layer {
1519
- name: "conv15_2_mbox_loc_flat"
1520
- type: "Flatten"
1521
- bottom: "conv15_2_mbox_loc_perm"
1522
- top: "conv15_2_mbox_loc_flat"
1523
- flatten_param {
1524
- axis: 1
1525
- }
1526
- }
1527
- layer {
1528
- name: "conv15_2_mbox_conf"
1529
- type: "Convolution"
1530
- bottom: "conv15_2"
1531
- top: "conv15_2_mbox_conf"
1532
- param {
1533
- lr_mult: 1.0
1534
- decay_mult: 1.0
1535
- }
1536
- param {
1537
- lr_mult: 2.0
1538
- decay_mult: 0.0
1539
- }
1540
- convolution_param {
1541
- num_output: 126
1542
- kernel_size: 1
1543
- weight_filler {
1544
- type: "msra"
1545
- }
1546
- bias_filler {
1547
- type: "constant"
1548
- value: 0.0
1549
- }
1550
- }
1551
- }
1552
- layer {
1553
- name: "conv15_2_mbox_conf_perm"
1554
- type: "Permute"
1555
- bottom: "conv15_2_mbox_conf"
1556
- top: "conv15_2_mbox_conf_perm"
1557
- permute_param {
1558
- order: 0
1559
- order: 2
1560
- order: 3
1561
- order: 1
1562
- }
1563
- }
1564
- layer {
1565
- name: "conv15_2_mbox_conf_flat"
1566
- type: "Flatten"
1567
- bottom: "conv15_2_mbox_conf_perm"
1568
- top: "conv15_2_mbox_conf_flat"
1569
- flatten_param {
1570
- axis: 1
1571
- }
1572
- }
1573
- layer {
1574
- name: "conv15_2_mbox_priorbox"
1575
- type: "PriorBox"
1576
- bottom: "conv15_2"
1577
- bottom: "data"
1578
- top: "conv15_2_mbox_priorbox"
1579
- prior_box_param {
1580
- min_size: 195.0
1581
- max_size: 240.0
1582
- aspect_ratio: 2.0
1583
- aspect_ratio: 3.0
1584
- flip: true
1585
- clip: false
1586
- variance: 0.1
1587
- variance: 0.1
1588
- variance: 0.2
1589
- variance: 0.2
1590
- offset: 0.5
1591
- }
1592
- }
1593
- layer {
1594
- name: "conv16_2_mbox_loc"
1595
- type: "Convolution"
1596
- bottom: "conv16_2"
1597
- top: "conv16_2_mbox_loc"
1598
- param {
1599
- lr_mult: 1.0
1600
- decay_mult: 1.0
1601
- }
1602
- param {
1603
- lr_mult: 2.0
1604
- decay_mult: 0.0
1605
- }
1606
- convolution_param {
1607
- num_output: 24
1608
- kernel_size: 1
1609
- weight_filler {
1610
- type: "msra"
1611
- }
1612
- bias_filler {
1613
- type: "constant"
1614
- value: 0.0
1615
- }
1616
- }
1617
- }
1618
- layer {
1619
- name: "conv16_2_mbox_loc_perm"
1620
- type: "Permute"
1621
- bottom: "conv16_2_mbox_loc"
1622
- top: "conv16_2_mbox_loc_perm"
1623
- permute_param {
1624
- order: 0
1625
- order: 2
1626
- order: 3
1627
- order: 1
1628
- }
1629
- }
1630
- layer {
1631
- name: "conv16_2_mbox_loc_flat"
1632
- type: "Flatten"
1633
- bottom: "conv16_2_mbox_loc_perm"
1634
- top: "conv16_2_mbox_loc_flat"
1635
- flatten_param {
1636
- axis: 1
1637
- }
1638
- }
1639
- layer {
1640
- name: "conv16_2_mbox_conf"
1641
- type: "Convolution"
1642
- bottom: "conv16_2"
1643
- top: "conv16_2_mbox_conf"
1644
- param {
1645
- lr_mult: 1.0
1646
- decay_mult: 1.0
1647
- }
1648
- param {
1649
- lr_mult: 2.0
1650
- decay_mult: 0.0
1651
- }
1652
- convolution_param {
1653
- num_output: 126
1654
- kernel_size: 1
1655
- weight_filler {
1656
- type: "msra"
1657
- }
1658
- bias_filler {
1659
- type: "constant"
1660
- value: 0.0
1661
- }
1662
- }
1663
- }
1664
- layer {
1665
- name: "conv16_2_mbox_conf_perm"
1666
- type: "Permute"
1667
- bottom: "conv16_2_mbox_conf"
1668
- top: "conv16_2_mbox_conf_perm"
1669
- permute_param {
1670
- order: 0
1671
- order: 2
1672
- order: 3
1673
- order: 1
1674
- }
1675
- }
1676
- layer {
1677
- name: "conv16_2_mbox_conf_flat"
1678
- type: "Flatten"
1679
- bottom: "conv16_2_mbox_conf_perm"
1680
- top: "conv16_2_mbox_conf_flat"
1681
- flatten_param {
1682
- axis: 1
1683
- }
1684
- }
1685
- layer {
1686
- name: "conv16_2_mbox_priorbox"
1687
- type: "PriorBox"
1688
- bottom: "conv16_2"
1689
- bottom: "data"
1690
- top: "conv16_2_mbox_priorbox"
1691
- prior_box_param {
1692
- min_size: 240.0
1693
- max_size: 285.0
1694
- aspect_ratio: 2.0
1695
- aspect_ratio: 3.0
1696
- flip: true
1697
- clip: false
1698
- variance: 0.1
1699
- variance: 0.1
1700
- variance: 0.2
1701
- variance: 0.2
1702
- offset: 0.5
1703
- }
1704
- }
1705
- layer {
1706
- name: "conv17_2_mbox_loc"
1707
- type: "Convolution"
1708
- bottom: "conv17_2"
1709
- top: "conv17_2_mbox_loc"
1710
- param {
1711
- lr_mult: 1.0
1712
- decay_mult: 1.0
1713
- }
1714
- param {
1715
- lr_mult: 2.0
1716
- decay_mult: 0.0
1717
- }
1718
- convolution_param {
1719
- num_output: 24
1720
- kernel_size: 1
1721
- weight_filler {
1722
- type: "msra"
1723
- }
1724
- bias_filler {
1725
- type: "constant"
1726
- value: 0.0
1727
- }
1728
- }
1729
- }
1730
- layer {
1731
- name: "conv17_2_mbox_loc_perm"
1732
- type: "Permute"
1733
- bottom: "conv17_2_mbox_loc"
1734
- top: "conv17_2_mbox_loc_perm"
1735
- permute_param {
1736
- order: 0
1737
- order: 2
1738
- order: 3
1739
- order: 1
1740
- }
1741
- }
1742
- layer {
1743
- name: "conv17_2_mbox_loc_flat"
1744
- type: "Flatten"
1745
- bottom: "conv17_2_mbox_loc_perm"
1746
- top: "conv17_2_mbox_loc_flat"
1747
- flatten_param {
1748
- axis: 1
1749
- }
1750
- }
1751
- layer {
1752
- name: "conv17_2_mbox_conf"
1753
- type: "Convolution"
1754
- bottom: "conv17_2"
1755
- top: "conv17_2_mbox_conf"
1756
- param {
1757
- lr_mult: 1.0
1758
- decay_mult: 1.0
1759
- }
1760
- param {
1761
- lr_mult: 2.0
1762
- decay_mult: 0.0
1763
- }
1764
- convolution_param {
1765
- num_output: 126
1766
- kernel_size: 1
1767
- weight_filler {
1768
- type: "msra"
1769
- }
1770
- bias_filler {
1771
- type: "constant"
1772
- value: 0.0
1773
- }
1774
- }
1775
- }
1776
- layer {
1777
- name: "conv17_2_mbox_conf_perm"
1778
- type: "Permute"
1779
- bottom: "conv17_2_mbox_conf"
1780
- top: "conv17_2_mbox_conf_perm"
1781
- permute_param {
1782
- order: 0
1783
- order: 2
1784
- order: 3
1785
- order: 1
1786
- }
1787
- }
1788
- layer {
1789
- name: "conv17_2_mbox_conf_flat"
1790
- type: "Flatten"
1791
- bottom: "conv17_2_mbox_conf_perm"
1792
- top: "conv17_2_mbox_conf_flat"
1793
- flatten_param {
1794
- axis: 1
1795
- }
1796
- }
1797
- layer {
1798
- name: "conv17_2_mbox_priorbox"
1799
- type: "PriorBox"
1800
- bottom: "conv17_2"
1801
- bottom: "data"
1802
- top: "conv17_2_mbox_priorbox"
1803
- prior_box_param {
1804
- min_size: 285.0
1805
- max_size: 300.0
1806
- aspect_ratio: 2.0
1807
- aspect_ratio: 3.0
1808
- flip: true
1809
- clip: false
1810
- variance: 0.1
1811
- variance: 0.1
1812
- variance: 0.2
1813
- variance: 0.2
1814
- offset: 0.5
1815
- }
1816
- }
1817
- layer {
1818
- name: "mbox_loc"
1819
- type: "Concat"
1820
- bottom: "conv11_mbox_loc_flat"
1821
- bottom: "conv13_mbox_loc_flat"
1822
- bottom: "conv14_2_mbox_loc_flat"
1823
- bottom: "conv15_2_mbox_loc_flat"
1824
- bottom: "conv16_2_mbox_loc_flat"
1825
- bottom: "conv17_2_mbox_loc_flat"
1826
- top: "mbox_loc"
1827
- concat_param {
1828
- axis: 1
1829
- }
1830
- }
1831
- layer {
1832
- name: "mbox_conf"
1833
- type: "Concat"
1834
- bottom: "conv11_mbox_conf_flat"
1835
- bottom: "conv13_mbox_conf_flat"
1836
- bottom: "conv14_2_mbox_conf_flat"
1837
- bottom: "conv15_2_mbox_conf_flat"
1838
- bottom: "conv16_2_mbox_conf_flat"
1839
- bottom: "conv17_2_mbox_conf_flat"
1840
- top: "mbox_conf"
1841
- concat_param {
1842
- axis: 1
1843
- }
1844
- }
1845
- layer {
1846
- name: "mbox_priorbox"
1847
- type: "Concat"
1848
- bottom: "conv11_mbox_priorbox"
1849
- bottom: "conv13_mbox_priorbox"
1850
- bottom: "conv14_2_mbox_priorbox"
1851
- bottom: "conv15_2_mbox_priorbox"
1852
- bottom: "conv16_2_mbox_priorbox"
1853
- bottom: "conv17_2_mbox_priorbox"
1854
- top: "mbox_priorbox"
1855
- concat_param {
1856
- axis: 2
1857
- }
1858
- }
1859
- layer {
1860
- name: "mbox_conf_reshape"
1861
- type: "Reshape"
1862
- bottom: "mbox_conf"
1863
- top: "mbox_conf_reshape"
1864
- reshape_param {
1865
- shape {
1866
- dim: 0
1867
- dim: -1
1868
- dim: 21
1869
- }
1870
- }
1871
- }
1872
- layer {
1873
- name: "mbox_conf_softmax"
1874
- type: "Softmax"
1875
- bottom: "mbox_conf_reshape"
1876
- top: "mbox_conf_softmax"
1877
- softmax_param {
1878
- axis: 2
1879
- }
1880
- }
1881
- layer {
1882
- name: "mbox_conf_flatten"
1883
- type: "Flatten"
1884
- bottom: "mbox_conf_softmax"
1885
- top: "mbox_conf_flatten"
1886
- flatten_param {
1887
- axis: 1
1888
- }
1889
- }
1890
- layer {
1891
- name: "detection_out"
1892
- type: "DetectionOutput"
1893
- bottom: "mbox_loc"
1894
- bottom: "mbox_conf_flatten"
1895
- bottom: "mbox_priorbox"
1896
- top: "detection_out"
1897
- include {
1898
- phase: TEST
1899
- }
1900
- detection_output_param {
1901
- num_classes: 21
1902
- share_location: true
1903
- background_label_id: 0
1904
- nms_param {
1905
- nms_threshold: 0.45
1906
- top_k: 100
1907
- }
1908
- code_type: CENTER_SIZE
1909
- keep_top_k: 100
1910
- confidence_threshold: 0.25
1911
- }
1912
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,13 +1,44 @@
1
  ---
2
- title: Webrtc
3
- emoji: 📈
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.0.0b3
8
- app_file: app.py
9
- pinned: false
10
  license: mit
 
 
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
 
 
 
 
 
 
 
 
2
  license: mit
3
+ tags:
4
+ - object-detection
5
+ - computer-vision
6
+ - yolov10
7
+ datasets:
8
+ - detection-datasets/coco
9
+ sdk: gradio
10
+ sdk_version: 5.0.0b1
11
  ---
12
 
13
+ ### Model Description
14
+ [YOLOv10: Real-Time End-to-End Object Detection](https://arxiv.org/abs/2405.14458v1)
15
+
16
+ - arXiv: https://arxiv.org/abs/2405.14458v1
17
+ - github: https://github.com/THU-MIG/yolov10
18
+
19
+ ### Installation
20
+ ```
21
+ pip install supervision git+https://github.com/THU-MIG/yolov10.git
22
+ ```
23
+
24
+ ### YOLOv10 Inference
25
+ ```python
26
+ from ultralytics import YOLOv10
27
+ import supervision as sv
28
+ import cv2
29
+
30
+ IMAGE_PATH = 'dog.jpeg'
31
+
32
+ model = YOLOv10.from_pretrained('jameslahm/yolov10{n/s/m/b/l/x}')
33
+ model.predict(IMAGE_PATH, show=True)
34
+ ```
35
+
36
+ ### BibTeX Entry and Citation Info
37
+ ```
38
+ @article{wang2024yolov10,
39
+ title={YOLOv10: Real-Time End-to-End Object Detection},
40
+ author={Wang, Ao and Chen, Hui and Liu, Lihao and Chen, Kai and Lin, Zijia and Han, Jungong and Ding, Guiguang},
41
+ journal={arXiv preprint arXiv:2405.14458},
42
+ year={2024}
43
+ }
44
+ ```
app.py CHANGED
@@ -1,10 +1,16 @@
1
  import gradio as gr
2
  import cv2
3
- import numpy as np
4
  from gradio_webrtc import WebRTC
5
- from pathlib import Path
6
  from twilio.rest import Client
7
  import os
 
 
 
 
 
 
 
8
 
9
  account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
10
  auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
@@ -17,72 +23,16 @@ rtc_configuration = {
17
  "iceTransportPolicy": "relay",
18
  }
19
 
20
- CLASSES = [
21
- "background",
22
- "aeroplane",
23
- "bicycle",
24
- "bird",
25
- "boat",
26
- "bottle",
27
- "bus",
28
- "car",
29
- "cat",
30
- "chair",
31
- "cow",
32
- "diningtable",
33
- "dog",
34
- "horse",
35
- "motorbike",
36
- "person",
37
- "pottedplant",
38
- "sheep",
39
- "sofa",
40
- "train",
41
- "tvmonitor",
42
- ]
43
- COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
44
-
45
- directory = Path(__file__).parent
46
-
47
- MODEL = str((directory / "MobileNetSSD_deploy.caffemodel").resolve())
48
- PROTOTXT = str((directory / "MobileNetSSD_deploy.prototxt.txt").resolve())
49
- net = cv2.dnn.readNetFromCaffe(PROTOTXT, MODEL)
50
 
51
 
52
  def detection(image, conf_threshold=0.3):
53
-
54
- blob = cv2.dnn.blobFromImage(
55
- cv2.resize(image, (300, 300)), 0.007843, (300, 300), 127.5
56
- )
57
- net.setInput(blob)
58
-
59
- detections = net.forward()
60
- image = cv2.resize(image, (500, 500))
61
- (h, w) = image.shape[:2]
62
- labels = []
63
- for i in np.arange(0, detections.shape[2]):
64
- confidence = detections[0, 0, i, 2]
65
-
66
- if confidence > conf_threshold:
67
- # extract the index of the class label from the `detections`,
68
- # then compute the (x, y)-coordinates of the bounding box for
69
- # the object
70
- idx = int(detections[0, 0, i, 1])
71
- box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
72
- (startX, startY, endX, endY) = box.astype("int")
73
-
74
- # display the prediction
75
- label = f"{CLASSES[idx]}: {round(confidence * 100, 2)}%"
76
- labels.append(label)
77
- cv2.rectangle(image, (startX, startY), (endX, endY), COLORS[idx], 2)
78
- y = startY - 15 if startY - 15 > 15 else startY + 15
79
- cv2.putText(
80
- image, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2
81
- )
82
- return image
83
 
84
 
85
- css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
86
  .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
87
 
88
 
@@ -90,12 +40,20 @@ with gr.Blocks(css=css) as demo:
90
  gr.HTML(
91
  """
92
  <h1 style='text-align: center'>
93
- Image Detection from Webcam Stream (powered by WebRTC ⚡️)
94
  </h1>
95
- """)
 
 
 
 
 
 
 
 
96
  with gr.Column(elem_classes=["my-column"]):
97
  with gr.Group(elem_classes=["my-group"]):
98
- image = WebRTC(label="Strean", rtc_configuration=rtc_configuration)
99
  conf_threshold = gr.Slider(
100
  label="Confidence Threshold",
101
  minimum=0.0,
@@ -103,13 +61,10 @@ with gr.Blocks(css=css) as demo:
103
  step=0.05,
104
  value=0.30,
105
  )
106
-
107
  image.webrtc_stream(
108
- fn=detection,
109
- inputs=[image],
110
- stream_every=0.05,
111
- time_limit=30
112
  )
113
 
114
- if __name__ == '__main__':
115
  demo.launch()
 
1
  import gradio as gr
2
  import cv2
3
+ from huggingface_hub import hf_hub_download
4
  from gradio_webrtc import WebRTC
 
5
  from twilio.rest import Client
6
  import os
7
+ from inference import YOLOv10
8
+
9
+ model_file = hf_hub_download(
10
+ repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
11
+ )
12
+
13
+ model = YOLOv10(model_file)
14
 
15
  account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
16
  auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
 
23
  "iceTransportPolicy": "relay",
24
  }
25
 
26
+ rtc_configuration = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
def detection(image, conf_threshold=0.3):
    """Annotate one webcam frame with the detector's predictions.

    The frame is first resized to the width/height the loaded model reports
    as its input size, then handed to ``model.detect_objects`` together with
    the confidence threshold. Whatever that call returns is returned as-is.
    """
    target_size = (model.input_width, model.input_height)
    resized = cv2.resize(image, target_size)
    return model.detect_objects(resized, conf_threshold)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
# Layout overrides for the demo page: cap the size of the stream group and
# centre the column that contains it.
# NOTE: CSS lengths need a unit -- a bare ``600`` is not a valid value for
# ``max-height`` and browsers discard the whole declaration -- and a rule must
# not be followed by a stray ``;`` after its closing brace.
css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important}"""
37
 
38
 
 
40
  gr.HTML(
41
  """
42
  <h1 style='text-align: center'>
43
+ YOLOv10 Webcam Stream
44
  </h1>
45
+ """
46
+ )
47
+ gr.HTML(
48
+ """
49
+ <h3 style='text-align: center'>
50
+ <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
51
+ </h3>
52
+ """
53
+ )
54
  with gr.Column(elem_classes=["my-column"]):
55
  with gr.Group(elem_classes=["my-group"]):
56
+ image = WebRTC(label="Stream", rtc_configuration=rtc_configuration)
57
  conf_threshold = gr.Slider(
58
  label="Confidence Threshold",
59
  minimum=0.0,
 
61
  step=0.05,
62
  value=0.30,
63
  )
64
+
65
  image.webrtc_stream(
66
+ fn=detection, inputs=[image, conf_threshold], stream_every=0.05, time_limit=30
 
 
 
67
  )
68
 
69
+ if __name__ == "__main__":
70
  demo.launch()
inference.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import cv2
3
+ import numpy as np
4
+ import onnxruntime
5
+
6
+ from utils import draw_detections
7
+
8
+
9
class YOLOv10:
    """Run a YOLOv10 ONNX model through ONNX Runtime and draw its detections.

    The session is created once in the constructor. ``detect_objects`` then
    pre-processes an image, runs the model, filters the predictions by
    confidence and returns a copy of the image with boxes and labels drawn.

    The post-processing expects the end-to-end YOLOv10 export layout, i.e.
    one row per detection of the form ``[x1, y1, x2, y2, score, class_id]``
    with corner coordinates expressed in model-input pixels.
    """

    def __init__(self, path):
        """Load the ONNX model stored at ``path``."""
        self.initialize_model(path)

    def __call__(self, image):
        """Shorthand for ``detect_objects(image)`` with the default threshold."""
        return self.detect_objects(image)

    def initialize_model(self, path):
        """Create the inference session and cache the model's I/O metadata."""
        self.session = onnxruntime.InferenceSession(
            path, providers=onnxruntime.get_available_providers()
        )
        self.get_input_details()
        self.get_output_details()

    def detect_objects(self, image, conf_threshold=0.3):
        """Return ``image`` annotated with detections scoring above the threshold."""
        input_tensor = self.prepare_input(image)
        return self.inference(image, input_tensor, conf_threshold)

    def prepare_input(self, image):
        """Convert an image into the float32 ``(1, C, H, W)`` tensor fed to the model.

        Also records the image's own height/width on the instance so that
        ``rescale_boxes`` can map predictions back onto it.
        """
        self.img_height, self.img_width = image.shape[:2]

        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        input_img = cv2.resize(input_img, (self.input_width, self.input_height))

        # Scale pixel values to [0, 1] and move channels first.
        input_img = input_img / 255.0
        input_img = input_img.transpose(2, 0, 1)
        return input_img[np.newaxis, :, :, :].astype(np.float32)

    def inference(self, image, input_tensor, conf_threshold=0.3):
        """Run the session on ``input_tensor`` and draw the results on ``image``.

        The elapsed inference time is printed to stdout.
        """
        start = time.perf_counter()
        outputs = self.session.run(
            self.output_names, {self.input_names[0]: input_tensor}
        )

        print(f"Inference time: {(time.perf_counter() - start)*1000:.2f} ms")
        boxes, scores, class_ids = self.process_output(outputs, conf_threshold)
        return self.draw_detections(image, boxes, scores, class_ids)

    def process_output(self, output, conf_threshold=0.3):
        """Filter raw model output by confidence.

        Args:
            output: Sequence of model outputs; only ``output[0]`` is used.
            conf_threshold: Detections must score strictly above this value.

        Returns:
            ``(boxes, scores, class_ids)``. When nothing passes the threshold
            three empty lists are returned; otherwise ``boxes`` is an
            ``(n, 4)`` array rescaled to the source image, ``scores`` an
            ``(n,)`` array and ``class_ids`` an ``(n,)`` integer array.
        """
        # atleast_2d keeps a single-row output 2-D after squeeze drops the
        # batch axis (squeeze alone would collapse it to 1-D).
        predictions = np.atleast_2d(np.squeeze(output[0]))

        # Keep only detections whose score exceeds the threshold.
        scores = predictions[:, 4]
        keep = scores > conf_threshold
        predictions = predictions[keep, :]
        scores = scores[keep]

        if len(scores) == 0:
            return [], [], []

        # Column 5 already holds the predicted class index. Taking an argmax
        # over columns 4: would compare the score with the class id and could
        # only ever yield 0 or 1.
        class_ids = predictions[:, 5].astype(int)

        boxes = self.extract_boxes(predictions)

        return boxes, scores, class_ids

    def extract_boxes(self, predictions):
        """Return the first four columns of ``predictions`` rescaled to the source image.

        The coordinates are used as-is (no xywh -> xyxy conversion is applied).
        """
        boxes = predictions[:, :4]
        return self.rescale_boxes(boxes)

    def rescale_boxes(self, boxes):
        """Map boxes from model-input pixels to source-image pixels."""
        input_shape = np.array(
            [self.input_width, self.input_height, self.input_width, self.input_height]
        )
        boxes = np.divide(boxes, input_shape, dtype=np.float32)
        boxes *= np.array(
            [self.img_width, self.img_height, self.img_width, self.img_height]
        )
        return boxes

    def draw_detections(self, image, boxes, scores, class_ids, draw_scores=True, mask_alpha=0.4):
        """Delegate drawing to the module-level ``draw_detections`` helper.

        ``draw_scores`` is accepted for interface compatibility but is not
        forwarded to the helper.
        """
        return draw_detections(
            image, boxes, scores, class_ids, mask_alpha
        )

    def get_input_details(self):
        """Cache the model's input names and its input height/width."""
        model_inputs = self.session.get_inputs()
        self.input_names = [model_input.name for model_input in model_inputs]

        # Height and width are read from positions 2 and 3 of the first
        # input's shape.
        self.input_shape = model_inputs[0].shape
        self.input_height = self.input_shape[2]
        self.input_width = self.input_shape[3]

    def get_output_details(self):
        """Cache the model's output names."""
        model_outputs = self.session.get_outputs()
        self.output_names = [model_output.name for model_output in model_outputs]
117
+
118
+
119
if __name__ == "__main__":
    # Manual smoke test: download a model and a sample picture, run one
    # detection pass and show the annotated result in an OpenCV window.
    import requests
    from huggingface_hub import hf_hub_download

    model_file = hf_hub_download(
        repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
    )

    detector = YOLOv10(model_file)

    # Fail fast on network problems instead of handing an error page (or
    # nothing) to the image decoder.
    response = requests.get(
        "https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg", timeout=30
    )
    response.raise_for_status()

    # Decode directly from memory so no temporary file is left behind on disk.
    img = cv2.imdecode(np.frombuffer(response.content, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise RuntimeError("Downloaded data could not be decoded as an image")

    # Detect objects
    combined_image = detector.detect_objects(img)

    # Show the annotated image until a key is pressed.
    cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
    cv2.imshow("Output", combined_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  safetensors==0.4.3
2
  opencv-python
 
3
  https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio-5.0.0b3-py3-none-any.whl
4
- https://gradio-builds.s3.amazonaws.com/webrtc/03/gradio_webrtc-0.0.1-py3-none-any.whl
5
- twilio
 
1
  safetensors==0.4.3
2
  opencv-python
3
+ twilio
4
  https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio-5.0.0b3-py3-none-any.whl
5
+ https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio_webrtc-0.0.1-py3-none-any.whl
6
+ onnxruntime
utils.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+
4
# Human-readable label for every class index the detector can emit; the
# position in the list is the class id.
class_names = [
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",
    "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake", "chair", "couch", "potted plant", "bed",
    "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",
    "toaster", "sink", "refrigerator", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

# One colour per class: a (len(class_names), 3) array of floats drawn
# uniformly from [0, 255). The generator is seeded so the palette is the same
# on every run.
rng = np.random.default_rng(3)
colors = rng.uniform(low=0, high=255, size=(len(class_names), 3))
90
+
91
+
92
def nms(boxes, scores, iou_threshold):
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it is not below ``iou_threshold``.

    Returns:
        List of indices into ``boxes`` of the kept boxes, in decreasing score
        order.
    """
    # Candidate indices, best score first.
    order = np.argsort(scores)[::-1]

    selected = []
    while order.size > 0:
        # The head of the queue is the best remaining box.
        best, rest = order[0], order[1:]
        selected.append(best)

        # Only candidates that overlap the winner less than the threshold
        # stay in the queue.
        overlap = compute_iou(boxes[best, :], boxes[rest, :])
        order = rest[overlap < iou_threshold]

    return selected
112
+
113
+
114
def multiclass_nms(boxes, scores, class_ids, iou_threshold):
    """Apply ``nms`` independently to the boxes of each class.

    Returns:
        List of indices into ``boxes`` of all kept boxes, grouped by class in
        the order produced by ``np.unique(class_ids)``.
    """
    kept = []
    for label in np.unique(class_ids):
        # Positions (in the full arrays) of the boxes belonging to this class.
        members = np.nonzero(class_ids == label)[0]

        survivors = nms(boxes[members, :], scores[members], iou_threshold)

        # nms returns positions within the per-class subset; translate them
        # back to positions in the full arrays.
        kept.extend(members[survivors])

    return kept
127
+
128
+
129
def compute_iou(box, boxes):
    """Intersection-over-union of one box against an array of boxes.

    Args:
        box: Sequence/array ``(x1, y1, x2, y2)``.
        boxes: Array of shape ``(n, 4)`` with rows ``(x1, y1, x2, y2)``.

    Returns:
        Array of shape ``(n,)`` with the IoU of ``box`` and each row. Pairs
        whose union area is not positive (e.g. two zero-area boxes) get an
        IoU of 0 instead of a division-by-zero NaN.
    """
    # Corners of the intersection rectangle.
    xmin = np.maximum(box[0], boxes[:, 0])
    ymin = np.maximum(box[1], boxes[:, 1])
    xmax = np.minimum(box[2], boxes[:, 2])
    ymax = np.minimum(box[3], boxes[:, 3])

    # Clamp at 0 so non-overlapping boxes contribute no area.
    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)

    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - intersection_area

    # Divide only where the union is positive; everything else stays 0.
    # The output dtype follows what plain true division would have produced.
    out_dtype = np.result_type(intersection_area, union_area, np.float32)
    iou = np.zeros(np.shape(union_area), dtype=out_dtype)
    np.divide(intersection_area, union_area, out=iou, where=union_area > 0)

    return iou
148
+
149
+
150
def xywh2xyxy(x):
    """Convert boxes from centre format to corner format.

    The last axis of ``x`` holds ``(cx, cy, w, h)``; the returned copy holds
    ``(x1, y1, x2, y2)`` in the same positions. ``x`` itself is not modified.
    """
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2

    corners = np.copy(x)
    corners[..., 0] = x[..., 0] - half_w
    corners[..., 1] = x[..., 1] - half_h
    corners[..., 2] = x[..., 0] + half_w
    corners[..., 3] = x[..., 1] + half_h
    return corners
158
+
159
+
160
def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
    """Return a copy of ``image`` with every detection's box and caption drawn.

    Font scale and text thickness are derived from the shorter image side.
    ``mask_alpha`` is accepted but not used here (mask drawing is disabled).
    """
    annotated = image.copy()

    height, width = image.shape[:2]
    short_side = min(height, width)
    font_size = short_side * 0.0006
    text_thickness = int(short_side * 0.001)

    # Draw bounding boxes and labels of detections
    for class_id, box, score in zip(class_ids, boxes, scores):
        color = colors[class_id]
        draw_box(annotated, box, color)

        caption = f"{class_names[class_id]} {int(score * 100)}%"
        draw_text(annotated, caption, box, color, font_size, text_thickness)

    return annotated
180
+
181
+
182
def draw_box(
    image: np.ndarray,
    box: np.ndarray,
    color: tuple[int, int, int] = (0, 0, 255),
    thickness: int = 2,
) -> np.ndarray:
    """Draw the outline of ``box`` (``x1, y1, x2, y2``) on ``image``.

    Coordinates are truncated to integers before drawing. Returns the result
    of ``cv2.rectangle``.
    """
    left, top, right, bottom = box.astype(int)
    top_left = (left, top)
    bottom_right = (right, bottom)
    return cv2.rectangle(image, top_left, bottom_right, color, thickness)
190
+
191
+
192
def draw_text(
    image: np.ndarray,
    text: str,
    box: np.ndarray,
    color: tuple[int, int, int] = (0, 0, 255),
    font_size: float = 0.001,
    text_thickness: int = 2,
) -> np.ndarray:
    """Draw ``text`` in white on a filled ``color`` label at the box's top-left corner.

    The label background extends upwards from ``(x1, y1)`` by 1.2x the
    measured text height. Returns the result of ``cv2.putText``.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    x1, y1, _, _ = box.astype(int)
    anchor = (x1, y1)

    (text_w, text_h), _ = cv2.getTextSize(
        text=text,
        fontFace=font,
        fontScale=font_size,
        thickness=text_thickness,
    )
    # A little head-room above the glyphs.
    text_h = int(text_h * 1.2)

    # Filled background for the caption.
    cv2.rectangle(image, anchor, (x1 + text_w, y1 - text_h), color, -1)

    white = (255, 255, 255)
    return cv2.putText(
        image, text, anchor, font, font_size, white, text_thickness, cv2.LINE_AA
    )
221
+
222
+
223
def draw_masks(
    image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
) -> np.ndarray:
    """Blend filled, class-coloured rectangles over ``image``.

    Each box is painted solid on a copy of the image; that copy is then mixed
    with the original using weight ``mask_alpha`` for the painted copy and
    ``1 - mask_alpha`` for the original.
    """
    overlay = image.copy()

    for box, class_id in zip(boxes, classes):
        fill = colors[class_id]
        x1, y1, x2, y2 = box.astype(int)
        # Thickness -1 asks OpenCV for a filled rectangle.
        cv2.rectangle(overlay, (x1, y1), (x2, y2), fill, -1)

    return cv2.addWeighted(overlay, mask_alpha, image, 1 - mask_alpha, 0)