first commit

Browse files

Files changed (12) hide show

1_Pooling/config.json +9 -0
README.md +1085 -1
added_tokens.json +7 -0
config.json +31 -0
config_sentence_transformers.json +7 -0
modules.json +20 -0
pytorch_model.bin +3 -0
sentence_bert_config.json +4 -0
special_tokens_map.json +14 -0
tokenizer.json +0 -0
tokenizer_config.json +71 -0
vocab.txt +0 -0

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false
+}

README.md CHANGED Viewed

@@ -1,3 +1,1087 @@
 ---
-license: apache-2.0
 ---

 ---
+tags:
+- mteb
+model-index:
+- name: xiaobu-embedding
+  results:
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/AFQMC
+      name: MTEB AFQMC
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 49.37874132528482
+    - type: cos_sim_spearman
+      value: 54.84722470052176
+    - type: euclidean_pearson
+      value: 53.0495882931575
+    - type: euclidean_spearman
+      value: 54.847727301700665
+    - type: manhattan_pearson
+      value: 53.0632140838278
+    - type: manhattan_spearman
+      value: 54.8744258024692
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/ATEC
+      name: MTEB ATEC
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 48.15992903013723
+    - type: cos_sim_spearman
+      value: 55.13198035464577
+    - type: euclidean_pearson
+      value: 55.435876753245715
+    - type: euclidean_spearman
+      value: 55.13215936702871
+    - type: manhattan_pearson
+      value: 55.41429518223402
+    - type: manhattan_spearman
+      value: 55.13363087679285
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_reviews_multi
+      name: MTEB AmazonReviewsClassification (zh)
+      config: zh
+      split: test
+      revision: 1399c76144fd37290681b995c656ef9b2e06e26d
+    metrics:
+    - type: accuracy
+      value: 46.722
+    - type: f1
+      value: 45.039340641893205
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/BQ
+      name: MTEB BQ
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 63.517830355554224
+    - type: cos_sim_spearman
+      value: 65.57007801018649
+    - type: euclidean_pearson
+      value: 64.05153340906585
+    - type: euclidean_spearman
+      value: 65.5696865661119
+    - type: manhattan_pearson
+      value: 63.95710619755406
+    - type: manhattan_spearman
+      value: 65.48565785379489
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/CLSClusteringP2P
+      name: MTEB CLSClusteringP2P
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 43.24046498507819
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/CLSClusteringS2S
+      name: MTEB CLSClusteringS2S
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 41.22618199372116
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/CMedQAv1-reranking
+      name: MTEB CMedQAv1
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: map
+      value: 87.12213224673621
+    - type: mrr
+      value: 89.57150793650794
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/CMedQAv2-reranking
+      name: MTEB CMedQAv2
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: map
+      value: 87.57290061886421
+    - type: mrr
+      value: 90.19202380952382
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/CmedqaRetrieval
+      name: MTEB CmedqaRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 25.22
+    - type: map_at_10
+      value: 37.604
+    - type: map_at_100
+      value: 39.501
+    - type: map_at_1000
+      value: 39.614
+    - type: map_at_3
+      value: 33.378
+    - type: map_at_5
+      value: 35.774
+    - type: mrr_at_1
+      value: 38.385000000000005
+    - type: mrr_at_10
+      value: 46.487
+    - type: mrr_at_100
+      value: 47.504999999999995
+    - type: mrr_at_1000
+      value: 47.548
+    - type: mrr_at_3
+      value: 43.885999999999996
+    - type: mrr_at_5
+      value: 45.373000000000005
+    - type: ndcg_at_1
+      value: 38.385000000000005
+    - type: ndcg_at_10
+      value: 44.224999999999994
+    - type: ndcg_at_100
+      value: 51.637
+    - type: ndcg_at_1000
+      value: 53.55799999999999
+    - type: ndcg_at_3
+      value: 38.845
+    - type: ndcg_at_5
+      value: 41.163
+    - type: precision_at_1
+      value: 38.385000000000005
+    - type: precision_at_10
+      value: 9.812
+    - type: precision_at_100
+      value: 1.58
+    - type: precision_at_1000
+      value: 0.183
+    - type: precision_at_3
+      value: 21.88
+    - type: precision_at_5
+      value: 15.974
+    - type: recall_at_1
+      value: 25.22
+    - type: recall_at_10
+      value: 54.897
+    - type: recall_at_100
+      value: 85.469
+    - type: recall_at_1000
+      value: 98.18599999999999
+    - type: recall_at_3
+      value: 38.815
+    - type: recall_at_5
+      value: 45.885
+  - task:
+      type: PairClassification
+    dataset:
+      type: C-MTEB/CMNLI
+      name: MTEB Cmnli
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: cos_sim_accuracy
+      value: 83.22309079975948
+    - type: cos_sim_ap
+      value: 89.94833400328307
+    - type: cos_sim_f1
+      value: 84.39319055464031
+    - type: cos_sim_precision
+      value: 79.5774647887324
+    - type: cos_sim_recall
+      value: 89.82931961655366
+    - type: dot_accuracy
+      value: 83.22309079975948
+    - type: dot_ap
+      value: 89.95618559578415
+    - type: dot_f1
+      value: 84.41173239591345
+    - type: dot_precision
+      value: 79.61044343141317
+    - type: dot_recall
+      value: 89.82931961655366
+    - type: euclidean_accuracy
+      value: 83.23511725796753
+    - type: euclidean_ap
+      value: 89.94836342787318
+    - type: euclidean_f1
+      value: 84.40550133096718
+    - type: euclidean_precision
+      value: 80.29120067524794
+    - type: euclidean_recall
+      value: 88.9642272620996
+    - type: manhattan_accuracy
+      value: 83.23511725796753
+    - type: manhattan_ap
+      value: 89.9450103956978
+    - type: manhattan_f1
+      value: 84.44444444444444
+    - type: manhattan_precision
+      value: 80.09647651006712
+    - type: manhattan_recall
+      value: 89.29155950432546
+    - type: max_accuracy
+      value: 83.23511725796753
+    - type: max_ap
+      value: 89.95618559578415
+    - type: max_f1
+      value: 84.44444444444444
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/CovidRetrieval
+      name: MTEB CovidRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 76.87
+    - type: map_at_10
+      value: 84.502
+    - type: map_at_100
+      value: 84.615
+    - type: map_at_1000
+      value: 84.617
+    - type: map_at_3
+      value: 83.127
+    - type: map_at_5
+      value: 83.99600000000001
+    - type: mrr_at_1
+      value: 77.02799999999999
+    - type: mrr_at_10
+      value: 84.487
+    - type: mrr_at_100
+      value: 84.59299999999999
+    - type: mrr_at_1000
+      value: 84.59400000000001
+    - type: mrr_at_3
+      value: 83.193
+    - type: mrr_at_5
+      value: 83.994
+    - type: ndcg_at_1
+      value: 77.134
+    - type: ndcg_at_10
+      value: 87.68599999999999
+    - type: ndcg_at_100
+      value: 88.17099999999999
+    - type: ndcg_at_1000
+      value: 88.21
+    - type: ndcg_at_3
+      value: 84.993
+    - type: ndcg_at_5
+      value: 86.519
+    - type: precision_at_1
+      value: 77.134
+    - type: precision_at_10
+      value: 9.841999999999999
+    - type: precision_at_100
+      value: 1.006
+    - type: precision_at_1000
+      value: 0.101
+    - type: precision_at_3
+      value: 30.313000000000002
+    - type: precision_at_5
+      value: 18.945999999999998
+    - type: recall_at_1
+      value: 76.87
+    - type: recall_at_10
+      value: 97.418
+    - type: recall_at_100
+      value: 99.579
+    - type: recall_at_1000
+      value: 99.895
+    - type: recall_at_3
+      value: 90.227
+    - type: recall_at_5
+      value: 93.888
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/DuRetrieval
+      name: MTEB DuRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 25.941
+    - type: map_at_10
+      value: 78.793
+    - type: map_at_100
+      value: 81.57799999999999
+    - type: map_at_1000
+      value: 81.626
+    - type: map_at_3
+      value: 54.749
+    - type: map_at_5
+      value: 69.16
+    - type: mrr_at_1
+      value: 90.45
+    - type: mrr_at_10
+      value: 93.406
+    - type: mrr_at_100
+      value: 93.453
+    - type: mrr_at_1000
+      value: 93.45700000000001
+    - type: mrr_at_3
+      value: 93.10000000000001
+    - type: mrr_at_5
+      value: 93.27499999999999
+    - type: ndcg_at_1
+      value: 90.45
+    - type: ndcg_at_10
+      value: 86.44500000000001
+    - type: ndcg_at_100
+      value: 89.28399999999999
+    - type: ndcg_at_1000
+      value: 89.739
+    - type: ndcg_at_3
+      value: 85.62100000000001
+    - type: ndcg_at_5
+      value: 84.441
+    - type: precision_at_1
+      value: 90.45
+    - type: precision_at_10
+      value: 41.19
+    - type: precision_at_100
+      value: 4.761
+    - type: precision_at_1000
+      value: 0.48700000000000004
+    - type: precision_at_3
+      value: 76.583
+    - type: precision_at_5
+      value: 64.68
+    - type: recall_at_1
+      value: 25.941
+    - type: recall_at_10
+      value: 87.443
+    - type: recall_at_100
+      value: 96.54
+    - type: recall_at_1000
+      value: 98.906
+    - type: recall_at_3
+      value: 56.947
+    - type: recall_at_5
+      value: 73.714
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/EcomRetrieval
+      name: MTEB EcomRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 52.900000000000006
+    - type: map_at_10
+      value: 63.144
+    - type: map_at_100
+      value: 63.634
+    - type: map_at_1000
+      value: 63.644999999999996
+    - type: map_at_3
+      value: 60.817
+    - type: map_at_5
+      value: 62.202
+    - type: mrr_at_1
+      value: 52.900000000000006
+    - type: mrr_at_10
+      value: 63.144
+    - type: mrr_at_100
+      value: 63.634
+    - type: mrr_at_1000
+      value: 63.644999999999996
+    - type: mrr_at_3
+      value: 60.817
+    - type: mrr_at_5
+      value: 62.202
+    - type: ndcg_at_1
+      value: 52.900000000000006
+    - type: ndcg_at_10
+      value: 68.042
+    - type: ndcg_at_100
+      value: 70.417
+    - type: ndcg_at_1000
+      value: 70.722
+    - type: ndcg_at_3
+      value: 63.287000000000006
+    - type: ndcg_at_5
+      value: 65.77
+    - type: precision_at_1
+      value: 52.900000000000006
+    - type: precision_at_10
+      value: 8.34
+    - type: precision_at_100
+      value: 0.9450000000000001
+    - type: precision_at_1000
+      value: 0.097
+    - type: precision_at_3
+      value: 23.467
+    - type: precision_at_5
+      value: 15.28
+    - type: recall_at_1
+      value: 52.900000000000006
+    - type: recall_at_10
+      value: 83.39999999999999
+    - type: recall_at_100
+      value: 94.5
+    - type: recall_at_1000
+      value: 96.89999999999999
+    - type: recall_at_3
+      value: 70.39999999999999
+    - type: recall_at_5
+      value: 76.4
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/IFlyTek-classification
+      name: MTEB IFlyTek
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 49.74220854174683
+    - type: f1
+      value: 38.01399980618159
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/JDReview-classification
+      name: MTEB JDReview
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 86.73545966228893
+    - type: ap
+      value: 55.72394235169542
+    - type: f1
+      value: 81.58550390953492
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/LCQMC
+      name: MTEB LCQMC
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 69.96711977441642
+    - type: cos_sim_spearman
+      value: 75.54747609685569
+    - type: euclidean_pearson
+      value: 74.62663478056035
+    - type: euclidean_spearman
+      value: 75.54761576699639
+    - type: manhattan_pearson
+      value: 74.60983904582241
+    - type: manhattan_spearman
+      value: 75.52758938061503
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/Mmarco-reranking
+      name: MTEB MMarcoReranking
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map
+      value: 28.076927649720986
+    - type: mrr
+      value: 26.98015873015873
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/MMarcoRetrieval
+      name: MTEB MMarcoRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 65.58
+    - type: map_at_10
+      value: 74.763
+    - type: map_at_100
+      value: 75.077
+    - type: map_at_1000
+      value: 75.091
+    - type: map_at_3
+      value: 72.982
+    - type: map_at_5
+      value: 74.155
+    - type: mrr_at_1
+      value: 67.822
+    - type: mrr_at_10
+      value: 75.437
+    - type: mrr_at_100
+      value: 75.702
+    - type: mrr_at_1000
+      value: 75.715
+    - type: mrr_at_3
+      value: 73.91799999999999
+    - type: mrr_at_5
+      value: 74.909
+    - type: ndcg_at_1
+      value: 67.822
+    - type: ndcg_at_10
+      value: 78.472
+    - type: ndcg_at_100
+      value: 79.891
+    - type: ndcg_at_1000
+      value: 80.262
+    - type: ndcg_at_3
+      value: 75.138
+    - type: ndcg_at_5
+      value: 77.094
+    - type: precision_at_1
+      value: 67.822
+    - type: precision_at_10
+      value: 9.474
+    - type: precision_at_100
+      value: 1.019
+    - type: precision_at_1000
+      value: 0.105
+    - type: precision_at_3
+      value: 28.281
+    - type: precision_at_5
+      value: 18.017
+    - type: recall_at_1
+      value: 65.58
+    - type: recall_at_10
+      value: 89.18599999999999
+    - type: recall_at_100
+      value: 95.64399999999999
+    - type: recall_at_1000
+      value: 98.541
+    - type: recall_at_3
+      value: 80.455
+    - type: recall_at_5
+      value: 85.063
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_massive_intent
+      name: MTEB MassiveIntentClassification (zh-CN)
+      config: zh-CN
+      split: test
+      revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
+    metrics:
+    - type: accuracy
+      value: 72.86819098856758
+    - type: f1
+      value: 70.25369778283451
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_massive_scenario
+      name: MTEB MassiveScenarioClassification (zh-CN)
+      config: zh-CN
+      split: test
+      revision: 7d571f92784cd94a019292a1f45445077d0ef634
+    metrics:
+    - type: accuracy
+      value: 75.46738399462004
+    - type: f1
+      value: 75.02466838130249
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/MedicalRetrieval
+      name: MTEB MedicalRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 53.300000000000004
+    - type: map_at_10
+      value: 60.072
+    - type: map_at_100
+      value: 60.618
+    - type: map_at_1000
+      value: 60.659
+    - type: map_at_3
+      value: 58.550000000000004
+    - type: map_at_5
+      value: 59.425
+    - type: mrr_at_1
+      value: 53.5
+    - type: mrr_at_10
+      value: 60.187999999999995
+    - type: mrr_at_100
+      value: 60.73499999999999
+    - type: mrr_at_1000
+      value: 60.775999999999996
+    - type: mrr_at_3
+      value: 58.667
+    - type: mrr_at_5
+      value: 59.541999999999994
+    - type: ndcg_at_1
+      value: 53.300000000000004
+    - type: ndcg_at_10
+      value: 63.376999999999995
+    - type: ndcg_at_100
+      value: 66.24600000000001
+    - type: ndcg_at_1000
+      value: 67.408
+    - type: ndcg_at_3
+      value: 60.211000000000006
+    - type: ndcg_at_5
+      value: 61.781
+    - type: precision_at_1
+      value: 53.300000000000004
+    - type: precision_at_10
+      value: 7.380000000000001
+    - type: precision_at_100
+      value: 0.877
+    - type: precision_at_1000
+      value: 0.097
+    - type: precision_at_3
+      value: 21.667
+    - type: precision_at_5
+      value: 13.76
+    - type: recall_at_1
+      value: 53.300000000000004
+    - type: recall_at_10
+      value: 73.8
+    - type: recall_at_100
+      value: 87.7
+    - type: recall_at_1000
+      value: 97.0
+    - type: recall_at_3
+      value: 65.0
+    - type: recall_at_5
+      value: 68.8
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/MultilingualSentiment-classification
+      name: MTEB MultilingualSentiment
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 76.27666666666667
+    - type: f1
+      value: 76.31280038435165
+  - task:
+      type: PairClassification
+    dataset:
+      type: C-MTEB/OCNLI
+      name: MTEB Ocnli
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: cos_sim_accuracy
+      value: 78.72225230102869
+    - type: cos_sim_ap
+      value: 80.63941899467723
+    - type: cos_sim_f1
+      value: 80.52190121155638
+    - type: cos_sim_precision
+      value: 72.06005004170142
+    - type: cos_sim_recall
+      value: 91.23548046462513
+    - type: dot_accuracy
+      value: 78.72225230102869
+    - type: dot_ap
+      value: 80.63913939812744
+    - type: dot_f1
+      value: 80.51948051948052
+    - type: dot_precision
+      value: 71.7948717948718
+    - type: dot_recall
+      value: 91.65786694825766
+    - type: euclidean_accuracy
+      value: 78.72225230102869
+    - type: euclidean_ap
+      value: 80.64403797436798
+    - type: euclidean_f1
+      value: 80.52190121155638
+    - type: euclidean_precision
+      value: 72.06005004170142
+    - type: euclidean_recall
+      value: 91.23548046462513
+    - type: manhattan_accuracy
+      value: 78.18083378451544
+    - type: manhattan_ap
+      value: 80.5241189302444
+    - type: manhattan_f1
+      value: 80.43478260869566
+    - type: manhattan_precision
+      value: 72.7972626176219
+    - type: manhattan_recall
+      value: 89.86272439281943
+    - type: max_accuracy
+      value: 78.72225230102869
+    - type: max_ap
+      value: 80.64403797436798
+    - type: max_f1
+      value: 80.52190121155638
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/OnlineShopping-classification
+      name: MTEB OnlineShopping
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 92.49000000000001
+    - type: ap
+      value: 90.66330807324402
+    - type: f1
+      value: 92.48245049107115
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/PAWSX
+      name: MTEB PAWSX
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 33.6275431596535
+    - type: cos_sim_spearman
+      value: 37.865700050451494
+    - type: euclidean_pearson
+      value: 38.1050665279388
+    - type: euclidean_spearman
+      value: 37.864125056066364
+    - type: manhattan_pearson
+      value: 38.11206873232881
+    - type: manhattan_spearman
+      value: 37.852977098473936
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/QBQTC
+      name: MTEB QBQTC
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 32.137955501231104
+    - type: cos_sim_spearman
+      value: 33.68610910423116
+    - type: euclidean_pearson
+      value: 32.155444753547926
+    - type: euclidean_spearman
+      value: 33.685799252964124
+    - type: manhattan_pearson
+      value: 32.14490855334317
+    - type: manhattan_spearman
+      value: 33.656549820048554
+  - task:
+      type: STS
+    dataset:
+      type: mteb/sts22-crosslingual-sts
+      name: MTEB STS22 (zh)
+      config: zh
+      split: test
+      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
+    metrics:
+    - type: cos_sim_pearson
+      value: 63.63884916818661
+    - type: cos_sim_spearman
+      value: 64.3217581571435
+    - type: euclidean_pearson
+      value: 63.475760085926055
+    - type: euclidean_spearman
+      value: 64.31638169371887
+    - type: manhattan_pearson
+      value: 64.39677572604752
+    - type: manhattan_spearman
+      value: 64.85585019406021
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/STSB
+      name: MTEB STSB
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 79.74698333415277
+    - type: cos_sim_spearman
+      value: 81.1850043859317
+    - type: euclidean_pearson
+      value: 80.94512578669881
+    - type: euclidean_spearman
+      value: 81.18825478390181
+    - type: manhattan_pearson
+      value: 80.88114336824758
+    - type: manhattan_spearman
+      value: 81.12266715583868
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/T2Reranking
+      name: MTEB T2Reranking
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map
+      value: 66.59971552953814
+    - type: mrr
+      value: 76.42177408088038
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/T2Retrieval
+      name: MTEB T2Retrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 28.825
+    - type: map_at_10
+      value: 77.48899999999999
+    - type: map_at_100
+      value: 81.144
+    - type: map_at_1000
+      value: 81.216
+    - type: map_at_3
+      value: 55.435
+    - type: map_at_5
+      value: 67.496
+    - type: mrr_at_1
+      value: 91.377
+    - type: mrr_at_10
+      value: 94.062
+    - type: mrr_at_100
+      value: 94.122
+    - type: mrr_at_1000
+      value: 94.123
+    - type: mrr_at_3
+      value: 93.709
+    - type: mrr_at_5
+      value: 93.932
+    - type: ndcg_at_1
+      value: 91.377
+    - type: ndcg_at_10
+      value: 85.44800000000001
+    - type: ndcg_at_100
+      value: 89.11099999999999
+    - type: ndcg_at_1000
+      value: 89.752
+    - type: ndcg_at_3
+      value: 87.262
+    - type: ndcg_at_5
+      value: 85.668
+    - type: precision_at_1
+      value: 91.377
+    - type: precision_at_10
+      value: 41.525
+    - type: precision_at_100
+      value: 4.989
+    - type: precision_at_1000
+      value: 0.516
+    - type: precision_at_3
+      value: 75.452
+    - type: precision_at_5
+      value: 62.785000000000004
+    - type: recall_at_1
+      value: 28.825
+    - type: recall_at_10
+      value: 84.202
+    - type: recall_at_100
+      value: 95.768
+    - type: recall_at_1000
+      value: 98.791
+    - type: recall_at_3
+      value: 57.284
+    - type: recall_at_5
+      value: 71.071
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/TNews-classification
+      name: MTEB TNews
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 52.160000000000004
+    - type: f1
+      value: 50.49492950548829
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/ThuNewsClusteringP2P
+      name: MTEB ThuNewsClusteringP2P
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 70.06019845009966
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/ThuNewsClusteringS2S
+      name: MTEB ThuNewsClusteringS2S
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 63.9370959228245
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/VideoRetrieval
+      name: MTEB VideoRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 60.0
+    - type: map_at_10
+      value: 69.362
+    - type: map_at_100
+      value: 69.819
+    - type: map_at_1000
+      value: 69.833
+    - type: map_at_3
+      value: 67.783
+    - type: map_at_5
+      value: 68.71300000000001
+    - type: mrr_at_1
+      value: 60.0
+    - type: mrr_at_10
+      value: 69.362
+    - type: mrr_at_100
+      value: 69.819
+    - type: mrr_at_1000
+      value: 69.833
+    - type: mrr_at_3
+      value: 67.783
+    - type: mrr_at_5
+      value: 68.71300000000001
+    - type: ndcg_at_1
+      value: 60.0
+    - type: ndcg_at_10
+      value: 73.59400000000001
+    - type: ndcg_at_100
+      value: 75.734
+    - type: ndcg_at_1000
+      value: 76.049
+    - type: ndcg_at_3
+      value: 70.33
+    - type: ndcg_at_5
+      value: 72.033
+    - type: precision_at_1
+      value: 60.0
+    - type: precision_at_10
+      value: 8.67
+    - type: precision_at_100
+      value: 0.9650000000000001
+    - type: precision_at_1000
+      value: 0.099
+    - type: precision_at_3
+      value: 25.900000000000002
+    - type: precision_at_5
+      value: 16.38
+    - type: recall_at_1
+      value: 60.0
+    - type: recall_at_10
+      value: 86.7
+    - type: recall_at_100
+      value: 96.5
+    - type: recall_at_1000
+      value: 98.9
+    - type: recall_at_3
+      value: 77.7
+    - type: recall_at_5
+      value: 81.89999999999999
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/waimai-classification
+      name: MTEB Waimai
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 88.36
+    - type: ap
+      value: 73.25144216855439
+    - type: f1
+      value: 86.75076261442027
 ---
+# xiaobu-embedding
+模型：基于GTE模型[1]多任务微调。
+
+数据：闲聊类Query-Query、知识类Query-Doc、BGE开源Query-Doc[2]；清洗正例，挖掘中等难度负例；累计6M（质量比数量更重要）。
+## Usage (Sentence-Transformers)
+```bash
+pip install -U sentence-transformers
+```
+相似度计算：
+```python
+from sentence_transformers import SentenceTransformer
+sentences_1 = ["样例数据-1", "样例数据-2"]
+sentences_2 = ["样例数据-3", "样例数据-4"]
+model = SentenceTransformer('lier007/xiaobu-embedding')
+embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
+embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
+similarity = embeddings_1 @ embeddings_2.T
+print(similarity)
+```
+## Evaluation
+参考BGE中文CMTEB评估[2]
+## Finetune
+参考BGE微调模块[2]
+## Reference
+1. https://huggingface.co/thenlper/gte-large-zh
+2. https://github.com/FlagOpen/FlagEmbedding

added_tokens.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "[CLS]": 101,
+  "[MASK]": 103,
+  "[PAD]": 0,
+  "[SEP]": 102,
+  "[UNK]": 100
+}

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 21128
+}

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.2.2",
+    "transformers": "4.34.0",
+    "pytorch": "2.0.1+cu118"
+  }
+}

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fd1e81e069dadb0469f24dd048b5778e6ab9d3ac79253514550ae0b1125bb08
+size 1302216105

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 512,
+  "do_lower_case": false
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "[PAD]",
+    "[UNK]",
+    "[CLS]",
+    "[SEP]",
+    "[MASK]"
+  ],
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "[PAD]",
+    "[UNK]",
+    "[CLS]",
+    "[SEP]",
+    "[MASK]"
+  ],
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "max_length": 256,
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff