---
license: apache-2.0
language:
- en
- zh
library_name: transformers
tags:
- mteb
- RAG-reranking
model-index:
- name: LdIR-reranker-large
  results:
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/CMedQAv1-reranking
      name: MTEB CMedQAv1
      config: default
      split: test
      revision: None
    metrics:
    - type: map
      value: 86.50438688414654
    - type: mrr
      value: 88.91170634920635
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/CMedQAv2-reranking
      name: MTEB CMedQAv2
      config: default
      split: test
      revision: None
    metrics:
    - type: map
      value: 87.10592353383732
    - type: mrr
      value: 89.10178571428571
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/Mmarco-reranking
      name: MTEB MMarcoReranking
      config: default
      split: dev
      revision: None
    metrics:
    - type: map
      value: 39.354813242907133
    - type: mrr
      value: 39.075793650793655
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/T2Reranking
      name: MTEB T2Reranking
      config: default
      split: dev
      revision: None
    metrics:
    - type: map
      value: 68.83696915006163
    - type: mrr
      value: 79.77644651857584
---

## Introduction

This model is a reranker fine-tuned from [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B). We follow the training recipe of the [FlagEmbedding reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker), using Qwen2-1.5B as the pretrained backbone.

## Dependencies

```text
transformers==4.41.2
flash-attn==2.5.7
```

## Usage

```python
from typing import cast, List, Union, Tuple, Dict, Optional

import numpy as np
import torch
from tqdm import tqdm
import transformers
from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizer, DataCollatorWithPadding
from transformers.models.qwen2 import Qwen2Config, Qwen2ForSequenceClassification
from transformers.trainer_pt_utils import LabelSmoother

IGNORE_TOKEN_ID = LabelSmoother.ignore_index


def preprocess(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int = 1024,
) -> Dict:
    # Apply prompt templates: each source (query, passage) is joined into a single user turn.
    input_ids, attention_masks = [], []
    for i, source in enumerate(sources):
        messages = [
            {"role": "user", "content": "\n\n".join(source)}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text])
        input_id = model_inputs['input_ids'][0]
        attention_mask = model_inputs['attention_mask'][0]
        if len(input_id) > max_len:
            # Truncate the middle but keep the last five template tokens:
            # <|im_end|>(151645), \n(198), <|im_start|>(151644), assistant(77091), \n(198)
            diff = len(input_id) - max_len
            input_id = input_id[:-5 - diff] + input_id[-5:]
            attention_mask = attention_mask[:-5 - diff] + attention_mask[-5:]
            assert len(input_id) == max_len
        input_ids.append(input_id)
        attention_masks.append(attention_mask)

    return dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
    )


class FlagRerankerCustom:
    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        use_fp16: bool = False,
    ) -> None:
        self.tokenizer = tokenizer
        self.model = model
        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        elif torch.backends.mps.is_available():
            self.device = torch.device('mps')
        else:
            self.device = torch.device('cpu')
            use_fp16 = False
        if use_fp16:
            self.model.half()

        self.model = self.model.to(self.device)
        self.model.eval()

        self.num_gpus = torch.cuda.device_count()
        if self.num_gpus > 1:
            print(f"----------using {self.num_gpus}*GPUs----------")
            self.model = torch.nn.DataParallel(self.model)

    @torch.no_grad()
    def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
                      batch_size: int = 64, max_length: int = 1024) -> List[float]:
        if self.num_gpus > 0:
            batch_size = batch_size * self.num_gpus

        assert isinstance(sentence_pairs, list)
        if isinstance(sentence_pairs[0], str):
            sentence_pairs = [sentence_pairs]

        all_scores = []
        for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores", disable=True):
            sentences_batch = sentence_pairs[start_index:start_index + batch_size]
            inputs = preprocess(sources=sentences_batch, tokenizer=self.tokenizer, max_len=max_length)
            # Convert the dict of lists into a list of per-example dicts for the collator.
            inputs = [dict(zip(inputs, t)) for t in zip(*inputs.values())]
            inputs = self.data_collator(inputs).to(self.device)
            scores = self.model(**inputs, return_dict=True).logits
            # Squeeze only the label dimension so a single pair still yields a 1-D tensor.
            scores = scores.squeeze(-1)
            all_scores.extend(scores.detach().to(torch.float).cpu().numpy().tolist())

        if len(all_scores) == 1:
            return all_scores[0]
        return all_scores


tokenizer = transformers.AutoTokenizer.from_pretrained(
    "neofung/LdIR-Qwen2-reranker-1.5B",
    padding_side="right",
)

config = Qwen2Config.from_pretrained(
    "neofung/LdIR-Qwen2-reranker-1.5B",
    trust_remote_code=True,
    bf16=True,
)

model = Qwen2ForSequenceClassification.from_pretrained(
    "neofung/LdIR-Qwen2-reranker-1.5B",
    config=config,
    trust_remote_code=True,
)

model = FlagRerankerCustom(model=model, tokenizer=tokenizer, use_fp16=False)

pairs = [
    ['what is panda?', 'hi'],
    ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'],
]

model.compute_score(pairs)
# [-2.655318021774292, 11.7670316696167]
```

## Evaluation on C-MTEB

```python
from C_MTEB.tasks import *
from mteb import MTEB

save_name = "LdIR-Qwen2-reranker-1.5B"

evaluation = MTEB(task_types=["Reranking"], task_langs=['zh', 'zh2en', 'en2zh'])
evaluation.run(model, output_folder=f"reranker_results/{save_name}")
```
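
## Example: reranking retrieved passages

As a rough illustration of how the scores can be used for RAG-style reranking, the sketch below sorts candidate passages by their relevance score. It assumes the `model` wrapper (`FlagRerankerCustom`) created in the Usage section above is in scope; the `rerank` helper, the `top_k` parameter, and the sample `query`/`passages` are made up for this example and are not part of the original recipe.

```python
from typing import List, Tuple


def rerank(query: str, passages: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
    # Score every (query, passage) pair with the reranker defined in the Usage section.
    scores = model.compute_score([[query, p] for p in passages])
    if isinstance(scores, float):
        # compute_score returns a bare float when given a single pair.
        scores = [scores]
    # Higher score means more relevant; keep the top_k passages.
    ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]


query = "what is panda?"
passages = [
    "hi",
    "The giant panda (Ailuropoda melanoleuca) is a bear species endemic to China.",
    "Pandas feed almost exclusively on bamboo.",
]
for passage, score in rerank(query, passages, top_k=2):
    print(f"{score:.3f}\t{passage}")
```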