Update README.md

416b44d verified about 2 months ago

6.36 kB

	---
	license: apache-2.0
	language:
	- en
	- zh
	library_name: transformers
	tags:
	- mteb
	- RAG-reranking
	model-index:
	- name: LdIR-Qwen2-reranker-1.5B
	  results:
	  - task:
	      type: Reranking
	    dataset:
	      type: C-MTEB/CMedQAv1-reranking
	      name: MTEB CMedQAv1
	      config: default
	      split: test
	      revision: None
	    metrics:
	    - type: map
	      value: 86.50438688414654
	    - type: mrr
	      value: 88.91170634920635
	  - task:
	      type: Reranking
	    dataset:
	      type: C-MTEB/CMedQAv2-reranking
	      name: MTEB CMedQAv2
	      config: default
	      split: test
	      revision: None
	    metrics:
	    - type: map
	      value: 87.10592353383732
	    - type: mrr
	      value: 89.10178571428571
	  - task:
	      type: Reranking
	    dataset:
	      type: C-MTEB/Mmarco-reranking
	      name: MTEB MMarcoReranking
	      config: default
	      split: dev
	      revision: None
	    metrics:
	    - type: map
	      value: 39.354813242907133
	    - type: mrr
	      value: 39.075793650793655
	  - task:
	      type: Reranking
	    dataset:
	      type: C-MTEB/T2Reranking
	      name: MTEB T2Reranking
	      config: default
	      split: dev
	      revision: None
	    metrics:
	    - type: map
	      value: 68.83696915006163
	    - type: mrr
	      value: 79.77644651857584
	---

	## Introduction

	This model is a reranker fine-tuned from [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) as a downstream task.
	We build on the [FlagEmbedding reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker)
	and implement it with Qwen2-1.5B as the pretrained base model.

	## Dependencies

	```text
	transformers==4.41.2
	flash-attn==2.5.7
	```

	## Usage

	```python
	from typing import cast, List, Union, Tuple, Dict, Optional
	import numpy as np
	import torch
	from tqdm import tqdm
	import transformers
	from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizer, DataCollatorWithPadding
	from transformers.models.qwen2 import Qwen2Config, Qwen2ForSequenceClassification
	from transformers.trainer_pt_utils import LabelSmoother
	# Alias for LabelSmoother.ignore_index; not referenced by the rest of this snippet.
	IGNORE_TOKEN_ID = LabelSmoother.ignore_index

	def preprocess(
	    sources,
	    tokenizer: transformers.PreTrainedTokenizer,
	    max_len: int = 1024,
	) -> Dict:
	    """Turn text groups into chat-templated token ids and attention masks.

	    Each element of ``sources`` is joined with a blank line ("\\n\\n") and sent
	    through the tokenizer's chat template as a single user message with the
	    generation prompt appended. The rendered text is then tokenized.

	    Sequences longer than ``max_len`` are cut from the end of the content
	    while the last five tokens are kept, so the result is exactly ``max_len``
	    tokens long.

	    Args:
	        sources: Iterable of string sequences (e.g. ``[query, passage]``).
	        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
	        max_len: Maximum number of tokens per encoded sequence.

	    Returns:
	        A dict with ``input_ids`` and ``attention_mask``, each a list holding
	        one (unpadded) list per source.
	    """
	    batch_ids, batch_masks = [], []

	    for pair in sources:
	        prompt = tokenizer.apply_chat_template(
	            [{"role": "user", "content": "\n\n".join(pair)}],
	            tokenize=False,
	            add_generation_prompt=True,
	        )
	        encoded = tokenizer([prompt])
	        ids = encoded['input_ids'][0]
	        mask = encoded['attention_mask'][0]

	        overflow = len(ids) - max_len
	        if overflow > 0:
	            # Keep the five trailing template tokens intact:
	            # <|im_end|>(151645), \n(198), <|im_start|>(151644),
	            # assistant(77091), \n(198); drop the overflow just before them.
	            cut = -5 - overflow
	            ids = ids[:cut] + ids[-5:]
	            mask = mask[:cut] + mask[-5:]
	            assert len(ids) == max_len

	        batch_ids.append(ids)
	        batch_masks.append(mask)

	    return {"input_ids": batch_ids, "attention_mask": batch_masks}

	class FlagRerankerCustom:
	    """Batch scorer that wraps a sequence-classification model and its tokenizer.

	    The wrapper moves the model to the first available device (CUDA, then MPS,
	    then CPU), puts it in eval mode, and wraps it in ``torch.nn.DataParallel``
	    when more than one CUDA device is visible.
	    """

	    def __init__(
	        self,
	        model: PreTrainedModel,
	        tokenizer: PreTrainedTokenizer,
	        use_fp16: bool = False
	    ) -> None:
	        """Store the model/tokenizer pair and prepare the model for inference.

	        Args:
	            model: Model whose forward output exposes ``.logits``.
	            tokenizer: Tokenizer used for chat templating and padding.
	            use_fp16: Convert the model to half precision. Ignored (forced to
	                ``False``) when neither CUDA nor MPS is available.
	        """
	        self.tokenizer = tokenizer
	        self.model = model
	        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

	        if torch.cuda.is_available():
	            self.device = torch.device('cuda')
	        elif torch.backends.mps.is_available():
	            self.device = torch.device('mps')
	        else:
	            self.device = torch.device('cpu')
	            # Half precision is switched off on the CPU fallback.
	            use_fp16 = False
	        if use_fp16:
	            self.model.half()

	        self.model = self.model.to(self.device)

	        self.model.eval()

	        self.num_gpus = torch.cuda.device_count()
	        if self.num_gpus > 1:
	            print(f"----------using {self.num_gpus}*GPUs----------")
	            self.model = torch.nn.DataParallel(self.model)

	    @torch.no_grad()
	    def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 64,
	                      max_length: int = 1024) -> Union[List[float], float]:
	        """Score one or many text pairs with the wrapped model.

	        Args:
	            sentence_pairs: Either a single pair of strings or a list/tuple of
	                such pairs.
	            batch_size: Pairs per forward pass per GPU; multiplied by the
	                number of CUDA devices when at least one is present.
	            max_length: Token budget per pair, forwarded to ``preprocess``.

	        Returns:
	            A list with one score per pair, in input order. When exactly one
	            score is produced, that score is returned as a bare float. An
	            empty input yields an empty list.

	        Raises:
	            TypeError: If ``sentence_pairs`` is neither a list nor a tuple.
	        """
	        if self.num_gpus > 0:
	            batch_size = batch_size * self.num_gpus

	        # Explicit check instead of `assert`: asserts vanish under `python -O`,
	        # and the annotated single-pair tuple form must be accepted as well.
	        if not isinstance(sentence_pairs, (list, tuple)):
	            raise TypeError(
	                "sentence_pairs must be a list of pairs or a single pair, "
	                f"got {type(sentence_pairs).__name__}"
	            )
	        if len(sentence_pairs) == 0:
	            return []
	        if isinstance(sentence_pairs[0], str):
	            # A lone pair was passed; promote it to a one-element batch.
	            sentence_pairs = [sentence_pairs]

	        all_scores = []
	        for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores",
	                                disable=True):
	            sentences_batch = sentence_pairs[start_index:start_index + batch_size]
	            inputs = preprocess(sources=sentences_batch, tokenizer=self.tokenizer, max_len=max_length)
	            # Convert the dict of per-field lists into a list of per-example dicts,
	            # which is the input form handed to the padding collator.
	            inputs = [dict(zip(inputs, t)) for t in zip(*inputs.values())]
	            inputs = self.data_collator(inputs).to(self.device)
	            scores = self.model(**inputs, return_dict=True).logits
	            # Drop only the trailing size-1 dimension. A bare squeeze() would
	            # also collapse a one-pair batch to a 0-dim tensor, whose tolist()
	            # is a plain float that list.extend() cannot consume.
	            scores = scores.squeeze(-1)
	            all_scores.extend(scores.detach().to(torch.float).cpu().numpy().tolist())

	        if len(all_scores) == 1:
	            return all_scores[0]
	        return all_scores

	# Hub repository used for the tokenizer, the config and the weights.
	MODEL_ID = "neofung/LdIR-Qwen2-reranker-1.5B"

	tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_ID, padding_side="right")

	# NOTE(review): `bf16=True` is handed to the config loader unchanged; confirm
	# that it has the intended effect on the loaded model's precision.
	config = Qwen2Config.from_pretrained(MODEL_ID, trust_remote_code=True, bf16=True)

	model = Qwen2ForSequenceClassification.from_pretrained(
	    MODEL_ID,
	    config=config,
	    trust_remote_code=True,
	)

	# From here on `model` names the scoring wrapper, not the raw network.
	model = FlagRerankerCustom(model=model, tokenizer=tokenizer, use_fp16=False)

	pairs = [
	    ['what is panda?', 'hi'],
	    ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'],
	]

	model.compute_score(pairs)

	# [-2.655318021774292, 11.7670316696167]
	```


	## Evaluation on C-MTEB

	```python

	from C_MTEB.tasks import *
	from mteb import MTEB

	save_name = "LdIR-Qwen2-reranker-1.5B"
	output_folder = f"reranker_results/{save_name}"

	# Restrict the benchmark to reranking tasks in the listed language settings.
	evaluation = MTEB(task_types=["Reranking"], task_langs=['zh', 'zh2en', 'en2zh'])

	# `model` is presumably the FlagRerankerCustom wrapper from the Usage snippet.
	evaluation.run(model, output_folder=output_folder)
	```