jupyterjazz committed
Merge branch 'main' into pr/19

Files changed:
- README.md +20 -63
- custom_st.py +6 -6
- model.safetensors +3 -0
- modules.json +1 -1
README.md CHANGED

````diff
@@ -21528,7 +21528,7 @@ model-index:
 </p>
 
 <p align="center">
-<b>
+<b>jina-embeddings-v3: Multilingual Embeddings With Task LoRA</b>
 </p>
 
 ## Quick Start
@@ -21541,12 +21541,12 @@ The easiest way to start using `jina-embeddings-v3` is with the [Jina Embedding
 
 `jina-embeddings-v3` is a **multilingual multi-task text embedding model** designed for a variety of NLP applications.
 Based on the [Jina-XLM-RoBERTa architecture](https://huggingface.co/jinaai/xlm-roberta-flash-implementation),
-this model supports
-Additionally, it features 5
+this model supports Rotary Position Embeddings to handle long input sequences up to **8192 tokens**.
+Additionally, it features 5 LoRA adapters to generate task-specific embeddings efficiently.
 
 ### Key Features:
 - **Extended Sequence Length:** Supports up to 8192 tokens with RoPE.
-- **Task-Specific Embedding:** Customize embeddings through the `
+- **Task-Specific Embedding:** Customize embeddings through the `task` argument with the following options:
   - `retrieval.query`: Used for query embeddings in asymmetric retrieval tasks
   - `retrieval.passage`: Used for passage embeddings in asymmetric retrieval tasks
   - `separation`: Used for embeddings in clustering and re-ranking applications
@@ -21560,11 +21560,6 @@ While the foundation model supports 89 languages, we've focused our tuning effor
 Hindi, Indonesian, Italian, Japanese, Korean, Latvian, Norwegian, Polish, Portuguese, Romanian,
 Russian, Slovak, Spanish, Swedish, Thai, Turkish, Ukrainian, Urdu,** and **Vietnamese.**
 
-
-## Data & Parameters
-
-The data and training details are described in the technical report (coming soon).
-
 ## Usage
 
 **<details><summary>Apply mean pooling when integrating the model.</summary>**
@@ -21605,7 +21600,7 @@ model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
 
 with torch.no_grad():
-    model_output = model(**encoded_input,
+    model_output = model(**encoded_input, task='retrieval.query')
 
 embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
 embeddings = F.normalize(embeddings, p=2, dim=1)
@@ -21643,10 +21638,10 @@ texts = [
     "Folge dem weißen Kaninchen.",  # German
 ]
 
-# When calling the `encode` function, you can choose a `
+# When calling the `encode` function, you can choose a `task` based on the use case:
 # 'retrieval.query', 'retrieval.passage', 'separation', 'classification', 'text-matching'
-# Alternatively, you can choose not to pass a `
-embeddings = model.encode(texts,
+# Alternatively, you can choose not to pass a `task`, and no specific LoRA adapter will be used.
+embeddings = model.encode(texts, task="text-matching")
 
 # Compute similarities
 print(embeddings[0] @ embeddings[1].T)
@@ -21680,11 +21675,11 @@ from sentence_transformers import SentenceTransformer
 
 model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
 
-
+task = "retrieval.query"
 embeddings = model.encode(
     ["What is the weather like in Berlin today?"],
-
-    prompt_name=
+    task=task,
+    prompt_name=task,
 )
 ```
 
@@ -21720,53 +21715,6 @@ outputs = session.run(None, inputs)
 
 
 
-
-## Performance
-
-### English MTEB
-| Model | Dimension | Average | Classification | Clustering | Pair Classification | Reranking | Retrieval | STS | Summarization |
-|:------------------------------:|:---------:|:---------:|:--------------:|:----------:|:-------------------:|:---------:|:---------:|:--------:|:-------------:|
-| jina-embeddings-v3 | 1024 | **65.60** | **82.58** | 45.27 | 84.01 | 58.13 | 53.87 | **85.8** | 30.98 |
-| jina-embeddings-v2-en | 768 | 58.12 | 68.82 | 40.08 | 84.44 | 55.09 | 45.64 | 80.00 | 30.56 |
-| text-embedding-3-large | 3072 | 62.03 | 75.45 | 49.01 | 84.22 | 59.16 | 55.44 | 81.04 | 29.92 |
-| multilingual-e5-large-instruct | 1024 | 64.41 | 77.56 | 47.1 | 86.19 | 58.58 | 52.47 | 84.78 | 30.39 |
-| Cohere-embed-multilingual-v3.0 | 1024 | 60.08 | 64.01 | 46.6 | 86.15 | 57.86 | 53.84 | 83.15 | 30.99 |
-
-### Multilingual MTEB
-
-| Model | Dimension | Average | Classification | Clustering | Pair Classification | Reranking | Retrieval | STS | Summarization |
-|:------------------------------:|:---------:|:---------:|:--------------:|:----------:|:-------------------:|:---------:|:---------:|:---------:|:-------------:|
-| jina-embeddings-v3 | 1024 | **64.44** | **71.46** | 46.71 | 76.91 | 63.98 | 57.98 | **69.83** | - |
-| multilingual-e5-large | 1024 | 59.58 | 65.22 | 42.12 | 76.95 | 63.4 | 52.37 | 64.65 | - |
-| multilingual-e5-large-instruct | 1024 | 64.25 | 67.45 | **52.12** | 77.79 | **69.02** | **58.38** | 68.77 | - |
-
-
-### Long Context Tasks (LongEmbed)
-
-| Model | Dimension | Average | NarrativeQA | Needle | Passkey | QMSum | SummScreen | WikiQA |
-|:----------------------:|:---------:|:---------:|:-----------:|:---------:|:----------:|:---------:|:----------:|:---------:|
-| jina-embeddings-v3* | 1024 | **70.39** | 33.32 | **84.00** | **100.00** | **39.75** | 92.78 | 72.46 |
-| jina-embeddings-v2 | 768 | 58.12 | 37.89 | 54.25 | 50.25 | 38.87 | 93.48 | 73.99 |
-| text-embedding-3-large | 3072 | 51.30 | 44.09 | 29.25 | 63.00 | 32.49 | 84.80 | 54.16 |
-| baai-bge-m3 | 1024 | 56.56 | **45.76** | 40.25 | 46.00 | 35.54 | **94.09** | **77.73** |
-
-Notes: `*`, use the text-matching adapter
-
-
-#### Matryoshka Embeddings
-
-| Dimension | Retrieval | STS |
-|:---------:|:---------:|:-----:|
-| 32 | 52.54 | 76.35 |
-| 64 | 58.54 | 77.03 |
-| 128 | 61.64 | 77.43 |
-| 256 | 62.72 | 77.56 |
-| 512 | 63.16 | 77.59 |
-| 768 | 63.3 | 77.59 |
-| 1024 | 63.35 | 77.58 |
-
-For a comprehensive evaluation and detailed metrics, please refer to the full paper available here (coming soon).
-
 ## Contact
 
 Join our [Discord community](https://discord.jina.ai) and chat with other community members about ideas.
@@ -21776,5 +21724,14 @@ Join our [Discord community](https://discord.jina.ai) and chat with other commun
 If you find `jina-embeddings-v3` useful in your research, please cite the following paper:
 
 ```bibtex
+@misc{sturua2024jinaembeddingsv3multilingualembeddingstask,
+      title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA},
+      author={Saba Sturua and Isabelle Mohr and Mohammad Kalim Akram and Michael Günther and Bo Wang and Markus Krimmel and Feng Wang and Georgios Mastrapas and Andreas Koukounas and Andreas Koukounas and Nan Wang and Han Xiao},
+      year={2024},
+      eprint={2409.10173},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2409.10173},
+}
 
 ```
````
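The README hunks above show only fragments of the transformers-based example, so here is a minimal end-to-end sketch of that path with the new `task` keyword. The `mean_pooling` helper is the standard formulation and is an assumption here, since the diff does not reproduce the README's full `<details>` block:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, ignoring padding positions.
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3")
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

sentences = ["What is the weather like in Berlin today?"]
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    # After this commit, the LoRA adapter is selected with `task` (one of the five names listed above).
    model_output = model(**encoded_input, task="retrieval.query")

embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
embeddings = F.normalize(embeddings, p=2, dim=1)
```

Omitting `task` runs the model without any LoRA adapter, as the updated comments in the diff note.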
custom_st.py CHANGED

```diff
@@ -91,19 +91,19 @@ class Transformer(nn.Module):
         self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__
 
     def forward(
-        self, features: Dict[str, torch.Tensor],
+        self, features: Dict[str, torch.Tensor], task: Optional[str] = None
     ) -> Dict[str, torch.Tensor]:
         """Returns token_embeddings, cls_token"""
-        if
+        if task and task not in self._lora_adaptations:
             raise ValueError(
-                f"Unsupported task '{
+                f"Unsupported task '{task}'. "
                 f"Supported tasks are: {', '.join(self.config.lora_adaptations)}."
-                f"Alternatively, don't pass the `
+                f"Alternatively, don't pass the `task` argument to disable LoRA."
             )
 
         adapter_mask = None
-        if
-            task_id = self._adaptation_map[
+        if task:
+            task_id = self._adaptation_map[task]
             num_examples = features['input_ids'].size(0)
             adapter_mask = torch.full(
                 (num_examples,), task_id, dtype=torch.int32, device=features['input_ids'].device
```
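To make the adapter selection above concrete, here is an illustrative, self-contained sketch (not the repository's code) of how a `task` name becomes a per-example adapter mask. The task-to-id mapping stands in for `self._adaptation_map`, and its ordering is hypothetical:

```python
from typing import Optional

import torch

# Hypothetical stand-in for self._adaptation_map (task name -> LoRA adapter id).
ADAPTATION_MAP = {
    "retrieval.query": 0,
    "retrieval.passage": 1,
    "separation": 2,
    "classification": 3,
    "text-matching": 4,
}

def build_adapter_mask(task: Optional[str], num_examples: int) -> Optional[torch.Tensor]:
    """Mirror the forward() logic: one adapter id per example, or None to disable LoRA."""
    if task is None:
        return None
    if task not in ADAPTATION_MAP:
        raise ValueError(
            f"Unsupported task '{task}'. "
            f"Supported tasks are: {', '.join(ADAPTATION_MAP)}."
        )
    return torch.full((num_examples,), ADAPTATION_MAP[task], dtype=torch.int32)

print(build_adapter_mask("retrieval.query", num_examples=4))
# tensor([0, 0, 0, 0], dtype=torch.int32)
```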
model.safetensors ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17ca06efd886a065d0081912b04c9e27ef5086a9dd09659cce32aa9c84587f23
+size 1144685320
```
modules.json CHANGED

```diff
@@ -4,7 +4,7 @@
     "name": "0",
     "path": "",
     "type": "custom_st.Transformer",
-    "kwargs": ["
+    "kwargs": ["task"]
   },
   {
     "idx": 1,
```
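For context: the `"kwargs": ["task"]` entry is what lets sentence-transformers forward a `task=...` keyword from `encode()` down to `custom_st.Transformer.forward()`. A minimal usage sketch under that assumption (requires a sentence-transformers version with module-kwargs support; the passage text is invented example data):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

# `task` is routed to custom_st.Transformer.forward via the module's "kwargs" list;
# `prompt_name` additionally selects the matching instruction prompt.
query_emb = model.encode(
    ["What is the weather like in Berlin today?"],
    task="retrieval.query",
    prompt_name="retrieval.query",
)
passage_emb = model.encode(
    ["Berlin will be cloudy today with light rain in the afternoon."],  # invented example passage
    task="retrieval.passage",
    prompt_name="retrieval.passage",
)
print(query_emb @ passage_emb.T)
```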