Spaces:
Running
Running
File size: 1,993 Bytes
47b5f0c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import torch
from qdrant_client import models
from qdrant_client.models import NamedVector
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
class DenseEmbeddings:
    """
    Turn a piece of text into Qdrant query vectors.

    Despite its name, the class wraps two model/tokenizer pairs and can
    produce both a dense vector (:meth:`get_dense_vector`) and a sparse
    vector (:meth:`get_sparse_vector`).
    """

    def __init__(
        self,
        dense_model: AutoModel,
        dense_tokenizer: AutoTokenizer,
        sparse_model: AutoModelForMaskedLM,
        sparse_tokenizer: AutoTokenizer,
    ):
        """
        Store the already-loaded models and tokenizers
        :param dense_model: encoder whose output exposes ``last_hidden_state``
        :param dense_tokenizer: tokenizer matching ``dense_model``
        :param sparse_model: masked-LM model whose output exposes ``logits``
        :param sparse_tokenizer: tokenizer matching ``sparse_model``
        """
        self.dense_model = dense_model
        self.dense_tokenizer = dense_tokenizer
        self.sparse_model = sparse_model
        self.sparse_tokenizer = sparse_tokenizer

    def get_dense_vector(self, text: str) -> NamedVector:
        """
        Get dense vector from the dense model
        :param text: a single string to embed
        :return: NamedVector named ``"text-dense"`` holding the mean of the
            model's last hidden states over all tokens of ``text``
        """
        inputs = self.dense_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.dense_model(**inputs)

        # Index the single batch entry explicitly instead of squeeze(): squeeze
        # would also drop any other dimension that happens to have size 1.
        pooled = outputs.last_hidden_state[0].mean(dim=0)

        # NamedVector.vector is declared as a list of floats, so hand it plain
        # Python floats rather than a NumPy array.
        return NamedVector(name="text-dense", vector=pooled.tolist())

    def get_sparse_vector(
        self, text: str, *, min_score: float = -5.0
    ) -> models.SparseVector:
        """
        Get sparse vector from the sparse model
        :param text: a single string to embed
        :param min_score: only tokens whose score is strictly greater than
            this value are kept (defaults to the original cut-off of -5.0)
        :return: SparseVector whose indices are the distinct token ids of
            ``text`` and whose values are the scores of those ids
        """
        inputs = self.sparse_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.sparse_model(**inputs)

        # logits[0] is (sequence_length, vocab_size); reducing over dim 0 keeps,
        # for every vocabulary entry, its highest logit across all positions.
        vocab_scores = outputs.logits[0].max(dim=0).values

        # Sparse-vector indices must not repeat, so collapse duplicate token ids.
        token_ids = inputs["input_ids"][0].unique()

        # Look each score up BY token id. Zipping token ids with vocab_scores
        # would pair the i-th input token with the score of vocabulary id i.
        scores = vocab_scores[token_ids]
        keep = scores > min_score

        return models.SparseVector(
            indices=token_ids[keep].tolist(),
            values=scores[keep].tolist(),
        )
|