text embedding
#15
by
Ausen
- opened
I want to get the text embedding so I can do some analysis. How can I get it?
Hello,
You can write a custom pipeline like the one below:
import torch
from transformers import Pipeline
class FinBERTPipeline(Pipeline):
    """Classification pipeline that also returns a sentence embedding.

    Each call yields a dict with the predicted ``label``, its softmax
    ``score`` and an ``embedding`` computed as the mean of the model's
    last hidden layer over the first input's token vectors.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split call-time kwargs into (preprocess, forward, postprocess) dicts.

        No stage of this pipeline takes tunable parameters, so all three
        dicts are empty and unknown keyword arguments are ignored.
        """
        # A ``text`` kwarg used to be forwarded to ``preprocess``, which has
        # no parameter of that name, so ``pipe(x, text=...)`` raised
        # TypeError.  It is now ignored like every other unknown kwarg.
        return {}, {}, {}

    def preprocess(self, sentence, maybe_arg=2):
        """Tokenize ``sentence`` and return the tokenizer output as PyTorch tensors.

        ``maybe_arg`` is unused; it is kept only so the method signature
        stays compatible with existing callers.
        """
        return self.tokenizer(sentence, return_tensors="pt")

    def _forward(self, inputs):
        """Run the model on the tokenized inputs, requesting all hidden states."""
        return self.model(**inputs, output_hidden_states=True)

    def postprocess(self, outputs):
        """Turn raw model outputs into ``{'label', 'score', 'embedding'}``.

        Only the first item of the outputs is used for the embedding, the
        label and the score.
        """
        # Assumes hidden_states[-1] is (batch, tokens, hidden) -- TODO confirm.
        # Averaging item 0 over dim 0 then gives one vector per sentence.
        last_layer_tokens = outputs.hidden_states[-1][0]
        # detach()/cpu() make the NumPy conversion safe even if the tensor
        # still tracks gradients or lives on an accelerator.
        sentence_embedding = (
            torch.mean(last_layer_tokens, dim=0).detach().cpu().numpy()
        )

        # Select row 0 *before* argmax so the index and the score always
        # refer to the same item; argmax over the whole tensor would return
        # a flattened index if more than one row were present.
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
        best_index = int(torch.argmax(probabilities))

        return {
            'label': self.model.config.id2label[best_index],
            'score': probabilities[best_index].item(),
            'embedding': sentence_embedding,
        }
This pipeline gives you a sentence embedding (the mean of the last hidden layer's token vectors), but you can adapt it to return word embeddings instead.
You can use the pipeline as shown below:
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification
# Make the custom pipeline class available under its own task name.
PIPELINE_REGISTRY.register_pipeline(
    task='finbert-pipeline-with-sentence-embedding',
    pipeline_class=FinBERTPipeline,
    pt_model=AutoModelForSequenceClassification,
)

# Instantiate the registered task with the FinBERT checkpoint on device 0.
pipe = pipeline(
    task='finbert-pipeline-with-sentence-embedding',
    model='ProsusAI/finbert',
    device=0,
)

# Run one sentence through the pipeline and show the label, the score and
# the shape of the returned embedding.
outputs = pipe('EXAMPLE SENTENCE')
print(f"{outputs['label']} {outputs['score']} {outputs['embedding'].shape}")
Thank you.