# AiContract / embeddingsProcessor.py
from typing import List

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel


class EmbeddingsProcessor:
    """
    Processes text to obtain embeddings from a pre-trained transformer model.
    """

    def __init__(self, model_name: str):
        """
        Initialize the EmbeddingsProcessor with a pre-trained model.

        Args:
            model_name (str): The name of the pre-trained model to use for
                generating embeddings.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Keep the model on CPU so the processor runs without a GPU.
        self.model = AutoModel.from_pretrained(model_name).to("cpu")
    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts (List[str]): A list of text strings for which to generate embeddings.

        Returns:
            np.ndarray: A NumPy array of embeddings, one row per input text.
        """
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        # Ensure all input tensors are on CPU, matching the model's device.
        encoded_input = {k: v.to("cpu") for k, v in encoded_input.items()}
        # Disable gradient tracking: this is pure inference.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Mean-pool the token embeddings to get one fixed-size vector per text.
        return model_output.last_hidden_state.mean(dim=1).numpy()
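

# The sketch below is a minimal usage example, not part of the original file. The
# checkpoint name is an assumption for illustration; any Hugging Face encoder
# checkpoint loadable with AutoModel/AutoTokenizer should work the same way.
if __name__ == "__main__":
    processor = EmbeddingsProcessor("sentence-transformers/all-MiniLM-L6-v2")
    vectors = processor.get_embeddings(["first document", "a second, longer document"])
    # One row per input text, with the model's hidden size as the column count,
    # e.g. (2, 384) for MiniLM-class encoders.
    print(vectors.shape)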