EmicoBinsfinder committed
Commit af3419a • 1 parent: a00f952
Update tridentmodel/classification.py

Changed files: tridentmodel/classification.py (+98 -98)
tridentmodel/classification.py
CHANGED
@@ -126,101 +126,101 @@ def mean_pooling(model_output, attention_mask):
     return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
 
 ### Sentence Embedder
-def sentence_embedder(sentences, model_path):
-    """
-    Calling the sentence similarity model to generate embeddings on input text.
-    :param sentences: takes input text in the form of a string
-    :param model_path: path to the text similarity model
-    :return returns a (1, 384) embedding of the input text
-    """
-    tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
-    model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
-    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-    # Compute token embeddings
-    with torch.no_grad():
-        model_output = model(**encoded_input)
-    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
-    return sentence_embeddings
-
-### Sentence Embedding Preparation Function
-def convert_saved_embeddings(embedding_string):
-    """
-    Preparing pre-computed embeddings for use for comparison with new abstract embeddings.
-    Pre-computed embeddings are saved as tensors in string format so need to be converted back to numpy arrays in order to calculate cosine similarity.
-    :param embedding_string:
-    :return: Should be a single tensor with dims (,384) in string format
-    """
-    embedding = embedding_string.replace('(', '')
-    embedding = embedding.replace(')', '')
-    embedding = embedding.replace('[', '')
-    embedding = embedding.replace(']', '')
-    embedding = embedding.replace('tensor', '')
-    embedding = embedding.replace(' ', '')
-    embedding = embedding.split(',')
-    embedding = [float(x) for x in embedding]
-    embedding = np.array(embedding)
-    embedding = np.expand_dims(embedding, axis=0)
-    embedding = torch.from_numpy(embedding)
-    return embedding
-
-
-### Generating Class Embeddings
-
-Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
-def class_embbedding_generator(classes):
-    """
-    This function is to be used to generate and save class embeddings
-    Takes an input of 'cleaned' classes, generated by clean_data function, and computes vector representations of these classes (the embeddings) and saves them to csv
-    :classes: Classes should be a dataframe including all of broad scope classes that are intended to be used to make comparisons with
-    """
-    class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
-    for i in range(len(classes)):
-        class_name = classes.iloc[i, 0]
-        print(class_name)
-        class_description = classes.iloc[i, 1]
-        class_description_embedding = sentence_embedder(class_description, Model_Path)
-        class_description_embedding = class_description_embedding.numpy()
-        class_description_embedding = torch.from_numpy(class_description_embedding)
-        embedding_entry = [class_name, class_description, class_description_embedding]
-        class_embeddings.loc[len(class_embeddings)] = embedding_entry
-
-### Broad Scope Classifier
-Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
-def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
-    """
-    Takes in pre-computed class embeddings and abstract texts, converts abstract text into
-    :param class_embeddings: dataframe of class embeddings
-    :param abstract: a single abstract embedding
-    :param N: N highest matching classes to return, from highest to lowest, default is 5
-    :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes
-    """
-    predictions = pd.DataFrame(columns=['Class Name', 'Score'])
-    for i in range(len(class_embeddings)):
-        class_name = class_embeddings.iloc[i, 0]
-        embedding = class_embeddings.iloc[i, 2]
-        embedding = convert_saved_embeddings(embedding)
-        abstract_embedding = abstract_embedding.numpy()
-        abstract_embedding = torch.from_numpy(abstract_embedding)
-        cos = torch.nn.CosineSimilarity(dim=1)
-        score = cos(abstract_embedding, embedding).numpy().tolist()
-        result = [class_name, score[0]]
-        predictions.loc[len(predictions)] = result
-    greenpredictions = predictions.tail(52)
-    if Sensitivity == 'High':
-        Threshold = 0.5
-    elif Sensitivity == 'Medium':
-        Threshold = 0.40
-    elif Sensitivity == 'Low':
-        Threshold = 0.35
-    GreenLikelihood = 'False'
-    for i in range(len(greenpredictions)):
-        score = greenpredictions.iloc[i, 1]
-        if float(score) >= Threshold:
-            GreenLikelihood = 'True'
-            break
-        else:
-            continue
-    HighestSimilarity = predictions.nlargest(N, ['Score'])
-    print(HighestSimilarity)
-    print(GreenLikelihood)
-    return predictions, HighestSimilarity, GreenLikelihood
+# def sentence_embedder(sentences, model_path):
+# """
+# Calling the sentence similarity model to generate embeddings on input text.
+# :param sentences: takes input text in the form of a string
+# :param model_path: path to the text similarity model
+# :return returns a (1, 384) embedding of the input text
+# """
+# tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
+# model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
+# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+# # Compute token embeddings
+# with torch.no_grad():
+# model_output = model(**encoded_input)
+# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
+# return sentence_embeddings
+
+# ### Sentence Embedding Preparation Function
+# def convert_saved_embeddings(embedding_string):
+# """
+# Preparing pre-computed embeddings for use for comparison with new abstract embeddings.
+# Pre-computed embeddings are saved as tensors in string format so need to be converted back to numpy arrays in order to calculate cosine similarity.
+# :param embedding_string:
+# :return: Should be a single tensor with dims (,384) in string format
+# """
+# embedding = embedding_string.replace('(', '')
+# embedding = embedding.replace(')', '')
+# embedding = embedding.replace('[', '')
+# embedding = embedding.replace(']', '')
+# embedding = embedding.replace('tensor', '')
+# embedding = embedding.replace(' ', '')
+# embedding = embedding.split(',')
+# embedding = [float(x) for x in embedding]
+# embedding = np.array(embedding)
+# embedding = np.expand_dims(embedding, axis=0)
+# embedding = torch.from_numpy(embedding)
+# return embedding
+
+
+# ### Generating Class Embeddings
+
+# Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
+# def class_embbedding_generator(classes):
+# """
+# This function is to be used to generate and save class embeddings
+# Takes an input of 'cleaned' classes, generated by clean_data function, and computes vector representations of these classes (the embeddings) and saves them to csv
+# :classes: Classes should be a dataframe including all of broad scope classes that are intended to be used to make comparisons with
+# """
+# class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
+# for i in range(len(classes)):
+# class_name = classes.iloc[i, 0]
+# print(class_name)
+# class_description = classes.iloc[i, 1]
+# class_description_embedding = sentence_embedder(class_description, Model_Path)
+# class_description_embedding = class_description_embedding.numpy()
+# class_description_embedding = torch.from_numpy(class_description_embedding)
+# embedding_entry = [class_name, class_description, class_description_embedding]
+# class_embeddings.loc[len(class_embeddings)] = embedding_entry
+
+# ### Broad Scope Classifier
+# Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
+# def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
+# """
+# Takes in pre-computed class embeddings and abstract texts, converts abstract text into
+# :param class_embeddings: dataframe of class embeddings
+# :param abstract: a single abstract embedding
+# :param N: N highest matching classes to return, from highest to lowest, default is 5
+# :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes
+# """
+# predictions = pd.DataFrame(columns=['Class Name', 'Score'])
+# for i in range(len(class_embeddings)):
+# class_name = class_embeddings.iloc[i, 0]
+# embedding = class_embeddings.iloc[i, 2]
+# embedding = convert_saved_embeddings(embedding)
+# abstract_embedding = abstract_embedding.numpy()
+# abstract_embedding = torch.from_numpy(abstract_embedding)
+# cos = torch.nn.CosineSimilarity(dim=1)
+# score = cos(abstract_embedding, embedding).numpy().tolist()
+# result = [class_name, score[0]]
+# predictions.loc[len(predictions)] = result
+# greenpredictions = predictions.tail(52)
+# if Sensitivity == 'High':
+# Threshold = 0.5
+# elif Sensitivity == 'Medium':
+# Threshold = 0.40
+# elif Sensitivity == 'Low':
+# Threshold = 0.35
+# GreenLikelihood = 'False'
+# for i in range(len(greenpredictions)):
+# score = greenpredictions.iloc[i, 1]
+# if float(score) >= Threshold:
+# GreenLikelihood = 'True'
+# break
+# else:
+# continue
+# HighestSimilarity = predictions.nlargest(N, ['Score'])
+# print(HighestSimilarity)
+# print(GreenLikelihood)
+# return predictions, HighestSimilarity, GreenLikelihood
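A note on the unchanged context line at the top of the hunk: mean_pooling mixes frameworks, dividing tf.reduce_sum by tf.clip_by_value of input_mask_expanded.sum(1), a PyTorch/NumPy-style method call, even though sentence_embedder runs the model under torch.no_grad() and returns torch tensors. A minimal all-PyTorch sketch of the same mean pooling, following the common sentence-transformers pattern (an assumption about intent, not this repository's code), would be:

import torch

def mean_pooling(model_output, attention_mask):
    # The first element of the model output holds the per-token embeddings.
    token_embeddings = model_output[0]
    # Expand the attention mask to the embedding dimension so padding tokens contribute nothing.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Sum over real tokens and divide by their clamped count to avoid division by zero.
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)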
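If the commented-out pipeline were re-enabled, a hypothetical end-to-end call might look like the sketch below. The abstract text and the class_embeddings.csv filename are illustrative assumptions, not part of this commit:

import pandas as pd

Model_Path = 'Model_bert'  # path to the sentence-similarity model directory, as in the file above

# Hypothetical input abstract (illustrative only).
abstract = 'A photovoltaic cell with improved energy conversion efficiency.'
abstract_embedding = sentence_embedder(abstract, Model_Path)

# Assumed filename for pre-computed class embeddings with columns Class, Description, Embedding.
class_embeddings = pd.read_csv('class_embeddings.csv')

predictions, HighestSimilarity, GreenLikelihood = broad_scope_class_predictor(
    class_embeddings, abstract_embedding, N=5, Sensitivity='Medium')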