Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*-
"""TridentModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1u07dSU0DoKnNzGzySXMTisXnaloqpUEO
TRIDENT MODEL IMPLEMENTATION
Date: 14 January 2023
Authors: Egheosa Ogbomo & Amran Mohammed (The Polymer Guys)
Description: This script combines three ML-based models to identify whether an input text is related to green plastics or not.
"""
#pip install transformers | |
########## IMPORTING REQUIRED PYTHON PACKAGES ########## | |
import pandas as pd | |
import tensorflow as tf | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModel | |
import torch | |
import math | |
import time | |
import csv | |
import pandas as pd | |
import nltk | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
nltk.download('stopwords') | |
nltk.download('punkt') | |
import string | |
########## DEFINING FUNCTIONS FOR MODEL IMPLEMENTATIONS ########## | |
### Input data cleaner | |
# Extra tokens to drop in addition to NLTK's English list; add more here as needed.
extra_stopwords = ['ii', 'iii']
# English-only stopword list (NLTK's list plus the extras above), consulted when
# tokens are filtered during text cleaning.
all_stopwords = stopwords.words('english') + extra_stopwords
# Characters deleted from the text after stopword removal.
_REMOVE_CHARS_TABLE = str.maketrans('', '', '[]{};(),.:/-#?@£$')

# Default location of the cleaned-class CSV written in 'Dataframe' mode.
_DEFAULT_CLEANED_CSV = 'E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv'


def _clean_text(text, stopword_set):
    """Return *text* with stopwords, listed punctuation, repeated words and numeric words removed."""
    tokens = word_tokenize(text)
    # NOTE(review): the stopword test runs before lower-casing, so capitalised
    # stopwords (e.g. "The") survive. Kept as-is so the cleaned text stays
    # comparable with anything already produced by this pipeline.
    kept = ' '.join(token for token in tokens if token not in stopword_set)
    kept = kept.translate(_REMOVE_CHARS_TABLE)
    # dict.fromkeys drops repeated words while preserving first-seen order.
    unique_words = dict.fromkeys(word.lower() for word in kept.split(' '))
    # Discard any word that contains at least one numeric character.
    return ' '.join(
        word for word in unique_words
        if not any(char.isnumeric() for char in word)
    )


def clean_data(input, type='Dataframe', output_path=_DEFAULT_CLEANED_CSV):
    """
    Clean either a dataframe of classifications or a single string so that embeddings can be calculated from it.

    The cleaning applied to each piece of text is:
    • tokenise with NLTK and drop tokens found in ``all_stopwords``
    • delete the characters [ ] { } ; ( ) , . : / - # ? @ £ $
    • lower-case the words and drop repeated words (first occurrence kept)
    • drop every word that contains a numeric character

    In 'Dataframe' mode each row is read by position. Cells whose string form is 'nan' are ignored,
    the last remaining cell is taken as the class code and the cells before it are stripped and
    joined into the description. Rows with no remaining cells are skipped, and rows whose cleaned
    description duplicates an earlier one are dropped.

    :param input: Either a dataframe or an individual string
    :param type: 'Dataframe' or 'String'; tells the function how to treat ``input``
    :param output_path: ('Dataframe' mode only) where the cleaned dataframe is written as CSV;
        pass None to skip writing. Defaults to the path this script has always written to.
    :return: (if dataframe), a dataframe with columns 'Class' and 'Description' (the cleaned description)
    :return: (if string), a cleaned version of the input string
    :raises ValueError: if ``type`` is neither 'Dataframe' nor 'String'
    """
    # Set membership is O(1); build it once rather than scanning the list per token.
    stopword_set = set(all_stopwords)

    if type == 'String':
        return _clean_text(input, stopword_set)
    if type != 'Dataframe':
        raise ValueError("type must be 'Dataframe' or 'String', got %r" % (type,))

    cleaned_rows = []
    for position in range(len(input)):
        # Positional access so dataframes without a default 0..n-1 index also work.
        cells = input.iloc[position, :].values.flatten().tolist()
        present = [cell for cell in cells if str(cell) != 'nan']
        if not present:
            continue
        *description_parts, class_code = present
        # str() lets non-string cells (e.g. numbers) through instead of crashing on .strip().
        description = ' '.join(str(part).strip() for part in description_parts)
        cleaned_rows.append([class_code, _clean_text(description, stopword_set)])

    cleaneddf = pd.DataFrame(cleaned_rows, columns=['Class', 'Description'])
    cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
    if output_path is not None:
        cleaneddf.to_csv(output_path, index=False)
    return cleaneddf
### Mean Pooler
def mean_pooling(model_output, attention_mask):
    """
    Performs a mean pooling to reduce dimension of embedding.

    The token embeddings are multiplied by the attention mask, summed along axis 1 and divided by
    the mask's own sum along axis 1 (clipped to at least 1e-9 to avoid division by zero).
    :param model_output: model output whose first element holds all token embeddings
    :param attention_mask: mask tensor; presumably (batch, tokens) with 1 for real tokens - TODO confirm
    :return: the pooled embedding, as produced by the TensorFlow division below
    """
    # NOTE(review): torch tensor methods (unsqueeze/expand/size) are combined with
    # TensorFlow ops here, so this relies on TensorFlow accepting those tensors.
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = tf.reduce_sum(token_embeddings * mask, 1)
    mask_total = tf.clip_by_value(mask.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
    return masked_total / mask_total
### Sentence Embedder
# (tokenizer, model) pairs that have already been loaded, keyed by model path, so
# repeated calls do not reload the model from disk every time.
_EMBEDDER_CACHE = {}


def _load_embedder(model_path):
    """Return the (tokenizer, model) pair for *model_path*, loading it on first use only."""
    if model_path not in _EMBEDDER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_path)  # instantiating the sentence embedder using HuggingFace library
        model = AutoModel.from_pretrained(model_path, from_tf=True)  # making a model instance from TensorFlow weights
        _EMBEDDER_CACHE[model_path] = (tokenizer, model)
    return _EMBEDDER_CACHE[model_path]


def sentence_embedder(sentences, model_path):
    """
    Calling the sentence similarity model to generate embeddings on input text.

    The tokenizer and model for a given ``model_path`` are loaded once and reused on later calls.
    :param sentences: takes input text in the form of a string
    :param model_path: path to the text similarity model
    :return: the mean-pooled embedding of the input text (described by the authors as (1, 384);
        the actual size depends on the model found at ``model_path``)
    """
    tokenizer, model = _load_embedder(model_path)
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings; no gradients are needed for inference.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])
### Sentence Embedding Preparation Function
# Deletes the bracket characters that appear in a printed tensor.
_EMBEDDING_BRACKETS = str.maketrans('', '', '()[]')


def convert_saved_embeddings(embedding_string):
    """
    Preparing pre-computed embeddings for use for comparison with new abstract embeddings.

    Pre-computed embeddings are saved as tensors in string format (e.g. 'tensor([[0.1, 0.2]])'),
    so they need to be converted back to numeric form in order to calculate cosine similarity.
    Anything after the last ']' (for example ', dtype=torch.float64)') is ignored.
    :param embedding_string: a tensor in string format, with comma-separated values
    :return: a float64 torch tensor of shape (1, number_of_values)
    :raises ValueError: if a comma-separated item cannot be parsed as a float
    """
    text = embedding_string
    # Printed tensors may carry metadata after the data (dtype=, requires_grad=, ...);
    # cut it off so it is not mistaken for a value.
    last_bracket = text.rfind(']')
    if last_bracket != -1:
        text = text[:last_bracket + 1]
    text = text.translate(_EMBEDDING_BRACKETS).replace('tensor', '').replace(' ', '')
    # float() tolerates the line breaks that long printed tensors contain.
    values = [float(item) for item in text.split(',')]
    embedding = np.expand_dims(np.array(values), axis=0)
    return torch.from_numpy(embedding)
### Generating Class Embeddings
Model_Path = 'Model_bert'  ### Insert Path to MODEL DIRECTORY here


def class_embbedding_generator(classes, output_path=None):
    """
    This function is to be used to generate and (optionally) save class embeddings.

    Takes an input of 'cleaned' classes, generated by clean_data function, and computes vector
    representations of these classes (the embeddings) with the model at ``Model_Path``.
    :param classes: dataframe whose first column is the class name and second column is the class
        description; should include all of the broad scope classes that comparisons are made against
    :param output_path: optional CSV path; when given, the embeddings dataframe is written there
    :return: dataframe with columns 'Class', 'Description' and 'Embedding' (one row per class)
    """
    class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
    for class_name, class_description in zip(classes.iloc[:, 0], classes.iloc[:, 1]):
        print(class_name)  # progress output: one line per class processed
        embedding = sentence_embedder(class_description, Model_Path)
        # Round-trip through NumPy so the stored value is a torch tensor; its printed form
        # ('tensor([[...]])') is what convert_saved_embeddings strips when reading it back.
        embedding = torch.from_numpy(embedding.numpy())
        class_embeddings.loc[len(class_embeddings)] = [class_name, class_description, embedding]
    if output_path is not None:
        class_embeddings.to_csv(output_path, index=False)
    return class_embeddings
### Broad Scope Classifier
# NOTE(review): repeats the Model_Path assignment made earlier in the file with the same value.
Model_Path = 'Model_bert'  ### Insert Path to MODEL DIRECTORY here

# Cosine-similarity score a green class must reach for the abstract to be flagged,
# per Sensitivity setting. Note that 'High' maps to the largest threshold.
_SENSITIVITY_THRESHOLDS = {'High': 0.5, 'Medium': 0.40, 'Low': 0.35}


def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium',
                                green_class_count=52):
    """
    Takes in pre-computed class embeddings and a single abstract embedding, and scores the abstract
    against every class by cosine similarity.

    :param class_embeddings: dataframe of class embeddings; the first column is the class name and
        the third column is the embedding saved in string format
    :param abstract_embedding: a single abstract embedding (any tensor exposing ``.numpy()``)
    :param N: N highest matching classes to return, from highest to lowest, default is 5
    :param Sensitivity: 'High', 'Medium' or 'Low'; selects the score threshold (0.5 / 0.40 / 0.35)
    :param green_class_count: how many rows at the end of ``class_embeddings`` are checked against
        the threshold (default 52; presumably the green classes - confirm against the class list)
    :return: predictions: a full dataframe of all the predictions on the classes,
        HighestSimilarity: dataframe of the N most similar classes,
        GreenLikelihood: the string 'True' if any checked class reaches the threshold, else 'False'
    :raises ValueError: if ``Sensitivity`` is not one of the supported settings
    """
    if Sensitivity not in _SENSITIVITY_THRESHOLDS:
        raise ValueError(
            "Sensitivity must be one of %s, got %r" % (sorted(_SENSITIVITY_THRESHOLDS), Sensitivity)
        )
    threshold = _SENSITIVITY_THRESHOLDS[Sensitivity]

    # Loop-invariant work: convert the abstract embedding and build the metric once.
    abstract_tensor = torch.from_numpy(abstract_embedding.numpy())
    cos = torch.nn.CosineSimilarity(dim=1)

    scored_rows = []
    for class_name, saved_embedding in zip(class_embeddings.iloc[:, 0], class_embeddings.iloc[:, 2]):
        class_tensor = convert_saved_embeddings(saved_embedding)
        score = cos(abstract_tensor, class_tensor).numpy().tolist()
        scored_rows.append([class_name, score[0]])
    predictions = pd.DataFrame(scored_rows, columns=['Class Name', 'Score'])
    # Keep the column numeric even when there are no rows, so nlargest still works.
    predictions['Score'] = predictions['Score'].astype(float)

    green_scores = predictions.tail(green_class_count)['Score']
    is_green = any(float(score) >= threshold for score in green_scores)
    # Returned as a string, not a bool, because that is what callers have always received.
    GreenLikelihood = 'True' if is_green else 'False'

    HighestSimilarity = predictions.nlargest(N, ['Score'])
    print(HighestSimilarity)
    print(GreenLikelihood)
    return predictions, HighestSimilarity, GreenLikelihood