# -*- coding: utf-8 -*-
"""TridentModel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1u07dSU0DoKnNzGzySXMTisXnaloqpUEO

TRIDENT MODEL IMPLEMENTATION

Date: 14 January 2023
Authors: Egheosa Ogbomo & Amran Mohammed (The Polymer Guys)
Description: This script combines three ML-based models to identify whether an
input text is related to green plastics or not.
"""

# pip install transformers

########## IMPORTING REQUIRED PYTHON PACKAGES ##########
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch  # required by mean_pooling and the embedding helpers below
import math
import time
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string

########## DEFINING FUNCTIONS FOR MODEL IMPLEMENTATIONS ##########

### Input data cleaner
all_stopwords = stopwords.words('english')  # making sure to only use English stopwords
extra_stopwords = ['ii', 'iii']  # extra stopwords to be removed from dataset/input abstracts
all_stopwords.extend(extra_stopwords)


def _clean_text(text):
    """
    Shared cleaning routine: removes stopwords, punctuation, duplicate words,
    and any token containing a digit, and lowercases the result.
    """
    text_tokens = word_tokenize(text)  # split into tokens so stopwords can be filtered out
    filtered_tokens = [word for word in text_tokens if word not in all_stopwords]  # remove stopwords
    row = " ".join(filtered_tokens)  # return the text to string form
    removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
    for char in removechars:
        row = row.replace(char, '')
    words = [x.lower() for x in row.split(' ')]
    words = list(dict.fromkeys(words))  # remove duplicate words, preserving order
    words = [x for x in words if not any(c.isnumeric() for c in x)]  # drop tokens containing numbers
    return ' '.join(words)


def clean_data(input, type='Dataframe'):
    """
    As preparation for use with the text similarity model, this function removes
    superfluous data from either a dataframe full of classifications or an input
    string, so that embeddings can be calculated for them.

    Removes:
        - Entries with missing abstracts/descriptions/classifications/typos
        - Duplicate entries
        - Unnecessary punctuation
        - Stop words (e.g. by, a, an, he, she, it)
        - URLs
        - Tokens containing numbers
    Assumes all entries are in the same language (English).

    :param input: Either a dataframe or an individual string
    :param type: Tells the function whether the input is a dataframe or an individual string
    :return: (if dataframe) a dataframe containing CPC classification codes and their associated 'cleaned' descriptions
    :return: (if string) a 'cleaned' version of the input string
    """
    if type == 'Dataframe':
        cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
        for i in range(len(input)):
            row_list = input.loc[i, :].values.flatten().tolist()
            noNaN_row = [x for x in row_list if str(x) != 'nan']  # drop missing entries
            if len(noNaN_row) > 0:
                # All columns except the last hold the description; the last holds the class code
                row = " ".join(x.strip() for x in noNaN_row[:-1])
                row = _clean_text(row)
                cleaneddf.loc[len(cleaneddf)] = [noNaN_row[-1], row]
        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
        cleaneddf.to_csv('E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv', index=False)  # local output path
        return cleaneddf

    elif type == 'String':
        return _clean_text(input)
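
### Example usage (illustrative sketch, not part of the original pipeline)
# A quick check of the string-cleaning branch of clean_data. The sample
# abstract below is invented for demonstration; the exact output depends on
# the NLTK stopword list downloaded above.
if __name__ == '__main__':
    sample_abstract = ("Method for recycling bio-based plastics, comprising: "
                       "a solvent (water); and a catalyst.")
    print(clean_data(sample_abstract, type='String'))
    # prints something like: 'method recycling biobased plastics comprising solvent water catalyst'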

### Mean Pooler
def mean_pooling(model_output, attention_mask):
    """
    Mean-pools token embeddings into a single sentence embedding, using the
    attention mask so that padding tokens do not contribute to the average.
    """
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
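
### Mean-pooling sanity check (illustrative sketch only)
# Demonstrates the masked averaging on dummy tensors: positions where the
# attention mask is 0 (padding) must not contribute to the pooled embedding.
# The (1, 4, 384) shape mimics a single 4-token input to the 384-dim model.
if __name__ == '__main__':
    dummy_tokens = torch.ones(1, 4, 384)       # (batch, seq_len, hidden_dim)
    dummy_mask = torch.tensor([[1, 1, 1, 0]])  # final position is padding
    pooled = mean_pooling((dummy_tokens,), dummy_mask)
    print(pooled.shape)  # torch.Size([1, 384])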

### Sentence Embedder
# def sentence_embedder(sentences, model_path):
#     """
#     Calls the sentence similarity model to generate an embedding of the input text.
#     :param sentences: input text in the form of a string
#     :param model_path: path to the text similarity model
#     :return: a (1, 384) embedding of the input text
#     """
#     tokenizer = AutoTokenizer.from_pretrained(model_path)  # instantiate the tokenizer using the HuggingFace library
#     model = AutoModel.from_pretrained(model_path, from_tf=True)  # make a model instance
#     encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
#     # Compute token embeddings
#     with torch.no_grad():
#         model_output = model(**encoded_input)
#     sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])  # (1, 384) tensor representation of input text
#     return sentence_embeddings

### Sentence Embedding Preparation Function
# def convert_saved_embeddings(embedding_string):
#     """
#     Prepares pre-computed embeddings for comparison with new abstract embeddings.
#     Pre-computed embeddings are saved as tensors in string format, so they need
#     to be converted back to numpy arrays in order to calculate cosine similarity.
#     :param embedding_string: a single (1, 384) tensor in string format
#     :return: the same embedding as a (1, 384) torch tensor
#     """
#     embedding = embedding_string.replace('(', '')
#     embedding = embedding.replace(')', '')
#     embedding = embedding.replace('[', '')
#     embedding = embedding.replace(']', '')
#     embedding = embedding.replace('tensor', '')
#     embedding = embedding.replace(' ', '')
#     embedding = embedding.split(',')
#     embedding = [float(x) for x in embedding]
#     embedding = np.array(embedding)
#     embedding = np.expand_dims(embedding, axis=0)
#     embedding = torch.from_numpy(embedding)
#     return embedding

### Generating Class Embeddings
# Model_Path = 'Model_bert'  ### Insert path to MODEL DIRECTORY here
# def class_embedding_generator(classes):
#     """
#     Generates class embeddings for later comparison.
#     Takes 'cleaned' classes produced by the clean_data function, computes vector
#     representations (embeddings) of their descriptions, and collects them in a dataframe.
#     :param classes: a dataframe of all broad-scope classes intended to be compared against
#     """
#     class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
#     for i in range(len(classes)):
#         class_name = classes.iloc[i, 0]
#         print(class_name)
#         class_description = classes.iloc[i, 1]
#         class_description_embedding = sentence_embedder(class_description, Model_Path)
#         embedding_entry = [class_name, class_description, class_description_embedding]
#         class_embeddings.loc[len(class_embeddings)] = embedding_entry
#     return class_embeddings

### Broad Scope Classifier
# def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
#     """
#     Scores an abstract embedding against every pre-computed class embedding by cosine similarity.
#     :param class_embeddings: dataframe of class embeddings
#     :param abstract_embedding: a single abstract embedding
#     :param N: number of highest-matching classes to return, from highest to lowest; default is 5
#     :param Sensitivity: 'High', 'Medium' or 'Low' threshold for flagging green classes
#     :return: predictions: a dataframe of predictions on all 9500+ classes;
#              HighestSimilarity: a dataframe of the N most similar classes;
#              GreenLikelihood: 'True' if any green class scores above the threshold
#     """
#     predictions = pd.DataFrame(columns=['Class Name', 'Score'])
#     cos = torch.nn.CosineSimilarity(dim=1)
#     abstract_embedding = torch.from_numpy(abstract_embedding.numpy())
#     for i in range(len(class_embeddings)):
#         class_name = class_embeddings.iloc[i, 0]
#         embedding = convert_saved_embeddings(class_embeddings.iloc[i, 2])
#         score = cos(abstract_embedding, embedding).numpy().tolist()
#         predictions.loc[len(predictions)] = [class_name, score[0]]
#     greenpredictions = predictions.tail(52)  # the final 52 classes are the green-related classes
#     if Sensitivity == 'High':
#         Threshold = 0.5
#     elif Sensitivity == 'Medium':
#         Threshold = 0.40
#     else:  # 'Low' (also the fallback, so Threshold is always defined)
#         Threshold = 0.35
#     GreenLikelihood = 'False'
#     for i in range(len(greenpredictions)):
#         if float(greenpredictions.iloc[i, 1]) >= Threshold:
#             GreenLikelihood = 'True'
#             break
#     HighestSimilarity = predictions.nlargest(N, ['Score'])
#     print(HighestSimilarity)
#     print(GreenLikelihood)
#     return predictions, HighestSimilarity, GreenLikelihood
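
### End-to-end sketch (commented out, mirroring the commented pipeline above)
# Illustrative only: this assumes the functions above are re-enabled, that the
# 'Model_bert' directory exists locally, and that class embeddings have been
# pre-computed and saved to 'class_embeddings.csv' (a hypothetical filename).
# raw_abstract stands for the abstract text to classify.
#
# abstract = clean_data(raw_abstract, type='String')
# abstract_embedding = sentence_embedder(abstract, Model_Path)
# class_embeddings = pd.read_csv('class_embeddings.csv')
# predictions, HighestSimilarity, GreenLikelihood = broad_scope_class_predictor(
#     class_embeddings, abstract_embedding, N=5, Sensitivity='Medium')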