# -*- coding: utf-8 -*-
"""TridentModel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1u07dSU0DoKnNzGzySXMTisXnaloqpUEO

TRIDENT MODEL IMPLEMENTATION

Date: 14 January 2023
Authors:  Egheosa Ogbomo & Amran Mohammed (The Polymer Guys)
Description: This script combines three ML-based models to identify whether an input text is related to green plastics or not.
"""

#pip install transformers

########## IMPORTING REQUIRED PYTHON PACKAGES ##########
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import math
import time
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string

########## DEFINING FUNCTIONS FOR MODEL IMPLEMENTATIONS ##########

### Input data cleaner
all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
all_stopwords.extend(extra_stopwords)
def clean_data(input, type='Dataframe'):
    """
    As preparation for use with the text similarity model, this function removes superfluous data from either a dataframe full of
    classifications, or an input string, so that embeddings can be calculated for them. Removes:
    •	Entries with missing abstracts/descriptions/classifications/typos
    •	Duplicate entries
    •	Unnecessary punctuation
    •	Stop words (e.g., by, a, an, he, she, it)
    •	URLs
    •	Numbers and duplicate words within an entry
    All entries are assumed to be in the same language.

    :param input: Either a dataframe or an individual string
    :param type: Tells the function whether the input is a dataframe or an individual string
    :return: (if dataframe) a dataframe containing CPC classification codes and their associated 'cleaned' descriptions
    :return: (if string) a 'cleaned' version of the input string
    """
    if type == 'Dataframe':
        cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
        for i in range(0, len(input)):
            row_list = input.loc[i, :].values.flatten().tolist()
            noNaN_row = [x for x in row_list if str(x) != 'nan']
            listrow = []
            if len(noNaN_row) > 0:
                row = noNaN_row[:-1]
                row = [x.strip() for x in row]
                row = (" ").join(row)
                text_tokens = word_tokenize(row)  # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
                Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords]  # removes stopwords
                row = (" ").join(Stopword_Filtered_List)  # returns abstract to string form
                removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
                for char in removechars:
                    row = row.replace(char, '')  # strip unwanted punctuation directly from the string
                wnum = row.split(' ')
                wnum = [x.lower() for x in wnum]
                #remove duplicate words
                wnum = list(dict.fromkeys(wnum))
                #removing numbers
                wonum = []
                for x in wnum:
                    xv = list(x)
                    xv = [i.isnumeric() for i in xv]
                    if True in xv:
                        continue
                    else:
                        wonum.append(x)
                row = ' '.join(wonum)
                l = [noNaN_row[-1], row]
                cleaneddf.loc[len(cleaneddf)] = l
        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
        cleaneddf.to_csv('E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv', index=False)
        return cleaneddf

    elif type == 'String':
        text_tokens = word_tokenize(input)  # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
        Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords]  # removes stopwords
        row = (" ").join(Stopword_Filtered_List)  # returns abstract to string form
        removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
        for char in removechars:
            row = row.replace(char, '')  # strip unwanted punctuation directly from the string
        wnum = row.split(' ')
        wnum = [x.lower() for x in wnum]
        # remove duplicate words
        wnum = list(dict.fromkeys(wnum))
        # removing numbers
        wonum = []
        for x in wnum:
            xv = list(x)
            xv = [i.isnumeric() for i in xv]
            if True in xv:
                continue
            else:
                wonum.append(x)
        row = ' '.join(wonum)
        return row
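
# Example usage (illustrative sketch, not part of the pipeline): cleaning a single
# abstract string before embedding. The sample text below is hypothetical.
#
# sample_abstract = "A method for recycling biodegradable plastics (PLA), see https://example.com and Fig. 3."
# cleaned_abstract = clean_data(sample_abstract, type='String')
# print(cleaned_abstract)  # lower-cased, de-duplicated tokens with stopwords/punctuation/numbers removed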

### Mean Pooler
"""
Performs a mean pooling to reduce dimension of embedding
"""
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
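
# Illustrative sketch (hypothetical shapes): mean pooling collapses per-token embeddings
# of shape (batch, seq_len, hidden) into one (batch, hidden) vector per input.
#
# dummy_tokens = torch.randn(1, 6, 384)             # e.g. 6 tokens, 384-dim hidden states
# dummy_mask = torch.tensor([[1, 1, 1, 1, 0, 0]])   # last two positions are padding
# pooled = mean_pooling((dummy_tokens,), dummy_mask)
# print(pooled.shape)  # torch.Size([1, 384])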

### Sentence Embedder
def sentence_embedder(sentences, model_path):
  """
  Calls the sentence similarity model to generate an embedding of the input text.
  :param sentences: input text in the form of a string
  :param model_path: path to the text similarity model
  :return: a (1, 384) embedding of the input text
  """
  tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
  model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
  # Compute token embeddings
  with torch.no_grad():
    model_output = model(**encoded_input)
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
  return sentence_embeddings
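
# Example usage (illustrative sketch): embed a cleaned abstract with the model stored in the
# 'Model_bert' directory referenced below. The sample text is hypothetical.
#
# abstract_embedding = sentence_embedder(clean_data("Biodegradable polymer packaging", type='String'), 'Model_bert')
# print(abstract_embedding.shape)  # expected: torch.Size([1, 384])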

### Sentence Embedding Preparation Function
def convert_saved_embeddings(embedding_string):
    """
    Prepares pre-computed embeddings for comparison with new abstract embeddings.
    Pre-computed embeddings are saved as tensors in string format, so they need to be converted back into tensors in order to calculate cosine similarity.
    :param embedding_string: a single embedding of dims (1, 384), saved as a tensor in string format
    :return: the embedding as a torch tensor of shape (1, 384)
    """
    embedding = embedding_string.replace('(', '')
    embedding = embedding.replace(')', '')
    embedding = embedding.replace('[', '')
    embedding = embedding.replace(']', '')
    embedding = embedding.replace('tensor', '')
    embedding = embedding.replace(' ', '')
    embedding = embedding.split(',')
    embedding = [float(x) for x in embedding]
    embedding = np.array(embedding)
    embedding = np.expand_dims(embedding, axis=0)
    embedding = torch.from_numpy(embedding)
    return embedding
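
# Example usage (illustrative sketch with a short hypothetical string; real saved
# embeddings contain 384 values):
#
# saved = "tensor([[0.0123, -0.0456, 0.0789]])"
# restored = convert_saved_embeddings(saved)
# print(restored.shape)  # torch.Size([1, 3]) here; (1, 384) for real saved embeddings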


### Generating Class Embeddings

Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
def class_embbedding_generator(classes):
    """
    This function is used to generate class embeddings.
    Takes an input of 'cleaned' classes, generated by the clean_data function, computes vector representations of these classes (the embeddings),
    and returns them as a dataframe so that they can be saved to csv.
    :param classes: a dataframe including all of the broad scope classes that are intended to be used to make comparisons with
    :return: a dataframe with columns 'Class', 'Description' and 'Embedding'
    """
    class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
    for i in range(len(classes)):
        class_name = classes.iloc[i, 0]
        print(class_name)
        class_description = classes.iloc[i, 1]
        class_description_embedding = sentence_embedder(class_description, Model_Path)
        class_description_embedding = class_description_embedding.numpy()
        class_description_embedding = torch.from_numpy(class_description_embedding)
        embedding_entry = [class_name, class_description, class_description_embedding]
        class_embeddings.loc[len(class_embeddings)] = embedding_entry
    return class_embeddings
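
# Example usage (illustrative sketch): generate and save embeddings for the cleaned
# classes. The csv filenames here are hypothetical placeholders.
#
# cleaned_classes = pd.read_csv('cleaned_classes.csv')
# class_embeddings = class_embbedding_generator(cleaned_classes)
# class_embeddings.to_csv('class_embeddings.csv', index=False)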

### Broad Scope Classifier
Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
    """
    Takes in pre-computed class embeddings and a single abstract embedding, and scores every class by its cosine similarity to the abstract.
    :param class_embeddings: dataframe of class embeddings
    :param abstract_embedding: a single abstract embedding
    :param N: the N highest matching classes to return, from highest to lowest, default is 5
    :param Sensitivity: 'High', 'Medium' or 'Low'; sets the similarity threshold used when flagging green plastics classes
    :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: dataframe of the N most similar classes, GreenLikelihood: 'True' if any green plastics class scores above the threshold, otherwise 'False'
    """
    predictions = pd.DataFrame(columns=['Class Name', 'Score'])
    for i in range(len(class_embeddings)):
        class_name = class_embeddings.iloc[i, 0]
        embedding = class_embeddings.iloc[i, 2]
        embedding = convert_saved_embeddings(embedding)
        abstract_embedding = abstract_embedding.numpy()
        abstract_embedding = torch.from_numpy(abstract_embedding)
        cos = torch.nn.CosineSimilarity(dim=1)
        score = cos(abstract_embedding, embedding).numpy().tolist()
        result = [class_name, score[0]]
        predictions.loc[len(predictions)] = result
    greenpredictions = predictions.tail(52)  # the final 52 classes in the embeddings file are treated as the green plastics classes
    if Sensitivity == 'High':
        Threshold = 0.5
    elif Sensitivity == 'Medium':
        Threshold = 0.40
    elif Sensitivity == 'Low':
        Threshold = 0.35
    else:
        Threshold = 0.40  # fall back to the 'Medium' threshold if an unrecognised Sensitivity value is passed
    GreenLikelihood = 'False'
    for i in range(len(greenpredictions)):
        score = greenpredictions.iloc[i, 1]
        if float(score) >= Threshold:
            GreenLikelihood = 'True'
            break
        else:
            continue
    HighestSimilarity = predictions.nlargest(N, ['Score'])
    print(HighestSimilarity)
    print(GreenLikelihood)
    return predictions, HighestSimilarity, GreenLikelihood
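
# Example end-to-end usage (illustrative sketch; the filename and abstract text are
# hypothetical placeholders):
#
# class_embeddings = pd.read_csv('class_embeddings.csv')
# abstract = "A compostable packaging film made from polylactic acid blends."
# abstract_embedding = sentence_embedder(clean_data(abstract, type='String'), Model_Path)
# predictions, HighestSimilarity, GreenLikelihood = broad_scope_class_predictor(
#     class_embeddings, abstract_embedding, N=5, Sensitivity='Medium')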