File size: 16,689 Bytes
f084e2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
from cherche import retrieve
from sentence_transformers import SentenceTransformer, util
from transformers import RobertaTokenizer, RobertaModel, EncoderDecoderModel
from config import classifier_class_mapping, config
import pandas as pd
import numpy as np 
import pickle
import torch
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

class wrappedTokenizer(RobertaTokenizer):
    '''
    RobertaTokenizer variant whose call interface only tokenizes

    Calling an instance forwards the text to `tokenize` and hands back its result,
    instead of running the regular tokenizer call.
    '''
    def __call__(self, text_input):
        tokens = self.tokenize(text_input)
        return tokens

def generate_index(db):
    '''
    Function to build the list of documents indexed by the retrieval models

    Params:
    db (pandas dataframe): a two-column dataframe whose rows are (id, library name)

    Returns:
    index_list (list): one dictionary per row with the keys 'id' and 'library' (lower-cased name)
    '''
    rows = db.copy().values
    return [
        {'id': row_id, 'library': name.lower()}
        for row_id, name in rows
    ]

def load_db(db_metadata_path, db_constructor_path):
    '''
    Function to read the two CSV databases and discard incomplete rows

    Params:
    db_metadata_path (string): the path to the db_metadata file
    db_constructor_path (string): the path to the db_constructor file

    Output:
    db_metadata (pandas dataframe): the metadata table, without rows that contain a missing value
    db_constructor (pandas dataframe): the constructor table, without rows that contain a missing value
    '''
    frames = []
    for csv_path in (db_metadata_path, db_constructor_path):
        frame = pd.read_csv(csv_path)
        # rows with any missing cell are removed; the original row labels are kept
        frame.dropna(inplace=True)
        frames.append(frame)
    db_metadata, db_constructor = frames
    return db_metadata, db_constructor



def load_retrieval_model_lexical(tokenizer_path, max_k, db_metadata):
    '''
    Function to build the BM25 retriever over the library names

    Params:
    tokenizer_path (string): the path to a tokenizer (can be a path to either a huggingface model or local directory)
    max_k (int): the maximum number of returned sequences
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library;
        only its 'id' and 'library' columns are used

    Returns:
    retrieval_model: a cherche BM25Okapi retriever keyed on 'id' that searches the 'library' field
    '''
    # documents to index: one {'id', 'library'} dictionary per library
    documents = generate_index(db_metadata[['id', 'library']])

    # the wrapped tokenizer is passed to BM25 as its tokenizer callable
    bm25_tokenizer = wrappedTokenizer.from_pretrained(tokenizer_path)

    return retrieve.BM25Okapi(
        key='id',
        on='library',
        documents=documents,
        k=max_k,
        tokenizer=bm25_tokenizer,
    )


def load_retrieval_model_deep_learning(model_path, max_k, db_metadata):
    '''
    Function to load a deep learning-based model

    Params:
    model_path (string): the path to the model (can be a path to either a huggingface model or local directory)
    max_k (int): the maximum number of returned sequences
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library;
        only its 'id' and 'library' columns are used

    Returns:
    retrieval_model: a cherche Encoder retriever with the library documents added
    '''
    # generate index
    index_list = generate_index(db_metadata[['id', 'library']])

    # load model
    retrieval_model = retrieve.Encoder(
        key='id',
        on='library',
        encoder=SentenceTransformer(model_path).encode,
        k=max_k,
        # NOTE(review): presumably the file where cherche stores the embeddings - confirm
        path="../temp/dl.pkl"
    )
    # Add the documents to the retriever built above. The previous code called
    # `.add` on an undefined name (`dl_retriever`) and raised NameError.
    # NOTE(review): relies on `add` returning the retriever itself, as the
    # original assignment assumed - confirm against the installed cherche version.
    retrieval_model = retrieval_model.add(documents=index_list)

    return retrieval_model

def load_generative_model_codebert(model_path):
    '''
    Function to load the tokenizer and the encoder-decoder model from one checkpoint

    Params:
    model_path (string): path to the model (can be a path to either a huggingface model or local directory)

    Returns:
    tokenizer: a RobertaTokenizer loaded from model_path
    generative_model: an EncoderDecoderModel loaded from model_path
    '''
    # the tokenizer is loaded first, then the model, both from the same checkpoint
    generative_tokenizer = RobertaTokenizer.from_pretrained(model_path)
    return generative_tokenizer, EncoderDecoderModel.from_pretrained(model_path)


def _is_missing_description(value):
    '''
    Helper to tell whether a description cell holds no usable text

    Params:
    value: the content of a description cell

    Returns:
    missing (bool): True for the literal string 'nan' and for real missing values (NaN / None)
    '''
    return bool(pd.isna(value)) or value == 'nan'


def get_metadata_library(predictions, db_metadata):
    '''
    Function to attach the library metadata to each prediction using the library unique id

    Params:
    predictions (list): a list of dictionary containing the prediction details (each with an 'id' key)
    db_metadata: a dataframe containing metadata information about the library; the columns
        'id', 'cat', 'url', 'desc_ardulib' and 'desc_repo' are read

    Returns:
    predictions_cp (list): copies of the prediction dictionaries extended with the keys
        'Sensor Type', 'Github URL' and 'Description'; the input dictionaries are left untouched

    Raises:
    AssertionError: if an id does not match exactly one row of db_metadata
    '''
    # Copy each dictionary: a plain list copy would still share (and mutate) the caller's dicts.
    predictions_cp = [dict(prediction_dict) for prediction_dict in predictions]
    for prediction_dict in predictions_cp:
        temp_db = db_metadata[db_metadata.id == prediction_dict.get('id')]
        assert len(temp_db) == 1
        row = temp_db.iloc[0]

        prediction_dict['Sensor Type'] = row['cat'].capitalize()
        prediction_dict['Github URL'] = row['url']

        # prefer the description from the arduino library list, if not found use the repo description
        if not _is_missing_description(row.desc_ardulib):
            prediction_dict['Description'] = row.desc_ardulib

        elif not _is_missing_description(row.desc_repo):
            prediction_dict['Description'] = row.desc_repo

        else:
            prediction_dict['Description'] = "Description not found"
        print(prediction_dict)
        print("-----------------------------------------------------------------")
    return predictions_cp

def id_to_libname(id_, db_metadata):
    '''
    Function to look up the library name that belongs to a library id

    Params:
    id_ (int): a unique library id
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library

    Returns:
    library_name (string): the value of the 'library' column for the single row whose id equals id_
    '''
    matches = db_metadata.loc[db_metadata.id == id_]
    # the id must identify exactly one library
    assert len(matches) == 1
    return matches.iloc[0].library


def retrieve_libraries(retrieval_model, model_input, db_metadata):
    '''
    Function to run the retrieval model on a query and resolve the hits to library names

    Params:
    retrieval_model: a callable retrieval model; each returned item must expose its library id under 'id'
    model_input (string): an input query from the user
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library

    Returns:
    library_ids (list): the ids of the retrieved libraries, in the order returned by the model
    library_names (list): the library names matching library_ids, position by position
    '''
    hits = retrieval_model(model_input)
    library_ids = [hit.get('id') for hit in hits]
    library_names = [id_to_libname(library_id, db_metadata) for library_id in library_ids]
    return library_ids, library_names

def prepare_input_generative_model(library_ids, db_constructor):
    '''
    Function to build the text inputs of the model that generates API usage patterns

    Params:
    library_ids (list): a list of library ids
    db_constructor (pandas dataframe): a four-column dataframe whose rows are
        (id, library name, methods, constructor)

    Returns:
    output_dict (dictionary): maps each library id to a list of strings of the form
        '<library name> [SEP] <constructor>', one per matching row (empty list if no row matches)
    '''
    def _inputs_for(library_id):
        rows = db_constructor[db_constructor.id == library_id].values
        return [
            f'{library_name} [SEP] {constructor}'
            for _row_id, library_name, _methods, constructor in rows
        ]

    return {library_id: _inputs_for(library_id) for library_id in library_ids}

def generate_api_usage_patterns(generative_model, tokenizer, model_input, num_beams, num_return_sequences):
    '''
    Function to generate API usage patterns for a single input string

    Params:
    generative_model: a huggingface model
    tokenizer: a huggingface tokenizer
    model_input (string): a string in the form of <library-name> [SEP] constructor
    num_beams (int): the beam width used for decoding
    num_return_sequences (int): how many API usage patterns are returned by the model

    Returns:
    api_usage_patterns (list): the decoded sequences produced by the model, special tokens removed
    '''
    # encode the text into input ids (pytorch tensors)
    input_ids = tokenizer(model_input, return_tensors='pt').input_ids

    generated = generative_model.generate(
        input_ids,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )

    # turn every generated id sequence back into text
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def generate_api_usage_patterns_batch(generative_model, tokenizer, library_ids, db_constructor, num_beams, num_return_sequences):
    '''
    Function to generate API usage patterns in batch

    Params:
    generative_model: a huggingface model
    tokenizer: a huggingface tokenizer
    library_ids (list): a list of library ids
    db_constructor (pandas dataframe):  a dataframe containing the mapping of library names to valid constructor
    num_beams (int): the beam width used for decoding
    num_return_sequences (int): how many API usage patterns are returned by the model

    Returns:
    predictions (list): one dictionary per library id with the keys 'id', 'library_name',
        'hw_config' (always None here) and 'usage_patterns' (constructor -> list of patterns).
        A library without any constructor row gets library_name None and empty usage_patterns.

    Raises:
    AssertionError: if the same constructor appears twice for one library
    '''
    input_generative_model_dict = prepare_input_generative_model(library_ids, db_constructor)

    predictions = []
    for id_, model_inputs in input_generative_model_dict.items():
        # Reset per library: the name is only known from the model inputs, so a
        # library without inputs must not inherit the name of the previous one
        # (or hit an unbound variable when it comes first).
        library_name = None
        usage_patterns = {}

        for input_generative_model in model_inputs:
            api_usage_patterns = generate_api_usage_patterns(
                generative_model,
                tokenizer,
                input_generative_model,
                num_beams,
                num_return_sequences
            )

            # inputs have the form '<library name> [SEP] <constructor>'
            temp = input_generative_model.split("[SEP]")
            library_name = temp[0].strip()
            constructor = temp[1].strip()

            assert constructor not in usage_patterns
            usage_patterns[constructor] = api_usage_patterns

        predictions.append({
            'id': id_,
            'library_name': library_name,
            'hw_config': None,
            'usage_patterns': usage_patterns
        })
    return predictions

# def generate_api_usage_patterns(generative_model, tokenizer, model_inputs, num_beams, num_return_sequences):
#     '''
#     Function to generate API usage patterns

#     Params:
#     generative_model: a huggingface model
#     tokenizer: a huggingface tokenizer
#     model_inputs (list): a list of <library-name> [SEP] <constructor>
#     num_beams (int): the beam width used for decoding
#     num_return_sequences (int): how many API usage patterns are returned by the model

#     Returns:
#     api_usage_patterns (list): a list of API usage patterns
#     '''
#     model_inputs = tokenizer(
#         model_inputs, 
#         max_length=max_length,
#         padding='max_length',
#         return_tensors='pt',
#         truncation=True)
    
#     model_output = generative_model.generate(
#         **model_inputs,
#         num_beams=num_beams,
#         num_return_sequences=num_return_sequences
#     )
#     api_usage_patterns = tokenizer.batch_decode(
#         model_output,
#         skip_special_tokens=True
#     )

#     api_usage_patterns = [api_usage_patterns[i:i+num_return_sequences] for i in range(0, len(api_usage_patterns), num_return_sequences)] 
#     return api_usage_patterns

def prepare_input_classification_model(id_, db_metadata):
    '''
    Function to fetch the classification feature stored for a library id

    Params:
    id_ (int): a unique library id
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library

    Returns:
    feature (string): the value of the 'features' column for the single row whose id equals id_
    '''
    matches = db_metadata.loc[db_metadata.id == id_]
    # the id must identify exactly one library
    assert len(matches) == 1
    return matches.iloc[0].features

def load_hw_classifier(model_path_classifier, model_path_classifier_head):
    '''
    Function to load the feature encoder, its tokenizer and the pickled classifier head

    Params:
    model_path_classifier (string): path to the classifier checkpoint (can be either huggingface path or local directory)
    model_path_classifier_head (string): path to the pickled classifier head (a local file)

    Returns:
    classifier_model: a RobertaModel loaded from model_path_classifier
    classifier_head: the object unpickled from model_path_classifier_head
    tokenizer: a RobertaTokenizer loaded from model_path_classifier
    '''
    tokenizer = RobertaTokenizer.from_pretrained(model_path_classifier)
    classifier_model = RobertaModel.from_pretrained(model_path_classifier)

    # the head is stored as a pickle file, so it is read in binary mode
    with open(model_path_classifier_head, 'rb') as head_file:
        classifier_head = pickle.load(head_file)

    return classifier_model, classifier_head, tokenizer

def predict_hw_config(classifier_model, classifier_tokenizer, classifier_head, library_ids, db_metadata, max_length):
    '''
    Function to predict hardware configs

    Params:
    classifier_model: a huggingface model to convert a feature to a feature vector
    classifier_tokenizer: a huggingface tokenizer
    classifier_head: a classifier head exposing predict_proba
    library_ids (list): a list of library ids
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
    max_length (int): max length of the tokenizer output

    Returns:
    prediction (list): one predicted label per library id (looked up in classifier_class_mapping),
        in the same order as library_ids; an empty list when library_ids is empty
    '''
    features = [prepare_input_classification_model(id_, db_metadata) for id_ in library_ids]

    # Nothing to classify (e.g. the retrieval step found no library): return early
    # instead of pushing an empty batch through the tokenizer, encoder and argmax.
    if not features:
        return []

    tokenized_features = classifier_tokenizer(
            features,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )
    # inference only: no gradients needed to compute the pooled embeddings
    with torch.no_grad():
        embedding_features = classifier_model(**tokenized_features).pooler_output.numpy()

    # pick, for every sample, the class with the highest probability
    probabilities = classifier_head.predict_proba(embedding_features)
    class_indices = np.argmax(probabilities, axis=1).tolist()
    prediction = [classifier_class_mapping.get(idx) for idx in class_indices]
    return prediction


def initialize_all_components(config):
    '''
    Function to load the databases and every model used by ArduProg

    Params:
    config (dict): a dictionary containing the configuration to initialize all components; the keys
        'db_metadata_path', 'db_constructor_path', 'tokenizer_path_retrieval', 'max_k',
        'model_path_generative', 'model_path_classifier' and 'classifier_head_path' are read

    Returns:
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
    db_constructor (pandas dataframe): a dataframe containing the mapping of library names to valid constructor
    model_retrieval: the lexical (BM25) retrieval model
    model_generative: the generative model
    tokenizer_generative: the tokenizer of the generative model
    model_classifier: the model that embeds the classification features
    classifier_head: the classifier head loaded from disk
    tokenizer_classifier: the tokenizer of the classifier model
    '''
    # databases
    metadata_path = config.get('db_metadata_path')
    constructor_path = config.get('db_constructor_path')
    db_metadata, db_constructor = load_db(metadata_path, constructor_path)

    # retrieval
    model_retrieval = load_retrieval_model_lexical(
        config.get('tokenizer_path_retrieval'),
        config.get('max_k'),
        db_metadata,
    )

    # generation
    generative_path = config.get('model_path_generative')
    tokenizer_generative, model_generative = load_generative_model_codebert(generative_path)

    # hardware-config classification
    model_classifier, classifier_head, tokenizer_classifier = load_hw_classifier(
        config.get('model_path_classifier'),
        config.get('classifier_head_path'),
    )

    return (
        db_metadata,
        db_constructor,
        model_retrieval,
        model_generative,
        tokenizer_generative,
        model_classifier,
        classifier_head,
        tokenizer_classifier,
    )

def make_predictions(input_query, 
    model_retrieval, 
    model_generative,  
    model_classifier, classifier_head,
    tokenizer_generative, tokenizer_classifier,
    db_metadata, db_constructor,
    config):
    '''
    Function to run the full pipeline: retrieve libraries, generate API usage patterns,
    predict the hw configs and attach the library metadata

    Params:
    input_query (string): a query from the user
    model_retrieval: the retrieval model
    model_generative: the generative model
    model_classifier: the model that embeds the classification features
    classifier_head: the classifier head
    tokenizer_generative, tokenizer_classifier: a huggingface tokenizer
    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
    db_constructor (pandas dataframe): a dataframe containing the mapping of library names to valid constructor
    config (dict): a dictionary containing the configuration; the keys 'num_beams',
        'num_return_sequences' and 'max_length' are read

    Returns:
    predictions (list): a list of dictionary containing the prediction details
    '''
    # step 1: retrieval (only the ids are needed further down)
    library_ids, _library_names = retrieve_libraries(model_retrieval, input_query, db_metadata)

    # step 2: API usage patterns for every retrieved library
    num_beams = config.get('num_beams')
    num_return_sequences = config.get('num_return_sequences')
    predictions = generate_api_usage_patterns_batch(
        model_generative,
        tokenizer_generative,
        library_ids,
        db_constructor,
        num_beams,
        num_return_sequences,
    )

    # step 3: hardware configuration for every retrieved library
    max_length = config.get('max_length')
    hw_configs = predict_hw_config(
        model_classifier,
        tokenizer_classifier,
        classifier_head,
        library_ids,
        db_metadata,
        max_length,
    )

    # predictions and hw configs are paired by position
    for hw_config, output_dict in zip(hw_configs, predictions):
        output_dict['hw_config'] = hw_config

    # step 4: enrich with sensor type, url and description
    return get_metadata_library(predictions, db_metadata)