Spaces:

imamnurby
/

ArduinoProg

Build error

App Files Files Community

imamnurby commited on Nov 23, 2022

Commit

f084e2e

•

1 Parent(s): f778d0e

Create backend_utils.py

Browse files

Files changed (1) hide show

backend_utils.py +461 -0

backend_utils.py ADDED Viewed

	@@ -0,0 +1,461 @@

+from cherche import retrieve
+from sentence_transformers import SentenceTransformer, util
+from transformers import RobertaTokenizer, RobertaModel, EncoderDecoderModel
+from config import classifier_class_mapping, config
+import pandas as pd
+import numpy as np
+import pickle
+import torch
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.ensemble import RandomForestClassifier
+class wrappedTokenizer(RobertaTokenizer):
+    def __call__(self, text_input):
+        return self.tokenize(text_input)
+def generate_index(db):
+    db_cp = db.copy()
+    index_list = []
+    for id_, dirname in db_cp.values:
+        index_list.append(
+        {
+            'id': id_,
+            'library': dirname.lower()
+        })
+    return index_list
+def load_db(db_metadata_path, db_constructor_path):
+    '''
+    Function to load dataframe
+    Params:
+    db_metadata_path (string): the path to the db_metadata file
+    db_constructor_path (string): the path to the db_constructor file
+    Output:
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    db_constructor (pandas dataframe): a dataframe containing the mapping of library names to valid constructor
+    '''
+    db_metadata = pd.read_csv(db_metadata_path)
+    db_metadata.dropna(inplace=True)
+    db_constructor = pd.read_csv(db_constructor_path)
+    db_constructor.dropna(inplace=True)
+    return db_metadata, db_constructor
+def load_retrieval_model_lexical(tokenizer_path, max_k, db_metadata):
+    '''
+    Function to load BM25 model
+    Params:
+    tokenizer_path (string): the path to a tokenizer (can be a path to either a huggingface model or local directory)
+    max_k (int): the maximum number of returned sequences
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    Returns:
+    retrieval_model: a retrieval model
+    '''
+    # generate index
+    index_list = generate_index(db_metadata[['id', 'library']])
+    # load model
+    tokenizer = wrappedTokenizer.from_pretrained(tokenizer_path)
+    retrieval_model = retrieve.BM25Okapi(
+        key='id',
+        on='library',
+        documents=index_list,
+        k=max_k,
+        tokenizer=tokenizer
+    )
+    return retrieval_model
+def load_retrieval_model_deep_learning(model_path, max_k, db_metadata):
+    '''
+    Function to load a deep learning-based model
+    Params:
+    model_path (string): the path to the model (can be a path to either a huggingface model or local directory)
+    max_k (int): the maximum number of returned sequences
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    Returns:
+    retrieval_model: a retrieval model
+    '''
+    # generate index
+    index_list = generate_index(db_metadata[['id', 'library']])
+    # load model
+    retrieval_model = retrieve.Encoder(
+        key='id',
+        on='library',
+        encoder=SentenceTransformer(model_path).encode,
+        k=max_k,
+        path=f"../temp/dl.pkl"
+    )
+    retrieval_model = dl_retriever.add(documents=index_list)
+    return retrieval_model
+def load_generative_model_codebert(model_path):
+    '''
+    Function load a generative model using codebert checkpoint
+    Params:
+    model_path (string): path to the model (can be a path to either a huggingface model or local directory)
+    Returns:
+    tokenizer: a huggingface tokenizer
+    generative_model: a generative model to generate API pattern given the library name as the input
+    '''
+    tokenizer = RobertaTokenizer.from_pretrained(model_path)
+    generative_model = EncoderDecoderModel.from_pretrained(model_path)
+    return tokenizer, generative_model
+def get_metadata_library(predictions, db_metadata):
+    '''
+    Function to get the metadata of a library using the library unique id
+    Params:
+    predictions (list): a list of dictionary containing the prediction details
+    db_metadata: a dataframe containing metadata information about the library
+    Returns:
+    metadata_dict (dict): a dictionary where the key is the metadata type and the value is the metadata value
+    '''
+    predictions_cp = predictions.copy()
+    for prediction_dict in predictions_cp:
+        temp_db = db_metadata[db_metadata.id==prediction_dict.get('id')]
+        assert(len(temp_db)==1)
+        prediction_dict['Sensor Type'] = temp_db.iloc[0]['cat'].capitalize()
+        prediction_dict['Github URL'] = temp_db.iloc[0]['url']
+        # prefer the description from the arduino library list, if not found use the repo description
+        if temp_db.iloc[0].desc_ardulib != 'nan':
+            prediction_dict['Description'] = temp_db.iloc[0].desc_ardulib
+        elif temp_db.iloc[0].desc_repo != 'nan':
+            prediction_dict['Description'] = temp_db.iloc[0].desc_repo
+        else:
+            prediction_dict['Description'] = "Description not found"
+        print(prediction_dict)
+        print("-----------------------------------------------------------------")
+    return predictions_cp
+def id_to_libname(id_, db_metadata):
+    '''
+    Function to convert a library id to its library name
+    Params:
+    id_ (int): a unique library id
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    Returns:
+    library_name (string): the library name that corresponds to the input id
+    '''
+    temp_db = db_metadata[db_metadata.id==id_]
+    assert(len(temp_db)==1)
+    library_name = temp_db.iloc[0].library
+    return library_name
+def retrieve_libraries(retrieval_model, model_input, db_metadata):
+    '''
+    Function to retrieve a set of relevant libraries using a model based on the input query
+    Params:
+    retrieval_model: a model to perform retrieval
+    model_input (string): an input query from the user
+    Returns:
+    library_ids (list): a list of library unique ids
+    library_names (list): a list of library names
+    '''
+    results = retrieval_model(model_input)
+    library_ids = [item.get('id') for item in results]
+    library_names = [id_to_libname(item, db_metadata) for item in library_ids]
+    return library_ids, library_names
+def prepare_input_generative_model(library_ids, db_constructor):
+    '''
+    Function to prepare the input of the model to generate API usage patterns
+    Params:
+    library_ids (list): a list of library ids
+    db_constructor (pandas dataframe): a dataframe containing the mapping of library names to valid constructor
+    Returns:
+    output_dict (dictionary): a dictionary where the key is library id and the value is a list of valid inputs
+    '''
+    output_dict = {}
+    for id_ in library_ids:
+        temp_db = db_constructor[db_constructor.id==id_]
+        output_dict[id_] = []
+        for id__, library_name, methods, constructor in temp_db.values:
+            output_dict[id_].append(
+                f'{library_name} [SEP] {constructor}'
+            )
+    return output_dict
+def generate_api_usage_patterns(generative_model, tokenizer, model_input, num_beams, num_return_sequences):
+    '''
+    Function to generate API usage patterns
+    Params:
+    generative_model: a huggingface model
+    tokenizer: a huggingface tokenizer
+    model_input (string): a string in the form of <library-name> [SEP] constructor
+    num_beams (int): the beam width used for decoding
+    num_return_sequences (int): how many API usage patterns are returned by the model
+    Returns:
+    api_usage_patterns (list): a list of API usage patterns
+    '''
+    model_input = tokenizer(model_input, return_tensors='pt').input_ids
+    model_output = generative_model.generate(
+        model_input,
+        num_beams=num_beams,
+        num_return_sequences=num_return_sequences
+    )
+    api_usage_patterns = tokenizer.batch_decode(
+        model_output,
+        skip_special_tokens=True
+    )
+    return api_usage_patterns
+def generate_api_usage_patterns_batch(generative_model, tokenizer, library_ids, db_constructor, num_beams, num_return_sequences):
+    '''
+    Function to generate API usage patterns in batch
+    Params:
+    generative_model: a huggingface model
+    tokenizer: a huggingface tokenizer
+    library_ids (list): a list of libary ids
+    db_constructor (pandas dataframe):  a dataframe containing the mapping of library names to valid constructor
+    num_beams (int): the beam width used for decoding
+    num_return_sequences (int): how many API usage patterns are returned by the model
+    Returns:
+    predictions (list): a list of dictionary containing the api usage patterns, library name, and id
+    '''
+    input_generative_model_dict = prepare_input_generative_model(library_ids, db_constructor)
+    predictions = []
+    for id_ in input_generative_model_dict:
+        temp_dict = {
+            'id': id_,
+            'library_name': None,
+            'hw_config': None,
+            'usage_patterns': {}
+        }
+        for input_generative_model in input_generative_model_dict.get(id_):
+            api_usage_patterns = generate_api_usage_patterns(
+                generative_model,
+                tokenizer,
+                input_generative_model,
+                num_beams,
+                num_return_sequences
+            )
+            temp = input_generative_model.split("[SEP]")
+            library_name = temp[0].strip()
+            constructor = temp[1].strip()
+            assert(constructor not in temp_dict.get('usage_patterns'))
+            temp_dict['usage_patterns'][constructor] = api_usage_patterns
+        assert(temp_dict.get('library_name')==None)
+        temp_dict['library_name'] = library_name
+        predictions.append(temp_dict)
+    return predictions
+# def generate_api_usage_patterns(generative_model, tokenizer, model_inputs, num_beams, num_return_sequences):
+#     '''
+#     Function to generate API usage patterns
+#     Params:
+#     generative_model: a huggingface model
+#     tokenizer: a huggingface tokenizer
+#     model_inputs (list): a list of <library-name> [SEP] <constructor>
+#     num_beams (int): the beam width used for decoding
+#     num_return_sequences (int): how many API usage patterns are returned by the model
+#     Returns:
+#     api_usage_patterns (list): a list of API usage patterns
+#     '''
+#     model_inputs = tokenizer(
+#         model_inputs,
+#         max_length=max_length,
+#         padding='max_length',
+#         return_tensors='pt',
+#         truncation=True)
+#     model_output = generative_model.generate(
+#         **model_inputs,
+#         num_beams=num_beams,
+#         num_return_sequences=num_return_sequences
+#     )
+#     api_usage_patterns = tokenizer.batch_decode(
+#         model_output,
+#         skip_special_tokens=True
+#     )
+#     api_usage_patterns = [api_usage_patterns[i:i+num_return_sequences] for i in range(0, len(api_usage_patterns), num_return_sequences)]
+#     return api_usage_patterns
+def prepare_input_classification_model(id_, db_metadata):
+    '''
+    Function to get a feature for a classification model using library id
+    Params:
+    id_ (int): a unique library id
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    Returns:
+    feature (string): a feature used for the classification model input
+    '''
+    temp_db = db_metadata[db_metadata.id == id_]
+    assert(len(temp_db)==1)
+    feature = temp_db.iloc[0].features
+    return feature
+def load_hw_classifier(model_path_classifier, model_path_classifier_head):
+    '''
+    Function to load a classifier model and classifier head
+    Params:
+    model_path_classifier (string): path to the classifier checkpoint (can be either huggingface path or local directory)
+    model_path_classifier_head (string): path to the classifier head checkpoint (should be a local directory)
+    Returns:
+    classifier_model: a huggingface model
+    classifier_head: a classifier model (can be either svm or rf)
+    tokenizer: a huggingface tokenizer
+    '''
+    tokenizer = RobertaTokenizer.from_pretrained(model_path_classifier)
+    classifier_model = RobertaModel.from_pretrained(model_path_classifier)
+    with open(model_path_classifier_head, 'rb') as f:
+        classifier_head = pickle.load(f)
+    return classifier_model, classifier_head, tokenizer
+def predict_hw_config(classifier_model, classifier_tokenizer, classifier_head, library_ids, db_metadata, max_length):
+    '''
+    Function to predict hardware configs
+    Params:
+    classifier_model: a huggingface model to convert a feature to a feature vector
+    classifier_tokenizer: a huggingface tokenizer
+    classifier_head: a classifier head
+    library_ids (list): a list of library ids
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    max_length (int): max length of the tokenizer output
+    Returns:
+    prediction (list): a list of prediction
+    '''
+    features = [prepare_input_classification_model(id_, db_metadata) for id_ in library_ids]
+    tokenized_features = classifier_tokenizer(
+            features,
+            max_length=max_length,
+            padding='max_length',
+            return_tensors='pt',
+            truncation=True
+        )
+    with torch.no_grad():
+        embedding_features = classifier_model(**tokenized_features).pooler_output.numpy()
+    prediction = classifier_head.predict_proba(embedding_features).tolist()
+    prediction = np.argmax(prediction, axis=1).tolist()
+    prediction = [classifier_class_mapping.get(idx) for idx in prediction]
+    return prediction
+def initialize_all_components(config):
+    '''
+    Function to initialize all components of ArduProg
+    Params:
+    config (dict): a dictionary containing the configuration to initialize all components
+    Returns:
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    db_constructor (pandas dataframe): a dataframe containing the mapping of library names to valid constructor
+    model_retrieval, model_generative : a huggingface model
+    tokenizer_generative, tokenizer_classifier: a huggingface tokenizer
+    model_classifier: a huggingface model
+    classifier_head: a random forest model
+    '''
+    # load db
+    db_metadata, db_constructor = load_db(
+        config.get('db_metadata_path'),
+        config.get('db_constructor_path')
+    )
+    # load model
+    model_retrieval = load_retrieval_model_lexical(
+        config.get('tokenizer_path_retrieval'),
+        config.get('max_k'),
+        db_metadata,
+    )
+    tokenizer_generative, model_generative = load_generative_model_codebert(config.get('model_path_generative'))
+    model_classifier, classifier_head, tokenizer_classifier = load_hw_classifier(
+        config.get('model_path_classifier'),
+        config.get('classifier_head_path')
+    )
+    return db_metadata, db_constructor, model_retrieval, model_generative, tokenizer_generative, model_classifier, classifier_head, tokenizer_classifier
+def make_predictions(input_query,
+    model_retrieval,
+    model_generative,
+    model_classifier, classifier_head,
+    tokenizer_generative, tokenizer_classifier,
+    db_metadata, db_constructor,
+    config):
+    '''
+    Function to retrieve relevant libraries, generate API usage patterns, and predict the hw configs
+    Params:
+    input_query (string): a query from the user
+    model_retrieval, model_generative, model_classifier: a huggingface model
+    classifier_head: a random forest classifier
+    toeknizer_generative, tokenizer_classifier: a hugggingface tokenizer,
+    db_metadata (pandas dataframe): a dataframe containing metadata information about the library
+    db_constructor (pandas dataframe): a dataframe containing the mapping of library names to valid constructor
+    config (dict): a dictionary containing the configuration to initialize all components
+    Returns:
+    predictions (list): a list of dictionary containing the prediction details
+    '''
+    library_ids, library_names = retrieve_libraries(model_retrieval, input_query, db_metadata)
+    predictions = generate_api_usage_patterns_batch(
+        model_generative,
+        tokenizer_generative,
+        library_ids,
+        db_constructor,
+        config.get('num_beams'),
+        config.get('num_return_sequences')
+    )
+    hw_configs = predict_hw_config(
+        model_classifier,
+        tokenizer_classifier,
+        classifier_head,
+        library_ids,
+        db_metadata,
+        config.get('max_length')
+    )
+    for output_dict, hw_config in zip(predictions, hw_configs):
+        output_dict['hw_config'] = hw_config
+    predictions = get_metadata_library(predictions, db_metadata)
+    return predictions