File size: 1,265 Bytes
9e80f82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from embedding import embedding
from preprocessing import preprocess
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import streamlit as st

def pipeline(input_doc: str, ori_documents, embedding_type='bert'):
    """Rank ``ori_documents`` by cosine similarity to ``input_doc``.

    The query text and the ``'content'`` of every document are preprocessed
    and embedded together; each document is then scored against the query
    with cosine similarity.

    Args:
        input_doc: Query text that every document is compared against.
        ori_documents: Indexable sequence of mappings, each providing a
            ``'content'`` and a ``'name'`` key.
        embedding_type: Forwarded to ``embedding`` as its ``embedding``
            keyword argument.

    Returns:
        A ``(results, result_pairwise)`` tuple. ``results`` is a list of
        dicts ordered from most to least similar, each with the keys
        ``'rank'``, ``'name'`` and ``'similarity'``. ``result_pairwise`` is
        the array of similarity scores in that same order. For an empty
        ``ori_documents`` both are empty.
    """
    contents = [doc['content'] for doc in ori_documents]
    if not contents:
        # Nothing to rank; skip the (potentially expensive) embedding step.
        return [], np.empty(0)

    # Build the array in one go with the query at index 0. Using np.insert on
    # an existing string array would cast the query to that array's
    # fixed-width dtype and silently truncate it to the longest document.
    documents = np.array([input_doc, *contents])
    preprocessed_documents = preprocess(documents)
    print(f"Encoding with {embedding_type}...")
    documents_vectors = embedding(preprocessed_documents, embedding=embedding_type)
    print("Encoding finished")

    # Row 0 is the query; drop column 0 (the query compared with itself) so
    # position i lines up with ori_documents[i].
    pairwise = cosine_similarity(documents_vectors)[0, 1:]

    # Document indices ordered from highest to lowest similarity.
    sorted_idx = np.argsort(pairwise)[::-1]
    result_pairwise = pairwise[sorted_idx]

    results = []
    print('Resume ranking:')
    for idx in sorted_idx:
        results.append({
            # NOTE: despite its name, 'rank' holds the document's index in
            # ori_documents; the position in `results` is the actual rank.
            # Kept as-is because callers may rely on it.
            'rank': idx,
            'name': ori_documents[idx]['name'],
            'similarity': pairwise[idx].item(),
        })
        print(f'Resume of candidite {idx}')
        print(f'Cosine Similarity: {pairwise[idx]}\n')

    return results, result_pairwise