liberatoratif committed on
Commit
4770145
•
1 Parent(s): 57ed03b

Create careermatcher.py

Files changed (1)
  1. careermatcher.py +241 -0
careermatcher.py ADDED
@@ -0,0 +1,241 @@
+ import streamlit as st
+ import plotly.express as px
+ import pandas as pd
+ import numpy as np
+ import pickle as pkl
+ import spacy
+ from spacy.lang.en.stop_words import STOP_WORDS
+ nlp = spacy.load('en_core_web_lg')
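+ # Assumes the large English pipeline is installed (e.g. python -m spacy download en_core_web_lg)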
+ import re
+ import docx2txt
+ from spacy.matcher import PhraseMatcher
+
+ # from transformers import BertForSequenceClassification
+ # from transformers import BertTokenizer
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ # tokenizer = AutoTokenizer.from_pretrained("liberatoratif/BERT-resume-job-recommender")
+ # model = AutoModelForSequenceClassification.from_pretrained("liberatoratif/BERT-resume-job-recommender")
+
+ matcher = PhraseMatcher(nlp.vocab)
+
+ import torch
+
+ st.set_page_config(
+     page_title="Resume Scanner",
+     page_icon="📝",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+
+ # output_dir = "model_save"
+ enc_dir = "target_encodings.pkl"
+ matcher_dir = "linkedin_skill.txt"
+
+
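+ # Loaders for the model artifacts: the BERT classifier and its tokenizer from the Hugging Face Hub,
+ # the fitted label encoder (target_encodings.pkl), and the LinkedIn skill list used by the PhraseMatcher.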
+ # @st.cache
+ def bert():
+     # model_loaded_temp = BertForSequenceClassification.from_pretrained(output_dir)
+     model_loaded_temp = AutoModelForSequenceClassification.from_pretrained("liberatoratif/BERT-resume-job-recommender")
+     return model_loaded_temp
+
+ # @st.cache
+ def bert_token():
+     # tokenizer_loaded_temp = BertTokenizer.from_pretrained(output_dir)
+     tokenizer_loaded_temp = AutoTokenizer.from_pretrained("liberatoratif/BERT-resume-job-recommender")
+     return tokenizer_loaded_temp
+
+ # @st.cache
+ def label_enc():
+     enc = pkl.load(open(enc_dir, 'rb'))
+     return enc
+
+ # @st.cache
+ def ph_match():
+     with open(matcher_dir, 'r', encoding='utf-8') as file:
+         text = file.read()
+
+     return text
+
+
+ label_encoder = label_enc()
+ model_loaded = bert()
+ tokenizer_loaded = bert_token()
+
+ txt = ph_match()
+
+ st.markdown(
+     """
+     <style>
+     [data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
+         width: 250px;
+     }
+     [data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
+         width: 150px;
+         margin-left: -500px;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ st.markdown("<h1 style='text-align: center; color: cyan;'>RESUME/CV SCANNER</h1>",
+             unsafe_allow_html=True)
+ st.markdown("<h6 style='text-align: center; color: white;'>Know which domain fits your resume :)</h6>",
+             unsafe_allow_html=True)
+
+ stops = list(STOP_WORDS)
+
+ def extract_text_from_docx(docx_path):
+     txt = docx2txt.process(docx_path)
+     if txt:
+         return txt.replace('\t', ' ')
+     return None
+
+ def cleanResume(resumeText):
+     resumeText = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", resumeText).split())
+     resumeText = re.sub(r'[^\x00-\x7F]+', r' ', resumeText)
+     resumeText = ''.join(resumeText.splitlines())
+     return resumeText
+
+ def complete_pack(x):
+     demo = nlp(x)
+     lst = [i.text.lower() for i in demo if i.text.lower() not in stops]
+     return lst
+
+
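+ # Sidebar: accept a .docx upload, extract its text, clean it, and parse it with spaCy.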
+ with st.sidebar:
+     global resume_text, upload
+     global resume_text_spacy, re_temp
+     upload = st.file_uploader("DRAG AND DROP YOUR RESUME NOW")
+     st.markdown("<h5 style='text-align: center; color: red;'>Only .docx type files accepted</h5>",
+                 unsafe_allow_html=True)
+     if upload:
+         try:
+             resume_text = extract_text_from_docx(upload)
+             resume_text = resume_text.replace('\n\n', ' ')
+             re_temp = cleanResume(resume_text)
+             resume_text_spacy = nlp(re_temp)
+         except Exception as e:
+             st.error('WRONG FILE FORMAT: Only .docx (Word) files are accepted')
+
+
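+ # On SCAN: pull emails, phone numbers and links with regexes, match skills against the
+ # LinkedIn skill list, then classify the matched skill text with the BERT model.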
+ scan = st.button('SCAN 📝')
+ if scan:
+     try:
+         emails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", resume_text)
+         phone = re.findall(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', resume_text)
+         links = re.findall(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", resume_text)
+
+
+         txt = txt.split('\n')
+         ev = [nlp.make_doc(i) for i in txt]
+         matcher.add("SKILLS", None, *ev)
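+         # Note: the (key, None, *docs) form of matcher.add is the spaCy 2.x signature;
+         # spaCy 3.x expects matcher.add("SKILLS", ev) instead.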
+         get_skills = matcher(resume_text_spacy)
+
+         demo = []
+         for match_id, start, end in get_skills:
+             span = resume_text_spacy[start : end]
+             demo.append(span.text)
+
+         re_text = ' '.join(demo)
+         my_skills_re_text = re_text
+         my_skills_clean_re_text = cleanResume(my_skills_re_text)
+
+         skills = complete_pack(my_skills_clean_re_text)
+         skills = ' '.join(skills)
+         lst = []
+         lst.append(skills)
+
+
+         model_loaded.eval()
+
+         # Tokenize the input text
+         input_ids = tokenizer_loaded.encode(lst[0], add_special_tokens=True)
+         input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
+
+         # Move the input tensor to the same device as the model
+         # input_ids = input_ids.to(device)
+         # model_loaded = model_loaded.to(device)
+
+         # Perform the forward pass to get the model's predictions
+         with torch.no_grad():
+             result = model_loaded(input_ids, token_type_ids=None, attention_mask=None, return_dict=True)
+         logits = result.logits
+
+         # Move the logits to the CPU and convert to numpy array
+         logits = logits.detach().cpu().numpy()
+
+         # Get the predicted label
+         predicted_label = np.argmax(logits)
+
+         # Print the predicted label
+         # st.write("Predicted Label:", predicted_label)
+
+         probs = logits[0]
+         # print("text:", lst[0])
+         # print("predictions:", probs)
+         pred_idx = np.argmax(probs)
+         # kp = list(pred_idx)
+         d = {}
+         ind = 0
+
+         for i in probs:
+             d[label_encoder.inverse_transform([ind])[0]] = i
+             ind += 1
+         # st.write("Your skills are matching to : ", label_encoder.inverse_transform([pred_idx])[0])
+         domain = label_encoder.inverse_transform([pred_idx])[0]
+         data = pd.DataFrame({'Domains' : list(d.keys()), 'Probs' : list(d.values())})
+         # st.markdown(f"**Your skills are matching to:** <span style='color: cyan;'>{domain}</span>", unsafe_allow_html=True) #BF3EFF
+         st.markdown(f"<span style='color: #BF3EFF;'>**Your skills are matching to :**</span> <span style='color: #54FF9F;'>{domain}</span>", unsafe_allow_html=True)
+         datacpy = data.copy()
+         datacpy['Probs'] = datacpy['Probs'] * 10
+         datacpy.rename(columns={'Probs': 'Percentage Prediction of your Domain'}, inplace=True)
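+         # Note: 'Probs' holds raw logits scaled by 10 for display; these are not calibrated probabilities.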
+
+         st.markdown("<h3 style='text-align: center; color: blue;'>PERCENT OF YOUR DOMAIN MATCH</h3>",
+                     unsafe_allow_html=True)
+
+         st.dataframe(datacpy.sort_values('Percentage Prediction of your Domain', ascending=False))
+         domains = px.bar(data, x='Domains', y='Probs', width=800, height=400)
+         st.plotly_chart(domains)
+
+
+         if len(list(set(emails))) > 0:
+             st.markdown("<h4 style='text-align: center; color: blue;'>EMAIL ✔️ </h4>",
+                         unsafe_allow_html=True)
+             st.success(list(set(emails)))
+         else:
+             st.markdown("<h4 style='text-align: center; color: blue;'>EMAIL ❌ </h4>",
+                         unsafe_allow_html=True)
+             st.error('Email ID is not present; try including it in your Resume')
+
+
+
+         if len(list(set(phone))) > 0:
+             st.markdown("<h4 style='text-align: center; color: blue;'>MOBILE NO ✔️ </h4>",
+                         unsafe_allow_html=True)
+             st.success(list(set(phone)))
+         else:
+             st.markdown("<h4 style='text-align: center; color: blue;'>MOBILE NO ❌ </h4>",
+                         unsafe_allow_html=True)
+             st.error('Mobile number is not present; try including it in your Resume')
+
+
+
+         if len(list(set(links))) > 0:
+             st.markdown("<h4 style='text-align: center; color: blue;'>LINKS ✔️ </h4>",
+                         unsafe_allow_html=True)
+             st.success(list(set(links)))
+         else:
+             st.markdown("<h4 style='text-align: center; color: blue;'>LINKS ❌</h4>",
+                         unsafe_allow_html=True)
+             st.error("Links are not present; try including your GitHub or LinkedIn profile in your Resume")
+
+
+     except Exception as e:
+         st.write(e)
+         st.error("😲 Try uploading your file again")
+
+