Sudhir878786 committed on
Commit
9e80f82
•
1 Parent(s): fdb6adb
Files changed (12)
  1. .gitignore +5 -0
  2. README.md +26 -6
  3. app.py +75 -0
  4. core.py +37 -0
  5. demo.py +26 -0
  6. embedding.py +24 -0
  7. gradio_app.py +24 -0
  8. install.sh +12 -0
  9. main.py +6 -0
  10. pdf_loader.py +64 -0
  11. preprocessing.py +107 -0
  12. requirements.txt +11 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ __pycache__
+ venv
+ nltk_packages
+ embedding
+ documents
README.md CHANGED
@@ -1,13 +1,33 @@
  ---
- title: Resume Ranker LLM
- emoji: 📚
- colorFrom: pink
- colorTo: yellow
  sdk: streamlit
  sdk_version: 1.21.0
  app_file: app.py
  pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

  ---
+ title: Resume Ranking
+ emoji: 📊
+ colorFrom: yellow
+ colorTo: indigo
  sdk: streamlit
  sdk_version: 1.21.0
  app_file: app.py
  pinned: false
  ---

+ # resume-ranker
+ <hr>
+
+ ## How to Use?
+
+ Install all the dependencies with:
+
+ ```bash
+ ./install.sh
+ ```
+
+ Run the Streamlit app with:
+
+ ```bash
+ streamlit run app.py
+ ```
+
+ Or run it from the command line with:
+
+ ```bash
+ python demo.py
+ ```
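For a programmatic run, `demo.py` simply feeds a job description and a folder of resumes to `pipeline` from `core.py`. A minimal sketch of that call follows; the `documents/` folder and the job-description text are placeholders:

```python
# Minimal sketch of the programmatic entry point used by demo.py.
# Assumes a local `documents/` folder containing .txt or .pdf resumes.
from pdf_loader import load_documents
from core import pipeline

job_description = "We are looking for a software engineer with experience in Python..."

results, similarities = pipeline(
    job_description,
    load_documents(source_dir='documents'),
    embedding_type='bert',  # also accepts 'minilm' or 'tfidf'
)

for result in results:
    print(f"{result['name']}: {result['similarity']:.2%}")
```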
app.py ADDED
@@ -0,0 +1,75 @@
+ import streamlit as st
+ from pdf_loader import load_btyes_io
+ from core import pipeline
+
+ # Developer Details
+ developer_details = {
+     "Sudhir Sharma": {
+         "Education": "B.Tech in Computer Science and Engineering, IIT Bhilai, 2024",
+         "Email": "[email protected]",
+         "GitHub": "[GitHub Profile](https://github.com/Sudhir878786)",
+         "LinkedIn": "[LinkedIn Profile](https://www.linkedin.com/in/sudhirsharma87/)"
+     }
+ }
+
+
+ def inference(query, files, embedding_type):
+     # pdfReader = PyPDF2.PdfReader(files[0])
+     # text = ''
+     # for page in pdfReader.pages:
+     #     text += page.extract_text()
+     # st.write(text)
+
+     results, _ = pipeline(query, load_btyes_io(files), embedding_type=embedding_type)
+     prob_per_documents = {result['name']: result['similarity'] for result in results}
+     return prob_per_documents
+
+ st.sidebar.header("Developer Details")
+ selected_developer = st.sidebar.selectbox("Select a developer", list(developer_details.keys()))
+ st.sidebar.markdown(developer_details[selected_developer]["Education"])
+ st.sidebar.markdown(developer_details[selected_developer]["Email"])
+ st.sidebar.markdown(developer_details[selected_developer]["GitHub"])
+ st.sidebar.markdown(developer_details[selected_developer]["LinkedIn"])
+
+ sample_files = [
+     "documents/business.pdf",
+     "documents/data_science.pdf",
+ ]
+
+ sample_job_descriptions = {
+     "Software Engineer": """We are looking for a software engineer with experience in Python and web development. The ideal candidate should have a strong background in building scalable and robust applications. Knowledge of frameworks such as Flask and Django is a plus. Experience with front-end technologies like HTML, CSS, and JavaScript is desirable. The candidate should also have a good understanding of databases and SQL. Strong problem-solving and communication skills are required for this role.
+ """,
+     "Data Scientist": """We are seeking a data scientist with expertise in machine learning and statistical analysis. The candidate should have a solid understanding of data manipulation, feature engineering, and model development. Proficiency in Python and popular data science libraries such as NumPy, Pandas, and Scikit-learn is required. Experience with deep learning frameworks like TensorFlow or PyTorch is a plus. Strong analytical and problem-solving skills are essential for this position.
+ """
+ }
+
+ st.sidebar.header("Sample Files")
+ for sample_file in sample_files:
+     st.sidebar.markdown(f"[{sample_file}](./sample_files/{sample_file})")
+
+ st.sidebar.header("Sample Job Descriptions")
+ selected_job = st.sidebar.selectbox("Select a job description", list(sample_job_descriptions.keys()))
+ st.sidebar.markdown("```")
+ st.sidebar.code(sample_job_descriptions[selected_job])
+ st.title("👨🏼‍🎓 Resume Ranker")
+
+ query = st.text_area("Job Description", height=200, value=sample_job_descriptions[selected_job])
+ uploaded_files = st.file_uploader("Upload Resume", accept_multiple_files=True, type=["txt", "pdf"])
+ embedding_type = st.selectbox("Embedding Type", ["bert", "minilm", "tfidf"])
+
+ if st.button("Submit"):
+     if not query:
+         st.warning("Please enter a job description.")
+     elif not uploaded_files:
+         st.warning("Please upload one or more resumes.")
+     else:
+         with st.spinner("Processing..."):
+             results = inference(query, uploaded_files, embedding_type)
+         st.subheader("Results")
+         for document, similarity in results.items():
+             # round to 2 decimal places in case floating-point error pushes similarity above 1
+             if similarity >= 1:
+                 similarity = round(similarity, 2)
+             st.write(f"- {document}:")
+             st.progress(similarity, text=f"{similarity:.2%}")
core.py ADDED
@@ -0,0 +1,37 @@
+ from embedding import embedding
+ from preprocessing import preprocess
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+ import streamlit as st
+
+ def pipeline(input_doc: str, ori_documents, embedding_type='bert'):
+     documents = np.array([doc['content'] for doc in ori_documents])
+     documents = np.insert(documents, 0, input_doc)
+     # st.write(documents)
+     preprocessed_documents = preprocess(documents)
+     # st.write(preprocessed_documents)
+     print(f"Encoding with {embedding_type}...")
+     documents_vectors = embedding(preprocessed_documents, embedding=embedding_type)
+     print("Encoding finished")
+
+     # compute cosine similarity between the job description (index 0) and every resume
+     pairwise = cosine_similarity(documents_vectors)
+
+     # only retain useful information
+     pairwise = pairwise[0, 1:]
+     sorted_idx = np.argsort(pairwise)[::-1]
+     result_pairwise = pairwise[sorted_idx]
+
+     results = []
+     print('Resume ranking:')
+     for rank, idx in enumerate(sorted_idx):
+         single_result = {
+             'rank': rank,
+             'name': ori_documents[idx]['name'],
+             'similarity': pairwise[idx].item()
+         }
+         results.append(single_result)
+         print(f'Resume of candidate {idx}')
+         print(f'Cosine Similarity: {pairwise[idx]}\n')
+
+     return results, result_pairwise
demo.py ADDED
@@ -0,0 +1,26 @@
+ from pdf_loader import load_documents
+ from core import pipeline
+
+ if __name__ == '__main__':
+     pipeline('''About Sleek
+
+ Sleek is on a mission to revolutionize how entrepreneurs operate their business. We want to give small business owners peace of mind and the power of online solutions to allow them to focus on what they do best - growing their business. As we work for our thousands of customers, we gather millions of data points about their business, and in turn we transform those into useful, actionable insights and recommendations to accelerate their growth through smart algorithms.
+
+ We are a team of 400 builders from 17 countries, with offices in Singapore, Philippines, Hong Kong, Australia and the UK committed to delivering a delightful experience to our clients!
+
+ You will be working in the Data & Analytics organization to solve a wide range of business problems leveraging advanced analytics. You will deploy a flexible analytical skill set to deliver insightful data and analysis and model business scenarios. Your principal goal will be to use data to drive better business decisions. This means translating data into meaningful insights and recommendations and, where relevant, proactively implement improvements. You will be developing the business reporting and analysis for our internal operations world-wide. The job will require working closely with the various Business Units to understand their business question as well as the whole data team to understand and access available data.
+
+ Position Duties
+ Drive analytical problem-solving and deep dives. Work with large, complex data sets. Solve difficult, non-routine problems, applying advanced quantitative methods.
+ Collaborate with a wide variety of cross-functional partners to determine business needs, drive analytical projects from start to finish.
+ Align with involved stakeholders to set up dashboards and reports to drive data driven decision across all departments
+ Working very closely with our Data team, Tech and Product team to understand the business logic to generate accurate reports and correct analysis
+
+ Requirements
+
+ Performance Standards
+ Able to commit for a period of at least 4 months
+ Currently pursuing a degree in Business Science, Engineering or relevant disciplines with a focus on data.
+ Good knowledge in SQL, R and Python.
+ Experience in data visualization tools (Tableau, PowerBI, Google DataStudio or equivalent) will be an added advantage.''',
+         load_documents(source_dir = 'documents'))
embedding.py ADDED
@@ -0,0 +1,24 @@
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sentence_transformers import SentenceTransformer
+ import os
+
+ def embedding(documents, embedding='bert'):
+     if embedding == 'bert':
+         sbert_model = SentenceTransformer('bert-base-nli-mean-tokens', cache_folder=os.path.join(os.getcwd(), 'embedding'))
+
+         document_embeddings = sbert_model.encode(documents)
+         return document_embeddings
+
+     if embedding == 'minilm':
+         sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=os.path.join(os.getcwd(), 'embedding'))
+
+         document_embeddings = sbert_model.encode(documents)
+         return document_embeddings
+
+     if embedding == 'tfidf':
+         word_vectorizer = TfidfVectorizer(
+             sublinear_tf=True, stop_words='english')
+         word_vectorizer.fit(documents)
+         word_features = word_vectorizer.transform(documents)
+
+         return word_features
gradio_app.py ADDED
@@ -0,0 +1,24 @@
+ from pdf_loader import load_documents
+ from core import pipeline
+
+ import gradio as gr
+
+ def inference(query, files):
+     # get the paths of the uploaded files
+     files = [file.name for file in files]
+     results, _ = pipeline(query, load_documents(file_paths=files))
+
+     prob_per_documents = {result['name']: result['similarity'] for result in results}
+     return prob_per_documents
+
+ with gr.Blocks() as demo:
+     # write a header
+
+     job_desc = gr.Textbox(lines=5, label="Job Description")
+     files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
+     btn = gr.Button("Submit")
+     output = gr.Label(label="Results")
+     # output = gr.Number(label="Results")
+     btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+ demo.launch()
install.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ find . \( -name __pycache__ -o -name "*.pyc" \) -delete
+ python3 -m venv venv
+ # Check the operating system
+ if [[ "$OSTYPE" == "msys" ]]; then
+     # Windows
+     source venv/Scripts/activate
+ else
+     # Unix-like systems (macOS, Linux)
+     source venv/bin/activate
+ fi
+ pip install --no-cache-dir -r requirements.txt
main.py ADDED
@@ -0,0 +1,6 @@
+ # from fastapi import FastAPI, File, UploadFile
+ # app = FastAPI()
+
+ # @app.post("/resume")
+ # async def root(name:str, email:str, about:str, file:UploadFile = File(...)):
+ #     return {"name":name, "email":email, "about":about, "file_name":file.filename}
pdf_loader.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ import PyPDF2
+
+ def load_single_document(file_path: str):
+     # Loads a single document from a file path
+     if file_path[-4:] == '.txt':
+         with open(file_path, 'r') as f:
+             return f.read()
+
+     elif file_path[-4:] == '.pdf':
+         with open(file_path, 'rb') as pdfFileObj:
+             pdfReader = PyPDF2.PdfReader(pdfFileObj)
+             text = ''
+             for page in pdfReader.pages:
+                 text += page.extract_text()
+             return text
+
+     elif file_path[-4:] == '.csv':
+         with open(file_path, 'r') as f:
+             return f.read()
+
+     else:
+         raise Exception('Invalid file type')
+
+
+ def load_documents(file_paths: list[str] = None, source_dir: str = None):
+     # Loads all documents from the given file paths or from a source directory
+     if file_paths:
+         all_files = file_paths
+     elif source_dir:
+         all_files = [os.path.abspath(os.path.join(source_dir, file)) for file in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, file))]
+     else:
+         raise Exception('No file paths or source directory provided')
+
+     return [
+         {
+             'name': os.path.basename(file_path),
+             'content': load_single_document(f"{file_path}")
+         } for file_path in all_files if file_path[-4:] in ['.txt', '.pdf', '.csv']
+     ]
+
+ def load_io(file_byte = None):
+     # Loads a single document from an uploaded file-like object
+     if file_byte.name[-3:] == 'txt':
+         return file_byte.read().decode("utf-8")
+
+     elif file_byte.name[-3:] == 'pdf':
+         pdfReader = PyPDF2.PdfReader(file_byte)
+         text = ''
+         for page in pdfReader.pages:
+             text += page.extract_text()
+         return text
+
+     else:
+         raise Exception('Invalid file type')
+
+ def load_btyes_io(files = None):
+     return [
+         {
+             'name': file_byte.name,
+             'content': load_io(file_byte)
+         } for file_byte in files if file_byte.name[-3:] in ['txt', 'pdf']
+     ]
preprocessing.py ADDED
@@ -0,0 +1,107 @@
+
+ import re
+ import os
+ import unicodedata
+ import nltk
+ import inflect
+ from nltk import word_tokenize, sent_tokenize
+ from nltk.corpus import stopwords
+ from nltk.stem import LancasterStemmer, WordNetLemmatizer
+
+ # download_path = os.path.join(os.getcwd(), 'nltk_packages')
+ # nltk.data.path.append(download_path)
+ nltk.download('wordnet')
+ nltk.download('stopwords')
+ nltk.download('punkt')
+
+ def remove_non_ascii(words):
+     """Remove non-ASCII characters from list of tokenized words"""
+     new_words = []
+     for word in words:
+         new_word = unicodedata.normalize('NFKD', word).encode(
+             'ascii', 'ignore').decode('utf-8', 'ignore')
+         new_words.append(new_word)
+     return new_words
+
+
+ def to_lowercase(words):
+     """Convert all characters to lowercase from list of tokenized words"""
+     new_words = []
+     for word in words:
+         new_word = word.lower()
+         new_words.append(new_word)
+     return new_words
+
+
+ def remove_punctuation(words):
+     """Remove punctuation from list of tokenized words"""
+     new_words = []
+     for word in words:
+         new_word = re.sub(r'[^\w\s]', '', word)
+         if new_word != '':
+             new_words.append(new_word)
+     return new_words
+
+
+ def replace_numbers(words):
+     """Replace all integer occurrences in list of tokenized words with textual representation"""
+     p = inflect.engine()
+     new_words = []
+     for word in words:
+         if word.isdigit():
+             new_word = p.number_to_words(word)
+             new_words.append(new_word)
+         else:
+             new_words.append(word)
+     return new_words
+
+
+ def remove_stopwords(words):
+     """Remove stop words from list of tokenized words"""
+     new_words = []
+     for word in words:
+         # print(word)
+         if word not in stopwords.words('english'):
+             new_words.append(word)
+     return new_words
+
+
+ def stem_words(words):
+     """Stem words in list of tokenized words"""
+     stemmer = LancasterStemmer()
+     stems = []
+     for word in words:
+         stem = stemmer.stem(word)
+         stems.append(stem)
+     return stems
+
+
+ def lemmatize_verbs(words):
+     """Lemmatize verbs in list of tokenized words"""
+     lemmatizer = WordNetLemmatizer()
+     lemmas = []
+     for word in words:
+         lemma = lemmatizer.lemmatize(word, pos='v')
+         lemmas.append(lemma)
+     return lemmas
+
+ def normalize(words):
+     words = remove_non_ascii(words)
+     words = to_lowercase(words)
+     words = remove_punctuation(words)
+     # words = replace_numbers(words)
+     words = remove_stopwords(words)
+     # words = stem_words(words)
+     # words = lemmatize_verbs(words)
+     return words
+
+
+ def preprocess(documents):
+     preprocessed_documents = []
+     for document in documents:
+         tokens = nltk.word_tokenize(document)
+         preprocessed = normalize(tokens)
+         preprocessed = ' '.join(map(str, preprocessed))
+         preprocessed_documents.append(preprocessed)
+
+     return preprocessed_documents
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ inflect==6.0.4
+ nltk==3.8.1
+ numpy==1.24.3
+ PyPDF2==3.0.1
+ scikit_learn==1.2.2
+ sentence_transformers==2.2.2
+ fastapi
+ uvicorn[standard]
+ python-multipart
+ python-dotenv
+ streamlit