LeoWalker committed
Commit 712685a
Parent: 1ac1b15

Added character to the write-up and moved functions to the helpers file.

Files changed (3)
  1. app.py +22 -97
  2. llamallms.png +0 -0
  3. resume_helpers.py +131 -0
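
In short: the PDF, extraction, display, and download helpers move out of app.py into the new resume_helpers.py, and app.py now calls them through the module. Condensed, the resulting call pattern looks like the sketch below (the "resume.pdf" input and model choice are hypothetical, not code from the commit; the full diffs follow):

# Sketch of the call pattern after this commit (hypothetical input file).
import resume_helpers

with open("resume.pdf", "rb") as f:  # any PDF; pdf_to_string closes the handle itself
    text = resume_helpers.pdf_to_string(f)

fields = resume_helpers.extract_resume_fields(text, "GPT 3.5 turbo")
resume_helpers.display_extracted_fields(fields, "GPT 3.5 turbo Extracted Fields")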
app.py CHANGED
@@ -20,6 +20,8 @@ import json
 import time
 import os
 
+import resume_helpers
+
 
 # Set the LANGCHAIN_TRACING_V2 environment variable to 'true'
 os.environ['LANGCHAIN_TRACING_V2'] = 'true'
@@ -29,102 +31,25 @@ os.environ['LANGCHAIN_PROJECT'] = 'Resume_Project'
 
 load_dotenv()
 
-def pdf_to_string(file):
-    """
-    Convert a PDF file to a string.
-
-    Parameters:
-    file (io.BytesIO): A file-like object representing the PDF file.
-
-    Returns:
-    str: The extracted text from the PDF.
-    """
-    pdf_reader = PyPDF2.PdfReader(file)
-    num_pages = len(pdf_reader.pages)
-    text = ''
-    for i in range(num_pages):
-        page = pdf_reader.pages[i]
-        text += page.extract_text()
-    file.close()
-    return text
-
-class CustomOutputParserException(Exception):
-    pass
-
-def extract_resume_fields(full_text, model):
-    """
-    Analyze a resume text and extract structured information using a specified language model.
-    Parameters:
-    full_text (str): The text content of the resume.
-    model (str): The language model object to use for processing the text.
-    Returns:
-    dict: A dictionary containing structured information extracted from the resume.
-    """
-    # The Resume object is imported from the local resume_template file
-
-    with open("prompts/resume_extraction.prompt", "r") as f:
-        template = f.read()
-
-    parser = PydanticOutputParser(pydantic_object=Resume)
-
-    prompt_template = PromptTemplate(
-        template=template,
-        input_variables=["resume"],
-        partial_variables={"response_template": parser.get_format_instructions()},
-    )
-    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
-
-    chain = prompt_template | llm | parser
-    max_attempts = 3
-    attempt = 1
-
-    while attempt <= max_attempts:
-        try:
-            output = chain.invoke(full_text)
-            print(output)
-            return output
-        except (CustomOutputParserException, ValidationError) as e:
-            if attempt == max_attempts:
-                raise e
-            else:
-                print(f"Parsing error occurred. Retrying (attempt {attempt + 1}/{max_attempts})...")
-                attempt += 1
-
-    return None
-
-def display_extracted_fields(obj, section_title=None, indent=0):
-    if section_title:
-        st.subheader(section_title)
-    for field_name, field_value in obj:
-        if field_name in ["personal_details", "education", "work_experience", "projects", "skills", "certifications", "publications", "awards", "additional_sections"]:
-            st.write(" " * indent + f"**{field_name.replace('_', ' ').title()}**:")
-            if isinstance(field_value, BaseModel):
-                display_extracted_fields(field_value, None, indent + 1)
-            elif isinstance(field_value, list):
-                for item in field_value:
-                    if isinstance(item, BaseModel):
-                        display_extracted_fields(item, None, indent + 1)
-                    else:
-                        st.write(" " * (indent + 1) + "- " + str(item))
-            else:
-                st.write(" " * (indent + 1) + str(field_value))
-        else:
-            st.write(" " * indent + f"{field_name.replace('_', ' ').title()}: " + str(field_value))
-
-def get_json_download_link(json_str, download_name):
-    # Convert the JSON string back to a dictionary
-    data = json.loads(json_str)
-
-    # Convert the dictionary back to a JSON string with 4 spaces indentation
-    json_str_formatted = json.dumps(data, indent=4)
-
-    b64 = base64.b64encode(json_str_formatted.encode()).decode()
-    href = f'<a href="data:file/json;base64,{b64}" download="{download_name}.json">Click here to download the JSON file</a>'
-    return href
 
 st.set_page_config(layout="wide")
 
 st.title("Resume Parser")
+col1, col2 = st.columns([1,6])
+
+with col1:
+    st.image("llamallms.png", use_column_width=True)
+
+with col2:
+    st.write("""
+    ## 📝 Unlocking the Power of LLMs 🔓
+
+    Welcome to the Resume Parser, a powerful tool designed to extract structured information from resumes using the magic of Language Models (LLMs)! 🪄📄 As a data scientist and military veteran, I understand the importance of efficiency and accuracy when it comes to processing information. That's why I've created this app to showcase how different LLMs can help us parse resumes with ease. 💪
+
+    Resumes come in all shapes and sizes, and standardization is often a distant dream. 😴 But with the right LLM by your side, you can extract key information like personal details, education, work experience, and more, all with just a few clicks! 🖱️ Plus, by comparing the performance of various models, you can find the perfect balance of speed, accuracy, and cost for your specific use case. 💰
+
+    So, whether you're a recruiter looking to streamline your hiring process, or a data enthusiast curious about the capabilities of LLMs, the Resume Parser has got you covered! 🙌 Upload a resume, select your models, and watch the magic happen. 🎩✨ Let's unlock the full potential of LLMs together and make resume parsing a breeze! 😎
+    """)
 
 llm_dict = {
     "GPT 3.5 turbo": ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
@@ -146,23 +71,23 @@ with col2:
     selected_model2 = st.selectbox("Select Model 2", list(llm_dict.keys()), index=list(llm_dict.keys()).index("Mixtral 8x7b"))
 
 if uploaded_file is not None:
-    text = pdf_to_string(uploaded_file)
+    text = resume_helpers.pdf_to_string(uploaded_file)
 
     if st.button("Extract Resume Fields"):
         col1, col2 = st.columns(2)
 
         with col1:
             start_time = time.time()
-            extracted_fields1 = extract_resume_fields(text, selected_model1)
+            extracted_fields1 = resume_helpers.extract_resume_fields(text, selected_model1)
             end_time = time.time()
             elapsed_time = end_time - start_time
             st.write(f"Extraction completed in {elapsed_time:.2f} seconds")
-            display_extracted_fields(extracted_fields1, f"{selected_model1} Extracted Fields ")
+            resume_helpers.display_extracted_fields(extracted_fields1, f"{selected_model1} Extracted Fields ")
 
         with col2:
             start_time = time.time()
-            extracted_fields2 = extract_resume_fields(text, selected_model2)
+            extracted_fields2 = resume_helpers.extract_resume_fields(text, selected_model2)
             end_time = time.time()
             elapsed_time = end_time - start_time
             st.write(f"Extraction completed in {elapsed_time:.2f} seconds")
-            display_extracted_fields(extracted_fields2, f"{selected_model2} Extracted Fields ")
+            resume_helpers.display_extracted_fields(extracted_fields2, f"{selected_model2} Extracted Fields ")
llamallms.png ADDED
resume_helpers.py ADDED
@@ -0,0 +1,131 @@
+from dotenv import load_dotenv
+import io
+import streamlit as st
+import streamlit.components.v1 as components
+import base64
+
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import PydanticOutputParser
+from langchain_anthropic import ChatAnthropic
+from langchain_openai import ChatOpenAI
+from langchain_groq import ChatGroq
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.exceptions import OutputParserException
+from pydantic import ValidationError
+from langchain_core.pydantic_v1 import BaseModel, Field
+from resume_template import Resume
+from json import JSONDecodeError
+import PyPDF2
+import json
+import time
+import os
+
+
+# Set the LANGCHAIN_TRACING_V2 environment variable to 'true'
+os.environ['LANGCHAIN_TRACING_V2'] = 'true'
+
+# Set the LANGCHAIN_PROJECT environment variable to the desired project name
+os.environ['LANGCHAIN_PROJECT'] = 'Resume_Project'
+
+load_dotenv()
+llm_dict = {
+    "GPT 3.5 turbo": ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
+    "Anthropic Sonnet": ChatAnthropic(model_name="claude-3-sonnet-20240229"),
+    "Llama 3 8b": ChatGroq(model_name="llama3-8b-8192"),
+    "Llama 3 70b": ChatGroq(model_name="llama3-70b-8192"),
+    "Gemma 7b": ChatGroq(model_name="gemma-7b-it"),
+    "Mixtral 8x7b": ChatGroq(model_name="mixtral-8x7b-32768"),
+    # "Gemini 1.5 Pro": ChatGoogleGenerativeAI(model_name="gemini-1.5-pro-latest"),
+}
+def pdf_to_string(file):
+    """
+    Convert a PDF file to a string.
+
+    Parameters:
+    file (io.BytesIO): A file-like object representing the PDF file.
+
+    Returns:
+    str: The extracted text from the PDF.
+    """
+    pdf_reader = PyPDF2.PdfReader(file)
+    num_pages = len(pdf_reader.pages)
+    text = ''
+    for i in range(num_pages):
+        page = pdf_reader.pages[i]
+        text += page.extract_text()
+    file.close()
+    return text
+
+class CustomOutputParserException(Exception):
+    pass
+
+def extract_resume_fields(full_text, model):
+    """
+    Analyze a resume text and extract structured information using a specified language model.
+    Parameters:
+    full_text (str): The text content of the resume.
+    model (str): The language model object to use for processing the text.
+    Returns:
+    dict: A dictionary containing structured information extracted from the resume.
+    """
+    # The Resume object is imported from the local resume_template file
+
+    with open("prompts/resume_extraction.prompt", "r") as f:
+        template = f.read()
+
+    parser = PydanticOutputParser(pydantic_object=Resume)
+
+    prompt_template = PromptTemplate(
+        template=template,
+        input_variables=["resume"],
+        partial_variables={"response_template": parser.get_format_instructions()},
+    )
+    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
+
+    chain = prompt_template | llm | parser
+    max_attempts = 3
+    attempt = 1
+
+    while attempt <= max_attempts:
+        try:
+            output = chain.invoke(full_text)
+            print(output)
+            return output
+        except (CustomOutputParserException, ValidationError) as e:
+            if attempt == max_attempts:
+                raise e
+            else:
+                print(f"Parsing error occurred. Retrying (attempt {attempt + 1}/{max_attempts})...")
+                attempt += 1
+
+    return None
+
+def display_extracted_fields(obj, section_title=None, indent=0):
+    if section_title:
+        st.subheader(section_title)
+    for field_name, field_value in obj:
+        if field_name in ["personal_details", "education", "work_experience", "projects", "skills", "certifications", "publications", "awards", "additional_sections"]:
+            st.write(" " * indent + f"**{field_name.replace('_', ' ').title()}**:")
+            if isinstance(field_value, BaseModel):
+                display_extracted_fields(field_value, None, indent + 1)
+            elif isinstance(field_value, list):
+                for item in field_value:
+                    if isinstance(item, BaseModel):
+                        display_extracted_fields(item, None, indent + 1)
+                    else:
+                        st.write(" " * (indent + 1) + "- " + str(item))
+            else:
+                st.write(" " * (indent + 1) + str(field_value))
+        else:
+            st.write(" " * indent + f"{field_name.replace('_', ' ').title()}: " + str(field_value))
+
+def get_json_download_link(json_str, download_name):
+    # Convert the JSON string back to a dictionary
+    data = json.loads(json_str)
+
+    # Convert the dictionary back to a JSON string with 4 spaces indentation
+    json_str_formatted = json.dumps(data, indent=4)
+
+    b64 = base64.b64encode(json_str_formatted.encode()).decode()
+    href = f'<a href="data:file/json;base64,{b64}" download="{download_name}.json">Click here to download the JSON file</a>'
+    return href
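
Note that get_json_download_link is defined here but not yet called from app.py in this commit. A minimal sketch of how it could be wired in (hypothetical sample data, not code from the commit; st.markdown with unsafe_allow_html is standard Streamlit):

# Hypothetical wiring for the download link -- not part of this commit.
import json
import streamlit as st
import resume_helpers

# get_json_download_link expects a JSON string; any serialized resume works.
resume_json = json.dumps({"personal_details": {"name": "Jane Doe"}})  # sample data
href = resume_helpers.get_json_download_link(resume_json, "parsed_resume")
st.markdown(href, unsafe_allow_html=True)  # render the base64 <a download> link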