Spaces:
Running
Running
from dotenv import load_dotenv | |
import io | |
import streamlit as st | |
from langchain.prompts import PromptTemplate | |
from langchain.output_parsers import PydanticOutputParser | |
from langchain_community.chat_models import ChatAnthropic | |
from langchain_openai import ChatOpenAI | |
from pydantic import ValidationError | |
from resume_template import Resume | |
from json import JSONDecodeError | |
import PyPDF2 | |
import json | |
import time | |
load_dotenv() | |
def pdf_to_string(file):
    """
    Convert a PDF file to a string.

    Parameters:
    file (io.BytesIO): A file-like object representing the PDF file.
        It is closed before this function returns, whether or not
        reading succeeds.

    Returns:
    str: The text of every page, concatenated in page order with no
        separator inserted between pages.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        # A page may produce no text (e.g. a scanned image); fall back to an
        # empty string so the join never sees a non-string value.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    finally:
        # Release the upload even when PyPDF2 fails on a malformed document.
        file.close()
def extract_resume_fields(full_text, model):
    """
    Analyze a resume text and extract structured information using a specified language model.

    Parameters:
    full_text (str): The text content of the resume.
    model (str): Name of the language model to use. It is looked up in the
        module-level llm_dict; a name that is not registered there is
        passed to ChatOpenAI as the model name.

    Returns:
    str: A JSON string of the parsed Resume when the model output can be
        parsed and validated; otherwise the raw text returned by the model.
    """
    # The Resume object is imported from the local resume_template file
    with open("prompts/resume_extraction.prompt", "r", encoding="utf-8") as f:
        template = f.read()

    parser = PydanticOutputParser(pydantic_object=Resume)
    prompt_template = PromptTemplate(
        template=template,
        input_variables=["resume"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )

    # Invoke the language model and process the resume
    formatted_input = prompt_template.format_prompt(resume=full_text)

    # Only build a fallback client when the name is not preconfigured;
    # passing it as the default to dict.get() would construct it on every call.
    llm = llm_dict.get(model)
    if llm is None:
        llm = ChatOpenAI(temperature=0, model=model)

    output = llm.invoke(formatted_input.to_string())

    try:
        parsed_output = parser.parse(output.content)
    except ValidationError as e:
        print(f"Validation error: {e}")
        print(output)
        return output.content
    except JSONDecodeError as e:
        print(f"JSONDecodeError error: {e}")
        print(output)
        return output.content
    except ValueError as e:
        # NOTE(review): LangChain's parser appears to re-raise JSON/validation
        # failures as OutputParserException, a ValueError subclass, so the
        # two handlers above would not see them - confirm for the pinned version.
        print(f"Output parsing error: {e}")
        print(output)
        return output.content
    else:
        json_output = parsed_output.json()
        print(json_output)
        return json_output
st.title("Resume Parser")

# Set up the LLM dictionary: maps the name shown in the dropdown to a
# configured chat model client.
llm_dict = {
    "gpt-4-1106-preview": ChatOpenAI(temperature=0, model="gpt-4-1106-preview"),
    "gpt-4": ChatOpenAI(temperature=0, model="gpt-4"),
    "gpt-3.5-turbo-1106": ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106"),
    "claude-2": ChatAnthropic(model="claude-2", max_tokens=20_000),
    "claude-instant-1": ChatAnthropic(model="claude-instant-1", max_tokens=20_000),
}

# Add a Streamlit dropdown menu for model selection
selected_model = st.selectbox("Select a model", list(llm_dict))

# Add a file uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# The trigger button is only rendered once a file has been uploaded
if uploaded_file is not None and st.button("Convert PDF to Text"):
    start_time = time.time()  # Start the timer

    # Convert the uploaded file to a string
    text = pdf_to_string(uploaded_file)

    # Extract resume fields using the selected model
    extracted_fields = extract_resume_fields(text, selected_model)

    elapsed_time = time.time() - start_time  # Calculate the elapsed time

    # Display the elapsed time
    st.write(f"Extraction completed in {elapsed_time:.2f} seconds")

    # Keep the untouched result so it can still be shown if decoding fails
    raw_output = extracted_fields

    # extract_resume_fields returns a JSON string on success but the raw
    # model text on a parse failure, so decoding has to tolerate non-JSON.
    if isinstance(extracted_fields, str):
        try:
            extracted_fields = json.loads(extracted_fields)
        except JSONDecodeError:
            extracted_fields = None

    if isinstance(extracted_fields, dict):
        for key, value in extracted_fields.items():
            st.write(f"{key}: {value}")
    else:
        # Not a field mapping: report it instead of crashing on .items()
        st.error("The model response could not be parsed into resume fields.")
        st.write(raw_output)