Spaces:
Running
Running
import re
import string

from langchain_community.document_loaders import PyMuPDFLoader
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def load_pdf(file_path):
    """Load a PDF via PyMuPDF and return its pages as LangChain documents.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The list of page documents produced by ``PyMuPDFLoader.load()``.
    """
    return PyMuPDFLoader(file_path).load()
def clean_text(text):
    """Normalize raw PDF text for downstream matching.

    Pipeline: strip selected special characters, remove ASCII punctuation
    and digits, lowercase, collapse whitespace, drop English stopwords,
    and lemmatize the remaining tokens.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        A single lowercase, space-separated string of lemmatized tokens.
    """
    # NOTE(review): this literal looks mojibake-mangled (plausibly dashes /
    # bullet / ellipsis originally) — confirm the intended characters.
    special_characters = "βββ’β¦"
    text = re.sub(f"[{re.escape(special_characters)}]", "", text)
    # One C-level pass removes all ASCII punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove digits.
    text = re.sub(r"\d+", "", text)
    # Lowercase before comparing against the (lowercase) stopword list.
    text = text.lower()
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # Single pass: split() collapses all whitespace runs, then stopwords are
    # dropped and the surviving tokens lemmatized. (A commented-out Porter
    # stemming variant previously lived here; lemmatization is the active
    # choice, so the dead code is removed.)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
def get_full_resume_text(file_path):
    """Load a resume PDF and return its cleaned, concatenated text.

    Args:
        file_path: Path to the resume PDF.

    Returns:
        The cleaned text of all pages, as produced by ``clean_text``.
    """
    pages = load_pdf(file_path)
    # join() replaces quadratic ``+=`` concatenation; the exact separator is
    # immaterial because clean_text collapses all whitespace afterwards.
    raw_text = "\n\n".join(page.page_content for page in pages)
    return clean_text(raw_text)
def process_pdf(file):
    """Entry point for an uploaded file object (e.g. from a Gradio upload).

    Args:
        file: An object exposing a ``.name`` attribute holding the file path.

    Returns:
        The cleaned resume text extracted from that file.
    """
    path = file.name
    return get_full_resume_text(path)