import PyPDF2
import spacy
from collections import Counter
import heapq
import io

# Load the spaCy English model from a local directory (pinned to 3.7.1)
nlp = spacy.load("./en_core_web_sm-3.7.1")

def read_pdf(file_stream):
    """Extract plain text from every page of a PDF file."""
    text = ''
    reader = PyPDF2.PdfReader(file_stream)
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += (page.extract_text() or '') + ' '
    return text.strip()

def extract_key_phrases(text):
    doc = nlp(text)
    # Combine noun chunks and named entities as candidates for key phrases
    key_phrases = [chunk.text for chunk in doc.noun_chunks] + [ent.text for ent in doc.ents]
    return key_phrases

def score_sentences(text, key_phrases):
    """Score each sentence by how many key phrases appear in it (substring match)."""
    sentence_scores = {}
    doc = nlp(text)
    for sent in doc.sents:
        for phrase in key_phrases:
            if phrase in sent.text:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + 1
    return sentence_scores

def summarize_text(sentence_scores, num_points=5):
    # Pick the highest-scoring sentences
    summary_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
    # Format summary as bullet points; strip stray whitespace left over from PDF extraction
    summary = '\n'.join(f"- {sent.text.strip()}" for sent in summary_sentences)
    return summary
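
# Example usage: a minimal sketch of wiring the pipeline together.
# "document.pdf" is a hypothetical path, not part of the original app,
# which presumably receives its file stream from the Space's UI.
if __name__ == "__main__":
    with open("document.pdf", "rb") as f:  # hypothetical input file
        pdf_text = read_pdf(f)
    phrases = extract_key_phrases(pdf_text)
    scores = score_sentences(pdf_text, phrases)
    print(summarize_text(scores, num_points=5))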