hybridsummarization / extractive.py
arousrihab's picture
Upload 3 files
22a5607 verified
# extractive.py
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import networkx as nx
import numpy as np
import torch
# Fetch the NLTK data packages this module relies on at import time.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)
def preprocess_text(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize`` and return them."""
    return sent_tokenize(text)
def get_sentence_embeddings(sentences, model, tokenizer):
    """Embed each sentence as the mean of the model's last hidden states.

    Args:
        sentences: Iterable of sentence strings; each one is encoded on its own.
        model: Callable invoked as ``model(**inputs)``. Its result must expose
            a ``last_hidden_state`` tensor (presumably a Hugging Face encoder
            -- confirm against the caller).
        tokenizer: Callable invoked as ``tokenizer(sentence, return_tensors="pt",
            padding=True, truncation=True, max_length=512)`` that returns a
            mapping usable as keyword arguments for ``model``.

    Returns:
        ``np.ndarray`` built from one pooled vector per sentence, in input
        order. An empty ``sentences`` yields an empty array.
    """
    embeddings = []
    # Inference only: no autograd graph is needed, and ``.numpy()`` would
    # refuse a tensor that requires grad.
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            outputs = model(**inputs)
            # Average over dim 1 (assumed to be the token axis of a
            # (1, tokens, hidden) tensor -- TODO confirm).
            pooled = torch.mean(outputs.last_hidden_state, dim=1)
            # Drop only the leading batch axis. A bare ``squeeze()`` would
            # also remove any other size-1 axis (e.g. hidden size 1) and
            # turn the embedding into a 0-d scalar. ``cpu()`` is a no-op for
            # CPU tensors and is required before ``numpy()`` otherwise.
            embeddings.append(pooled.squeeze(0).cpu().numpy())
    return np.array(embeddings)
def build_semantic_graph(embeddings, similarity_threshold=0.75):
    """Build an undirected similarity graph over sentence embeddings.

    Every embedding index becomes a node, including sentences that end up
    with no neighbours. An edge ``(i, j)`` is added when the cosine
    similarity of the two vectors is at least ``similarity_threshold``; the
    similarity is stored as the edge's ``weight`` attribute.

    Args:
        embeddings: Sequence of 1-D vectors, one per sentence.
        similarity_threshold: Minimum cosine similarity for an edge.

    Returns:
        ``networkx.Graph`` whose nodes are the integer positions in
        ``embeddings``.
    """
    vectors = list(embeddings)
    count = len(vectors)

    graph = nx.Graph()
    # Register all sentences up front so that isolated ones still exist as
    # nodes; otherwise they silently vanish from any later ranking.
    graph.add_nodes_from(range(count))

    # Norms are loop-invariant per vector; compute each one once.
    norms = [np.linalg.norm(vec) for vec in vectors]

    for i in range(count):
        # The graph is undirected and cosine similarity is symmetric, so
        # visiting each unordered pair once is sufficient.
        for j in range(i + 1, count):
            denominator = norms[i] * norms[j]
            if denominator == 0:
                # Cosine similarity is undefined for a zero vector; skip the
                # pair instead of dividing by zero and producing NaN.
                continue
            similarity = np.dot(vectors[i], vectors[j]) / denominator
            if similarity >= similarity_threshold:
                graph.add_edge(i, j, weight=similarity)
    return graph
def apply_textrank(graph, sentences, damping_factor=0.85, max_iter=100):
    """Rank graph nodes with PageRank and return them best-first.

    Args:
        graph: ``networkx`` graph whose nodes are sentence indices.
        sentences: The sentence list; only its length is used, to build a
            uniform personalization vector over indices ``0..len-1``.
        damping_factor: PageRank damping parameter, forwarded to
            ``nx.pagerank`` as ``alpha``.
        max_iter: Maximum number of power-iteration steps.

    Returns:
        List of ``(score, index)`` tuples sorted in descending order. Because
        whole tuples are compared, equal scores are ordered by higher index
        first.
    """
    num_nodes = len(sentences)
    personalization = {i: 1 / num_nodes for i in range(num_nodes)}
    scores = nx.pagerank(
        graph,
        # Previously the damping factor was accepted but never used, so the
        # caller's value had no effect on the ranking.
        alpha=damping_factor,
        personalization=personalization,
        max_iter=max_iter,
    )
    ranked_sentences = sorted(
        ((score, idx) for idx, score in scores.items()), reverse=True
    )
    return ranked_sentences
def generate_summary(ranked_sentences, sentences, max_length_ratio=0.5):
    """Assemble a summary from ranked sentences within a word budget.

    The budget is ``int(total_words * max_length_ratio)``, where
    ``total_words`` counts whitespace-separated words over all ``sentences``.
    Sentences are taken in the order given by ``ranked_sentences`` (pairs of
    ``(score, index)``); selection stops at the first sentence whose full
    word count would exceed the budget. Each selected sentence is emitted
    with English stop words removed, although its full word count is what is
    charged against the budget.

    Returns:
        The selected, stop-word-stripped sentences joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    word_budget = int(
        sum(len(text.split()) for text in sentences) * max_length_ratio
    )

    kept = []
    used = 0
    for _score, position in ranked_sentences:
        tokens = sentences[position].split()
        # Stop at the first sentence that does not fit; later, shorter
        # sentences are deliberately not considered.
        if used + len(tokens) > word_budget:
            break
        used += len(tokens)
        kept.append(
            " ".join(tok for tok in tokens if tok.lower() not in english_stops)
        )
    return " ".join(kept)