# extractive.py
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import networkx as nx
import numpy as np
import torch

nltk.download('stopwords')
nltk.download('punkt')


def preprocess_text(text):
    # Split the input document into sentences.
    sentences = sent_tokenize(text)
    return sentences


def get_sentence_embeddings(sentences, model, tokenizer):
    # Encode each sentence with the given transformer model and mean-pool the
    # last hidden state to obtain a fixed-size sentence embedding.
    embeddings = []
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)
            outputs = model(**inputs)
            sentence_embedding = torch.mean(outputs.last_hidden_state, dim=1)
            embeddings.append(sentence_embedding.squeeze().numpy())
    return np.array(embeddings)


def build_semantic_graph(embeddings, similarity_threshold=0.75):
    # Build an undirected graph with one node per sentence; connect pairs whose
    # cosine similarity meets the threshold.
    graph = nx.Graph()
    graph.add_nodes_from(range(len(embeddings)))  # keep isolated sentences as nodes
    for i, emb1 in enumerate(embeddings):
        for j in range(i + 1, len(embeddings)):
            emb2 = embeddings[j]
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
            if similarity >= similarity_threshold:
                graph.add_edge(i, j, weight=similarity)
    return graph


def apply_textrank(graph, sentences, damping_factor=0.85, max_iter=100):
    # Rank sentences with PageRank over the semantic graph, using the damping
    # factor as PageRank's alpha and a uniform personalization vector.
    num_nodes = len(sentences)
    personalization = {i: 1 / num_nodes for i in range(num_nodes)}
    scores = nx.pagerank(graph, alpha=damping_factor,
                         personalization=personalization, max_iter=max_iter)
    ranked_sentences = sorted(((score, idx) for idx, score in scores.items()), reverse=True)
    return ranked_sentences


def generate_summary(ranked_sentences, sentences, max_length_ratio=0.5):
    # Greedily take the highest-ranked sentences until the summary reaches
    # max_length_ratio of the original word count; stopwords are stripped from
    # the selected sentences before they are joined.
    stop_words = set(stopwords.words('english'))
    summary = []
    current_length = 0
    total_length = sum(len(sentence.split()) for sentence in sentences)
    max_length = int(total_length * max_length_ratio)
    for score, idx in ranked_sentences:
        sentence = sentences[idx]
        sentence_length = len(sentence.split())
        sentence_words = [word for word in sentence.split() if word.lower() not in stop_words]
        if current_length + sentence_length <= max_length:
            summary.append(" ".join(sentence_words))
            current_length += sentence_length
        else:
            break
    return " ".join(summary)
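

# Example usage: a minimal sketch, not part of the original file. It assumes
# the Hugging Face `transformers` package is installed and picks
# `bert-base-uncased` as an arbitrary encoder; any model whose output exposes
# `last_hidden_state` can be swapped in.
if __name__ == "__main__":
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
    model.eval()

    # Toy document used purely to exercise the pipeline.
    text = (
        "Extractive summarization selects whole sentences from a document. "
        "TextRank scores sentences by running PageRank over a sentence-similarity graph. "
        "Sentences with the highest scores are kept until a length budget is reached."
    )

    sentences = preprocess_text(text)
    embeddings = get_sentence_embeddings(sentences, model, tokenizer)
    graph = build_semantic_graph(embeddings, similarity_threshold=0.75)
    ranked = apply_textrank(graph, sentences)
    print(generate_summary(ranked, sentences, max_length_ratio=0.5))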