Amirizaniani committed on
Commit
e273bef
1 Parent(s): b2c2432

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -32
app.py CHANGED
@@ -61,50 +61,40 @@ def calculate_similarity(word, other_words, model, threshold=0.5):
61
 
62
 
63
  def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
64
- # Load a pre-trained sentence-transformer model
65
  model = SentenceTransformer('all-MiniLM-L6-v2')
66
 
67
  # Split each paragraph into sentences
68
  all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
69
- flattened_sentences = [sentence for sublist in all_sentences for sentence in sublist] # Flatten the list
 
 
 
 
 
 
70
 
71
  # Encode all sentences into vectors
 
72
  sentence_embeddings = model.encode(flattened_sentences)
73
 
74
- # Calculate cosine similarities between sentence vectors
75
  cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)
76
 
77
- # A list of colors for highlighting, add more if needed
78
- colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']
79
-
80
- # Initialize a list to keep track of which sentences are semantically similar
81
- highlighted_sentences = [''] * len(flattened_sentences) # Pre-fill with empty strings
82
-
83
- # Iterate over the matrix to find sentences with high cosine similarity
84
- color_index = 0 # Initialize color index
85
- for i in range(len(cosine_similarities)):
86
- for j in range(i + 1, len(cosine_similarities)):
87
- if cosine_similarities[i, j] > similarity_threshold and not highlighted_sentences[i]:
88
- # Select color for highlighting
89
  color = colors[color_index % len(colors)]
 
 
 
 
90
  color_index += 1 # Move to the next color
91
-
92
- # Highlight the similar sentences
93
- highlighted_sentences[i] = ("<span style='color: "+ color +"'>"+ flattened_sentences[i]+"</span>")
94
- highlighted_sentences[j] = ("<span style='color: "+ color +"'>"+ flattened_sentences[j]+"</span>")
95
-
96
- # Reconstruct the paragraphs with highlighted sentences
97
- highlighted_paragraphs = []
98
- sentence_index = 0
99
- for paragraph_sentences in all_sentences:
100
- highlighted_paragraph = ''
101
- for _ in paragraph_sentences:
102
- # Use the original sentence if it wasn't highlighted; otherwise, use the highlighted version.
103
- highlighted_sentence = highlighted_sentences[sentence_index] if highlighted_sentences[sentence_index] else flattened_sentences[sentence_index]
104
- highlighted_paragraph += highlighted_sentence + ' '
105
- sentence_index += 1
106
- highlighted_paragraphs.append(highlighted_paragraph)
107
-
108
  # Combine all paragraphs into one HTML string
109
  html_output = '<div>' + '<br/><br/>'.join(highlighted_paragraphs) + '</div>'
110
  return highlighted_paragraphs
 
61
 
62
 
63
  def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
 
64
  model = SentenceTransformer('all-MiniLM-L6-v2')
65
 
66
  # Split each paragraph into sentences
67
  all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
68
+
69
+ # Initialize storage for highlighted sentences
70
+ highlighted_sentences = [['' for sentence in para] for para in all_sentences]
71
+ colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']
72
+
73
+ # Track which sentences belong to which paragraph
74
+ sentence_to_paragraph_index = [idx for idx, para in enumerate(all_sentences) for sentence in para]
75
 
76
  # Encode all sentences into vectors
77
+ flattened_sentences = [sentence for para in all_sentences for sentence in para]
78
  sentence_embeddings = model.encode(flattened_sentences)
79
 
80
+ # Calculate cosine similarities between all pairs of sentences
81
  cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)
82
 
83
+ # Iterate through each sentence pair and highlight if they are similar but from different paragraphs
84
+ color_index = 0
85
+ for i, embedding_i in enumerate(sentence_embeddings):
86
+ for j, embedding_j in enumerate(sentence_embeddings):
87
+ if i != j and cosine_similarities[i, j] > similarity_threshold and sentence_to_paragraph_index[i] != sentence_to_paragraph_index[j]:
 
 
 
 
 
 
 
88
  color = colors[color_index % len(colors)]
89
+ if highlighted_sentences[sentence_to_paragraph_index[i]][i % len(all_sentences[sentence_to_paragraph_index[i]])] == '':
90
+ highlighted_sentences[sentence_to_paragraph_index[i]][i % len(all_sentences[sentence_to_paragraph_index[i]])] = ("<span style='color: "+ color +"'>"+ flattened_sentences[i]+"</span>")
91
+ if highlighted_sentences[sentence_to_paragraph_index[j]][j % len(all_sentences[sentence_to_paragraph_index[j]])] == '':
92
+ highlighted_sentences[sentence_to_paragraph_index[j]][j % len(all_sentences[sentence_to_paragraph_index[j]])] = ("<span style='color: "+ color +"'>"+ flattened_sentences[j]+"</span>")
93
  color_index += 1 # Move to the next color
94
+
95
+ # Combine sentences back into paragraphs
96
+ highlighted_paragraphs = [' '.join(para) for para in highlighted_sentences]
97
+
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  # Combine all paragraphs into one HTML string
99
  html_output = '<div>' + '<br/><br/>'.join(highlighted_paragraphs) + '</div>'
100
  return highlighted_paragraphs