Spaces:

Amirizaniani
/

AuditLLM

Running

App Files Files Community

Amirizaniani committed on Mar 6

Commit

e273bef

•

1 Parent(s): b2c2432

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -32

app.py CHANGED Viewed

@@ -61,50 +61,40 @@ def calculate_similarity(word, other_words, model, threshold=0.5):
 def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
     """Wrap similar sentences in coloured ``<span>`` tags, paragraph by paragraph.

     Every paragraph is split into sentences, all sentences are embedded with
     the 'all-MiniLM-L6-v2' model and compared pairwise by cosine similarity.
     Scanning the sentences in order, each sentence that has no highlight yet
     is paired with the first later sentence whose similarity exceeds
     ``similarity_threshold``; both receive the next palette colour (a later
     sentence can be re-coloured by a subsequent pairing).

     Returns a list with one string per input paragraph, in which every
     sentence is followed by a single space.
     """
     encoder = SentenceTransformer('all-MiniLM-L6-v2')
     # Sentence lists, one per paragraph, plus the same sentences as one flat list.
     all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
     flat = []
     for sentences in all_sentences:
         flat.extend(sentences)
     embeddings = encoder.encode(flat)
     similarity = util.pytorch_cos_sim(embeddings, embeddings)
     palette = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']
     # marked[k] holds the highlighted markup for sentence k, or '' if untouched.
     marked = ['' for _ in flat]
     next_color = 0
     total = len(similarity)
     for first in range(total):
         if marked[first]:
             # Already coloured as the partner of an earlier sentence.
             continue
         for second in range(first + 1, total):
             if similarity[first, second] > similarity_threshold:
                 shade = palette[next_color % len(palette)]
                 next_color += 1
                 marked[first] = f"<span style='color: {shade}'>{flat[first]}</span>"
                 marked[second] = f"<span style='color: {shade}'>{flat[second]}</span>"
                 # Only the first match of an unmarked sentence is used.
                 break
     # Fall back to the plain sentence wherever no highlight was produced.
     rendered = [markup or plain for markup, plain in zip(marked, flat)]
     highlighted_paragraphs = []
     cursor = 0
     for sentences in all_sentences:
         stop = cursor + len(sentences)
         highlighted_paragraphs.append(''.join(text + ' ' for text in rendered[cursor:stop]))
         cursor = stop
     # Combined HTML string; built here but not returned.
     html_output = '<div>' + '<br/><br/>'.join(highlighted_paragraphs) + '</div>'
     return highlighted_paragraphs

 def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
     """Colour sentences that are semantically similar across different paragraphs.

     Each paragraph is split into sentences, all sentences are embedded with
     the 'all-MiniLM-L6-v2' model, and every pair of sentences coming from two
     different paragraphs is compared by cosine similarity. When a pair
     exceeds ``similarity_threshold`` both sentences are wrapped in a
     ``<span style='color: ...'>`` tag; a sentence joining an already
     coloured match reuses that colour, otherwise the next palette colour is
     taken (the palette wraps around when exhausted).

     Args:
         paragraphs: iterable of paragraph strings.
         similarity_threshold: cosine similarity above which two sentences
             are treated as similar.

     Returns:
         A list with one string per input paragraph: its sentences joined by
         single spaces, similar ones wrapped in coloured spans and all other
         sentences left as plain text.
     """
     colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']
     # Split each paragraph into sentences
     all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
     flattened_sentences = [sentence for para in all_sentences for sentence in para]
     if not flattened_sentences:
         # Nothing to encode or compare: one empty string per paragraph.
         return ['' for _ in all_sentences]
     # Paragraph index of every flattened sentence, used to skip same-paragraph pairs.
     sentence_to_paragraph_index = [idx for idx, para in enumerate(all_sentences) for _ in para]
     # Encode all sentences into vectors
     model = SentenceTransformer('all-MiniLM-L6-v2')
     sentence_embeddings = model.encode(flattened_sentences)
     # Calculate cosine similarities between all pairs of sentences
     cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)
     # Colour chosen for each flattened sentence; None means "not highlighted".
     # Tracking by flat index avoids deriving a per-paragraph slot with a
     # modulo, which is wrong when paragraphs have different sentence counts.
     sentence_colors = [None] * len(flattened_sentences)
     color_index = 0
     sentence_count = len(flattened_sentences)
     for i, paragraph_i in enumerate(sentence_to_paragraph_index):
         # The similarity matrix is symmetric, so only pairs with i < j are visited.
         for j in range(i + 1, sentence_count):
             if paragraph_i == sentence_to_paragraph_index[j]:
                 continue
             if sentence_colors[i] is not None and sentence_colors[j] is not None:
                 continue
             if cosine_similarities[i, j] > similarity_threshold:
                 color = sentence_colors[i] or sentence_colors[j]
                 if color is None:
                     # Advance the palette only when a new colour is actually used.
                     color = colors[color_index % len(colors)]
                     color_index += 1
                 sentence_colors[i] = color
                 sentence_colors[j] = color
     # Combine sentences back into paragraphs, keeping unmatched sentences as plain text.
     # NOTE(review): sentence text is inserted into the markup unescaped, as before;
     # confirm whether callers need HTML escaping.
     highlighted_paragraphs = []
     start = 0
     for para in all_sentences:
         end = start + len(para)
         highlighted_paragraphs.append(' '.join(
             "<span style='color: " + color + "'>" + sentence + "</span>" if color else sentence
             for sentence, color in zip(para, sentence_colors[start:end])
         ))
         start = end
     return highlighted_paragraphs