Spaces:

Amirizaniani
/

AuditLLM

Running

App Files Files Community

Amirizaniani committed on Mar 4

Commit

b7242c7

•

1 Parent(s): 6db0826

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -106

app.py CHANGED Viewed

@@ -59,90 +59,61 @@ def calculate_similarity(word, other_words, model, threshold=0.5):
     return None, None
-def highlight_words_within_cluster(sentences, model, exclude_words):
-    # Create a dictionary to map words to color codes
-    word_to_color = {}
-    color_codes = [
-    "\033[41m",  # Background Red
-    "\033[42m",  # Background Green
-    "\033[43m",  # Background Yellow
-    "\033[44m",  # Background Blue
-    "\033[45m",  # Background Purple
-    "\033[46m",  # Background Cyan
-    "\033[100m", # Background Dark Gray
-    "\033[101m", # Background Light Red
-    "\033[102m", # Background Light Green
-    "\033[103m", # Background Light Yellow
-    "\033[104m", # Background Light Blue
-    "\033[105m", # Background Light Purple
-    "\033[106m", # Background Light Cyan
-    "\033[47m"   # Background Gray
-    ]
-    html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray"]
-    color_index = 0
-    highlighted_sentences = []
-    for sentence in sentences:
-        words = word_tokenize(sentence)
-        other_sentences = [s for s in sentences if s != sentence]
-        all_other_words = [word for s in other_sentences for word in word_tokenize(s) if word.lower() not in exclude_words and word.isalnum()]
-        highlighted_words = []
-        for word in words:
-            if word.lower() not in exclude_words and word.isalnum():
-                match_index, similarity = calculate_similarity(word, all_other_words, model)
-                if match_index is not None:
-                    # Assign color to the word if not already assigned
-                    if word not in word_to_color:
-                        word_to_color[word] = html_color_codes[color_index % len(html_color_codes)]
-                        color_index += 1
-                    # Highlight the word
-                    #highlighted_word = f"{word_to_color[word]}{word}\033[0m"
-                    highlighted_word = "<span style='color: "+ word_to_color[word] +"'>"+ word +"</span>"
-                else:
-                    highlighted_word = word
-                highlighted_words.append(highlighted_word)
-            else:
-                highlighted_words.append(word)
-        highlighted_sentences.append(' '.join(highlighted_words))
-    return highlighted_sentences
-# Rest of the code, including the cluster_sentences function, remains the same
-exclude_words = {"a", "the", "for", "from", "of", "in","over", "as", "on", "is", "am", "have", "an","has", "had", "and", "by", "it", "its", "those", "these", "was", "were", "their", "them", "I", "you", "also", "your", "me", "after"}
-def cluster_sentences(sentences, model, num_clusters=3):
-    embeddings = model.encode(sentences)
-    kmeans = KMeans(n_clusters=num_clusters)
-    kmeans.fit(embeddings)
-    return kmeans.labels_
-model = SentenceTransformer('all-mpnet-base-v2')
-exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
-sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
-"Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
-"A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
-"In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
-# Step 1: Cluster the sentences
-num_clusters = 1
-sentence_clusters = cluster_sentences(sentences, model, num_clusters)
-# Step 2: Highlight similar words within each cluster
-clustered_sentences = [[] for _ in range(num_clusters)]
-for sentence, cluster_id in zip(sentences, sentence_clusters):
-    clustered_sentences[cluster_id].append(sentence)
-highlighted_clustered_sentences = []
-for cluster in clustered_sentences:
-    highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
 def calculate_similarity_score(sentences):
     # Encode all sentences to get their embeddings
     model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -233,36 +204,20 @@ def updateChoices(prompt):
     return gr.CheckboxGroup(choices=newChoices)
 def setTextVisibility(cbg, model_name_input):
-    sentences = []
-    result = []
-    model = SentenceTransformer('all-mpnet-base-v2')
-    exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
-    sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
-                "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
-                "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
-                "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
-    for text in cbg:
-         sentences.append(answer_question(text, model_name_input))
-    # Step 1: Cluster the sentences
-    num_clusters = 1
-    sentence_clusters = cluster_sentences(sentences, model, num_clusters)
-    # Step 2: Highlight similar words within each cluster
-    clustered_sentences = [[] for _ in range(num_clusters)]
-    for sentence, cluster_id in zip(sentences, sentence_clusters):
-        clustered_sentences[cluster_id].append(sentence)
-    highlighted_clustered_sentences = []
-    for cluster in clustered_sentences:
-        highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
-    for idx, sentence in enumerate(highlighted_clustered_sentences):
         result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
-    score = round(calculate_similarity_score(sentences))
     final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""

     return None, None
+from sentence_transformers import SentenceTransformer, util
+import nltk
+nltk.download('punkt')  # Ensure you have the punkt tokenizer models
+from nltk import tokenize
+def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
+    # Color-mark pairs of semantically similar sentences across `paragraphs`; returns one HTML string per paragraph.
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    # Split each paragraph into its sentences (one list of sentences per paragraph)
+    all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
+    flattened_sentences = [sentence for sublist in all_sentences for sentence in sublist]  # All sentences, in paragraph order
+    # Embed every sentence with a single encode() call
+    sentence_embeddings = model.encode(flattened_sentences)
+    # Pairwise cosine similarity of every sentence against every other; indexed as [i, j] below
+    cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)
+    # Highlight palette; reused cyclically (color_index % len(colors)) once all entries have been handed out
+    colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']
+    # highlighted_sentences[k] holds the <span>-wrapped HTML for sentence k, or '' if it was never matched
+    highlighted_sentences = [''] * len(flattened_sentences)  # '' means "not highlighted"
+    # Scan the upper triangle (j > i) for pairs whose similarity exceeds similarity_threshold
+    color_index = 0  # Next palette slot to hand out
+    for i in range(len(cosine_similarities)):
+        for j in range(i + 1, len(cosine_similarities)):
+            if cosine_similarities[i, j] > similarity_threshold and not highlighted_sentences[i]:
+                # Only reached while sentence i is still unhighlighted, so row i colors at most one pair
+                color = colors[color_index % len(colors)]
+                color_index += 1  # Each matched pair gets the next color
+                # Wrap both sentences in a span setting the CSS text color; sentence j may be re-wrapped by a later row
+                highlighted_sentences[i] = ("<span style='color: "+  color  +"'>"+ flattened_sentences[i]+"</span>")
+                highlighted_sentences[j] = ("<span style='color: "+  color +"'>"+ flattened_sentences[j]+"</span>")
+    # Rebuild each paragraph by walking the flat list in the same order it was flattened
+    highlighted_paragraphs = []
+    sentence_index = 0
+    for paragraph_sentences in all_sentences:
+        highlighted_paragraph = ''
+        for _ in paragraph_sentences:
+            # Fall back to the plain sentence when no highlighted version was stored for this index
+            highlighted_sentence = highlighted_sentences[sentence_index] if highlighted_sentences[sentence_index] else flattened_sentences[sentence_index]
+            highlighted_paragraph += highlighted_sentence + ' '
+            sentence_index += 1
+        highlighted_paragraphs.append(highlighted_paragraph)
+    # NOTE(review): html_output is built here but never used; the per-paragraph list is what gets returned
+    html_output = '<div>' + '<br/><br/>'.join(highlighted_paragraphs) + '</div>'
+    return highlighted_paragraphs
 def calculate_similarity_score(sentences):
     # Encode all sentences to get their embeddings
     model = SentenceTransformer('all-MiniLM-L6-v2')
     return gr.CheckboxGroup(choices=newChoices)
 def setTextVisibility(cbg, model_name_input):
+    sentences = [answer_question(text, model_name_input) for text in cbg]
+    # Apply highlighting to all answers; the helper returns a list with one highlighted HTML string per answer.
+    highlighted_html = []
+    highlighted_html = highlight_similar_paragraphs_with_colors(sentences, similarity_threshold=0.05)
+    result = []
+    # Pair each highlighted answer with the 'cbg' prompt at the same index.
+    for idx, sentence in enumerate(highlighted_html):
         result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
+    score = round(calculate_similarity_score(highlighted_html))
     final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""