Spaces:

Amirizaniani
/

AuditLLM

Running

App Files Files Community

Amirizaniani committed on Feb 29

Commit

2303155

•

1 Parent(s): e629fe7

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -47

app.py CHANGED Viewed

@@ -43,42 +43,77 @@ def answer_question(prompt):
     return generated_answer
-def calculate_similarity(word, other_sentences, model, threshold=0.1, upper_limit=0.80):
-    word_embedding = model.encode([word], convert_to_tensor=True)
-    sentence_embeddings = model.encode(other_sentences, convert_to_tensor=True)
-    similarities = scipy.spatial.distance.cdist(word_embedding, sentence_embeddings, "cosine")[0]
-    return [(i, 1-similarity) for i, similarity in enumerate(similarities) if threshold < 1-similarity < upper_limit]
-def highlight_words(sentence, other_sentences, model, exclude_words):
-    words = word_tokenize(sentence)
-    color_codes = ["\033[41m", "\033[42m", "\033[43m", "\033[44m", "\033[45m", "\033[46m", "\033[47m"]
-    html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray", "DodgerBlue", "Tomato"]
-    all_matched_pairs = []
-    for i, word in enumerate(words):
-        if word.lower() not in exclude_words and word.isalnum():
-            matches = calculate_similarity(word, other_sentences, model)
-            for match_index, similarity in matches:
-                if word not in all_matched_pairs:
-                    all_matched_pairs.append((i, match_index, similarity))
-    # Correction for variable name and HTML formatting
     color_index = 0
-    for pair in all_matched_pairs:
-        color_code = html_color_codes[color_index % len(html_color_codes)]
-        # Correctly apply HTML span with style for coloring
-        words[pair[0]] = f"<span style='color: {color_code};'>{words[pair[0]]}</span>"
-        tokenized_other_sentence = word_tokenize(other_sentences[pair[1]])
-        tokenized_other_sentence = [f"<span style='color: {color_code};'>{word}</span>" if idx == pair[0] else word for idx, word in enumerate(tokenized_other_sentence)]
-        other_sentences[pair[1]] = ' '.join(tokenized_other_sentence)
-        color_index += 1
-    return ' '.join(words)
 model = SentenceTransformer('all-mpnet-base-v2')
 sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
@@ -89,8 +124,6 @@ sentences = ["In a quaint little town nestled in the heart of the mountains, a s
 "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
 text_list = []
 def updateChoices(prompt):
@@ -101,29 +134,33 @@ def setTextVisibility(cbg, model_name_input):
     sentences = []
     result = []
     model = SentenceTransformer('all-mpnet-base-v2')
-    exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to", "However"}
     sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
                 "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
                 "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
                 "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
     for text in cbg:
          sentences.append(answer_question(text, model_name_input))
-    highlighted_sentences = []
-    for i, sentence in enumerate(sentences):
-        other_sentences = sentences[:i] + sentences[i+1:]
-        highlighted_sentence = highlight_words(sentence, other_sentences, model, exclude_words)
-        highlighted_sentences.append(highlighted_sentence)
-    for idx, sentence in enumerate(highlighted_sentences):
-        result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
-    score = round(calculate_similarity_score(sentences))
-    final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""
-    return final_html
 def upload_file(files):
     file_paths = [file.name for file in files]

     return generated_answer
def calculate_similarity(word, other_words, model, threshold=0.5, upper_limit=0.85):
    """Find the first candidate whose embedding is similar, but not near-identical, to ``word``.

    Args:
        word: The word to look up.
        other_words: Sequence of candidate words to compare against.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding vector per input string.
        threshold: Exclusive lower bound on the cosine similarity.
        upper_limit: Exclusive upper bound on the cosine similarity; keeps
            (near-)identical words from counting as a match.

    Returns:
        ``(index, similarity)`` for the first candidate, in input order, whose
        similarity lies strictly between ``threshold`` and ``upper_limit``;
        ``(None, None)`` when no candidate qualifies or there are none.
    """
    # Nothing to compare against: skip both encode calls entirely.
    if len(other_words) == 0:
        return None, None

    query_embedding = model.encode([word])[0]
    candidate_embeddings = model.encode(other_words)
    for i, embedding in enumerate(candidate_embeddings):
        # scipy returns cosine *distance*; convert it to a similarity.
        similarity = 1 - scipy.spatial.distance.cosine(query_embedding, embedding)
        if threshold < similarity < upper_limit:
            return i, similarity
    return None, None
def highlight_words_within_cluster(sentences, model, exclude_words):
    """Colour words that have a similar counterpart in another sentence of the cluster.

    Each sentence is tokenised with ``word_tokenize``. A token is eligible when
    it is alphanumeric and its lower-cased form is not in ``exclude_words``.
    An eligible token is wrapped in ``<span style='color: ...'>`` when
    ``calculate_similarity`` finds a match among the eligible tokens of the
    other sentences (sentences equal to the current one are left out).
    A given word keeps the same colour everywhere it is highlighted; colours
    are handed out round-robin from a fixed palette.

    Args:
        sentences: List of sentence strings belonging to one cluster.
        model: Embedding model forwarded to ``calculate_similarity``.
        exclude_words: Container of lower-case words that are never highlighted.

    Returns:
        A list with one string per input sentence: the sentence's tokens
        joined by single spaces, with matched tokens wrapped in HTML spans.
    """
    html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray"]

    def is_candidate(token):
        return token.lower() not in exclude_words and token.isalnum()

    # Tokenise and filter every sentence once instead of once per sentence pair.
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    candidates = [[tok for tok in tokens if is_candidate(tok)] for tokens in tokenized]

    word_to_color = {}
    color_index = 0
    highlighted_sentences = []
    for sentence, words in zip(sentences, tokenized):
        # Comparison pool: eligible tokens of all sentences that differ from this one.
        all_other_words = [
            tok
            for other, toks in zip(sentences, candidates)
            if other != sentence
            for tok in toks
        ]
        # The pool is fixed for this sentence, so a repeated word always gets
        # the same answer; remember it to avoid re-running the model.
        has_match = {}
        highlighted_words = []
        for word in words:
            if not all_other_words or not is_candidate(word):
                highlighted_words.append(word)
                continue
            if word not in has_match:
                match_index, _ = calculate_similarity(word, all_other_words, model)
                has_match[word] = match_index is not None
            if not has_match[word]:
                highlighted_words.append(word)
                continue
            # Assign a colour the first time this word is highlighted.
            if word not in word_to_color:
                word_to_color[word] = html_color_codes[color_index % len(html_color_codes)]
                color_index += 1
            highlighted_words.append(f"<span style='color: {word_to_color[word]}'>{word}</span>")
        highlighted_sentences.append(' '.join(highlighted_words))
    return highlighted_sentences
+# Module-level stop-word set (presumably for the highlighting helpers). NOTE(review): exclude_words is assigned again further down at module level, replacing this set - confirm which list is intended.
+exclude_words = {"a", "the", "for", "from", "of", "in","over", "as", "on", "is", "am", "have", "an","has", "had", "and", "by", "it", "its", "those", "these", "was", "were", "their", "them", "I", "you", "also", "your", "me", "after"}
def cluster_sentences(sentences, model, num_clusters=1):
    """Assign each sentence to a k-means cluster of its embedding.

    Args:
        sentences: List of sentence strings.
        model: Object exposing ``encode(sentences)`` that returns one
            embedding per sentence.
        num_clusters: Requested number of clusters. It is capped at the number
            of sentences, because k-means cannot form more clusters than it
            has samples.

    Returns:
        An array of integer cluster labels, one per sentence, in input order.
        Empty when ``sentences`` is empty.
    """
    # Local import keeps this block self-contained; numpy is already a
    # dependency of the clustering library used below.
    import numpy as np

    # No sentences: nothing to encode or fit, so return an empty label array
    # instead of letting the fit fail on zero samples.
    if len(sentences) == 0:
        return np.array([], dtype=int)

    embeddings = model.encode(sentences)
    n_clusters = min(num_clusters, len(sentences))
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(embeddings)
    return kmeans.labels_
 model = SentenceTransformer('all-mpnet-base-v2')
+exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
 sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
 "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
 text_list = []
 def updateChoices(prompt):
     sentences = []
     result = []
     model = SentenceTransformer('all-mpnet-base-v2')
+    exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
     sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
                 "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
                 "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
                 "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
     for text in cbg:
          sentences.append(answer_question(text, model_name_input))
+    # Step 1: Cluster the sentences
+    num_clusters = 1
+    sentence_clusters = cluster_sentences(sentences, model, num_clusters)
+    # Step 2: Highlight similar words within each cluster
+    clustered_sentences = [[] for _ in range(num_clusters)]
+    for sentence, cluster_id in zip(sentences, sentence_clusters):
+        clustered_sentences[cluster_id].append(sentence)
+    highlighted_clustered_sentences = []
+    for cluster in clustered_sentences:
+        highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
+    for idx, sentence in enumerate(highlighted_clustered_sentences):
+        result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
+    return result
 def upload_file(files):
     file_paths = [file.name for file in files]