Spaces:

Amirizaniani
/

AuditLLM

Running

App Files Files Community

Amirizaniani committed on Feb 18

Commit

2508cf4

•

1 Parent(s): b6528b0

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -0

app.py CHANGED Viewed

@@ -35,6 +35,80 @@ def answer_question(prompt):
     generated_answer = hub_chain.run(input_data)
     return generated_answer
 text_list = []
 def updateChoices(prompt):

     generated_answer = hub_chain.run(input_data)
     return generated_answer
def calculate_similarity(word, other_words, model, threshold=0.5, upper_bound=0.85):
    """Find the first candidate that is similar, but not near-identical, to *word*.

    *word* and every entry of *other_words* are embedded with
    ``model.encode`` and compared by cosine similarity
    (``1 - cosine distance``).

    Args:
        word: The word to look up.
        other_words: Candidate words, scanned in order.
        model: Any object exposing ``encode(list_of_str)`` that returns one
            vector per input string.
        threshold: Exclusive lower bound on the similarity.
        upper_bound: Exclusive upper bound on the similarity. Defaults to
            0.85, the value that used to be hard-coded.

    Returns:
        ``(index, similarity)`` for the first candidate whose similarity lies
        strictly between *threshold* and *upper_bound*, or ``(None, None)``
        when no candidate qualifies.
    """
    # Imported locally so the function does not depend on how the enclosing
    # module happens to import scipy.
    from scipy.spatial.distance import cosine

    # Nothing to compare against: skip both model calls.
    if len(other_words) == 0:
        return None, None

    word_embedding = model.encode([word])[0]
    candidate_embeddings = model.encode(other_words)
    for index, candidate_embedding in enumerate(candidate_embeddings):
        similarity = 1 - cosine(word_embedding, candidate_embedding)
        if threshold < similarity < upper_bound:
            return index, similarity
    return None, None
def highlight_words_within_cluster(sentences, model, exclude_words):
    """Colour words that have a similar counterpart in another sentence.

    Each sentence is tokenised with ``word_tokenize``. A token is eligible
    when it is alphanumeric and its lower-cased form is not in
    *exclude_words*. An eligible token is wrapped in
    ``<span style='color: ...'>`` when ``calculate_similarity`` finds a match
    for it among the eligible tokens of the other sentences. A given word
    keeps the same colour across all sentences; colours are handed out in
    order and wrap around when the palette is exhausted.

    Args:
        sentences: Sequence of sentence strings.
        model: Embedding model forwarded to ``calculate_similarity``.
        exclude_words: Collection of lower-case words that are never
            highlighted and never used as candidates.

    Returns:
        A list with one HTML string per input sentence: its tokens joined by
        single spaces, matched words wrapped in coloured spans.
    """
    # Imported locally so the block does not rely on a module-level import.
    import html

    html_color_codes = [
        "red", "green", "blue", "purple", "cyan", "fuchsia",
        "lime", "maroon", "olive", "navy", "teal", "gray",
    ]

    def _is_eligible(token):
        # Alphanumeric and not a stop word (compared case-insensitively).
        return token.lower() not in exclude_words and token.isalnum()

    # Tokenise every sentence once instead of once per other sentence.
    tokenized = [word_tokenize(sentence) for sentence in sentences]

    word_to_color = {}
    highlighted_sentences = []
    for sentence, words in zip(sentences, tokenized):
        # Candidates come from sentences whose text differs from this one.
        all_other_words = [
            token
            for other, other_tokens in zip(sentences, tokenized)
            if other != sentence
            for token in other_tokens
            if _is_eligible(token)
        ]

        # The candidate list is fixed for this sentence, so a repeated word
        # always gets the same answer; cache it to avoid re-running the model.
        has_match = {}
        highlighted_words = []
        for word in words:
            if not _is_eligible(word):
                # The result is HTML, so neutralise markup characters in
                # tokens that are passed through unchanged.
                highlighted_words.append(html.escape(word, quote=False))
                continue

            if word not in has_match:
                if all_other_words:
                    match_index, _ = calculate_similarity(word, all_other_words, model)
                else:
                    match_index = None
                has_match[word] = match_index is not None

            if not has_match[word]:
                highlighted_words.append(word)
                continue

            if word not in word_to_color:
                # len(word_to_color) counts the colours handed out so far.
                palette_slot = len(word_to_color) % len(html_color_codes)
                word_to_color[word] = html_color_codes[palette_slot]
            # Eligible words are alphanumeric, so they need no escaping.
            highlighted_words.append(
                "<span style='color: " + word_to_color[word] + "'>" + word + "</span>"
            )
        highlighted_sentences.append(' '.join(highlighted_words))
    return highlighted_sentences
# Stop words that are never highlighted. Tokens are lower-cased before the
# membership test, so every entry here must be lower-case.
# NOTE(review): this name is assigned again further down the module, and that
# later assignment is the one in effect after import; consider consolidating.
exclude_words = {"a", "the", "for", "from", "of", "in","over", "as", "on", "is", "am", "have", "an","has", "had", "and", "by", "it", "its", "those", "these", "was", "were", "their", "them", "i", "you", "also", "your", "me", "after"}
def cluster_sentences(sentences, model, num_clusters=3, random_state=None):
    """Group sentences into clusters by running k-means on their embeddings.

    Args:
        sentences: Iterable of sentence strings.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding per sentence.
        num_clusters: Requested number of clusters. It is capped at the
            number of sentences, because k-means cannot form more clusters
            than there are samples.
        random_state: Optional seed forwarded to ``KMeans`` for reproducible
            labels. ``None`` keeps the previous unseeded behaviour.

    Returns:
        An array with one cluster label per sentence, in input order. Empty
        input yields an empty array without calling the model.
    """
    # Imported locally so the block does not rely on a module-level import.
    import numpy as np

    sentences = list(sentences)
    if not sentences:
        return np.empty(0, dtype=int)

    embeddings = model.encode(sentences)
    effective_clusters = min(num_clusters, len(sentences))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=random_state)
    kmeans.fit(embeddings)
    return kmeans.labels_
# Sentence-embedding model, instantiated once when the module is imported.
model = SentenceTransformer('all-mpnet-base-v2')

# NOTE(review): `exclude_words` is also assigned earlier in this module; this
# later assignment is the value in effect once the module has finished loading.
exclude_words = {
    "a", "the", "for", "from", "of", "in", "over", "as", "on",
    "is", "am", "have", "an", "has", "had", "and", "by",
    "it", "its", "those", "these", "above", "to",
}
 text_list = []
 def updateChoices(prompt):