Amirizaniani committed on
Commit
2508cf4
1 Parent(s): b6528b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py CHANGED
@@ -35,6 +35,80 @@ def answer_question(prompt):
35
  generated_answer = hub_chain.run(input_data)
36
  return generated_answer
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  text_list = []
39
 
40
  def updateChoices(prompt):
 
35
  generated_answer = hub_chain.run(input_data)
36
  return generated_answer
37
 
38
def calculate_similarity(word, other_words, model, threshold=0.5, upper_bound=0.85):
    """Find the first candidate that is similar, but not near-identical, to ``word``.

    Args:
        word: The string to compare against the candidates.
        other_words: Sequence of candidate strings.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding vector per input string.
        threshold: Exclusive lower bound on the cosine similarity.
        upper_bound: Exclusive upper bound on the cosine similarity; keeps
            (near-)identical words from being reported as matches.

    Returns:
        ``(index, similarity)`` for the first entry of ``other_words`` whose
        cosine similarity to ``word`` lies strictly between ``threshold`` and
        ``upper_bound``, or ``(None, None)`` when there is no such entry.
    """
    # Nothing to compare against: skip the encoder calls entirely.
    if len(other_words) == 0:
        return None, None

    word_embedding = model.encode([word])[0]
    candidate_embeddings = model.encode(other_words)
    for index, candidate_embedding in enumerate(candidate_embeddings):
        # scipy returns the cosine *distance*; similarity is its complement.
        similarity = 1 - scipy.spatial.distance.cosine(word_embedding, candidate_embedding)
        if threshold < similarity < upper_bound:
            return index, similarity
    return None, None
46
+
47
+
48
def highlight_words_within_cluster(sentences, model, exclude_words):
    """Wrap words that have a similar word in another sentence in a coloured ``<span>``.

    Each sentence is tokenised with ``word_tokenize``. A token is a candidate
    when it is alphanumeric and its lower-cased form is not in
    ``exclude_words``. A candidate is highlighted when
    ``calculate_similarity`` finds a match for it among the candidate tokens
    of the *other* sentences (sentences equal to the current one are left
    out). Every distinct highlighted word keeps one colour for the whole
    call; colours are handed out in order and reused cyclically.

    Args:
        sentences: Sequence of sentence strings.
        model: Embedding model forwarded to ``calculate_similarity``.
        exclude_words: Container of lower-case words that are never highlighted.

    Returns:
        A list with one string per input sentence: its tokens joined by single
        spaces, with highlighted tokens wrapped in
        ``<span style='color: ...'>``.
    """
    html_color_codes = [
        "red", "green", "blue", "purple", "cyan", "fuchsia",
        "lime", "maroon", "olive", "navy", "teal", "gray",
    ]
    word_to_color = {}
    color_index = 0

    def is_candidate(token):
        return token.lower() not in exclude_words and token.isalnum()

    # Tokenise and filter every sentence once instead of once per comparison.
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    candidate_tokens = [
        [token for token in tokens if is_candidate(token)] for tokens in tokenized
    ]

    highlighted_sentences = []
    for sentence, words in zip(sentences, tokenized):
        all_other_words = [
            word
            for other_sentence, other_candidates in zip(sentences, candidate_tokens)
            if other_sentence != sentence
            for word in other_candidates
        ]

        # The outcome for a word is fixed while the candidate list is fixed,
        # so remember it for repeated words in the same sentence.
        has_match = {}
        highlighted_words = []
        for word in words:
            if not is_candidate(word) or not all_other_words:
                highlighted_words.append(word)
                continue

            if word not in has_match:
                match_index, _ = calculate_similarity(word, all_other_words, model)
                has_match[word] = match_index is not None

            if not has_match[word]:
                highlighted_words.append(word)
                continue

            # Assign a colour the first time this word is highlighted.
            if word not in word_to_color:
                word_to_color[word] = html_color_codes[color_index % len(html_color_codes)]
                color_index += 1
            # NOTE(review): only alphanumeric tokens are wrapped; all other
            # tokens are emitted as-is (not HTML-escaped) — confirm this is
            # acceptable wherever the result is rendered.
            highlighted_words.append(
                f"<span style='color: {word_to_color[word]}'>{word}</span>"
            )

        highlighted_sentences.append(' '.join(highlighted_words))
    return highlighted_sentences
96
+
97
# Words that are never considered for highlighting. Membership is tested
# against the lower-cased token, so every entry must be lower case.
# NOTE: this name is assigned again further down in the module; the later
# assignment is the one in effect once the module has finished loading.
exclude_words = {
    "a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am",
    "have", "an", "has", "had", "and", "by", "it", "its", "those", "these",
    "was", "were", "their", "them", "i", "you", "also", "your", "me", "after",
}
100
+
101
def cluster_sentences(sentences, model, num_clusters=3):
    """Group sentences into clusters by running k-means on their embeddings.

    Args:
        sentences: Sequence of sentence strings.
        model: Object exposing ``encode(sentences)`` that returns one
            embedding per sentence.
        num_clusters: Requested number of clusters. It is capped at the
            number of sentences, because k-means cannot form more clusters
            than there are samples.

    Returns:
        An array with one integer cluster label per sentence (empty when
        ``sentences`` is empty). No ``random_state`` is passed to ``KMeans``,
        so label assignment may differ between runs.
    """
    if len(sentences) == 0:
        # Nothing to encode or fit; return an empty label array.
        import numpy as np

        return np.empty(0, dtype=int)

    embeddings = model.encode(sentences)
    kmeans = KMeans(n_clusters=min(num_clusters, len(sentences)))
    kmeans.fit(embeddings)
    return kmeans.labels_
106
+
107
# Sentence-embedding model used for the similarity and clustering helpers.
model = SentenceTransformer('all-mpnet-base-v2')

# Words that are never considered for highlighting. This is the union of the
# two stop-word lists that were previously assigned to this name one after the
# other (the second silently replaced the first). Entries are lower case
# because membership is tested against the lower-cased token.
exclude_words = {
    "a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am",
    "have", "an", "has", "had", "and", "by", "it", "its", "those", "these",
    "was", "were", "their", "them", "i", "you", "also", "your", "me", "after",
    "above", "to",
}
109
+
110
+
111
+
112
  text_list = []
113
 
114
  def updateChoices(prompt):