Amirizaniani committed on
Commit
2303155
1 Parent(s): e629fe7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -47
app.py CHANGED
@@ -43,42 +43,77 @@ def answer_question(prompt):
43
  return generated_answer
44
 
45
 
46
- def calculate_similarity(word, other_sentences, model, threshold=0.1, upper_limit=0.80):
47
- word_embedding = model.encode([word], convert_to_tensor=True)
48
- sentence_embeddings = model.encode(other_sentences, convert_to_tensor=True)
49
- similarities = scipy.spatial.distance.cdist(word_embedding, sentence_embeddings, "cosine")[0]
50
- return [(i, 1-similarity) for i, similarity in enumerate(similarities) if threshold < 1-similarity < upper_limit]
51
-
52
-
53
- def highlight_words(sentence, other_sentences, model, exclude_words):
54
- words = word_tokenize(sentence)
55
- color_codes = ["\033[41m", "\033[42m", "\033[43m", "\033[44m", "\033[45m", "\033[46m", "\033[47m"]
56
- html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray", "DodgerBlue", "Tomato"]
57
-
58
- all_matched_pairs = []
59
- for i, word in enumerate(words):
60
- if word.lower() not in exclude_words and word.isalnum():
61
- matches = calculate_similarity(word, other_sentences, model)
62
- for match_index, similarity in matches:
63
- if word not in all_matched_pairs:
64
- all_matched_pairs.append((i, match_index, similarity))
65
-
66
-
67
- # Correction for variable name and HTML formatting
 
 
 
 
 
 
 
 
68
  color_index = 0
69
- for pair in all_matched_pairs:
70
- color_code = html_color_codes[color_index % len(html_color_codes)]
71
- # Correctly apply HTML span with style for coloring
72
- words[pair[0]] = f"<span style='color: {color_code};'>{words[pair[0]]}</span>"
73
- tokenized_other_sentence = word_tokenize(other_sentences[pair[1]])
74
- tokenized_other_sentence = [f"<span style='color: {color_code};'>{word}</span>" if idx == pair[0] else word for idx, word in enumerate(tokenized_other_sentence)]
75
- other_sentences[pair[1]] = ' '.join(tokenized_other_sentence)
76
- color_index += 1
77
-
78
- return ' '.join(words)
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  model = SentenceTransformer('all-mpnet-base-v2')
 
82
 
83
  sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
84
 
@@ -89,8 +124,6 @@ sentences = ["In a quaint little town nestled in the heart of the mountains, a s
89
  "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
90
 
91
 
92
-
93
-
94
  text_list = []
95
 
96
  def updateChoices(prompt):
@@ -101,29 +134,33 @@ def setTextVisibility(cbg, model_name_input):
101
  sentences = []
102
  result = []
103
  model = SentenceTransformer('all-mpnet-base-v2')
104
- exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to", "However"}
105
  sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
106
  "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
107
  "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
108
  "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
109
  for text in cbg:
110
  sentences.append(answer_question(text, model_name_input))
 
 
 
 
111
 
112
- highlighted_sentences = []
113
- for i, sentence in enumerate(sentences):
114
- other_sentences = sentences[:i] + sentences[i+1:]
115
- highlighted_sentence = highlight_words(sentence, other_sentences, model, exclude_words)
116
- highlighted_sentences.append(highlighted_sentence)
117
-
118
- for idx, sentence in enumerate(highlighted_sentences):
119
- result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
120
-
121
- score = round(calculate_similarity_score(sentences))
122
 
123
- final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""
 
 
 
124
 
 
 
 
 
 
125
 
126
- return final_html
127
 
128
  def upload_file(files):
129
  file_paths = [file.name for file in files]
 
43
  return generated_answer
44
 
45
 
46
def calculate_similarity(word, other_words, model, threshold=0.5, upper_limit=0.85):
    """Find the first candidate whose embedding is moderately similar to ``word``.

    The cosine similarity between the embedding of ``word`` and the embedding
    of each entry of ``other_words`` is evaluated in order; the first entry
    whose similarity lies strictly between ``threshold`` and ``upper_limit``
    is returned.

    Args:
        word: The word to look up.
        other_words: Candidate words to compare against, in priority order.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding vector per input string.
        threshold: Exclusive lower bound on the cosine similarity.
        upper_limit: Exclusive upper bound on the cosine similarity. Defaults
            to the previously hard-coded 0.85, so candidates that are at least
            that similar (e.g. the very same word) are rejected.

    Returns:
        ``(index, similarity)`` for the first candidate inside the band, or
        ``(None, None)`` when there is no such candidate.
    """
    # Nothing to compare against: skip the model calls entirely.
    if len(other_words) == 0:
        return None, None

    word_embedding = model.encode([word])[0]
    for index, candidate_embedding in enumerate(model.encode(other_words)):
        # scipy returns the cosine *distance*; similarity is its complement.
        similarity = 1 - scipy.spatial.distance.cosine(word_embedding, candidate_embedding)
        if threshold < similarity < upper_limit:
            return index, similarity
    return None, None
54
+
55
+
56
def highlight_words_within_cluster(sentences, model, exclude_words):
    """Color words that have a similar counterpart in another sentence.

    Each sentence is tokenized with ``word_tokenize``. A token is a candidate
    when it is alphanumeric and its lower-cased form is not in
    ``exclude_words``. For every candidate token, ``calculate_similarity`` is
    asked for a match among the candidate tokens of the *other* sentences;
    when one is found the token is wrapped in an HTML ``<span>`` with a color.

    Colors are assigned per distinct word (case-sensitive), in order of first
    highlight, cycling through a fixed palette; the mapping is shared by all
    sentences of the call so the same word always gets the same color.

    Args:
        sentences: Sentences belonging to one cluster.
        model: Embedding model forwarded to ``calculate_similarity``.
        exclude_words: Container of lower-case words that are never
            highlighted or used as match candidates.

    Returns:
        A list with one string per input sentence: its tokens joined by single
        spaces, with matched tokens wrapped in colored ``<span>`` elements.
    """
    html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray"]
    word_to_color = {}
    color_index = 0

    def is_candidate(token):
        return token.lower() not in exclude_words and token.isalnum()

    # Tokenize every sentence once instead of once per (sentence, other) pair.
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    candidate_tokens = [
        [token for token in tokens if is_candidate(token)] for tokens in tokenized
    ]

    highlighted_sentences = []
    for sentence, words in zip(sentences, tokenized):
        # Sentences are excluded by value, so exact duplicates of the current
        # sentence do not contribute candidates either.
        all_other_words = [
            token
            for other, tokens in zip(sentences, candidate_tokens)
            if other != sentence
            for token in tokens
        ]

        # Per-sentence memo: a repeated word is looked up (and encoded) once.
        has_match = {}
        highlighted_words = []
        for word in words:
            if not is_candidate(word):
                highlighted_words.append(word)
                continue

            if word not in has_match:
                match_index, _ = calculate_similarity(word, all_other_words, model)
                has_match[word] = match_index is not None

            if not has_match[word]:
                highlighted_words.append(word)
                continue

            # Assign the next palette color the first time a word is highlighted.
            if word not in word_to_color:
                word_to_color[word] = html_color_codes[color_index % len(html_color_codes)]
                color_index += 1
            highlighted_words.append(
                f"<span style='color: {word_to_color[word]}'>{word}</span>"
            )

        highlighted_sentences.append(' '.join(highlighted_words))
    return highlighted_sentences
104
+
105
# Stop words that are skipped when looking for similar words to highlight.
# Membership is tested against the lower-cased token (``word.lower()``), so
# every entry must itself be lower case -- hence "i" rather than "I".
# NOTE(review): the same name appears to be assigned again further down at
# module level; confirm which of the two sets is intended to win.
exclude_words = {
    "a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am",
    "have", "an", "has", "had", "and", "by", "it", "its", "those", "these",
    "was", "were", "their", "them", "i", "you", "also", "your", "me", "after",
}
108
+
109
def cluster_sentences(sentences, model, num_clusters=1):
    """Assign each sentence to a cluster by running k-means on its embedding.

    Args:
        sentences: Sentences to cluster.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding vector per sentence.
        num_clusters: Requested number of clusters. It is capped at the number
            of sentences, because k-means cannot form more clusters than it
            has samples.

    Returns:
        A sequence with one integer cluster label per sentence, each smaller
        than ``num_clusters``. An empty list is returned for empty input.
    """
    # KMeans cannot be fitted on zero samples; there is nothing to label.
    if len(sentences) == 0:
        return []

    embeddings = model.encode(sentences)
    # Requesting more clusters than samples makes KMeans raise.
    n_clusters = min(num_clusters, len(sentences))
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(embeddings)
    return kmeans.labels_
114
 
115
  model = SentenceTransformer('all-mpnet-base-v2')
116
+ exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
117
 
118
  sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
119
 
 
124
  "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
125
 
126
 
 
 
127
  text_list = []
128
 
129
  def updateChoices(prompt):
 
134
  sentences = []
135
  result = []
136
  model = SentenceTransformer('all-mpnet-base-v2')
137
+ exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
138
  sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
139
  "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
140
  "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
141
  "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
142
  for text in cbg:
143
  sentences.append(answer_question(text, model_name_input))
144
+
145
+ # Step 1: Cluster the sentences
146
+ num_clusters = 1
147
+ sentence_clusters = cluster_sentences(sentences, model, num_clusters)
148
 
149
+ # Step 2: Highlight similar words within each cluster
150
+ clustered_sentences = [[] for _ in range(num_clusters)]
 
 
 
 
 
 
 
 
151
 
152
+ for sentence, cluster_id in zip(sentences, sentence_clusters):
153
+ clustered_sentences[cluster_id].append(sentence)
154
+
155
+ highlighted_clustered_sentences = []
156
 
157
+ for cluster in clustered_sentences:
158
+ highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
159
+
160
+ for idx, sentence in enumerate(highlighted_clustered_sentences):
161
+ result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
162
 
163
+ return result
164
 
165
  def upload_file(files):
166
  file_paths = [file.name for file in files]