Amirizaniani committed on
Commit
b7242c7
1 Parent(s): 6db0826

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -106
app.py CHANGED
@@ -59,90 +59,61 @@ def calculate_similarity(word, other_words, model, threshold=0.5):
59
  return None, None
60
 
61
 
62
- def highlight_words_within_cluster(sentences, model, exclude_words):
63
- # Create a dictionary to map words to color codes
64
- word_to_color = {}
65
- color_codes = [
66
- "\033[41m", # Background Red
67
- "\033[42m", # Background Green
68
- "\033[43m", # Background Yellow
69
- "\033[44m", # Background Blue
70
- "\033[45m", # Background Purple
71
- "\033[46m", # Background Cyan
72
- "\033[100m", # Background Dark Gray
73
- "\033[101m", # Background Light Red
74
- "\033[102m", # Background Light Green
75
- "\033[103m", # Background Light Yellow
76
- "\033[104m", # Background Light Blue
77
- "\033[105m", # Background Light Purple
78
- "\033[106m", # Background Light Cyan
79
- "\033[47m" # Background Gray
80
- ]
81
- html_color_codes = ["red", "green", "blue", "purple", "cyan", "fuchsia", "lime", "maroon", "olive", "navy", "teal", "gray"]
82
- color_index = 0
83
-
84
- highlighted_sentences = []
85
- for sentence in sentences:
86
- words = word_tokenize(sentence)
87
- other_sentences = [s for s in sentences if s != sentence]
88
- all_other_words = [word for s in other_sentences for word in word_tokenize(s) if word.lower() not in exclude_words and word.isalnum()]
89
-
90
- highlighted_words = []
91
- for word in words:
92
- if word.lower() not in exclude_words and word.isalnum():
93
- match_index, similarity = calculate_similarity(word, all_other_words, model)
94
- if match_index is not None:
95
- # Assign color to the word if not already assigned
96
- if word not in word_to_color:
97
- word_to_color[word] = html_color_codes[color_index % len(html_color_codes)]
98
- color_index += 1
99
- # Highlight the word
100
- #highlighted_word = f"{word_to_color[word]}{word}\033[0m"
101
- highlighted_word = "<span style='color: "+ word_to_color[word] +"'>"+ word +"</span>"
102
- else:
103
- highlighted_word = word
104
- highlighted_words.append(highlighted_word)
105
- else:
106
- highlighted_words.append(word)
107
-
108
- highlighted_sentences.append(' '.join(highlighted_words))
109
- return highlighted_sentences
110
-
111
- # Rest of the code, including the cluster_sentences function, remains the same
112
-
113
- exclude_words = {"a", "the", "for", "from", "of", "in","over", "as", "on", "is", "am", "have", "an","has", "had", "and", "by", "it", "its", "those", "these", "was", "were", "their", "them", "I", "you", "also", "your", "me", "after"}
114
-
115
- def cluster_sentences(sentences, model, num_clusters=3):
116
- embeddings = model.encode(sentences)
117
- kmeans = KMeans(n_clusters=num_clusters)
118
- kmeans.fit(embeddings)
119
- return kmeans.labels_
120
-
121
- model = SentenceTransformer('all-mpnet-base-v2')
122
- exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
123
-
124
- sentences = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
125
-
126
- "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
127
-
128
- "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
129
-
130
- "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
131
-
132
- # Step 1: Cluster the sentences
133
- num_clusters = 1
134
- sentence_clusters = cluster_sentences(sentences, model, num_clusters)
135
-
136
- # Step 2: Highlight similar words within each cluster
137
- clustered_sentences = [[] for _ in range(num_clusters)]
138
- for sentence, cluster_id in zip(sentences, sentence_clusters):
139
- clustered_sentences[cluster_id].append(sentence)
140
 
141
- highlighted_clustered_sentences = []
142
- for cluster in clustered_sentences:
143
- highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
 
 
 
 
 
 
 
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
 
146
  def calculate_similarity_score(sentences):
147
  # Encode all sentences to get their embeddings
148
  model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -233,36 +204,20 @@ def updateChoices(prompt):
233
  return gr.CheckboxGroup(choices=newChoices)
234
 
235
  def setTextVisibility(cbg, model_name_input):
236
- sentences = []
237
- result = []
238
- model = SentenceTransformer('all-mpnet-base-v2')
239
- exclude_words = {"a", "the", "for", "from", "of", "in", "over", "as", "on", "is", "am", "have", "an", "has", "had", "and", "by", "it", "its", "those", "these", "above", "to"}
240
- sentences_org = ["In a quaint little town nestled in the heart of the mountains, a small bakery famous for its artisanal breads and pastries had a line of customers stretching out the door, eagerly waiting to savor the freshly baked goods that were known far and wide for their delightful flavors.",
241
- "Within a picturesque mountain village, there stood a renowned bakery, celebrated for its handcrafted bread and sweet treats, attracting a long queue of patrons each morning, all keen to enjoy the baked delicacies that had gained widespread acclaim for their exceptional taste.",
242
- "A charming bakery, located in a small mountainous hamlet, renowned for producing exquisite handmade pastries and bread, was bustling with a crowd of eager customers lined up outside, each anticipating the chance to indulge in the famous baked items celebrated for their extraordinary deliciousness.",
243
- "In a cozy, mountain-encircled village, a beloved bakery was the center of attraction, known for its traditional baking methods and delightful pastries, drawing a consistent stream of people waiting outside, all desiring to experience the renowned flavors that made the bakery's products distinctively mouth-watering."]
244
- for text in cbg:
245
- sentences.append(answer_question(text, model_name_input))
246
-
247
- # Step 1: Cluster the sentences
248
- num_clusters = 1
249
- sentence_clusters = cluster_sentences(sentences, model, num_clusters)
250
-
251
- # Step 2: Highlight similar words within each cluster
252
- clustered_sentences = [[] for _ in range(num_clusters)]
253
 
254
- for sentence, cluster_id in zip(sentences, sentence_clusters):
255
- clustered_sentences[cluster_id].append(sentence)
 
 
 
256
 
257
- highlighted_clustered_sentences = []
258
-
259
- for cluster in clustered_sentences:
260
- highlighted_clustered_sentences.extend(highlight_words_within_cluster(cluster, model, exclude_words))
261
 
262
- for idx, sentence in enumerate(highlighted_clustered_sentences):
 
 
263
  result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
264
 
265
- score = round(calculate_similarity_score(sentences))
266
 
267
  final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""
268
 
 
59
  return None, None
60
 
61
 
62
+ from sentence_transformers import SentenceTransformer, util
63
+ import nltk
64
+ nltk.download('punkt') # Ensure you have the punkt tokenizer models
65
+ from nltk import tokenize
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
def highlight_similar_paragraphs_with_colors(paragraphs, similarity_threshold=0.25):
    """Wrap pairs of semantically similar sentences in coloured HTML spans.

    Every paragraph is split into sentences with ``tokenize.sent_tokenize``,
    all sentences are embedded with the 'all-MiniLM-L6-v2' sentence-transformer
    and compared pairwise by cosine similarity. Scanning in order, each
    sentence that is not yet highlighted is paired with the first later
    sentence whose similarity exceeds ``similarity_threshold``; both receive
    the same colour, and colours cycle through a fixed palette.

    Args:
        paragraphs: Iterable of paragraph strings.
        similarity_threshold: Cosine similarity a pair of sentences must
            exceed to be highlighted.

    Returns:
        A list with one HTML string per input paragraph, in input order. Each
        string is that paragraph's sentences (highlighted or plain), every
        sentence followed by a single space. Sentence text is HTML-escaped.
    """
    # Local import keeps this block self-contained; the text is placed inside
    # element content, so quotes do not need escaping.
    from html import escape

    # Split each paragraph into sentences, then flatten for batch encoding.
    all_sentences = [tokenize.sent_tokenize(paragraph) for paragraph in paragraphs]
    flattened_sentences = [sentence for group in all_sentences for sentence in group]

    # Nothing to embed: return one empty string per paragraph, which is what
    # the reconstruction step below yields for a paragraph with no sentences.
    if not flattened_sentences:
        return ['' for _ in all_sentences]

    # NOTE(review): the model is constructed on every call; consider loading
    # it once at module level if this function is called repeatedly.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentence_embeddings = model.encode(flattened_sentences)

    # Square matrix of pairwise cosine similarities between all sentences.
    cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)

    # Palette for highlighting; reused cyclically when there are more pairs
    # than colours.
    colors = ['yellow', 'lightgreen', 'lightblue', 'pink', 'lavender', 'salmon', 'peachpuff', 'powderblue', 'khaki', 'wheat']

    # Escape once so model-generated text cannot inject markup into the page.
    safe_sentences = [escape(sentence, quote=False) for sentence in flattened_sentences]
    rendered = list(safe_sentences)  # falls back to the plain sentence
    is_highlighted = [False] * len(flattened_sentences)

    color_index = 0
    sentence_count = len(flattened_sentences)
    for i in range(sentence_count):
        if is_highlighted[i]:
            # Already coloured as the partner of an earlier sentence.
            continue
        for j in range(i + 1, sentence_count):
            if cosine_similarities[i, j] > similarity_threshold:
                color = colors[color_index % len(colors)]
                color_index += 1
                for k in (i, j):
                    rendered[k] = "<span style='color: " + color + "'>" + safe_sentences[k] + "</span>"
                    is_highlighted[k] = True
                # Sentence i is now highlighted, so only its first match counts.
                break

    # Reassemble paragraphs by walking the flat list in paragraph-sized slices.
    highlighted_paragraphs = []
    start = 0
    for paragraph_sentences in all_sentences:
        end = start + len(paragraph_sentences)
        highlighted_paragraphs.append(''.join(sentence + ' ' for sentence in rendered[start:end]))
        start = end

    return highlighted_paragraphs
115
 
116
+
117
  def calculate_similarity_score(sentences):
118
  # Encode all sentences to get their embeddings
119
  model = SentenceTransformer('all-MiniLM-L6-v2')
 
204
  return gr.CheckboxGroup(choices=newChoices)
205
 
206
  def setTextVisibility(cbg, model_name_input):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
+ sentences = [answer_question(text, model_name_input) for text in cbg]
209
+
210
+ # Apply highlighting to all processed sentences, receiving one highlighted HTML string per answer.
211
+ highlighted_html = []
212
+ highlighted_html = highlight_similar_paragraphs_with_colors(sentences, similarity_threshold=0.05)
213
 
 
 
 
 
214
 
215
+ result = []
216
+ # Pair each selected 'cbg' prompt with the highlighted answer at the same index.
217
+ for idx, sentence in enumerate(highlighted_html):
218
  result.append("<p><strong>"+ cbg[idx] +"</strong></p><p>"+ sentence +"</p><br/>")
219
 
220
+ score = round(calculate_similarity_score(highlighted_html))
221
 
222
  final_html = f"""<div>{result}<div style="text-align: center; font-size: 24px; font-weight: bold;">Similarity Score: {score}</div></div>"""
223