DexterSptizu committed on
Commit
8afde48
•
1 Parent(s): fbf258f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -103
app.py CHANGED
@@ -41,36 +41,97 @@ def load_wordllama():
41
 
42
  wl = load_wordllama()
43
 
44
- def create_3d_visualization(texts, embeddings):
45
- # Reduce to 3D using PCA
46
- pca = PCA(n_components=3)
47
- embeddings_3d = pca.fit_transform(embeddings)
48
 
49
- # Create DataFrame
50
- df = pd.DataFrame(
51
- embeddings_3d,
52
- columns=['X', 'Y', 'Z']
53
- )
54
  df['text'] = texts
55
 
56
- fig = px.scatter_3d(
57
- df, x='X', y='Y', z='Z',
58
- text='text',
59
- title='Word Embeddings in 3D Space'
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- fig.update_traces(
63
- marker=dict(size=8, opacity=0.8),
64
- textposition='top center'
65
- )
66
  fig.update_layout(
67
- scene=dict(
68
- xaxis_title='Component 1',
69
- yaxis_title='Component 2',
70
- zaxis_title='Component 3'
71
- ),
72
- height=700
73
  )
 
74
  return fig
75
 
76
  def main():
@@ -78,10 +139,10 @@ def main():
78
  st.markdown("<p class='title-font'>Explore the power of WordLlama embeddings</p>",
79
  unsafe_allow_html=True)
80
 
81
- tabs = st.tabs(["💫 Similarity Explorer", "🎯 Document Ranking", "🔍 Fuzzy Deduplication"])
82
 
83
  with tabs[0]:
84
- st.markdown("### Compare Text Similarity")
85
 
86
  col1, col2 = st.columns(2)
87
  with col1:
@@ -92,97 +153,71 @@ def main():
92
  if st.button("Calculate Similarity", key="sim_button"):
93
  similarity = wl.similarity(text1, text2)
94
 
95
- st.markdown("### Similarity Score")
96
- st.metric(
97
- label="Cosine Similarity",
98
- value=f"{similarity:.4f}",
99
- help="Score ranges from 0 (different) to 1 (identical)"
100
- )
101
 
102
- # Visualize both texts in 3D space
103
- embeddings = wl.embed([text1, text2])
104
- st.plotly_chart(
105
- create_3d_visualization([text1, text2], embeddings),
106
- use_container_width=True
107
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  with tabs[1]:
110
- st.markdown("### Rank Documents by Similarity")
111
 
112
- query = st.text_area("Query Text", value="I went to the car", height=100)
 
113
 
114
- # Multiple document input
115
- st.markdown("### Enter Documents to Rank")
116
- num_docs = st.slider("Number of documents:", 2, 6, 4)
117
-
118
- documents = []
119
- for i in range(num_docs):
120
- doc = st.text_area(f"Document {i+1}",
121
- value=f"Example document {i+1}",
122
- height=50,
123
- key=f"doc_{i}")
124
- documents.append(doc)
125
 
126
- if st.button("Rank Documents", key="rank_button"):
127
- ranked_docs = wl.rank(query, documents)
128
 
129
- st.markdown("### Ranking Results")
130
- for doc, score in ranked_docs:
131
- st.markdown(f"""
132
- <div style='padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;'>
133
- <b>Score: {score:.4f}</b><br>
134
- {doc}
135
- </div>
136
- """, unsafe_allow_html=True)
137
-
138
- # Visualize all texts including query
139
- all_texts = [query] + documents
140
- embeddings = wl.embed(all_texts)
141
  st.plotly_chart(
142
- create_3d_visualization(all_texts, embeddings),
143
  use_container_width=True
144
  )
145
-
146
- with tabs[2]:
147
- st.markdown("### Fuzzy Deduplication")
148
- st.markdown("""
149
- Remove similar documents based on a similarity threshold.
150
- Documents with similarity above the threshold will be considered duplicates.
151
- """)
152
-
153
- # Document input for deduplication
154
- st.markdown("### Enter Documents")
155
- num_dedup_docs = st.slider("Number of documents:", 2, 8, 4, key="dedup_slider")
156
-
157
- dedup_docs = []
158
- for i in range(num_dedup_docs):
159
- doc = st.text_area(f"Document {i+1}",
160
- value=f"Example document {i+1}",
161
- height=50,
162
- key=f"dedup_doc_{i}")
163
- dedup_docs.append(doc)
164
-
165
- threshold = st.slider("Similarity Threshold:", 0.0, 1.0, 0.8)
166
-
167
- if st.button("Find Duplicates", key="dedup_button"):
168
- unique_docs = wl.deduplicate(dedup_docs, threshold=threshold)
169
 
170
- st.markdown("### Results")
171
- st.markdown(f"Found {len(unique_docs)} unique documents:")
172
-
173
- for doc in unique_docs:
174
- st.markdown(f"""
175
- <div style='padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;'>
176
- {doc}
177
- </div>
178
- """, unsafe_allow_html=True)
179
-
180
- # Visualize all documents
181
- embeddings = wl.embed(dedup_docs)
182
  st.plotly_chart(
183
- create_3d_visualization(dedup_docs, embeddings),
184
  use_container_width=True
185
  )
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  if __name__ == "__main__":
188
  main()
 
41
 
42
  wl = load_wordllama()
43
 
44
+ def create_visualization(texts, embeddings):
45
+ """Create appropriate visualization based on number of samples"""
46
+ n_samples = len(embeddings)
 
47
 
48
+ # Create DataFrame with original embeddings
49
+ df = pd.DataFrame(embeddings)
 
 
 
50
  df['text'] = texts
51
 
52
+ if n_samples == 2:
53
+ # For 2 samples, create a 2D visualization
54
+ fig = go.Figure()
55
+
56
+ # Add points
57
+ fig.add_trace(go.Scatter(
58
+ x=[0, 1],
59
+ y=[0, wl.similarity(texts[0], texts[1])],
60
+ mode='markers+text',
61
+ text=texts,
62
+ textposition='top center',
63
+ marker=dict(size=10)
64
+ ))
65
+
66
+ fig.update_layout(
67
+ title="Text Similarity Visualization",
68
+ xaxis_title="Position",
69
+ yaxis_title="Similarity",
70
+ height=400,
71
+ showlegend=False
72
+ )
73
+
74
+ else:
75
+ # For 3 or more samples, use PCA for 3D visualization
76
+ pca = PCA(n_components=min(3, n_samples))
77
+ embeddings_reduced = pca.fit_transform(embeddings)
78
+
79
+ # Pad with zeros if needed
80
+ if embeddings_reduced.shape[1] < 3:
81
+ padding = np.zeros((embeddings_reduced.shape[0], 3 - embeddings_reduced.shape[1]))
82
+ embeddings_reduced = np.hstack([embeddings_reduced, padding])
83
+
84
+ # Create DataFrame for plotting
85
+ df_plot = pd.DataFrame(
86
+ embeddings_reduced,
87
+ columns=['X', 'Y', 'Z']
88
+ )
89
+ df_plot['text'] = texts
90
+
91
+ fig = px.scatter_3d(
92
+ df_plot, x='X', y='Y', z='Z',
93
+ text='text',
94
+ title='Text Embeddings Visualization'
95
+ )
96
+
97
+ fig.update_traces(
98
+ marker=dict(size=8, opacity=0.8),
99
+ textposition='top center'
100
+ )
101
+ fig.update_layout(
102
+ scene=dict(
103
+ xaxis_title='Component 1',
104
+ yaxis_title='Component 2',
105
+ zaxis_title='Component 3'
106
+ ),
107
+ height=700
108
+ )
109
+
110
+ return fig
111
+
112
+ def create_similarity_matrix(texts):
113
+ n = len(texts)
114
+ similarity_matrix = np.zeros((n, n))
115
+
116
+ for i in range(n):
117
+ for j in range(n):
118
+ similarity_matrix[i][j] = wl.similarity(texts[i], texts[j])
119
+
120
+ fig = go.Figure(data=go.Heatmap(
121
+ z=similarity_matrix,
122
+ x=texts,
123
+ y=texts,
124
+ colorscale='Viridis',
125
+ text=np.round(similarity_matrix, 3),
126
+ texttemplate='%{text}',
127
+ textfont={"size": 10},
128
+ ))
129
 
 
 
 
 
130
  fig.update_layout(
131
+ title="Similarity Matrix",
132
+ height=400
 
 
 
 
133
  )
134
+
135
  return fig
136
 
137
  def main():
 
139
  st.markdown("<p class='title-font'>Explore the power of WordLlama embeddings</p>",
140
  unsafe_allow_html=True)
141
 
142
+ tabs = st.tabs(["💫 Text Similarity", "🎯 Multi-Text Analysis"])
143
 
144
  with tabs[0]:
145
+ st.markdown("### Compare Two Texts")
146
 
147
  col1, col2 = st.columns(2)
148
  with col1:
 
153
  if st.button("Calculate Similarity", key="sim_button"):
154
  similarity = wl.similarity(text1, text2)
155
 
156
+ st.markdown("### Results")
157
+ col1, col2 = st.columns(2)
 
 
 
 
158
 
159
+ with col1:
160
+ st.metric(
161
+ label="Similarity Score",
162
+ value=f"{similarity:.4f}",
163
+ help="1.0 = identical, 0.0 = completely different"
164
+ )
165
+
166
+ interpretation = (
167
+ "Very Similar" if similarity > 0.8
168
+ else "Moderately Similar" if similarity > 0.5
169
+ else "Different"
170
+ )
171
+ st.info(f"Interpretation: {interpretation}")
172
+
173
+ with col2:
174
+ embeddings = wl.embed([text1, text2])
175
+ st.plotly_chart(
176
+ create_visualization([text1, text2], embeddings),
177
+ use_container_width=True
178
+ )
179
 
180
  with tabs[1]:
181
+ st.markdown("### Analyze Multiple Texts")
182
 
183
+ num_texts = st.slider("Number of texts:", 2, 6, 3)
184
+ texts = []
185
 
186
+ for i in range(num_texts):
187
+ text = st.text_area(
188
+ f"Text {i+1}",
189
+ value=f"Example text {i+1}",
190
+ height=100,
191
+ key=f"text_{i}"
192
+ )
193
+ texts.append(text)
 
 
 
194
 
195
+ if st.button("Analyze Texts", key="analyze_button"):
196
+ embeddings = wl.embed(texts)
197
 
198
+ st.markdown("### Visualization")
 
 
 
 
 
 
 
 
 
 
 
199
  st.plotly_chart(
200
+ create_visualization(texts, embeddings),
201
  use_container_width=True
202
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
+ st.markdown("### Similarity Matrix")
 
 
 
 
 
 
 
 
 
 
 
205
  st.plotly_chart(
206
+ create_similarity_matrix(texts),
207
  use_container_width=True
208
  )
209
+
210
+ # Pairwise similarity analysis
211
+ st.markdown("### Pairwise Similarities")
212
+ for i in range(len(texts)):
213
+ for j in range(i+1, len(texts)):
214
+ similarity = wl.similarity(texts[i], texts[j])
215
+ interpretation = (
216
+ "🟢 Very Similar" if similarity > 0.8
217
+ else "🟡 Moderately Similar" if similarity > 0.5
218
+ else "🔴 Different"
219
+ )
220
+ st.write(f"{interpretation} ({similarity:.3f}): Text {i+1} vs Text {j+1}")
221
 
222
  if __name__ == "__main__":
223
  main()