DexterSptizu
committed on
Commit
•
8afde48
1
Parent(s):
fbf258f
Update app.py
Browse files
app.py
CHANGED
@@ -41,36 +41,97 @@ def load_wordllama():
|
|
41 |
|
42 |
wl = load_wordllama()
|
43 |
|
44 |
-
def
|
45 |
-
|
46 |
-
|
47 |
-
embeddings_3d = pca.fit_transform(embeddings)
|
48 |
|
49 |
-
# Create DataFrame
|
50 |
-
df = pd.DataFrame(
|
51 |
-
embeddings_3d,
|
52 |
-
columns=['X', 'Y', 'Z']
|
53 |
-
)
|
54 |
df['text'] = texts
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
-
fig.update_traces(
|
63 |
-
marker=dict(size=8, opacity=0.8),
|
64 |
-
textposition='top center'
|
65 |
-
)
|
66 |
fig.update_layout(
|
67 |
-
|
68 |
-
|
69 |
-
yaxis_title='Component 2',
|
70 |
-
zaxis_title='Component 3'
|
71 |
-
),
|
72 |
-
height=700
|
73 |
)
|
|
|
74 |
return fig
|
75 |
|
76 |
def main():
|
@@ -78,10 +139,10 @@ def main():
|
|
78 |
st.markdown("<p class='title-font'>Explore the power of WordLlama embeddings</p>",
|
79 |
unsafe_allow_html=True)
|
80 |
|
81 |
-
tabs = st.tabs(["π« Similarity
|
82 |
|
83 |
with tabs[0]:
|
84 |
-
st.markdown("### Compare
|
85 |
|
86 |
col1, col2 = st.columns(2)
|
87 |
with col1:
|
@@ -92,97 +153,71 @@ def main():
|
|
92 |
if st.button("Calculate Similarity", key="sim_button"):
|
93 |
similarity = wl.similarity(text1, text2)
|
94 |
|
95 |
-
st.markdown("###
|
96 |
-
st.
|
97 |
-
label="Cosine Similarity",
|
98 |
-
value=f"{similarity:.4f}",
|
99 |
-
help="Score ranges from 0 (different) to 1 (identical)"
|
100 |
-
)
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
with tabs[1]:
|
110 |
-
st.markdown("###
|
111 |
|
112 |
-
|
|
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
height=50,
|
123 |
-
key=f"doc_{i}")
|
124 |
-
documents.append(doc)
|
125 |
|
126 |
-
if st.button("
|
127 |
-
|
128 |
|
129 |
-
st.markdown("###
|
130 |
-
for doc, score in ranked_docs:
|
131 |
-
st.markdown(f"""
|
132 |
-
<div style='padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;'>
|
133 |
-
<b>Score: {score:.4f}</b><br>
|
134 |
-
{doc}
|
135 |
-
</div>
|
136 |
-
""", unsafe_allow_html=True)
|
137 |
-
|
138 |
-
# Visualize all texts including query
|
139 |
-
all_texts = [query] + documents
|
140 |
-
embeddings = wl.embed(all_texts)
|
141 |
st.plotly_chart(
|
142 |
-
|
143 |
use_container_width=True
|
144 |
)
|
145 |
-
|
146 |
-
with tabs[2]:
|
147 |
-
st.markdown("### Fuzzy Deduplication")
|
148 |
-
st.markdown("""
|
149 |
-
Remove similar documents based on a similarity threshold.
|
150 |
-
Documents with similarity above the threshold will be considered duplicates.
|
151 |
-
""")
|
152 |
-
|
153 |
-
# Document input for deduplication
|
154 |
-
st.markdown("### Enter Documents")
|
155 |
-
num_dedup_docs = st.slider("Number of documents:", 2, 8, 4, key="dedup_slider")
|
156 |
-
|
157 |
-
dedup_docs = []
|
158 |
-
for i in range(num_dedup_docs):
|
159 |
-
doc = st.text_area(f"Document {i+1}",
|
160 |
-
value=f"Example document {i+1}",
|
161 |
-
height=50,
|
162 |
-
key=f"dedup_doc_{i}")
|
163 |
-
dedup_docs.append(doc)
|
164 |
-
|
165 |
-
threshold = st.slider("Similarity Threshold:", 0.0, 1.0, 0.8)
|
166 |
-
|
167 |
-
if st.button("Find Duplicates", key="dedup_button"):
|
168 |
-
unique_docs = wl.deduplicate(dedup_docs, threshold=threshold)
|
169 |
|
170 |
-
st.markdown("###
|
171 |
-
st.markdown(f"Found {len(unique_docs)} unique documents:")
|
172 |
-
|
173 |
-
for doc in unique_docs:
|
174 |
-
st.markdown(f"""
|
175 |
-
<div style='padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;'>
|
176 |
-
{doc}
|
177 |
-
</div>
|
178 |
-
""", unsafe_allow_html=True)
|
179 |
-
|
180 |
-
# Visualize all documents
|
181 |
-
embeddings = wl.embed(dedup_docs)
|
182 |
st.plotly_chart(
|
183 |
-
|
184 |
use_container_width=True
|
185 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
if __name__ == "__main__":
|
188 |
main()
|
|
|
41 |
|
42 |
wl = load_wordllama()
|
43 |
|
44 |
+
def create_visualization(texts, embeddings):
    """Create appropriate visualization based on number of samples.

    Args:
        texts: Sequence of text labels, one per embedding. Used as the
            point labels in both plot variants.
        embeddings: Sized collection of embedding vectors; its length picks
            the plot type and it is passed unchanged to
            ``PCA.fit_transform``.
            NOTE(review): assumed to be a 2-D (n_samples, n_features)
            array as returned by ``wl.embed`` -- confirm against caller.

    Returns:
        A Plotly figure: a 2D scatter when there are exactly two samples,
        otherwise a 3D scatter of the PCA-reduced embeddings.
    """
    n_samples = len(embeddings)

    if n_samples == 2:
        # For 2 samples, create a 2D visualization: the first text sits at
        # the origin and the second at (1, similarity).
        fig = go.Figure()

        # Add points
        fig.add_trace(go.Scatter(
            x=[0, 1],
            y=[0, wl.similarity(texts[0], texts[1])],
            mode='markers+text',
            text=texts,
            textposition='top center',
            marker=dict(size=10)
        ))

        fig.update_layout(
            title="Text Similarity Visualization",
            xaxis_title="Position",
            yaxis_title="Similarity",
            height=400,
            showlegend=False
        )

    else:
        # For 3 or more samples, use PCA for 3D visualization. The component
        # count is capped by the sample count.
        pca = PCA(n_components=min(3, n_samples))
        embeddings_reduced = pca.fit_transform(embeddings)

        # Pad with zeros if needed so the frame below always has X, Y, Z.
        if embeddings_reduced.shape[1] < 3:
            padding = np.zeros((embeddings_reduced.shape[0], 3 - embeddings_reduced.shape[1]))
            embeddings_reduced = np.hstack([embeddings_reduced, padding])

        # Create DataFrame for plotting
        df_plot = pd.DataFrame(
            embeddings_reduced,
            columns=['X', 'Y', 'Z']
        )
        df_plot['text'] = texts

        fig = px.scatter_3d(
            df_plot, x='X', y='Y', z='Z',
            text='text',
            title='Text Embeddings Visualization'
        )

        fig.update_traces(
            marker=dict(size=8, opacity=0.8),
            textposition='top center'
        )
        fig.update_layout(
            scene=dict(
                xaxis_title='Component 1',
                yaxis_title='Component 2',
                zaxis_title='Component 3'
            ),
            height=700
        )

    return fig
|
111 |
+
|
112 |
+
def create_similarity_matrix(texts):
    """Build a heatmap figure of the pairwise similarity between texts.

    Every ordered pair (including each text with itself) is scored with
    ``wl.similarity`` and the scores are drawn as an annotated heatmap whose
    axes are labelled with the texts themselves.

    Args:
        texts: Sequence of texts to compare; indexed by position.

    Returns:
        A Plotly figure containing the similarity heatmap.
    """
    count = len(texts)
    scores = np.zeros((count, count))

    # np.ndindex walks the (row, col) grid in row-major order, i.e. the same
    # order as two nested range() loops.
    for row, col in np.ndindex(count, count):
        scores[row][col] = wl.similarity(texts[row], texts[col])

    heatmap = go.Heatmap(
        z=scores,
        x=texts,
        y=texts,
        colorscale='Viridis',
        # Cell annotations show the score rounded to three decimals.
        text=np.round(scores, 3),
        texttemplate='%{text}',
        textfont={"size": 10},
    )
    fig = go.Figure(data=heatmap)

    fig.update_layout(title="Similarity Matrix", height=400)

    return fig
|
136 |
|
137 |
def main():
|
|
|
139 |
st.markdown("<p class='title-font'>Explore the power of WordLlama embeddings</p>",
|
140 |
unsafe_allow_html=True)
|
141 |
|
142 |
+
tabs = st.tabs(["π« Text Similarity", "π― Multi-Text Analysis"])
|
143 |
|
144 |
with tabs[0]:
|
145 |
+
st.markdown("### Compare Two Texts")
|
146 |
|
147 |
col1, col2 = st.columns(2)
|
148 |
with col1:
|
|
|
153 |
if st.button("Calculate Similarity", key="sim_button"):
|
154 |
similarity = wl.similarity(text1, text2)
|
155 |
|
156 |
+
st.markdown("### Results")
|
157 |
+
col1, col2 = st.columns(2)
|
|
|
|
|
|
|
|
|
158 |
|
159 |
+
with col1:
|
160 |
+
st.metric(
|
161 |
+
label="Similarity Score",
|
162 |
+
value=f"{similarity:.4f}",
|
163 |
+
help="1.0 = identical, 0.0 = completely different"
|
164 |
+
)
|
165 |
+
|
166 |
+
interpretation = (
|
167 |
+
"Very Similar" if similarity > 0.8
|
168 |
+
else "Moderately Similar" if similarity > 0.5
|
169 |
+
else "Different"
|
170 |
+
)
|
171 |
+
st.info(f"Interpretation: {interpretation}")
|
172 |
+
|
173 |
+
with col2:
|
174 |
+
embeddings = wl.embed([text1, text2])
|
175 |
+
st.plotly_chart(
|
176 |
+
create_visualization([text1, text2], embeddings),
|
177 |
+
use_container_width=True
|
178 |
+
)
|
179 |
|
180 |
with tabs[1]:
|
181 |
+
st.markdown("### Analyze Multiple Texts")
|
182 |
|
183 |
+
num_texts = st.slider("Number of texts:", 2, 6, 3)
|
184 |
+
texts = []
|
185 |
|
186 |
+
for i in range(num_texts):
|
187 |
+
text = st.text_area(
|
188 |
+
f"Text {i+1}",
|
189 |
+
value=f"Example text {i+1}",
|
190 |
+
height=100,
|
191 |
+
key=f"text_{i}"
|
192 |
+
)
|
193 |
+
texts.append(text)
|
|
|
|
|
|
|
194 |
|
195 |
+
if st.button("Analyze Texts", key="analyze_button"):
|
196 |
+
embeddings = wl.embed(texts)
|
197 |
|
198 |
+
st.markdown("### Visualization")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
st.plotly_chart(
|
200 |
+
create_visualization(texts, embeddings),
|
201 |
use_container_width=True
|
202 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
+
st.markdown("### Similarity Matrix")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
st.plotly_chart(
|
206 |
+
create_similarity_matrix(texts),
|
207 |
use_container_width=True
|
208 |
)
|
209 |
+
|
210 |
+
# Pairwise similarity analysis
|
211 |
+
st.markdown("### Pairwise Similarities")
|
212 |
+
for i in range(len(texts)):
|
213 |
+
for j in range(i+1, len(texts)):
|
214 |
+
similarity = wl.similarity(texts[i], texts[j])
|
215 |
+
interpretation = (
|
216 |
+
"π’ Very Similar" if similarity > 0.8
|
217 |
+
else "π‘ Moderately Similar" if similarity > 0.5
|
218 |
+
else "π΄ Different"
|
219 |
+
)
|
220 |
+
st.write(f"{interpretation} ({similarity:.3f}): Text {i+1} vs Text {j+1}")
|
221 |
|
222 |
if __name__ == "__main__":
|
223 |
main()
|