Spaces:

DexterSptizu
/

word-llama-embeddings-explorer

Running

App Files Files Community

DexterSptizu committed 2 days ago

Commit

8afde48

•

1 Parent(s): fbf258f

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -103

app.py CHANGED Viewed

@@ -41,36 +41,97 @@ def load_wordllama():
 wl = load_wordllama()
-def create_3d_visualization(texts, embeddings):
-    # Reduce to 3D using PCA
-    pca = PCA(n_components=3)
-    embeddings_3d = pca.fit_transform(embeddings)
-    # Create DataFrame
-    df = pd.DataFrame(
-        embeddings_3d,
-        columns=['X', 'Y', 'Z']
-    )
     df['text'] = texts
-    fig = px.scatter_3d(
-        df, x='X', y='Y', z='Z',
-        text='text',
-        title='Word Embeddings in 3D Space'
-    )
-    fig.update_traces(
-        marker=dict(size=8, opacity=0.8),
-        textposition='top center'
-    )
     fig.update_layout(
-        scene=dict(
-            xaxis_title='Component 1',
-            yaxis_title='Component 2',
-            zaxis_title='Component 3'
-        ),
-        height=700
     )
     return fig
 def main():
@@ -78,10 +139,10 @@ def main():
     st.markdown("<p class='title-font'>Explore the power of WordLlama embeddings</p>",
                 unsafe_allow_html=True)
-    tabs = st.tabs(["💫 Similarity Explorer", "🎯 Document Ranking", "🔍 Fuzzy Deduplication"])
     with tabs[0]:
-        st.markdown("### Compare Text Similarity")
         col1, col2 = st.columns(2)
         with col1:
@@ -92,97 +153,71 @@ def main():
         if st.button("Calculate Similarity", key="sim_button"):
             similarity = wl.similarity(text1, text2)
-            st.markdown("### Similarity Score")
-            st.metric(
-                label="Cosine Similarity",
-                value=f"{similarity:.4f}",
-                help="Score ranges from 0 (different) to 1 (identical)"
-            )
-            # Visualize both texts in 3D space
-            embeddings = wl.embed([text1, text2])
-            st.plotly_chart(
-                create_3d_visualization([text1, text2], embeddings),
-                use_container_width=True
-            )
     with tabs[1]:
-        st.markdown("### Rank Documents by Similarity")
-        query = st.text_area("Query Text", value="I went to the car", height=100)
-        # Multiple document input
-        st.markdown("### Enter Documents to Rank")
-        num_docs = st.slider("Number of documents:", 2, 6, 4)
-        documents = []
-        for i in range(num_docs):
-            doc = st.text_area(f"Document {i+1}",
-                             value=f"Example document {i+1}",
-                             height=50,
-                             key=f"doc_{i}")
-            documents.append(doc)
-        if st.button("Rank Documents", key="rank_button"):
-            ranked_docs = wl.rank(query, documents)
-            st.markdown("### Ranking Results")
-            for doc, score in ranked_docs:
-                st.markdown(f"""
-                <div style='padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;'>
-                    <b>Score: {score:.4f}</b><br>
-                    {doc}
-                </div>
-                """, unsafe_allow_html=True)
-            # Visualize all texts including query
-            all_texts = [query] + documents
-            embeddings = wl.embed(all_texts)
             st.plotly_chart(
-                create_3d_visualization(all_texts, embeddings),
                 use_container_width=True
             )
-    with tabs[2]:
-        st.markdown("### Fuzzy Deduplication")
-        st.markdown("""
-        Remove similar documents based on a similarity threshold.
-        Documents with similarity above the threshold will be considered duplicates.
-        """)
-        # Document input for deduplication
-        st.markdown("### Enter Documents")
-        num_dedup_docs = st.slider("Number of documents:", 2, 8, 4, key="dedup_slider")
-        dedup_docs = []
-        for i in range(num_dedup_docs):
-            doc = st.text_area(f"Document {i+1}",
-                             value=f"Example document {i+1}",
-                             height=50,
-                             key=f"dedup_doc_{i}")
-            dedup_docs.append(doc)
-        threshold = st.slider("Similarity Threshold:", 0.0, 1.0, 0.8)
-        if st.button("Find Duplicates", key="dedup_button"):
-            unique_docs = wl.deduplicate(dedup_docs, threshold=threshold)
-            st.markdown("### Results")
-            st.markdown(f"Found {len(unique_docs)} unique documents:")
-            for doc in unique_docs:
-                st.markdown(f"""
-                <div style='padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;'>
-                    {doc}
-                </div>
-                """, unsafe_allow_html=True)
-            # Visualize all documents
-            embeddings = wl.embed(dedup_docs)
             st.plotly_chart(
-                create_3d_visualization(dedup_docs, embeddings),
                 use_container_width=True
             )
 if __name__ == "__main__":
     main()

 wl = load_wordllama()
+def create_visualization(texts, embeddings):
+    """Create appropriate visualization based on number of samples"""
+    n_samples = len(embeddings)
+    # Create DataFrame with original embeddings
+    df = pd.DataFrame(embeddings)
     df['text'] = texts
+    if n_samples == 2:
+        # For 2 samples, create a 2D visualization
+        fig = go.Figure()
+        # Add points
+        fig.add_trace(go.Scatter(
+            x=[0, 1],
+            y=[0, wl.similarity(texts[0], texts[1])],
+            mode='markers+text',
+            text=texts,
+            textposition='top center',
+            marker=dict(size=10)
+        ))
+        fig.update_layout(
+            title="Text Similarity Visualization",
+            xaxis_title="Position",
+            yaxis_title="Similarity",
+            height=400,
+            showlegend=False
+        )
+    else:
+        # For 3 or more samples, use PCA for 3D visualization
+        pca = PCA(n_components=min(3, n_samples))
+        embeddings_reduced = pca.fit_transform(embeddings)
+        # Pad with zeros if needed
+        if embeddings_reduced.shape[1] < 3:
+            padding = np.zeros((embeddings_reduced.shape[0], 3 - embeddings_reduced.shape[1]))
+            embeddings_reduced = np.hstack([embeddings_reduced, padding])
+        # Create DataFrame for plotting
+        df_plot = pd.DataFrame(
+            embeddings_reduced,
+            columns=['X', 'Y', 'Z']
+        )
+        df_plot['text'] = texts
+        fig = px.scatter_3d(
+            df_plot, x='X', y='Y', z='Z',
+            text='text',
+            title='Text Embeddings Visualization'
+        )
+        fig.update_traces(
+            marker=dict(size=8, opacity=0.8),
+            textposition='top center'
+        )
+        fig.update_layout(
+            scene=dict(
+                xaxis_title='Component 1',
+                yaxis_title='Component 2',
+                zaxis_title='Component 3'
+            ),
+            height=700
+        )
+    return fig
+def create_similarity_matrix(texts):
+    n = len(texts)
+    similarity_matrix = np.zeros((n, n))
+    for i in range(n):
+        for j in range(n):
+            similarity_matrix[i][j] = wl.similarity(texts[i], texts[j])
+    fig = go.Figure(data=go.Heatmap(
+        z=similarity_matrix,
+        x=texts,
+        y=texts,
+        colorscale='Viridis',
+        text=np.round(similarity_matrix, 3),
+        texttemplate='%{text}',
+        textfont={"size": 10},
+    ))
     fig.update_layout(
+        title="Similarity Matrix",
+        height=400
     )
     return fig
 def main():
     st.markdown("<p class='title-font'>Explore the power of WordLlama embeddings</p>",
                 unsafe_allow_html=True)
+    tabs = st.tabs(["💫 Text Similarity", "🎯 Multi-Text Analysis"])
     with tabs[0]:
+        st.markdown("### Compare Two Texts")
         col1, col2 = st.columns(2)
         with col1:
         if st.button("Calculate Similarity", key="sim_button"):
             similarity = wl.similarity(text1, text2)
+            st.markdown("### Results")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.metric(
+                    label="Similarity Score",
+                    value=f"{similarity:.4f}",
+                    help="1.0 = identical, 0.0 = completely different"
+                )
+                interpretation = (
+                    "Very Similar" if similarity > 0.8
+                    else "Moderately Similar" if similarity > 0.5
+                    else "Different"
+                )
+                st.info(f"Interpretation: {interpretation}")
+            with col2:
+                embeddings = wl.embed([text1, text2])
+                st.plotly_chart(
+                    create_visualization([text1, text2], embeddings),
+                    use_container_width=True
+                )
     with tabs[1]:
+        st.markdown("### Analyze Multiple Texts")
+        num_texts = st.slider("Number of texts:", 2, 6, 3)
+        texts = []
+        for i in range(num_texts):
+            text = st.text_area(
+                f"Text {i+1}",
+                value=f"Example text {i+1}",
+                height=100,
+                key=f"text_{i}"
+            )
+            texts.append(text)
+        if st.button("Analyze Texts", key="analyze_button"):
+            embeddings = wl.embed(texts)
+            st.markdown("### Visualization")
             st.plotly_chart(
+                create_visualization(texts, embeddings),
                 use_container_width=True
             )
+            st.markdown("### Similarity Matrix")
             st.plotly_chart(
+                create_similarity_matrix(texts),
                 use_container_width=True
             )
+            # Pairwise similarity analysis
+            st.markdown("### Pairwise Similarities")
+            for i in range(len(texts)):
+                for j in range(i+1, len(texts)):
+                    similarity = wl.similarity(texts[i], texts[j])
+                    interpretation = (
+                        "🟢 Very Similar" if similarity > 0.8
+                        else "🟡 Moderately Similar" if similarity > 0.5
+                        else "🔴 Different"
+                    )
+                    st.write(f"{interpretation} ({similarity:.3f}): Text {i+1} vs Text {j+1}")
 if __name__ == "__main__":
     main()