import streamlit as st import numpy as np from wordllama import WordLlama import plotly.graph_objects as go import plotly.express as px from sklearn.decomposition import PCA import pandas as pd # Page configuration st.set_page_config( page_title="WordLlama Explorer", page_icon="🦙", layout="wide" ) # Custom CSS st.markdown(""" """, unsafe_allow_html=True) @st.cache_resource def load_wordllama(): return WordLlama.load() wl = load_wordllama() def create_3d_visualization(texts, embeddings): # Reduce to 3D using PCA pca = PCA(n_components=3) embeddings_3d = pca.fit_transform(embeddings) # Create DataFrame df = pd.DataFrame( embeddings_3d, columns=['X', 'Y', 'Z'] ) df['text'] = texts fig = px.scatter_3d( df, x='X', y='Y', z='Z', text='text', title='Word Embeddings in 3D Space' ) fig.update_traces( marker=dict(size=8, opacity=0.8), textposition='top center' ) fig.update_layout( scene=dict( xaxis_title='Component 1', yaxis_title='Component 2', zaxis_title='Component 3' ), height=700 ) return fig def main(): st.title("🦙 WordLlama Embedding Explorer") st.markdown("
Explore the power of WordLlama embeddings
", unsafe_allow_html=True) tabs = st.tabs(["💫 Similarity Explorer", "🎯 Document Ranking", "🔍 Fuzzy Deduplication"]) with tabs[0]: st.markdown("### Compare Text Similarity") col1, col2 = st.columns(2) with col1: text1 = st.text_area("First Text", value="I love programming in Python", height=100) with col2: text2 = st.text_area("Second Text", value="Coding with Python is amazing", height=100) if st.button("Calculate Similarity", key="sim_button"): similarity = wl.similarity(text1, text2) st.markdown("### Similarity Score") st.metric( label="Cosine Similarity", value=f"{similarity:.4f}", help="Score ranges from 0 (different) to 1 (identical)" ) # Visualize both texts in 3D space embeddings = wl.embed([text1, text2]) st.plotly_chart( create_3d_visualization([text1, text2], embeddings), use_container_width=True ) with tabs[1]: st.markdown("### Rank Documents by Similarity") query = st.text_area("Query Text", value="I went to the car", height=100) # Multiple document input st.markdown("### Enter Documents to Rank") num_docs = st.slider("Number of documents:", 2, 6, 4) documents = [] for i in range(num_docs): doc = st.text_area(f"Document {i+1}", value=f"Example document {i+1}", height=50, key=f"doc_{i}") documents.append(doc) if st.button("Rank Documents", key="rank_button"): ranked_docs = wl.rank(query, documents) st.markdown("### Ranking Results") for doc, score in ranked_docs: st.markdown(f"""