DexterSptizu committed on
Commit
ded7cbd
•
1 Parent(s): 560c430

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -0
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ from wordllama import WordLlama
4
+ import plotly.graph_objects as go
5
+ import plotly.express as px
6
+ from sklearn.decomposition import PCA
7
+ import pandas as pd
8
+
9
# Page configuration: browser-tab title, llama emoji icon, and a wide layout.
st.set_page_config(
    page_title="WordLlama Explorer",
    page_icon="🦙",
    layout="wide",
)
15
+
16
# Custom CSS: page background colour, spacing and sizing of the tab bar, and
# the `title-font` class applied to the subtitle paragraph.
_CUSTOM_CSS = """
<style>
.main {
background-color: #f8f9fa;
}
.stTabs [data-baseweb="tab-list"] {
gap: 24px;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
padding-left: 20px;
padding-right: 20px;
}
.title-font {
font-size: 28px !important;
font-weight: bold;
color: #2c3e50;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
37
+
38
@st.cache_resource
def load_wordllama():
    """Load and return the default WordLlama model.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the loaded model instead of loading it again on every script rerun.
    """
    model = WordLlama.load()
    return model


# Shared model instance used by the tabs rendered in main().
wl = load_wordllama()
43
+
44
def create_3d_visualization(texts, embeddings):
    """Project embeddings to three dimensions with PCA and plot them.

    Args:
        texts: Labels for the points, one per row of ``embeddings``.
        embeddings: 2-D array-like with one embedding vector per text.

    Returns:
        A Plotly 3D scatter figure with one labelled marker per text.

    PCA cannot return more components than ``min(n_samples, n_features)``,
    so when fewer than three are available only that many are computed and
    the remaining axes are filled with zeros. A single text is placed at
    the origin.
    """
    matrix = np.asarray(embeddings)
    n_samples, n_features = matrix.shape

    # Requesting 3 components for fewer than 3 rows makes sklearn's PCA raise
    # ValueError, and callers may pass as few as two texts.
    n_components = min(3, n_samples, n_features)
    coords = np.zeros((n_samples, 3))
    if n_samples > 1 and n_components > 0:
        reducer = PCA(n_components=n_components)
        coords[:, :n_components] = reducer.fit_transform(matrix)

    # Create DataFrame
    df = pd.DataFrame(coords, columns=['X', 'Y', 'Z'])
    df['text'] = texts

    fig = px.scatter_3d(
        df, x='X', y='Y', z='Z',
        text='text',
        title='Word Embeddings in 3D Space',
    )

    fig.update_traces(
        marker=dict(size=8, opacity=0.8),
        textposition='top center',
    )
    fig.update_layout(
        scene=dict(
            xaxis_title='Component 1',
            yaxis_title='Component 2',
            zaxis_title='Component 3',
        ),
        height=700,
    )
    return fig
75
+
76
# st.text_area rejects heights below 68 pixels, so the compact document
# inputs use the smallest allowed value.
_MIN_TEXT_AREA_HEIGHT = 68

# Inline style shared by every result card.
_CARD_STYLE = (
    "padding: 10px; margin: 5px; background-color: #f0f2f6; border-radius: 5px;"
)


def _render_card(text, header_html=""):
    """Render ``text`` in a grey card, optionally preceded by trusted HTML."""
    from html import escape  # local import keeps this block self-contained

    # The text comes from user-editable text areas and is rendered with
    # unsafe_allow_html=True, so it is escaped before being interpolated.
    st.markdown(
        f"\n<div style='{_CARD_STYLE}'>\n{header_html}{escape(text)}\n</div>\n",
        unsafe_allow_html=True,
    )


def _collect_documents(count, key_prefix):
    """Show ``count`` document text areas and return their contents."""
    return [
        st.text_area(
            f"Document {i + 1}",
            value=f"Example document {i + 1}",
            height=_MIN_TEXT_AREA_HEIGHT,
            key=f"{key_prefix}{i}",
        )
        for i in range(count)
    ]


def _plot_embeddings(texts):
    """Embed ``texts`` with the shared model and show them as a 3D chart."""
    st.plotly_chart(
        create_3d_visualization(texts, wl.embed(texts)),
        use_container_width=True,
    )


def _render_similarity_tab():
    """Compare two texts and show their similarity score and embeddings."""
    st.markdown("### Compare Text Similarity")

    col1, col2 = st.columns(2)
    with col1:
        text1 = st.text_area("First Text", value="I love programming in Python", height=100)
    with col2:
        text2 = st.text_area("Second Text", value="Coding with Python is amazing", height=100)

    if not st.button("Calculate Similarity", key="sim_button"):
        return

    similarity = wl.similarity(text1, text2)

    st.markdown("### Similarity Score")
    st.metric(
        label="Cosine Similarity",
        value=f"{similarity:.4f}",
        help="Score ranges from 0 (different) to 1 (identical)",
    )

    # Visualize both texts in 3D space
    _plot_embeddings([text1, text2])


def _render_ranking_tab():
    """Rank user-entered documents against a query and plot all of them."""
    st.markdown("### Rank Documents by Similarity")

    query = st.text_area("Query Text", value="I went to the car", height=100)

    # Multiple document input
    st.markdown("### Enter Documents to Rank")
    num_docs = st.slider("Number of documents:", 2, 6, 4)
    documents = _collect_documents(num_docs, "doc_")

    if not st.button("Rank Documents", key="rank_button"):
        return

    ranked_docs = wl.rank(query, documents)

    st.markdown("### Ranking Results")
    for doc, score in ranked_docs:
        _render_card(doc, header_html=f"<b>Score: {score:.4f}</b><br>\n")

    # Visualize all texts including query
    _plot_embeddings([query] + documents)


def _render_dedup_tab():
    """Deduplicate user-entered documents at a chosen similarity threshold."""
    st.markdown("### Fuzzy Deduplication")
    st.markdown("""
Remove similar documents based on a similarity threshold.
Documents with similarity above the threshold will be considered duplicates.
""")

    # Document input for deduplication
    st.markdown("### Enter Documents")
    num_dedup_docs = st.slider("Number of documents:", 2, 8, 4, key="dedup_slider")
    dedup_docs = _collect_documents(num_dedup_docs, "dedup_doc_")

    threshold = st.slider("Similarity Threshold:", 0.0, 1.0, 0.8)

    if not st.button("Find Duplicates", key="dedup_button"):
        return

    unique_docs = wl.deduplicate(dedup_docs, threshold=threshold)

    st.markdown("### Results")
    st.markdown(f"Found {len(unique_docs)} unique documents:")

    for doc in unique_docs:
        _render_card(doc)

    # Visualize all documents
    _plot_embeddings(dedup_docs)


def main():
    """Render the page title and the three WordLlama demo tabs."""
    st.title("🦙 WordLlama Embedding Explorer")
    st.markdown(
        "<p class='title-font'>Explore the power of WordLlama embeddings</p>",
        unsafe_allow_html=True,
    )

    # NOTE(review): the icon of the last tab could not be recovered exactly
    # from the garbled source text; a magnifying glass is assumed.
    similarity_tab, ranking_tab, dedup_tab = st.tabs(
        ["💫 Similarity Explorer", "🎯 Document Ranking", "🔍 Fuzzy Deduplication"]
    )

    with similarity_tab:
        _render_similarity_tab()

    with ranking_tab:
        _render_ranking_tab()

    with dedup_tab:
        _render_dedup_tab()


if __name__ == "__main__":
    main()