DexterSptizu committed on
Commit
c58af7d
1 Parent(s): eb257c3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +219 -0
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ from transformers import AutoModel
4
+ import plotly.graph_objects as go
5
+ from sklearn.manifold import MDS
6
+ import pandas as pd
7
+ import torch
8
+
9
# Page configuration: title, icon and layout handed to Streamlit.
_PAGE_SETTINGS = {
    "page_title": "Jina Embeddings Explorer",
    "page_icon": "🔮",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS: defines the `.title-font` class that main() applies to the
# subtitle paragraph it renders with unsafe_allow_html=True.
_CUSTOM_CSS = """
<style>
.title-font {
font-size: 28px !important;
font-weight: bold;
color: #2c3e50;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
26
+
27
@st.cache_resource
def load_model():
    """Load the pretrained Jina Embeddings v3 model.

    The checkpoint is fetched through ``AutoModel.from_pretrained`` with
    ``trust_remote_code=True``. The function is wrapped in
    ``st.cache_resource`` (presumably so the model is built once and reused
    across Streamlit reruns -- confirm against the Streamlit caching docs).

    Returns:
        The object returned by ``AutoModel.from_pretrained``.
    """
    checkpoint = "jinaai/jina-embeddings-v3"
    jina_model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
    return jina_model


# Module-level model handle; get_embeddings() reads this global.
model = load_model()
32
+
33
def get_embeddings(texts, task="text-matching"):
    """Encode ``texts`` with the module-level Jina v3 ``model``.

    Args:
        texts: The texts to embed; forwarded unchanged to ``model.encode``.
        task: Task name forwarded to ``model.encode`` as ``task=``.

    Returns:
        Whatever ``model.encode`` returns. Callers in this file index the
        result once per input text.
    """
    # Inference only: run the encoder with autograd disabled.
    with torch.no_grad():
        return model.encode(texts, task=task)
38
+
39
def _cosine_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of the row vectors.

    Args:
        embeddings: Array-like of shape (n, dim), one embedding per row.

    Returns:
        An (n, n) float64 array that is exactly symmetric, clipped to
        [-1, 1], and has 1.0 on the diagonal.
    """
    vectors = np.asarray(embeddings, dtype=np.float64)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero vector has no direction; dividing by 1 instead of 0 yields a
    # similarity of 0 to every other text instead of NaN.
    norms[norms == 0.0] = 1.0
    unit_vectors = vectors / norms

    # Floating-point rounding can push values marginally outside [-1, 1].
    similarity = np.clip(unit_vectors @ unit_vectors.T, -1.0, 1.0)
    # The matrix product is not guaranteed to be bit-for-bit symmetric;
    # average with the transpose so the derived distance matrix is.
    similarity = (similarity + similarity.T) / 2.0
    # Every text is identical to itself, so its distance to itself is 0.
    np.fill_diagonal(similarity, 1.0)
    return similarity


def _build_similarity_figure(coords, texts, similarity_matrix, task):
    """Build the 3D scatter figure: one marker per text, one line per pair."""
    n = len(texts)
    fig = go.Figure()

    # Add points
    fig.add_trace(go.Scatter3d(
        x=coords[:, 0],
        y=coords[:, 1],
        z=coords[:, 2],
        mode='markers+text',
        text=texts,
        textposition='top center',
        marker=dict(
            size=10,
            color=list(range(n)),
            colorscale='Viridis',
            opacity=0.8,
        ),
        name='Texts',
    ))

    # Add lines between points; more similar pairs get more opaque lines,
    # with a floor of 0.1 so that every connection remains visible.
    for i in range(n):
        for j in range(i + 1, n):
            line_opacity = float(np.clip(similarity_matrix[i, j], 0.1, 1.0))
            fig.add_trace(go.Scatter3d(
                x=[coords[i, 0], coords[j, 0]],
                y=[coords[i, 1], coords[j, 1]],
                z=[coords[i, 2], coords[j, 2]],
                mode='lines',
                line=dict(color='gray', width=2),
                opacity=line_opacity,
                showlegend=False,
                hoverinfo='skip',
            ))

    fig.update_layout(
        title=f"3D Similarity Visualization (Task: {task})",
        scene=dict(
            xaxis_title="Dimension 1",
            yaxis_title="Dimension 2",
            zaxis_title="Dimension 3",
            camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5),
            ),
        ),
        height=700,
    )
    return fig


def create_similarity_based_visualization(texts, task="text-matching"):
    """Create a 3D visualization based on similarity distances.

    The texts are embedded with ``get_embeddings``, compared pairwise by
    cosine similarity, converted to distances (``1 - similarity``) and laid
    out in three dimensions with MDS on the precomputed distance matrix.

    Args:
        texts: List of texts to compare.
        task: Task name forwarded to ``get_embeddings``.

    Returns:
        A ``(fig, similarity_matrix)`` tuple: the Plotly figure and the
        (n, n) cosine-similarity matrix as a NumPy array.
    """
    # Get embeddings
    embeddings = get_embeddings(texts, task=task)

    # Calculate similarity matrix using cosine similarity
    similarity_matrix = _cosine_similarity_matrix(embeddings)

    # Convert similarities to distances: non-negative, symmetric, and with
    # an exactly-zero diagonal thanks to the clean-up in the helper above.
    distance_matrix = 1.0 - similarity_matrix

    # Use MDS for visualization; the fixed random_state keeps the layout
    # reproducible between runs.
    mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
    coords = mds.fit_transform(distance_matrix)

    # Create 3D visualization
    fig = _build_similarity_figure(coords, texts, similarity_matrix, task)
    return fig, similarity_matrix
113
+
114
# Text shown inside the "About" expander.
_ABOUT_MARKDOWN = """
This tool uses Jina Embeddings v3, a powerful multilingual embedding model that supports:
- Multiple tasks: text-matching, retrieval, classification, separation
- Long sequences: up to 8192 tokens
- 30+ languages
- State-of-the-art performance
"""

# Task names offered in the "Select Task" selector.
_TASKS = ["text-matching", "retrieval.query", "retrieval.passage", "separation", "classification"]

# Example templates offered in the "Choose an example set" selector.
_EXAMPLES = {
    "Similar Concepts": [
        "I love programming in Python",
        "Coding with Python is amazing",
        "Software development is fun",
        "I enjoy writing code",
    ],
    "Multilingual": [
        "Hello, how are you?",
        "Hola, ¿cómo estás?",
        "Bonjour, comment allez-vous?",
        "你好,你好吗?",
    ],
    "Technical Concepts": [
        "Machine learning is a subset of artificial intelligence",
        "AI systems can learn from data",
        "Neural networks process information",
        "Deep learning models require training",
    ],
}


def _interpret_similarity(similarity):
    """Map a similarity score to the label shown in the analysis list."""
    if similarity > 0.8:
        return "🟢 Very Similar"
    if similarity > 0.5:
        return "🟡 Moderately Similar"
    return "🔴 Different"


def _render_similarity_matrix(similarity_matrix, task):
    """Render the similarity matrix as an annotated heatmap."""
    labels = [f"Text {i+1}" for i in range(len(similarity_matrix))]

    st.markdown("### Similarity Matrix")
    fig_matrix = go.Figure(data=go.Heatmap(
        z=similarity_matrix,
        x=labels,
        y=labels,
        colorscale='Viridis',
        text=np.round(similarity_matrix, 3),
        texttemplate='%{text}',
        textfont={"size": 12},
    ))
    fig_matrix.update_layout(
        title=f"Similarity Matrix (Task: {task})",
        height=400,
    )
    st.plotly_chart(fig_matrix, use_container_width=True)


def _render_similarity_analysis(similarity_matrix):
    """Write one interpreted line for every unordered pair of texts."""
    n = len(similarity_matrix)

    st.markdown("### 📊 Similarity Analysis")
    for i in range(n):
        for j in range(i + 1, n):
            similarity = similarity_matrix[i][j]
            interpretation = _interpret_similarity(similarity)
            st.write(f"{interpretation} ({similarity:.3f}): Text {i+1} vs Text {j+1}")


def main():
    """Render the Streamlit page and run the analysis on request.

    The page shows a task selector, an example-set selector, between two
    and six text boxes, and an "Analyze Texts" button. When the button is
    pressed and every box contains text, the 3D similarity figure, the
    similarity heatmap and a pairwise interpretation list are displayed.
    If any box is empty or whitespace-only, a warning is shown instead.
    """
    st.title("🔮 Jina Embeddings v3 Explorer")
    st.markdown(
        "<p class='title-font'>Explore text similarities using state-of-the-art embeddings</p>",
        unsafe_allow_html=True,
    )

    with st.expander("ℹ️ About Jina Embeddings v3", expanded=True):
        st.markdown(_ABOUT_MARKDOWN)

    # Task selection
    task = st.selectbox(
        "Select Task",
        _TASKS,
        help="Different tasks optimize embeddings for specific use cases",
    )

    col1, col2 = st.columns([3, 1])
    with col1:
        selected_example = st.selectbox("Choose an example set:", list(_EXAMPLES))
    with col2:
        if st.button("Load Example"):
            # NOTE(review): nothing in this file reads
            # st.session_state.texts; the text boxes below take their
            # defaults straight from the selected example set. Confirm
            # whether this button is still needed.
            st.session_state.texts = list(_EXAMPLES[selected_example])

    # Text input
    num_texts = st.slider("Number of texts:", 2, 6, 4)
    example_texts = _EXAMPLES[selected_example]
    texts = []
    for i in range(num_texts):
        # Example sets hold four texts; extra boxes get a placeholder.
        default_text = example_texts[i] if i < len(example_texts) else f"Example text {i+1}"
        text = st.text_area(
            f"Text {i+1}",
            value=default_text,
            height=100,
            key=f"text_{i}",
        )
        texts.append(text)

    if not st.button("Analyze Texts", type="primary"):
        return

    # Tell the user why nothing is plotted instead of silently ignoring the
    # click; whitespace-only boxes count as empty.
    if not all(text.strip() for text in texts):
        st.warning("Please enter some text in every box before analyzing.")
        return

    fig, similarity_matrix = create_similarity_based_visualization(texts, task)

    # Display visualization
    st.plotly_chart(fig, use_container_width=True)

    # Show similarity matrix
    _render_similarity_matrix(similarity_matrix, task)

    # Interpretation
    _render_similarity_analysis(similarity_matrix)


if __name__ == "__main__":
    main()