# Hugging Face Space - status: Running
# File size: 7,103 bytes - commit c58af7d
import streamlit as st
import numpy as np
from transformers import AutoModel
import plotly.graph_objects as go
from sklearn.manifold import MDS
import pandas as pd
import torch
# Page configuration: title, icon and layout passed to st.set_page_config.
_PAGE_SETTINGS = {
    "page_title": "Jina Embeddings Explorer",
    "page_icon": "🔮",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS: defines the `.title-font` class used for the page subtitle.
_CUSTOM_CSS = """
<style>
.title-font {
font-size: 28px !important;
font-weight: bold;
color: #2c3e50;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_resource
def load_model():
    """Load the ``jinaai/jina-embeddings-v3`` model via ``AutoModel``.

    The function is wrapped in ``st.cache_resource`` and called once at
    module level below to bind the shared ``model`` object.
    """
    # trust_remote_code=True allows transformers to run the modeling code
    # that is fetched from the model repository.
    embedding_model = AutoModel.from_pretrained(
        "jinaai/jina-embeddings-v3",
        trust_remote_code=True,
    )
    return embedding_model


model = load_model()
@torch.no_grad()
def get_embeddings(texts, task="text-matching"):
    """Encode ``texts`` with the module-level Jina v3 ``model``.

    Args:
        texts: The texts to embed; passed straight to ``model.encode``.
        task: Task name forwarded to ``model.encode`` (default
            ``"text-matching"``).

    Returns:
        Whatever ``model.encode`` returns for the given texts (callers in
        this file index it as one vector per text).
    """
    # Gradient tracking is disabled for the whole call by the decorator.
    return model.encode(texts, task=task)
def _cosine_similarity_matrix(embeddings):
    """Return the pairwise cosine similarity matrix of row vectors."""
    # Expects one embedding vector per row (2-D once converted).
    vectors = np.asarray(embeddings, dtype=float)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero vector has no direction: divide it by 1 so it stays zero and
    # yields similarity 0 instead of NaN from a division by zero.
    unit_vectors = vectors / np.where(norms == 0.0, 1.0, norms)
    similarity = unit_vectors @ unit_vectors.T
    # Averaging with the transpose guarantees exact symmetry, and clipping
    # removes rounding overshoot outside the valid cosine range.
    similarity = (similarity + similarity.T) / 2.0
    return np.clip(similarity, -1.0, 1.0)


def _build_similarity_figure(texts, coords, similarity_matrix, task):
    """Build the 3D scatter figure with one labelled point per text."""
    n = len(texts)
    fig = go.Figure()

    # Add points
    fig.add_trace(go.Scatter3d(
        x=coords[:, 0],
        y=coords[:, 1],
        z=coords[:, 2],
        mode='markers+text',
        text=texts,
        textposition='top center',
        marker=dict(
            size=10,
            color=list(range(n)),
            colorscale='Viridis',
            opacity=0.8,
        ),
        name='Texts',
    ))

    # Add one line per pair of points; more similar pairs are drawn more
    # opaque, with a floor of 0.1 so every connection stays visible.
    for i in range(n):
        for j in range(i + 1, n):
            opacity = float(max(0.1, min(1.0, similarity_matrix[i, j])))
            fig.add_trace(go.Scatter3d(
                x=[coords[i, 0], coords[j, 0]],
                y=[coords[i, 1], coords[j, 1]],
                z=[coords[i, 2], coords[j, 2]],
                mode='lines',
                line=dict(color='gray', width=2),
                opacity=opacity,
                showlegend=False,
                hoverinfo='skip',
            ))

    fig.update_layout(
        title=f"3D Similarity Visualization (Task: {task})",
        scene=dict(
            xaxis_title="Dimension 1",
            yaxis_title="Dimension 2",
            zaxis_title="Dimension 3",
            camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5),
            ),
        ),
        height=700,
    )
    return fig


def create_similarity_based_visualization(texts, task="text-matching"):
    """Create a 3D visualization of texts based on similarity distances.

    The texts are embedded with ``get_embeddings``, compared pairwise by
    cosine similarity, converted to distances (``1 - similarity``) and
    projected to three dimensions with MDS on the precomputed distances.

    Args:
        texts: Sequence of texts to compare.
        task: Task name forwarded to ``get_embeddings``.

    Returns:
        A tuple ``(fig, similarity_matrix)``: the Plotly figure and the
        ``(n, n)`` NumPy array of cosine similarities, clipped to [-1, 1].
    """
    # Get embeddings and their pairwise cosine similarities
    embeddings = get_embeddings(texts, task=task)
    similarity_matrix = _cosine_similarity_matrix(embeddings)

    # Convert similarities to distances. The diagonal is forced to exactly
    # zero because rounding can leave a point at a tiny nonzero distance
    # from itself, which is not a valid dissimilarity matrix.
    distance_matrix = 1.0 - similarity_matrix
    np.fill_diagonal(distance_matrix, 0.0)

    # Use MDS for visualization; the fixed random_state keeps the layout
    # reproducible between runs.
    mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
    coords = mds.fit_transform(distance_matrix)

    fig = _build_similarity_figure(texts, coords, similarity_matrix, task)
    return fig, similarity_matrix
_ABOUT_MARKDOWN = """
This tool uses Jina Embeddings v3, a powerful multilingual embedding model that supports:
- Multiple tasks: text-matching, retrieval, classification, separation
- Long sequences: up to 8192 tokens
- 30+ languages
- State-of-the-art performance
"""

# Example templates offered in the "Choose an example set" selector.
_EXAMPLE_SETS = {
    "Similar Concepts": [
        "I love programming in Python",
        "Coding with Python is amazing",
        "Software development is fun",
        "I enjoy writing code",
    ],
    "Multilingual": [
        "Hello, how are you?",
        "Hola, ¿cómo estás?",
        "Bonjour, comment allez-vous?",
        "你好,你好吗?",
    ],
    "Technical Concepts": [
        "Machine learning is a subset of artificial intelligence",
        "AI systems can learn from data",
        "Neural networks process information",
        "Deep learning models require training",
    ],
}


def _similarity_label(similarity):
    """Map a similarity score to the label shown in the analysis list."""
    if similarity > 0.8:
        return "🟢 Very Similar"
    if similarity > 0.5:
        return "🟡 Moderately Similar"
    return "🔴 Different"


def main():
    """Render the explorer page and run the analysis on demand.

    The page lets the user pick a task and an example set, edit between
    two and six texts, and press "Analyze Texts" to show the 3D similarity
    plot, the similarity heatmap and a per-pair interpretation.

    Each text area is backed by the session-state key ``text_<i>``; the
    "Load Example" button overwrites those keys with the selected example
    set so the boxes actually show the loaded texts.
    """
    st.title("🔮 Jina Embeddings v3 Explorer")
    st.markdown(
        "<p class='title-font'>Explore text similarities using state-of-the-art embeddings</p>",
        unsafe_allow_html=True,
    )
    with st.expander("ℹ️ About Jina Embeddings v3", expanded=True):
        st.markdown(_ABOUT_MARKDOWN)

    # Task selection
    task = st.selectbox(
        "Select Task",
        ["text-matching", "retrieval.query", "retrieval.passage", "separation", "classification"],
        help="Different tasks optimize embeddings for specific use cases",
    )

    examples = _EXAMPLE_SETS
    col1, col2 = st.columns([3, 1])
    with col1:
        selected_example = st.selectbox("Choose an example set:", list(examples.keys()))
    example_texts = examples[selected_example]
    with col2:
        if st.button("Load Example"):
            # Kept for backward compatibility with any reader of this key.
            st.session_state.texts = example_texts
            # Write the example into the text-area keys. This runs before
            # the text areas are created below, which is when Streamlit
            # permits assigning a widget's value through session state.
            for i, example_text in enumerate(example_texts):
                st.session_state[f"text_{i}"] = example_text

    # Text input
    num_texts = st.slider("Number of texts:", 2, 6, 4)
    texts = []
    for i in range(num_texts):
        key = f"text_{i}"
        # Seed the box through session state only when it has no value yet,
        # so user edits and loaded examples are never overwritten.
        if key not in st.session_state:
            st.session_state[key] = (
                example_texts[i] if i < len(example_texts) else f"Example text {i+1}"
            )
        texts.append(st.text_area(f"Text {i+1}", height=100, key=key))

    if not st.button("Analyze Texts", type="primary"):
        return

    # Whitespace-only boxes count as empty; tell the user instead of
    # silently doing nothing.
    if not all(text.strip() for text in texts):
        st.warning("Please enter some text in every box before analyzing.")
        return

    fig, similarity_matrix = create_similarity_based_visualization(texts, task)

    # Display visualization
    st.plotly_chart(fig, use_container_width=True)

    # Show similarity matrix
    st.markdown("### Similarity Matrix")
    labels = [f"Text {i+1}" for i in range(len(texts))]
    fig_matrix = go.Figure(data=go.Heatmap(
        z=similarity_matrix,
        x=labels,
        y=labels,
        colorscale='Viridis',
        text=np.round(similarity_matrix, 3),
        texttemplate='%{text}',
        textfont={"size": 12},
    ))
    fig_matrix.update_layout(
        title=f"Similarity Matrix (Task: {task})",
        height=400,
    )
    st.plotly_chart(fig_matrix, use_container_width=True)

    # Interpretation: one line per unordered pair of texts.
    st.markdown("### 📊 Similarity Analysis")
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            similarity = similarity_matrix[i][j]
            interpretation = _similarity_label(similarity)
            st.write(f"{interpretation} ({similarity:.3f}): Text {i+1} vs Text {j+1}")
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()