# Page header captured together with this file (not part of the program):
# DexterSptizu's picture
# Update app.py
# b1ec3a2 verified
# raw
# history blame
# 4.53 kB
import gradio as gr
from wordllama import WordLlama
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load the default WordLlama model once, when the module is executed, so that
# every similarity request reuses the same instance.
wl = WordLlama.load()
# Module-level TF-IDF vectorizer with default settings. calculate_similarities
# calls fit_transform on it for each sentence pair, so its vocabulary is
# rebuilt from only the two sentences being compared.
tfidf_vectorizer = TfidfVectorizer()
def calculate_similarities(sentence1, sentence2):
    """Score two sentences with WordLlama and with TF-IDF cosine similarity.

    Args:
        sentence1: First sentence, as entered in the UI textbox.
        sentence2: Second sentence, as entered in the UI textbox.

    Returns:
        A ``(wordllama_score, tfidf_score)`` tuple of Python floats.
        Both scores are ``0.0`` when either input is missing or blank.
        The TF-IDF score alone is ``0.0`` when the vectorizer cannot extract
        a single term from the two sentences.
    """
    # Missing or whitespace-only input cannot be scored meaningfully and would
    # make the TF-IDF vectorizer raise, so answer with zeros instead of an
    # error in the UI.
    if not (sentence1 or "").strip() or not (sentence2 or "").strip():
        return 0.0, 0.0

    # WordLlama similarity
    wordllama_score = float(wl.similarity(sentence1, sentence2))

    # TF-IDF similarity: the vectorizer is fitted on just these two sentences,
    # then the two resulting rows are compared.
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenization, e.g. inputs made only of one-character words
        # or punctuation. Treat that as "no lexical overlap".
        return wordllama_score, 0.0

    tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return wordllama_score, float(tfidf_score)
# Hand-picked sentence pairs for comparing the two methods, grouped by the
# behaviour each group is meant to illustrate. The group labels are for
# readers of this file only; the UI receives the flattened list below.
_EXAMPLE_GROUPS = {
    "High similarity in both methods": [
        ["The cat is sleeping on the couch", "The cat is resting on the sofa"],
        ["I love eating pizza", "I enjoy eating pizza"],
        ["The weather is sunny today", "It is a sunny day today"],
    ],
    "Medium similarity in both methods": [
        ["She is reading a book", "She is holding a novel"],
        ["The car is red", "The automobile is crimson"],
        ["The children are playing in the park", "Kids are having fun at the playground"],
    ],
    "Cases where WordLlama should perform better": [
        ["The food was great", "The meal was excellent"],
        ["The student is studying hard", "The pupil is working diligently"],
        ["This movie is fantastic", "This film is amazing"],
    ],
    "Cases where TF-IDF should perform better": [
        ["The red car is parked", "The red car is moving"],
        ["The book is on the table", "The book is under the table"],
        ["She went to the store", "She went to the mall"],
    ],
    "Semantic similarity cases": [
        ["The laptop is expensive", "The computer costs a lot"],
        ["The dog is barking", "The canine is making noise"],
        ["The house is large", "The home is spacious"],
    ],
    "Word order importance cases": [
        ["The cat chased the mouse", "The mouse chased the cat"],
        ["John gave Mary a book", "Mary gave John a book"],
        ["The teacher helped the student", "The student helped the teacher"],
    ],
    "Synonym cases": [
        ["The car is fast", "The vehicle is quick"],
        ["The building is tall", "The structure is high"],
        ["The food is delicious", "The cuisine is tasty"],
    ],
}

# Flatten in declaration order (dicts preserve insertion order) so that
# `examples` is one list of [first_sentence, second_sentence] pairs.
examples = [pair for group in _EXAMPLE_GROUPS.values() for pair in group]
# Build the Gradio Blocks UI: a title and intro, two sentence inputs side by
# side, a trigger button, two numeric score outputs, an explanatory panel and
# a table of clickable examples.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown("# Text Similarity Comparison")
    gr.Markdown(
        "\n"
        "Compare sentences using both WordLlama and TF-IDF similarity metrics.\n"
        "Examples are categorized to demonstrate strengths of each method.\n"
    )

    # Inputs: one column per sentence.
    with gr.Row():
        with gr.Column():
            sentence1 = gr.Textbox(
                label="First Sentence",
                placeholder="Enter first sentence...",
                info="Type or select from examples below",
                lines=2,
            )
        with gr.Column():
            sentence2 = gr.Textbox(
                label="Second Sentence",
                placeholder="Enter second sentence...",
                info="Type or select from examples below",
                lines=2,
            )

    button = gr.Button("Calculate Similarities", variant="primary")

    # Outputs: one number per similarity method, both starting at 0.0.
    with gr.Row():
        wordllama_output = gr.Number(
            value=0.0,
            label="WordLlama Similarity",
            info="Contextual similarity score (0-1)",
        )
        tfidf_output = gr.Number(
            value=0.0,
            label="TF-IDF Similarity",
            info="Term frequency-based similarity score (0-1)",
        )

    gr.Markdown(
        "\n"
        "### Understanding the Scores\n"
        "- **WordLlama Similarity**: Better at understanding semantic meaning and context\n"
        "- **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison\n"
        "### Example Categories\n"
        "1. High Similarity: Both methods should show high scores\n"
        "2. Medium Similarity: Both methods should show moderate scores\n"
        "3. Semantic Similarity: WordLlama typically performs better\n"
        "4. Word Order Cases: Shows how each method handles word order\n"
        "5. Synonym Cases: Tests semantic understanding\n"
    )

    # Clicking the button sends both sentences to calculate_similarities and
    # writes the two returned scores into the matching output fields.
    button.click(
        fn=calculate_similarities,
        inputs=[sentence1, sentence2],
        outputs=[wordllama_output, tfidf_output],
    )

    # Selecting an example row fills both sentence textboxes.
    gr.Examples(
        examples=examples,
        inputs=[sentence1, sentence2],
        label="Click on any example to load it",
    )
# Launch the interface. This runs whenever the module is executed, because it
# is a plain top-level statement.
# NOTE(review): per Gradio's launch() documentation, share=True also requests
# a public share link for the app — confirm that public exposure is intended.
iface.launch(share=True)