"""Gradio demo comparing WordLlama and TF-IDF sentence similarity.

The script loads the default WordLlama model, builds a two-textbox Gradio
interface, and reports two similarity scores for the entered sentence pair.
"""

from typing import Tuple

import gradio as gr
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordllama import WordLlama

# Load the default WordLlama model
wl = WordLlama.load()

# Initialize TF-IDF vectorizer (re-fitted on each sentence pair below)
tfidf_vectorizer = TfidfVectorizer()


def calculate_similarities(sentence1: str, sentence2: str) -> Tuple[float, float]:
    """Score a sentence pair with WordLlama and with TF-IDF cosine similarity.

    Args:
        sentence1: First sentence, as entered in the UI.
        sentence2: Second sentence, as entered in the UI.

    Returns:
        A ``(wordllama_score, tfidf_score)`` tuple of plain Python floats.
        Both scores are ``0.0`` when either sentence is empty or
        whitespace-only. The TF-IDF score alone is ``0.0`` when the pair
        yields no usable terms for the vectorizer.
    """
    # Blank input has nothing to compare; report zero instead of letting the
    # TF-IDF fit fail on an empty vocabulary.
    if not (sentence1 and sentence1.strip() and sentence2 and sentence2.strip()):
        return 0.0, 0.0

    # WordLlama similarity
    wordllama_score = wl.similarity(sentence1, sentence2)

    # TF-IDF similarity: fit on just these two sentences, then compare rows.
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
    except ValueError:
        # scikit-learn raises ValueError when no terms survive tokenization
        # (e.g. inputs made only of single-character tokens or punctuation).
        tfidf_score = 0.0
    else:
        tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    return float(wordllama_score), float(tfidf_score)


# Carefully selected examples to compare both methods
examples = [
    # High similarity in both methods
    ["The cat is sleeping on the couch", "The cat is resting on the sofa"],
    ["I love eating pizza", "I enjoy eating pizza"],
    ["The weather is sunny today", "It is a sunny day today"],
    # Medium similarity in both methods
    ["She is reading a book", "She is holding a novel"],
    ["The car is red", "The automobile is crimson"],
    ["The children are playing in the park", "Kids are having fun at the playground"],
    # Cases where WordLlama should perform better
    ["The food was great", "The meal was excellent"],
    ["The student is studying hard", "The pupil is working diligently"],
    ["This movie is fantastic", "This film is amazing"],
    # Cases where TF-IDF should perform better
    ["The red car is parked", "The red car is moving"],
    ["The book is on the table", "The book is under the table"],
    ["She went to the store", "She went to the mall"],
    # Semantic similarity cases
    ["The laptop is expensive", "The computer costs a lot"],
    ["The dog is barking", "The canine is making noise"],
    ["The house is large", "The home is spacious"],
    # Word order importance cases
    ["The cat chased the mouse", "The mouse chased the cat"],
    ["John gave Mary a book", "Mary gave John a book"],
    ["The teacher helped the student", "The student helped the teacher"],
    # Synonym cases
    ["The car is fast", "The vehicle is quick"],
    ["The building is tall", "The structure is high"],
    ["The food is delicious", "The cuisine is tasty"],
]

# Markdown shown in the UI. Kept at column 0 so headings and list items are
# not indented inside the string and render as Markdown structure.
INTRO_MARKDOWN = """
Compare sentences using both WordLlama and TF-IDF similarity metrics.
Examples are categorized to demonstrate strengths of each method.
"""

SCORES_MARKDOWN = """
### Understanding the Scores
- **WordLlama Similarity**: Better at understanding semantic meaning and context
- **TF-IDF Similarity**: Better at exact word matching and frequency-based comparison

### Example Categories
1. High Similarity: Both methods should show high scores
2. Medium Similarity: Both methods should show moderate scores
3. Semantic Similarity: WordLlama typically performs better
4. Word Order Cases: Shows how each method handles word order
5. Synonym Cases: Tests semantic understanding
"""

# Define Gradio interface with updated layout
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown("# Text Similarity Comparison")
    gr.Markdown(INTRO_MARKDOWN)

    # Side-by-side inputs for the two sentences
    with gr.Row():
        with gr.Column():
            sentence1 = gr.Textbox(
                lines=2,
                placeholder="Enter first sentence...",
                label="First Sentence",
                info="Type or select from examples below",
            )
        with gr.Column():
            sentence2 = gr.Textbox(
                lines=2,
                placeholder="Enter second sentence...",
                label="Second Sentence",
                info="Type or select from examples below",
            )

    button = gr.Button("Calculate Similarities", variant="primary")

    # Side-by-side outputs, one per similarity method
    with gr.Row():
        wordllama_output = gr.Number(
            label="WordLlama Similarity",
            info="Contextual similarity score (0-1)",
            value=0.0,
        )
        tfidf_output = gr.Number(
            label="TF-IDF Similarity",
            info="Term frequency-based similarity score (0-1)",
            value=0.0,
        )

    gr.Markdown(SCORES_MARKDOWN)

    button.click(
        calculate_similarities,
        inputs=[sentence1, sentence2],
        outputs=[wordllama_output, tfidf_output],
    )

    # Clicking an example only fills the two textboxes; the button still has
    # to be pressed to compute scores.
    gr.Examples(
        examples=examples,
        inputs=[sentence1, sentence2],
        label="Click on any example to load it",
    )

# Launch the interface
iface.launch(share=True)