# Streamlit app: multilingual semantic search over the titles of
# Simple English Wikipedia articles.
#
# A query typed by the user is encoded with a multilingual sentence-transformers
# model and compared against a precomputed tensor of title embeddings; the
# closest titles are shown together with the article URL and a text excerpt.

import datasets as ds
import sentence_transformers as sent
import streamlit as st
import torch

# Model used to encode queries.
# NOTE: "all-MiniLM-L6-v2" was previously tried here as an alternative.
MODEL_NAME = "distiluse-base-multilingual-cased-v1"
# Precomputed title embeddings.
# NOTE(review): presumably row i corresponds to article i of the "train"
# split loaded below -- confirm against the script that produced this file.
EMBEDDINGS_PATH = "titles-simple-0.pt"
TOP_K = 5            # number of hits shown per query
EXCERPT_CHARS = 500  # number of article characters shown per hit


@st.cache(allow_output_mutation=True)
def load_wikipedia():
    """Load the Simple English Wikipedia dump once and cache it.

    Streamlit re-executes the whole script on every interaction, so the
    dataset and its title column are loaded inside a cached function instead
    of at module level.

    Returns:
        A ``(dataset, train_split, titles)`` tuple, where ``titles`` is the
        ``"title"`` column of the train split.
    """
    dataset = ds.load_dataset("wikipedia", "20220301.simple")
    train = dataset["train"]
    return dataset, train, train["title"]


@st.cache(allow_output_mutation=True)
def load_model():
    """Return the sentence-transformers model used to encode queries (cached)."""
    return sent.SentenceTransformer(MODEL_NAME)


# allow_output_mutation=True stops Streamlit from hashing the returned tensor
# on every rerun just to check whether it was mutated.
@st.cache(allow_output_mutation=True)
def load_wikipedia_embeddings():
    """Return the precomputed title embeddings, loaded onto the CPU (cached).

    SECURITY NOTE: ``torch.load`` unpickles the file, so the embeddings file
    must come from a trusted source.
    """
    return torch.load(EMBEDDINGS_PATH, map_location=torch.device('cpu'))


# Module-level names kept for backward compatibility with the original script.
d, t, titles = load_wikipedia()

st.title("Multilingual Semantic Search for Wikipedia Simple English")

st.markdown("""
Use semantic search to find related articles in Wikipedia Simple English: using a language model (sentence-transformers/distiluse-base-multilingual-cased-v1) we can find the closests titles from Wikipedia Simple English (wikipedia) queried in any of the model's trained languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish:

- colesterol
- développement humain
- Crise dos mísseis de Cuba

Also, "near natural language" queries are usually enough to bring up relevant results. Try:

- ¿cuál es el edificio más alto del mundo?
- comment préparer du poulet frit
- melhores películas de pixar

(note: search is done only on the article titles, not the content)
""")

model = load_model()
embeddings = load_wikipedia_embeddings()

# Sample queries for manual testing:
# "Aristoteles", "Autismo", "Mental", "crecimiento poblacional"
query = st.text_input("Query (es, fr, pt, ...)")

# Skip empty and whitespace-only input; there is nothing meaningful to encode.
if query.strip():
    queries = [query]
    queries_emb = model.encode(queries, convert_to_tensor=True)
    # One list of hits per query; each hit is a dict with a 'corpus_id' key.
    hits = sent.util.semantic_search(queries_emb, embeddings, top_k=TOP_K)

    for q, query_hits in zip(queries, hits):
        # Explicit st.write calls instead of bare "magic" expressions, so the
        # output does not depend on Streamlit's magic setting being enabled.
        st.write(f"----\n{q}:\n")
        for hit in query_hits:
            cid = hit['corpus_id']
            row = t[cid]  # fetch the article row once rather than per field
            st.header(titles[cid])
            st.write(row['url'])
            st.write(row['text'][:EXCERPT_CHARS] + "...")
            st.write(hit)