"""Streamlit app: rank stored clinical-trial embeddings against a free-text query.

The user types a study description, the text is embedded with the
``stanford-crfm/pubmedgpt`` model (mean of the per-token features), and every
record loaded from ``data.npy`` is scored against it with cosine similarity.
"""

import os

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.metrics.pairwise import (
    cosine_similarity,
    euclidean_distances,
    manhattan_distances,
)
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

# set_page_config must be the first Streamlit command executed in the script,
# so it runs before any cached function call or widget below.
st.set_page_config(
    page_title="Clinical Trials Best Match [Eye Diseases]",
    page_icon="🧑‍💻",
    layout="wide",
)


@st.cache(allow_output_mutation=True)
def load_model():
    """Load and return ``(tokenizer, model)`` for ``stanford-crfm/pubmedgpt``.

    Cached by Streamlit so the weights are not reloaded on every rerun.
    """
    tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/pubmedgpt")
    model = AutoModel.from_pretrained("stanford-crfm/pubmedgpt")
    return tokenizer, model


tokenizer, model = load_model()
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)


def get_embedding(desc):
    """Return a 1-D embedding for ``desc``: the mean of its per-token features.

    The pipeline output is reshaped to ``(tokens, hidden)`` explicitly rather
    than squeezed: ``np.squeeze`` would also drop the token axis for a
    single-token input, and the mean would then collapse the hidden dimension
    into a scalar.
    """
    features = np.asarray(pipe(desc))
    # assumes the last axis is the hidden dimension — TODO confirm
    token_vectors = features.reshape(-1, features.shape[-1])
    return token_vectors.mean(axis=0)


# Constants
embs = []  # not used in this file; kept so the module-level name still exists

# Heading
st.title('Clinical Trials Search')

# Gene File, 128 dim embeddings
# NOTE(review): records are indexed with ['data'] and ['ids'] in get_sim. If the
# file holds pickled objects rather than a structured array, np.load will need
# allow_pickle=True — confirm against how data.npy was written. Also confirm the
# stored vectors have the same dimension as the model embeddings.
data = np.load("data.npy")


@st.cache(allow_output_mutation=True)
def get_sim(emb_desc, data):
    """Score every record in ``data`` against ``emb_desc`` by cosine similarity.

    Args:
        emb_desc: Query embedding; reshaped to a single row vector.
        data: Iterable of records exposing ``record['data']`` (the stored
            embedding) and ``record['ids']`` (shown in the ``url`` column).

    Returns:
        A DataFrame with columns ``url`` and ``scores`` (plain floats), sorted
        with the most similar records first.
    """
    # sklearn's pairwise metrics reject 1-D inputs, so make both sides (1, dim).
    query = np.asarray(emb_desc, dtype=float).reshape(1, -1)

    ids = []
    scores = []
    for record in data:
        candidate = np.asarray(record['data'], dtype=float).reshape(1, -1)
        ids.append(record['ids'])
        # cosine_similarity returns a (1, 1) matrix; keep only the scalar so
        # the column is numeric and sortable.
        scores.append(float(cosine_similarity(query, candidate)[0, 0]))

    df = pd.DataFrame(data={"url": ids, "scores": scores})
    # Higher cosine similarity means a closer match, so best matches go on top.
    return df.sort_values(by='scores', ascending=False)


st.subheader("🖮 Enter your clinical trial study description")
text = st.text_area('Example')

st.subheader("💻 Hit Search")
if st.button("Compute"):
    if not text.strip():
        # Do not run the model on an empty description (the state of the text
        # area on first load); ask for input instead.
        st.warning("Please enter a study description before searching.")
    else:
        with st.spinner('Searching...'):
            emb = get_embedding(text)
            df = get_sim(emb, data=data)
        st.dataframe(df)