import streamlit as st
import torch
from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer

# Load the HUPD sample dataset
dataset_dict = load_dataset(
    "HUPD/hupd",
    name="sample",
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date="2016-01-01",
    train_filing_end_date="2016-01-21",
    val_filing_start_date="2016-01-22",
    val_filing_end_date="2016-01-31",
)

# Keep only applications with a final ACCEPTED/REJECTED decision,
# then sample five of each and merge them into one small dataset.
filtered_dataset = dataset_dict["validation"].filter(
    lambda e: e["decision"] == "ACCEPTED" or e["decision"] == "REJECTED"
)
seed = 88
accepted = (
    filtered_dataset.filter(lambda e: e["decision"] == "ACCEPTED")
    .shuffle(seed=seed)
    .select(range(5))
)
rejected = (
    filtered_dataset.filter(lambda e: e["decision"] == "REJECTED")
    .shuffle(seed=seed)
    .select(range(5))
)
dataset = concatenate_datasets([accepted, rejected])
dataset = dataset.sort("patent_number")

# Load the classifier trained on Colab and the matching tokenizer
model = torch.load("patent_classifier_v4.pt", map_location=torch.device("cpu"))
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_kwargs = {"padding": True, "truncation": True}


def load_data():
    """Populate the widgets with the application chosen in the select box."""
    selected_application = dataset.select([applications[st.session_state.id]])
    st.session_state.abstract = selected_application["abstract"][0]
    st.session_state.claims = selected_application["claims"][0]
    st.session_state.title = selected_application["title"][0]
    st.session_state.decision = selected_application["decision"][0]


st.title("CS-GY-6613 Project Milestone 3")

# Map patent numbers to dataset indices for the select box
applications = {}
for ds_index, example in enumerate(dataset):
    applications.update({example["patent_number"]: ds_index})

st.selectbox(
    "Select a sample patent application:",
    applications,
    on_change=load_data,
    key="id",
)

# Sample title/decision are displayed for additional context only; they are not passed to the model
st.text_input("Sample Title", key="title", value=dataset[0]["title"])
st.text_input("Sample Decision", key="decision", value=dataset[0]["decision"])

# Classifier input form
with st.form("Input Form"):
    abstract = st.text_area(
        "Abstract", key="abstract", value=dataset[0]["abstract"], height=200
    )
    claims = st.text_area(
        "Claims", key="claims", value=dataset[0]["claims"], height=200
    )
    submitted = st.form_submit_button("Get Patentability Score")
    if submitted:
        tokens = tokenizer(abstract, claims, return_tensors="pt", **tokenizer_kwargs)
        with torch.no_grad():
            output = model(**tokens)
        logits = output.logits
        pred = torch.softmax(logits, dim=1)
        # Index 1 of the softmax output is the probability that decision == ACCEPTED
        score = pred[0][1].item()
        st.markdown(
            "This application's patentability score is **{}**.".format(score)
        )
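
# Usage sketch: how the checkpoint above is assumed to have been produced and how
# the app is launched. The file name app.py and the torch.save call are assumptions
# implied by torch.load of a full model object, not taken from this script.
#
#   # On Colab, after fine-tuning the classifier, save the whole model object:
#   # torch.save(model, "patent_classifier_v4.pt")
#
#   # Locally, with the checkpoint in the working directory, run:
#   # streamlit run app.py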