Spaces:

joinportiko
/

scientist_stalker

Sleeping

App Files Files Community

Portiko committed on Jun 8

Commit

70efd11

•

1 Parent(s): 5896611

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -5

app.py CHANGED Viewed

@@ -1,7 +1,93 @@
-from fastapi import FastAPI
-app = FastAPI()
-@app.get("/")
-def greet_json():
-    return {"Hello": "World!"}

+import streamlit as st
+from Bio import Entrez, Medline
+import pandas as pd
+import time
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+from datasets import Dataset
+# Page header and the two user inputs that drive the rest of the script.
+st.title("Prediction of Paper Impact on PubMed")
+email = st.text_input("Enter your email address")
+author_name = st.text_input("Enter the corresponding author's name")
+# Nothing below runs until both fields are non-empty.
+if email and author_name:
+    # Configure the Entrez API with the address the user typed in.
+    Entrez.email = email
+    def fetch_papers(query, max_results=5):
+        """Search PubMed for *query* and download each hit as MEDLINE text.
+
+        Args:
+            query: Search term passed to ``Entrez.esearch`` unchanged.
+            max_results: Upper bound on the number of IDs requested
+                (forwarded as ``retmax``).
+
+        Returns:
+            A list with one entry per returned PubMed ID, each entry being
+            whatever ``handle.read()`` yields for that record. The list is
+            empty when the search returns no IDs.
+        """
+        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
+        try:
+            record = Entrez.read(handle)
+        finally:
+            # Close the search handle even if parsing the response fails.
+            handle.close()
+        papers = []
+        for pubmed_id in record["IdList"]:
+            # Pause before every request; presumably to stay within NCBI's
+            # request-rate limits -- confirm before changing the delay.
+            time.sleep(0.5)
+            handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
+            try:
+                papers.append(handle.read())
+            finally:
+                # Close the fetch handle even if reading the body fails.
+                handle.close()
+        return papers
+    def parse_papers(paper_records):
+        """Extract the title and last author from raw MEDLINE text records.
+
+        Args:
+            paper_records: Iterable of MEDLINE-formatted strings, one per
+                paper.
+
+        Returns:
+            A list of dicts with the keys ``"title"``, ``"last_author"`` and
+            ``"citations"``. Blank records are skipped.
+        """
+        parsed_data = []
+        for record in paper_records:
+            if not record or not record.strip():
+                # A blank payload contains no record to parse.
+                continue
+            # Medline.read consumes its argument line by line. A raw string
+            # iterates character by character, so it must be split into lines
+            # first; keepends=True keeps the newline characters the parser
+            # uses to recognise blank separator lines.
+            medline_record = Medline.read(record.splitlines(keepends=True))
+            title = medline_record.get("TI", "")
+            authors = medline_record.get("AU", [])
+            last_author = authors[-1] if authors else ""
+            # No citation count is read from the record; 0 is a placeholder.
+            citations = 0
+            parsed_data.append({
+                "title": title,
+                "last_author": last_author,
+                "citations": citations
+            })
+        return parsed_data
+    # Fetch up to five papers for the entered author name and show them.
+    st.write("Searching for papers for the author:", author_name)
+    papers = fetch_papers(author_name, max_results=5)
+    parsed_papers = parse_papers(papers)
+    df = pd.DataFrame(parsed_papers)
+    st.write("Results obtained:")
+    st.write(df)
+    # Only attempt training when at least one paper was parsed.
+    if not df.empty:
+        dataset = Dataset.from_pandas(df)
+        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+        def preprocess_function(examples):
+            # The model input is the last-author string, padded/truncated to 50 tokens.
+            return tokenizer(examples['last_author'], truncation=True, padding='max_length', max_length=50)
+        tokenized_dataset = dataset.map(preprocess_function, batched=True)
+        tokenized_dataset = tokenized_dataset.remove_columns(['last_author'])
+        # The 'citations' column becomes the training target.
+        # NOTE(review): parse_papers sets citations to 0 for every row, so all
+        # labels are identical -- confirm whether real counts are intended.
+        tokenized_dataset = tokenized_dataset.rename_column('citations', 'labels')
+        tokenized_dataset.set_format('torch')
+        # NOTE(review): num_labels=1 presumably selects a regression head; the
+        # labels here are integers -- verify the loss accepts that dtype.
+        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
+        training_args = TrainingArguments(
+            output_dir='./results',
+            evaluation_strategy='epoch',
+            learning_rate=2e-5,
+            per_device_train_batch_size=16,
+            per_device_eval_batch_size=16,
+            num_train_epochs=3,
+            weight_decay=0.01,
+        )
+        # The same dataset is passed for training and evaluation, so the
+        # reported metrics are not computed on held-out data.
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_dataset,
+            eval_dataset=tokenized_dataset,
+        )
+        st.write("Training the model...")
+        trainer.train()
+        results = trainer.evaluate()
+        st.write("Model results:")
+        st.write(results)