Portiko committed on
Commit
70efd11
•
1 Parent(s): 5896611

Update app.py

Files changed (1)
  1. app.py +91 -5
app.py CHANGED
@@ -1,7 +1,93 @@
- from fastapi import FastAPI
-
- app = FastAPI()
-
- @app.get("/")
- def greet_json():
-     return {"Hello": "World!"}
+ import io
+ import time
+
+ import streamlit as st
+ import pandas as pd
+ from Bio import Entrez, Medline
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+ from datasets import Dataset
+
+ st.title("Prediction of Paper Impact on PubMed")
+
+ email = st.text_input("Enter your email address")
+ author_name = st.text_input("Enter the corresponding author's name")
+
+ if email and author_name:
+     # Configure the Entrez API: NCBI requires a contact email
+     Entrez.email = email
+
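+     # Query PubMed for matching IDs, then fetch each record in MEDLINE
+     # text format, pausing between requests to respect NCBI rate limits.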
+     def fetch_papers(query, max_results=5):
+         handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
+         record = Entrez.read(handle)
+         handle.close()
+         id_list = record["IdList"]
+         papers = []
+         for pubmed_id in id_list:
+             time.sleep(0.5)
+             handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
+             paper_record = handle.read()
+             handle.close()
+             papers.append(paper_record)
+         return papers
+
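+     # Pull the title and author list out of each raw MEDLINE record; MEDLINE
+     # carries no citation counts, so 'citations' is a zero placeholder.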
+     def parse_papers(paper_records):
+         parsed_data = []
+         for record in paper_records:
+             # Medline.read expects a file-like handle, so wrap the raw text
+             medline_record = Medline.read(io.StringIO(record))
+             title = medline_record.get("TI", "")
+             authors = medline_record.get("AU", [])
+             last_author = authors[-1] if authors else ""
+             citations = 0.0  # float labels are required for regression (MSE loss)
+             parsed_data.append({
+                 "title": title,
+                 "last_author": last_author,
+                 "citations": citations
+             })
+         return parsed_data
+
+     st.write("Searching for papers for the author:", author_name)
+     papers = fetch_papers(author_name, max_results=5)
+     parsed_papers = parse_papers(papers)
+
+     df = pd.DataFrame(parsed_papers)
+     st.write("Results obtained:")
+     st.write(df)
+
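+     # Fine-tune a BERT regression head to predict the citation placeholder
+     # from the last author's name.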
+     if not df.empty:
+         dataset = Dataset.from_pandas(df)
+
+         tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+
+         def preprocess_function(examples):
+             return tokenizer(examples['last_author'], truncation=True, padding='max_length', max_length=50)
+
+         tokenized_dataset = dataset.map(preprocess_function, batched=True)
+         # Drop both raw-text columns so only tensor-compatible fields remain
+         tokenized_dataset = tokenized_dataset.remove_columns(['title', 'last_author'])
+         tokenized_dataset = tokenized_dataset.rename_column('citations', 'labels')
+         tokenized_dataset.set_format('torch')
+
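+         # num_labels=1 makes the classification head a regressor trained
+         # with MSE loss on the float 'labels' column.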
+         model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
+
+         training_args = TrainingArguments(
+             output_dir='./results',
+             evaluation_strategy='epoch',
+             learning_rate=2e-5,
+             per_device_train_batch_size=16,
+             per_device_eval_batch_size=16,
+             num_train_epochs=3,
+             weight_decay=0.01,
+         )
+
+         # Note: evaluating on the training set only checks fit, not generalization
+         trainer = Trainer(
+             model=model,
+             args=training_args,
+             train_dataset=tokenized_dataset,
+             eval_dataset=tokenized_dataset,
+         )
+
+         st.write("Training the model...")
+
+         trainer.train()
+         results = trainer.evaluate()
+
+         st.write("Model results:")
+         st.write(results)