joinportiko committed on
Commit
361966d
•
1 Parent(s): 47ff92d

Update app.py

Files changed (1)
  1. app.py +51 -15
app.py CHANGED
@@ -4,6 +4,9 @@ import pandas as pd
 import igraph as ig
 import matplotlib.pyplot as plt
 from collections import Counter
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.preprocessing import LabelEncoder
 import numpy as np
 
 # Configure the Entrez API
@@ -40,8 +43,8 @@ def extract_details(records):
 
     return keywords, co_authors, links, pub_dates
 
-# Function to predict future collaborations
-def predict_collaborations(co_authors, pub_dates):
+# Function to prepare the dataset for training
+def prepare_dataset(co_authors, pub_dates):
     author_pairs = []
     for authors, date in zip(co_authors, pub_dates):
         for i in range(len(authors)):
@@ -54,15 +57,42 @@ def predict_collaborations(co_authors, pub_dates):
     author_df["weight"] = author_df["date"].apply(lambda x: (pd.Timestamp.now() - x).days)
     author_df["weight"] = 1 / (1 + np.exp((author_df["weight"] - 180) / 30))  # Sigmoid function to give more weight to recent collaborations
 
-    collaboration_counts = author_df.groupby(["source", "target"])["weight"].sum().reset_index()
-    collaboration_counts = collaboration_counts.sort_values(by="weight", ascending=False)
+    le = LabelEncoder()
+    author_df["source_id"] = le.fit_transform(author_df["source"])
+    author_df["target_id"] = le.fit_transform(author_df["target"])
+
+    return author_df, le
+
+# Function to train the decision tree model
+def train_model(author_df):
+    X = author_df[["source_id", "target_id", "weight"]]
+    y = author_df["weight"] > author_df["weight"].median()  # Binary classification: above/below median weight
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    model = DecisionTreeClassifier(random_state=42)
+    model.fit(X_train, y_train)
+
+    return model
+
+# Function to predict future collaborations
+def predict_collaborations(model, author_df, le, num_predictions=10):
+    unique_authors = list(set(author_df["source"]).union(set(author_df["target"])))
+    predictions = []
+
+    for i, author1 in enumerate(unique_authors):
+        for author2 in unique_authors[i+1:]:
+            source_id = le.transform([author1])[0]
+            target_id = le.transform([author2])[0]
+            weight = 1  # Assume a recent date for prediction
+
+            prediction = model.predict_proba([[source_id, target_id, weight]])[0][1]  # Probability of collaboration
+            predictions.append((author1, author2, prediction))
 
-    # Map author names to integer IDs
-    author_ids = {name: idx for idx, name in enumerate(set(collaboration_counts["source"]).union(set(collaboration_counts["target"])))}
-    collaboration_counts["source_id"] = collaboration_counts["source"].map(author_ids)
-    collaboration_counts["target_id"] = collaboration_counts["target"].map(author_ids)
+    predictions_df = pd.DataFrame(predictions, columns=["source", "target", "prediction"])
+    predictions_df = predictions_df.sort_values(by="prediction", ascending=False).head(num_predictions)
 
-    return collaboration_counts, author_ids
+    return predictions_df
 
 # Streamlit app
 st.title("Researcher Profile")
@@ -91,22 +121,28 @@ if st.button("Fetch Data"):
     for link in links[:5]:
         st.write(link)
 
+    # Prepare dataset
+    author_df, le = prepare_dataset(co_authors, pub_dates)
+
+    # Train model
+    model = train_model(author_df)
+
     # Predict future collaborations
     st.subheader("Predicted Future Collaborations")
-    collaboration_predictions, author_ids = predict_collaborations(co_authors, pub_dates)
+    collaboration_predictions = predict_collaborations(model, author_df, le)
 
     if not collaboration_predictions.empty:
         g = ig.Graph(directed=False)
-        g.add_vertices(len(author_ids))
-        g.add_edges(zip(collaboration_predictions["source_id"], collaboration_predictions["target_id"]))
-        g.es["weight"] = collaboration_predictions["weight"]
+        g.add_vertices(len(set(collaboration_predictions["source"]).union(set(collaboration_predictions["target"]))))
+        edges = [(le.transform([row["source"]])[0], le.transform([row["target"]])[0]) for _, row in collaboration_predictions.iterrows()]
+        g.add_edges(edges)
 
         layout = g.layout("fr")
 
         visual_style = {
             "vertex_size": 20,
-            "vertex_label": [name for name, idx in sorted(author_ids.items(), key=lambda item: item[1])],
-            "edge_width": [2 * w for w in g.es["weight"]],
+            "vertex_label": [le.inverse_transform([idx])[0] for idx in range(len(le.classes_))],
+            "edge_width": [2 * w for w in collaboration_predictions["prediction"]],
             "layout": layout,
             "bbox": (800, 800),
             "margin": 50
 
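For reference, the recency weighting that `prepare_dataset` keeps from the previous version is a logistic curve over the age of each collaboration in days. A small standalone illustration with made-up ages (not part of the commit):

```python
import numpy as np

# The weight formula from prepare_dataset: a logistic curve centred at 180 days
# with a 30-day scale, so recent collaborations score near 1 and older ones
# decay towards 0.
for days_old in [0, 90, 180, 270, 365]:
    weight = 1 / (1 + np.exp((days_old - 180) / 30))
    print(f"{days_old:3d} days old -> weight {weight:.3f}")
```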
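One detail worth a second look: `prepare_dataset` calls `fit_transform` twice on the same `LabelEncoder`, so the encoder that gets returned is fitted only on the `target` column while `source_id` came from a mapping that was overwritten, and `le.transform` in `predict_collaborations` can raise for an author who only ever appears as a source. A sketch of one way to keep a single shared mapping; the helper name `encode_authors` is hypothetical, not part of this commit:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Hypothetical helper: fit one encoder on the union of both columns so that
# source_id and target_id share the same mapping and later transform() calls
# never see an unknown author name.
def encode_authors(author_df: pd.DataFrame):
    le = LabelEncoder()
    le.fit(pd.concat([author_df["source"], author_df["target"]]))
    author_df["source_id"] = le.transform(author_df["source"])
    author_df["target_id"] = le.transform(author_df["target"])
    return author_df, le
```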
 
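Similarly, the graph code adds one vertex per author that appears in the predictions but uses encoder IDs, which run over every author in `author_df`, as edge endpoints, so an edge index can point past the last vertex. A sketch of building the plot graph with contiguous, prediction-local indices instead; `build_prediction_graph` is illustrative, not part of this commit:

```python
import igraph as ig

# Illustrative helper: one vertex per author that actually appears in the
# predictions, edges indexed by position in that local author list, and edge
# weights taken from the predicted probabilities.
def build_prediction_graph(predictions_df):
    authors = sorted(set(predictions_df["source"]).union(predictions_df["target"]))
    index = {name: i for i, name in enumerate(authors)}

    g = ig.Graph(directed=False)
    g.add_vertices(len(authors))
    g.vs["label"] = authors
    g.add_edges([(index[s], index[t])
                 for s, t in zip(predictions_df["source"], predictions_df["target"])])
    g.es["weight"] = list(predictions_df["prediction"])
    return g
```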