Spaces:
Sleeping
Sleeping
joinportiko
committed on
Commit
•
361966d
1
Parent(s):
47ff92d
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,9 @@ import pandas as pd
|
|
4 |
import igraph as ig
|
5 |
import matplotlib.pyplot as plt
|
6 |
from collections import Counter
|
|
|
|
|
|
|
7 |
import numpy as np
|
8 |
|
9 |
# Configure the Entrez API
|
@@ -40,8 +43,8 @@ def extract_details(records):
|
|
40 |
|
41 |
return keywords, co_authors, links, pub_dates
|
42 |
|
43 |
-
# Function to
|
44 |
-
def
|
45 |
author_pairs = []
|
46 |
for authors, date in zip(co_authors, pub_dates):
|
47 |
for i in range(len(authors)):
|
@@ -54,15 +57,42 @@ def predict_collaborations(co_authors, pub_dates):
|
|
54 |
author_df["weight"] = author_df["date"].apply(lambda x: (pd.Timestamp.now() - x).days)
|
55 |
author_df["weight"] = 1 / (1 + np.exp((author_df["weight"] - 180) / 30)) # Sigmoid function to give more weight to recent collaborations
|
56 |
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
collaboration_counts["source_id"] = collaboration_counts["source"].map(author_ids)
|
63 |
-
collaboration_counts["target_id"] = collaboration_counts["target"].map(author_ids)
|
64 |
|
65 |
-
return
|
66 |
|
67 |
# Streamlit app
|
68 |
st.title("Researcher Profile")
|
@@ -91,22 +121,28 @@ if st.button("Fetch Data"):
|
|
91 |
for link in links[:5]:
|
92 |
st.write(link)
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
# Predict future collaborations
|
95 |
st.subheader("Predicted Future Collaborations")
|
96 |
-
collaboration_predictions
|
97 |
|
98 |
if not collaboration_predictions.empty:
|
99 |
g = ig.Graph(directed=False)
|
100 |
-
g.add_vertices(len(
|
101 |
-
|
102 |
-
g.
|
103 |
|
104 |
layout = g.layout("fr")
|
105 |
|
106 |
visual_style = {
|
107 |
"vertex_size": 20,
|
108 |
-
"vertex_label": [
|
109 |
-
"edge_width": [2 * w for w in
|
110 |
"layout": layout,
|
111 |
"bbox": (800, 800),
|
112 |
"margin": 50
|
|
|
4 |
import igraph as ig
|
5 |
import matplotlib.pyplot as plt
|
6 |
from collections import Counter
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.tree import DecisionTreeClassifier
|
9 |
+
from sklearn.preprocessing import LabelEncoder
|
10 |
import numpy as np
|
11 |
|
12 |
# Configure the Entrez API
|
|
|
43 |
|
44 |
return keywords, co_authors, links, pub_dates
|
45 |
|
46 |
+
# Function to prepare the dataset for training
|
47 |
+
def prepare_dataset(co_authors, pub_dates):
|
48 |
author_pairs = []
|
49 |
for authors, date in zip(co_authors, pub_dates):
|
50 |
for i in range(len(authors)):
|
|
|
57 |
author_df["weight"] = author_df["date"].apply(lambda x: (pd.Timestamp.now() - x).days)
|
58 |
author_df["weight"] = 1 / (1 + np.exp((author_df["weight"] - 180) / 30)) # Sigmoid function to give more weight to recent collaborations
|
59 |
|
60 |
+
le = LabelEncoder()
|
61 |
+
author_df["source_id"] = le.fit_transform(author_df["source"])
|
62 |
+
author_df["target_id"] = le.fit_transform(author_df["target"])
|
63 |
+
|
64 |
+
return author_df, le
|
65 |
+
|
66 |
+
# Function to train the decision tree model
|
67 |
+
def train_model(author_df):
|
68 |
+
X = author_df[["source_id", "target_id", "weight"]]
|
69 |
+
y = author_df["weight"] > author_df["weight"].median() # Binary classification: above/below median weight
|
70 |
+
|
71 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
72 |
+
|
73 |
+
model = DecisionTreeClassifier(random_state=42)
|
74 |
+
model.fit(X_train, y_train)
|
75 |
+
|
76 |
+
return model
|
77 |
+
|
78 |
+
# Function to predict future collaborations
|
79 |
+
def predict_collaborations(model, author_df, le, num_predictions=10):
|
80 |
+
unique_authors = list(set(author_df["source"]).union(set(author_df["target"])))
|
81 |
+
predictions = []
|
82 |
+
|
83 |
+
for i, author1 in enumerate(unique_authors):
|
84 |
+
for author2 in unique_authors[i+1:]:
|
85 |
+
source_id = le.transform([author1])[0]
|
86 |
+
target_id = le.transform([author2])[0]
|
87 |
+
weight = 1 # Assume a recent date for prediction
|
88 |
+
|
89 |
+
prediction = model.predict_proba([[source_id, target_id, weight]])[0][1] # Probability of collaboration
|
90 |
+
predictions.append((author1, author2, prediction))
|
91 |
|
92 |
+
predictions_df = pd.DataFrame(predictions, columns=["source", "target", "prediction"])
|
93 |
+
predictions_df = predictions_df.sort_values(by="prediction", ascending=False).head(num_predictions)
|
|
|
|
|
94 |
|
95 |
+
return predictions_df
|
96 |
|
97 |
# Streamlit app
|
98 |
st.title("Researcher Profile")
|
|
|
121 |
for link in links[:5]:
|
122 |
st.write(link)
|
123 |
|
124 |
+
# Prepare dataset
|
125 |
+
author_df, le = prepare_dataset(co_authors, pub_dates)
|
126 |
+
|
127 |
+
# Train model
|
128 |
+
model = train_model(author_df)
|
129 |
+
|
130 |
# Predict future collaborations
|
131 |
st.subheader("Predicted Future Collaborations")
|
132 |
+
collaboration_predictions = predict_collaborations(model, author_df, le)
|
133 |
|
134 |
if not collaboration_predictions.empty:
|
135 |
g = ig.Graph(directed=False)
|
136 |
+
g.add_vertices(len(set(collaboration_predictions["source"]).union(set(collaboration_predictions["target"]))))
|
137 |
+
edges = [(le.transform([row["source"]])[0], le.transform([row["target"]])[0]) for _, row in collaboration_predictions.iterrows()]
|
138 |
+
g.add_edges(edges)
|
139 |
|
140 |
layout = g.layout("fr")
|
141 |
|
142 |
visual_style = {
|
143 |
"vertex_size": 20,
|
144 |
+
"vertex_label": [le.inverse_transform([idx])[0] for idx in range(len(le.classes_))],
|
145 |
+
"edge_width": [2 * w for w in collaboration_predictions["prediction"]],
|
146 |
"layout": layout,
|
147 |
"bbox": (800, 800),
|
148 |
"margin": 50
|