Spaces:

joinportiko
/

scientist_stalker

Sleeping

App Files Files Community

joinportiko committed on Jun 8

Commit

361966d

•

1 Parent(s): 47ff92d

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -15

app.py CHANGED Viewed

@@ -4,6 +4,9 @@ import pandas as pd
 import igraph as ig
 import matplotlib.pyplot as plt
 from collections import Counter
 import numpy as np
 # Configure the Entrez API
@@ -40,8 +43,8 @@ def extract_details(records):
     return keywords, co_authors, links, pub_dates
-# Function to predict future collaborations
-def predict_collaborations(co_authors, pub_dates):
     author_pairs = []
     for authors, date in zip(co_authors, pub_dates):
         for i in range(len(authors)):
@@ -54,15 +57,42 @@ def predict_collaborations(co_authors, pub_dates):
     author_df["weight"] = author_df["date"].apply(lambda x: (pd.Timestamp.now() - x).days)
     author_df["weight"] = 1 / (1 + np.exp((author_df["weight"] - 180) / 30))  # Sigmoid function to give more weight to recent collaborations
-    collaboration_counts = author_df.groupby(["source", "target"])["weight"].sum().reset_index()
-    collaboration_counts = collaboration_counts.sort_values(by="weight", ascending=False)
-    # Map author names to integer IDs
-    author_ids = {name: idx for idx, name in enumerate(set(collaboration_counts["source"]).union(set(collaboration_counts["target"])))}
-    collaboration_counts["source_id"] = collaboration_counts["source"].map(author_ids)
-    collaboration_counts["target_id"] = collaboration_counts["target"].map(author_ids)
-    return collaboration_counts, author_ids
 # Streamlit app
 st.title("Researcher Profile")
@@ -91,22 +121,28 @@ if st.button("Fetch Data"):
             for link in links[:5]:
                 st.write(link)
             # Predict future collaborations
             st.subheader("Predicted Future Collaborations")
-            collaboration_predictions, author_ids = predict_collaborations(co_authors, pub_dates)
             if not collaboration_predictions.empty:
                 g = ig.Graph(directed=False)
-                g.add_vertices(len(author_ids))
-                g.add_edges(zip(collaboration_predictions["source_id"], collaboration_predictions["target_id"]))
-                g.es["weight"] = collaboration_predictions["weight"]
                 layout = g.layout("fr")
                 visual_style = {
                     "vertex_size": 20,
-                    "vertex_label": [name for name, idx in sorted(author_ids.items(), key=lambda item: item[1])],
-                    "edge_width": [2 * w for w in g.es["weight"]],
                     "layout": layout,
                     "bbox": (800, 800),
                     "margin": 50

 import igraph as ig
 import matplotlib.pyplot as plt
 from collections import Counter
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.preprocessing import LabelEncoder
 import numpy as np
 # Configure the Entrez API
     return keywords, co_authors, links, pub_dates
+# Function to prepare the dataset for training
+def prepare_dataset(co_authors, pub_dates):
     author_pairs = []
     for authors, date in zip(co_authors, pub_dates):
         for i in range(len(authors)):
     author_df["weight"] = author_df["date"].apply(lambda x: (pd.Timestamp.now() - x).days)
     author_df["weight"] = 1 / (1 + np.exp((author_df["weight"] - 180) / 30))  # Sigmoid function to give more weight to recent collaborations
+    le = LabelEncoder()
+    author_df["source_id"] = le.fit_transform(author_df["source"])
+    author_df["target_id"] = le.fit_transform(author_df["target"])
+    return author_df, le
+# Function to train the decision tree model
def train_model(author_df):
    """Fit a decision tree that labels collaborations as above/below median weight.

    Args:
        author_df: DataFrame with numeric ``source_id``, ``target_id`` and
            ``weight`` columns.

    Returns:
        A ``DecisionTreeClassifier`` fitted on the three columns above, with
        the boolean target ``weight > median(weight)``.

    NOTE(review): ``weight`` is used both as an input feature and as the
    source of the label, so the tree can separate the classes from that one
    column alone -- confirm this is intended.
    """
    features = author_df[["source_id", "target_id", "weight"]]
    # Binary classification: above/below median weight
    labels = author_df["weight"] > author_df["weight"].median()

    if len(author_df) > 1:
        # Keep the 80/20 split so the fitted tree matches earlier behaviour;
        # the held-out part is not used here. A single-row frame cannot be
        # split (the train part would be empty), so it is fitted as-is.
        features, _, labels, _ = train_test_split(
            features, labels, test_size=0.2, random_state=42
        )

    model = DecisionTreeClassifier(random_state=42)
    model.fit(features, labels)
    return model
+# Function to predict future collaborations
def predict_collaborations(model, author_df, le, num_predictions=10):
    """Rank author pairs by the model's predicted collaboration probability.

    Every unordered pair of authors found in ``author_df`` is scored with
    ``model.predict_proba`` using a fixed weight of 1, and the
    ``num_predictions`` highest-scoring pairs are returned.

    Args:
        model: Fitted classifier exposing ``classes_`` and ``predict_proba``.
            It is queried with the columns ``source_id``, ``target_id`` and
            ``weight``.
        author_df: DataFrame with ``source`` and ``target`` author-name
            columns.
        le: Fitted label encoder exposing ``classes_`` and ``transform``.
            Authors that are not in ``le.classes_`` cannot be encoded and are
            skipped.
        num_predictions: Maximum number of pairs to return.

    Returns:
        DataFrame with columns ``source``, ``target`` and ``prediction``,
        sorted by ``prediction`` in descending order. It is empty when fewer
        than two encodable authors are present.
    """
    from itertools import combinations

    columns = ["source", "target", "prediction"]

    # Sort the names so the pair order (and therefore tie-breaking in the
    # final ranking) does not depend on set iteration order.
    known = set(le.classes_)
    authors = sorted(
        name
        for name in set(author_df["source"]).union(author_df["target"])
        if name in known
    )
    if len(authors) < 2:
        return pd.DataFrame(columns=columns)

    # Encode each author once instead of once per pair.
    author_ids = dict(zip(authors, le.transform(authors)))
    pairs = list(combinations(authors, 2))
    features = pd.DataFrame({
        "source_id": [author_ids[first] for first, _ in pairs],
        "target_id": [author_ids[second] for _, second in pairs],
        "weight": 1,  # Assume a recent date for prediction
    })

    # Score all pairs in a single call.
    probabilities = model.predict_proba(features)

    # A model fitted on a single class has no column for the positive label;
    # treat that as zero probability of collaboration.
    classes = list(model.classes_)
    if True in classes:
        positive = classes.index(True)
        scores = [float(row[positive]) for row in probabilities]
    else:
        scores = [0.0] * len(pairs)

    predictions_df = pd.DataFrame({
        "source": [first for first, _ in pairs],
        "target": [second for _, second in pairs],
        "prediction": scores,
    })
    # Stable sort keeps equally scored pairs in alphabetical pair order.
    predictions_df = predictions_df.sort_values(
        by="prediction", ascending=False, kind="stable"
    )
    return predictions_df.head(num_predictions)
 # Streamlit app
 st.title("Researcher Profile")
             for link in links[:5]:
                 st.write(link)
+            # Prepare dataset
+            author_df, le = prepare_dataset(co_authors, pub_dates)
+            # Train model
+            model = train_model(author_df)
             # Predict future collaborations
             st.subheader("Predicted Future Collaborations")
+            collaboration_predictions = predict_collaborations(model, author_df, le)
             if not collaboration_predictions.empty:
                 g = ig.Graph(directed=False)
+                g.add_vertices(len(set(collaboration_predictions["source"]).union(set(collaboration_predictions["target"]))))
+                edges = [(le.transform([row["source"]])[0], le.transform([row["target"]])[0]) for _, row in collaboration_predictions.iterrows()]
+                g.add_edges(edges)
                 layout = g.layout("fr")
                 visual_style = {
                     "vertex_size": 20,
+                    "vertex_label": [le.inverse_transform([idx])[0] for idx in range(len(le.classes_))],
+                    "edge_width": [2 * w for w in collaboration_predictions["prediction"]],
                     "layout": layout,
                     "bbox": (800, 800),
                     "margin": 50