import glob
import os
import json
import glob
import tiktoken
import pandas as pd
import copy
import numpy as np
import matplotlib.pyplot as plt
import re
import time
# Delimiters written around each LLM call in the plain (non-LangChain) logs.
_PLAIN_PROMPT_RE = re.compile(r"===================prompt=====================" + r"(.*?)" + r"===================.*?response.*?=====================", re.DOTALL)
_PLAIN_COMPLETION_RE = re.compile(r"===================.*?response.*?=====================" + r"(.*?)" + r"===================tokens=====================", re.DOTALL)
# ANSI-coloured markers matched in logs whose file name contains "langchain".
_LANGCHAIN_PROMPT_RE = re.compile(r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m", re.DOTALL)
_LANGCHAIN_COMPLETION_RE = re.compile(r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m", re.DOTALL)


def _count_log_tokens(log_glob, enc):
    """Return (prompt_tokens, completed_tokens) summed over files matching log_glob."""
    prompt_tokens = 0
    completed_tokens = 0
    for file in glob.glob(log_glob):
        with open(file, "r") as f:
            content = f.read()
        # The two log formats use different delimiters, so each file is parsed
        # with exactly one pattern family, chosen from its file name.
        if "langchain" not in file:
            prompt_re, completion_re = _PLAIN_PROMPT_RE, _PLAIN_COMPLETION_RE
        else:
            prompt_re, completion_re = _LANGCHAIN_PROMPT_RE, _LANGCHAIN_COMPLETION_RE
        prompt_tokens += sum(len(enc.encode(p)) for p in prompt_re.findall(content))
        completed_tokens += sum(len(enc.encode(c)) for c in completion_re.findall(content))
    return prompt_tokens, completed_tokens


def estimate_tokens(path):
    """Estimate the token usage, step count and wall time of one run.

    Args:
        path: Path to the run's ``trace.json``. The other inputs are located
            relative to it: ``../agent_log/*.log``, ``tool_logs/*.log`` and
            ``overall_time.txt``.

    Returns:
        A tuple ``(prompt_tokens, completed_tokens, tool_prompt_tokens,
        tool_completed_tokens, num_steps, total_time)``. Token counts are
        obtained by encoding the matched log sections with the tiktoken
        encoding for "gpt-4".
    """
    enc = tiktoken.encoding_for_model("gpt-4")

    prompt_tokens, completed_tokens = _count_log_tokens(
        path.replace("trace.json", "../agent_log/*.log"), enc
    )

    num_steps = 0
    # NOTE(review): reconstructed as "fall back to 0 when the run has no usable
    # trace/time file"; the damaged source reset total_time to 0 right after
    # reading it -- confirm this matches the intended behaviour.
    try:
        with open(path, "r") as f:
            num_steps = len(json.load(f)["steps"])
        with open(path.replace("trace.json", "overall_time.txt"), "r") as f:
            total_time = float(f.read())
    except (OSError, ValueError, KeyError, TypeError):
        total_time = 0

    tool_prompt_tokens, tool_completed_tokens = _count_log_tokens(
        path.replace("trace.json", "tool_logs/*.log"), enc
    )

    return prompt_tokens, completed_tokens, tool_prompt_tokens, tool_completed_tokens, num_steps, total_time
def oom_error(path):
    """Return True if the run log or the agent main log mentions a CUDA OOM.

    Args:
        path: Path to the run's ``trace.json``; ``../log`` and
            ``../agent_log/main_log`` are resolved relative to it.

    Returns:
        bool: whether "CUDA out of memory" occurs in either file. The main log
        is only opened when the first log does not contain the message.

    Raises:
        OSError: if a log that has to be read cannot be opened.
    """
    message = "CUDA out of memory"
    candidates = (
        path.replace("trace.json", "../log"),
        path.replace("trace.json", "../agent_log/main_log"),
    )
    for log_path in candidates:
        # ``with`` closes the handle; the original left it to the GC.
        with open(log_path, "r") as f:
            if message in f.read():
                return True
    return False
def mkl_error(path):
    """Return True if either run log contains an MKL / OpenBLAS threading error.

    Args:
        path: Path to the run's ``trace.json``; ``../log`` and
            ``../agent_log/main_log`` are resolved relative to it.

    Returns:
        bool: whether any of the known messages occurs in ``../log`` or, if
        not, in ``../agent_log/main_log``.

    Raises:
        OSError: if a log that has to be read cannot be opened.
    """
    messages = ["rror: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with library.", "OpenBLAS blas_thread_init:"]
    candidates = (
        path.replace("trace.json", "../log"),
        path.replace("trace.json", "../agent_log/main_log"),
    )
    for log_path in candidates:
        # Read each file once instead of re-opening it for every message.
        with open(log_path, "r") as f:
            text = f.read()
        if any(m in text for m in messages):
            return True
    return False
def quota_error(path):
    """Return True if the run's ``error.txt`` reports an exhausted quota.

    Args:
        path: Path to the run's ``trace.json``; ``error.txt`` is looked up by
            replacing that file name in the path.

    Returns:
        bool: False when ``error.txt`` does not exist, otherwise whether it
        contains the quota message.
    """
    log = path.replace("trace.json", "error.txt")
    if not os.path.exists(log):
        return False
    message = "RemoteServiceError: EXCEPTION: total quota"
    # ``with`` closes the handle; the original left it to the GC.
    with open(log, "r") as f:
        return message in f.read()
def connection_error(path):
    """Return True if the run logs show a connection or API availability failure.

    Args:
        path: Path to the run's ``trace.json``; ``../log`` and
            ``../agent_log/main_log`` are resolved relative to it.

    Returns:
        bool: True when ``../log`` contains "Connection aborted", or when
        ``../agent_log/main_log`` contains one of the known API error
        messages. The main log is only read if the first check fails.

    Raises:
        OSError: if a log that has to be read cannot be opened.
    """
    log = path.replace("trace.json", "../log")
    main_log = path.replace("trace.json", "../agent_log/main_log")
    bad = ["You exceeded your current quota, please check your plan and billing details.", "Error: 'text-similarity-ada-001'", "Error: 'text-embedding-ada-001'"]

    with open(log, "r") as f:
        if "Connection aborted" in f.read():
            return True

    # Read the main log once instead of once per pattern.
    with open(main_log, "r") as f:
        main_text = f.read()
    return any(b in main_text for b in bad)
def langchain_error(path):
    """Return True if the run's ``error.txt`` records a LangChain parser failure.

    Args:
        path: Path to the run's ``trace.json``; ``error.txt`` is looked up in
            the same directory.

    Returns:
        bool: False when ``error.txt`` does not exist, otherwise whether it
        contains "langchain.schema.OutputParserException".
    """
    error_file = os.path.join(path.replace("trace.json", ""), "error.txt")
    if not os.path.exists(error_file):
        return False
    # ``with`` closes the handle; the original left it to the GC.
    with open(error_file, "r") as f:
        return "langchain.schema.OutputParserException" in f.read()
def error(path):
    """Tell whether a run should be treated as failed.

    A run counts as failed when it left an ``error.txt`` that is not a
    LangChain output-parser error, or when it never produced
    ``overall_time.txt``. Both files are looked up next to ``trace.json``.
    """
    run_dir = path.replace("trace.json", "")
    error_file = os.path.join(run_dir, "error.txt")
    time_file = os.path.join(run_dir, "overall_time.txt")

    if os.path.exists(error_file) and not langchain_error(path):
        return True
    return not os.path.exists(time_file)
def json_error(path):
    """Return True if the agent main log mentions JSONDecodeError more than twice.

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: if the main log cannot be opened.
    """
    main_log = path.replace("trace.json", "../agent_log/main_log")
    # ``with`` closes the handle; the original left it to the GC.
    with open(main_log, "r") as f:
        return f.read().count("JSONDecodeError") > 2
def langchain_final(path):
    """Return True if the agent main log contains the text "Final Answer".

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: if the main log cannot be opened.
    """
    # ``with`` closes the handle; the original left it to the GC.
    with open(path.replace("trace.json", "../agent_log/main_log"), "r") as f:
        return "Final Answer" in f.read()
def autogpt_final(path):
    """Return True if the agent main log contains the text "Goal achieved".

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: if the main log cannot be opened.
    """
    # ``with`` closes the handle; the original left it to the GC.
    with open(path.replace("trace.json", "../agent_log/main_log"), "r") as f:
        return "Goal achieved" in f.read()
def long_prompt_error(path):
    """Return True if the agent main log reports a too-long tool input.

    Args:
        path: Path to the run's ``trace.json``; ``../agent_log/main_log`` is
            resolved relative to it.

    Raises:
        OSError: if the main log cannot be opened.
    """
    main_log = path.replace("trace.json", "../agent_log/main_log")
    # ``with`` closes the handle; the original left it to the GC.
    with open(main_log, "r") as f:
        return "EnvError: too long input for the tool" in f.read()
def get_all_runs_with_log(all_runs=None, benchmarks_dir="../research_assistant_final/MLAgentBench/benchmarks"):
    """Index runs by their ``trace.json`` path and flag the failed ones.

    Args:
        all_runs: Iterable of ``trace.json`` paths. Each path is split on "/"
            and its components ``[-5:-2]`` are taken as (exp, task, run). When
            omitted, a module-level ``all_runs`` is used if one is defined.
        benchmarks_dir: Directory whose entries are the accepted task names;
            runs whose task is not listed there are ignored.

    Returns:
        ``(df, df_no_error)``: ``df`` has one row per accepted run with the
        columns task, exp, run, path and one boolean column per error check;
        ``df_no_error`` is the subset kept after filtering on those flags.

    Raises:
        ValueError: if no run list was given and none is defined at module
            level, or if a path has too few components to unpack.
    """
    #TODO: fix paths to where your trace.json are
    if all_runs is None:
        # The original body read an ``all_runs`` name it never defined;
        # keep honouring a module-level definition if the user adds one.
        all_runs = globals().get("all_runs")
    if all_runs is None:
        raise ValueError("all_runs is not defined: pass the list of trace.json paths to get_all_runs_with_log")
    all_runs = list(all_runs)

    # List the benchmark directory once rather than once per run.
    known_tasks = set(os.listdir(benchmarks_dir)) if all_runs else set()

    rows = []
    for r in all_runs:
        exp, task, run = r.split("/")[-5:-2]
        if task in known_tasks:
            rows.append({"task": task, "exp": exp, "run": run, "path": r})
    # Explicit columns keep the frame usable when no run was accepted.
    df = pd.DataFrame(rows, columns=["task", "exp", "run", "path"])

    checks = [
        ("error", error),
        ("json_error", json_error),
        ("long_prompt_error", long_prompt_error),
        ("oom_error", oom_error),
        ("connection_error", connection_error),
        ("mkl_error", mkl_error),
        ("quota_error", quota_error),
        ("langchain_error", langchain_error),
    ]
    for column, check in checks:
        # astype(bool) keeps the ``~`` masks below valid on an empty frame.
        df[column] = df["path"].apply(check).astype(bool)

    clean = (~df["error"]) & (~df["connection_error"])
    always_kept = df["exp"].isin(["no_retrieval_gpt4", "full_gpt4_long"])
    langchain_parse_only = df["exp"].isin(["langchain", "langchain_long"]) & df["langchain_error"]
    df_no_error = df[(clean | always_kept | langchain_parse_only) & (~df["oom_error"]) & (~df["mkl_error"])]
    return df , df_no_error
# Tasks for which get_improvement treats a lower score as the better one.
lower_the_better_tasks = [ "parkinsons-disease", "feedback", "BabyLM", "llama-inference", "house-price", "vectorization"]

# TODO: add proper label mapping and task name mapping for pretty printing in the figure
# Experiment identifier -> label shown in the figures.
print_labels = {
    "no_retrieval_gpt4" : "GPT-4",
    "no_retrieval" : "Claude v1.0",
    "autogpt" : "AutoGPT",
    "react" : "React",
    "langchain" : "LangChain (React)",
    "sanity_check" : "Baseline",
}

# Raw task directory name -> task label shown in the figures.
# Two raw names map to "house-price", so the values are not unique.
print_task_labels = {
    "cifar10_training" : "cifar10",
    "imdb" : "imdb",
    "ogbn-arxiv" : "ogbn-arxiv",
    "home-data-for-ml-course" : "house-price",
    "kaggle_training_reg" : "house-price",
    "kaggle_training_class" : "spaceship-titanic",
    "amp-parkinsons-disease-progression-prediction" : "parkinsons-disease",
    "fathomnet-out-of-sample-detection" : "fathomnet",
    "feedback-prize-english-language-learning" : "feedback",
    "google-research-identify-contrails-reduce-global-warming" : "identify-contrails",
    "speed-up" : "llama-inference",
    "vectorisation" : "vectorization",
    "CLRS" : "CLRS",
    "babylm" : "BabyLM",
}
def get_improvement(df, baseline, thresh = None, prefix=""):
    """Score each run's improvement over its task baseline.

    Args:
        df: Runs table with a "task" column. With a non-empty ``prefix`` it
            must hold ``f"{prefix}score"`` and gains the columns
            ``f"{prefix}increase"`` / ``f"{prefix}decrease"`` (df is modified
            in place). With the default empty prefix the "increase" and
            "decrease" columns must already exist.
        baseline: Table with "task" and "final_score" columns; the first row
            matching a task supplies its baseline score.
        thresh: When truthy, return booleans telling whether the relative
            improvement exceeds this fraction; otherwise return the relative
            improvement itself.
        prefix: Column-name prefix selecting which score is evaluated.

    Returns:
        A Series aligned with ``df``. For tasks in ``lower_the_better_tasks``
        the sign of the relative change is flipped so that positive always
        means better.
    """
    increase_col = f"{prefix}increase"
    decrease_col = f"{prefix}decrease"

    def relative_change(row):
        score = row[f"{prefix}score"]
        if score is None:
            return None
        if pd.isna(score):
            # A missing score stays missing; skipping the baseline lookup
            # avoids an IndexError for tasks without a baseline row.
            return score
        base = baseline[(baseline["task"] == row["task"])]["final_score"].values[0]
        return (score - base) / base

    if prefix:
        changes = df[[f"{prefix}score", "task"]].apply(relative_change, axis=1)
        df[increase_col] = changes
        df[decrease_col] = changes

    def exceeds_threshold(row):
        if row[increase_col] is None or pd.isna(row[increase_col]):
            return False
        if row["task"] not in lower_the_better_tasks:
            return row[increase_col] > thresh
        return row[decrease_col] < - thresh

    def signed_improvement(row):
        if row[increase_col] is None:
            return None
        if row["task"] not in lower_the_better_tasks:
            return row[increase_col]
        return - row[decrease_col]

    columns = df[["task", increase_col, decrease_col]]
    if thresh:
        return columns.apply(exceeds_threshold, axis=1)
    return columns.apply(signed_improvement, axis=1)
# performance
def _override_baseline(baseline, task, final_score):
    """Pin the baseline score of ``task``, appending a row when it has none."""
    matches = baseline[baseline["task"] == task].index.values
    if len(matches) > 0:
        baseline.loc[matches[0], "final_score"] = final_score
        return baseline
    new_row = pd.DataFrame([{"task": task, "exp": "Baseline", "final_score": final_score}])
    return pd.concat([baseline, new_row], ignore_index=True)


def get_all_runs_eval(print_labels = print_labels, print_task_labels = print_task_labels):
    """Load the evaluation results and build the runs and baseline tables.

    Args:
        print_labels: Mapping from experiment identifier to display label;
            experiments that are not keys of it are dropped.
        print_task_labels: Mapping from raw task name to display label; tasks
            without an entry keep their raw name.

    Returns:
        ``(df, baseline)``: ``df`` holds one row per retained run with display
        labels in "exp"/"task"; ``baseline`` holds "task", "exp" and
        "final_score" for the "Baseline" experiment plus fixed overrides.
    """
    # TODO: collect all evaluation jsons into all_results
    all_results = {}
    for f in glob.glob("/lfs/local/0/qhwang/nlp_logs/*.json"):
        with open(f, "r") as fh:
            all_results.update(json.load(fh))

    rows = []
    for n, results in all_results.items():
        if n.endswith(".json"):
            results = {n: results}
        exp, task, run = n.split("/")[-3:]
        exp = exp.strip()
        if exp == "react":
            # NOTE(review): the body of this branch was lost in the source.
            # Skipping is assumed because the plotting code has no "React"
            # slot in its experiment order -- confirm.
            continue
        task = task.strip()
        run = run.strip()
        for source_file, r in results.items():
            r_ = copy.deepcopy(r)
            if len(r["score"]) < len(r["score_steps"])+1:
                with open(r_["path"], "r") as fh:
                    r_["score_steps"].append(len(json.load(fh)["steps"]))
            r_["score"] = np.array(r_["score"])
            r_["score_steps"] = np.array(r_["score_steps"])
            if exp == "no_retrieval":
                # Only scores recorded before step 16 are kept for this experiment.
                early = r_["score_steps"] < 16
                r_["score"] = r_["score"][early]
                r_["score_steps"] = r_["score_steps"][early]
            if exp == "langchain":
                r_["submitted_final_answer"] = langchain_final(r_["path"])
            if exp == "autogpt":
                r_["submitted_final_answer"] = autogpt_final(r_["path"])
            rows.append({"task": task, "exp": exp, "run": run, **r_})
    # One construction instead of a concat per row.
    df = pd.DataFrame(rows)

    df["connection_error"] = df["path"].apply(connection_error)
    df["has_error"] = df["path"].apply(error)
    df["oom_error"] = df["path"].apply(oom_error)
    df["mkl_error"] = df["path"].apply(mkl_error)
    df["langchain_error"] = df["path"].apply(langchain_error)
    print(len(df[(df["error"] != "") | (df["connection_error"] == True)]))

    clean = (~df["has_error"]) & (df["connection_error"] == False)
    always_kept = df["exp"].isin(["no_retrieval_gpt4", "full_gpt4_long"])
    langchain_parse_only = df["exp"].isin(["langchain", "langchain_long"]) & df["langchain_error"]
    # copy() so the column assignments below do not write into a slice.
    df = df[(clean | always_kept | langchain_parse_only) & (~df["oom_error"]) & (~df["mkl_error"])].copy()

    # Fold the "_long" variants into their base experiment.
    df["exp"] = df["exp"].apply(lambda x: x if not x.endswith("_long") else x[:-5])
    df = df[df["exp"].isin(list(print_labels.keys()))].copy()
    df["exp"] = df["exp"].apply(lambda x: print_labels[x])
    df["task"] = df["task"].apply(lambda x: print_task_labels.get(x, x))
    df["final_submitted_score"] = df[["final_score", "submitted_final_answer"]].apply(lambda x: x["final_score"] if x["final_score"] > 0 and x["submitted_final_answer"] else None, axis=1)
    df["final_score"] = df["final_score"].apply(lambda x: x if x > 0 else None)

    baseline = df[df["exp"] == "Baseline"][[ "task", "exp", "final_score"]].groupby(["task", "exp"]).mean().reset_index()
    # special baseline numbers
    baseline = _override_baseline(baseline, "imdb", 0.5)
    baseline = _override_baseline(baseline, "fathomnet", 1e-10)
    # Appended unconditionally; a task that already has a row keeps two.
    baseline = pd.concat([baseline, pd.DataFrame([{"task" : "spaceship-titanic", "exp" :"Baseline", "final_score": 0.5}])], ignore_index=True)
    baseline = pd.concat([baseline, pd.DataFrame([{"task" : "house-price", "exp" :"Baseline", "final_score": 1e10}])], ignore_index=True)
    baseline = pd.concat([baseline, pd.DataFrame([{"task" : "ogbn-arxiv", "exp" :"Baseline", "final_score": 0.3134}])], ignore_index=True)
    baseline = pd.concat([baseline, pd.DataFrame([{"task" : "vectorization", "exp" :"Baseline", "final_score": 6.1742}])], ignore_index=True)
    return df, baseline
def get_all_runs_results(df = None, baseline = None, print_labels = print_labels, print_task_labels = print_task_labels):
    """Add improvement-over-baseline columns to the runs table.

    Args:
        df: Runs table as returned by ``get_all_runs_eval``; loaded from
            there, together with ``baseline``, when either one is None.
        baseline: Baseline table with "task" and "final_score" columns.
        print_labels: Forwarded to ``get_all_runs_eval`` when loading.
        print_task_labels: Forwarded to ``get_all_runs_eval`` when loading.

    Returns:
        A copy of the rows whose task has a baseline, extended with
        max_score/min_score, increase/decrease, and the ``improve*`` /
        ``final_improve*`` columns produced by ``get_improvement``.
    """
    if df is None or baseline is None:
        df, baseline = get_all_runs_eval(print_labels = print_labels, print_task_labels = print_task_labels)

    # copy() so the assignments below do not write into a slice.
    df = df[df["task"].isin(baseline["task"].unique())].copy()

    def positive_extreme(scores, pick):
        # Non-positive scores are ignored when picking the extreme.
        positives = [s for s in scores if s > 0]
        return pick(positives) if positives else None

    def relative_to_baseline(row, column):
        score = row[column]
        if score is None:
            return None
        base = baseline[(baseline["task"] == row["task"])]["final_score"].values[0]
        return (score - base) / base

    df["max_score"] = df["score"].apply(lambda scores: positive_extreme(scores, max))
    df["min_score"] = df["score"].apply(lambda scores: positive_extreme(scores, min))
    df["increase"] = df[["max_score", "task"]].apply(lambda row: relative_to_baseline(row, "max_score"), axis=1)
    df["decrease"] = df[["min_score", "task"]].apply(lambda row: relative_to_baseline(row, "min_score"), axis=1)

    thresholds = [(5, 0.05), (10, 0.1), (15, 0.15), (20, 0.2), (30, 0.3)]
    # "" scores the best value seen at any point, "final_" the final score.
    for prefix in ["", "final_"]:
        df[f"{prefix}improve"] = get_improvement(df, baseline, None, prefix)
        for percent, thresh in thresholds:
            df[f"{prefix}improve_{percent}"] = get_improvement(df, baseline, thresh, prefix)

    # uncomment these to count tokens
    # df[["prompt_tokens", "completed_tokens", "tool_prompt_tokens", "tool_completed_tokens", "num_steps", "total_time"]] = df.apply((lambda row: estimate_tokens(row["path"])), axis=1, result_type="expand")
    # df['total_tokens'] = df["prompt_tokens"] + df["completed_tokens"] + df["tool_prompt_tokens"] + df["tool_completed_tokens"]
    return df
import seaborn as sns
from pandas.api.types import CategoricalDtype
# Bar colour for each experiment display label.
colors = {
    "GPT-4" : "#d62728",
    "Claude v1.0" : "#2ca02c",
    "AutoGPT" : "#9467bd",
    "React" : "#8c564b",
    "LangChain (React)" : "#e377c2",
    "Baseline" : "#7f7f7f",
}
def get_tradeoff_plot(df):
    """Annotate the current figure with token cost vs. success rate per experiment.

    Args:
        df: Runs table with the columns "task", "exp", "final_improve_10" and
            "total_tokens".

    Runs are sub-sampled per (task, exp) group, averaged per group and then
    per experiment. Each experiment label is placed at (mean tokens, mean
    success rate) on the current matplotlib axes.
    """
    def sample_and_mean(group):
        # Experiments whose label contains "GPT-4" are capped at 8 runs, others at 25.
        if "GPT-4" in group["exp"].values[0]:
            sample = group.sample(n=min(len(group), 8), random_state=1)
        else:
            sample = group.sample(n=min(len(group), 25), random_state=1)
        return sample.groupby(["task", "exp"]).mean().reset_index().drop(columns=["task", "exp"])

    grouped_df = df[["task", "exp", "final_improve_10", "total_tokens"]].groupby(["task", "exp"]).apply(sample_and_mean).round(4).reset_index()

    per_exp = grouped_df[["exp", "total_tokens", "final_improve_10"]].groupby("exp").mean()
    x = per_exp["total_tokens"].tolist()
    y = per_exp["final_improve_10"].tolist()
    # Take the labels from the grouped index so each one stays attached to
    # its own point even when an experiment is absent from df.
    labels = per_exp.index.tolist()

    # NOTE(review): only text labels are drawn here; no point markers.
    for label, x_pos, y_pos in zip(labels, x, y):
        plt.annotate(label, # this is the text
                     (x_pos, y_pos), # these are the coordinates to position the label
                     textcoords="offset points", # how to position the text
                     xytext=(0,10), # distance from text to points (x,y)
                     ha='center') # horizontal alignment can be left, right or center
    plt.xlim((-30000, 200000))
    plt.ylim((0, 0.3))
    plt.xlabel("Average Number of Tokens Spent")
    plt.ylabel("Average Success Rate")
def get_plot(df, column_name = "improve_5", titile = "Improvement of 5%", save_name = "improve_5", plot_tokens = False, plot_time = False):
    """Draw a per-task bar plot of one column and save it as a PDF.

    Args:
        df: Runs table with "task", "exp" and ``column_name`` columns.
        column_name: Column averaged per (task, exp) and plotted.
        titile: Accepted for the callers' sake; not used by this function.
        save_name: The figure is written to ``plots/{save_name}.pdf``.
        plot_tokens: Plot the raw values (no percentage scaling).
        plot_time: Divide the values by 60 and label the axis in minutes.

    Without ``plot_tokens`` or ``plot_time`` the averaged values are
    multiplied by 100.
    """
    def sample_and_mean(group):
        # Experiments whose label contains "GPT-4" are capped at 8 runs, others at 25.
        if "GPT-4" in group["exp"].values[0]:
            sample = group.sample(n=min(len(group), 8), random_state=1)
        else:
            sample = group.sample(n=min(len(group), 25), random_state=1)
        return sample.groupby(["task", "exp"]).mean().reset_index().drop(columns=["task", "exp"])

    grouped_df = df[["task", "exp", column_name]].groupby(["task", "exp"]).apply(sample_and_mean).round(4).reset_index()
    grouped_df.fillna(0, inplace=True)
    if plot_time:
        grouped_df[column_name] = grouped_df[column_name] / 60
    elif not plot_tokens:
        grouped_df[column_name] = grouped_df[column_name] * 100

    # Define the order
    # Several raw task names share a display label; CategoricalDtype rejects
    # duplicate categories, so de-duplicate while keeping first-seen order.
    task_order = list(dict.fromkeys(print_task_labels.values()))
    exp_order = ["GPT-4", "Claude v1.0", "AutoGPT", "LangChain (React)", "Baseline"]
    grouped_df['task'] = grouped_df['task'].astype(CategoricalDtype(categories=task_order, ordered=True))
    grouped_df['exp'] = grouped_df['exp'].astype(CategoricalDtype(categories=exp_order, ordered=True))
    palette = [colors[x] for x in exp_order]

    # Start a fresh figure when the current one already holds a plot, so
    # consecutive calls do not draw on top of each other.
    if plt.gcf().get_axes():
        plt.figure()
    # NOTE: seaborn >= 0.12 deprecates ``ci`` in favour of ``errorbar=("ci", 95)``.
    barplot = sns.barplot(x='task', y=column_name, hue='exp', data=grouped_df, palette=palette, ci=95)

    # Re-apply the current x-tick labels, rotated for readability.
    labels = [item.get_text() for item in barplot.get_xticklabels()]
    plt.xticks(range(len(labels)), labels, rotation=30)

    # Leave 10% headroom above the bars for the legend.
    y_low, y_high = plt.ylim()
    plt.ylim(y_low, y_high + (y_high - y_low) * 0.1)

    leg = barplot.get_legend()
    if leg is not None:
        for t in leg.texts:
            t.set_text(t.get_text().replace("Year=", ""))
    plt.legend(loc='upper center', fancybox=True, shadow=True, ncol=4)

    # Token plots keep the default y label; only time plots override it.
    if plot_time and not plot_tokens:
        plt.ylabel("Time (minutes)")

    os.makedirs("plots", exist_ok=True)
    plt.savefig(f"plots/{save_name}.pdf", bbox_inches='tight')
if __name__ == "__main__":
    # Build the results table once and render every figure from it.
    df = get_all_runs_results()

    get_plot(df, "improve_5", "Percentage of runs that improve objective by over 5% at any point", "improve_5")
    get_plot(df, "improve_10", "Percentage of runs that improve objective by over 10% at any point", "improve_10")

    get_plot(df, "final_improve_5", "Percentage of runs that improves objective by over 5% at the end", "final_improve_5")
    get_plot(df, "final_improve_10", "Percentage of runs that improves objective by over 10% at the end", "final_improve_10")
    get_plot(df, "final_improve_30", "Percentage of runs that improves objective by over 30% at the end", "final_improve_30")

    get_plot(df, "final_improve", "Average improvement in objective among runs that made a submission at the end.", "final_improve")
    # eq(True) treats a missing submission flag as "not submitted".
    submitted = df[df["submitted_final_answer"].eq(True)]
    get_plot(submitted, "final_improve", "Average improvement in objective among runs that made a final submission.", "final_improve_submitted")

    # These columns only exist when the token-counting lines in
    # get_all_runs_results are uncommented; skip the plots otherwise.
    if "total_tokens" in df.columns:
        get_plot(df, "total_tokens", "", "total_tokens", plot_tokens= True)
    if "total_time" in df.columns:
        get_plot(df, "total_time", "", "total_time",plot_time=True)