import glob
import os
import json
import copy
import re
import time

import tiktoken
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def estimate_tokens(path):
    """Estimate prompt/completion token counts, tool token counts, step count, and wall-clock time for one run."""
    enc = tiktoken.encoding_for_model("gpt-4")
    prompt_tokens = 0
    completed_tokens = 0
    num_steps = 0
    step_logs = path.replace("trace.json", "../agent_log/*.log")
    for file in glob.glob(step_logs):
        with open(file, "r") as f:
            content = f.read()
        if "langchain" not in file:
            prompts = re.findall(r"===================prompt=====================" + r"(.*?)" + r"===================.*?response.*?=====================", content, re.DOTALL)
            prompt_tokens += sum([len(enc.encode(p)) for p in prompts])
            completed = re.findall(r"===================.*?response.*?=====================" + r"(.*?)" + r"===================tokens=====================", content, re.DOTALL)
            completed_tokens += sum([len(enc.encode(p)) for p in completed])
        else:
            prompts = re.findall(r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m", content, re.DOTALL)
            prompt_tokens += sum([len(enc.encode(p)) for p in prompts])
            completed = re.findall(r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m", content, re.DOTALL)
            completed_tokens += sum([len(enc.encode(p)) for p in completed])
    num_steps = len(json.load(open(path, "r"))["steps"])
    try:
        total_time = float(open(path.replace("trace.json", "overall_time.txt"), "r").read())
    except:
        total_time = 0
    tool_step_logs = path.replace("trace.json", "tool_logs/*.log")
    tool_prompt_tokens = 0
    tool_completed_tokens = 0
    for file in glob.glob(tool_step_logs):
        with open(file, "r") as f:
            content = f.read()
        if "langchain" not in file:
            prompts = re.findall(r"===================prompt=====================" + r"(.*?)" + r"===================.*?response.*?=====================", content, re.DOTALL)
            tool_prompt_tokens += sum([len(enc.encode(p)) for p in prompts])
            completed = re.findall(r"===================.*?response.*?=====================" + r"(.*?)" + r"===================tokens=====================", content, re.DOTALL)
            tool_completed_tokens += sum([len(enc.encode(p)) for p in completed])
        else:
            prompts = re.findall(r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m", content, re.DOTALL)
            tool_prompt_tokens += sum([len(enc.encode(p)) for p in prompts])
            completed = re.findall(r"\x1B\[0m\n\n\x1B\[1m> Finished chain.\x1B\[0m\n\x1B\[32;1m\x1B\[1;3m" + r"(.*?)" + r"Prompt after formatting:\n\x1B\[32;1m\x1B\[1;3m", content, re.DOTALL)
            tool_completed_tokens += sum([len(enc.encode(p)) for p in completed])
    return prompt_tokens, completed_tokens, tool_prompt_tokens, tool_completed_tokens, num_steps, total_time


def oom_error(path):
    """Check whether a run hit a CUDA out-of-memory error."""
    log = path.replace("trace.json", "../log")
    main_log = path.replace("trace.json", "../agent_log/main_log")
    message = "CUDA out of memory"
    return (message in open(log, "r").read()) or (message in open(main_log, "r").read())


def mkl_error(path):
    """Check whether a run failed due to MKL/OpenBLAS threading issues."""
    log = path.replace("trace.json", "../log")
    main_log = path.replace("trace.json", "../agent_log/main_log")
    messages = ["rror: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.", "OpenBLAS blas_thread_init:"]
    return any([m in open(log, "r").read() for m in messages]) or any([m in open(main_log, "r").read() for m in messages])


def quota_error(path):
    """Check whether a run failed because the API quota was exhausted."""
    log = path.replace("trace.json", "error.txt")
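# Example (sketch, not part of the original script): inspecting a single run.
# The path below is hypothetical; each helper here takes the env_log/trace.json
# path of a run and derives the sibling log paths from it.
#
#   p = "final_exp_logs/no_retrieval/cifar10_training/0/env_log/trace.json"
#   prompt_toks, completed_toks, tool_prompt_toks, tool_completed_toks, steps, seconds = estimate_tokens(p)
#   print(steps, prompt_toks + completed_toks, oom_error(p), quota_error(p))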
path.replace("trace.json", "error.txt") if os.path.exists(log): message = "RemoteServiceError: EXCEPTION: total quota" return message in open(log, "r").read() return False def connection_error(path): log = path.replace("trace.json", "../log") main_log = path.replace("trace.json", "../agent_log/main_log") bad = ["You exceeded your current quota, please check your plan and billing details.", "Error: 'text-similarity-ada-001'", "Error: 'text-embedding-ada-001'"] return ("Connection aborted" in open(log, "r").read()) or (any([b in open(main_log, "r").read() for b in bad])) def langchain_error(path): if os.path.exists(os.path.join(path.replace("trace.json", ""), "error.txt")): return "langchain.schema.OutputParserException" in open(os.path.join(path.replace("trace.json", ""), "error.txt"), "r").read() return False def error(path): return (os.path.exists(os.path.join(path.replace("trace.json", ""), "error.txt")) and not langchain_error(path)) or not os.path.exists(os.path.join(path.replace("trace.json", ""), "overall_time.txt")) def json_error(path): main_log = path.replace("trace.json", "../agent_log/main_log") return open(main_log, "r").read().count("JSONDecodeError") > 2 def langchain_final(path): return "Final Answer" in open(path.replace("trace.json", "../agent_log/main_log"), "r").read() def autogpt_final(path): return "Goal achieved" in open(path.replace("trace.json", "../agent_log/main_log"), "r").read() def long_prompt_error(path): main_log = path.replace("trace.json", "../agent_log/main_log") return "EnvError: too long input for the tool" in open(main_log, "r").read() def get_all_runs_with_log(): #TODO: fix paths to where your trace.json are all_runs.extend(glob.glob("/lfs/local/0/qhwang/nlp_logs/final_exp_logs*/*/*/*/env_log/trace.json")) df = pd.DataFrame() for r in all_runs: exp, task, run = r.split("/")[-5:-2] if task in os.listdir("../research_assistant_final/MLAgentBench/benchmarks"): new_row={"task": task, "exp": exp, "run": run, "path": r} df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) df["error"] = df["path"].apply(error) df["json_error"] = df["path"].apply(json_error) df["long_prompt_error"] = df["path"].apply(long_prompt_error) df["oom_error"] = df["path"].apply(oom_error) df["connection_error"] = df["path"].apply(connection_error) df['mkl_error'] = df["path"].apply(mkl_error) df['quota_error'] = df["path"].apply(quota_error) df["langchain_error"] = df["path"].apply(langchain_error) df_no_error = df[(((~df["error"]) & (~df["connection_error"])) | df["exp"].isin(["no_retrieval_gpt4", "full_gpt4_long"]) | (df["exp"].isin(["langchain", "langchain_long"]) & df["langchain_error"]) )& (~df["oom_error"]) & (~df["mkl_error"])] return df , df_no_error lower_the_better_tasks = [ "parkinsons-disease", "feedback", "BabyLM", "llama-inference", "house-price", "vectorization"] # TODO: add propoer label mapping and task name mapping for pretty printing in the figure print_labels = { "no_retrieval_gpt4" : "GPT-4", "no_retrieval" : "Claude v1.0", "autogpt" : "AutoGPT", "react" : "React", "langchain" : "LangChain (React)", "sanity_check" : "Baseline" } print_task_labels = { "cifar10_training" : "cifar10", "imdb" : "imdb", "ogbn-arxiv" : "ogbn-arxiv", "home-data-for-ml-course" : "house-price", "kaggle_training_reg" : "house-price", "kaggle_training_class" : "spaceship-titanic", "amp-parkinsons-disease-progression-prediction" : "parkinsons-disease", "fathomnet-out-of-sample-detection" : "fathomnet", "feedback-prize-english-language-learning" : "feedback", 
"google-research-identify-contrails-reduce-global-warming" : "identify-contrails", "speed-up" : "llama-inference", "vectorisation" : "vectorization", "CLRS" : "CLRS", "babylm" : "BabyLM" } def get_improvement(df, baseline, thresh = None, prefix=""): if prefix: df[f"{prefix}increase"] = df[[f"{prefix}score", "task"]].apply(lambda x: (x[f"{prefix}score"] - baseline[(baseline["task"] == x["task"])]["final_score"].values[0])/baseline[(baseline["task"] == x["task"])]["final_score"].values[0] if x[f"{prefix}score"] is not None else None, axis=1) df[f"{prefix}decrease"] = df[[f"{prefix}score", "task"]].apply(lambda x: (x[f"{prefix}score"] - baseline[(baseline["task"] == x["task"])][f"final_score"].values[0])/baseline[(baseline["task"] == x["task"])]["final_score"].values[0] if x[f"{prefix}score"] is not None else None, axis=1) if thresh: return df[["task", f"{prefix}increase", f"{prefix}decrease"]].apply(lambda x: (x[f"{prefix}increase"] > thresh if x["task"] not in lower_the_better_tasks else x[f"{prefix}decrease"] < - thresh) if x[f"{prefix}increase"] is not None else False, axis=1) else: return df[["task", f"{prefix}increase", f"{prefix}decrease"]].apply(lambda x: (x[f"{prefix}increase"] if x["task"] not in lower_the_better_tasks else - x[f"{prefix}decrease"]) if x[f"{prefix}increase"] is not None else None, axis=1) # performance def get_all_runs_eval(print_labels = print_labels, print_task_labels = print_task_labels): # TODO: collect all evaluation jsons into all_results all_results = {} for f in glob.glob("/lfs/local/0/qhwang/nlp_logs/*.json"): all_results.update(json.load(open(f, "r"))) df = pd.DataFrame() for n, results in all_results.items(): if n.endswith(".json"): n=n.split("/env_log")[0] results = {n: results} exp, task, run = n.split("/")[-3:] exp = exp.strip() if exp == "react": continue task = task.strip() run = run.strip() for source_file, r in results.items(): r_ = copy.deepcopy(r) if len(r["score"]) < len(r["score_steps"])+1: r_["score"].append(r["final_score"]) r_["score_steps"].append(len(json.load(open(r_["path"], "r"))["steps"])) r_["score"] = np.array(r_["score"]) r_["score_steps"] = np.array(r_["score_steps"]) if exp == "no_retrieval": r_["score"] = r_["score"][r_["score_steps"] < 16] r_["score_steps"] = r_["score_steps"][r_["score_steps"] < 16] if exp == "langchain": r_["submitted_final_answer"] = langchain_final(r_["path"]) if exp == "autogpt": r_["submitted_final_answer"] = autogpt_final(r_["path"]) new_row={"task": task, "exp": exp, "run": run, **r_} df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) df["connection_error"] = df["path"].apply(connection_error) df["has_error"] = df["path"].apply(error) df["oom_error"] = df["path"].apply(oom_error) df["mkl_error"] = df["path"].apply(mkl_error) df["langchain_error"] = df["path"].apply(langchain_error) print(len(df[(df["error"] != "") | (df["connection_error"] == True)])) df = df[(((~df["has_error"]) & (df["connection_error"] == False)) | df["exp"].isin(["no_retrieval_gpt4", "full_gpt4_long"])| (df["exp"].isin(["langchain", "langchain_long"]) & df["langchain_error"]) ) & (~df["oom_error"]) & (~df["mkl_error"])] df["exp"] = df["exp"].apply(lambda x: x if not x.endswith("_long") else x[:-5]) df = df[df["exp"].isin(list(print_labels.keys()))] df["exp"] = df["exp"].apply(lambda x: print_labels[x]) df["task"] = df["task"].apply(lambda x: print_task_labels.get(x, x)) df["final_submitted_score"] = df[["final_score", "submitted_final_answer"]].apply(lambda x: x["final_score"] if x["final_score"] > 0 and 
x["submitted_final_answer"] else None, axis=1) df["final_score"] = df["final_score"].apply(lambda x: x if x > 0 else None) baseline = df[df["exp"] == "Baseline"][[ "task", "exp", "final_score"]].groupby(["task", "exp"]).mean().reset_index() # special baseline numbers try: baseline.at[baseline[baseline["task"] == "imdb"].index.values[0], "final_score"] = 0.5 baseline.at[baseline[baseline["task"] == "fathomnet"].index.values[0], "final_score"] = 1e-10 except: baseline = pd.concat( [ baseline, pd.DataFrame( [{"task": "imdb", "exp": "Baseline", "final_score": 0.5}] ), ], ignore_index=True, ) baseline = pd.concat( [ baseline, pd.DataFrame( [{"task": "fathomnet", "exp": "Baseline", "final_score": 1e-10}] ), ], ignore_index=True, ) baseline = pd.concat([baseline, pd.DataFrame([{"task" : "spaceship-titanic", "exp" :"Baseline", "final_score": 0.5}])], ignore_index=True) baseline = pd.concat([baseline, pd.DataFrame([{"task" : "house-price", "exp" :"Baseline", "final_score": 1e10}])], ignore_index=True) baseline = pd.concat([baseline, pd.DataFrame([{"task" : "ogbn-arxiv", "exp" :"Baseline", "final_score": 0.3134}])], ignore_index=True) baseline = pd.concat([baseline, pd.DataFrame([{"task" : "vectorization", "exp" :"Baseline", "final_score": 6.1742}])], ignore_index=True) return df, baseline def get_all_runs_results(df = None, baseline = None, print_labels = print_labels, print_task_labels = print_task_labels): if df is None or baseline is None: df, baseline = get_all_runs_eval(print_labels = print_labels, print_task_labels = print_task_labels) df[df["final_score"] > -1]["task"].unique() df = df[df["task"].isin(baseline["task"].unique())] df["max_score"] = df["score"].apply(lambda x: max(list(filter(lambda a: a > 0, x))) if len(list(filter(lambda a: a > 0, x))) > 0 else None) df["min_score"] = df["score"].apply(lambda x: min(list(filter(lambda a: a > 0, x))) if len(list(filter(lambda a: a > 0, x))) > 0 else None) df["increase"] = df[["max_score", "task"]].apply(lambda x: (x["max_score"] - baseline[(baseline["task"] == x["task"])]["final_score"].values[0])/baseline[(baseline["task"] == x["task"])]["final_score"].values[0] if x["max_score"] is not None else None, axis=1) df["decrease"] = df[["min_score", "task"]].apply(lambda x: (x["min_score"] - baseline[(baseline["task"] == x["task"])]["final_score"].values[0])/baseline[(baseline["task"] == x["task"])]["final_score"].values[0] if x["min_score"] is not None else None, axis=1) print(time.time()) df["improve"] = get_improvement(df, baseline) df["improve_5"] = get_improvement(df, baseline, 0.05) df["improve_10"] = get_improvement(df, baseline, 0.1) df["improve_15"] = get_improvement(df, baseline, 0.15) df["improve_20"] = get_improvement(df, baseline, 0.2) df["improve_30"] = get_improvement(df, baseline, 0.3) for prefix in ["final_"]: df[f"{prefix}improve"] = get_improvement(df, baseline, None, prefix) df[f"{prefix}improve_5"] = get_improvement(df, baseline, 0.05, prefix) df[f"{prefix}improve_10"] = get_improvement(df, baseline, 0.1, prefix) df[f"{prefix}improve_15"] = get_improvement(df, baseline, 0.15, prefix) df[f"{prefix}improve_20"] = get_improvement(df, baseline, 0.2, prefix) df[f"{prefix}improve_30"] = get_improvement(df, baseline, 0.3, prefix) print(time.time()) # uncomment these to count tokens # df[["prompt_tokens", "completed_tokens", "tool_prompt_tokens", "tool_completed_tokens", "num_steps", "total_time"]] = df.apply((lambda row: estimate_tokens(row["path"])), axis=1, result_type="expand") # df['total_tokens'] = df["prompt_tokens"] + 
df["completed_tokens"] + df["tool_prompt_tokens"] + df["tool_completed_tokens"] print(time.time()) return df import seaborn as sns from pandas.api.types import CategoricalDtype colors = { "GPT-4" : "#d62728", "Claude v1.0" : "#2ca02c", "AutoGPT" : "#9467bd", "React" : "#8c564b", "LangChain (React)" : "#e377c2", "Baseline" : "#7f7f7f" } def get_tradeoff_plot(df): def sample_and_mean(group): if "GPT-4" in group["exp"].values[0]: sample = group.sample(n=min(len(group), 8), random_state=1) else: sample = group.sample(n=min(len(group), 25), random_state=1) return sample.groupby(["task", "exp"]).mean().reset_index().drop(columns=["task", "exp"]) grouped_df = df[["task", "exp", "final_improve_10", "total_tokens"]].groupby(["task", "exp"]).apply(sample_and_mean).round(4).reset_index() x = grouped_df[["total_tokens","exp"]].groupby([ "exp"]).mean().values.flatten().tolist() y = grouped_df[["final_improve_10","exp"]].groupby([ "exp"]).mean().values.flatten().tolist() labels = ["AutoGPT", "Baseline", "Claude v1.0", "GPT-4", "LangChain (React)"] plt.figure() plt.scatter(x,y) for i in range(len(x)): plt.annotate(labels[i], # this is the text (x[i], y[i]), # these are the coordinates to position the label textcoords="offset points", # how to position the text xytext=(0,10), # distance from text to points (x,y) ha='center') # horizontal alignment can be left, right or center plt.xlim((-30000, 200000)) plt.ylim((0, 0.3)) # plt.show() plt.xlabel("Average Nsumber of Tokens Spent") plt.ylabel("Average Success Rate") plt.savefig("plots/tradeoff.pdf") def get_plot(df, column_name = "improve_5", titile = "Improvement of 5%", save_name = "improve_5", plot_tokens = False, plot_time = False): def sample_and_mean(group): if "GPT-4" in group["exp"].values[0]: sample = group.sample(n=min(len(group), 8), random_state=1) else: sample = group.sample(n=min(len(group), 25), random_state=1) return sample.groupby(["task", "exp"]).mean().reset_index().drop(columns=["task", "exp"]) grouped_df = df[["task", "exp", column_name]].groupby(["task", "exp"]).apply(sample_and_mean).round(4).reset_index() grouped_df.fillna(0, inplace=True) if plot_time: grouped_df[column_name] = grouped_df[column_name] / 60 elif not plot_tokens: grouped_df[column_name] = grouped_df[column_name] * 100 # Define the order task_order = list(print_task_labels.values()) task_order.remove("house-price") exp_order = ["GPT-4", "Claude v1.0", "AutoGPT", "LangChain (React)", "Baseline"] cat_type = CategoricalDtype(categories=task_order, ordered=True) grouped_df['task'] = grouped_df['task'].astype(cat_type) cat_type = CategoricalDtype(categories=exp_order, ordered=True) grouped_df['exp'] = grouped_df['exp'].astype(cat_type) plt.figure(figsize=(10,6)) palette = [colors[x] for x in exp_order] barplot = sns.barplot(x='task', y=column_name, hue='exp', data=grouped_df, palette=palette, ci=95) print(titile) # Get the current x-tick labels labels = [item.get_text() for item in barplot.get_xticklabels()] # Modify the labels new_labels = labels # [ l.split("_")[0].split("-")[0] for l in labels] # Set the new labels plt.xticks(range(len(labels)), new_labels, rotation=30) plt.ylim(plt.ylim()[0], plt.ylim()[1] + (plt.ylim()[1]-plt.ylim()[0]) * 0.1) leg = barplot.get_legend() leg.set_title(None) for t in leg.texts: t.set_text(t.get_text().replace("Year=", "")) plt.legend(loc='upper center', fancybox=True, shadow=True, ncol=4) plt.xlabel("Task") if plot_tokens: plt.ylabel("Tokens") elif plot_time: plt.ylabel("Time (minutes)") else: plt.ylabel("Percentage") 
plt.savefig(f"plots/{save_name}.pdf", bbox_inches='tight') plt.show() if __name__ == "__main__": df = get_all_runs_results() get_plot(df, "improve_5", "Percentage of runs that improve objective by over 5% at any point", "improve_5") get_plot(df, "improve_10", "Percentage of runs that improve objective by over 10% at any point", "improve_10") get_plot(df, "final_improve_5", "Percentage of runs that improves objective by over 5% at the end", "final_improve_5") get_plot(df, "final_improve_10", "Percentage of runs that improves objective by over 10% at the end", "final_improve_10") get_plot(df, "final_improve_30", "Percentage of runs that improves objective by over 30% at the end", "final_improve_30") get_plot(df, "final_improve", "Average improvement in objective among runs that made a submission at the end.", "final_improve") get_plot(df[df["submitted_final_answer"]], "final_improve", "Average improvement in objective among runs that made a final submission.", "final_improve_submitted") get_plot(df, "total_tokens", "", "total_tokens", plot_tokens= True) get_plot(df, "total_time", "", "total_time",plot_time=True)