#!/usr/bin/env python
# coding=utf-8

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .agents import BASE_PYTHON_TOOLS, clean_code_for_chat, clean_code_for_run
from .python_interpreter import InterpretorError, evaluate


### Fake tools for testing
def classifier(text, labels):
    return f"This is the classification of {text} along {labels}."


def translator(text, src_lang, tgt_lang):
    return f"This is the translation of {text} from {src_lang} to {tgt_lang}."


def speaker(text):
    return f"This is actually a sound reading {text}."


def transcriber(audio):
    if "sound" not in audio:
        raise ValueError(f"`audio` ({audio}) is not a sound.")
    return f"This is the transcribed text from {audio}."


def image_generator(prompt):
    return f"This is actually an image representing {prompt}."


def image_captioner(image):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is a description of {image}."


def image_transformer(image, prompt):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is a transformation of {image} according to {prompt}."


def question_answerer(text, question):
    return f"This is the answer to {question} from {text}."


def image_qa(image, question):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is the answer to {question} from {image}."


def text_downloader(url):
    return f"This is the content of {url}."


def summarizer(text):
    return f"This is a summary of {text}."


def video_generator(prompt, seconds=2):
    return f"A video of {prompt}"


def document_qa(image, question):
    return f"This is the answer to {question} from the document {image}."


def image_segmenter(image, prompt):
    return f"This is the mask of {prompt} in {image}"


TEST_TOOLS = {
    "text_classifier": classifier,
    "translator": translator,
    "text_reader": speaker,
    "summarizer": summarizer,
    "transcriber": transcriber,
    "image_generator": image_generator,
    "image_captioner": image_captioner,
    "image_transformer": image_transformer,
    "text_qa": question_answerer,
    "text_downloader": text_downloader,
    "image_qa": image_qa,
    "video_generator": video_generator,
    "document_qa": document_qa,
    "image_segmenter": image_segmenter,
}
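
# Illustrative sanity check (assuming the fake tools above): composing two of
# them nests their deterministic description strings, e.g.
#   image_captioner(image_generator("a beaver"))
#   # -> "This is a description of This is actually an image representing a beaver.."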


class Problem:
    """
    A class grouping all the information needed to solve a problem on which we will evaluate agents.

    Args:
        task (`str` or `list[str]`):
            One or several descriptions of the task to perform. If a list, it should contain variations on the
            phrasing, but for the same task.
        inputs (`list[str]` or `dict[str, str]`):
            The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
            values. Pass a dictionary when you want to specify the value of each input, or just the list of
            expected inputs (in which case the value used will be `<<input_name>>`).
        answer (`str` or `list[str]`):
            The theoretical answer (or list of possible valid answers) to the problem, as code.
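
    Example (an illustrative instance, using the fake test tools above):

    ```py
    problem = Problem(
        task="Caption the `image` out loud.",
        inputs=["image"],
        answer="text_reader(image_captioner(image))",
    )
    ```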
    """

    def __init__(self, task, inputs, answer):
        self.task = task
        self.inputs = inputs
        self.answer = answer


### The list of problems the agent will be evaluated on.
EVALUATION_TASKS = [
    Problem(
        task=[
            "Is the following `text` (in Spanish) positive or negative?",
            "Is the text in the variable `text` (in Spanish) positive or negative?",
            "Translate the following `text` from Spanish to English then tell me if its positive or negative.",
        ],
        inputs=["text"],
        answer="""text_classifier(translator(text, src_lang="Spanish", tgt_lang="English"), labels=["positive", "negative"])""",
    ),
    Problem(
        task=[
            "Tell me out loud what the `image` contains.",
            "Describe the following `image` out loud.",
            "Find what is in the picture stored in `image` then read it out loud.",
        ],
        inputs=["image"],
        answer=[
            "text_reader(image_captioner(image))",
            "text_reader(image_qa(image, question='What is in the image?'))",
        ],
    ),
    Problem(
        task=[
            "Generate an image from the text given in `text_input`. Then transform it according to the text in `prompt`.",
            "Use the following `text_input` to generate an image, then transform it by using the text in `prompt`.",
        ],
        inputs=["text_input", "prompt"],
        answer="image_transformer(image_generator(text_input), prompt)",
    ),
    Problem(
        task=[
            "Download the content of `url`, summarize it then generate an image from its content.",
            "Use a summary of the web page at `url` to generate an image.",
            "Summarize the content of the web page at `url`, and use the result to generate an image.",
        ],
        inputs=["url"],
        answer="image_generator(summarizer(text_downloader(url)))",
    ),
    Problem(
        task=[
            "Transform the following `image` using the prompt in `text`. The prompt is in Spanish.",
            "Use the text prompt in `text` (in Spanish) to transform the following `image`.",
            "Translate the `text` from Spanish to English then use it to transform the picture in `image`.",
        ],
        inputs=["text", "image"],
        answer="image_transformer(image, translator(text, src_lang='Spanish', tgt_lang='English'))",
    ),
    Problem(
        task=[
            "Download the content of `url`, summarize it then read it out loud to me.",
            "Read me a summary of the web page at `url`.",
        ],
        inputs=["url"],
        answer="text_reader(summarizer(text_downloader(url)))",
    ),
    Problem(
        task=[
            "Generate an image from the text given in `text_input`.",
        ],
        inputs=["text_input"],
        answer="image_generator(text_input)",
    ),
    Problem(
        task=[
            "Replace the beaver in the `image` by the `prompt`.",
            "Transform the `image` so that it contains the `prompt`.",
            "Use `prompt` to transform this `image`.",
        ],
        inputs=["image", "prompt"],
        answer="image_transformer(image, prompt)",
    ),
    Problem(
        task=[
            "Provide me the summary of the `text`, then read it to me before transcribing it and translating it in French.",
            "Summarize `text`, read it out loud then transcribe the audio and translate it in French.",
            "Read me a summary of the the `text` out loud. Transcribe this and translate it in French.",
        ],
        inputs=["text"],
        answer="translator(transcriber(text_reader(summarizer(text))), src_lang='English', tgt_lang='French')",
    ),
    Problem(
        task=["Generate a video of the `prompt`", "Animate a `prompt`", "Make me a short video using `prompt`."],
        inputs={"prompt": "A lobster swimming"},
        answer="video_generator('A lobster swimming')",
    ),
    Problem(
        task=[
            "Download the following file `url`, summarize it in a few words and generate a video from it."
            "Fetch the file at this `url`, summarize it, and create an animation out of it."
        ],
        inputs=["url"],
        answer="video_generator(summarizer(text_downloader(url)))",
    ),
]
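
# Note on `inputs` above: list-style inputs are filled with placeholder values
# such as "<<text>>" by `evaluate_code`, while dict-style inputs (e.g. the video
# task) pin exact values so the canonical answer can inline them.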


EVALUATION_CHATS = [
    [
        Problem(
            task=[
                "Translate the following `text` from Spanish to English.",
                "Translate the following `text` from Spanish to English.",
            ],
            inputs=["text"],
            answer="translated_text=translator(text, src_lang='Spanish', tgt_lang='English')",
        ),
        Problem(
            task=[
                "Is it positive or negative?",
                "Tell me if its positive or negative.",
            ],
            inputs=[],
            answer="text_classifier(translated_text, labels=['positive', 'negative'])",
        ),
    ],
    [
        Problem(
            task=[
                "What does this `image` contain?",
                "Describe the following `image`.",
                "Find what is in the picture stored in `image`",
            ],
            inputs=["image"],
            answer=[
                "description=image_captioner(image)",
                "description=image_qa(image, question='What is in the image?')",
            ],
        ),
        Problem(
            task=["Now, read the description out loud.", "Great! Can you read it out loud?", "Read it out loud."],
            inputs=[],
            answer=["audio=text_reader(description)", "audio=text_reader(description)"],
        ),
    ],
    [
        Problem(
            task=[
                "Generate an image from the text given in `text_input`.",
                "Use the following `text_input` to generate an image",
            ],
            inputs=["text_input"],
            answer="image = image_generator(text_input)",
        ),
        Problem(
            task=[
                "Transform it according to the text in `prompt`.",
                "Transform it by using the text in `prompt`.",
            ],
            inputs=["prompt"],
            answer="image_transformer(image, prompt)",
        ),
    ],
    [
        Problem(
            task=[
                "Download the content of `url` and summarize it.",
                "Summarize the content of the web page at `url`.",
            ],
            inputs=["url"],
            answer="summary = summarizer(text_downloader(url))",
        ),
        Problem(
            task=[
                "Generate an image from its content.",
                "Use the previous result to generate an image.",
            ],
            inputs=[],
            answer="image_generator(summary)",
        ),
    ],
    [
        Problem(
            task=[
                "Translate this Spanish `text` in English.",
                "Translate the `text` from Spanish to English.",
            ],
            inputs=["text"],
            answer="translated_text = translator(text, src_lang='Spanish', tgt_lang='English')",
        ),
        Problem(
            task=[
                "Transform the following `image` using the translated `text`.",
                "Use the previous result to transform the following `image`.",
            ],
            inputs=["image"],
            answer="image_transformer(image, translated_text)",
        ),
    ],
    [
        Problem(
            task=["Download the content of `url`.", "Get me the text on the weg page `url`."],
            inputs=["url"],
            answer="text = text_downloader(url)",
        ),
        Problem(
            task=["Summarize this text.", "Summarize this text."],
            inputs=[],
            answer="summary = summarizer(text)",
        ),
        Problem(
            task=["Read it out loud to me.", "Read me the previous result."],
            inputs=[],
            answer="text_reader(summary)",
        ),
    ],
    [
        Problem(
            task=[
                "Generate an image from the text given in `text_input`.",
            ],
            inputs=["text_input"],
            answer="image_generator(text_input)",
        ),
    ],
    [
        Problem(
            task=[
                "Replace the beaver in the `image` by the `prompt`.",
                "Transform the `image` so that it contains the `prompt`.",
                "Use `prompt` to transform this `image`.",
            ],
            inputs=["image", "prompt"],
            answer="image_transformer(image, prompt)",
        ),
    ],
    [
        Problem(
            task=["Provide me the summary of the `text`.", "Summarize `text`."],
            inputs=["text"],
            answer="summary = summarizer(text)",
        ),
        Problem(
            task=["Read this summary to me.", "Read it out loud."],
            inputs=[],
            answer="audio = text_reader(summarizer(text))",
        ),
        Problem(
            task=["Transcribing the previous result back in text.", "Transcribe the audio."],
            inputs=[],
            answer="text = transcriber(audio)",
        ),
        Problem(
            task=["Translating the last result in French.", "Translate this in French."],
            inputs=[],
            answer="translator(text, src_lang='English', tgt_lang='French')",
        ),
    ],
    [
        Problem(
            task=["Generate a video of the `prompt`", "Animate a `prompt`", "Make me a short video using `prompt`."],
            inputs={"prompt": "A lobster swimming"},
            answer="video_generator('A lobster swimming')",
        ),
    ],
    [
        Problem(
            task=[
                "Download the content of `url` and summarize it.",
                "Summarize the content of the web page at `url`.",
            ],
            inputs=["url"],
            answer="summary = summarizer(text_downloader(url))",
        ),
        Problem(
            task=["generate a video from it.", "Create an animation from the last result."],
            inputs=[],
            answer="video_generator(summary)",
        ),
    ],
]
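
# Each chat above is resolved into one conversation per task phrasing, so every
# `Problem` in a given chat is expected to provide the same number of task
# variations (see `evaluate_chat_agent` below).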


def get_theoretical_tools(agent_answer, theoretical_answer, code_answer):
    if not isinstance(theoretical_answer, list):
        return {name for name in TEST_TOOLS if name in code_answer}

    if isinstance(agent_answer, dict):
        for one_answer, one_code in zip(theoretical_answer, code_answer):
            if one_answer in agent_answer.values():
                return {name for name in TEST_TOOLS if name in one_code}

    for one_answer, one_code in zip(theoretical_answer, code_answer):
        if agent_answer == one_answer:
            return {name for name in TEST_TOOLS if name in one_code}

    return {name for name in TEST_TOOLS if name in code_answer[0]}
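
# Note: tool detection in `get_theoretical_tools` (and in `evaluate_one_result`
# below) is substring-based over the code text, e.g.
#   {name for name in TEST_TOOLS if name in "text_reader(summarizer(text))"}
#   # -> {"text_reader", "summarizer"}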


def evaluate_code(code, inputs=None, state=None, verbose=False, return_interpretor_error=False):
    tools = BASE_PYTHON_TOOLS.copy()
    for name, tool in TEST_TOOLS.items():
        if name not in code:
            continue
        tools[name] = tool

    if isinstance(inputs, dict):
        inputs = inputs.copy()
    elif inputs is not None:
        inputs = {inp: f"<<{inp}>>" for inp in inputs}

    if state is None:
        state = {} if inputs is None else inputs
    elif inputs is not None:
        state.update(inputs)

    try:
        return evaluate(code, tools, state)
    except InterpretorError as e:
        return str(e)
    except Exception as e:
        if verbose:
            print(e)
        return None


def score_code(agent_answer, theoretical_answer, verbose: bool = False):
    if verbose:
        print(agent_answer, theoretical_answer)
    theoretical_answer = theoretical_answer if isinstance(theoretical_answer, list) else [theoretical_answer]

    if agent_answer in theoretical_answer:
        if verbose:
            print("Perfect!")
        return 1
    elif isinstance(agent_answer, dict) and any(v in theoretical_answer for v in agent_answer.values()):
        if verbose:
            print("Almsot perfect, result in state!")
        return 0.75
    else:
        if verbose:
            print("Result is not the right one but code executed.")
        return 0.3
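
# Scoring scale used by `score_code`: 1.0 for an exact match with a canonical
# answer, 0.75 when the expected result is found among the values of the
# returned state dict, and 0.3 when the code executed but gave another result.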


def evaluate_one_result(explanation, code, agent_answer, theoretical_answer, answer, verbose=False):
    tools_in_explanation = {name for name in TEST_TOOLS if f"`{name}`" in explanation}
    theoretical_tools = get_theoretical_tools(agent_answer, theoretical_answer, answer)
    if tools_in_explanation == theoretical_tools:
        tool_selection_score = 1.0
        tool_selection_errors = None
    else:
        missing_tools = len(theoretical_tools - tools_in_explanation)
        unexpected_tools = len(tools_in_explanation - theoretical_tools)
        tool_selection_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)

        tool_selection_errors = {
            "selected_tools": tools_in_explanation,
            "theoretical_tools": theoretical_tools,
        }

    tools_in_code = {name for name in TEST_TOOLS if name in code}
    if tools_in_code == theoretical_tools:
        tool_used_score = 1.0
        tool_used_errors = None
    else:
        missing_tools = len(theoretical_tools - tools_in_code)
        unexpected_tools = len(tools_in_code - theoretical_tools)
        tool_used_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)

        tool_used_errors = {
            "selected_tools": tools_in_explanation,
            "theoretical_tools": theoretical_tools,
        }

    score = score_code(agent_answer, theoretical_answer, verbose=verbose)
    if score < 1.0:
        code_errors = {
            "code_produced": code,
            "evaluation": agent_answer,
            "theoretical_answer": theoretical_answer,
        }
    else:
        code_errors = None

    return (tool_selection_score, tool_used_score, score), (tool_selection_errors, tool_used_errors, code_errors)
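
# Shape of `evaluate_one_result`'s return value (hypothetical perfect run):
#   ((1.0, 1.0, 1), (None, None, None))
# i.e. (tool selection, tool usage, code) scores, followed by the corresponding
# error dicts (each None when its sub-score is perfect).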


def evaluate_agent(agent, batch_size=8, verbose=False, return_errors=False):
    """
    Evaluates an agent on all `EVALUATION_TASKS`.

    Example:

    ```py
    agent = OpenAiAgent(model="text-davinci-003", api_key=your_api_key)
    scores = evaluate_agent(agent)
    print(scores)
    ```
    """
    # Sanity check
    agent_tools = set(agent.toolbox.keys())
    if agent_tools != set(TEST_TOOLS):
        missing_tools = set(TEST_TOOLS) - agent_tools
        unexpected_tools = agent_tools - set(TEST_TOOLS)
        raise ValueError(
            f"Fix the test tools in the evaluate_agent module. Tools missing: {missing_tools}. Extra tools: {unexpected_tools}."
        )

    eval_tasks = []
    eval_idx = []
    for idx, pb in enumerate(EVALUATION_TASKS):
        if isinstance(pb.task, list):
            eval_tasks.extend(pb.task)
            eval_idx.extend([idx] * len(pb.task))
        else:
            eval_tasks.append(pb.task)
            eval_idx.append(idx)

    tool_selection_score = 0
    tool_used_score = 0
    code_score = 0

    if return_errors:
        tool_selection_errors = {}
        tool_used_errors = {}
        code_errors = {}

    for start_idx in range(0, len(eval_tasks), batch_size):
        end_idx = min(start_idx + batch_size, len(eval_tasks))
        batch_tasks = eval_tasks[start_idx:end_idx]

        prompts = [agent.format_prompt(task) for task in batch_tasks]
        results = agent.generate_many(prompts, stop=["Task:"])

        for idx, result in enumerate(results):
            problem = EVALUATION_TASKS[eval_idx[start_idx + idx]]
            if verbose:
                print(f"====Task {start_idx + idx}====\n{batch_tasks[idx]}\n")
            explanation, code = clean_code_for_run(result)

            # Evaluate agent answer and code answer
            agent_answer = evaluate_code(code, problem.inputs, verbose=verbose)
            if isinstance(problem.answer, list):
                theoretical_answer = [evaluate_code(answer, problem.inputs) for answer in problem.answer]
            else:
                theoretical_answer = evaluate_code(problem.answer, problem.inputs)

            scores, errors = evaluate_one_result(
                explanation, code, agent_answer, theoretical_answer, problem.answer, verbose=verbose
            )

            tool_selection_score += scores[0]
            tool_used_score += scores[1]
            code_score += scores[2]

            if return_errors:
                if errors[0] is not None:
                    tool_selection_errors[batch_tasks[idx]] = errors[0]
                if errors[1] is not None:
                    tool_used_errors[batch_tasks[idx]] = errors[1]
                if errors[2] is not None:
                    code_errors[batch_tasks[idx]] = errors[2]

    scores = {
        "tool selection score": 100 * (tool_selection_score / len(eval_tasks)),
        "tool used score": 100 * (tool_used_score / len(eval_tasks)),
        "code score": 100 * (code_score / len(eval_tasks)),
    }

    if return_errors:
        return scores, tool_selection_errors, tool_used_errors, code_errors
    else:
        return scores


def evaluate_chat_agent(agent, verbose=False, return_errors=False):
    """
    Evaluates an agent on all `EVALUATION_CHATS`.

    Example:

    ```py
    agent = OpenAiAgent(model="text-davinci-003", api_key=your_api_key)
    scores = evaluate_chat_agent(agent)
    print(scores)
    ```
    """
    # Sanity check
    agent_tools = set(agent.toolbox.keys())
    if agent_tools != set(TEST_TOOLS):
        missing_tools = set(TEST_TOOLS) - agent_tools
        unexpected_tools = agent_tools - set(TEST_TOOLS)
        raise ValueError(
            f"Fix the test tools in the evaluate_agent module. Tools mising: {missing_tools}. Extra tools: {unexpected_tools}."
        )

    tool_selection_score = 0
    tool_used_score = 0
    code_score = 0
    total_steps = 0

    if return_errors:
        tool_selection_errors = {}
        tool_used_errors = {}
        code_errors = {}

    for chat_problem in EVALUATION_CHATS:
        if isinstance(chat_problem[0].task, str):
            resolved_problems = [chat_problem]
        else:
            resolved_problems = [
                [Problem(task=pb.task[i], inputs=pb.inputs, answer=pb.answer) for pb in chat_problem]
                for i in range(len(chat_problem[0].task))
            ]
        for problem in resolved_problems:
            agent.prepare_for_new_chat()
            agent_state = {}
            theoretical_state = (
                [{} for _ in range(len(problem[0].answer))] if isinstance(problem[0].answer, list) else {}
            )

            for step, step_problem in enumerate(problem):
                if verbose:
                    print(step_problem.task)
                total_steps += 1
                prompt = agent.format_prompt(step_problem.task, chat_mode=True)
                result = agent.generate_one(prompt, stop=["Human:", "====="])
                agent.chat_history = prompt + result + "\n"

                explanation, code = clean_code_for_chat(result)

                if verbose:
                    print(f"==Explanation from the agent==\n{explanation}")
                    print(f"\n==Code generated by the agent==\n{code}")

                # Evaluate agent answer and code answer
                agent_answer = evaluate_code(code, step_problem.inputs, state=agent_state, verbose=verbose)

                answer = step_problem.answer
                if isinstance(answer, list):
                    theoretical_answer = [
                        evaluate_code(a, step_problem.inputs, state=state)
                        for a, state in zip(answer, theoretical_state)
                    ]
                else:
                    theoretical_answer = evaluate_code(answer, step_problem.inputs, state=theoretical_state)

                scores, errors = evaluate_one_result(
                    explanation, code, agent_answer, theoretical_answer, answer, verbose=verbose
                )

                tool_selection_score += scores[0]
                tool_used_score += scores[1]
                code_score += scores[2]

                if return_errors:
                    if errors[0] is not None:
                        tool_selection_errors[step_problem.task] = errors[0]
                    if errors[1] is not None:
                        tool_used_errors[step_problem.task] = errors[1]
                    if errors[2] is not None:
                        code_errors[step_problem.task] = errors[2]

    scores = {
        "tool selection score": 100 * (tool_selection_score / total_steps),
        "tool used score": 100 * (tool_used_score / total_steps),
        "code score": 100 * (code_score / total_steps),
    }

    if return_errors:
        return scores, tool_selection_errors, tool_used_errors, code_errors
    else:
        return scores