File size: 14,563 Bytes
85e3d20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import os
import torch
import datasets
import transformers
import json
from .schema import ActionInfo, EnvException, EnhancedJSONEncoder

from reactagent.prompt2model.prompt_parser import MockPromptSpec, TaskType
from reactagent.prompt2model.dataset_retriever import DescriptionDatasetRetriever
from reactagent.prompt2model.dataset_generator import PromptBasedDatasetGenerator, DatasetSplit
from reactagent.prompt2model.dataset_processor import TextualizeProcessor
from reactagent.prompt2model.model_retriever import DescriptionModelRetriever
from reactagent.prompt2model.model_trainer import GenerationModelTrainer
from reactagent.prompt2model.model_executor import GenerationModelExecutor, ModelOutput
from reactagent.prompt2model.model_evaluator import Seq2SeqEvaluator

def generate_dataset(instruction, examples, save_dir, num_train, num_valid, num_test, work_dir = '.', **kwargs):
    """Generate a dataset from an instruction and examples, then save it to disk.

    Args:
        instruction: Task instruction forwarded to the prompt spec.
        examples: Example text forwarded to the prompt spec.
        save_dir: Directory (relative to ``work_dir``) the dataset dict is saved to.
        num_train: Number of training examples to generate (int or int-like string).
        num_valid: Number of validation examples to generate (int or int-like string).
        num_test: Number of test examples to generate (int or int-like string).
        work_dir: Base directory that ``save_dir`` is joined onto.
        **kwargs: Accepted and ignored.

    Returns:
        A success message that includes the save path.

    Raises:
        EnvException: If any of the three counts cannot be converted to ``int``.
    """
    try:
        num_train = int(num_train)
        num_valid = int(num_valid)
        num_test = int(num_test)
    except (TypeError, ValueError) as err:
        # int() raises TypeError (not ValueError) for None and other
        # non-string, non-numeric values; report both the same way.
        raise EnvException("Number of examples should be an integer") from err

    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples=examples)
    generator = PromptBasedDatasetGenerator()
    dataset_dict = generator.generate_dataset_dict(prompt_spec, {
        DatasetSplit.TRAIN: num_train,
        DatasetSplit.VAL: num_valid,
        DatasetSplit.TEST: num_test,
    })

    save_path = os.path.join(work_dir, save_dir)
    dataset_dict.save_to_disk(save_path)

    return f"Dataset successfully generated and saved to {save_path}"

def retrieve_dataset(instruction, save_dir, work_dir = '.', **kwargs):
    """Retrieve a dataset matching an instruction and save it to disk.

    Args:
        instruction: Task description forwarded to the prompt spec.
        save_dir: Directory (relative to ``work_dir``) the dataset dict is saved to.
        work_dir: Base directory that ``save_dir`` is joined onto.
        **kwargs: Accepted and ignored.

    Returns:
        A success message that includes the save path.

    Raises:
        EnvException: If the retriever returns no dataset.
    """
    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
    retriever = DescriptionDatasetRetriever()
    dataset_dict = retriever.retrieve_dataset_dict(prompt_spec)
    if dataset_dict is None:
        # NOTE(review): presumably the retriever yields None when nothing
        # suitable is selected - confirm against its contract. Without this
        # guard the failure surfaces as an AttributeError on save_to_disk.
        raise EnvException("No suitable dataset could be retrieved for the given instruction")

    save_path = os.path.join(work_dir, save_dir)
    dataset_dict.save_to_disk(save_path)

    # The message previously said "generated", copied from generate_dataset.
    return f"Dataset successfully retrieved and saved to {save_path}"

def retrieve_model(instruction, work_dir = '.', **kwargs):
    """Return a numbered, newline-separated list of models retrieved for an instruction.

    Args:
        instruction: Task description forwarded to the prompt spec.
        work_dir: Unused here.
        **kwargs: Accepted and ignored.

    Returns:
        A string starting with ``"Top Models:"`` followed by one
        ``"<rank>. <model>"`` line per retrieved model, ranks starting at 1.
    """
    spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
    model_retriever = DescriptionModelRetriever(use_bm25=True, use_HyDE=True)
    ranked_models = model_retriever.retrieve(spec)

    report = ["Top Models:\n"]
    for rank, model in enumerate(ranked_models, start=1):
        report.append(f"{rank}. {model}\n")
    return "".join(report)

def process_dataset(instruction, load_dirs, save_dirs, work_dir = '.', **kwargs):
    """Run saved dataset dicts through ``TextualizeProcessor`` and save the results.

    Args:
        instruction: Task instruction forwarded to the prompt spec.
        load_dirs: Colon-separated directories (relative to ``work_dir``) to load from.
        save_dirs: Colon-separated directories (relative to ``work_dir``) to save to,
            paired positionally with ``load_dirs``.
        work_dir: Base directory that every load/save directory is joined onto.
        **kwargs: Accepted and ignored.

    Returns:
        A success message that includes the list of save paths.

    Raises:
        EnvException: If the number of load and save directories differs.
    """
    spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")

    sources = load_dirs.split(':')
    targets = save_dirs.split(':')
    if len(sources) != len(targets):
        raise EnvException("Number of load directories should match number of save directories")

    load_paths = [os.path.join(work_dir, name) for name in sources]
    save_paths = [os.path.join(work_dir, name) for name in targets]

    # Load every input before processing; the processor receives them as one list.
    loaded = [datasets.load_from_disk(path) for path in load_paths]
    processed = TextualizeProcessor(has_encoder=True).process_dataset_dict(spec, loaded)

    # Outputs are written in the same order as the inputs were given.
    for destination, processed_dict in zip(save_paths, processed):
        processed_dict.save_to_disk(destination)

    return f"Data successfully processed and saved to {save_paths}"

def train_model(model_name, load_dirs, result_dir, epochs, batch_size, warmup_steps, weight_decay, learning_rate, work_dir = '.', **kwargs):
    """Train a generation model on saved dataset dicts and save model and tokenizer.

    Args:
        model_name: Name of the model handed to ``GenerationModelTrainer``.
        load_dirs: Colon-separated directories (relative to ``work_dir``) holding
            dataset dicts; their ``"train"`` and ``"val"`` splits are used.
        result_dir: Directory (relative to ``work_dir``) for training output, the
            trained model (``trained_model``) and the tokenizer (``trained_tokenizer``).
        epochs: Number of training epochs (int or int-like string).
        batch_size: Batch size (int or int-like string).
        warmup_steps: Number of warmup steps (int or int-like string).
        weight_decay: Weight decay (float or float-like string).
        learning_rate: Learning rate (float or float-like string).
        work_dir: Base directory that ``load_dirs`` and ``result_dir`` are joined onto.
        **kwargs: Accepted and ignored.

    Returns:
        A success message naming the model and tokenizer directories.

    Raises:
        EnvException: If a numerical parameter cannot be converted.
    """
    try:
        epochs = int(epochs)
        batch_size = int(batch_size)
        warmup_steps = int(warmup_steps)
        weight_decay = float(weight_decay)
        learning_rate = float(learning_rate)
    except (TypeError, ValueError) as err:
        # int()/float() raise TypeError (not ValueError) for None and other
        # non-string, non-numeric values; report both the same way.
        raise EnvException("Numerical parameters should be integers or floats as appropriate") from err

    load_dirs = load_dirs.split(':')
    result_dir = os.path.join(work_dir, result_dir)

    # load the datasets
    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]

    training_datasets = [dataset_dict["train"] for dataset_dict in dataset_dicts]
    validation_datasets = [dataset_dict["val"] for dataset_dict in dataset_dicts]

    trainer = GenerationModelTrainer(
        model_name,
        has_encoder=True,
        executor_batch_size=batch_size,
        tokenizer_max_length=1024,
        sequence_max_length=1280,
    )

    # Must stay a plain dict: a trailing comma after the closing brace would
    # wrap it in a 1-tuple and the trainer would receive a tuple, not a mapping.
    hparams = {
        "output_dir": os.path.join(result_dir, "training_output"),
        "save_strategy": "epoch",
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "evaluation_strategy": "epoch",
        "warmup_steps": warmup_steps,
        "weight_decay": weight_decay,
        "learning_rate": learning_rate,
    }

    trained_model, trained_tokenizer = trainer.train_model(
        hyperparameter_choices=hparams,
        training_datasets=training_datasets,
        validation_datasets=validation_datasets,
    )

    trained_model.save_pretrained(os.path.join(result_dir, "trained_model"))
    trained_tokenizer.save_pretrained(os.path.join(result_dir, "trained_tokenizer"))

    return f"Model and Tokenizer successfully trained and saved respectively to {result_dir}/trained_model and {result_dir}/trained_tokenizer"

def execute_model(result_dir, load_dirs, save_path, batch_size, input_column, work_dir = '.', **kwargs):
    """Run a trained model on the concatenated test splits and dump predictions to JSON.

    Args:
        result_dir: Directory (relative to ``work_dir``) containing the
            ``trained_model`` and ``trained_tokenizer`` subdirectories.
        load_dirs: Colon-separated directories (relative to ``work_dir``) holding
            dataset dicts; their ``"test"`` splits are concatenated.
        save_path: File (relative to ``work_dir``) the predictions are written to.
        batch_size: Batch size passed to the executor (int or int-like string).
        input_column: Name of the dataset column used as model input.
        work_dir: Base directory that all relative paths are joined onto.
        **kwargs: Accepted and ignored.

    Returns:
        A success message that includes the output file path.

    Raises:
        EnvException: If ``batch_size`` cannot be converted to ``int``.
    """
    try:
        batch_size = int(batch_size)
    except (TypeError, ValueError) as err:
        # int() raises TypeError (not ValueError) for None and other
        # non-string, non-numeric values; report both the same way.
        raise EnvException("Batch size should be an integer") from err

    load_dirs = load_dirs.split(':')
    result_dir = os.path.join(work_dir, result_dir)
    save_path = os.path.join(work_dir, save_path)

    # Create the output directory up front so a missing directory cannot make
    # the final write fail after inference has already been run.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

    # load the datasets
    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]
    test_datasets = [dataset_dict["test"] for dataset_dict in dataset_dicts]
    test_dataset = datasets.concatenate_datasets(test_datasets)

    trained_model_path = os.path.join(result_dir, "trained_model")
    trained_tokenizer_path = os.path.join(result_dir, "trained_tokenizer")

    # Use the GPU when one is available, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trained_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(trained_model_path).to(device)
    trained_tokenizer = transformers.AutoTokenizer.from_pretrained(trained_tokenizer_path)

    executor = GenerationModelExecutor(
        trained_model,
        trained_tokenizer,
        batch_size,
        tokenizer_max_length=1024,
        sequence_max_length=1280,
    )

    outputs = executor.make_prediction(
        test_set=test_dataset,
        input_column=input_column,
    )

    # EnhancedJSONEncoder handles the serialization of the prediction objects.
    with open(save_path, 'w') as f:
        json.dump(outputs, f, cls=EnhancedJSONEncoder)

    return f"Model successfully executed on the test sets of the specified datasets and saved to {save_path}"

def evaluate_model(load_dirs, save_path, output_column, work_dir = '.', **kwargs):
    """Score saved model predictions against the concatenated test splits.

    Args:
        load_dirs: Colon-separated directories (relative to ``work_dir``) holding
            dataset dicts; their ``"test"`` splits are concatenated.
        save_path: JSON file (relative to ``work_dir``) containing a list of
            records, each expanded into a ``ModelOutput``.
        output_column: Name of the dataset column holding the ground truth.
        work_dir: Base directory that all relative paths are joined onto.
        **kwargs: Accepted and ignored.

    Returns:
        A string of the form ``"Evaluation metrics: <metric values>"``.
    """
    # Resolve all paths, then load all dataset dicts, then pick the test splits.
    dataset_paths = [os.path.join(work_dir, name) for name in load_dirs.split(':')]
    loaded = list(map(datasets.load_from_disk, dataset_paths))
    test_dataset = datasets.concatenate_datasets([entry["test"] for entry in loaded])

    predictions_file = os.path.join(work_dir, save_path)
    with open(predictions_file, 'r') as handle:
        raw_records = json.load(handle)

    predictions = []
    for record in raw_records:
        predictions.append(ModelOutput(**record))

    metric_values = Seq2SeqEvaluator().evaluate_model(
        test_dataset,
        gt_column=output_column,
        predictions=predictions,
        encoder_model_name="xlm-roberta-base",
    )

    return f"Evaluation metrics: {metric_values}"

# ActionInfo entries describing the prompt2model-backed actions of this module.
# Only "Retrieve Model" (backed by retrieve_model) is enabled here; the other
# functions defined above are not registered. A fuller list that includes them
# is kept commented out directly below.
P2M_ACTIONS = [
    ActionInfo(
        name="Retrieve Model",
        description="Retrieve a suitable model based on a detailed description of the requirements. You can obtain the model given the name using the transformers.AutoModel.from_pretrained function.",
        usage={
            "instruction": "an instruction on how to generate the output from the input",
        },
        return_value="The observation will be a list of suitable models. You can choose one of them based on the requirements.",
        is_primitive=False,
        function=retrieve_model
    ),
]
# P2M_ACTIONS = [
#     ActionInfo(
#         name="Generate Dataset",
#         description="Generate a dataset based on an instruction and examples. You can load the dataset later from `save_dir` using the load_from_disk function of the HuggingFace datasets library.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#             "examples": "examples of input-output pairs",
#             "save_dir": "directory to save the generated dataset dict to. We recommend saving to data/generated/",
#             "num_train": "number of examples to generate in the training set",
#             "num_valid": "number of examples to generate in the validation set",
#             "num_test": "number of examples to generate in the test set",
#         },
#         return_value="The observation will be a success message if the dataset was generated successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=generate_dataset
#     ),
#     ActionInfo(
#         name="Retrieve Dataset",
#         description="Retrieve a suitable dataset based on a detailed description of the requirements. You can load the dataset later from `save_dir` using the load_from_disk function of the HuggingFace datasets library.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#             "save_dir": "directory to save the generated dataset dict to. We recommend saving to data/retrieved/",
#         },
#         return_value="The observation will be a success message if the dataset was retrieved successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=retrieve_dataset
#     ),
#     ActionInfo(
#         name="Retrieve Model",
#         description="Retrieve a suitable model based on a detailed description of the requirements. You can obtain the model given the name using the transformers.AutoModelForSeq2SeqLM.from_pretrained function.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#         },
#         return_value="The observation will be a list of suitable models. You can choose one of them based on the requirements.",
#         is_primitive=False,
#         function=retrieve_model
#     ),
#     ActionInfo(
#         name="Process Dataset",
#         description="Process dataset based on a detailed description of the requirements. You can load the processed data later from `save_dirs` using the load_from_disk function of the HuggingFace datasets library. The input text will be in the `model_input` column and the output text will be in the `model_output` column.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "save_dirs": "directories to save the processed dataset dicts to, separated by colons. The order should match the order of the loaded datasets. We recommend saving to data/processed/",
#         },
#         return_value="The observation will be a success message if the data was processed successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=process_dataset
#     ),
#     ActionInfo(
#         name="Train Model",
#         description="Train a Seq2Seq model from HuggingFace transformers library using the processed datasets and given hyperparameters.",
#         usage={
#             "model_name": "name of the model to train",
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "result_dir": "directory to save the trained model and tokenizer to. We recommend using results/{trial_id}/. The trained model will be available as `{result_dir}/trained_model/` and the tokenizer will be available as `{result_dir}/trained_tokenizer/`.",
#             "epochs": "number of epochs to train the model for",
#             "batch_size": "batch size for training the model",
#             "warmup_steps": "number of warmup steps for the optimizer",
#             "weight_decay": "weight decay for the optimizer",
#             "learning_rate": "learning rate for the optimizer",
#         },
#         return_value="The observation will be a success message if the model was trained successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=train_model
#     ),
#     ActionInfo(
#         name="Execute Model on Test Set",
#         description="Execute a trained model on the test sets of specified dataset dicts.",
#         usage={
#             "result_dir": "directory where the trained model and tokenizer are saved",
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "save_path": "file to save the results of the model execution in json format",
#             "batch_size": "batch size for executing the model",
#             "input_column": "column name of the input text",
#         },
#         return_value="The observation will be a success message if the model was executed successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=execute_model,
#     ),
#     ActionInfo(
#         name="Evaluate Model",
#         description="Evaluate a trained model on the test sets of specified dataset dicts.",
#         usage={
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "save_path": "file to load the results of the model execution in json format",
#             "output_column": "column name of the output text",
#         },
#         return_value="The values for various evaluation metrics will be returned.",
#         is_primitive=False,
#         function=evaluate_model,
#     )
# ]