"""
https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_dpo.py
adapted to just grab the dataset
"""
import os
from alignment import (
    apply_chat_template,
    decontaminate_humaneval,
)
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import jsonlines
###############
# Load datasets
###############
raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized")
raw_datasets = DatasetDict(
    {
        "train": raw_datasets["train_prefs"],
        "test": raw_datasets["test_prefs"],
    }
)
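# NOTE: "ultrafeedback_binarized" also ships SFT and generation splits
# (train_sft/test_sft, train_gen/test_gen); only the preference splits are kept
# here because DPO needs chosen/rejected pairs.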
column_names = list(raw_datasets["train"].features)
#####################################
# Load tokenizer and process datasets
#####################################
truncation_side = "left"  # Truncate from the left to ensure we don't lose labels in the final turn
tokenizer = AutoTokenizer.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta", truncation_side=truncation_side
)
#####################
# Apply chat template
#####################
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs={
        "tokenizer": tokenizer,
        "task": "dpo",
        "auto_insert_empty_system_msg": True,
    },
    desc="Formatting comparisons with prompt template",
)
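# NOTE: for task="dpo", the handbook's apply_chat_template should add rendered
# string columns ("text_prompt", "text_chosen", "text_rejected") alongside the
# original message lists; the decontamination step below reads "text_chosen".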
##########################
# Decontaminate benchmarks
##########################
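# decontaminate_humaneval is expected to drop examples whose chosen text
# overlaps with HumanEval solutions, so the benchmark doesn't leak into the
# training data.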
num_raw_train_samples = len(raw_datasets["train"])
raw_datasets = raw_datasets.filter(
    decontaminate_humaneval,
    fn_kwargs={"text_column": "text_chosen"},
    batched=True,
    batch_size=10_000,
    num_proc=1,
    desc="Decontaminating HumanEval samples",
)
num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"])
print(
    f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples / num_raw_train_samples * 100:.2f}%) samples from the training set."
)
###############
# Length filter
###############
# The alignment handbook recipes call for a max token limit of 1024, so drop
# pairs where either completion would exceed it.
num_filtered_train_samples = len(raw_datasets["train"])


def length_filter(example):
    return (len(tokenizer.apply_chat_template(example["chosen"])) < 1024) and (
        len(tokenizer.apply_chat_template(example["rejected"])) < 1024
    )


raw_datasets = raw_datasets.filter(
    length_filter,
    desc="Filtering out conversations longer than 1024 tokens",
)
num_length_filtered_train_samples = num_filtered_train_samples - len(
    raw_datasets["train"]
)
print(
    f"Length filtered {num_length_filtered_train_samples} ({num_length_filtered_train_samples / num_filtered_train_samples * 100:.2f}%) samples from the training set."
)
# get directory of the python script
dir_path = os.path.dirname(os.path.realpath(__file__))
for split in ["train", "test"]:
    with open(os.path.join(dir_path, f"dpo_{split}_filtered.jsonl"), "w") as f:
        writer = jsonlines.Writer(f)
        for item in raw_datasets[split]:
            # add empty system messages
            item["chosen"] = [{"role": "system", "content": ""}] + item["chosen"]
            item["rejected"] = [{"role": "system", "content": ""}] + item["rejected"]
            writer.write(item)
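# Optional sanity check (a minimal sketch, assuming the filenames written above):
# reload the exported JSONL files as a DatasetDict to confirm they round-trip.
reloaded = load_dataset(
    "json",
    data_files={
        "train": os.path.join(dir_path, "dpo_train_filtered.jsonl"),
        "test": os.path.join(dir_path, "dpo_test_filtered.jsonl"),
    },
)
print(reloaded)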