# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: adapted from https://github.com/ThilinaRajapakse/simpletransformers
"""
import json
import os
import sys
from dataclasses import asdict, dataclass, field
from multiprocessing import cpu_count
from typing import Optional
from loguru import logger
from torch.utils.data import Dataset
def get_default_process_count():
    """Return a sensible default worker count: all CPUs minus two (at least one),
    capped at 61 on Windows, where multiprocessing cannot wait on more handles."""
    process_count = cpu_count() - 2 if cpu_count() > 2 else 1
    if sys.platform == "win32":
        process_count = min(process_count, 61)
    return process_count
def get_special_tokens():
    """Default special tokens used when training a tokenizer from scratch."""
    return ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
@dataclass
class ModelArgs:
adafactor_beta1: float = None
adafactor_clip_threshold: float = 1.0
adafactor_decay_rate: float = -0.8
adafactor_eps: tuple = field(default_factory=lambda: (1e-30, 1e-3))
adafactor_relative_step: bool = True
adafactor_scale_parameter: bool = True
adafactor_warmup_init: bool = True
adam_epsilon: float = 1e-8
best_model_dir: str = "outputs/best_model"
cache_dir: str = "cache_dir/"
config: dict = field(default_factory=dict)
cosine_schedule_num_cycles: float = 0.5
custom_layer_parameters: list = field(default_factory=list)
custom_parameter_groups: list = field(default_factory=list)
dataloader_num_workers: int = 0
do_lower_case: bool = False
dynamic_quantize: bool = False
early_stopping_consider_epochs: bool = False
early_stopping_delta: float = 0
early_stopping_metric: str = "eval_loss"
early_stopping_metric_minimize: bool = True
early_stopping_patience: int = 3
encoding: str = "utf-8"
eval_batch_size: int = 8
evaluate_during_training: bool = False
evaluate_during_training_silent: bool = True
evaluate_during_training_steps: int = 6000
evaluate_during_training_verbose: bool = False
evaluate_each_epoch: bool = True
fp16: bool = False
gradient_accumulation_steps: int = 1
learning_rate: float = 2e-5
local_rank: int = -1
logging_steps: int = 50
manual_seed: int = None
max_grad_norm: float = 1.0
max_seq_length: int = 128 # max length of input sequence
model_name: str = None
model_type: str = None
multiprocessing_chunksize: int = -1
n_gpu: int = 2
no_cache: bool = False
no_save: bool = False
not_saved_args: list = field(default_factory=list)
num_train_epochs: int = 1
optimizer: str = "AdamW"
output_dir: str = "outputs/"
overwrite_output_dir: bool = True
polynomial_decay_schedule_lr_end: float = 1e-7
polynomial_decay_schedule_power: float = 1.0
process_count: int = field(default_factory=get_default_process_count)
quantized_model: bool = False
reprocess_input_data: bool = False
save_best_model: bool = True
save_eval_checkpoints: bool = True
save_model_every_epoch: bool = False
save_optimizer_and_scheduler: bool = True
save_steps: int = 10000
scheduler: str = "linear_schedule_with_warmup"
silent: bool = False
skip_special_tokens: bool = True
tensorboard_dir: str = None
thread_count: int = None
tokenizer_name: str = None
tokenizer_type: str = None
train_batch_size: int = 8
train_custom_parameters_only: bool = False
use_cached_eval_features: bool = False
use_early_stopping: bool = False
use_hf_datasets: bool = False
use_multiprocessing: bool = True
use_multiprocessing_for_evaluation: bool = True
wandb_kwargs: dict = field(default_factory=dict)
wandb_project: str = None
warmup_ratio: float = 0.06
warmup_steps: int = 0
weight_decay: float = 0.0
    def update_from_dict(self, new_values):
        """Update fields in place from a plain {field_name: value} dict."""
        if isinstance(new_values, dict):
            for key, value in new_values.items():
                setattr(self, key, value)
        else:
            raise TypeError(f"{new_values} is not a Python dict.")

    def get_args_for_saving(self):
        """Return the args as a dict, excluding any keys listed in not_saved_args."""
        return {key: value for key, value in asdict(self).items() if key not in self.not_saved_args}
    def save(self, output_dir):
        """Serialize the args to <output_dir>/model_args.json."""
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "model_args.json"), "w", encoding="utf-8") as f:
            args_dict = self.get_args_for_saving()
            # dataset_class / tokenizer_type may hold class objects, which are not
            # JSON-serializable; store the class name instead. Using .get() keeps this
            # safe for args classes that do not define a dataset_class field.
            if args_dict.get("dataset_class") is not None and not isinstance(args_dict["dataset_class"], str):
                args_dict["dataset_class"] = type(args_dict["dataset_class"]).__name__
            if args_dict.get("tokenizer_type") is not None and not isinstance(args_dict["tokenizer_type"], str):
                args_dict["tokenizer_type"] = type(args_dict["tokenizer_type"]).__name__
            json.dump(args_dict, f)
    def load(self, input_dir):
        """Load previously saved args from <input_dir>/model_args.json, if present."""
        if input_dir:
            model_args_file = os.path.join(input_dir, "model_args.json")
            if os.path.isfile(model_args_file):
                with open(model_args_file, "r", encoding="utf-8") as f:
                    model_args = json.load(f)
                if model_args.get("dataset_class"):
                    logger.warning(
                        "This model was trained using a custom dataset_class. "
                        "This cannot be loaded automatically and must be specified in the model args "
                        "when loading the model."
                    )
                self.update_from_dict(model_args)
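

# Illustrative sketch only (not part of the library API): a minimal save/load round trip
# using the helpers above. The directory name is an arbitrary example.
def _example_model_args_round_trip(output_dir: str = "outputs/demo_args"):
    """Sketch of the persistence flow: update fields, save to JSON, load them back."""
    args = ModelArgs()
    args.update_from_dict({"learning_rate": 3e-5, "num_train_epochs": 3})
    args.save(output_dir)  # writes <output_dir>/model_args.json
    restored = ModelArgs()
    restored.load(output_dir)  # re-applies the saved values via update_from_dict
    return restored
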
@dataclass
class T5Args(ModelArgs):
"""
Model args for a T5Model
"""
model_class: str = "T5Model"
dataset_class: Dataset = None
do_sample: bool = False
early_stopping: bool = True
evaluate_generated_text: bool = False
length_penalty: float = 2.0
max_length: int = 180 # max length of the sequence to be generated
max_steps: int = -1
num_beams: int = 1
num_return_sequences: int = 1
preprocess_inputs: bool = True
repetition_penalty: float = 1.0
scheduler: str = "constant_schedule_with_warmup"
adafactor_relative_step: bool = False
adafactor_scale_parameter: bool = False
adafactor_warmup_init: bool = False
learning_rate: float = 5e-4
optimizer: str = "AdamW"
special_tokens_list: list = field(default_factory=list)
top_k: float = None
top_p: float = None
use_multiprocessed_decoding: bool = False
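

# Illustrative sketch only: T5Args is a plain dataclass, so defaults can be overridden at
# construction time or afterwards via update_from_dict. The values shown are arbitrary examples.
def _example_t5_args_overrides():
    """Sketch of overriding T5-specific generation defaults."""
    args = T5Args(num_beams=4, max_length=64, do_sample=True, top_p=0.9)
    args.update_from_dict({"learning_rate": 3e-4})  # inherited ModelArgs field
    return args
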
@dataclass
class CopyT5Args(ModelArgs):
"""
Model args for a CopyT5Model
"""
model_class: str = "CopyT5Model"
dataset_class: Dataset = None
do_sample: bool = False
early_stopping: bool = True
evaluate_generated_text: bool = False
length_penalty: float = 2.0
max_length: int = 128 # max length of the sequence to be generated
max_steps: int = -1
num_beams: int = 3
num_return_sequences: int = 1
preprocess_inputs: bool = True
repetition_penalty: float = 1.0
scheduler: str = "linear_schedule_with_warmup"
adafactor_relative_step: bool = False
adafactor_scale_parameter: bool = False
adafactor_warmup_init: bool = False
learning_rate: float = 1e-3
optimizer: str = "AdamW"
special_tokens_list: list = field(default_factory=list)
top_k: float = None
top_p: float = None
use_multiprocessed_decoding: bool = False
@dataclass
class LanguageModelingArgs(ModelArgs):
"""
Model args for a LanguageModelingModel
"""
model_class: str = "LanguageModelingModel"
block_size: int = -1
config_name: str = None
dataset_class: Dataset = None
dataset_type: str = "None"
discriminator_config: dict = field(default_factory=dict)
discriminator_loss_weight: float = 50.0
generator_config: dict = field(default_factory=dict)
max_steps: int = -1
min_frequency: int = 2
mlm: bool = True
mlm_probability: float = 0.15
sliding_window: bool = False
special_tokens: list = field(default_factory=get_special_tokens)
stride: float = 0.8
tie_generator_and_discriminator_embeddings: bool = True
tokenizer_name: str = None
vocab_size: int = None
clean_text: bool = True
handle_chinese_chars: bool = True
special_tokens_list: list = field(default_factory=list)
strip_accents: bool = True
local_rank: int = -1
@dataclass
class Seq2SeqArgs(ModelArgs):
"""
Model args for a Seq2SeqModel
"""
model_class: str = "Seq2SeqModel"
base_marian_model_name: str = None
dataset_class: Dataset = None
do_sample: bool = False
early_stopping: bool = True
evaluate_generated_text: bool = False
faiss_d: int = 768
faiss_m: int = 128
length_penalty: float = 2.0
max_length: int = 128 # max length of the sequence to be generated
max_steps: int = -1
num_beams: int = 1
num_return_sequences: int = 1
rag_embed_batch_size: int = 16
repetition_penalty: float = 1.0
top_k: float = None
top_p: float = None
use_multiprocessed_decoding: bool = False
save_knowledge_dataset: bool = True
save_knowledge_dataset_with_checkpoints: bool = False
split_text_character: str = " "
split_text_n: int = 100
src_lang: str = "en_XX"
tgt_lang: str = "ro_RO"
@dataclass
class LanguageGenerationArgs(ModelArgs):
"""
Model args for a LanguageGenerationModel
"""
model_class: str = "LanguageGenerationModel"
do_sample: bool = True
early_stopping: bool = True
evaluate_generated_text: bool = False
length_penalty: float = 2.0
max_length: int = 128 # max length of the sequence to be generated
max_steps: int = -1
num_beams: int = 1
num_return_sequences: int = 1
repetition_penalty: float = 1.0
top_k: float = 50
top_p: float = 0.95
prompt: str = ""
stop_token: str = None
temperature: float = 1.0
padding_text: str = ""
xlm_language: str = ""
config_name: str = None
tokenizer_name: str = None
special_tokens_list: list = field(default_factory=list)
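

# Illustrative sketch only: the sampling fields above mirror the keyword arguments accepted
# by Hugging Face's generate(); this helper shows one plausible mapping. How the kwargs are
# ultimately passed to a model is up to the caller.
def _example_generation_kwargs(args: LanguageGenerationArgs) -> dict:
    """Collect generation settings from the args into a kwargs dict for model.generate()."""
    return {
        "do_sample": args.do_sample,
        "num_beams": args.num_beams,
        "max_length": args.max_length,
        "temperature": args.temperature,
        "top_k": args.top_k,
        "top_p": args.top_p,
        "repetition_penalty": args.repetition_penalty,
        "length_penalty": args.length_penalty,
        "num_return_sequences": args.num_return_sequences,
        "early_stopping": args.early_stopping,
    }
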
@dataclass
class SongNetArgs(LanguageModelingArgs):
"""
Model args for a SongNetModel
"""
model_class: str = "SongNetModel"
dataset_class: Dataset = None
do_sample: bool = False
early_stopping: bool = True
evaluate_generated_text: bool = False
length_penalty: float = 2.0
max_length: int = 128
min_length: int = 10
max_steps: int = -1
num_beams: int = 3
num_return_sequences: int = 1
repetition_penalty: float = 1.0
scheduler: str = None
adafactor_relative_step: bool = False
adafactor_scale_parameter: bool = False
adafactor_warmup_init: bool = False
learning_rate: float = 1e-3
early_stopping_metric: str = "eval_ppl"
special_tokens_list: list = field(default_factory=list)
save_eval_checkpoints: bool = False
skip_special_tokens: bool = False
k: int = 16
use_multiprocessed_decoding: bool = False
embed_dim: int = 768
ff_embed_dim: int = 3072
num_heads: int = 12
num_layers: int = 12
dropout: float = 0.2
warmup_ratio: float = 0.05
weight_decay: float = 0.0
smoothing_factor: float = 0.1
@dataclass
class ChatGlmArgs(ModelArgs):
"""
Model args for a ChatGLMModel
"""
model_class: str = "ChatGlmArgs"
dataset_class: Dataset = None
learning_rate: float = 2e-5
fp16: bool = True
bf16: bool = False
int8: bool = False
int4: bool = False
debug: bool = False
max_seq_length: int = 256 # max length of input sequence
    max_length: int = 384  # max length of the sequence to be generated
do_sample: bool = True
early_stopping: bool = True
is_train_on_prompt: bool = False # if compute loss with prompt labels
evaluate_generated_text: bool = True
    report_to: str = "tensorboard"
optimizer: str = "adamw_torch"
save_strategy: str = "steps"
evaluation_strategy: str = "no"
eval_steps: int = 50
save_steps: int = 400
max_eval_samples: int = 20
length_penalty: float = 2.0
num_beams: int = 4
num_return_sequences: int = 1
repetition_penalty: float = 1.0
temperature: float = 0.1
special_tokens_list: list = field(default_factory=list)
top_k: float = 40
top_p: float = 0.75
model_name_or_path: Optional[str] = field(default="THUDM/chatglm-6b")
use_peft: bool = True
peft_type: str = "LORA"
peft_bin_name: str = "adapter_model.bin"
lora_r: int = 8
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    lora_target_modules: list = field(default_factory=lambda: ["all"])  # ["all"] or ["query_key_value"]
    lora_bias: str = "none"
adalora_init_r: int = 12
adalora_tinit: int = 200
adalora_tfinal: int = 1000
adalora_delta_t: int = 10
lora_beta: float = 0.85
num_virtual_tokens: int = 20
prompt_encoder_hidden_size: int = 128
    num_train_epochs: int = 1
    max_steps: int = -1
    per_device_train_batch_size: int = 2
    eval_batch_size: int = 4
    gradient_accumulation_steps: int = 1
gradient_checkpointing: bool = True
torch_compile: bool = False
    save_total_limit: int = 10
    remove_unused_columns: bool = False
    logging_steps: int = 50
resume_from_checkpoint: str = None
qlora: bool = False
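

# Illustrative sketch only: how the lora_* fields above could be turned into a peft.LoraConfig
# when use_peft is enabled. peft is imported lazily so this module keeps no hard dependency on
# it, and the special value ["all"] for lora_target_modules is assumed to have been expanded
# into concrete module names (e.g. ["query_key_value"]) beforehand.
def _example_build_lora_config(args: ChatGlmArgs):
    """Sketch of building a LoRA adapter config from ChatGlmArgs fields."""
    from peft import LoraConfig, TaskType

    return LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias=args.lora_bias,
        target_modules=args.lora_target_modules,
    )
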
@dataclass
class GptArgs(ModelArgs):
"""
Model args for a GptModel
"""
model_class: str = "GptArgs"
dataset_class: Dataset = None
learning_rate: float = 2e-5
fp16: bool = True
bf16: bool = False
int8: bool = False
int4: bool = False
debug: bool = False
max_seq_length: int = 256 # max length of input sequence
    max_length: int = 256  # max length of the sequence to be generated
do_sample: bool = True
early_stopping: bool = True
evaluate_generated_text: bool = True
is_train_on_prompt: bool = False # if compute loss with prompt labels
warmup_steps: int = 50
    report_to: str = "tensorboard"
optimizer: str = "adamw_torch"
save_strategy: str = "steps"
eval_steps: int = 200
save_steps: int = 400
pad_to_multiple_of: int = 8
max_eval_samples: int = 20
length_penalty: float = 2.0
num_beams: int = 1
num_return_sequences: int = 1
repetition_penalty: float = 1.3
temperature: float = 0.4
special_tokens_list: list = field(default_factory=list)
top_k: float = 40
top_p: float = 0.9
model_name_or_path: Optional[str] = field(default="shibing624/chinese-alpaca-plus-7b-hf")
use_peft: bool = True
peft_type: str = "LORA"
peft_bin_name: str = "adapter_model.bin"
lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: list = field(default_factory=lambda: ["all"])  # ["all"] or ["k_proj"]
    lora_bias: str = "none"
adalora_init_r: int = 12
adalora_tinit: int = 200
adalora_tfinal: int = 1000
adalora_delta_t: int = 10
lora_beta: float = 0.85
num_virtual_tokens: int = 20
prompt_encoder_hidden_size: int = 128
    num_train_epochs: int = 3
    max_steps: int = -1
    per_device_train_batch_size: int = 2
    eval_batch_size: int = 4
    gradient_accumulation_steps: int = 1
    save_total_limit: int = 10
    remove_unused_columns: bool = False
    logging_steps: int = 50
resume_from_checkpoint: str = None
gradient_checkpointing: bool = True
torch_compile: bool = False
trust_remote_code: bool = True
qlora: bool = False
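

# Illustrative sketch only: a tiny smoke test of the args classes when this module is run
# directly. The output directory name is an arbitrary example.
if __name__ == "__main__":
    demo_args = GptArgs(num_train_epochs=1, max_seq_length=128)
    logger.info(f"GptArgs defaults: lr={demo_args.learning_rate}, use_peft={demo_args.use_peft}")
    demo_args.save("outputs/demo_gpt_args")  # writes outputs/demo_gpt_args/model_args.json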