from transformers import (
    BertForMaskedLM,
    BertTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset


# Prepare the dataset (simplified)
def prepare_text_dataset(data, tokenizer, chunk_size):
    # Split the text into smaller chunks (consider logical divisions of the Constitution)
    chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
    # Convert chunks to dictionaries with a single feature "text"
    formatted_data = [{"text": chunk} for chunk in chunks]
    # Create the dataset from the list of dictionaries
    formatted_dataset = Dataset.from_list(formatted_data)
    # Tokenize the text using the AzerBert (BERT) tokenizer
    formatted_dataset = formatted_dataset.map(
        lambda x: tokenizer(x["text"], truncation=True, padding="max_length"),
        batched=True,
    )
    # Set the format of the dataset to "torch" for compatibility with the model
    formatted_dataset.set_format("torch")
    # Print a message indicating preparation completion (optional)
    print("Prep done")
    return formatted_dataset


def init():
    # Load the model and tokenizer.
    # BertForMaskedLM (rather than the bare BertModel) is used so the Trainer
    # receives a loss during masked-language-model fine-tuning.
    model_name = "language-ml-lab/AzerBert"  # Replace with your model name if different
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name)
    chunk_size = 512

    # Load the plain text (replace with your actual loading logic)
    with open("constitution.txt", "r", encoding="utf-8") as f:
        constitution_text = f.read()

    # Prepare the dataset (the tokenizer is passed in explicitly)
    train_dataset = prepare_text_dataset(constitution_text, tokenizer, chunk_size)

    # The collator masks random tokens so the model has labels to learn from
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",  # Adjust output directory
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjust training epochs
        per_device_train_batch_size=1,  # Adjust batch size based on your GPU memory
        save_steps=500,
        save_total_limit=2,
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained("./fine-tuned_model")
    tokenizer.save_pretrained("./fine-tuned_model")


if __name__ == "__main__":
    init()
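Once training finishes, the saved checkpoint can be reloaded like any other Hugging Face model directory. Below is a minimal sketch of a sanity check using the fill-mask pipeline; the "./fine-tuned_model" path comes from the script above, while the example sentence is a placeholder you would replace with real text in the model's language.

# Sketch: quick sanity check of the fine-tuned checkpoint (assumes the
# "./fine-tuned_model" directory saved by the script above exists).
from transformers import BertForMaskedLM, BertTokenizer, pipeline

tokenizer = BertTokenizer.from_pretrained("./fine-tuned_model")
model = BertForMaskedLM.from_pretrained("./fine-tuned_model")

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Placeholder sentence: swap in a real sentence containing the mask token.
print(fill_mask(f"This is a {tokenizer.mask_token} sentence."))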