Spaces:
Runtime error
Runtime error
File size: 2,270 Bytes
e0cc050 f2eeb87 1ea0d00 f2eeb87 0f18d6d 2eec429 b606f8a e0cc050 0f18d6d b459e61 0f18d6d b459e61 0f18d6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
from transformers import BertModel, BertTokenizer, TrainingArguments, Trainer
from datasets import Dataset
# Prepare the dataset (simplified)
def prepare_text_dataset(data, chunk_size, tokenizer=None):
    """Split raw text into fixed-size character chunks and tokenize them.

    Args:
        data: The full text to split.
        chunk_size: Number of characters per chunk; must be positive.
        tokenizer: Callable applied to each batch of chunk strings with
            ``truncation=True, padding="max_length"``. When omitted, a
            module-level ``tokenizer`` is used if one exists, so the original
            two-argument call keeps working.

    Returns:
        A ``Dataset`` set to the "torch" format, containing the ``text``
        column plus the columns produced by the tokenizer.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        NameError: If no tokenizer is passed and no module-level one exists.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    if tokenizer is None:
        # Backward compatibility: the original body read a global `tokenizer`.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError(
                "no tokenizer available: pass one via the 'tokenizer' argument"
            )

    # Split the text into fixed-size character windows. These are not aligned
    # to logical divisions of the document.
    chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

    # One record per chunk, with a single feature "text".
    formatted_dataset = Dataset.from_list([{"text": chunk} for chunk in chunks])

    def tokenize_batch(batch):
        # Pad every chunk to the same length so examples can be batched.
        return tokenizer(batch["text"], truncation=True, padding="max_length")

    formatted_dataset = formatted_dataset.map(tokenize_batch, batched=True)

    # Return tensors instead of Python lists when rows are accessed.
    formatted_dataset.set_format("torch")
    print('Prep done')
    return formatted_dataset


def init():
    """Fine-tune the AzerBert checkpoint on ``constitution.txt`` and save it.

    Loads the tokenizer and model, builds the training dataset from the plain
    text file, trains with a masked-language-modeling objective, and writes
    the model and tokenizer to ``./fine-tuned_model``.

    Raises:
        ValueError: If ``constitution.txt`` contains no text.
    """
    # Local import: these names are not part of the module's import line.
    from transformers import BertForMaskedLM, DataCollatorForLanguageModeling

    # Load the model and tokenizer
    model_name = "language-ml-lab/AzerBert"  # Replace with your model name if different
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # A bare BertModel has no head and returns no loss, so Trainer cannot
    # optimize it. The masked-LM head provides a loss for plain-text training.
    model = BertForMaskedLM.from_pretrained(model_name)
    chunk_size = 512

    # Load the plain text (replace with your actual loading logic)
    with open("constitution.txt", "r", encoding="utf-8") as f:
        constitution_text = f.read()
    if not constitution_text.strip():
        raise ValueError("constitution.txt is empty; nothing to train on")

    # Pass the tokenizer explicitly; it is a local of this function and is
    # not visible to prepare_text_dataset otherwise.
    train_dataset = prepare_text_dataset(constitution_text, chunk_size, tokenizer)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",  # Adjust output directory
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjust training epochs
        per_device_train_batch_size=1,  # Adjust batch size based on your GPU memory
        save_steps=500,
        save_total_limit=2,
    )

    # Randomly masks tokens and builds the matching `labels` for each batch.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("./fine-tuned_model")
    tokenizer.save_pretrained("./fine-tuned_model")
|