Baseline code using BERT and recording of code walkthrough
#3
pinned
by
janbelke
- opened
Hi everyone,
Here's the link to a recording (YouTube) of the baseline code walkthrough using BERT.
Below is the code for your reference:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
class ClassificationDataset:
    """Torch-style map dataset over movie-synopsis records.

    Wraps an indexable collection of rows with "synopsis" and "genre"
    fields and tokenizes each synopsis on the fly, returning tensors in
    the shape a `transformers.Trainer` expects.
    """

    def __init__(self, data, tokenizer):
        # data: indexable rows with "synopsis" (text) and "genre" (int label)
        self.data = data
        # tokenizer: HF tokenizer callable returning input_ids / attention_mask
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        row = self.data[item]
        # Fixed-length encoding: pad/truncate every synopsis to 128 tokens.
        encoded = self.tokenizer(
            str(row["synopsis"]),
            max_length=128,
            padding="max_length",
            truncation=True,
        )
        return {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(int(row["genre"]), dtype=torch.long),
        }
def compute_metrics(eval_pred):
    """Accuracy metric for `transformers.Trainer` evaluation.

    `eval_pred` is a (logits, labels) pair; predictions are taken as the
    argmax class over the logits and scored with sklearn's accuracy.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    return {"accuracy": metrics.accuracy_score(labels, predicted)}
def train():
    """Fine-tune bert-base-uncased on the movie-genre-prediction dataset.

    Downloads the dataset, splits off a stratified 20% validation set,
    trains for one epoch with accuracy-based best-model selection, then
    predicts on the test split and writes `submission.csv` with string
    genre labels.
    """
    ds = load_dataset("datadrivenscience/movie-genre-prediction")
    # Encode the string genre column as ClassLabel ints so we can stratify on it.
    ds = ds.class_encode_column("genre")
    ds_test = ds["test"]

    # Stratified 80/20 train/validation split; seeded so runs are reproducible.
    split = ds["train"].train_test_split(
        test_size=0.2, stratify_by_column="genre", seed=42
    )
    ds_train = split["train"]
    ds_val = split["test"]

    # Hoist the ClassLabel feature; used for num_labels and id -> name mapping.
    genre_feature = ds_train.features["genre"]

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        # Public `.names` API, not the private `_int2str` attribute.
        num_labels=len(genre_feature.names),
    )

    train_dataset = ClassificationDataset(ds_train, tokenizer)
    valid_dataset = ClassificationDataset(ds_val, tokenizer)
    test_dataset = ClassificationDataset(ds_test, tokenizer)

    args = TrainingArguments(
        "model",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        save_total_limit=1,
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Predict on the test split and map class ids straight to genre names,
    # building the submission column once instead of writing ints and then
    # rewriting them in place.
    logits = trainer.predict(test_dataset).predictions
    pred_ids = np.argmax(logits, axis=1)
    submission = pd.DataFrame(
        {
            "id": ds_test["id"],
            "genre": [genre_feature.int2str(int(i)) for i in pred_ids],
        }
    )
    submission.to_csv("submission.csv", index=False)
# Script entry point: run the full training + submission pipeline.
if __name__ == "__main__":
    train()
abhishek
pinned discussion
This comment has been hidden