In [None]:
! pip install datasets -q
! pip install transformers -q 
! pip install evaluate -q
! pip install accelerate -q

In [None]:
# Mount Google Drive inside the Colab runtime so project files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder; relative paths used later in the
# notebook (e.g. the Trainer output directory) are resolved from here.
%cd "/content/drive/MyDrive/Colab Notebooks/Project"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Project


In [None]:
import os

import transformers
from datasets import load_dataset, ClassLabel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import numpy as np
import evaluate

# Loading the Dataset

In [None]:
# Filing-date windows (start, end) used to carve the train and validation splits
# out of the HUPD sample configuration.
train_window = ('2016-01-01', '2016-01-21')
val_window = ('2016-01-22', '2016-01-31')

# NOTE(review): the keyword is spelled `icpr_label` here; confirm against the HUPD
# loading script that this is the spelling it expects.
# NOTE(review): the metadata URL points at a `/blob/` page; confirm the loader
# resolves it to the raw file.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date=train_window[0],
    train_filing_end_date=train_window[1],
    val_filing_start_date=val_window[0],
    val_filing_end_date=val_window[1],
)



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Pull the two splits produced by the loader out of the DatasetDict.
raw_training_data, validation_data = (
    dataset_dict["train"],
    dataset_dict["validation"],
)

# Filtering Dataset to only include the relevant variables

In [None]:
features_to_remove = ['patent_number', 'title', 'background', 'summary', 'description', 'cpc_label', 
                      'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id']


def _drop_unused_and_rename_target(split):
  """Drop the columns listed in `features_to_remove` and rename 'decision' to 'labels'.

  Only columns that are still present are touched, so re-running this cell on
  splits that were already processed is a no-op instead of raising.
  """
  # Removing irrelevant columns (skipping any that a previous run already removed)
  still_present = [name for name in features_to_remove if name in split.column_names]
  if still_present:
    split = split.remove_columns(still_present)

  # Renaming Column names to match expected input
  if 'decision' in split.column_names:
    split = split.rename_column('decision', 'labels')
  return split


raw_training_data = _drop_unused_and_rename_target(raw_training_data)
validation_data = _drop_unused_and_rename_target(validation_data)

# Converting Dataset labels to encoded values

In [None]:
def _with_encoded_labels(split):
  """Return `split` with its "labels" column cast to a three-class ClassLabel."""
  schema = split.features.copy()
  schema["labels"] = ClassLabel(names = ["REJECTED", "PENDING", "ACCEPTED"])
  return split.cast(schema)


# Apply the same label schema to both splits so their class ids agree.
raw_training_data = _with_encoded_labels(raw_training_data)
validation_data = _with_encoded_labels(validation_data)



# Getting a Pre-Trained Model

In [None]:
model_name = 'distilbert-base-cased'

# Class names listed in id order; both lookup tables are derived from this single
# sequence so they cannot drift apart.
label2id = {name: idx for idx, name in enumerate(("REJECTED", "PENDING", "ACCEPTED"))}
id2label = {idx: name for name, idx in label2id.items()}

# Pre-trained encoder with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels = len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Tokenizer matching the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.w

In [None]:
def tokenize_function(data, text_tokenizer=None):
  """Tokenize a batch of examples as (abstract, claims) text pairs.

  The previous version tokenized the abstracts and then immediately overwrote
  that result with the tokenized claims, so the abstracts never reached the
  model. Both texts are now encoded together as one paired input, padded to the
  tokenizer's maximum length and truncated when the pair is too long.

  Args:
    data: Batch mapping with "abstract" and "claims" entries (lists of strings
      when used with `Dataset.map(..., batched=True)`).
    text_tokenizer: Optional tokenizer callable to use; defaults to the
      notebook-level `tokenizer`.

  Returns:
    Whatever the tokenizer returns for the paired inputs.
  """
  if text_tokenizer is None:
    text_tokenizer = tokenizer
  return text_tokenizer(
      data["abstract"],
      data["claims"],
      padding = "max_length",
      truncation = True,
  )

In [None]:
# Tokenize both splits in batches; the training split is processed first.
tokenized_training_data, tokenized_validation_data = (
    split.map(tokenize_function, batched = True)
    for split in (raw_training_data, validation_data)
)



Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [None]:
# Drop the raw text columns now that they have been tokenized.
text_columns = ["abstract", "claims"]
training_data = tokenized_training_data.remove_columns(text_columns)
validation_data = tokenized_validation_data.remove_columns(text_columns)

# Have both datasets return torch tensors when indexed.
for split in (training_data, validation_data):
  split.set_format("torch")

In [None]:
# Optional (disabled): uncomment to build smaller, shuffled subsets of each split
# (1000 training / 750 validation examples, fixed seed) for quicker experiments.
# smaller_training_data = training_data.shuffle(seed = 129).select(range(1000))
# smaller_validation_data = validation_data.shuffle(seed = 129).select(range(750))

In [None]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: Pair of (logits, labels) as handed over by the Trainer.

  Returns:
    The result of the `accuracy` metric for the arg-max class predictions.
  """
  scores, references = eval_pred
  # The predicted class is the index of the highest logit in each row.
  predicted_ids = np.argmax(scores, axis=1)
  return accuracy.compute(references=references, predictions=predicted_ids)

In [None]:
training_args = TrainingArguments(
    # Checkpoints are written here, relative to the current working directory.
    output_dir="Bert-Patent-Model-2",
    # Stored on the arguments object only: Trainer does not act on this field by
    # itself, it takes effect only when passed to trainer.train(resume_from_checkpoint=...).
    resume_from_checkpoint="./Bert-Patent-Model/checkpoint-504",
    # Optimisation: 4 samples x 16 accumulation steps = 64 samples per optimizer
    # step on each device.
    num_train_epochs=12,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    optim="adafactor",
    weight_decay=0.01,
    fp16=True,
    # Evaluate and checkpoint once per epoch; the matching strategies are what
    # allow the best checkpoint to be reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model = model,
    tokenizer=tokenizer,
    train_dataset=training_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
)

In [None]:
# Set the Transformers library log level to INFO so its informational messages are shown.
transformers.logging.set_verbosity_info()

In [None]:
# The checkpoint path configured on TrainingArguments is not picked up by
# train() automatically; it has to be passed in explicitly to actually resume.
checkpoint_dir = trainer.args.resume_from_checkpoint
if checkpoint_dir and not os.path.isdir(checkpoint_dir):
  # Be explicit about the fallback instead of failing on a missing directory.
  print(f"Checkpoint {checkpoint_dir!r} not found; training from scratch.")
  checkpoint_dir = None

trainer.train(resume_from_checkpoint=checkpoint_dir)

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.932718,0.556081
1,0.713200,1.062583,0.537387
2,0.713200,1.149405,0.545854
3,0.484300,1.394087,0.518474
4,0.484300,1.625637,0.520013
5,0.234500,1.928906,0.534638
6,0.234500,2.10189,0.535188
7,0.113600,2.447903,0.521553
8,0.113600,2.633792,0.512756
9,0.052100,3.018095,0.52925


TrainOutput(global_step=3024, training_loss=0.26791910230916327, metrics={'train_runtime': 4668.6932, 'train_samples_per_second': 41.518, 'train_steps_per_second': 0.648, 'total_flos': 2.563329616742707e+16, 'train_loss': 0.26791910230916327, 'epoch': 11.98})