{
  "cells": [
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "b3_hlnrYh30E"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Acknowledgement:\n",
        "Thanks to @RajKKapadia <br>\n",
        "Link: https://github.com/RajKKapadia/Transformers-Text-Classification-BERT-Blog"
      ],
      "metadata": {
        "id": "zK4VsKufh4gZ"
      }
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XLhB2j_Hemio"
      },
      "source": [
        "## Read the dataset csv file"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hgYEtrYgemir",
        "outputId": "d3ddedc7-8bd7-4ba9-c82e-68e4eb1309c3"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Unnamed: 0</th>\n",
              "      <th>Text</th>\n",
              "      <th>target</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0.0</td>\n",
              "      <td>polis tangkap</td>\n",
              "      <td>NonCyberbully</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1.0</td>\n",
              "      <td>kenapa lokasi kebakaran terlalu spesifik</td>\n",
              "      <td>NonCyberbully</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2.0</td>\n",
              "      <td>menyesal tanya nak for birthday</td>\n",
              "      <td>NonCyberbully</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3.0</td>\n",
              "      <td>meriah tah</td>\n",
              "      <td>NonCyberbully</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>4.0</td>\n",
              "      <td>asal bs kelar kerja jam sik kl baru diajak mee...</td>\n",
              "      <td>NonCyberbully</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   Unnamed: 0                                               Text  \\\n",
              "0         0.0                                     polis tangkap    \n",
              "1         1.0          kenapa lokasi kebakaran terlalu spesifik    \n",
              "2         2.0                   menyesal tanya nak for birthday    \n",
              "3         3.0                                        meriah tah    \n",
              "4         4.0  asal bs kelar kerja jam sik kl baru diajak mee...   \n",
              "\n",
              "          target  \n",
              "0  NonCyberbully  \n",
              "1  NonCyberbully  \n",
              "2  NonCyberbully  \n",
              "3  NonCyberbully  \n",
              "4  NonCyberbully  "
            ]
          },
          "execution_count": 3,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "import pandas as pd\n",
        "df = pd.read_csv('C:/Users/user/Documents/PSM/BERT_Ver2/Transformers-Text-Classification-BERT-Blog-main/input/Tagged_MixedNew.csv')\n",
        "df.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fGUtFkVfemit"
      },
      "source": [
        "## Process the data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7C3uRWECemiu",
        "outputId": "8e764d84-010d-4e42-987a-af7162627f6e",
        "colab": {
          "referenced_widgets": [
            "042c8b0b8dcf42eb84660c93778d8ea7",
            "4ab6074437a849f79be038b043025283",
            "9aed4d88c18e4e28a1efbbed94331228"
          ]
        }
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "042c8b0b8dcf42eb84660c93778d8ea7",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading (…)okenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "C:\\Users\\user\\anaconda3\\lib\\site-packages\\huggingface_hub\\file_download.py:133: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\user\\.cache\\huggingface\\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
            "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
            "  warnings.warn(message)\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "4ab6074437a849f79be038b043025283",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/233k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "9aed4d88c18e4e28a1efbbed94331228",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "#from transformers import BertTokenizer\n",
        "#tokenizer = BertTokenizer.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased')\n",
        "\n",
        "from transformers import AutoTokenizer\n",
        "tokenizer = AutoTokenizer.from_pretrained('mesolitica/bert-base-standard-bahasa-cased')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ks3XobW0emiu"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score\n",
        "import torch\n",
        "from transformers import TrainingArguments, Trainer\n",
        "from transformers import BertTokenizer, BertForSequenceClassification"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0ZZx6mUdemiv"
      },
      "outputs": [],
      "source": [
        "def process_data(row):\n",
        "\n",
        "    text = row['Text']\n",
        "    text = str(text)\n",
        "    text = ' '.join(text.split())\n",
        "\n",
        "    encodings = tokenizer(text, padding=\"max_length\", truncation=True, max_length=128)\n",
        "\n",
        "    label = 0\n",
        "    if row['target'] == 'Cyberbully':\n",
        "        label += 1\n",
        "\n",
        "    encodings['label'] = label\n",
        "    encodings['Text'] = text\n",
        "\n",
        "    return encodings"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MaFmqSc-emiv",
        "outputId": "03eb6491-b646-45dd-ef3d-318c81313430"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "{'input_ids': [2, 2039, 3058, 9857, 1606, 1164, 2161, 8062, 1219, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 0, 'Text': 'Saya suka masakan beliau dan cara penyampaiannya'}\n"
          ]
        }
      ],
      "source": [
        "print(process_data({\n",
        "    'Text': 'Saya suka masakan beliau dan cara penyampaiannya',\n",
        "    'target': 'NonCyberbully'\n",
        "}))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Lel-2lqKemiw"
      },
      "outputs": [],
      "source": [
        "processed_data = []\n",
        "\n",
        "for i in range(len(df[:1383])):\n",
        "    processed_data.append(process_data(df.iloc[i]))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "x_DGsKzHemiw"
      },
      "source": [
        "## Generate the dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oc_NsbnXemiw"
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "new_df = pd.DataFrame(processed_data)\n",
        "\n",
        "train_df, valid_df = train_test_split(\n",
        "    new_df,\n",
        "    test_size=0.2,\n",
        "    random_state=2022\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4qSci5CRemix"
      },
      "outputs": [],
      "source": [
        "import pyarrow as pa\n",
        "from datasets import Dataset\n",
        "\n",
        "train_hg = Dataset(pa.Table.from_pandas(train_df))\n",
        "valid_hg = Dataset(pa.Table.from_pandas(valid_df))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xDgnim7iemix",
        "outputId": "59858161-59a4-4731-fbfc-7e30a1246eed"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "Dataset({\n",
              "    features: ['Text', 'attention_mask', 'input_ids', 'label', 'token_type_ids', '__index_level_0__'],\n",
              "    num_rows: 277\n",
              "})"
            ]
          },
          "execution_count": 12,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "valid_hg"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8Uqq0cKKemiy"
      },
      "source": [
        "## Create a model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QQkDAXmRemiz",
        "outputId": "e00faff0-c7d7-456d-dab2-73d9839c0274",
        "colab": {
          "referenced_widgets": [
            "b9faad28a43547029c8b13ab639f8d05",
            "6175ea4206304020823d86e0bbc23298"
          ]
        }
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "b9faad28a43547029c8b13ab639f8d05",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading (…)lve/main/config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "6175ea4206304020823d86e0bbc23298",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at mesolitica/bert-base-standard-bahasa-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
            "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mesolitica/bert-base-standard-bahasa-cased and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ]
        }
      ],
      "source": [
        "#from transformers import BertForSequenceClassification\n",
        "\n",
        "#model = BertForSequenceClassification.from_pretrained(\n",
        "#    'malay-huggingface/bert-tiny-bahasa-cased',\n",
        "#    num_labels=2\n",
        "#)\n",
        "\n",
        "\n",
        "from transformers import AutoModelForSequenceClassification\n",
        "\n",
        "model = AutoModelForSequenceClassification.from_pretrained(\n",
        "    'mesolitica/bert-base-standard-bahasa-cased',\n",
        "    num_labels=2\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ifvtnwBMemi1"
      },
      "outputs": [],
      "source": [
        "def compute_metrics(p):\n",
        "    print(type(p))\n",
        "    pred, labels = p\n",
        "    pred = np.argmax(pred, axis=1)\n",
        "\n",
        "    accuracy = accuracy_score(y_true=labels, y_pred=pred)\n",
        "    recall = recall_score(y_true=labels, y_pred=pred)\n",
        "    precision = precision_score(y_true=labels, y_pred=pred)\n",
        "    f1 = f1_score(y_true=labels, y_pred=pred)\n",
        "\n",
        "    return {\"accuracy\": accuracy, \"precision\": precision, \"recall\": recall, \"f1\": f1}\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "50Xy9P7Remi2"
      },
      "outputs": [],
      "source": [
        "from transformers import TrainingArguments, Trainer\n",
        "\n",
        "training_args = TrainingArguments(output_dir=\"./result\", evaluation_strategy=\"epoch\")\n",
        "\n",
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=train_hg,\n",
        "    eval_dataset=valid_hg,\n",
        "    tokenizer=tokenizer,\n",
        "    compute_metrics=compute_metrics\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "myIstfgJemi3"
      },
      "source": [
        "## Train and Evaluate the model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-UtAkNHUemi4",
        "outputId": "5af038f3-a77c-41eb-e48d-747a8e776e38"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "C:\\Users\\user\\anaconda3\\lib\\site-packages\\transformers\\optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
            "  warnings.warn(\n",
            "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='417' max='417' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [417/417 56:36, Epoch 3/3]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              " <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "      <th>Accuracy</th>\n",
              "      <th>Precision</th>\n",
              "      <th>Recall</th>\n",
              "      <th>F1</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.493876</td>\n",
              "      <td>0.779783</td>\n",
              "      <td>0.657343</td>\n",
              "      <td>0.886792</td>\n",
              "      <td>0.755020</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.542367</td>\n",
              "      <td>0.870036</td>\n",
              "      <td>0.850000</td>\n",
              "      <td>0.801887</td>\n",
              "      <td>0.825243</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.725669</td>\n",
              "      <td>0.848375</td>\n",
              "      <td>0.820000</td>\n",
              "      <td>0.773585</td>\n",
              "      <td>0.796117</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "<class 'transformers.trainer_utils.EvalPrediction'>\n",
            "<class 'transformers.trainer_utils.EvalPrediction'>\n",
            "<class 'transformers.trainer_utils.EvalPrediction'>\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "TrainOutput(global_step=417, training_loss=0.2771467213436282, metrics={'train_runtime': 3405.0836, 'train_samples_per_second': 0.974, 'train_steps_per_second': 0.122, 'total_flos': 218053287129600.0, 'train_loss': 0.2771467213436282, 'epoch': 3.0})"
            ]
          },
          "execution_count": 16,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "trainer.train()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fZYGhNyremi4",
        "outputId": "5119c379-d7e9-48f7-9137-d788f99a3731"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='35' max='35' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [35/35 00:43]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "<class 'transformers.trainer_utils.EvalPrediction'>\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "{'eval_loss': 0.7256694436073303,\n",
              " 'eval_accuracy': 0.8483754512635379,\n",
              " 'eval_precision': 0.82,\n",
              " 'eval_recall': 0.7735849056603774,\n",
              " 'eval_f1': 0.796116504854369,\n",
              " 'eval_runtime': 44.9419,\n",
              " 'eval_samples_per_second': 6.164,\n",
              " 'eval_steps_per_second': 0.779,\n",
              " 'epoch': 3.0}"
            ]
          },
          "execution_count": 17,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "trainer.evaluate()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tlw24Ccdemi5"
      },
      "source": [
        "## Save the model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "69n4eVBHemi6"
      },
      "outputs": [],
      "source": [
        "model.save_pretrained('./model/')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gC9qDoERemi6",
        "outputId": "a5514df7-d322-48b9-df27-c799dca6d884"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Looking in indexes: https://download.pytorch.org/whl/cu117\n",
            "Requirement already satisfied: torch in c:\\users\\user\\anaconda3\\lib\\site-packages (2.0.1+cu118)\n",
            "Requirement already satisfied: torchvision in c:\\users\\user\\anaconda3\\lib\\site-packages (0.15.2+cu117)\n",
            "Requirement already satisfied: torchaudio in c:\\users\\user\\anaconda3\\lib\\site-packages (2.0.2+cu117)\n",
            "Requirement already satisfied: sympy in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (1.11.1)\n",
            "Requirement already satisfied: jinja2 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (3.1.2)\n",
            "Requirement already satisfied: filelock in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (3.9.0)\n",
            "Requirement already satisfied: networkx in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (2.5.1)\n",
            "Requirement already satisfied: typing-extensions in c:\\users\\user\\anaconda3\\lib\\site-packages (from torch) (4.4.0)\n",
            "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (9.4.0)\n",
            "Requirement already satisfied: numpy in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (1.23.5)\n",
            "Requirement already satisfied: requests in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (2.28.1)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from jinja2->torch) (2.1.1)\n",
            "Requirement already satisfied: decorator<5,>=4.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from networkx->torch) (4.4.2)\n",
            "Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2.0.4)\n",
            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (1.26.14)\n",
            "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2.10)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2022.12.7)\n",
            "Requirement already satisfied: mpmath>=0.19 in c:\\users\\user\\anaconda3\\lib\\site-packages (from sympy->torch) (1.2.1)\n"
          ]
        }
      ],
      "source": [
        "!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3NBugUKAemi7"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-W3_K_Kjemi7"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yMiT54Ddemi7"
      },
      "source": [
        "## Load the model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mEFnUaM3emi7"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import AutoModelForSequenceClassification\n",
        "\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "new_model = AutoModelForSequenceClassification.from_pretrained('./model/').to(device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zkDeulcTemi8",
        "outputId": "2500b324-398b-471b-9c08-48fa79ea9de3"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "ERROR: torch-1.0.1-cp36-cp36m-win_amd64.whl is not a supported wheel on this platform.\n",
            "\n",
            "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
            "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: torchvision in c:\\users\\user\\anaconda3\\lib\\site-packages (0.14.0)\n",
            "Requirement already satisfied: typing-extensions in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (4.1.1)\n",
            "Requirement already satisfied: requests in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (2.27.1)\n",
            "Requirement already satisfied: torch==1.13.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (1.13.0)\n",
            "Requirement already satisfied: numpy in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (1.24.2)\n",
            "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from torchvision) (9.0.1)\n",
            "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (3.3)\n",
            "Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (2022.9.24)\n",
            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->torchvision) (1.26.9)\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n",
            "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
            "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
          ]
        }
      ],
      "source": [
        "!pip install https://download.pytorch.org/whl/cpu/torch-1.0.1-cp36-cp36m-win_amd64.whl\n",
        "!pip install torchvision"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WtI-WDBhemi8"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "new_tokenizer = AutoTokenizer.from_pretrained('mesolitica/bert-base-standard-bahasa-cased')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "S2X_uPYJemi9"
      },
      "source": [
        "## Get predictions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qXKQEiWxemi9"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import numpy as np\n",
        "\n",
        "def get_prediction(text):\n",
        "    encoding = new_tokenizer(text, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=128)\n",
        "    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}\n",
        "\n",
        "    outputs = new_model(**encoding)\n",
        "\n",
        "    logits = outputs.logits\n",
        "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "    sigmoid = torch.nn.Sigmoid()\n",
        "    print(sigmoid)\n",
        "    probs = sigmoid(logits.squeeze().cpu())\n",
        "    probs = probs.detach().numpy()\n",
        "    label = np.argmax(probs, axis=-1)\n",
        "\n",
        "    if label == 1:\n",
        "        return {\n",
        "            'Target': 'Cyberbully',\n",
        "            'probability': probs[1]\n",
        "        }\n",
        "    else:\n",
        "        return {\n",
        "            'Target': 'Not Cyberbully',\n",
        "            'probability': probs[0]\n",
        "        }"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NcYq4vmVemi9"
      },
      "outputs": [],
      "source": [
        "# dir()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CS_2FfAeemi_",
        "outputId": "106776a5-fced-4329-aa1f-5970a4a71386"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Sigmoid()\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "{'Target': 'Cyberbully', 'probability': 0.9651532}"
            ]
          },
          "execution_count": 24,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "get_prediction('Aku malas kerja dengan orang macam ni menyusahkan orang je')"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.9"
    },
    "vscode": {
      "interpreter": {
        "hash": "173fe52379437b78f95c8980b8ee9f2930fd7b56889ab31a72735475ddc10c81"
      }
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}