{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Dependencies: numpy, the HF datasets loader, and the GPT-2 tokenizer/model plus Trainer utilities used below\n", "import numpy as np\n", "from datasets import load_dataset\n", "from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the DailyDialog dataset\n", "dataset = load_dataset('daily_dialog')\n", "\n", "# Concatenate all utterances within a dialogue and map to 'dialog' key\n", "# NOTE(review): joining turns with a plain space erases turn boundaries; DialoGPT\n", "# conventionally separates turns with tokenizer.eos_token -- TODO confirm the space\n", "# join is intended before training on this text.\n", "def concatenate_utterances(example):\n", " # 'dialog' arrives as a list of utterance strings; flatten it to a single string in place\n", " example['dialog'] = \" \".join(example['dialog'])\n", " return example\n", "\n", "# Apply the function to all examples in the dataset\n", "dataset = dataset.map(concatenate_utterances)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Load the tokenizer and model\n", "tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')\n", "# GPT-2 ships with no dedicated pad token, so reuse eos as the pad token for batching\n", "tokenizer.pad_token = tokenizer.eos_token\n", "model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5d576321ac974a118f75b83cd8437256", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1000 [00:00