{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 829,
"status": "ok",
"timestamp": 1641588786523,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "YELVqGxMxnbG",
"outputId": "876761c1-2e03-411b-e61b-07ac4ad61377"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wed Dec 28 20:57:11 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 515.86.01 Driver Version: 515.86.01 CUDA Version: 11.7 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 NVIDIA GeForce ... Off | 00000000:0A:00.0 On | N/A |\n",
"| 0% 31C P8 36W / 390W | 1401MiB / 24576MiB | 3% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| 0 N/A N/A 1267 G /usr/lib/xorg/Xorg 574MiB |\n",
"| 0 N/A N/A 2054 G /usr/bin/kwalletd5 4MiB |\n",
"| 0 N/A N/A 2222 G ...ec/xdg-desktop-portal-kde 4MiB |\n",
"| 0 N/A N/A 2259 G /usr/bin/ksmserver 4MiB |\n",
"| 0 N/A N/A 2261 G /usr/bin/kded5 4MiB |\n",
"| 0 N/A N/A 2262 G /usr/bin/kwin_x11 97MiB |\n",
"| 0 N/A N/A 2309 G /usr/bin/plasmashell 130MiB |\n",
"| 0 N/A N/A 2332 G ...de-authentication-agent-1 4MiB |\n",
"| 0 N/A N/A 2399 G ...x-gnu/libexec/kdeconnectd 4MiB |\n",
"| 0 N/A N/A 2401 G .../usr/bin/telegram-desktop 7MiB |\n",
"| 0 N/A N/A 2415 G /usr/bin/kaccess 4MiB |\n",
"| 0 N/A N/A 2421 G .../libexec/DiscoverNotifier 4MiB |\n",
"| 0 N/A N/A 2438 G ...1/usr/lib/firefox/firefox 216MiB |\n",
"| 0 N/A N/A 2626 G /usr/bin/dolphin 4MiB |\n",
"| 0 N/A N/A 2774 G /usr/bin/dolphin 4MiB |\n",
"| 0 N/A N/A 2824 G /usr/bin/dolphin 4MiB |\n",
"| 0 N/A N/A 3559 G /usr/bin/dolphin 4MiB |\n",
"| 0 N/A N/A 3665 G /usr/bin/dolphin 4MiB |\n",
"| 0 N/A N/A 4830 G ...RendererForSitePerProcess 308MiB |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
],
"source": [
"gpu_info = !nvidia-smi\n",
"gpu_info = '\\n'.join(gpu_info)\n",
"if gpu_info.find('failed') >= 0:\n",
" print('Not connected to a GPU')\n",
"else:\n",
" print(gpu_info)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "c8eh87Hoee5d"
},
"outputs": [],
"source": [
"#%%capture\n",
"#!pip install datasets==1.13.3\n",
"#!pip install transformers==4.11.3\n",
"#!pip install huggingface_hub==0.1\n",
"#!pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html\n",
"#!pip install jiwer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 5334,
"status": "ok",
"timestamp": 1641588811766,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "2MMXcWFFgCXU",
"outputId": "be9fd72e-4395-4cd0-ff87-631dad046e71"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Reusing dataset common_voice_10_0 (/home/robinhad/.cache/huggingface/datasets/mozilla-foundation___common_voice_10_0/uk/10.0.0/27df768ab1b5cac48a7616f145b79b62599167b0ffa2e054bf4c3e74e9619e5e)\n",
"Reusing dataset common_voice_10_0 (/home/robinhad/.cache/huggingface/datasets/mozilla-foundation___common_voice_10_0/uk/10.0.0/27df768ab1b5cac48a7616f145b79b62599167b0ffa2e054bf4c3e74e9619e5e)\n"
]
}
],
"source": [
"from datasets import load_dataset, load_metric, Audio\n",
"\n",
"common_voice_train = load_dataset(\"mozilla-foundation/common_voice_10_0\", \"uk\", split=\"train\", use_auth_token=True)\n",
"common_voice_test = load_dataset(\"mozilla-foundation/common_voice_10_0\", \"uk\", split=\"test\", use_auth_token=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
" num_rows: 11463\n",
"})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_voice_train"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_voice_train.cleanup_cache_files()\n",
"common_voice_test.cleanup_cache_files()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "kbyq6lDgQc2a"
},
"outputs": [],
"source": [
"common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n",
"common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "72737oog2F6U"
},
"outputs": [],
"source": [
"from datasets import ClassLabel\n",
"import random\n",
"import pandas as pd\n",
"from IPython.display import display, HTML\n",
"\n",
"def show_random_elements(dataset, num_examples=10):\n",
" assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n",
" picks = []\n",
" for _ in range(num_examples):\n",
" pick = random.randint(0, len(dataset)-1)\n",
" while pick in picks:\n",
" pick = random.randint(0, len(dataset)-1)\n",
" picks.append(pick)\n",
" \n",
" df = pd.DataFrame(dataset[picks])\n",
" display(HTML(df.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"executionInfo": {
"elapsed": 39,
"status": "ok",
"timestamp": 1641588811771,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "K_JUmf3G3b9S",
"outputId": "8603c909-09e1-43ae-f7c2-b27b25d795a3"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" | \n",
" sentence | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" У червоних — невдачі на фронті. | \n",
"
\n",
" \n",
" 1 | \n",
" Він нагадував недавні ночі в кам'янському парку. | \n",
"
\n",
" \n",
" 2 | \n",
" Творення займенників | \n",
"
\n",
" \n",
" 3 | \n",
" Коли бідний жениться, ніч мала. | \n",
"
\n",
" \n",
" 4 | \n",
" Чорнота і Бугай злізли з дерев і пішли назирцем. | \n",
"
\n",
" \n",
" 5 | \n",
" крик. | \n",
"
\n",
" \n",
" 6 | \n",
" Крім того, мало не завжди погода примушує його десь заночувати. | \n",
"
\n",
" \n",
" 7 | \n",
" Така вже мода тепер. | \n",
"
\n",
" \n",
" 8 | \n",
" Летить що має сили до вікна і — грим грудьми до шибки. | \n",
"
\n",
" \n",
" 9 | \n",
" Ворожа лава проминула вже балку, а Василенко не стріляв. | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 30,
"status": "ok",
"timestamp": 1641588811775,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "XIHocAuTQbBR",
"outputId": "e8392853-e0d1-45ba-df74-065c50565654"
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cac2ebe21a844f7c8d3699f811555e9c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12 [00:00, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7068bb21616a4fd3b4eb1976653787d1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/7 [00:00, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"filter_func = lambda x: not (\"joki\" in x or \"ы\" in x)\n",
"common_voice_train = common_voice_train.filter(filter_func, input_columns=[\"sentence\"])\n",
"common_voice_test = common_voice_test.filter(filter_func, input_columns=[\"sentence\"])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "ZcVsD0ETElrR"
},
"outputs": [
{
"data": {
"text/plain": [
"{'sentence': \"привіт як у тебе справи загалом м'якотілий друже\"}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def cleaner(batch):\n",
" replace_as_space = \"!:;,—…–“”?\\\"«»\"\n",
" special_words = {\n",
" \"ХIХ\": \"дев'ятнадцятого\",\n",
" \"Linux\": \"Лінукс\",\n",
" \"Maace\": \"Маасе\",\n",
" \"м 'ясо\": \"м'ясо\",\n",
" \"'іде\": \"іде\",\n",
" \"Д'Аламбер\": \"даламбер\",\n",
" \" - \": \" \",\n",
" \"--\": \" \",\n",
" \"....\": \" \",\n",
" \"...\": \" \",\n",
" \"..\": \" \",\n",
" \" '\": \" \",\n",
" \"О'\": \"о\",\n",
" \"-\": \" \" #further check needed\n",
" }\n",
" # check abbreviations later\n",
" abbreviations = {\n",
" 'ЧК': \"чека\",\n",
" 'ҐПУ': \"ґепеу\",\n",
" 'ЄС.': \"єес\",\n",
" 'УНР': \"уенер\",\n",
" 'ДТП.': \"детепе\",\n",
" 'РНБО': \"еренбео\",\n",
" 'СРСР': \"есересер\",\n",
" 'ДБР': \"дебеер\",\n",
" 'КП': \"капе\",\n",
" 'ОС': \"оес\",\n",
" } \n",
" chars_dict = {\n",
" \"C\": \"С\",\n",
" \"I\": \"І\",\n",
" \"P\": \"Р\",\n",
" \"a\": \"а\",\n",
" \"e\": \"е\",\n",
" \"x\": \"х\",\n",
" \"y\": \"у\",\n",
" \"p\": \"р\",\n",
" \"o\": \"о\",\n",
" \"i\": \"і\",\n",
" \"\\u0301\": \"\",\n",
" \"`\": \"'\",\n",
" \"՚\": \"'\",\n",
" \".\": \" \",\n",
" \"’\": \"'\"\n",
" \n",
" }\n",
" for word in special_words.keys():\n",
" batch[\"sentence\"] = batch[\"sentence\"].replace(word, special_words[word])\n",
" for word in abbreviations.keys():\n",
" batch[\"sentence\"] = batch[\"sentence\"].replace(word, abbreviations[word])\n",
" for char in chars_dict.keys():\n",
" batch[\"sentence\"] = batch[\"sentence\"].replace(char, chars_dict[char])\n",
" for char in replace_as_space:\n",
" batch[\"sentence\"] = batch[\"sentence\"].replace(char, \" \")\n",
" batch[\"sentence\"] = \" \".join(filter(lambda x: x != \"\", batch[\"sentence\"].strip().lower().split(\" \")))\n",
" return batch\n",
"\n",
"sentence = {\"sentence\": \"Привіт, - як у тебе справи загалом, м'якотілий друже?\"}\n",
"cleaner(sentence)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 401
},
"executionInfo": {
"elapsed": 32,
"status": "ok",
"timestamp": 1641588811774,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "6falIJSBED65",
"outputId": "2f0ca829-dbfa-4d70-ee4a-ded2ae342117"
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "21f88692aec04acea6056893c8b6b1bc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/11463 [00:00, ?ex/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" | \n",
" sentence | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" отаманенко почав пояснювати з наукової точки але дід перебив його | \n",
"
\n",
" \n",
" 1 | \n",
" енею глуздівно сказав | \n",
"
\n",
" \n",
" 2 | \n",
" ні розвідки вперед ні стежі до лісу | \n",
"
\n",
" \n",
" 3 | \n",
" ну ну та я нічого не кажу | \n",
"
\n",
" \n",
" 4 | \n",
" якось прийшов зв'язковий із мельників | \n",
"
\n",
" \n",
" 5 | \n",
" я хоч не з мельників так мені оповідав батько був козаком у холодному яру | \n",
"
\n",
" \n",
" 6 | \n",
" ну бо | \n",
"
\n",
" \n",
" 7 | \n",
" макітру одділив од плеч | \n",
"
\n",
" \n",
" 8 | \n",
" видно стріляла розвідка ударників що йшла із собакою попереду групи | \n",
"
\n",
" \n",
" 9 | \n",
" левко слабий лежить просить щоб зайшов | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"show_random_elements(common_voice_train.map(cleaner).remove_columns([\"path\", \"audio\"]), num_examples=10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading cached processed dataset at /home/robinhad/.cache/huggingface/datasets/mozilla-foundation___common_voice_10_0/uk/10.0.0/27df768ab1b5cac48a7616f145b79b62599167b0ffa2e054bf4c3e74e9619e5e/cache-96af4ec6cf30f0d6.arrow\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "217b24de248145d3af8d71497dc39b6e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6783 [00:00, ?ex/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"common_voice_train = common_voice_train.map(cleaner)\n",
"common_voice_test = common_voice_test.map(cleaner)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"executionInfo": {
"elapsed": 24,
"status": "ok",
"timestamp": 1641588811775,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "RBDRAAYxRE6n",
"outputId": "a16beae1-84e6-4388-d601-2ed3a92bf451"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" | \n",
" sentence | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" чому алгоритм зупиниться | \n",
"
\n",
" \n",
" 1 | \n",
" конем | \n",
"
\n",
" \n",
" 2 | \n",
" наступного дня нас прийняли на службу до міліції | \n",
"
\n",
" \n",
" 3 | \n",
" я знав що це неправда | \n",
"
\n",
" \n",
" 4 | \n",
" і взявши з запічка кресало | \n",
"
\n",
" \n",
" 5 | \n",
" скоріше б на гору бо тачанки ар'єргарду вже відкрили вогонь | \n",
"
\n",
" \n",
" 6 | \n",
" удень відтягалася вглиб села залишаючи наглядати за виходом із міста неозброєних | \n",
"
\n",
" \n",
" 7 | \n",
" чи співвідноситься це твердження з поняттям карми | \n",
"
\n",
" \n",
" 8 | \n",
" селяни мусили шукати бодай якоїсь їжі | \n",
"
\n",
" \n",
" 9 | \n",
" більшість убитих і полонених були одягнені в селянські кожухи | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"show_random_elements(common_voice_train.remove_columns([\"path\",\"audio\"]))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "LwCshNbbeRZR"
},
"outputs": [],
"source": [
"def extract_all_chars(batch):\n",
" all_text = \" \".join(batch[\"sentence\"])\n",
" vocab = list(set(all_text))\n",
" return {\"vocab\": [vocab], \"all_text\": [all_text]}"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 81,
"referenced_widgets": [
"116786d9364a4a57b521cddaabeda688",
"9baa2f69aa9c4387bf1086a04ed78420",
"a1e2c04dc2cb45ea80bec125e3dbf56f",
"b6d46d40efa14b21814f41531f5a2f41",
"d8bf8dc5d6c84140a4e96c9c435b8f17",
"04ec68b059df4c628839c3ac29e2ebdd",
"427056895c674c428400bee0f5b43995",
"d518f2c2ab6945b78a6d336dad6262bd",
"77f1a51099b24831ad8b2be3d2dc833a",
"5815ae1348994bfebba4a8e968489a96",
"22ba979142074f1d976e1a905544fd2d",
"8b6b7f28751c45c8869aa86eb2a0ab26",
"445c84e1e2e541f2a54fb989def386ae",
"68502fb433564eee8dfdf272ed7e4f56",
"1f3abdf2e0f6459da4179a94d691c4c4",
"48c60be3ca9349a295b83f65769c7f27",
"6c80bd8a8fe14a5989fe27445c14650f",
"5c2a7fea8c434d51ada69a0854b88baf",
"414efa8a08cd491cb78af8a95a151daa",
"c31a747e18df4b4aa4449a30e387448c",
"3dedffa30b774426bd474072a3a0d591",
"05d8496d54174ae298c319b0194fc710"
]
},
"executionInfo": {
"elapsed": 560,
"status": "ok",
"timestamp": 1641588812313,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "_m6uUjjcfbjH",
"outputId": "4cc94e18-9295-4414-c611-c98916fe3d4d"
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3324cb796c2e4ac582a6ba5386336e8f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b27a77fd9fe54cceba55cc3de23fac60",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
"vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"id": "aQfneNsmlJI0"
},
"outputs": [],
"source": [
"vocab_list = list(set(vocab_train[\"vocab\"][0]) | set(vocab_test[\"vocab\"][0]))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 18,
"status": "ok",
"timestamp": 1641588812314,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "_0kRndSvqaKk",
"outputId": "35c48e76-5060-470b-8405-bd6d288296ea"
},
"outputs": [
{
"data": {
"text/plain": [
"{' ': 0,\n",
" \"'\": 1,\n",
" 'а': 2,\n",
" 'б': 3,\n",
" 'в': 4,\n",
" 'г': 5,\n",
" 'д': 6,\n",
" 'е': 7,\n",
" 'ж': 8,\n",
" 'з': 9,\n",
" 'и': 10,\n",
" 'й': 11,\n",
" 'к': 12,\n",
" 'л': 13,\n",
" 'м': 14,\n",
" 'н': 15,\n",
" 'о': 16,\n",
" 'п': 17,\n",
" 'р': 18,\n",
" 'с': 19,\n",
" 'т': 20,\n",
" 'у': 21,\n",
" 'ф': 22,\n",
" 'х': 23,\n",
" 'ц': 24,\n",
" 'ч': 25,\n",
" 'ш': 26,\n",
" 'щ': 27,\n",
" 'ь': 28,\n",
" 'ю': 29,\n",
" 'я': 30,\n",
" 'є': 31,\n",
" 'і': 32,\n",
" 'ї': 33,\n",
" 'ґ': 34}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}\n",
"vocab_dict"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"id": "npbIbBoLgaFX"
},
"outputs": [],
"source": [
"vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
"del vocab_dict[\" \"]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 15,
"status": "ok",
"timestamp": 1641588812316,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "znF0bNunsjbl",
"outputId": "480da4c9-b3d4-41c6-fc5c-b87b8b66202e"
},
"outputs": [
{
"data": {
"text/plain": [
"37"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
"vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
"len(vocab_dict)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"id": "ehyUoh9vk191"
},
"outputs": [],
"source": [
"import json\n",
"with open('vocab.json', 'w') as vocab_file:\n",
" json.dump(vocab_dict, vocab_file)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 8013,
"status": "ok",
"timestamp": 1641588820318,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "xriFGEWQkO4M",
"outputId": "a4497f75-d6f5-411a-d983-2ad519f65b8b"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"from transformers import Wav2Vec2CTCTokenizer\n",
"\n",
"tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"id": "A1XApZBAF2zr"
},
"outputs": [],
"source": [
"repo_name = \"wav2vec2-xls-r-base-uk\""
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "kAR0-2KLkopp"
},
"outputs": [],
"source": [
"from transformers import Wav2Vec2FeatureExtractor\n",
"\n",
"feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"id": "KYZtoW-tlZgl"
},
"outputs": [],
"source": [
"from transformers import Wav2Vec2Processor\n",
"\n",
"processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# save tokenizer to folder\n",
"processor.save_pretrained(repo_name)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"executionInfo": {
"elapsed": 18,
"status": "ok",
"timestamp": 1641588820325,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "TTCS7W6XJ9BG",
"outputId": "18b0d44f-a498-4a79-f0a7-984fae48cad1"
},
"outputs": [
{
"data": {
"text/plain": [
"'/home/robinhad/.cache/huggingface/datasets/downloads/extracted/ee7155196e5d51620d53e48cf58eb693b7839b8ff183604c8bb948d3e0aad92d/cv-corpus-10.0-2022-07-04/uk/clips/common_voice_uk_20907128.mp3'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_voice_train[0][\"path\"]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 863,
"status": "ok",
"timestamp": 1641588821172,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "qj_z5Zc3GAs9",
"outputId": "ace70f42-dcf0-445c-9b81-b23d4089c90d"
},
"outputs": [
{
"data": {
"text/plain": [
"{'path': '/home/robinhad/.cache/huggingface/datasets/downloads/extracted/ee7155196e5d51620d53e48cf58eb693b7839b8ff183604c8bb948d3e0aad92d/cv-corpus-10.0-2022-07-04/uk/clips/common_voice_uk_20907128.mp3',\n",
" 'array': array([ 0.0000000e+00, -3.5002383e-14, 9.4785833e-15, ...,\n",
" -5.0386465e-08, -4.4114326e-08, -1.9402206e-08], dtype=float32),\n",
" 'sampling_rate': 48000}"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_voice_train[0][\"audio\"]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"id": "rrv65aj7G95i"
},
"outputs": [],
"source": [
"common_voice_train = common_voice_train.cast_column(\"audio\", Audio(sampling_rate=16_000))\n",
"common_voice_test = common_voice_test.cast_column(\"audio\", Audio(sampling_rate=16_000))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 31,
"status": "ok",
"timestamp": 1641588821174,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "aKtkc1o_HWHC",
"outputId": "55538536-b8c6-484f-d695-5c8e0492747a"
},
"outputs": [
{
"data": {
"text/plain": [
"{'path': '/home/robinhad/.cache/huggingface/datasets/downloads/extracted/ee7155196e5d51620d53e48cf58eb693b7839b8ff183604c8bb948d3e0aad92d/cv-corpus-10.0-2022-07-04/uk/clips/common_voice_uk_20907128.mp3',\n",
" 'array': array([ 1.00456624e-13, -1.54340042e-13, 7.00158518e-13, ...,\n",
" -1.50335762e-08, -1.92623926e-08, -2.21930367e-08], dtype=float32),\n",
" 'sampling_rate': 16000}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_voice_train[0][\"audio\"]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 80
},
"executionInfo": {
"elapsed": 27,
"status": "ok",
"timestamp": 1641588821175,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "dueM6U7Ev0OA",
"outputId": "8f8e14bf-6d59-43e2-ae2d-525bac8e5097"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"здогадалась дівочити по семій дитині\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import IPython.display as ipd\n",
"import numpy as np\n",
"import random\n",
"\n",
"rand_int = random.randint(0, len(common_voice_train)-1)\n",
"\n",
"print(common_voice_train[rand_int][\"sentence\"])\n",
"ipd.Audio(data=common_voice_train[rand_int][\"audio\"][\"array\"], autoplay=True, rate=16000)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 22,
"status": "ok",
"timestamp": 1641588821176,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "1Po2g7YPuRTx",
"outputId": "ad79ec8a-ab5a-4c52-edfa-a20d0eec2282"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Target text: там і стали на ночівлю\n",
"Input array shape: (36288,)\n",
"Sampling rate: 16000\n"
]
}
],
"source": [
"rand_int = random.randint(0, len(common_voice_train)-1)\n",
"\n",
"print(\"Target text:\", common_voice_train[rand_int][\"sentence\"])\n",
"print(\"Input array shape:\", common_voice_train[rand_int][\"audio\"][\"array\"].shape)\n",
"print(\"Sampling rate:\", common_voice_train[rand_int][\"audio\"][\"sampling_rate\"])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"id": "eJY7I0XAwe9p"
},
"outputs": [],
"source": [
"def prepare_dataset(batch):\n",
" audio = batch[\"audio\"]\n",
"\n",
" # batched output is \"un-batched\"\n",
" batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_values[0]\n",
" batch[\"input_length\"] = len(batch[\"input_values\"])\n",
" \n",
" with processor.as_target_processor():\n",
" batch[\"labels\"] = processor(batch[\"sentence\"]).input_ids\n",
" return batch"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87,
"referenced_widgets": [
"a29f88f174f8499082fbb36a36c47fa4",
"efc3bc0c48124ebeb79d245216eaf0fe",
"d45747150d0b434593a3a7c98399599a",
"ea73f7deb1c643f7b81de7fb7acaaf1b",
"18bc63944343440f837cdff76db004fc",
"9c875952cdd649a5bab87de9bb3f5200",
"aa329cb93df44a6da6012c7cc49d7489",
"b39b6e9131ca4ce3b31e84ceb04e1b83",
"c5eed102ef134a4e8ca41713b82ff6a4",
"e6e50da6516847878309fdc5c463edb3",
"a4ae510b4f3845f891a796cf844fc2bb"
]
},
"executionInfo": {
"elapsed": 107521,
"status": "ok",
"timestamp": 1641588928679,
"user": {
"displayName": "Yurii Paniv",
"photoUrl": "https://lh3.googleusercontent.com/a/default-user=s64",
"userId": "13095662915325887123"
},
"user_tz": -120
},
"id": "-np9xYK-wl8q",
"outputId": "779b4637-0606-4cc8-be3c-16c1c4241e63"
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a4013929a3b945ef9dcd3041f0cc3e91",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/11463 [00:00, ?ex/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "785794e2e56b4260bea488093f20798e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6783 [00:00, ?ex/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names)\n",
"common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"id": "tdHfbUJ_09iA"
},
"outputs": [],
"source": [
"#max_input_length_in_sec = 5.0\n",
"#common_voice_train = common_voice_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=[\"input_length\"])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mkdir: cannot create directory ‘cached_dataset’: File exists\n"
]
}
],
"source": [
"!mkdir cached_dataset"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"common_voice_train.save_to_disk(\"cached_dataset/cv_train\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"common_voice_test.save_to_disk(\"cached_dataset/cv_test\")"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"machine_shape": "hm",
"name": "Копія записника \"Fine-Tune XLS-R on Common Voice.ipynb\"",
"provenance": [
{
"file_id": "https://github.com/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLS_R_on_Common_Voice.ipynb",
"timestamp": 1641583715050
}
]
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "a5cdd9abf8df3af0fd61fdb3838d6c6f2f66a9ba4bf4484f45cd88abf9f04fe9"
}
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"04ec68b059df4c628839c3ac29e2ebdd": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"05d8496d54174ae298c319b0194fc710": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"116786d9364a4a57b521cddaabeda688": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_a1e2c04dc2cb45ea80bec125e3dbf56f",
"IPY_MODEL_b6d46d40efa14b21814f41531f5a2f41",
"IPY_MODEL_d8bf8dc5d6c84140a4e96c9c435b8f17"
],
"layout": "IPY_MODEL_9baa2f69aa9c4387bf1086a04ed78420"
}
},
"18bc63944343440f837cdff76db004fc": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a4ae510b4f3845f891a796cf844fc2bb",
"placeholder": "",
"style": "IPY_MODEL_e6e50da6516847878309fdc5c463edb3",
"value": " 6962/6962 [01:46<00:00, 78.15ex/s]"
}
},
"1f3abdf2e0f6459da4179a94d691c4c4": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c31a747e18df4b4aa4449a30e387448c",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_414efa8a08cd491cb78af8a95a151daa",
"value": 1
}
},
"22ba979142074f1d976e1a905544fd2d": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"3dedffa30b774426bd474072a3a0d591": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"414efa8a08cd491cb78af8a95a151daa": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"427056895c674c428400bee0f5b43995": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"445c84e1e2e541f2a54fb989def386ae": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"48c60be3ca9349a295b83f65769c7f27": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_05d8496d54174ae298c319b0194fc710",
"placeholder": "",
"style": "IPY_MODEL_3dedffa30b774426bd474072a3a0d591",
"value": " 1/1 [00:00<00:00, 11.09ba/s]"
}
},
"5815ae1348994bfebba4a8e968489a96": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"5c2a7fea8c434d51ada69a0854b88baf": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"68502fb433564eee8dfdf272ed7e4f56": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_5c2a7fea8c434d51ada69a0854b88baf",
"placeholder": "",
"style": "IPY_MODEL_6c80bd8a8fe14a5989fe27445c14650f",
"value": "100%"
}
},
"6c80bd8a8fe14a5989fe27445c14650f": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"77f1a51099b24831ad8b2be3d2dc833a": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"8b6b7f28751c45c8869aa86eb2a0ab26": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_68502fb433564eee8dfdf272ed7e4f56",
"IPY_MODEL_1f3abdf2e0f6459da4179a94d691c4c4",
"IPY_MODEL_48c60be3ca9349a295b83f65769c7f27"
],
"layout": "IPY_MODEL_445c84e1e2e541f2a54fb989def386ae"
}
},
"9baa2f69aa9c4387bf1086a04ed78420": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9c875952cdd649a5bab87de9bb3f5200": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"a1e2c04dc2cb45ea80bec125e3dbf56f": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_427056895c674c428400bee0f5b43995",
"placeholder": "",
"style": "IPY_MODEL_04ec68b059df4c628839c3ac29e2ebdd",
"value": "100%"
}
},
"a29f88f174f8499082fbb36a36c47fa4": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_d45747150d0b434593a3a7c98399599a",
"IPY_MODEL_ea73f7deb1c643f7b81de7fb7acaaf1b",
"IPY_MODEL_18bc63944343440f837cdff76db004fc"
],
"layout": "IPY_MODEL_efc3bc0c48124ebeb79d245216eaf0fe"
}
},
"a4ae510b4f3845f891a796cf844fc2bb": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"aa329cb93df44a6da6012c7cc49d7489": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b39b6e9131ca4ce3b31e84ceb04e1b83": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"b6d46d40efa14b21814f41531f5a2f41": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_77f1a51099b24831ad8b2be3d2dc833a",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_d518f2c2ab6945b78a6d336dad6262bd",
"value": 1
}
},
"c31a747e18df4b4aa4449a30e387448c": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c5eed102ef134a4e8ca41713b82ff6a4": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d45747150d0b434593a3a7c98399599a": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_aa329cb93df44a6da6012c7cc49d7489",
"placeholder": "",
"style": "IPY_MODEL_9c875952cdd649a5bab87de9bb3f5200",
"value": "100%"
}
},
"d518f2c2ab6945b78a6d336dad6262bd": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"d8bf8dc5d6c84140a4e96c9c435b8f17": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_22ba979142074f1d976e1a905544fd2d",
"placeholder": "",
"style": "IPY_MODEL_5815ae1348994bfebba4a8e968489a96",
"value": " 1/1 [00:00<00:00, 7.95ba/s]"
}
},
"e6e50da6516847878309fdc5c463edb3": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"ea73f7deb1c643f7b81de7fb7acaaf1b": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c5eed102ef134a4e8ca41713b82ff6a4",
"max": 6962,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_b39b6e9131ca4ce3b31e84ceb04e1b83",
"value": 6962
}
},
"efc3bc0c48124ebeb79d245216eaf0fe": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
}
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}