{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Setting Up the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "source_language = \"en\"\n", "target_language = \"ln\"  # ln is the language code of Lingala.\n", "lc = False  # If True, lowercase the data.\n", "seed = 42  # Random seed for shuffling.\n", "tag = \"baseline\"  # Unique run name, so models you have already submitted are not overwritten.\n", "\n", "# Export the settings so the shell (`!`) commands in later cells can read them.\n", "os.environ[\"src\"] = source_language\n", "os.environ[\"tgt\"] = target_language\n", "os.environ[\"tag\"] = tag\n", "\n", "# Everything is saved directly on the VM; no Google Drive mount is used.\n", "# Build the output directory name once so the folder on disk and $gdrive_path cannot drift apart.\n", "output_dir = \"%s-%s-%s\" % (source_language, target_language, tag)\n", "os.makedirs(output_dir, exist_ok=True)\n", "os.environ[\"gdrive_path\"] = output_dir" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "en-ln-baseline\r\n" ] } ], "source": [ "!echo $gdrive_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Downloading the corpus data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Alignment file /proj/nlpl/data/OPUS/JW300/latest/xml/en-ln.xml.gz not found. The following files are available for downloading:\n", "\n", " 5 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en-ln.xml.gz\n", " 263 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en.zip\n", " 60 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/ln.zip\n", "\n", " 328 MB Total size\n", "./JW300_latest_xml_en-ln.xml.gz ... 100% of 5 MB\n", "./JW300_latest_xml_en.zip ... 100% of 263 MB\n", "./JW300_latest_xml_ln.zip ... 100% of 60 MB\n" ] } ], "source": [ "# Downloading our corpus\n", "! 
opus_read -d JW300 -s $src -t $tgt -wm moses -w jw300.$src jw300.$tgt -q\n", "\n", "# Extract the alignment file; -f overwrites a copy left by an earlier run so this cell can be re-executed.\n", "! gunzip -f JW300_latest_xml_$src-$tgt.xml.gz" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2020-01-27 07:54:38-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.192.133, 151.101.128.133, 151.101.64.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.192.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 277791 (271K) [text/plain]\n", "Saving to: ‘test.en-any.en’\n", "\n", "test.en-any.en 100%[===================>] 271.28K --.-KB/s in 0.04s \n", "\n", "2020-01-27 07:54:39 (6.66 MB/s) - ‘test.en-any.en’ saved [277791/277791]\n", "\n", "--2020-01-27 07:54:39-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-ln.en\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.192.133, 151.101.128.133, 151.101.64.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.192.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 205304 (200K) [text/plain]\n", "Saving to: ‘test.en-ln.en’\n", "\n", "test.en-ln.en 100%[===================>] 200.49K --.-KB/s in 0.04s \n", "\n", "2020-01-27 07:54:39 (4.92 MB/s) - ‘test.en-ln.en’ saved [205304/205304]\n", "\n", "--2020-01-27 07:54:39-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-ln.ln\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.192.133, 151.101.128.133, 151.101.64.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.192.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 231696 (226K) [text/plain]\n", "Saving to: ‘test.en-ln.ln’\n", "\n", "test.en-ln.ln 100%[===================>] 226.27K --.-KB/s in 0.04s \n", "\n", "2020-01-27 07:54:40 (5.27 MB/s) - ‘test.en-ln.ln’ saved [231696/231696]\n", "\n" ] } ], "source": [ "# Global English test set; its sentences are filtered out of the training/dev data below.\n", "# -O makes a re-run overwrite the file instead of leaving test.en-any.en.1, .2, ... next to a stale copy.\n", "! wget -O test.en-any.en https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en\n", "\n", "# And the specific test set for this language pair.\n", "os.environ[\"trg\"] = target_language\n", "os.environ[\"src\"] = source_language\n", "\n", "! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$trg.en\n", "! mv test.en-$trg.en test.en\n", "! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$trg.$trg\n", "! mv test.en-$trg.$trg test.$trg" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 3571 global test sentences to filter from the training/dev data.\n" ] } ], "source": [ "# Read the test data to filter from train and dev splits.\n", "# Store the English portion in a set for quick filtering checks.\n", "en_test_sents = set()\n", "filter_test_sents = \"test.en-any.en\"\n", "j = 0  # number of lines read (duplicates included)\n", "# Decode explicitly instead of relying on the platform default encoding\n", "# (assumes the downloaded file is UTF-8 - TODO confirm).\n", "with open(filter_test_sents, encoding=\"utf-8\") as f:\n", "    for line in f:\n", "        en_test_sents.add(line.strip())\n", "        j += 1\n", "print('Loaded {} global test sentences to filter from the training/dev data.'.format(j))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "JW300_latest_xml_en-ln.xml baseline_2020.ipynb jw300.ln\t test.ln\r\n", "JW300_latest_xml_en.zip en-ln-baseline\t test.en\r\n", "JW300_latest_xml_ln.zip jw300.en\t\t test.en-any.en\r\n" ] } ], "source": [ "!ls" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Building the model dataset" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Loaded data and skipped 6663/601113 lines since contained in test set.\n" ] }, { "data": { "text/html": [ "
\n", " | source_sentence | \n", "target_sentence | \n", "
---|---|---|
0 | \n", "Who Wants to Be a Millionaire ? | \n", "Nani alingi kozala milionere ? | \n", "
1 | \n", "THE answer seems to be : almost everybody . | \n", "EYANO emonani lokola ete , wana ezali mposa ya... | \n", "
2 | \n", "And the easiest way to become one — accordin... | \n", "Nzokande , na makanisi ya bato , nzela ya pɛtɛ... | \n", "
3 | \n", "Pandering to prevailing taste — and wanting ... | \n", "Kolamusáká mposa ya bato mingi — mpe kolulák... | \n", "
4 | \n", "A few people do become millionaires . | \n", "Mwa babɛti na yango bazali mpenza kokóma bamil... | \n", "
5 | \n", "One Englishman had filled out soccer coupons f... | \n", "Mongelesi moko oyo azalaki kosala momekano na ... | \n", "
6 | \n", "For a stake of 50 cents , he won nearly $ 1.5 ... | \n", "Abɛtaki bobele na mosolo mokokani na franka 3 ... | \n", "
7 | \n", "Even more spectacular was the payoff for a wom... | \n", "Oyo ekamwisi mpenza ezali likambo ya mwasi oyo... | \n", "
8 | \n", "But they are exceptions . | \n", "Kasi baoyo bazali kolónga boye bazali sé moke ... | \n", "
9 | \n", "More typical is the middle - aged Spanish cl... | \n", "Ndakisa emonisi yango malamu ezali mosali na b... | \n", "