{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "MACHINE TRANSLATION FOR KISWAHILI SAWA CORPUS USING JOEY NMT " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# TODO: Choose the source and target languages, given as language codes (here \"en\" and \"sw\").\n", "# The codes also become the suffixes of every vocab and corpus file used throughout.\n", "import os\n", "\n", "source_language = \"en\"\n", "target_language = \"sw\"\n", "seed = 42  # Random seed for shuffling.\n", "lc = False  # Lowercase the data when True.\n", "tag = \"baseline\"  # Unique folder name, so models you have already submitted are not overwritten.\n", "\n", "# Export the settings to the environment as well, because the shell (`!`) commands read them.\n", "os.environ.update({\"src\": source_language, \"tgt\": target_language, \"tag\": tag})\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mThe directory '/home/freshiasackey_gmail_com/.cache/pip/http' or its parent directory is not owned by the current user and the cache has been disabled. Please check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.\u001b[0m\n", "\u001b[33mThe directory '/home/freshiasackey_gmail_com/.cache/pip' or its parent directory is not owned by the current user and caching wheels has been disabled. check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.\u001b[0m\n", "Requirement already satisfied (use --upgrade to upgrade): opustools-pkg in /home/freshiasackey_gmail_com/.local/lib/python3.6/site-packages\n", "\u001b[33mYou are using pip version 8.1.1, however version 20.1.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "# Install opus-tools\n", "! 
pip3 install opustools-pkg" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Alignment file /proj/nlpl/data/OPUS/JW300/latest/xml/en-sw.xml.gz not found. The following files are available for downloading:\n", "\n", " 8 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en-sw.xml.gz\n", " 263 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en.zip\n", " 94 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/sw.zip\n", "\n", " 365 MB Total size\n", "./JW300_latest_xml_en-sw.xml.gz ... 100% of 8 MB\n", "./JW300_latest_xml_en.zip ... 100% of 263 MB\n", "./JW300_latest_xml_sw.zip ... 100% of 94 MB\n" ] } ], "source": [ "# Downloading JW300 corpus\n", "! opus_read -d JW300 -s $src -t $tgt -wm moses -w jw300.$src jw300.$tgt -q\n", "\n", "# extract the corpus file\n", "! gunzip JW300_latest_xml_$src-$tgt.xml.gz" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mThe directory '/home/freshiasackey_gmail_com/.cache/pip/http' or its parent directory is not owned by the current user and the cache has been disabled. Please check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.\u001b[0m\n", "\u001b[33mThe directory '/home/freshiasackey_gmail_com/.cache/pip' or its parent directory is not owned by the current user and caching wheels has been disabled. check the permissions and owner of that directory. 
If executing pip with sudo, you may want sudo's -H flag.\u001b[0m\n", "Requirement already satisfied (use --upgrade to upgrade): pandas in /home/freshiasackey_gmail_com/.local/lib/python3.6/site-packages\n", "Requirement already satisfied (use --upgrade to upgrade): numpy>=1.13.3 in /home/freshiasackey_gmail_com/.local/lib/python3.6/site-packages (from pandas)\n", "Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2.6.1 in /home/freshiasackey_gmail_com/.local/lib/python3.6/site-packages (from pandas)\n", "Requirement already satisfied (use --upgrade to upgrade): pytz>=2017.2 in /home/freshiasackey_gmail_com/.local/lib/python3.6/site-packages (from pandas)\n", "Requirement already satisfied (use --upgrade to upgrade): six>=1.5 in /home/freshiasackey_gmail_com/.local/lib/python3.6/site-packages (from python-dateutil>=2.6.1->pandas)\n", "\u001b[33mYou are using pip version 8.1.1, however version 20.1.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "! pip3 install pandas\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2020-06-02 19:47:29-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 277791 (271K) [text/plain]\n", "Saving to: ‘test.en-any.en’\n", "\n", "test.en-any.en 100%[===================>] 271.28K --.-KB/s in 0.002s \n", "\n", "2020-06-02 19:47:30 (108 MB/s) - ‘test.en-any.en’ saved [277791/277791]\n", "\n", "--2020-06-02 19:47:30-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-sw.en\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 206336 (202K) [text/plain]\n", "Saving to: ‘test.en-sw.en’\n", "\n", "test.en-sw.en 100%[===================>] 201.50K --.-KB/s in 0.007s \n", "\n", "2020-06-02 19:47:30 (27.9 MB/s) - ‘test.en-sw.en’ saved [206336/206336]\n", "\n", "--2020-06-02 19:47:30-- https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-sw.sw\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 214836 (210K) [text/plain]\n", "Saving to: ‘test.en-sw.sw’\n", "\n", "test.en-sw.sw 100%[===================>] 209.80K --.-KB/s in 0.002s \n", "\n", "2020-06-02 19:47:30 (104 MB/s) - ‘test.en-sw.sw’ saved [214836/214836]\n", "\n" ] } ], "source": [ "# Download the global test set.\n", "! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en\n", " \n", "# And the specific test set for this language pair.\n", "os.environ[\"trg\"] = target_language \n", "os.environ[\"src\"] = source_language \n", "\n", "! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$trg.en \n", "! 
mv test.en-$trg.en test.en\n", "! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$trg.$trg \n", "! mv test.en-$trg.$trg test.$trg" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 3571 global test sentences to filter from the training/dev data.\n" ] } ], "source": [ "# Read the test data to filter from train and dev splits.\n", "# Store the English portion in a set for quick membership checks.\n", "filter_test_sents = \"test.en-any.en\"\n", "# Decode explicitly as UTF-8 so the result does not depend on the machine's locale.\n", "with open(filter_test_sents, encoding=\"utf-8\") as f:\n", "    test_lines = [line.strip() for line in f]\n", "en_test_sents = set(test_lines)\n", "j = len(test_lines)  # Number of lines read; duplicates collapse inside the set.\n", "print('Loaded {} global test sentences to filter from the training/dev data.'.format(j))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Loading JW + Sawa data to df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded data and skipped 6478/979526 lines since contained in test set.\n", "973049\n", "973049\n", "[\"'No'\", \"'Source_language'\", \"'Target_language'\"]\n", "0\n", "1458206\n", "1458206\n", "cat\n", "\n", "nyau\n", "\n", "be foolish stupid\n", "\n", "zuzuwaa\n", "\n", "Dear Shareholders It is my pleasure to present to you the first Annual Report and Audited Financial Statements of AccessKenya Group following our successful listing on the Nairobi Stock Exchange in June 2007. \n", "\n", "WapendwaWenyehisa Nifuraha yangu kuu kuwasilisha kwenu taarifa ya kwanza ya kila mwaka na taarifa za ukaguzi wa hesabu za pesa za kundi la kampuni yaAccess Kenya kufuatia ufanisi wetu wa kuorodheshwa kwenye soko la hisa la Nairobi mwezi Juni mwaka wa 2007. 
\n", "\n", "Overriding all of these factors was an assessment of how effectively the target could roll out our new IT services solutions to the existingAccessKenya corporate customer base of approximately 2,000 customers.\n", "\n", "Maswala yaliyopita haya yote ni ukadiriaji wa jinsi lengo hili litakavyoleta suluhisho kwa huduma zetu mpya za teknolojia ya habari IT kwa kiwango cha wateja wa kampuni ya Access Kenya wapatao elfu 2. \n", "\n", "Tail\n" ] }, { "data": { "text/html": [ "
\n", " | source_sentence | \n", "target_sentence | \n", "
---|---|---|
1458196 | \n", "idiot\\n | \n", "zuzu\\n | \n", "
1458197 | \n", "blockhead\\n | \n", "zuzu\\n | \n", "
1458198 | \n", "simp-leton\\n | \n", "zuzu\\n | \n", "
1458199 | \n", "disturb\\n | \n", "zuzua\\n | \n", "
1458200 | \n", "derange\\n | \n", "zuzua\\n | \n", "
1458201 | \n", "infatuate\\n | \n", "zuzua\\n | \n", "
1458202 | \n", "craze\\n | \n", "zuzua\\n | \n", "
1458203 | \n", "vaunt fool flatter oneself\\n | \n", "zuzuka\\n | \n", "
1458204 | \n", "sink into oblivion\\n | \n", "zuzuwaa\\n | \n", "
1458205 | \n", "be foolish stupid\\n | \n", "zuzuwaa\\n | \n", "