{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ac3a07af-2b66-41c8-8548-6f951460aedb", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "from datasets import load_dataset\n", "\n", "# Never hardcode the Hugging Face token: read it from the HF_TOKEN environment variable.\n", "# If the variable is unset this is None and `load_dataset` uses its default credential lookup.\n", "ACCESS_TOKEN = os.environ.get(\"HF_TOKEN\")\n", "# Number of leading rows of the train split to load.\n", "NUM_SAMPLES = 500000\n", "\n", "# Load the first NUM_SAMPLES rows of the `ur` config of CulturaX.\n", "dataset = load_dataset(\"uonlp/CulturaX\",\n", " \"ur\",\n", " split=f\"train[:{NUM_SAMPLES}]\",\n", " token = ACCESS_TOKEN\n", " )" ] }, { "cell_type": "code", "execution_count": 2, "id": "b6515d96-1129-4aac-a670-796fee9302db", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text', 'timestamp', 'url', 'source'],\n", " num_rows: 500000\n", "})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "13831730-9fc1-4d89-b4fd-060ce0a976cb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text'],\n", " num_rows: 500000\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove columns other than text\n", "\n", "dataset = dataset.remove_columns([col for col in dataset.column_names if col != 'text'])\n", "dataset" ] }, { "cell_type": "code", "execution_count": 4, "id": "69466306-5190-4581-82fc-c5839bf15a80", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "500000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset)" ] }, { "cell_type": "markdown", "id": "25249452-d545-45ce-8c47-a6ee4b20eee1", "metadata": {}, "source": [ "Curiously, I found out that number of rows counted using \"wc -l file.csv\" in a Linux terminal gives number of lines, not number of rows. 
See comment in https://stackoverflow.com/questions/32913151/is-it-possible-to-get-the-number-of-rows-in-a-csv-file-without-opening-it" ] }, { "cell_type": "code", "execution_count": 6, "id": "68b6087b-0c27-4a9f-bc6c-1317a87c3f3f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████| 500000/500000 [00:31<00:00, 16019.97it/s]\n" ] } ], "source": [ "from tqdm import tqdm\n", "\n", "for idx in tqdm(range(NUM_SAMPLES)):\n", " with open(f'data/culturaX_ur_500k/ur_sample_{idx}.txt', 'w') as file:\n", " file.write(dataset[idx][\"text\"])" ] }, { "cell_type": "markdown", "id": "a3bd917f-a70f-421d-9680-33ee676f193b", "metadata": {}, "source": [ "### Bengali" ] }, { "cell_type": "code", "execution_count": 1, "id": "7c6a1722-7f4f-436b-a8b3-612f24483ee5", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3d90843e65214727bf8ccf27c76caac9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Resolving data files: 0%| | 0/18 [00:00