{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "730ba509", "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\"" ] }, { "cell_type": "code", "execution_count": 2, "id": "d9acd4b6", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import sys\n", "proj_dir = Path.cwd().parent\n", "\n", "sys.path.append(str(proj_dir))\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "62452860", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 10, "id": "9264a232", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration derek-thomas--dataset-creator-askreddit-806417599346c17a\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-806417599346c17a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b65ec8c7f33a40eeac5d15e6a527f830", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
scorenum_commentstitlepermalinkselftexturlcreated_utcauthoriddownsups
024Reddit, if someone had to describe you to a st.../r/AskReddit/comments/15sn6y/reddit_if_someone...They would be talking about you without your p...http://www.reddit.com/r/AskReddit/comments/15s...2013-01-01 23:59:40[deleted]15sn6y02
1524What kind of car does the average \\nRedditor d.../r/AskReddit/comments/15sn6m/what_kind_of_car_...I've always wanted to know what kind of car th...http://www.reddit.com/r/AskReddit/comments/15s...2013-01-01 23:59:31PaytonAdams15sn6m05
215What movies have made you go back to the theat.../r/AskReddit/comments/15sn6b/what_movies_have_...http://www.reddit.com/r/AskReddit/comments/15s...2013-01-01 23:59:20[deleted]15sn6b01
3018Worst fear(s)?/r/AskReddit/comments/15sn4u/worst_fears/So what is your worst fear, reddit?http://www.reddit.com/r/AskReddit/comments/15s...2013-01-01 23:58:37[deleted]15sn4u00
41129If there was a type of ink that lasted only fo.../r/AskReddit/comments/15sn44/if_there_was_a_ty...http://www.reddit.com/r/AskReddit/comments/15s...2013-01-01 23:58:15Honeybeard15sn44011
....................................
329362811Help me get an idea of cost of living/r/AskReddit/comments/2cjj63/help_me_get_an_id...http://www.reddit.com/r/AskReddit/comments/2cj...2014-08-04 00:01:20bbent46982cjj6301
329362920If you used a prism to separate light and then.../r/AskReddit/comments/2cjj5v/if_you_used_a_pri...http://www.reddit.com/r/AskReddit/comments/2cj...2014-08-04 00:01:19Ajmb_882cjj5v02
3293630011Reddit, what was it like the first time you go.../r/AskReddit/comments/2cjj4s/reddit_what_was_i...http://www.reddit.com/r/AskReddit/comments/2cj...2014-08-04 00:01:01da-gonzo2cjj4s00
329363114523140People who refuse to be organ donors, why do y.../r/AskReddit/comments/2cjj31/people_who_refuse...R.I.P my inboxhttp://www.reddit.com/r/AskReddit/comments/2cj...2014-08-04 00:00:36JohnnySniperr2cjj3101452
329363229What always happens when you travel abroad?/r/AskReddit/comments/2cjj2a/what_always_happe...http://www.reddit.com/r/AskReddit/comments/2cj...2014-08-04 00:00:23Nicopip2cjj2a02
\n", "

3293633 rows × 11 columns

\n", "" ], "text/plain": [ " score num_comments \\\n", "0 2 4 \n", "1 5 24 \n", "2 1 5 \n", "3 0 18 \n", "4 11 29 \n", "... ... ... \n", "3293628 1 1 \n", "3293629 2 0 \n", "3293630 0 11 \n", "3293631 1452 3140 \n", "3293632 2 9 \n", "\n", " title \\\n", "0 Reddit, if someone had to describe you to a st... \n", "1 What kind of car does the average \\nRedditor d... \n", "2 What movies have made you go back to the theat... \n", "3 Worst fear(s)? \n", "4 If there was a type of ink that lasted only fo... \n", "... ... \n", "3293628 Help me get an idea of cost of living \n", "3293629 If you used a prism to separate light and then... \n", "3293630 Reddit, what was it like the first time you go... \n", "3293631 People who refuse to be organ donors, why do y... \n", "3293632 What always happens when you travel abroad? \n", "\n", " permalink \\\n", "0 /r/AskReddit/comments/15sn6y/reddit_if_someone... \n", "1 /r/AskReddit/comments/15sn6m/what_kind_of_car_... \n", "2 /r/AskReddit/comments/15sn6b/what_movies_have_... \n", "3 /r/AskReddit/comments/15sn4u/worst_fears/ \n", "4 /r/AskReddit/comments/15sn44/if_there_was_a_ty... \n", "... ... \n", "3293628 /r/AskReddit/comments/2cjj63/help_me_get_an_id... \n", "3293629 /r/AskReddit/comments/2cjj5v/if_you_used_a_pri... \n", "3293630 /r/AskReddit/comments/2cjj4s/reddit_what_was_i... \n", "3293631 /r/AskReddit/comments/2cjj31/people_who_refuse... \n", "3293632 /r/AskReddit/comments/2cjj2a/what_always_happe... \n", "\n", " selftext \\\n", "0 They would be talking about you without your p... \n", "1 I've always wanted to know what kind of car th... \n", "2 \n", "3 So what is your worst fear, reddit? \n", "4 \n", "... ... \n", "3293628 \n", "3293629 \n", "3293630 \n", "3293631 R.I.P my inbox \n", "3293632 \n", "\n", " url \\\n", "0 http://www.reddit.com/r/AskReddit/comments/15s... \n", "1 http://www.reddit.com/r/AskReddit/comments/15s... \n", "2 http://www.reddit.com/r/AskReddit/comments/15s... 
\n", "3 http://www.reddit.com/r/AskReddit/comments/15s... \n", "4 http://www.reddit.com/r/AskReddit/comments/15s... \n", "... ... \n", "3293628 http://www.reddit.com/r/AskReddit/comments/2cj... \n", "3293629 http://www.reddit.com/r/AskReddit/comments/2cj... \n", "3293630 http://www.reddit.com/r/AskReddit/comments/2cj... \n", "3293631 http://www.reddit.com/r/AskReddit/comments/2cj... \n", "3293632 http://www.reddit.com/r/AskReddit/comments/2cj... \n", "\n", " created_utc author id downs ups \n", "0 2013-01-01 23:59:40 [deleted] 15sn6y 0 2 \n", "1 2013-01-01 23:59:31 PaytonAdams 15sn6m 0 5 \n", "2 2013-01-01 23:59:20 [deleted] 15sn6b 0 1 \n", "3 2013-01-01 23:58:37 [deleted] 15sn4u 0 0 \n", "4 2013-01-01 23:58:15 Honeybeard 15sn44 0 11 \n", "... ... ... ... ... ... \n", "3293628 2014-08-04 00:01:20 bbent4698 2cjj63 0 1 \n", "3293629 2014-08-04 00:01:19 Ajmb_88 2cjj5v 0 2 \n", "3293630 2014-08-04 00:01:01 da-gonzo 2cjj4s 0 0 \n", "3293631 2014-08-04 00:00:36 JohnnySniperr 2cjj31 0 1452 \n", "3293632 2014-08-04 00:00:23 Nicopip 2cjj2a 0 2 \n", "\n", "[3293633 rows x 11 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = dataset['all_days'].to_pandas()\n", "df" ] }, { "cell_type": "code", "execution_count": 16, "id": "b5bbfa15", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "score Int64\n", "num_comments Int64\n", "title string\n", "permalink string\n", "selftext string\n", "url string\n", "created_utc string\n", "author string\n", "id string\n", "downs Int64\n", "ups Int64\n", "dtype: object" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.convert_dtypes().dtypes" ] }, { "cell_type": "code", "execution_count": 18, "id": "c4292c7c", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 21, "id": "5a516c19", "metadata": {}, "outputs": [], "source": [ "df['created_utc'] = 
pd.to_datetime(df['created_utc'])\n", "df['date'] = df['created_utc'].dt.date\n", "df['time'] = df['created_utc'].dt.time" ] }, { "cell_type": "code", "execution_count": 25, "id": "22d87986", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.date.hist(bins=400)" ] }, { "cell_type": "code", "execution_count": 26, "id": "19d6539b", "metadata": {}, "outputs": [], "source": [ "new_df = df.drop_duplicates(subset=['id'], keep=\"first\")" ] }, { "cell_type": "code", "execution_count": null, "id": "466cd2c7", "metadata": {}, "outputs": [], "source": [ "# Histogram of post dates after de-duplicating on `id`; uses the same bin count\n", "# as the df.date histogram so the two plots can be compared directly.\n", "new_df.date.hist(bins=400)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 5 }