derek-thomas HF staff committed on
Commit
9de4dba
1 Parent(s): fc00c85

Updates for datetime format and correcting most_recent_date

Browse files
Files changed (3) hide show
  1. main.py +15 -11
  2. notebooks/validate.ipynb +545 -0
  3. utilities/pushshift_data.py +3 -2
main.py CHANGED
@@ -59,7 +59,7 @@ def main(date_to_fetch):
59
  Runs the main data processing function to fetch and process subreddit data for the specified date.
60
 
61
  Args:
62
- date_to_fetch (str): The date to fetch subreddit data for, in the format "YYYY-MM-DD".
63
 
64
  Returns:
65
  most_recent_date (str): Most recent date in dataset
@@ -67,7 +67,7 @@ def main(date_to_fetch):
67
 
68
  # Load the existing dataset from the Hugging Face hub or create a new one
69
  try:
70
- dataset = load_dataset(dataset_name)
71
  logger.info("Loading existing dataset")
72
  if "__index_level_0__" in dataset["all_days"].column_names:
73
  dataset = dataset.remove_columns(["__index_level_0__"])
@@ -76,11 +76,11 @@ def main(date_to_fetch):
76
  dataset = DatasetDict()
77
 
78
  # Call get_subreddit_day with the calculated date
79
- logger.info(f"Fetching data for {date_to_fetch}")
80
- submissions = scrape_submissions_by_day(subreddit, date_to_fetch)
81
  df = submissions_to_dataframe(submissions)
82
- logger.info(f"Data fetched for {date_to_fetch}")
83
- most_recent_date = datetime.strptime(date_to_fetch, '%Y-%m-%d').date()
84
 
85
  # Append DataFrame to split 'all_days' or create new split
86
  if "all_days" in dataset:
@@ -93,10 +93,14 @@ def main(date_to_fetch):
93
 
94
  # Drop duplicates just in case
95
  new_data = new_data.drop_duplicates(subset=['id'], keep="first")
96
- new_data_most_recent_date_raw = new_data['created_utc'].max()
97
- new_data_most_recent_date_dt = datetime.strptime(new_data_most_recent_date_raw.split(' ')[0], '%Y-%m-%d').date()
98
- # Adding timedelta in case there is rounding error
99
- most_recent_date = max(new_data_most_recent_date_dt - timedelta(days=1), most_recent_date)
 
 
 
 
100
 
101
  # Convert back to dataset
102
  dataset["all_days"] = Dataset.from_pandas(new_data)
@@ -133,7 +137,7 @@ def run_main_continuously():
133
 
134
  if start_date <= two_days_ago:
135
  logger.info(f"Running main function for date: {start_date}")
136
- most_recent_date = main(str(start_date))
137
  start_date = most_recent_date + timedelta(days=1)
138
  else:
139
  tomorrow = today + timedelta(days=1)
 
59
  Runs the main data processing function to fetch and process subreddit data for the specified date.
60
 
61
  Args:
62
+ date_to_fetch (datetime.date): The date to fetch subreddit data for
63
 
64
  Returns:
65
  most_recent_date (str): Most recent date in dataset
 
67
 
68
  # Load the existing dataset from the Hugging Face hub or create a new one
69
  try:
70
+ dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
71
  logger.info("Loading existing dataset")
72
  if "__index_level_0__" in dataset["all_days"].column_names:
73
  dataset = dataset.remove_columns(["__index_level_0__"])
 
76
  dataset = DatasetDict()
77
 
78
  # Call get_subreddit_day with the calculated date
79
+ logger.info(f"Fetching data for {str(date_to_fetch)}")
80
+ submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
81
  df = submissions_to_dataframe(submissions)
82
+ logger.info(f"Data fetched for {str(date_to_fetch)}")
83
+ most_recent_date = start_date
84
 
85
  # Append DataFrame to split 'all_days' or create new split
86
  if "all_days" in dataset:
 
93
 
94
  # Drop duplicates just in case
95
  new_data = new_data.drop_duplicates(subset=['id'], keep="first")
96
+
97
+ # Figure out dates when we restart
98
+ old_data_most_recent_date = old_data['date'].max()
99
+ most_recent_date = max(old_data_most_recent_date, most_recent_date)
100
+
101
+ if len(old_data) == len(new_data):
102
+ logger.warning("Data in hub is much more recent, using that next!")
103
+ return most_recent_date
104
 
105
  # Convert back to dataset
106
  dataset["all_days"] = Dataset.from_pandas(new_data)
 
137
 
138
  if start_date <= two_days_ago:
139
  logger.info(f"Running main function for date: {start_date}")
140
+ most_recent_date = main(start_date)
141
  start_date = most_recent_date + timedelta(days=1)
142
  else:
143
  tomorrow = today + timedelta(days=1)
notebooks/validate.ipynb ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "730ba509",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from IPython.core.interactiveshell import InteractiveShell\n",
11
+ "InteractiveShell.ast_node_interactivity = \"all\""
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "d9acd4b6",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "from pathlib import Path\n",
22
+ "import sys\n",
23
+ "proj_dir = Path.cwd().parent\n",
24
+ "\n",
25
+ "sys.path.append(str(proj_dir))\n"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 4,
31
+ "id": "62452860",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "from datasets import load_dataset"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 10,
41
+ "id": "9264a232",
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Using custom data configuration derek-thomas--dataset-creator-askreddit-806417599346c17a\n"
49
+ ]
50
+ },
51
+ {
52
+ "name": "stdout",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "Downloading and preparing dataset None/None to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-806417599346c17a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
56
+ ]
57
+ },
58
+ {
59
+ "data": {
60
+ "application/vnd.jupyter.widget-view+json": {
61
+ "model_id": "b65ec8c7f33a40eeac5d15e6a527f830",
62
+ "version_major": 2,
63
+ "version_minor": 0
64
+ },
65
+ "text/plain": [
66
+ "Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
67
+ ]
68
+ },
69
+ "metadata": {},
70
+ "output_type": "display_data"
71
+ },
72
+ {
73
+ "data": {
74
+ "application/vnd.jupyter.widget-view+json": {
75
+ "model_id": "2d93949f1f0144779349c73c58a68ca9",
76
+ "version_major": 2,
77
+ "version_minor": 0
78
+ },
79
+ "text/plain": [
80
+ "Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
81
+ ]
82
+ },
83
+ "metadata": {},
84
+ "output_type": "display_data"
85
+ },
86
+ {
87
+ "data": {
88
+ "application/vnd.jupyter.widget-view+json": {
89
+ "model_id": "",
90
+ "version_major": 2,
91
+ "version_minor": 0
92
+ },
93
+ "text/plain": [
94
+ "Generating all_days split: 0%| | 0/2468888 [00:00<?, ? examples/s]"
95
+ ]
96
+ },
97
+ "metadata": {},
98
+ "output_type": "display_data"
99
+ },
100
+ {
101
+ "name": "stdout",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "Dataset parquet downloaded and prepared to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-806417599346c17a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
105
+ ]
106
+ },
107
+ {
108
+ "data": {
109
+ "application/vnd.jupyter.widget-view+json": {
110
+ "model_id": "0e62c7e8b3c74aa5af3b87ab17e6cb1f",
111
+ "version_major": 2,
112
+ "version_minor": 0
113
+ },
114
+ "text/plain": [
115
+ " 0%| | 0/1 [00:00<?, ?it/s]"
116
+ ]
117
+ },
118
+ "metadata": {},
119
+ "output_type": "display_data"
120
+ }
121
+ ],
122
+ "source": [
123
+ "dataset = load_dataset('derek-thomas/dataset-creator-askreddit', download_mode=\"reuse_cache_if_exists\", ignore_verifications=True)"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 12,
129
+ "id": "ba84be68",
130
+ "metadata": {},
131
+ "outputs": [
132
+ {
133
+ "data": {
134
+ "text/html": [
135
+ "<div>\n",
136
+ "<style scoped>\n",
137
+ " .dataframe tbody tr th:only-of-type {\n",
138
+ " vertical-align: middle;\n",
139
+ " }\n",
140
+ "\n",
141
+ " .dataframe tbody tr th {\n",
142
+ " vertical-align: top;\n",
143
+ " }\n",
144
+ "\n",
145
+ " .dataframe thead th {\n",
146
+ " text-align: right;\n",
147
+ " }\n",
148
+ "</style>\n",
149
+ "<table border=\"1\" class=\"dataframe\">\n",
150
+ " <thead>\n",
151
+ " <tr style=\"text-align: right;\">\n",
152
+ " <th></th>\n",
153
+ " <th>score</th>\n",
154
+ " <th>num_comments</th>\n",
155
+ " <th>title</th>\n",
156
+ " <th>permalink</th>\n",
157
+ " <th>selftext</th>\n",
158
+ " <th>url</th>\n",
159
+ " <th>created_utc</th>\n",
160
+ " <th>author</th>\n",
161
+ " <th>id</th>\n",
162
+ " <th>downs</th>\n",
163
+ " <th>ups</th>\n",
164
+ " </tr>\n",
165
+ " </thead>\n",
166
+ " <tbody>\n",
167
+ " <tr>\n",
168
+ " <th>0</th>\n",
169
+ " <td>2</td>\n",
170
+ " <td>4</td>\n",
171
+ " <td>Reddit, if someone had to describe you to a st...</td>\n",
172
+ " <td>/r/AskReddit/comments/15sn6y/reddit_if_someone...</td>\n",
173
+ " <td>They would be talking about you without your p...</td>\n",
174
+ " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
175
+ " <td>2013-01-01 23:59:40</td>\n",
176
+ " <td>[deleted]</td>\n",
177
+ " <td>15sn6y</td>\n",
178
+ " <td>0</td>\n",
179
+ " <td>2</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>1</th>\n",
183
+ " <td>5</td>\n",
184
+ " <td>24</td>\n",
185
+ " <td>What kind of car does the average \\nRedditor d...</td>\n",
186
+ " <td>/r/AskReddit/comments/15sn6m/what_kind_of_car_...</td>\n",
187
+ " <td>I've always wanted to know what kind of car th...</td>\n",
188
+ " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
189
+ " <td>2013-01-01 23:59:31</td>\n",
190
+ " <td>PaytonAdams</td>\n",
191
+ " <td>15sn6m</td>\n",
192
+ " <td>0</td>\n",
193
+ " <td>5</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>2</th>\n",
197
+ " <td>1</td>\n",
198
+ " <td>5</td>\n",
199
+ " <td>What movies have made you go back to the theat...</td>\n",
200
+ " <td>/r/AskReddit/comments/15sn6b/what_movies_have_...</td>\n",
201
+ " <td></td>\n",
202
+ " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
203
+ " <td>2013-01-01 23:59:20</td>\n",
204
+ " <td>[deleted]</td>\n",
205
+ " <td>15sn6b</td>\n",
206
+ " <td>0</td>\n",
207
+ " <td>1</td>\n",
208
+ " </tr>\n",
209
+ " <tr>\n",
210
+ " <th>3</th>\n",
211
+ " <td>0</td>\n",
212
+ " <td>18</td>\n",
213
+ " <td>Worst fear(s)?</td>\n",
214
+ " <td>/r/AskReddit/comments/15sn4u/worst_fears/</td>\n",
215
+ " <td>So what is your worst fear, reddit?</td>\n",
216
+ " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
217
+ " <td>2013-01-01 23:58:37</td>\n",
218
+ " <td>[deleted]</td>\n",
219
+ " <td>15sn4u</td>\n",
220
+ " <td>0</td>\n",
221
+ " <td>0</td>\n",
222
+ " </tr>\n",
223
+ " <tr>\n",
224
+ " <th>4</th>\n",
225
+ " <td>11</td>\n",
226
+ " <td>29</td>\n",
227
+ " <td>If there was a type of ink that lasted only fo...</td>\n",
228
+ " <td>/r/AskReddit/comments/15sn44/if_there_was_a_ty...</td>\n",
229
+ " <td></td>\n",
230
+ " <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
231
+ " <td>2013-01-01 23:58:15</td>\n",
232
+ " <td>Honeybeard</td>\n",
233
+ " <td>15sn44</td>\n",
234
+ " <td>0</td>\n",
235
+ " <td>11</td>\n",
236
+ " </tr>\n",
237
+ " <tr>\n",
238
+ " <th>...</th>\n",
239
+ " <td>...</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>...</td>\n",
242
+ " <td>...</td>\n",
243
+ " <td>...</td>\n",
244
+ " <td>...</td>\n",
245
+ " <td>...</td>\n",
246
+ " <td>...</td>\n",
247
+ " <td>...</td>\n",
248
+ " <td>...</td>\n",
249
+ " <td>...</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <th>3293628</th>\n",
253
+ " <td>1</td>\n",
254
+ " <td>1</td>\n",
255
+ " <td>Help me get an idea of cost of living</td>\n",
256
+ " <td>/r/AskReddit/comments/2cjj63/help_me_get_an_id...</td>\n",
257
+ " <td></td>\n",
258
+ " <td>http://www.reddit.com/r/AskReddit/comments/2cj...</td>\n",
259
+ " <td>2014-08-04 00:01:20</td>\n",
260
+ " <td>bbent4698</td>\n",
261
+ " <td>2cjj63</td>\n",
262
+ " <td>0</td>\n",
263
+ " <td>1</td>\n",
264
+ " </tr>\n",
265
+ " <tr>\n",
266
+ " <th>3293629</th>\n",
267
+ " <td>2</td>\n",
268
+ " <td>0</td>\n",
269
+ " <td>If you used a prism to separate light and then...</td>\n",
270
+ " <td>/r/AskReddit/comments/2cjj5v/if_you_used_a_pri...</td>\n",
271
+ " <td></td>\n",
272
+ " <td>http://www.reddit.com/r/AskReddit/comments/2cj...</td>\n",
273
+ " <td>2014-08-04 00:01:19</td>\n",
274
+ " <td>Ajmb_88</td>\n",
275
+ " <td>2cjj5v</td>\n",
276
+ " <td>0</td>\n",
277
+ " <td>2</td>\n",
278
+ " </tr>\n",
279
+ " <tr>\n",
280
+ " <th>3293630</th>\n",
281
+ " <td>0</td>\n",
282
+ " <td>11</td>\n",
283
+ " <td>Reddit, what was it like the first time you go...</td>\n",
284
+ " <td>/r/AskReddit/comments/2cjj4s/reddit_what_was_i...</td>\n",
285
+ " <td></td>\n",
286
+ " <td>http://www.reddit.com/r/AskReddit/comments/2cj...</td>\n",
287
+ " <td>2014-08-04 00:01:01</td>\n",
288
+ " <td>da-gonzo</td>\n",
289
+ " <td>2cjj4s</td>\n",
290
+ " <td>0</td>\n",
291
+ " <td>0</td>\n",
292
+ " </tr>\n",
293
+ " <tr>\n",
294
+ " <th>3293631</th>\n",
295
+ " <td>1452</td>\n",
296
+ " <td>3140</td>\n",
297
+ " <td>People who refuse to be organ donors, why do y...</td>\n",
298
+ " <td>/r/AskReddit/comments/2cjj31/people_who_refuse...</td>\n",
299
+ " <td>R.I.P my inbox</td>\n",
300
+ " <td>http://www.reddit.com/r/AskReddit/comments/2cj...</td>\n",
301
+ " <td>2014-08-04 00:00:36</td>\n",
302
+ " <td>JohnnySniperr</td>\n",
303
+ " <td>2cjj31</td>\n",
304
+ " <td>0</td>\n",
305
+ " <td>1452</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>3293632</th>\n",
309
+ " <td>2</td>\n",
310
+ " <td>9</td>\n",
311
+ " <td>What always happens when you travel abroad?</td>\n",
312
+ " <td>/r/AskReddit/comments/2cjj2a/what_always_happe...</td>\n",
313
+ " <td></td>\n",
314
+ " <td>http://www.reddit.com/r/AskReddit/comments/2cj...</td>\n",
315
+ " <td>2014-08-04 00:00:23</td>\n",
316
+ " <td>Nicopip</td>\n",
317
+ " <td>2cjj2a</td>\n",
318
+ " <td>0</td>\n",
319
+ " <td>2</td>\n",
320
+ " </tr>\n",
321
+ " </tbody>\n",
322
+ "</table>\n",
323
+ "<p>3293633 rows × 11 columns</p>\n",
324
+ "</div>"
325
+ ],
326
+ "text/plain": [
327
+ " score num_comments \\\n",
328
+ "0 2 4 \n",
329
+ "1 5 24 \n",
330
+ "2 1 5 \n",
331
+ "3 0 18 \n",
332
+ "4 11 29 \n",
333
+ "... ... ... \n",
334
+ "3293628 1 1 \n",
335
+ "3293629 2 0 \n",
336
+ "3293630 0 11 \n",
337
+ "3293631 1452 3140 \n",
338
+ "3293632 2 9 \n",
339
+ "\n",
340
+ " title \\\n",
341
+ "0 Reddit, if someone had to describe you to a st... \n",
342
+ "1 What kind of car does the average \\nRedditor d... \n",
343
+ "2 What movies have made you go back to the theat... \n",
344
+ "3 Worst fear(s)? \n",
345
+ "4 If there was a type of ink that lasted only fo... \n",
346
+ "... ... \n",
347
+ "3293628 Help me get an idea of cost of living \n",
348
+ "3293629 If you used a prism to separate light and then... \n",
349
+ "3293630 Reddit, what was it like the first time you go... \n",
350
+ "3293631 People who refuse to be organ donors, why do y... \n",
351
+ "3293632 What always happens when you travel abroad? \n",
352
+ "\n",
353
+ " permalink \\\n",
354
+ "0 /r/AskReddit/comments/15sn6y/reddit_if_someone... \n",
355
+ "1 /r/AskReddit/comments/15sn6m/what_kind_of_car_... \n",
356
+ "2 /r/AskReddit/comments/15sn6b/what_movies_have_... \n",
357
+ "3 /r/AskReddit/comments/15sn4u/worst_fears/ \n",
358
+ "4 /r/AskReddit/comments/15sn44/if_there_was_a_ty... \n",
359
+ "... ... \n",
360
+ "3293628 /r/AskReddit/comments/2cjj63/help_me_get_an_id... \n",
361
+ "3293629 /r/AskReddit/comments/2cjj5v/if_you_used_a_pri... \n",
362
+ "3293630 /r/AskReddit/comments/2cjj4s/reddit_what_was_i... \n",
363
+ "3293631 /r/AskReddit/comments/2cjj31/people_who_refuse... \n",
364
+ "3293632 /r/AskReddit/comments/2cjj2a/what_always_happe... \n",
365
+ "\n",
366
+ " selftext \\\n",
367
+ "0 They would be talking about you without your p... \n",
368
+ "1 I've always wanted to know what kind of car th... \n",
369
+ "2 \n",
370
+ "3 So what is your worst fear, reddit? \n",
371
+ "4 \n",
372
+ "... ... \n",
373
+ "3293628 \n",
374
+ "3293629 \n",
375
+ "3293630 \n",
376
+ "3293631 R.I.P my inbox \n",
377
+ "3293632 \n",
378
+ "\n",
379
+ " url \\\n",
380
+ "0 http://www.reddit.com/r/AskReddit/comments/15s... \n",
381
+ "1 http://www.reddit.com/r/AskReddit/comments/15s... \n",
382
+ "2 http://www.reddit.com/r/AskReddit/comments/15s... \n",
383
+ "3 http://www.reddit.com/r/AskReddit/comments/15s... \n",
384
+ "4 http://www.reddit.com/r/AskReddit/comments/15s... \n",
385
+ "... ... \n",
386
+ "3293628 http://www.reddit.com/r/AskReddit/comments/2cj... \n",
387
+ "3293629 http://www.reddit.com/r/AskReddit/comments/2cj... \n",
388
+ "3293630 http://www.reddit.com/r/AskReddit/comments/2cj... \n",
389
+ "3293631 http://www.reddit.com/r/AskReddit/comments/2cj... \n",
390
+ "3293632 http://www.reddit.com/r/AskReddit/comments/2cj... \n",
391
+ "\n",
392
+ " created_utc author id downs ups \n",
393
+ "0 2013-01-01 23:59:40 [deleted] 15sn6y 0 2 \n",
394
+ "1 2013-01-01 23:59:31 PaytonAdams 15sn6m 0 5 \n",
395
+ "2 2013-01-01 23:59:20 [deleted] 15sn6b 0 1 \n",
396
+ "3 2013-01-01 23:58:37 [deleted] 15sn4u 0 0 \n",
397
+ "4 2013-01-01 23:58:15 Honeybeard 15sn44 0 11 \n",
398
+ "... ... ... ... ... ... \n",
399
+ "3293628 2014-08-04 00:01:20 bbent4698 2cjj63 0 1 \n",
400
+ "3293629 2014-08-04 00:01:19 Ajmb_88 2cjj5v 0 2 \n",
401
+ "3293630 2014-08-04 00:01:01 da-gonzo 2cjj4s 0 0 \n",
402
+ "3293631 2014-08-04 00:00:36 JohnnySniperr 2cjj31 0 1452 \n",
403
+ "3293632 2014-08-04 00:00:23 Nicopip 2cjj2a 0 2 \n",
404
+ "\n",
405
+ "[3293633 rows x 11 columns]"
406
+ ]
407
+ },
408
+ "execution_count": 12,
409
+ "metadata": {},
410
+ "output_type": "execute_result"
411
+ }
412
+ ],
413
+ "source": [
414
+ "df = dataset['all_days'].to_pandas()\n",
415
+ "df"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 16,
421
+ "id": "b5bbfa15",
422
+ "metadata": {},
423
+ "outputs": [
424
+ {
425
+ "data": {
426
+ "text/plain": [
427
+ "score Int64\n",
428
+ "num_comments Int64\n",
429
+ "title string\n",
430
+ "permalink string\n",
431
+ "selftext string\n",
432
+ "url string\n",
433
+ "created_utc string\n",
434
+ "author string\n",
435
+ "id string\n",
436
+ "downs Int64\n",
437
+ "ups Int64\n",
438
+ "dtype: object"
439
+ ]
440
+ },
441
+ "execution_count": 16,
442
+ "metadata": {},
443
+ "output_type": "execute_result"
444
+ }
445
+ ],
446
+ "source": [
447
+ "df.convert_dtypes().dtypes"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": 18,
453
+ "id": "c4292c7c",
454
+ "metadata": {},
455
+ "outputs": [],
456
+ "source": [
457
+ "import pandas as pd"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 21,
463
+ "id": "5a516c19",
464
+ "metadata": {},
465
+ "outputs": [],
466
+ "source": [
467
+ "df['created_utc'] = pd.to_datetime(df['created_utc'])\n",
468
+ "df['date'] = df['created_utc'].dt.date\n",
469
+ "df['time'] = df['created_utc'].dt.time"
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "code",
474
+ "execution_count": 25,
475
+ "id": "22d87986",
476
+ "metadata": {},
477
+ "outputs": [
478
+ {
479
+ "data": {
480
+ "text/plain": [
481
+ "<Axes: >"
482
+ ]
483
+ },
484
+ "execution_count": 25,
485
+ "metadata": {},
486
+ "output_type": "execute_result"
487
+ },
488
+ {
489
+ "data": {
490
+ "image/png": "\n",
491
+ "text/plain": [
492
+ "<Figure size 640x480 with 1 Axes>"
493
+ ]
494
+ },
495
+ "metadata": {},
496
+ "output_type": "display_data"
497
+ }
498
+ ],
499
+ "source": [
500
+ "df.date.hist(bins=400)"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "code",
505
+ "execution_count": 26,
506
+ "id": "19d6539b",
507
+ "metadata": {},
508
+ "outputs": [],
509
+ "source": [
510
+ "new_df = df.drop_duplicates(subset=['id'], keep=\"first\")"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "code",
515
+ "execution_count": null,
516
+ "id": "466cd2c7",
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": [
520
+ "new_df.date.hist(bins-)"
521
+ ]
522
+ }
523
+ ],
524
+ "metadata": {
525
+ "kernelspec": {
526
+ "display_name": "Python 3 (ipykernel)",
527
+ "language": "python",
528
+ "name": "python3"
529
+ },
530
+ "language_info": {
531
+ "codemirror_mode": {
532
+ "name": "ipython",
533
+ "version": 3
534
+ },
535
+ "file_extension": ".py",
536
+ "mimetype": "text/x-python",
537
+ "name": "python",
538
+ "nbconvert_exporter": "python",
539
+ "pygments_lexer": "ipython3",
540
+ "version": "3.10.8"
541
+ }
542
+ },
543
+ "nbformat": 4,
544
+ "nbformat_minor": 5
545
+ }
utilities/pushshift_data.py CHANGED
@@ -148,8 +148,9 @@ def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
148
  df = df.convert_dtypes()
149
  df = df[cols]
150
  # Convert the "created_utc" column to a datetime column with timezone information
151
- df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC').dt.strftime(
152
- '%Y-%m-%d %H:%M:%S')
 
153
  return df
154
 
155
 
 
148
  df = df.convert_dtypes()
149
  df = df[cols]
150
  # Convert the "created_utc" column to a datetime column with timezone information
151
+ df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
152
+ df['date'] = df['created_utc'].dt.date
153
+ df['time'] = df['created_utc'].dt.time
154
  return df
155
 
156