File size: 5,705 Bytes
749d1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import csv
import json
import sys
import time
import traceback
from datetime import datetime

import requests

username = ""  # put the username you want to download in the quotes
subreddit = "BestofRedditorUpdates"  # put the subreddit you want to download in the quotes
thread_id = ""  # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific user's history from a specific subreddit

# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
output_format = "csv"

# default start time is the current time and default end time is all history
# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
# start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")

convert_to_ascii = False  # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing


def write_csv_line(writer, obj, is_submission):
    """Write one pushshift object as a CSV row.

    Columns: score, creation date (local time, YYYY-MM-DD), [title if
    submission], author, permalink, then either the body (comments) or
    the selftext/url (submissions).
    """
    row = [
        str(obj['score']),
        datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"),
    ]
    if is_submission:
        row.append(obj['title'])
    row.append(f"u/{obj['author']}")
    row.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        # self posts carry their text in 'selftext' (absent for removed
        # posts, hence the default); link posts carry a target url
        row.append(obj.get('selftext', "") if obj['is_self'] else obj['url'])
    else:
        row.append(obj['body'])
    writer.writerow(row)


def write_json_line(handle, obj):
    """Serialize *obj* to *handle* as one JSON object per line (JSON Lines)."""
    handle.write(json.dumps(obj) + "\n")

def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    """Page backwards through the pushshift API and write results to *filename*.

    :param filename: output file path
    :param url_base: pushshift search url ending in "&before=" — the epoch is appended each page
    :param output_format: "human", "csv" or "json" (see module-level comment)
    :param start_datetime: newest timestamp to fetch; paging walks backwards from here
    :param end_datetime: stop once objects are older than this, or None for all history
    :param is_submission: True for submissions, False for comments (affects csv columns)
    :param convert_to_ascii: open the output file with ascii encoding instead of UTF-8
    """
    # BUG FIX: previously printed the literal text "(unknown)" instead of the target file
    print(f"Saving to {filename}")

    count = 0
    if output_format == "human" or output_format == "json":
        encoding = 'ascii' if convert_to_ascii else 'UTF-8'
        handle = open(filename, 'w', encoding=encoding)
    else:
        # csv needs newline='' so csv.writer controls line endings
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
    try:  # ensure the file is closed even if a request raises
        while True:
            new_url = url_base + str(previous_epoch)
            response = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
            time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
            try:
                json_data = response.json()
            except json.decoder.JSONDecodeError:
                # transient garbage from the API — wait and retry the same page
                time.sleep(1)
                continue

            if 'data' not in json_data:
                break
            objects = json_data['data']
            if len(objects) == 0:
                break

            for obj in objects:
                # results are newest-first; step the cursor just past this object
                previous_epoch = obj['created_utc'] - 1
                if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                    break_out = True
                    break
                count += 1
                try:
                    # NOTE(review): the "human" format documented at the top of the
                    # file is never written here — only csv and json produce output.
                    if output_format == "csv":
                        write_csv_line(writer, obj, is_submission)
                    elif output_format == "json":
                        write_json_line(handle, obj)
                except Exception as err:
                    # best-effort: log the failing object and keep downloading
                    if 'permalink' in obj:
                        print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                    else:
                        print(f"Couldn't print object, missing permalink: {obj['id']}")
                    print(err)
                    print(traceback.format_exc())

            if break_out:
                break

            print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

        print(f"Saved {count}")
    finally:
        handle.close()

if __name__ == "__main__":
    # Validate the module-level configuration before doing any network work.
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(1)  # BUG FIX: exit nonzero on misconfiguration (was 0)
    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(1)  # BUG FIX: exit nonzero on misconfiguration (was 0)

    # Build the pushshift query string from whichever filters were filled in.
    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        if convert_thread_id_to_base_ten:
            # reddit thread ids are base-36; pushshift link_id accepts base-10
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    # A thread download only needs comments, so skip submissions in that case.
    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
                          end_time, True, convert_to_ascii)
    # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
    #                   end_time, False, convert_to_ascii)