File size: 5,705 Bytes
749d1d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import csv
import json
import sys
import time
import traceback
from datetime import datetime

import requests

username = ""  # put the username you want to download in the quotes
subreddit = "BestofRedditorUpdates"  # put the subreddit you want to download in the quotes
thread_id = ""  # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific user's history from a specific subreddit

# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
output_format = "csv"

# default start time is the current time and default end time is all history
# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
# start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")

convert_to_ascii = False  # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing


def write_csv_line(writer, obj, is_submission):
    """Write one pushshift object as a CSV row.

    Columns: score, creation date (local time, YYYY-MM-DD), [title if
    submission], author, permalink, then either the body (comments) or
    the selftext/url (submissions).
    """
    row = [
        str(obj['score']),
        datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"),
    ]
    if is_submission:
        row.append(obj['title'])
    row.append(f"u/{obj['author']}")
    row.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        # self posts carry their text in 'selftext' (absent for removed
        # posts, hence the default); link posts carry a target url
        row.append(obj.get('selftext', "") if obj['is_self'] else obj['url'])
    else:
        row.append(obj['body'])
    writer.writerow(row)


def write_json_line(handle, obj):
    """Serialize *obj* to *handle* as one JSON object per line (JSON Lines)."""
    handle.write(json.dumps(obj) + "\n")

def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    """Page backwards through the pushshift API and write results to *filename*.

    :param filename: output file path
    :param url_base: pushshift search url ending in "&before=" — the epoch is appended each page
    :param output_format: "human", "csv" or "json" (see module-level comment)
    :param start_datetime: newest timestamp to fetch; paging walks backwards from here
    :param end_datetime: stop once objects are older than this, or None for all history
    :param is_submission: True for submissions, False for comments (affects csv columns)
    :param convert_to_ascii: open the output file with ascii encoding instead of UTF-8
    """
    # BUG FIX: previously printed the literal text "(unknown)" instead of the target file
    print(f"Saving to {filename}")

    count = 0
    if output_format == "human" or output_format == "json":
        encoding = 'ascii' if convert_to_ascii else 'UTF-8'
        handle = open(filename, 'w', encoding=encoding)
    else:
        # csv needs newline='' so csv.writer controls line endings
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
    try:  # ensure the file is closed even if a request raises
        while True:
            new_url = url_base + str(previous_epoch)
            response = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
            time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
            try:
                json_data = response.json()
            except json.decoder.JSONDecodeError:
                # transient garbage from the API — wait and retry the same page
                time.sleep(1)
                continue

            if 'data' not in json_data:
                break
            objects = json_data['data']
            if len(objects) == 0:
                break

            for obj in objects:
                # results are newest-first; step the cursor just past this object
                previous_epoch = obj['created_utc'] - 1
                if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                    break_out = True
                    break
                count += 1
                try:
                    # NOTE(review): the "human" format documented at the top of the
                    # file is never written here — only csv and json produce output.
                    if output_format == "csv":
                        write_csv_line(writer, obj, is_submission)
                    elif output_format == "json":
                        write_json_line(handle, obj)
                except Exception as err:
                    # best-effort: log the failing object and keep downloading
                    if 'permalink' in obj:
                        print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                    else:
                        print(f"Couldn't print object, missing permalink: {obj['id']}")
                    print(err)
                    print(traceback.format_exc())

            if break_out:
                break

            print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

        print(f"Saved {count}")
    finally:
        handle.close()

if __name__ == "__main__":
    # Validate the module-level configuration before doing any network work.
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(1)  # BUG FIX: exit nonzero on misconfiguration (was 0)
    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(1)  # BUG FIX: exit nonzero on misconfiguration (was 0)

    # Build the pushshift query string from whichever filters were filled in.
    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        if convert_thread_id_to_base_ten:
            # reddit thread ids are base-36; pushshift link_id accepts base-10
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    # A thread download only needs comments, so skip submissions in that case.
    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
                          end_time, True, convert_to_ascii)
    # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
    #                   end_time, False, convert_to_ascii)