pushshift / api

Pushshift API
1.29k stars 107 forks source link

When grabbing submissions, submission counts do not match the aggs for August 2018 #52

Open rosecongou opened 4 years ago

rosecongou commented 4 years ago
# Scrape comments for every subreddit and write them to one Excel workbook per
# (subreddit, year), with one sheet per month.

# A page holding exactly this many items is treated as "there may be more".
# Presumably this is the per-request result cap used by pull_comments -- TODO confirm.
PAGE_SIZE = 100

version = "-V3"
datestring = datetime.now().strftime("%y%m%d-%H%M%S")
output_folder = "Reddit-Scrapes-" + datestring + version
for subreddit in subreddits:
    subreddit_folder = output_folder + "/" + subreddit
    make_directory(subreddit_folder)
    for year in range(start, end):
        # NOTE(review): this handle is reopened for every year and is not closed
        # anywhere in the visible code -- confirm it is closed further down.
        fail = open("fail.txt", "a")
        workbook_prefix = subreddit_folder + '/' + subreddit + ' ' + str(year)
        # NOTE(review): submissionWriter is not used in the visible part of the script.
        submissionWriter = pandas.ExcelWriter(workbook_prefix + " submissions.xlsx", engine="xlsxwriter")
        commentWriter = pandas.ExcelWriter(workbook_prefix + " comments.xlsx", engine="xlsxwriter")
        for month in range(month_start, month_end):
            since = timestamp(year, month, 1)  # first of the month; midnight
            # First of the following month; month 12 rolls over to January of year + 1.
            before = timestamp(year + month // 12, month % 12 + 1, 1)
            # Month as a nice string, e.g. "January". Kept in its own name so the
            # integer loop variable is not overwritten.
            month_name = date(1900, month, 1).strftime('%B')

            # pull_comments appears to return a list of comment dicts on success
            # and a str describing the problem on failure -- TODO confirm.
            page = pull_comments(subreddit, since, before)
            comments = page
            # Keep requesting older pages while the last page was full. Only a
            # list can be paged through; a failure string must never be indexed
            # or appended to the results.
            while isinstance(page, list) and len(page) == PAGE_SIZE:
                print(len(page))
                # NOTE(review): if `before` is exclusive on the API side, items
                # sharing the last item's created_utc would be skipped here,
                # which could explain count mismatches -- verify.
                before = page[-1]["created_utc"]
                page = pull_comments(subreddit, since, before)
                if isinstance(page, list):
                    comments += page
            if isinstance(page, str):
                # Record the failed request (first page or a later one), one entry per line.
                fail.write("subreddit: " + subreddit + ", year: " + str(year) + ", month: " + month_name + ", type: comments, " + "check: " + page + "\n")
            if isinstance(comments, list):
                pandas.DataFrame(comments).to_excel(commentWriter, sheet_name=month_name)
                print("Succeeded pulling comments for ", year, month_name, subreddit)
            else:
                # Nothing usable was fetched; still emit an (empty) sheet for the month.
                pandas.DataFrame([]).to_excel(commentWriter, sheet_name=month_name)
        # close() writes the workbook to disk; a separate save() call is redundant
        # and is not available on newer pandas versions.
        commentWriter.close()