enzyme69 / blendersushi

Blender Sushi related scripts. Mostly about Sverchok, Geometry Nodes, Animation Nodes, and related Python scripts.

LIVENODING 1110 / Digging Reddit using Python and PushShift API #511

Open enzyme69 opened 6 years ago

enzyme69 commented 6 years ago
[Screenshot attachment: screen shot 2018-11-24 at 10 19 21 am]

stuckInTheMatrix_005.blend.zip

enzyme69 commented 6 years ago

Example of scraped data as JSON (one comment object per line):

{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
enzyme69 commented 6 years ago

Simple Python snippet that reads the JSON passed into it and extracts the date/time and body:

import json
from datetime import datetime

# V1[0][0] is the JSON string coming in from the node input.
# json.loads() already handles null/true/false, so no extra aliases are needed.
data = json.loads(V1[0][0])
print(datetime.utcfromtimestamp(data["created_utc"]), data["body"])

# append(data["body"])  # uncomment to push the comment body to the node output
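Outside the node tree, the same idea scales to a whole dump; a sketch assuming a line-delimited file (the comments.json name is hypothetical; one blob per line, as above) that also skips the "[removed]"/"[deleted]" bodies visible in the sample data:

import json
from datetime import datetime

kept = []
# Sketch: walk a line-delimited Pushshift dump ("comments.json" is a hypothetical
# file name) and keep only comments whose body was not removed or deleted.
with open("comments.json") as f:
    for line in f:
        data = json.loads(line)
        if data["body"] in ("[removed]", "[deleted]"):
            continue
        kept.append((datetime.utcfromtimestamp(data["created_utc"]), data["body"]))

print(len(kept), "usable comments")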
enzyme69 commented 6 years ago

Python from NEO to scrape some Reddit data using the Pushshift API:

#!/usr/bin/env python3

import requests
import time
import json

def get_comments_from_pushshift(**kwargs):
    # Query Pushshift's comment search endpoint with whatever parameters are passed in
    r = requests.get("https://api.pushshift.io/reddit/comment/search/", params=kwargs)
    data = r.json()
    return data['data']

def get_comments_from_reddit_api(comment_ids, author):
    # Fetch the current versions of those comments from Reddit's own API
    headers = {'User-agent': 'Comment Collector for /u/{}'.format(author)}
    params = {}
    params['id'] = ','.join(["t1_" + cid for cid in comment_ids])
    r = requests.get("https://api.reddit.com/api/info", params=params, headers=headers)
    data = r.json()
    return data['data']['children']

before = None

### IMPORTANT ######################
# Set this variable to your username
author = "stuck_in_the_matrix"
####################################

while True:
    comments = get_comments_from_pushshift(author=author, size=100, before=before, sort='desc', sort_type='created_utc')
    if not comments: break

    # This will get the comment ids from Pushshift in batches of 100 -- Reddit's API only allows 100 at a time
    comment_ids = []
    for comment in comments:
        before = comment['created_utc'] # This will keep track of your position for the next call in the while loop
        comment_ids.append(comment['id'])

    # This will then pass the ids collected from Pushshift and query Reddit's API for the most up to date information
    comments = get_comments_from_reddit_api(comment_ids, author)
    for comment in comments:
        comment = comment['data']
        # Do stuff with the comments (this will print out a JSON blob for each comment)
        comment_json = json.dumps(comment, ensure_ascii=True, sort_keys=True)
        print(comment_json)

    time.sleep(2)
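Redirecting the script's output to a file gives exactly the kind of one-blob-per-line dump shown at the top of this thread, which the node snippet above can then read. The same endpoint can also, in principle, be pointed at a subreddit or search phrase instead of an author; the subreddit and q parameters below are assumed Pushshift query parameters (not taken from this issue) and may have changed since:

import requests

# Sketch: the same Pushshift comment-search endpoint, filtered by subreddit and
# a search phrase instead of an author. "subreddit" and "q" are assumed Pushshift
# parameters, so treat this as illustrative rather than guaranteed to work.
params = {"subreddit": "science", "q": "brain implant", "size": 25,
          "sort": "desc", "sort_type": "created_utc"}
r = requests.get("https://api.pushshift.io/reddit/comment/search/", params=params)
for c in r.json()["data"]:
    print(c["created_utc"], c["author"], c["body"][:80])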