Open enzyme69 opened 6 years ago
Example of scraped data as JSON (one comment object per line):
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
{"approved_at_utc": null, "approved_by": null, "archived": true, "author": "[deleted]", "author_flair_background_color": "", "author_flair_css_class": null, "author_flair_template_id": null, "author_flair_text": null, "author_flair_text_color": "dark", "banned_at_utc": null, "banned_by": null, "body": "[removed]", "body_html": "<div class=\"md\"><p>[removed]</p>\n</div>", "can_gild": true, "can_mod_post": false, "collapsed": true, "collapsed_reason": null, "controversiality": 0, "created": 1518611215.0, "created_utc": 1518582415.0, "distinguished": null, "downs": 0, "edited": false, "gilded": 0, "gildings": {"gid_1": 0, "gid_2": 0, "gid_3": 0}, "id": "du7wn6t", "is_submitter": false, "likes": null, "link_id": "t3_7xcua4", "mod_note": null, "mod_reason_by": null, "mod_reason_title": null, "mod_reports": [], "name": "t1_du7wn6t", "no_follow": true, "num_reports": null, "parent_id": "t1_du7u7k9", "permalink": "/r/science/comments/7xcua4/scientists_have_developed_a_brain_implant_that/du7wn6t/", "removal_reason": null, "replies": "", "report_reasons": null, "saved": false, "score": 16, "score_hidden": false, "send_replies": true, "stickied": false, "subreddit": "science", "subreddit_id": "t5_mouw", "subreddit_name_prefixed": "r/science", "subreddit_type": "public", "ups": 16, "user_reports": []}
Simple Python to read the JSON passed into it and print the creation date-time and body:
import json
from datetime import datetime
# Decode one scraped Reddit comment (a JSON string) and print its creation
# time together with its body text.
# NOTE(review): the three names below are not referenced by the lines that
# follow, and json.loads() converts JSON null/true/false on its own.
# Presumably they are left over from treating the JSON text as Python
# source -- confirm nothing else reads them before removing.
null = "none"
true = 1
false = 0
# V1 is not defined in this snippet; presumably the host environment supplies
# it as a nested sequence whose [0][0] item is the JSON text -- TODO confirm.
data = json.loads(V1[0][0])
# "created_utc" is passed to utcfromtimestamp(), which yields a naive datetime
# expressed in UTC; the comment body is printed next to it.
print(datetime.utcfromtimestamp(data["created_utc"]), data["body"])
#append(data["body"])
Python from NEO to scrape some Reddit data using the Pushshift API:
#!/usr/bin/env python3
import requests
import time
import json
def get_comments_from_pushshift(**kwargs):
    """Query the Pushshift comment-search endpoint and return its results.

    Every keyword argument is forwarded unchanged as a query-string
    parameter of the GET request.

    Returns:
        The value stored under the ``'data'`` key of the decoded JSON reply.
    """
    endpoint = "https://api.pushshift.io/reddit/comment/search/"
    return requests.get(endpoint, params=kwargs).json()['data']
def get_comments_from_reddit_api(comment_ids,author):
    """Fetch the current Reddit records for a batch of comment ids.

    Args:
        comment_ids: Iterable of bare comment ids (without the ``t1_``
            prefix); the prefix is added here before querying.
        author: Username embedded in the ``User-agent`` header.

    Returns:
        The list found at ``['data']['children']`` in the decoded JSON
        reply of Reddit's ``/api/info`` endpoint, or an empty list when
        ``comment_ids`` is empty.
    """
    # Build the prefixed names first so any iterable (including a generator)
    # can be checked for emptiness after being consumed once.
    fullnames = ["t1_" + comment_id for comment_id in comment_ids]
    if not fullnames:
        # Nothing to look up: skip the request instead of sending an empty
        # ``id`` parameter.
        return []
    headers = {'User-agent': 'Comment Collector for /u/{}'.format(author)}
    params = {'id': ','.join(fullnames)}
    response = requests.get("https://api.reddit.com/api/info", params=params, headers=headers)
    return response.json()['data']['children']
# Pagination cursor: None on the first request, afterwards the created_utc of
# the oldest comment seen so far, so each Pushshift call continues from there.
before = None
### IMPORTANT ######################
# Set this variable to your username
author = "stuck_in_the_matrix"
####################################
while True:
    comments = get_comments_from_pushshift(
        author=author,
        size=100,
        before=before,
        sort='desc',
        sort_type='created_utc',
    )
    if not comments:
        break
    # Pushshift is queried in batches of 100 because Reddit's API only accepts
    # 100 ids per call.
    comment_ids = [comment['id'] for comment in comments]
    # The batch is non-empty here, so its last entry is the position from
    # which the next iteration of the while loop continues.
    before = comments[-1]['created_utc']
    # Ask Reddit's API about the collected ids to get the most up-to-date
    # information for each comment.
    for child in get_comments_from_reddit_api(comment_ids, author):
        # Do stuff with the comments (this prints one JSON blob per comment).
        print(json.dumps(child['data'], ensure_ascii=True, sort_keys=True))
    time.sleep(2)
stuckInTheMatrix_005.blend.zip