Idea: Use Pushshift instead of Reddit API

https://pushshift.io/api-parameters/

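It's super convenient and does not require API keys. Here's a snippet I wrote a while ago to archive text posts from a subreddit; it should be very easy to adapt to handle non-text posts as well. (The snippet assumes `import requests`, `import urllib.parse` and `from datetime import datetime`.)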

class NoQuotedCommasSession(requests.Session):
    """A ``requests.Session`` that sends literal commas in request URLs.

    Just before a request goes out, every percent-encoded comma (``%2C``)
    in its URL is replaced with a plain ``,``. The replacement is applied
    to the whole URL string, not only to the query part.
    """

    def send(self, request, **kwargs):
        """Un-quote commas in ``request.url`` and send the request.

        Args:
            request: The prepared request to send. Its ``url`` attribute is
                rewritten in place.
            **kwargs: Forwarded unchanged to ``requests.Session.send``.

        Returns:
            Whatever ``requests.Session.send`` returns.
        """
        # Naming the parameter (instead of indexing into *args) mirrors the
        # base-class signature and also works when the request is passed by
        # keyword.
        request.url = request.url.replace(urllib.parse.quote(','), ',')
        return super().send(request, **kwargs)

def fetch_chunk(after=None, subreddit='yourfavoritesmutplace', timeout=None):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        after: If not ``None``, sent as the ``after`` query parameter.
            Callers in this snippet pass an epoch timestamp.
        subreddit: Value of the ``subreddit`` query parameter.
        timeout: Passed to ``requests`` as the request timeout. ``None``
            (the default) means no timeout, as before.

    Returns:
        The ``'data'`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If the response status indicates an error
            (via ``raise_for_status``).
    """
    params = {
        'subreddit': subreddit,
        'fields': 'id,created_utc,domain,author,title,selftext,permalink',
        'sort': 'created_utc',
        'order': 'asc',
        'size': 1000,
    }
    if after is not None:
        params['after'] = after
    # Use the session as a context manager so its connection pool is closed
    # once the request is done instead of leaking one session per call.
    with NoQuotedCommasSession() as session:
        resp = session.get(
            'https://api.pushshift.io/reddit/submission/search',
            params=params,
            timeout=timeout,
        )
    resp.raise_for_status()
    return resp.json()['data']

def fetch_all_subreddit_posts(after=None):
    """Yield posts chunk by chunk until ``fetch_chunk`` returns nothing.

    Progress is printed to stdout: one line per chunk request, a
    "loaded until" line every tenth chunk, and a final "done!" line.

    Args:
        after: Starting value handed to ``fetch_chunk`` for the first
            request; ``None`` starts without an ``after`` filter.

    Yields:
        Each item of every non-empty chunk, in the order received.
    """
    i = 1
    while True:
        print(f'loading chunk {i}')
        chunk = fetch_chunk(after)
        if not chunk:
            break
        yield from chunk
        # Continue from just past the last post's timestamp.
        after = chunk[-1]['created_utc'] + 1
        if i % 10 == 0:
            print(f'loaded until {datetime.fromtimestamp(after)}')
        i += 1
    if after is None:
        # Nothing was fetched and no start point was given, so there is no
        # timestamp to report (datetime.fromtimestamp(None) would raise).
        print('done! no posts loaded')
    else:
        print(f'done! loaded until {datetime.fromtimestamp(after)}')

NSFWUTILS / RedditScrape

Idea: Use Pushshift instead of Reddit API #5