kevinzg / facebook-scraper

Scrape Facebook public pages without an API key
MIT License

Issue received while scraping with comments #457

Open fashan7 opened 3 years ago

fashan7 commented 3 years ago

post_urls = [10158609033378741, 10159694338583054, 1200387097127839, 3176654032562645, 204010715100946]

Running the code below, I get this error: NameError: name 'time' is not defined

from facebook_scraper import *
import time  # required for time.sleep below; this missing import causes the NameError above
set_cookies("cookies.txt")
results = []
start_url = None
post_result = []

def handle_pagination_url(url):
    global start_url
    start_url = url

while True:
    try:
        post = next(
            get_posts(
                post_urls=[10158609033378741],
                options={
                    "comments": "generator",
                    "comment_start_url": start_url,
                    "comment_request_url_callback": handle_pagination_url,
                },
            )
        )
        comments = list(post["comments_full"])
        for comment in comments:
            comment["replies"] = list(comment["replies"])

            replies_list = []
            if comment["replies"]:
                for replies in comment["replies"]:
                    replies_list.append(replies)
            comment.update({"replies":replies_list})
            results.append(comment)

        print("All done")
        post.update({"comments_full":results})
        post_result.append(post)
        break
    except exceptions.TemporarilyBanned:
        print("Temporarily banned, sleeping for 10m")
        time.sleep(600)
fashan7 commented 3 years ago

@neon-ninja why am I even getting banned, even though I used the above method?

neon-ninja commented 3 years ago

The intention with using "comments": "generator" is that it gives you greater flexibility around how often requests are made, when to pause & resume, and so on. It doesn't offer any greater protection from temporary bans on its own. You need to sprinkle some time.sleep statements in there. See https://github.com/kevinzg/facebook-scraper/issues/409#issuecomment-905603342 and https://github.com/kevinzg/facebook-scraper/issues/409#issuecomment-907639417
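
As an illustration, a minimal sketch of what sprinkling sleeps into the generator approach could look like (the post ID is the one from this thread; the sleep ranges are arbitrary placeholders, not recommended values):

import time
from random import randint
from facebook_scraper import get_posts, set_cookies

set_cookies("cookies.txt")
post = next(get_posts(post_urls=[10158609033378741],
                      options={"comments": "generator"}))
for comment in post["comments_full"]:   # consuming the generator paginates comments
    time.sleep(randint(2, 5))           # pause between comment fetches
    for reply in comment["replies"]:    # replies are a generator too
        time.sleep(randint(2, 5))       # pause between reply fetches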

fashan7 commented 3 years ago

@neon-ninja why is this syntax wrong? Example: while post := next(posts, None):

def scrape_group_posts(self, group_ids: List[Union[int, str]]):
    def handle_pagination_url(url):
        nonlocal start_url
        start_url = url

    for k, group_id in enumerate(group_ids, 1):
        group_name = self.group_information[group_id]['name']
        log.info(f"[{k}] Scraping group: {group_name}...")

        start_url = None
        post_counter = 0
        keep_alive = True
        while keep_alive:
            try:
                posts = self.get_group_posts(
                    group=group_id,
                    options={
                        "comments": "generator" if config.SCRAPE_COMMENTS else False,
                        "comment_start_url": start_url,
                        "comment_request_url_callback": handle_pagination_url
                    }
                )

                while post := next(posts, None):
                    post_counter += 1

                    if post["time"] < config.DATELIMIT:
                        log.info(f"[{group_name}] Reached datelimit: {config.DATELIMIT}")
                        keep_alive = False
                        break

                    log.info(f"[{group_name}] Scraping post {post_counter} from {str(post['time'])[:19]}...")

                    for item in ('post_text', 'shared_text', 'text'):
                        pass  # (loop body missing from the original snippet)

                    comments = post['comments_full']
                    # comments may be a generator or a list
                    if hasattr(comments, "__next__"):  # generator check; type(comments) == iter is always False
                        comment_counter = 0

                        while comment := next(comments, None):
                            comment_counter += 1
                            log.info(f"[{group_name}] Scraping comment {comment_counter} from {str(comment['comment_time'])[:19]} to post {post_counter}...")

                            replies = comment['replies']
                            if hasattr(replies, "__next__"):  # generator check, as above
                                replies_counter = 0

                                while reply := next(replies, None):
                                    replies_counter += 1
                                    log.info(f"[{group_name}] Scraping reply {replies_counter} from {str(reply['comment_time'])[:19]} to comment {comment_counter} of post {post_counter}...")

                                    random_sleep((3, 3.5), (4, 5), reason=f"{group_name}|WAIT_BEFORE_NEXT_REPLY")

                            elif isinstance(replies, list) and replies:
                                log.warning(f"Found non-empty comment-replies as list ->\n{format_json(replies)}")

                            random_sleep((3, 3.5), (4, 5), reason=f"{group_name}|WAIT_BEFORE_NEXT_COMMENT")

                    elif isinstance(comments, list) and comments:
                        log.warning(f"Found non-empty comments as list ->\n{format_json(comments)}")

                    random_sleep((3, 3.5), (4, 5), reason=f"{group_name}|WAIT_BEFORE_NEXT_POST")

                # If we reach this point without an exception, we have finished scraping this group
                keep_alive = False

            except TemporarilyBanned as e:
                random_sleep((580, 590), (600, 610), reason=f"{group_name}|{e.__class__.__name__}")
neon-ninja commented 3 years ago

The syntax isn't wrong, it's just a new operator that was introduced with Python 3.8. It's called the walrus operator.
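
A minimal illustration of the operator, independent of the scraper:

# := assigns and tests a value in a single expression (Python 3.8+)
data = iter([{"id": 1}, {"id": 2}])
while post := next(data, None):  # the loop ends once next() returns the None default
    print(post["id"])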

fashan7 commented 3 years ago

@neon-ninja

while post := next(posts, None):
TypeError: 'dict' object is not an iterator
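
For reference: next() only works on iterators, so this error suggests posts was a plain dict at that point rather than a generator. A quick sketch of the distinction:

posts = [{"id": 1}, {"id": 2}]
# next(posts, None)   # would raise TypeError: 'list' object is not an iterator
it = iter(posts)      # plain containers must be wrapped with iter() first
while post := next(it, None):
    print(post["id"])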
fashan7 commented 3 years ago

@neon-ninja can we add time.sleep calls to this code snippet?

from facebook_scraper import *
from time import sleep      # needed for the sleep() calls below
from random import randint  # needed for the randint() calls below
set_cookies("cookies.txt")
results = []
start_url = None
post_result = []

def handle_pagination_url(url):
    global start_url
    start_url = url

while True:
    try:
        post = next(
            get_posts(
                post_urls=[10158609033378741],
                options={
                    "comments": "generator",
                    "comment_start_url": start_url,
                    "comment_request_url_callback": handle_pagination_url,
                },
            )
        )
        comments = list(post["comments_full"])
        for comment in comments:
            comment["replies"] = list(comment["replies"])

            replies_list = []
            if comment["replies"]:
                for replies in comment["replies"]:

                    sleep(randint(2,4))
                    replies_list.append(replies)

            sleep(randint(1,4))
            comment.update({"replies":replies_list})
            results.append(comment)

        post.update({"comments_full":results})
        post_result.append(post)
        break
    except exceptions.TemporarilyBanned:
        print("Temporarily banned, sleeping for 10m")
        sleep(randint(580,600))
neon-ninja commented 3 years ago

Yes

fashan7 commented 3 years ago

@neon-ninja can I ask for some information regarding this?

for comment in comments:
    comment["replies"] = list(comment["replies"])

    replies_list = []
    if comment["replies"]:
        for replies in comment["replies"]:

            sleep(randint(2,4))
            replies_list.append(replies)

    sleep(randint(1,4))
    comment.update({"replies":replies_list})
    results.append(comment)
[Screenshot attached: 2021-09-02 at 16:05:48]

When we set up the sleeps, are we really clicking "view more comments", or is each and every comment being visited and browsed by our script?

fashan7 commented 3 years ago

@neon-ninja could you please respond regarding the above query?

neon-ninja commented 3 years ago

The scraper uses the mobile version of Facebook (m.facebook.com). Consuming replies results in an HTTP request that would be equivalent to clicking on the reply link. Consuming more comments results in an HTTP request equivalent to clicking the "View more comments" link.
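
In other words, the generators defer those requests until they are consumed. A minimal sketch, reusing the post ID from earlier in this thread:

from facebook_scraper import get_posts, set_cookies

set_cookies("cookies.txt")
post = next(get_posts(post_urls=[10158609033378741],
                      options={"comments": "generator"}))
comments = post["comments_full"]  # a generator: no comment requests issued yet
first = next(comments)            # consuming it fires the m.facebook.com request,
                                  # much like tapping "View more comments"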

fashan7 commented 3 years ago

@neon-ninja so we are actually clicking. Let's say there are 3 unexpanded replies under a comment: what the script does is click to unfold the replies, get the data, and set it in the script's output JSON. And if there are more comments, we are basically clicking "View more comments". Note: I thought we were visiting each and every comment and reply via their URLs to get the text of the comments and replies.

[Screenshot attached: 2021-09-03 at 04:49:24]
neon-ninja commented 3 years ago

There's no clicking; this is all lower-level web requests. The URLs are fetched with the requests library.

fashan7 commented 3 years ago

@neon-ninja so we aren't visiting all the comment URLs, right?

neon-ninja commented 3 years ago

No. Comments are retrieved in batches of 30.
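
(At that rate, a post with ~700 comments, like the one mentioned below, would take roughly 700 / 30 ≈ 24 paginated requests for the top-level comments alone, before any replies.)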

neon-ninja commented 3 years ago

See https://github.com/kevinzg/facebook-scraper/issues/409#issuecomment-892232322 for an example of the number of requests

fashan7 commented 3 years ago

@neon-ninja thanks

fashan7 commented 3 years ago

@neon-ninja I have a scenario: I set a cookie and scrape a post [facebook.com/1610702495934559] that has more than 700 comments. In the middle of scraping, after around 300 comments, the cookies suddenly get temporarily banned. Can we keep the comments scraped before the ban and still generate the JSON? Is any workaround available?

milesgratz commented 3 years ago

@fashandatafields -- this example of code for private groups has very cautious sleep timers (and, unfortunately, pretty shoddy Python), but you can try it this way and adjust the timers as necessary. I've yet to be banned with it. It appends to pandas dataframes as it goes, so even if you are temp banned you should be able to write out the comments you've parsed so far. It writes to CSV at the end, but you could change the pandas "to_csv" to "to_json".

Lots of ways to do this.
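
For instance, the final write in the script below could be swapped to something like:

# hypothetical swap: write JSON instead of CSV
comments_df_ori.to_json(comments_file.replace(".csv", ".json"),
                        orient="records", date_format="iso")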

#   https://github.com/kevinzg/facebook-scraper
#
#---------------------------------------------------------------------------------------
#   https://github.com/kevinzg/facebook-scraper/issues/409
#---------------------------------------------------------------------------------------
#   Best practice is to reduce your requests per second ;). 
#   Try to stay under one request per second. What were you doing? 
#   Some requests (getting friend lists for example) are more sensitive to temp bans than others.
#
#---------------------------------------------------------------------------------------
#   https://github.com/kevinzg/facebook-scraper/issues/383
#---------------------------------------------------------------------------------------
#   When you scroll down on a Page or a Group on Facebook, it loads posts in pages. 
#   This varies depending on whether you're scraping Pages or Groups - for Pages, 
#   you get 2 posts on the first request, then 4 posts per page for each subsequent request 
#   (unless you change that default with the posts_per_page parameter). 
#   Groups give you roughly 20 posts per page, but this can vary by +-10 posts.
#   The number of posts per page for a group cannot be changed. 
#   The pages parameter limits how many pages to retrieve. 
#   Extracting comments from posts is a separate process, so yes, 
#   the page limit is independent from the number of comments. 
#   The number of comments would vary from Page to Page,
#   as some Pages are more popular than others, 
#   and tend to have more comments per post.
#   
#   As get_posts returns a generator, you can consume just enough posts until you have your
#   comment limit, then terminate iteration. You can set pages to None, so that you'll continue
#   retrieving posts until you hit your comment limit. Here's is a code example:
#
#   comments = 0
#   for post in get_posts("Nintendo", cookies="cookies.txt", pages=None, options={"comments": 3000}):
#     comments += len(post["comments_full"] or [])
#     if comments > 3000:
#       break

#   Note that I also set the limit of comments on a given post to 3000 -
#   so if the first post you get has > 3000 comments (possible for Pages belonging to movie stars), 
#   you'll only get comments for that one post
#---------------------------------------------------------------------------------------

from facebook_scraper import *
import pandas as pd
from collections import defaultdict
from time import time
from time import sleep
from datetime import datetime
from datetime import date
from random import randint

fbpage_id = 'xxxxxxxxxxxxxxxx'
posts_file = '/opt/fbscraping/data/' + fbpage_id + '_posts.csv'
comments_file = '/opt/fbscraping/data/' + fbpage_id + '_comments.csv'
replies_file = '/opt/fbscraping/data/' + fbpage_id + '_replies.csv'
cookies_file = '/opt/fbscraping/data/cookies.json'

# ------------------- current sleeping behavior -------------------
#   [POSTS] @ begin iteration
#    -> [COMMENTS] @ start, sleep for 3-7 seconds 
#    ----> [REPLIES] @ end, sleep for 10-40 seconds
#    -> [COMMENTS] @ end, sleep for 5-15 seconds
#   [POSTS] @ end, sleep for 37-89 seconds
# -----------------------------------------------------------------

# define pagination info
start_url = None
def handle_pagination_url(url):
    global start_url
    start_url = url

# define pandas dataframe
#   (refer to code example here: https://github.com/kevinzg/facebook-scraper/issues/414)
posts_df_ori = pd.DataFrame(columns = ['username', 'time', 'likes', 'comments', 'shares', 'reactions', 'post_text'])
comments_df_ori = pd.DataFrame(columns = ['post_id', 'commenter_name', 'comment_time', 'comment_reactors', 'replies', 'comment_text'])
replies_df_ori = pd.DataFrame(columns = ['post_id', 'parent_comment_id', 'commenter_name', 'comment_time', 'comment_reactors', 'comment_text'])

# [ALL_POSTS] retrieve all posts
print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "] STARTED - Retrieving posts")
pi=0
all_posts = get_posts(
        group=fbpage_id,
        extra_info=True,
        cookies = cookies_file,
        pages=1,
        timeout = 60,
        options={
            "comments": "generator",
            "comment_start_url": start_url,
            "comment_request_url_callback": handle_pagination_url
        },
    )

# [ALL_POSTS] iterate through using next() pagination 
while post := next(all_posts, None):
    pi += 1
    try:        
        # [POST] pandas dataframe
        print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "][", post["post_id"], "] Appending post info to 'posts_df_ori' dataframe. Post index: ", pi, " Total comments: ", post["comments"])
        post_dataframe = post
        post_df = pd.DataFrame.from_dict(post_dataframe, orient='index')
        post_df = post_df.transpose()
        posts_df_ori = posts_df_ori.append(post_df)

        # [COMMENT] begin loop
        ci=0
        comments = post["comments_full"]
        for comment in comments:

            # [COMMENT] determine replies
            ci += 1
            comment["replies"] = list(comment["replies"])

            # [COMMENT] pandas dataframe - transpose and add post_id
            comment_dataframe = comment
            comment_df = pd.DataFrame.from_dict(comment_dataframe, orient='index')
            comment_df = comment_df.transpose()
            comment_df.insert(0,'post_id',post['post_id'])

            # [COMMENT] append new object with post_id and comment* data to master 
            sleepCalc = randint(3,7)
            print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "][", post["post_id"], "] Appending comments info to 'comments_df_ori' dataframe. Post index: ", pi, " Comment index: ", ci, " Sleeping for: ", sleepCalc)
            comments_df_ori = comments_df_ori.append(comment_df)
            sleep(sleepCalc)  # sleep 3-7s after each comment (printed above but missing from the original paste)

            # [COMMENT] determine if replies exist 
            if comment["replies"]:
                ri = 0
                replies = comment['replies']
                for reply in replies:
                    ri += 1

                    # [COMMENT][REPLIES] pandas dataframe - transpose and add post_id, parent_comment_id
                    reply_dataframe = reply
                    reply_df = pd.DataFrame.from_dict(reply_dataframe, orient='index')
                    reply_df = reply_df.transpose()
                    reply_df.insert(0,'post_id',post['post_id'])
                    reply_df.insert(1,'parent_comment_id',comment['comment_id'])

                    # [COMMENT][REPLIES] append new object with post_id, parent_comment_id, and comment* data to master, sleep
                    sleepCalc = randint(10,40)
                    print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "][", post["post_id"], "][", comment['comment_id'],"] Appending replies to 'replies_df_ori' dataframe. Post index: ", pi, " Comment index: ", ci, "Replies index: ", ri, " Sleeping for: ", sleepCalc)
                    replies_df_ori = replies_df_ori.append(reply_df)
                    sleep(sleepCalc)

            # [COMMENT] sleep for sleepCalc duration
            sleepCalc = randint(5,15)
            sleep(sleepCalc)

        # [POST] increment index, sleep
        sleepCalc = randint(37,89)
        print("---------------------------sleeping for ", sleepCalc, " seconds-----------------------------")

    except exceptions.TemporarilyBanned:
        print("Temporarily banned..... HALTING")
        break

    except Exception as err:
        print("Error... let's try continuing...?", err)

# [ALL_POSTS] finished looping through all posts
print("========================================================================")
print("-------------------FINISHED LOOPING THROUGH ALL POSTS-------------------")
print("========================================================================")

############################################################
# finish
############################################################
print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "] COMPLETED - Writing posts and comments to file")
posts_df_ori.to_csv(posts_file, encoding='utf-8', index = False)
comments_df_ori.to_csv(comments_file, encoding='utf-8', index = False)
replies_df_ori.to_csv(replies_file, encoding='utf-8', index = False)
fashan7 commented 3 years ago

@milesgratz can we make it all one JSON, I mean posts + comments & replies in one JSON? The comments & replies need to be in a proper chain.
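
One way to get that chain: in the earlier snippet, post.update({"comments_full": results}) already nests each comment, with its materialized replies list, inside the post dict, so dumping that dict yields one chained JSON. A rough sketch reusing those variables:

import json

post = post_result[0]  # the scraped post, with comments (and their replies) nested inside
with open("post_with_comments.json", "w") as f:
    json.dump(post, f, default=str, indent=2)  # default=str serializes datetime values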

fashan7 commented 3 years ago

@neon-ninja I have an issue with scraping. I will attach the sample JSON, in which the comment count is 0 ("comments": 0): sample.txt

from facebook_scraper import *
import pandas as pd
from collections import defaultdict
from time import time
from time import sleep
from datetime import datetime
from datetime import date
from random import randint
# 1685525861651632   post has 123 comments
# 1724454324431012   post has 182 comments
fbpage_id = '1724454324431012'
posts_file = '/Users/aaa/Documents/' + fbpage_id + '_posts.csv'
comments_file = '/Users/aaa/Documents/' + fbpage_id + '_comments.csv'
replies_file = '/Users/aaa/Documents/' + fbpage_id + '_replies.csv'
cookies_file = '/Users/aaa/Documents/ggg/sss/pro/test.txt'

start_url = None
def handle_pagination_url(url):
    global start_url
    start_url = url

posts_df_ori = pd.DataFrame(columns = ['username', 'time', 'likes', 'comments', 'shares', 'reactions', 'post_text'])
comments_df_ori = pd.DataFrame(columns = ['post_id', 'commenter_name', 'comment_time', 'comment_reactors', 'replies', 'comment_text'])
replies_df_ori = pd.DataFrame(columns = ['post_id', 'parent_comment_id', 'commenter_name', 'comment_time', 'comment_reactors', 'comment_text'])

posts_df_list = []
comments_df_list = []
replies_df_list = []

pi=0
all_posts = get_posts(
    post_urls=[fbpage_id],
    extra_info=True,
    cookies = cookies_file,
    timeout = 60,
    options={
        "comments": "generator",
        "comment_start_url": start_url,
        "comment_request_url_callback": handle_pagination_url
    },
)

# [ALL_POSTS] iterate through using next() pagination 
while post := next(all_posts, None):
    pi += 1
    try:        
        # [POST] pandas dataframe
        print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "][", post["post_id"], "] Appending post info to 'posts_df_ori' dataframe. Post index: ", pi, " Total comments: ", post["comments"])
        post_dataframe = post
        post_df = pd.DataFrame.from_dict(post_dataframe, orient='index')
        post_df = post_df.transpose()
        posts_df_ori = posts_df_ori.append(post_df)
        posts_df_list.append(post)

        # [COMMENT] begin loop
        ci=0
        comments = post["comments_full"]
        for comment in comments:

            # [COMMENT] determine replies
            ci += 1
            comment["replies"] = list(comment["replies"])

            # [COMMENT] pandas dataframe - transpose and add post_id
            comment_dataframe = comment
            comment_df = pd.DataFrame.from_dict(comment_dataframe, orient='index')
            comment_df = comment_df.transpose()
            comment_df.insert(0,'post_id',post['post_id'])

            # [COMMENT] append new object with post_id and comment* data to master 
            sleepCalc = randint(3,7)
            print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "][", post["post_id"], "] Appending comments info to 'comments_df_ori' dataframe. Post index: ", pi, " Comment index: ", ci, " Sleeping for: ", sleepCalc)
            comments_df_ori = comments_df_ori.append(comment_df)
            comments_df_list.append(comment)
            sleep(sleepCalc)  # sleep 3-7s after each comment (printed above but missing from the original paste)

            # [COMMENT] determine if replies exist 
            if comment["replies"]:
                ri = 0
                replies = comment['replies']
                for reply in replies:
                    ri += 1

                    # [COMMENT][REPLIES] pandas dataframe - transpose and add post_id, parent_comment_id
                    reply_dataframe = reply
                    reply_df = pd.DataFrame.from_dict(reply_dataframe, orient='index')
                    reply_df = reply_df.transpose()
                    reply_df.insert(0,'post_id',post['post_id'])
                    reply_df.insert(1,'parent_comment_id',comment['comment_id'])

                    # [COMMENT][REPLIES] append new object with post_id, parent_comment_id, and comment* data to master, sleep
                    sleepCalc = randint(10,40)
                    print("[", datetime.now().strftime("%x %-I:%M:%S %p"), "][",  fbpage_id, "][", post["post_id"], "][", comment['comment_id'],"] Appending replies to 'replies_df_ori' dataframe. Post index: ", pi, " Comment index: ", ci, "Replies index: ", ri, " Sleeping for: ", sleepCalc)
                    replies_df_ori = replies_df_ori.append(reply_df)
                    replies_df_list.append(reply)  # append the raw dict, matching comments_df_list
                    sleep(sleepCalc)

            # [COMMENT] sleep for sleepCalc duration
            sleepCalc = randint(5,15)
            sleep(sleepCalc)

        # [POST] increment index, sleep
        sleepCalc = randint(37,89)
        print("---------------------------sleeping for ", sleepCalc, " seconds-----------------------------")

    except exceptions.TemporarilyBanned:
        print("Temporarily banned..... HALTING")
        break

    except Exception as err:
        print("Error... let's try continuing...?", err)

posts_df_ori.to_csv(posts_file, encoding='utf-8', index = False)
comments_df_ori.to_csv(comments_file, encoding='utf-8', index = False)
replies_df_ori.to_csv(replies_file, encoding='utf-8', index = False)

print(posts_df_ori)
print("*********************")
print(comments_df_ori)
print("-------------------------")
print(replies_df_ori)
print("-----------END----------------")
print("-----------Start List----------------")
raw_json = posts_df_list[0]
raw_json.update({"comments_full":comments_df_list})
print(raw_json)
print("*********************")
neon-ninja commented 3 years ago

This is working fine for me: print(next(get_posts(post_urls=[1724454324431012], cookies="cookies.json"))["comments"]) outputs 202

fashan7 commented 3 years ago

@neon-ninja I cannot get comments from this post https://facebook.com/1685525861651632 using the generator method

neon-ninja commented 3 years ago

This is working fine for me:

set_cookies("cookies.json")
post = next(get_posts(post_urls=[1685525861651632], options={"comments": "generator"}))
print(post["comments"], post["comments_full"])
pprint(next(post["comments_full"]))

outputs:

0 <generator object PostExtractor.extract_comments_full at 0x7f5f85bf9ba0>
{'comment_id': '1685644794973072',
 'comment_image': None,
 'comment_reactors': None,
 'comment_text': 'Masha Allah. It has been my habits, reciting it after every '
                 'sallat. Alhamdullilah.',
 'comment_time': datetime.datetime(2021, 9, 4, 0, 0),
 'comment_url': 'https://facebook.com/1685644794973072',
 'commenter_id': '100007158907479',
 'commenter_meta': None,
 'commenter_name': 'Idris Aisha',
 'commenter_url': 'https://facebook.com/idris.aisha.169?fref=nf&rc=p&__tn__=R',
 'replies': []}