kevinzg / facebook-scraper

Scrape Facebook public pages without an API key

Won't let me get_posts() for a public profile before a given date #1001 #1002

Open wrmadsen opened 1 year ago

wrmadsen commented 1 year ago

Hi! I'm trying to collect the posts of a public Facebook profile. Its ID is 100008565963538. However, when I run the scraper, it does not collect any posts dated before February 12th 2023. In fact, when it reaches February 12th, the collection, for whatever reason, jumps back to posts from around February 22nd, works its way towards February 12th again, then jumps back to February 22nd, and so on. This loop continues until I stop it or the code is blocked by Facebook.

Any help much appreciated. I include the code below.

The start_url .txt file that is read in consists simply of the following text: "https://m.facebook.com//profile.php?id=100008565963538". The idea is that, after the initial run of the code, this .txt file will hold the start_url to be used in later runs.

from facebook_scraper import get_posts, exceptions
import pandas as pd
import time
import random
import os

# Cookies path
path_to_cookies = "data-raw/Facebook/cookies_en.json"

# Handle pagination URL
start_url = None

def handle_pagination_url(url):
    # Set as global
    global start_url
    start_url = url

    # Print
    print("start url check: " + start_url)

    # Save as txt file
    # First extract ID
    #user_id = re.sub(".+profile_id|&replace_id.+|=|.+php\\?id", " ", url).replace(" ", "")
    #print("user id check: " + user_id)
    #save_start_url_as_txt(user_id, start_url)

# Function to read start_url
def read_start_url(path_to_start_url, id):

    if os.path.isfile(path_to_start_url):

        start_url = open(path_to_start_url, "r")

        start_url = start_url.read().replace("\n", "")

        return start_url
    else:
        return None #"https://m.facebook.com//profile.php?id=" + str(id)

# Save start_url as .txt
def save_start_url_as_txt(id, start_url):

    start_url_file_name = "data-raw/Facebook/start_url_" + str(id) + ".txt"

    start_url = str(start_url)

    with open(start_url_file_name, "w") as text_file:
        text_file.write(start_url)

# Create and save dataframe
def create_save_dataframe(id, attributes_container):

    facebook_df = pd.DataFrame(attributes_container,
                               columns=["User", "Time", "Post URL", "Is_live", "Video ID", "Text", "Comments", "Likes", "Shared text"])

    time_now = time.time()
    time_now = str(time_now)

    facebook_df.to_csv("data-raw/Facebook/facebook_" + str(id) + "_" + time_now + ".csv", index = False)

# Create function scrape Facebook
def get_facebook_posts(id, pages_n, path_to_cookies, which_type):

    # Take time
    start = time.time()

    # Counter
    counter = 0

    # Create a list to collect all post attributes (data)
    attributes_container = []

    # Read in start_url file (if it exists)
    path_to_start_url = "data-raw/Facebook/start_url_" + str(id) + ".txt"
    start_url = read_start_url(path_to_start_url, id)
    print("Start url being used: ", start_url)

    # Retry loop: re-run the scrape after a temporary ban
    while True:
        try:
            for post in get_posts(id,
                                  pages = pages_n,
                                  #options={"progress": True, "username": True, "time": True, "post_url": True,
                                  #         "posts_per_page": 200,
                                  #         "allow_extra_requests": False,
                                  #         },
                                  start_url = start_url,
                                  request_url_callback = handle_pagination_url,
                                  cookies = path_to_cookies
                                  ):

                # Print counter and info after each scraping
                counter += 1

                print(str(counter) + " " + str(id) + " from " + str(post["time"]) + ".      Downloaded at " + str(time.ctime()))

                # Remove \n from strings in shared text
                shared_text_new = post["shared_text"]
                shared_text_new = shared_text_new.replace("\n", " ")

                # Append to dataframe
                attributes_container.append([post["username"], post["time"], post["post_url"], post["is_live"], post["video_id"], post["text"],
                                             post["comments"], post["likes"], shared_text_new])

                time.sleep(random.randint(4, 8))

            break

        except exceptions.TemporarilyBanned:

            save_start_url_as_txt(id, start_url)

            create_save_dataframe(id, attributes_container)

            print("Saved whatever was scraped before temporary ban.")

            print("Temporarily banned at " + str(time.ctime()) + ". Now sleeping for 11 minutes.")

            time.sleep(700)

    save_start_url_as_txt(id, start_url)

    create_save_dataframe(id, attributes_container)

    # Print time elapsed
    end = time.time()
    time_elapsed = end - start
    minutes_elapsed = time_elapsed/60
    minutes_elapsed = round(minutes_elapsed, 3)
    print("#################")
    print("time elapsed: " + str(minutes_elapsed) + " min")
    print("#################")

# Run code
get_facebook_posts(id = 100008565963538, pages_n = 150, path_to_cookies = path_to_cookies, which_type = "profile")
tienchanhtri commented 1 year ago

I also encounter this bug; it's a bug in Facebook's mbasic site. If you click on "See More Stories" enough times, the pagination eventually loops. The only way to work around this issue is to use the -rf flag to pass in the file which contains the cursor URL you want to continue from. Be patient and do it page by page manually; eventually you will get all the posts.
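For anyone staying in Python rather than the CLI, below is a minimal sketch of that page-by-page approach, assuming you persist the cursor URL reported by request_url_callback between runs. The cursor file name, cookie path, and helper functions are illustrative only, not part of the library; the profile ID is the one from this thread.

# Minimal sketch (illustrative, not the CLI -rf flag): resume scraping from a
# saved cursor URL, one page per run.
from facebook_scraper import get_posts

CURSOR_FILE = "cursor_url.txt"  # hypothetical file holding the last pagination URL

def load_cursor():
    # Return the last saved pagination URL, or None on the first run
    try:
        with open(CURSOR_FILE) as f:
            return f.read().strip() or None
    except FileNotFoundError:
        return None

def save_cursor(url):
    # Called by facebook_scraper with each new pagination URL
    with open(CURSOR_FILE, "w") as f:
        f.write(url)

start_url = load_cursor()

# Fetch a single page, then stop; rerun the script to continue from the saved cursor
for post in get_posts("100008565963538",
                      pages=1,
                      start_url=start_url,
                      request_url_callback=save_cursor,
                      cookies="cookies_en.json"):  # path to your own cookie file
    print(post["time"], post["post_url"])

Because each run saves the new cursor before exiting, you can inspect the saved URL between runs and spot when the mbasic pagination starts looping.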

Since it's not related to our side, we should close this @kevinzg