unclecode / crawl4ai

🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper
Apache License 2.0

How can I scroll the page? - js_code does not work #86

Closed: navin-hariharan closed this issue 2 months ago

navin-hariharan commented 2 months ago
import base64
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler

js_code = ["""
const scrollPage = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, 100);
        if (window.innerHeight + window.scrollY >= document.body.offsetHeight) {
            resolve();
        } else {
            setTimeout(scroll, 100);
        }
    };
    scroll();
});
scrollPage();
"""]

def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

crawler = create_crawler()

result = crawler.run(
    url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
    js=js_code,
    css_selector='#gridItemRoot',
    screenshot=True,
    bypass_cache=True
)

soup = BeautifulSoup(result.html, 'html.parser')

products = []
for item in soup.select('#gridItemRoot'):
    item_soup = BeautifulSoup(str(item), 'html.parser')

    rank = item_soup.find(class_='zg-bdg-text').get_text() if item_soup.find(class_='zg-bdg-text') else None
    image_link = item_soup.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image').get('src') if item_soup.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image') else None
    link = item_soup.find(class_='a-link-normal')['href'] if item_soup.find(class_='a-link-normal') else None
    name = item_soup.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1').get_text() if item_soup.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1') else None
    rating = item_soup.find(class_='a-icon-alt').get_text() if item_soup.find(class_='a-icon-alt') else None
    price = item_soup.find(class_='_cDEzb_p13n-sc-price_3mJ9Z').get_text() if item_soup.find(class_='_cDEzb_p13n-sc-price_3mJ9Z') else None

    products.append({
        'rank': rank,
        'image_link': image_link,
        'link': link,
        'name': name,
        'rating': rating,
        'price': price,
    })

with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result.screenshot))

print(products)
Anticope12 commented 2 months ago

Hi @unclecode, I don't suppose you have a minute? Would really appreciate your thoughts on the scrolling issue.

unclecode commented 2 months ago

Hi @Anticope12 sorry for delay, let me check your code and update you soon.

navin-hariharan commented 2 months ago

@unclecode Thanks, that would help a lot!

Anticope12 commented 2 months ago

Hi @unclecode, I know you are busy, but I would really appreciate your thoughts.

unclecode commented 2 months ago

@Anticope12 @navin-hariharan Hello everyone, sorry for the delay. Just a few things:

First, there's a better way to run JavaScript code: create an instance of the Selenium crawler strategy and pass your code there. Second, the default crawl behavior scrolls to the bottom of the page, which is why it seems like your code isn't doing anything; it is actually executed, but the page is already at the bottom. I've modified your script to scroll up instead, set headless to False so you can see the browser, and added a 5-second delay, since you're scrolling up 100 pixels at a time, to help you visualize what's happening.

We’re also releasing a new version that fully supports asynchronous crawling with Playwright. It handles JavaScript code much better, with more cool features coming soon. Stay tuned!


import base64, time
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

js_code = ["""
const scrollToTop = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, -100); // Scroll upwards by 100px
        if (window.scrollY === 0) { // Check if we're at the top
            resolve(); // Resolve the promise when we reach the top
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the top
        }
    };
    scroll();
});
scrollToTop();
"""]

crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True, headless=False, js_code=js_code)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()    
result = crawler.run(
    crawler_strategy=crawler_strategy,
    url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
    css_selector='#gridItemRoot',
    screenshot=False,
    bypass_cache=True
)

time.sleep(5) 
soup = BeautifulSoup(result.html, 'html.parser')

products = []
for item in soup.select('#gridItemRoot'):
    item_soup = BeautifulSoup(str(item), 'html.parser')

    rank = item_soup.find(class_='zg-bdg-text').get_text() if item_soup.find(class_='zg-bdg-text') else None
    image_link = item_soup.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image').get('src') if item_soup.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image') else None
    link = item_soup.find(class_='a-link-normal')['href'] if item_soup.find(class_='a-link-normal') else None
    name = item_soup.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1').get_text() if item_soup.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1') else None
    rating = item_soup.find(class_='a-icon-alt').get_text() if item_soup.find(class_='a-icon-alt') else None
    price = item_soup.find(class_='_cDEzb_p13n-sc-price_3mJ9Z').get_text() if item_soup.find(class_='_cDEzb_p13n-sc-price_3mJ9Z') else None

    products.append({
        'rank': rank,
        'image_link': image_link,
        'link': link,
        'name': name,
        'rating': rating,
        'price': price,
    })

with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result.screenshot))

print(products)
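As a pointer for the async version mentioned above, here is a minimal sketch of what crawling with the Playwright-based API might look like. The AsyncWebCrawler class, arun method, and js_code parameter are assumptions based on the announced release, not part of the code in this thread.

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Assumed async API: AsyncWebCrawler / arun mirror WebCrawler / run
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
            js_code=["window.scrollTo(0, document.body.scrollHeight);"],
            css_selector='#gridItemRoot',
            bypass_cache=True
        )
        print(len(result.html))

asyncio.run(main())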
navin-hariharan commented 2 months ago

There are 50 items on the page, but the code returns only 30 items per scrape! Using headless=False I can see that it's scrolling, but the additional data that appears after scrolling is not being fetched by the crawler!

import base64, time
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

js_code = ["""
const scrollToTop = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, -100); // Scroll upwards by 100px
        if (window.scrollY === 0) { // Check if we're at the top
            resolve(); // Resolve the promise when we reach the top
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the top
        }
    };
    scroll();
});
scrollToTop();
"""]

crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True, headless=False, js_code=js_code)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
result = crawler.run(
    crawler_strategy=crawler_strategy,
    url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
    css_selector='#gridItemRoot',
    screenshot=False,
    bypass_cache=True
)

time.sleep(10)
soup = BeautifulSoup(result.html, 'html.parser')

products = []
for item in soup.select('#gridItemRoot'):
    item_soup = BeautifulSoup(str(item), 'html.parser')

    rank = item_soup.find(class_='zg-bdg-text').get_text() if item_soup.find(class_='zg-bdg-text') else None
    image_link = item_soup.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image').get('src') if item_soup.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image') else None
    link = item_soup.find(class_='a-link-normal')['href'] if item_soup.find(class_='a-link-normal') else None
    name = item_soup.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1').get_text() if item_soup.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1') else None
    rating = item_soup.find(class_='a-icon-alt').get_text() if item_soup.find(class_='a-icon-alt') else None
    price = item_soup.find(class_='_cDEzb_p13n-sc-price_3mJ9Z').get_text() if item_soup.find(class_='_cDEzb_p13n-sc-price_3mJ9Z') else None
    products.append({
        'rank': rank,
        'image_link': image_link,
        'link': link,
        'name': name,
        'rating': rating,
        'price': price,
    })

print(len(products))
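The missing 20 items point to lazy loading: they only render as the viewport moves, and the crawler captures the HTML before that happens. For comparison, here is a sketch of a common workaround in plain Selenium (outside crawl4ai, so the driver setup is an assumption, not how the crawler strategy wires things internally): drive the scrolling from Python and re-check the item count until it stops growing, then parse the settled page.

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1")

last_count = -1
while True:
    items = driver.find_elements(By.CSS_SELECTOR, '#gridItemRoot')
    if len(items) == last_count:
        break  # no new items appeared after the last scroll; page has settled
    last_count = len(items)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give lazy-loaded items time to render

soup = BeautifulSoup(driver.page_source, 'html.parser')
print(len(soup.select('#gridItemRoot')))  # ideally all 50 items now
driver.quit()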
navin-hariharan commented 2 months ago

I have tried the below too!

js_code = ["""
const scrollToTop = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, -100); // Scroll upwards by 100px
        if (window.scrollY === 0) { // Check if we're at the top
            resolve(); // Resolve the promise when we reach the top
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the top
        }
    };
    scroll();
});

const scrollToBottom = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, 100); // Scroll down by 100px
        if ((window.innerHeight + window.scrollY) >= document.body.offsetHeight) {
            resolve(); // Resolve when we reach the bottom of the page
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the bottom
        }
    };
    scroll();
});

const waitForContentLoad = () => new Promise(resolve => {
    setTimeout(resolve, 3000); // Wait for 3 seconds after scrolling
});

(async () => {
    await scrollToTop();  // Wait for scrolling to top
    await scrollToBottom();  // After reaching top, scroll to bottom
    await waitForContentLoad();  // Wait for additional content to load
})();
"""]