Closed: navin-hariharan closed this issue 2 months ago.
Hi @unclecode, I don't suppose you've got a minute? Would really appreciate your thoughts on the scrolling issue.
Hi @Anticope12, sorry for the delay; let me check your code and update you soon.
@unclecode Thanks, that would help a lot!
Hi @unclecode, I know you are busy, but I would really appreciate your thoughts.
@Anticope12 @navin-hariharan Hello everyone, sorry for the delay. Just a few things:
First, there's a better way to run JavaScript code: create an instance of the Selenium crawler strategy and pass your code to it. Second, the default crawl behavior scrolls to the bottom of the page, which is why it seems like your code isn't doing anything; it actually executes, but only after the page is already at the bottom. I've modified your script to scroll up instead, set headless to False so you can watch the browser, and added a 5-second delay (since you're scrolling up 100 pixels at a time) to help visualize what's happening.
We're also releasing a new version that fully supports asynchronous crawling with Playwright. It handles JavaScript code much better, with more cool features coming soon. Stay tuned! A rough sketch of what that might look like follows the script below.
import base64, time
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

js_code = ["""
const scrollToTop = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, -100); // Scroll upwards by 100px
        if (window.scrollY === 0) { // Check if we're at the top
            resolve(); // Resolve the promise when we reach the top
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the top
        }
    };
    scroll();
});
scrollToTop();
"""]

crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True, headless=False, js_code=js_code)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()

result = crawler.run(
    url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
    css_selector='#gridItemRoot',
    screenshot=True,  # the screenshot is written to disk below, so capture one
    bypass_cache=True
)
time.sleep(5)  # keep the script (and the visible browser window) alive for a moment

soup = BeautifulSoup(result.html, 'html.parser')
products = []
for item in soup.select('#gridItemRoot'):
    # Look each tag up once, then guard against missing elements.
    rank_tag = item.find(class_='zg-bdg-text')
    image_tag = item.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image')
    link_tag = item.find(class_='a-link-normal')
    name_tag = item.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1')
    rating_tag = item.find(class_='a-icon-alt')
    price_tag = item.find(class_='_cDEzb_p13n-sc-price_3mJ9Z')
    products.append({
        'rank': rank_tag.get_text() if rank_tag else None,
        'image_link': image_tag.get('src') if image_tag else None,
        'link': link_tag['href'] if link_tag else None,
        'name': name_tag.get_text() if name_tag else None,
        'rating': rating_tag.get_text() if rating_tag else None,
        'price': price_tag.get_text() if price_tag else None,
    })

with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result.screenshot))
print(products)
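For anyone curious about the async version mentioned above, here is a minimal sketch of what the Playwright-based usage might look like; treat AsyncWebCrawler and arun as assumptions until the release lands:

import asyncio
from crawl4ai import AsyncWebCrawler  # assumed entry point of the upcoming release

async def main():
    # The context manager handles browser startup and teardown.
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
            js_code=["window.scrollTo(0, 0);"],  # e.g. jump back to the top
            css_selector='#gridItemRoot',
            bypass_cache=True,
        )
        print(len(result.html))

asyncio.run(main())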
There are 50 items on the page, but the code scrapes only 30! Using headless=False I can see that it's scrolling, but the additional data that appears after scrolling is not being fetched by the crawler!
import time
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

js_code = ["""
const scrollToTop = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, -100); // Scroll upwards by 100px
        if (window.scrollY === 0) { // Check if we're at the top
            resolve(); // Resolve the promise when we reach the top
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the top
        }
    };
    scroll();
});
scrollToTop();
"""]

crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True, headless=False, js_code=js_code)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()

result = crawler.run(
    url="https://www.amazon.com/Best-Sellers-Automotive-Automotive-Replacement-Parts/zgbs/automotive/15719731/ref=zg_bs_pg_2_automotive?_encoding=UTF8&pg=1",
    css_selector='#gridItemRoot',
    screenshot=False,
    bypass_cache=True
)
time.sleep(10)  # note: result.html was already captured by run(); sleeping here cannot add items

soup = BeautifulSoup(result.html, 'html.parser')
products = []
for item in soup.select('#gridItemRoot'):
    # Look each tag up once, then guard against missing elements.
    rank_tag = item.find(class_='zg-bdg-text')
    image_tag = item.find('img', class_='a-dynamic-image p13n-sc-dynamic-image p13n-product-image')
    link_tag = item.find(class_='a-link-normal')
    name_tag = item.find(class_='_cDEzb_p13n-sc-css-line-clamp-3_g3dy1')
    rating_tag = item.find(class_='a-icon-alt')
    price_tag = item.find(class_='_cDEzb_p13n-sc-price_3mJ9Z')
    products.append({
        'rank': rank_tag.get_text() if rank_tag else None,
        'image_link': image_tag.get('src') if image_tag else None,
        'link': link_tag['href'] if link_tag else None,
        'name': name_tag.get_text() if name_tag else None,
        'rating': rating_tag.get_text() if rating_tag else None,
        'price': price_tag.get_text() if price_tag else None,
    })

print(len(products))
I have tried the below too:
js_code = ["""
const scrollToTop = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, -100); // Scroll upwards by 100px
        if (window.scrollY === 0) { // Check if we're at the top
            resolve(); // Resolve the promise when we reach the top
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the top
        }
    };
    scroll();
});
const scrollToBottom = () => new Promise(resolve => {
    const scroll = () => {
        window.scrollBy(0, 100); // Scroll down by 100px
        if ((window.innerHeight + window.scrollY) >= document.body.offsetHeight) {
            resolve(); // Resolve when we reach the bottom of the page
        } else {
            setTimeout(scroll, 100); // Keep scrolling until we reach the bottom
        }
    };
    scroll();
});
const waitForContentLoad = () => new Promise(resolve => {
    setTimeout(resolve, 3000); // Wait for 3 seconds after scrolling
});
(async () => {
    await scrollToTop(); // Wait for scrolling to top
    await scrollToBottom(); // After reaching top, scroll to bottom
    await waitForContentLoad(); // Wait for additional content to load
})();
"""]