scrapy-plugins / scrapy-playwright

🎭 Playwright integration for Scrapy

Scrapy playwright infinite scroll isn't working #255

Closed. AliHassan89 closed this issue 9 months ago.

AliHassan89 commented 9 months ago

I have the following code. It opens the headless browser and I can see the page being scrolled, but the response object in the parse method doesn't contain any HTML. When I don't use auto-scrolling, this spider works perfectly.

The code is only supposed to extract the product name and price from this website.

import scrapy
import re
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup

def should_abort_request(req):
    if req.resource_type == "image":
        return True
    if req.method.lower() == 'post':
        return True
    return False

scrolling_script = """
  const scrolls = 8
  let scrollCount = 0

  // scroll down and then wait for 5s
  const scrollInterval = setInterval(() => {
    window.scrollTo(0, document.body.scrollHeight)
    scrollCount++

    if (scrollCount === numScrolls) {
      clearInterval(scrollInterval)
    }
  }, 5000)
  """

class AuchanSpider(scrapy.Spider):
    name = 'auchan'
    custom_settings = {
        'PLAYWRIGHT_ABORT_REQUEST': should_abort_request
    }
    start_urls = ['https://zakupy.auchan.pl/shop/list/8029?shType=id']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_page_methods": [
                        PageMethod("wait_for_selector", "._1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_"),
                        PageMethod("evaluate", scrolling_script),
                        #PageMethod("wait_for_timeout", 30000),
                        PageMethod("wait_for_selector", "._1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_:nth-child(60)")
                    ],
                },
                errback=self.close_page,
                cb_kwargs=dict(main_url=url, page_number=0),
            )

    async def parse(self, response, main_url, page_number):
        soup = BeautifulSoup(response.text, 'html.parser')
        product_containers = soup.find_all('div', class_='_1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_')
        for product_container in product_containers:
            price = product_container.find(class_='_1-UB _1Evs').get_text()
            price = re.sub(r"[\n\t\s]*", "", price)
            yield {
                'productName': product_container.find(class_='_1DGZ').get_text(),
                'price': price
            }

    async def close_page(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()

elacuesta commented 9 months ago

I see two issues here:

1. Your scrolling script declares scrolls but checks scrollCount === numScrolls; numScrolls is never defined, so the comparison throws and clearInterval never runs.
2. Selectors like "._1E5b _2I59 _1wkJ ..." don't target one element carrying all of those classes: the spaces make them descendant selectors, so the wait_for_selector calls can never match, the requests time out, and the errback fires instead of parse. To require all classes on a single element, join them with dots.
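
For instance, a corrected script and wait could look like this (an untested sketch; the dot-joined class names are copied from your spider and may change at any time, since they look auto-generated):

scrolling_script = """
  const scrolls = 8
  let scrollCount = 0

  // scroll to the bottom every 5s, stop after `scrolls` iterations
  const scrollInterval = setInterval(() => {
    window.scrollTo(0, document.body.scrollHeight)
    scrollCount++

    if (scrollCount === scrolls) {  // compare against the variable that is actually defined
      clearInterval(scrollInterval)
    }
  }, 5000)
  """

PageMethod("wait_for_selector", "._1E5b._2I59._1wkJ._3YFw.igxN._7Zx6.Eb4X._390_")
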
You can access the response in your errback to verify whether it contains the full HTML, via failure.response. For instance:

async def close_page(self, failure):
    ...
    with open("page.html", "w") as fp:
        fp.write(failure.response.text)
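
Also, since you request the page with playwright_include_page=True, you are responsible for closing it in the successful callback as well, not only in the errback; otherwise every crawled page stays open. A minimal sketch, reusing your parse signature:

async def parse(self, response, main_url, page_number):
    # the page object is exposed here because playwright_include_page is True
    page = response.meta["playwright_page"]
    await page.close()
    # ... continue parsing response.text as before ...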

In any case, this is a support question rather than a bug report or feature request. If you need further assistance, please use a more appropriate venue, e.g. the scrapy-playwright tag on StackOverflow.