scrapy-plugins / scrapy-playwright

🎭 Playwright integration for Scrapy
BSD 3-Clause "New" or "Revised" License
992 stars 108 forks source link

Page refreshes unexpectedly on form submission instead of loading dynamic results #263

Closed cjamesanderson closed 6 months ago

cjamesanderson commented 6 months ago

Running the following playwright test works as expected and passes:

import asyncio
from playwright.async_api import Playwright, async_playwright, expect

# Selector for the Liferay portlet iframe that hosts the declaration search form.
_IFRAME_SELECTOR = (
    'iframe[name="_com_liferay_iframe_web_portlet_IFramePortlet_INSTANCE_nv94dDWrzyD2_iframe"]'
)


async def run(playwright: Playwright) -> None:
    """Drive the VMI declaration search form and verify the expected result row."""
    browser = await playwright.chromium.launch(headless=True)
    context = await browser.new_context()
    page = await context.new_page()
    await page.goto(
        "https://www.vmi.lt/evmi/metines-gyventojo-seimos-turto-deklaracijos-duomenu-israsai?lang=en"
    )

    # All form controls live inside the portlet iframe; locators are lazy,
    # so one FrameLocator can be reused for every interaction.
    frame = page.frame_locator(_IFRAME_SELECTOR)

    # Pick year and position group, then submit the search.
    await frame.locator("#ddMetai").select_option("2022")
    await frame.locator("#ddPareigGr").select_option("501")
    await frame.get_by_role("button", name="Ieškoti").click()

    first_row = frame.locator("#ListViewAsm_ctrl0_Label1")
    text = await first_row.inner_text()
    print(text)
    await expect(first_row).to_contain_text("ALMINAS MAČIULIS")

    # ---------------------
    await context.close()
    await browser.close()


async def main():
    """Run the scenario inside a managed Playwright lifecycle."""
    async with async_playwright() as playwright:
        await run(playwright)


asyncio.run(main())

Two drop-down selectors are changed, the submit button is clicked, and the correct results are verified after loading.

However, running the identical code in a scrapy spider with scrapy-playwright results in a pause after the form is submitted, and then an apparent page reload returning the page to its original state instead of loading results. The script then hangs waiting for the "#ListViewAsm_ctrl0_Label1" element. Adding error handling with screenshots confirms that the page has been reloaded.

import scrapy

class LithuanianspiderSpider(scrapy.Spider):
    """Search the VMI family-asset declaration extracts via scrapy-playwright.

    Loads the start page with a live Playwright page attached, fills the two
    drop-downs inside the Liferay portlet iframe, submits the form, and reads
    the first result label.
    """

    name = "lithuanianspider"
    allowed_domains = ["www.vmi.lt"]
    start_urls = ["https://www.vmi.lt/evmi/metines-gyventojo-seimos-turto-deklaracijos-duomenu-israsai?lang=en"]

    # Hand full control of request headers to Playwright. scrapy-playwright's
    # default header emulation breaks this site's form postback: the page
    # reloads to its initial state instead of rendering the results
    # (fix confirmed by the scrapy-playwright maintainer for this site).
    custom_settings = {
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
    }

    # Selector for the Liferay portlet iframe that hosts the search form.
    IFRAME_SELECTOR = (
        'iframe[name="_com_liferay_iframe_web_portlet_IFramePortlet_INSTANCE_nv94dDWrzyD2_iframe"]'
    )

    def start_requests(self):
        """Yield the initial request with a Playwright page exposed in meta."""
        self.logger.info("Running requests...")
        yield scrapy.Request(
            self.start_urls[0],
            meta=dict(
                playwright=True,
                # Needed so parse_form receives the live page for interaction.
                playwright_include_page=True,
            ),
            callback=self.parse_form,
        )

    async def parse_form(self, response):
        """Fill the search form inside the iframe and log the first result row.

        The Playwright page is always closed in ``finally`` — the original
        code leaked the page whenever an interaction before the final
        ``page.close()`` raised.
        """
        page = response.meta["playwright_page"]
        try:
            # Locators are lazy; reuse one FrameLocator for every interaction.
            frame = page.frame_locator(self.IFRAME_SELECTOR)
            await frame.locator("#ddMetai").select_option("2022")
            await frame.locator("#ddPareigGr").select_option("501")
            await frame.get_by_role("button", name="Ieškoti").click()
            self.logger.info("========== WAITING FOR RESULTS ==========")
            try:
                text = await frame.locator("#ListViewAsm_ctrl0_Label1").inner_text()
                await page.screenshot(path="screenshot.png")
                # Lazy %-style args avoid formatting when the level is disabled.
                self.logger.info("==========>>>> RETRIEVED CONTENT: %s", text)
            except Exception:
                # Keep visual evidence of the page state when the result
                # element never appears.
                self.logger.info("========== TIMED OUT ==========")
                await page.screenshot(path="timeout.png")
        finally:
            await page.close()
elacuesta commented 6 months ago

Seems to be working correctly for me with PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None, i.e. giving complete control over header processing to Playwright. The default behavior is to emulate how Scrapy handles headers, but that emulation doesn't work for all cases.

cjamesanderson commented 6 months ago

@elacuesta that was it. thank you!

elacuesta commented 6 months ago

Glad you got it working, closing the issue now.