scrapy-plugins / scrapy-playwright

🎭 Playwright integration for Scrapy
BSD 3-Clause "New" or "Revised" License
911 stars 101 forks source link

Javascript not triggering in ASPX web page (works in regular playwright). #271

Open rasert opened 1 month ago

rasert commented 1 month ago

After clicking two radio buttons, the page should post-back and display a form. Unfortunately this is not happening. In regular playwright it works. I can't understand why.

This is the broken code:

import scrapy

class EmpenhosSpider(scrapy.Spider):
    name = "empenhos.py"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        url = "https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/ConsultaDespesaAno.aspx?orgao="
        yield scrapy.Request(
            url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
            },
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]

        await page.get_by_label('Empenhado').click()
        await page.get_by_label('Ordem Bancária').click()

        await page.wait_for_load_state(state='load')

        await self.enter_cnpj(page)

        await page.wait_for_timeout(10000)
        await page.close()
        return {}

    async def enter_cnpj(self, page):
        input_cnpj = page.locator('#ctl00_ContentPlaceHolder1_txtCPF')
        await input_cnpj.fill('07797967000195') # TODO: executar para todos os CNPJs conhecidos
        await page.locator('#ctl00_ContentPlaceHolder1_btnPesquisar').click()
        credor = page.locator('#ctl00_ContentPlaceHolder1_gdvCredor td a')
        await credor.click()
        await page.wait_for_load_state(state="load")

And this is the pure Playwright working code:

import asyncio
import json
from playwright.async_api import async_playwright

count = 1
empenhos = []

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/ConsultaDespesaAno.aspx?orgao=")
        print(await page.title())

        await page.get_by_label('Empenhado').click()
        await page.get_by_label('Ordem Bancária').click()

        await page.wait_for_load_state(state="load")

        await entra_cnpj(page)
        await processa_orgaos(page)

        await browser.close()

        # Escrever todos os empenhos novamente no arquivo JSON
        with open('results/empenhos.json', 'w', encoding='utf-8') as json_file:
            json.dump(empenhos, json_file, ensure_ascii=False)

async def entra_cnpj(page):
    input_cnpj = page.locator('#ctl00_ContentPlaceHolder1_txtCPF')
    await input_cnpj.fill('07797967000195') # TODO: executar para todos os CNPJs conhecidos
    await page.locator('#ctl00_ContentPlaceHolder1_btnPesquisar').click()
    credor = page.locator('#ctl00_ContentPlaceHolder1_gdvCredor td a')
    await credor.click()
    await page.wait_for_load_state(state="load")

async def processa_orgaos(page):
    tabela_orgaos = page.locator('#ctl00_ContentPlaceHolder1_gdvListaOrgao')
    links = await tabela_orgaos.locator('td a').all()
    for link in links:
        await link.click()
        await page.wait_for_load_state(state="load")
        await processa_unidades_gestoras(page)

async def processa_unidades_gestoras(page):
    global count
    tabela_ugestoras = page.locator('#ctl00_ContentPlaceHolder1_gdvListaUg')
    links = await tabela_ugestoras.locator('td a').all()
    for link in links:
        await link.click()
        await page.wait_for_load_state(state="load")
        await page.screenshot(path=f'results/tela{count}.png')
        count += 1
        await processa_empenhos(page)
        # Necessário voltar para continuar processando demais Órgãos e Unidades Gestoras
        await page.go_back(wait_until='load')

async def processa_empenhos(page):
    global empenhos
    novos_empenhos = []

    exercicio = await page.locator('#ctl00_ContentPlaceHolder1_lblAno').inner_text()
    credor = await page.locator('#ctl00_ContentPlaceHolder1_lblCgcCpfNomeCredor').inner_text()
    orgao = await page.locator('#ctl00_ContentPlaceHolder1_lblCodNomeOrgao').inner_text()
    unidade_gestora = await page.locator('#ctl00_ContentPlaceHolder1_lblCodNomeUgResponsavel').inner_text()

    # Extrair os campos desejados da tabela
    tabela_documentos = page.locator('#ctl00_ContentPlaceHolder1_gdvDocumento')
    linhas = await tabela_documentos.locator('tr.linha_grid_alt').all()
    for linha in linhas:
        colunas = linha.locator('td')
        data = await colunas.locator("nth=0").inner_text()
        numero_doc = await colunas.locator("nth=1").inner_text()
        pagamento_referente = await colunas.locator("nth=2").inner_text()
        descricao = await colunas.locator("nth=3").inner_text()
        nota_empenho_origem = await colunas.locator("nth=4").inner_text()
        fonte_recurso = await colunas.locator("nth=5").inner_text()
        valor_documento = await colunas.locator("nth=6").inner_text()
        # Criar um dicionário com os campos extraídos
        item = {
            'Exercicio': exercicio,
            'Credor': credor,
            'Orgao': orgao,
            'UnidadeGestora': unidade_gestora,
            'Data': data,
            'NumeroDoc': numero_doc,
            'PagamentoReferente': pagamento_referente,
            'Descricao': descricao,
            'NotaDeEmpenhoOrigem': nota_empenho_origem,
            'FonteRecurso': fonte_recurso,
            'ValorDocumento': valor_documento
        }
        novos_empenhos.append(item)

    empenhos += novos_empenhos

asyncio.run(main())
elacuesta commented 1 month ago

Found this in the logs:

2024-05-16 16:17:16 [scrapy-playwright] DEBUG: [Context=default] Overridden method for Playwright request to ...: original=GET new=POST

Seems like a bug with the method override (https://github.com/scrapy-plugins/scrapy-playwright/pull/177), I got expected behavior by commenting out these two lines. This happens because the site makes a POST request to the same URL after each radio button click and that triggers this logic. It's a tricky problem to recognize which Playwright request corresponds to the Scrapy request and I've attempted a few ways, at this point I'm not sure exactly how to solve it once and for all (other than having some meta key like no_method_override, which I don't really like).

However, there's a workaround: by making the first request as POST the methods match and there is no override. You also need PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None in your settings, otherwise the page does not reload (it probably doesn't recognize the headers as correct).

dziugas505hq commented 1 month ago

@elacuesta I've had the same problem and spent a lot of hours debugging :D The site was malfunctioning about the sent headers, PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None fixed the problem, huge thanks!

runa commented 1 month ago

@elacuesta I found this issue after debugging for hours and hours. Setting PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None does something good (all JS ends up loading and __doPostBack is available) but then the app is not working; I suspect a cookie is not being set.

Any other workarounds?

elacuesta commented 4 weeks ago

I suspect a cookie is not being set.

This shouldn't be the case with PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None, the handler will not override headers in this case.

Any other workarounds?

Hard to know without seeing what you're trying to do.

runa commented 4 weeks ago

@elacuesta thanks, I'll debug this further with PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None and report back.

If I cannot find it, I'll show a working script using playwright directly and other, not working, using scrapy-playwright

runa commented 4 weeks ago

So, this are my findings:

I've uploaded the HAR files to: https://drive.google.com/drive/folders/1sxvG_Suh-XYg6DGHB-761DCW1cpnHaLS?usp=sharing

Code for the scrapy-playwright:

import scrapy

class TestSpider(scrapy.Spider):
    name = "test"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": False,
            "proxy": {"server": "http://127.0.0.1:8888"},
            "timeout": 120 * 1000,  # 20 seconds

        },
    }
    def start_requests(self):
        # GET request
        yield scrapy.Request("https://ccms.clerk.org/", meta={"playwright": True,                     "playwright_include_page": True,      
                                                              "playwright_context_kwargs": { "record_har_mode": "full", "record_har_path":"/tmp/scrapy-playwright-NO_PROCESS_HEADERS.har"}       
})

    async def parse(self, response, **kwargs):
        page=response.meta['playwright_page']
        print("click")
        await page.locator("#Content1_button_accept").click()
        print(await page.title())
        await page.wait_for_selector("#Content1_CaseNum")

        return {"url": response.url}

Code for the playwright only test script:

import asyncio
import playwright
from playwright.async_api import (
    async_playwright,
    TimeoutError as PlaywrightTimeoutError,
)

async def main():
     async with async_playwright() as p:
        browser = await p.chromium.launch( headless=False, proxy={"server": "http://127.0.0.1:8888"})
        context = await browser.new_context( record_har_mode="full", record_har_path="/tmp/playwright.har",)
        page = await context.new_page()
        await page.goto('https://ccms.clerk.org/')
        print("click")
        await  page.locator("#Content1_button_accept").click()
        print( await  page.title())
        await  page.wait_for_selector("#Content1_CaseNum")
        await context.close()
asyncio.run(main())

Unfortunately the website only works for US IP addresses (I'm using a proxy)

Thanks in advance!

elacuesta commented 3 weeks ago

It seems to be the same issue, the method override is being triggered. The same workaround I've mentioned before applies too: add method="POST" to your initial request.