scrapy-plugins / scrapy-playwright

🎭 Playwright integration for Scrapy
BSD 3-Clause "New" or "Revised" License
911 stars 101 forks source link

Modifying headers when sending out the request #258

Open louislsh68 opened 5 months ago

louislsh68 commented 5 months ago

I am using scrapy-playwright with Brave. I found that the code adds "pragma" and "cache-control" headers. Is it possible to remove them? Usually we would use the route function, but route is already being used in the base code of scrapy-playwright.

elacuesta commented 5 months ago

Have you tried PLAYWRIGHT_PROCESS_REQUEST_HEADERS?

louislsh68 commented 5 months ago

I tried it; the result is the same. I also read something interesting from puppeteer users on Chromium — they faced the same issue: https://github.com/puppeteer/puppeteer/issues/3403. Not sure if this is the same issue across all automation control via Chromium.

My code for ref

#Brave-For discussion
import scrapy,random
from scrapy.exceptions import CloseSpider
from http.cookies import SimpleCookie
from playwright import async_api

max_spider_run_time = 1200

class MySpider(scrapy.Spider):
    name = 'Scraper'

    def should_abort_request(request):
        return (
            request.resource_type in ["image", "images", "font", "stylesheet", "media"]
            )

    def get_custom_headers(browser_type: str, playwright_request: async_api.Request, scrapy_headers: scrapy.http.headers.Headers) -> dict:
        return {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", 
            "Accept-Encoding": "gzip, deflate", 
            "Accept-Language": "en-GB", 
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }

    custom_settings = {
        'PLAYWRIGHT_BROWSER_TYPE':'chromium',
        'DOWNLOAD_DELAY': 4.0,
        'PLAYWRIGHT_LAUNCH_OPTIONS': {
            'headless': False,
            "slow_mo": 20000,
            "timeout": 30000,
            "executable_path": "/Applications/Brave Browser.app/contents/MacOS/Brave Browser",
            "args": [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--no-first-run',
                '--disable-blink-features=AutomationControlled'],
        },
        "PLAYWRIGHT_ABORT_REQUEST" : should_abort_request,
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": get_custom_headers,
    }

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)

    async def closed(self, reason):
        if hasattr(self, 'playwright_context') and self.playwright_context is not None:
            await self.playwright_context.close()
            self.playwright_context = None

        if reason == 'closespider_timeout':
            self.log("Spider run time exceeded")

    def start_requests(self):
        self.current_country_index = 0
        yield from self.go_to_website()

    def go_to_website(self, offset=0):
        if self.current_country_index < len(self.countryabbr_list):
            countryabbr = self.countryabbr_list[self.current_country_index]
            rand_port = random.randint(10000, 20000)
            first_two_chars = countryabbr[:2]
            init_request = scrapy.Request(
                url = "https://httpbin.org/headers",
                callback=self.retrieve_cookies,
                errback=self.handle_error,
                dont_filter=True,  
                meta={
                    'playwright': True, 
                    'dont_merge_cookies': True,
                    'offset': offset,
                    'playwright_context': f'{first_two_chars}_{rand_port}',
                },
            )
            yield init_request
        else:
            raise CloseSpider('No more things to scrape')

    def retrieve_cookies(self, response):
        raw_cookies = response.headers.getlist('Set-Cookie')
        self.cookies = {}
        for rc in raw_cookies:
            cookie = SimpleCookie(rc.decode())
            for key, morsel in cookie.items():
                self.cookies[key] = morsel.value
        self.headers = response.request.headers`

The headers received (screenshot attachment: 5DF50325-991E-4C9C-A148-D5F8298FDF4A)

(edited for syntax highlighting)

elacuesta commented 5 months ago

I see. This might be an upstream issue with Playwright and/or the underlying browser. Please try to reproduce using standalone Playwright as indicated in https://github.com/scrapy-plugins/scrapy-playwright#reporting-issues. If the issue also appears that way, it's not caused by this package.