Open louislsh68 opened 5 months ago
Have you tried PLAYWRIGHT_PROCESS_REQUEST_HEADERS?
I tried it; the result is the same. I also found a related report from a Puppeteer user on Chromium — they faced the same issue: https://github.com/puppeteer/puppeteer/issues/3403. I am not sure whether this is the same issue across all automation control via Chromium.
My code for ref
#Brave-For discussion
import scrapy,random
from scrapy.exceptions import CloseSpider
from http.cookies import SimpleCookie
from playwright import async_api
# Maximum intended spider run time in seconds (20 minutes).
# NOTE(review): not referenced anywhere in the visible code — presumably meant
# to be wired into CLOSESPIDER_TIMEOUT in custom_settings; confirm with author.
max_spider_run_time = 1200
class MySpider(scrapy.Spider):
    """Playwright-driven spider that launches Brave (Chromium) and inspects
    the request headers actually sent, via https://httpbin.org/headers.

    Demonstrates scrapy-playwright's PLAYWRIGHT_PROCESS_REQUEST_HEADERS and
    PLAYWRIGHT_ABORT_REQUEST hooks. One fresh browser context is created per
    country/port combination.
    """

    name = 'Scraper'

    def should_abort_request(request):
        # Called by scrapy-playwright as a plain callable (no ``self``);
        # drop heavy resource types that are not needed for header scraping.
        return (
            request.resource_type in ["image", "images", "font", "stylesheet", "media"]
        )

    def get_custom_headers(browser_type: str, playwright_request: async_api.Request, scrapy_headers: scrapy.http.headers.Headers) -> dict:
        # Called by scrapy-playwright for every request; the returned dict
        # REPLACES the outgoing headers entirely with this fixed set.
        return {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-GB",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }

    custom_settings = {
        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
        'DOWNLOAD_DELAY': 4.0,
        'PLAYWRIGHT_LAUNCH_OPTIONS': {
            'headless': False,
            "slow_mo": 20000,
            "timeout": 30000,
            # Point Playwright at the locally installed Brave binary.
            "executable_path": "/Applications/Brave Browser.app/contents/MacOS/Brave Browser",
            "args": [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--no-first-run',
                '--disable-blink-features=AutomationControlled'],
        },
        "PLAYWRIGHT_ABORT_REQUEST": should_abort_request,
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": get_custom_headers,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # FIX: ``countryabbr_list`` is read in go_to_website() but was never
        # assigned anywhere in the original code, which guarantees an
        # AttributeError on the first request. Default to empty (spider then
        # closes cleanly via CloseSpider) while still allowing a value passed
        # as a spider argument (-a countryabbr_list=...) to take precedence.
        if not hasattr(self, 'countryabbr_list'):
            self.countryabbr_list = []

    async def closed(self, reason):
        """Scrapy close hook: tear down the Playwright context if one is open."""
        if hasattr(self, 'playwright_context') and self.playwright_context is not None:
            await self.playwright_context.close()
            self.playwright_context = None
        if reason == 'closespider_timeout':
            self.log("Spider run time exceeded")

    def start_requests(self):
        self.current_country_index = 0
        yield from self.go_to_website()

    def go_to_website(self, offset=0):
        """Yield the initial header-inspection request for the current country.

        Raises CloseSpider once every country in ``countryabbr_list`` has been
        processed.
        """
        if self.current_country_index < len(self.countryabbr_list):
            countryabbr = self.countryabbr_list[self.current_country_index]
            rand_port = random.randint(10000, 20000)
            first_two_chars = countryabbr[:2]
            init_request = scrapy.Request(
                url="https://httpbin.org/headers",
                callback=self.retrieve_cookies,
                errback=self.handle_error,
                dont_filter=True,
                meta={
                    'playwright': True,
                    'dont_merge_cookies': True,
                    'offset': offset,
                    # Unique name per country/port so each run gets a fresh
                    # Playwright browser context.
                    'playwright_context': f'{first_two_chars}_{rand_port}',
                },
            )
            yield init_request
        else:
            raise CloseSpider('No more things to scrape')

    def retrieve_cookies(self, response):
        """Collect Set-Cookie values from the response and stash the headers
        that were actually sent with the request."""
        raw_cookies = response.headers.getlist('Set-Cookie')
        self.cookies = {}
        for rc in raw_cookies:
            cookie = SimpleCookie(rc.decode())
            for key, morsel in cookie.items():
                self.cookies[key] = morsel.value
        # FIX: removed the stray trailing backtick from the original paste,
        # which made this line a syntax error.
        self.headers = response.request.headers
The headers received
(edited for syntax highlighting)
I see. This might be an upstream issue with Playwright and/or the underlying browser. Please try to reproduce using standalone Playwright as indicated in https://github.com/scrapy-plugins/scrapy-playwright#reporting-issues. If the issue also appears that way, it's not caused by this package.
I am using scrapy-playwright with Brave. I found that the code adds "pragma" and "cache-control" as headers. Is it possible to remove them? Usually we would use the route function, but route is already used in the base code of scrapy-playwright.