scrapy-plugins / scrapy-playwright

🎭 Playwright integration for Scrapy
BSD 3-Clause "New" or "Revised" License
996 stars 110 forks source link

error: playwright._impl._api_types.Error: net::ERR_PROXY_CONNECTION_FAILED #85

Closed dsism closed 2 years ago

dsism commented 2 years ago

I have the following spider:

# Python
from typing import Any, Dict, Iterator, List
from urllib.parse import urlparse

# 3rd Party
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.link import Link
from scrapy.http import Response
from scrapy.http import Request
from scrapy_playwright.page import PageMethod

class ClientSideSiteSpider(CrawlSpider):
    """Crawl one site starting from ``start_url`` and emit one item per response.

    The start request is tagged with ``custom_meta`` so it is downloaded through
    scrapy-playwright using context ``1`` (which carries the proxy settings).
    Links are followed when they match the start URL's path; query strings and
    fragments are stripped from every extracted link first.

    NOTE(review): the ``Rule`` below has no ``process_request`` hook, so requests
    for followed links are not given ``custom_meta`` by this class. Confirm
    whether followed pages are also meant to be rendered with Playwright.
    """

    name = "client-side-site"
    # Non-2xx statuses listed here so that those responses reach the callbacks.
    handle_httpstatus_list = [301, 302, 401, 403, 404, 408, 429, 500, 503]
    # Declared but not consulted anywhere in this class.
    exclude_patterns: List[str] = []

    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "ITEM_PIPELINES": {
            "scrapy.pipelines.files.FilesPipeline": 1,
            # some pipelines..
        },
        "DOWNLOADER_MIDDLEWARES": {
            # some middlewares...
            "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
        },
        "PLAYWRIGHT_CONTEXTS": {
            # Context key referenced by custom_meta["playwright_context"].
            1: {
                "ignore_https_errors": True,
                "proxy": {
                    "server": "my proxy server address ...",
                    "username": "user",
                    "password": "pwd",
                }
            }
        }
    }

    # Request meta attached to the start requests (see start_requests).
    custom_meta = {
        "playwright": True,
        "playwright_context": 1,
        # NOTE(review): no callback in this class reads or closes
        # response.meta["playwright_page"]. scrapy-playwright documents that
        # included pages must be closed by the caller; confirm this is needed.
        "playwright_include_page": True,
        "playwright_page_methods": [
            PageMethod("wait_for_load_state", "networkidle"),
        ],
    }

    def __init__(
        self,
        crawl_id: str,
        domain: str,
        start_url: str,
        site_id: str,
        user_agent: str = None,
        user_agent_id: str = None,
        **kwargs: Any
    ):
        """Configure the crawl for a single site.

        Args:
            crawl_id: Identifier of this crawl, stored on the spider.
            domain: The only domain placed in ``allowed_domains``.
            start_url: The only entry in ``start_urls``; its path is also used
                as the ``allow`` pattern of the link extractor.
            site_id: Identifier of the site, stored on the spider.
            user_agent: Optional user agent string, stored on the spider.
            user_agent_id: Optional user agent identifier, stored on the spider.
            **kwargs: Forwarded to ``CrawlSpider.__init__``.
        """
        self.crawl_id = crawl_id
        self.site_id = site_id
        self.user_agent = user_agent
        self.user_agent_id = user_agent_id
        self.allowed_domains: List[str] = [domain]
        self.start_urls: List[str] = [start_url]

        if user_agent is not None:
            # Rebind a per-instance copy instead of writing into the class-level
            # dict, which is shared by every instance of this spider class.
            # NOTE(review): Scrapy documents custom_settings as being read from
            # the class before instantiation, so this value may not reach the
            # crawler settings; the ``user_agent`` attribute above is what
            # Scrapy's UserAgentMiddleware looks at.
            self.custom_settings = {**self.custom_settings, "USER_AGENT": user_agent}

        # Rules must exist before CrawlSpider.__init__ runs, hence the late super() call.
        allow_path = urlparse(start_url).path
        self.rules = (
            Rule(
                LinkExtractor(allow=allow_path),
                callback="parse_item",
                process_links="process_links",
                follow=True,
            ),
        )

        super().__init__(**kwargs)

    def start_requests(self) -> Iterator[Request]:
        """Yield one request per start URL, tagged with ``custom_meta``."""
        for url in self.start_urls:
            yield Request(url, meta=self.custom_meta)

    def process_links(self, links: List[Link]) -> List[Link]:
        """Strip query strings and fragments, then drop excluded links.

        The ``Link`` objects are modified in place.

        Args:
            links: Links extracted from a response.

        Returns:
            The links whose cleaned URL passes ``not_in_pattern``, in input order.
        """
        kept: List[Link] = []

        for link in links:
            without_query = link.url.split("?", 1)[0]  # remove all query strings!
            link.url = without_query.split("#", 1)[0]  # remove anchor links!

            if self.not_in_pattern(link.url):
                kept.append(link)

        return kept

    def not_in_pattern(self, compare: str) -> bool:
        """Return True when ``compare`` contains neither ``tel:`` nor ``mailto:``."""
        return not any(pattern in compare for pattern in ("tel:", "mailto:"))

    def parse_start_url(self, response: Response) -> Dict[str, Any]:
        """Produce an item for the start URL response, same as any other page."""
        return self.parse_item(response)

    def parse_item(self, response: Response) -> Dict[str, Any]:
        """Build the item describing a downloaded response.

        Returns:
            A dict with the HTTP status, the response URL (as ``file_urls``),
            the raw body bytes, the ``Content-Type`` header and the
            ``download_latency`` meta value (``None`` when absent).
        """
        return {
            "status": response.status,
            "file_urls": [response.url],
            # Public accessor; the private _get_body() is not part of the Response API.
            "body": response.body,
            "type": response.headers.get("Content-Type", ""),
            "latency": response.meta.get("download_latency"),
        }

but I am always getting the error:

2022-05-05 12:13:13 [scrapy.core.scraper] ERROR: Error downloading <GET https://xxxxx.com/>
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/twisted/internet/defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "/usr/local/lib/python3.8/dist-packages/twisted/python/failure.py", line 512, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "/usr/local/lib/python3.8/dist-packages/scrapy/core/downloader/middleware.py", line 44, in process_request
    return (yield download_func(request=request, spider=spider))
  File "/usr/local/lib/python3.8/dist-packages/twisted/internet/defer.py", line 824, in adapt
    extracted = result.result()
  File "/usr/local/lib/python3.8/dist-packages/scrapy_playwright/handler.py", line 241, in _download_request
    result = await self._download_request_with_page(request, page)
  File "/usr/local/lib/python3.8/dist-packages/scrapy_playwright/handler.py", line 252, in _download_request_with_page
    response = await page.goto(request.url)
  File "/usr/local/lib/python3.8/dist-packages/playwright/async_api/_generated.py", line 7581, in goto
    await self._async(
  File "/usr/local/lib/python3.8/dist-packages/playwright/_impl/_page.py", line 493, in goto
    return await self._main_frame.goto(**locals_to_params(locals()))
  File "/usr/local/lib/python3.8/dist-packages/playwright/_impl/_frame.py", line 122, in goto
    await self._channel.send("goto", locals_to_params(locals()))
  File "/usr/local/lib/python3.8/dist-packages/playwright/_impl/_connection.py", line 39, in send
    return await self.inner_send(method, params, False)
  File "/usr/local/lib/python3.8/dist-packages/playwright/_impl/_connection.py", line 63, in inner_send
    result = next(iter(done)).result()
playwright._impl._api_types.Error: net::ERR_PROXY_CONNECTION_FAILED at https://xxxxx.com/

Based on this, I also tried with:

# Alternative attempt: proxy supplied through the launch options rather than
# through a per-context entry.
PLAYWRIGHT_LAUNCH_OPTIONS = dict(
    proxy=dict(
        server="my server address ...",
        username="user",
        password="pass",
    ),
)

But got the same error. The proxy provider I'm using is ok, since it's working normally with scrapy.

Can anyone identify where the problem is? I've been looking for days :(

elacuesta commented 2 years ago

What result do you get with plain playwright, without scrapy or scrapy-playwright? e.g.:

import asyncio
from playwright.async_api import async_playwright

async def main():
    """Load https://httpbin.org/ip through the proxy with Firefox, then Chromium.

    For each browser type: launch it with the proxy settings, open a context
    that ignores HTTPS errors, navigate, print the page content and close
    the browser.
    """
    proxy_settings = {
        "server": "***",
        "username": "***",
        "password": "***",
    }
    async with async_playwright() as playwright:
        engines = (playwright.firefox, playwright.chromium)
        for engine in engines:
            browser = await engine.launch(proxy=proxy_settings)
            context = await browser.new_context(ignore_https_errors=True)
            page = await context.new_page()
            await page.goto("https://httpbin.org/ip")
            html = await page.content()
            print(html)
            await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
dsism commented 2 years ago

What result do you get with plain playwright, without scrapy or scrapy-playwright? e.g.:

import asyncio
from playwright.async_api import async_playwright

async def main():
    """Load https://httpbin.org/ip through the proxy with Firefox, then Chromium.

    For each browser type: launch it with the proxy settings, open a context
    that ignores HTTPS errors, navigate, print the page content and close
    the browser.
    """
    proxy_settings = {
        "server": "***",
        "username": "***",
        "password": "***",
    }
    async with async_playwright() as playwright:
        engines = (playwright.firefox, playwright.chromium)
        for engine in engines:
            browser = await engine.launch(proxy=proxy_settings)
            context = await browser.new_context(ignore_https_errors=True)
            page = await context.new_page()
            await page.goto("https://httpbin.org/ip")
            html = await page.content()
            print(html)
            await browser.close()


if __name__ == "__main__":
    asyncio.run(main())

Hey @elacuesta , here it is:

params **** {'sdkLanguage': 'python'}
params **** {'proxy': {'server': '*', 'username': '*', 'password': '*'}}
params **** {'ignoreHTTPSErrors': True}
params **** {}
params **** {'url': 'https://url.com'}
params **** {}
<!DOCTYPE html><html lang="en"><head>
    <meta charset="utf-8">
    <meta name="description" content="">

    <meta name="twitter:card" content="summary">
    <meta name="twitter:title" content="">
    <meta name="twitter:description" content="">

    <meta property="og:type" content="website">
    <meta property="og:title" content="">
    <meta property="og:description" content="">

    <base href="/">
    <link id="faviconIco" rel="icon" type="image/png" href="assets/favicon.png">
    <link id="appleTouchIconIco" rel="apple-touch-icon" href="assests/appleTouchIcon.png">
    <link rel="manifest" id="manifest-placeholder">
    <meta name="theme-color" content="#000000">
    <meta name="viewport" id="viewport" content="width=device-width, initial-scale=1">
    <script>
      function isInternetExplorer() {
        if (/MSIE|Trident/.test(navigator.userAgent)) {
          {
            if (navigator.language === 'tr') location.href = '/assets/unsupported_tr.html';
            else location.href = '/assets/unsupported.html';
          }
        }
      }
      isInternetExplorer();
      const mobileAndTabletCheck = function () {
        let check = false;
        (function (a) {
          if (
            /(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino|android|ipad|playbook|silk/i.test(
              a
            ) ||
            /1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-/i.test(
              a.substr(0, 4)
            )
          )
            check = true;
        })(navigator.userAgent || navigator.vendor || window.opera);

        if (check) {
          var vp = document.getElementById('viewport');
          vp.setAttribute('content', 'width=device-width, initial-scale=1, maximum-scale=1');
        }

        return check;
      };

      mobileAndTabletCheck();
    </script>
  <style>@charset "UTF-8";html{box-sizing:border-box;-ms-overflow-style:scrollbar;}*,:after,:before{box-sizing:inherit;}:root{--swiper-theme-color:#007aff;}:root{--swiper-navigation-size:44px;}body{--background:#1a1a1a;--background-variant:#1a1a1a;--surface:#0f0f0f;--surface-4:#0f0f0f40;--primary:#fe9900;--primary-variant:#fe9900;--secondary:#1a1a1a;--secondary-variant:#1a1a1a;--error:#b00020;--on-background:#fff;--on-background-variant:#fff;--on-surface:#fff;--on-primary:#fff;--on-primary-variant:#fff;--on-secondary:#fff;--on-secondary-variant:#fff;--on-error:#fff;--modal-backdrop-opacity:0.32;--modal-backdrop-blur:3px;}body{margin:0 auto;min-height:100vh;background:var(--background);-webkit-overflow-scrolling:auto;}body :focus{outline:none;}</style><link rel="stylesheet" href="styles.d715a958203282df90b1.css" media="all" onload="this.media='all'"><noscript><link rel="stylesheet" href="styles.d715a958203282df90b1.css"></noscript></head>

  <body>
    <os-root></os-root>
    <noscript>Please enable JavaScript to continue using this application.</noscript>
  <script src="runtime-es2015.c7553bac13e77f812e59.js" type="module"></script><script src="runtime-es5.c7553bac13e77f812e59.js" nomodule="" defer=""></script><script src="polyfills-es5.40338712dbfe8c5fd56a.js" nomodule="" defer=""></script><script src="polyfills-es2015.77ed2742568a17467b11.js" type="module"></script><script src="main-es2015.0db1f02c749d435c5f2e.js" type="module"></script><script src="main-es5.0db1f02c749d435c5f2e.js" nomodule="" defer=""></script>

</body></html>
params **** {}
params **** {'proxy': {'server': '*', 'username': '*', 'password': '*'}}
params **** {'ignoreHTTPSErrors': True}
params **** {}
params **** {'url': 'https://url.com'}
params **** {}
<!DOCTYPE html><html lang="en"><head>
    <meta charset="utf-8">
    <meta name="description" content="">

    <meta name="twitter:card" content="summary">
    <meta name="twitter:title" content="">
    <meta name="twitter:description" content="">

    <meta property="og:type" content="website">
    <meta property="og:title" content="">
    <meta property="og:description" content="">

    <base href="/">
    <link id="faviconIco" rel="icon" type="image/png" href="assets/favicon.png">
    <link id="appleTouchIconIco" rel="apple-touch-icon" href="assests/appleTouchIcon.png">
    <link rel="manifest" id="manifest-placeholder">
    <meta name="theme-color" content="#000000">
    <meta name="viewport" id="viewport" content="width=device-width, initial-scale=1">
    <script>
      function isInternetExplorer() {
        if (/MSIE|Trident/.test(navigator.userAgent)) {
          {
            if (navigator.language === 'tr') location.href = '/assets/unsupported_tr.html';
            else location.href = '/assets/unsupported.html';
          }
        }
      }
      isInternetExplorer();
      const mobileAndTabletCheck = function () {
        let check = false;
        (function (a) {
          if (
            /(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino|android|ipad|playbook|silk/i.test(
              a
            ) ||
            /1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-/i.test(
              a.substr(0, 4)
            )
          )
            check = true;
        })(navigator.userAgent || navigator.vendor || window.opera);

        if (check) {
          var vp = document.getElementById('viewport');
          vp.setAttribute('content', 'width=device-width, initial-scale=1, maximum-scale=1');
        }

        return check;
      };

      mobileAndTabletCheck();
    </script>
  <style>@charset "UTF-8";html{box-sizing:border-box;-ms-overflow-style:scrollbar;}*,:after,:before{box-sizing:inherit;}:root{--swiper-theme-color:#007aff;}:root{--swiper-navigation-size:44px;}body{--background:#1a1a1a;--background-variant:#1a1a1a;--surface:#0f0f0f;--surface-4:#0f0f0f40;--primary:#fe9900;--primary-variant:#fe9900;--secondary:#1a1a1a;--secondary-variant:#1a1a1a;--error:#b00020;--on-background:#fff;--on-background-variant:#fff;--on-surface:#fff;--on-primary:#fff;--on-primary-variant:#fff;--on-secondary:#fff;--on-secondary-variant:#fff;--on-error:#fff;--modal-backdrop-opacity:0.32;--modal-backdrop-blur:3px;}body{margin:0 auto;min-height:100vh;background:var(--background);-webkit-overflow-scrolling:auto;}body :focus{outline:none;}</style><link rel="stylesheet" href="styles.d715a958203282df90b1.css" media="all" onload="this.media='all'"><noscript><link rel="stylesheet" href="styles.d715a958203282df90b1.css"></noscript></head>

  <body>
    <os-root></os-root>
    <noscript>Please enable JavaScript to continue using this application.</noscript>
  <script src="runtime-es2015.c7553bac13e77f812e59.js" type="module"></script><script src="runtime-es5.c7553bac13e77f812e59.js" nomodule="" defer=""></script><script src="polyfills-es5.40338712dbfe8c5fd56a.js" nomodule="" defer=""></script><script src="polyfills-es2015.77ed2742568a17467b11.js" type="module"></script><script src="main-es2015.0db1f02c749d435c5f2e.js" type="module"></script><script src="main-es5.0db1f02c749d435c5f2e.js" nomodule="" defer=""></script>

</body></html>
params **** {}
elacuesta commented 2 years ago

Ok, that seems like it's not a problem with upstream playwright. You should try to reproduce the issue with the minimum number of components involved (middlewares, pipelines, etc). How does it work with a minimal spider, like the one in the proxy support section from the readme?