probberechts / soccerdata

⛏⚽ Scrape soccer data from Club Elo, ESPN, FBref, FiveThirtyEight, Football-Data.co.uk, FotMob, Sofascore, SoFIFA, Understat and WhoScored.
https://soccerdata.readthedocs.io/en/latest/
Other
544 stars 95 forks source link

[General] Selenium fails with SOCKS proxy (for tor) with `WebDriverException: Message: unknown error: net::ERR_PROXY_CONNECTION_FAILED` #23

Closed tonyelhabr closed 2 years ago

tonyelhabr commented 2 years ago
import soccerdata as sd
import sys
print(sd.__version__)
print(sys.version)
0.0.2
3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]

I tried to set use_tor=True for downloading events for a match with tor running in the background, but read_events ended with an error indicating that the proxy connection failed.

ws = sd.WhoScored(leagues="ENG-Premier League", seasons="20-21", use_tor=True)
events = ws.read_events(match_id=1485185)
[03/19/22 09:54:01] INFO     Saving cached data to                              [_common.py](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/_common.py):[59](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/_common.py#59)
                             C:\Users\antho\soccerdata\data\WhoScored                        
[03/19/22 09:54:04] INFO     Retrieving game schedule of ENG-Premier League  [whoscored.py](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/whoscored.py):[314](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/whoscored.py#314)
                             - 2021 from the cache                                           
                    INFO     [2/1] Retrieving game with id=1485185           [whoscored.py](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/whoscored.py):[499](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/whoscored.py#499)
                    INFO     Scraping                                        [whoscored.py](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/whoscored.py):[577](file:///C:/Users/antho/anaconda3/envs/soccerdata/lib/site-packages/soccerdata/whoscored.py#577)
                             https://www.whoscored.com/Matches/1485185/Live                  
---------------------------------------------------------------------------
WebDriverException                        Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_27592\4024154899.py in <module>
      1 ws = sd.WhoScored(leagues="ENG-Premier League", seasons="20-21", use_tor=True, path_to_browser="c:/users/antho/downloads/chromedriver.exe")
----> 2 events = ws.read_events(match_id=1485185)

~\anaconda3\envs\soccerdata\lib\site-packages\soccerdata\whoscored.py in read_events(self, match_id, force_cache, live)
    507                 filepath,
    508                 var="requirejs.s.contexts._.config.config.params.args.matchCentreData",
--> 509                 no_cache=live,
    510             )
    511             json_data = json.load(reader)

~\anaconda3\envs\soccerdata\lib\site-packages\soccerdata\whoscored.py in _download_and_save(self, url, filepath, max_age, no_cache, var)
    576         if cache_invalid or filepath is None or not filepath.exists():
    577             logger.info("Scraping %s", url)
--> 578             self.driver.get(url)
    579             time.sleep(5 + random.random() * 5)
    580             if "Incapsula incident ID" in self.driver.page_source:

~\anaconda3\envs\soccerdata\lib\site-packages\undetected_chromedriver\__init__.py in get_wrapped(*args, **kwargs)
    495                     },
    496                 )
--> 497             return orig_get(*args, **kwargs)
    498 
    499         self.get = get_wrapped

~\anaconda3\envs\soccerdata\lib\site-packages\undetected_chromedriver\__init__.py in get(self, url)
    533         if self._get_cdc_props():
    534             self._hook_remove_cdc_props()
--> 535         return super().get(url)
    536 
    537     def add_cdp_listener(self, event_name, callback):

~\anaconda3\envs\soccerdata\lib\site-packages\selenium\webdriver\remote\webdriver.py in get(self, url)
    435         Loads a web page in the current browser session.
    436         """
--> 437         self.execute(Command.GET, {'url': url})
    438 
    439     @property

~\anaconda3\envs\soccerdata\lib\site-packages\selenium\webdriver\remote\webdriver.py in execute(self, driver_command, params)
    423         response = self.command_executor.execute(driver_command, params)
    424         if response:
--> 425             self.error_handler.check_response(response)
    426             response['value'] = self._unwrap_value(
    427                 response.get('value', None))

~\anaconda3\envs\soccerdata\lib\site-packages\selenium\webdriver\remote\errorhandler.py in check_response(self, response)
    245                 alert_text = value['alert'].get('text')
    246             raise exception_class(message, screen, stacktrace, alert_text)  # type: ignore[call-arg]  # mypy is not smart enough here
--> 247         raise exception_class(message, screen, stacktrace)
    248 
    249     def _value_or_default(self, obj: Mapping[_KT, _VT], key: _KT, default: _VT) -> _VT:

WebDriverException: Message: unknown error: net::ERR_PROXY_CONNECTION_FAILED
  (Session info: headless chrome=99.0.4844.74)
Stacktrace:
Backtrace:
    Ordinal0 [0x00509943+2595139]
    Ordinal0 [0x0049C9F1+2148849]
    Ordinal0 [0x00394528+1066280]
    Ordinal0 [0x00390DB4+1052084]
    Ordinal0 [0x003863BD+1008573]
    Ordinal0 [0x00386F7C+1011580]
    Ordinal0 [0x003865CA+1009098]
    Ordinal0 [0x00385BC6+1006534]
    Ordinal0 [0x00384AD0+1002192]
    Ordinal0 [0x00384FAD+1003437]
    Ordinal0 [0x00395C4A+1072202]
    Ordinal0 [0x003EC19D+1425821]
    Ordinal0 [0x003DB9EC+1358316]
    Ordinal0 [0x003EBAF2+1424114]
    Ordinal0 [0x003DB806+1357830]
    Ordinal0 [0x003B6086+1204358]
    Ordinal0 [0x003B6F96+1208214]
    GetHandleVerifier [0x006AB232+1658114]
    GetHandleVerifier [0x0076312C+2411516]
    GetHandleVerifier [0x0059F261+560433]
    GetHandleVerifier [0x0059E366+556598]
    Ordinal0 [0x004A286B+2173035]
    Ordinal0 [0x004A75F8+2192888]
    Ordinal0 [0x004A76E5+2193125]
    Ordinal0 [0x004B11FC+2232828]
    BaseThreadInitThunk [0x76106739+25]
    RtlGetFullPathName_UEx [0x76FF8E7F+1215]
    RtlGetFullPathName_UEx [0x76FF8E4D+1165]

Here's what my terminal looks like with tor running (prior to calling read_events()):

tony@desktop:/c/Users/antho/soccerdata$ tor
Mar 19 09:53:33.865 [notice] Tor 0.4.2.7 running on Linux with Libevent 2.1.11-stable, OpenSSL 1.1.1f, Zlib 1.2.11, Liblzma 5.2.4, and Libzstd 1.4.4.
Mar 19 09:53:33.865 [notice] Tor can't help you if you use it wrong! Learn how to be safe at https://www.torproject.org/download/download#warning
Mar 19 09:53:33.865 [notice] Read configuration file "/etc/tor/torrc".
Mar 19 09:53:33.866 [notice] Opening Socks listener on 127.0.0.1:9050
Mar 19 09:53:33.866 [notice] Opened Socks listener on 127.0.0.1:9050
Mar 19 09:53:33.000 [notice] Parsing GEOIP IPv4 file /usr/share/tor/geoip.
Mar 19 09:53:33.000 [notice] Parsing GEOIP IPv6 file /usr/share/tor/geoip6.
Mar 19 09:53:34.000 [notice] Bootstrapped 0% (starting): Starting
Mar 19 09:53:34.000 [notice] Starting with guard context "default"
Mar 19 09:53:35.000 [notice] Bootstrapped 5% (conn): Connecting to a relay
Mar 19 09:53:35.000 [notice] Bootstrapped 10% (conn_done): Connected to a relay
Mar 19 09:53:35.000 [notice] Bootstrapped 14% (handshake): Handshaking with a relay
Mar 19 09:53:35.000 [notice] Bootstrapped 15% (handshake_done): Handshake with a relay done
Mar 19 09:53:35.000 [notice] Bootstrapped 75% (enough_dirinfo): Loaded enough directory info to build circuits        
Mar 19 09:53:35.000 [notice] Bootstrapped 90% (ap_handshake_done): Handshake finished with a relay to build circuits  
Mar 19 09:53:35.000 [notice] Bootstrapped 95% (circuit_create): Establishing a Tor circuit
Mar 19 09:53:36.000 [notice] Bootstrapped 100% (done): Done

I've opened my browser to the port to verify that something is running, although this is using an HTTP proxy, so the warning here is expected.

image

probberechts commented 2 years ago

I can't reproduce this and I've got no clue what could be the problem here.

Could you check the following:

  1. Does it work with curl?

    $ curl -x socks5h://localhost:9050 https://check.torproject.org/api/ip
    {"IsTor":true,"IP":"..."}
  2. Does it work with google chrome?

Launch Chrome with

$ google-chrome --user-data-dir="/tmp" --proxy-server="socks5://127.0.0.1:9050" --host-resolver-rules="MAP * 0.0.0.0 , EXCLUDE myproxy"

and check whether Tor works by browsing to https://check.torproject.org/

tonyelhabr commented 2 years ago
  1. It does seem to work with curl
tony@desktop:/c/Users/antho$ curl -x socks5h://localhost:9050 https://check.torproject.org/api/ip
{"IsTor":true,"IP":"..."}
  2. I think it works with Chrome? I only put a question mark since I think calling the chrome executable is the same as your google-chrome command.
C:\Program Files\Google\Chrome\Application>chrome.exe --user-data-dir="c:/users/antho/downloads" --proxy-server="socks5://127.0.0.1:9050" --host-resolver-rules="MAP * 0.0.0.0 , EXCLUDE 127.0.0.1"

image

I've tried setting path_to_browser to my chromedriver and the normal chrome executable. I've also tried not setting it. All result in the same error 🤷

ws = sd.WhoScored(leagues="ENG-Premier League", seasons=2021, use_tor=True, path_to_browser="c:\\users\\antho\\downloads\\chromedriver.exe")
ws = sd.WhoScored(leagues="ENG-Premier League", seasons=2021, use_tor=True, path_to_browser="C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe")
ws = sd.WhoScored(leagues="ENG-Premier League", seasons=2021, use_tor=True)

I'm not super familiar with python debugging. Is there a good way for me to stop the execution somewhere in the selenium call for self.execute(Command.GET, {'url': url})? This seems to be where the error handling dispatches.

probberechts commented 2 years ago

Ok. Tor clearly functions properly. That means it has to be an issue with selenium / undetected_chromedriver.

I use undetected_chromedriver, which is a patched version of the original chromedriver to avoid detection by bot mitigation systems. I would first make sure it is not this patched version that causes your problem by running the code below. You'll first have to download the appropriate chromedriver version for your system from https://chromedriver.chromium.org/downloads.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
proxy = "socks5://127.0.0.1:9050"
resolver_rules = "MAP * 0.0.0.0 , EXCLUDE myproxy"
chrome_options.add_argument("--headless")  # maybe try without this line too
chrome_options.add_argument("--proxy-server=" + proxy)
chrome_options.add_argument("--host-resolver-rules=" + resolver_rules)
driver = webdriver.Chrome('<path to...>/chromedriver', options=chrome_options)
driver.get("https://check.torproject.org/api/ip")
driver.page_source

If this does not work, you could try with some additional arguments (Google for "windows selenium tor proxy") or create an issue in the selenium repo.

If it works and the code below does not (which is what I expect, since the snippet below is copied from soccerdata's source code), it is an issue with undetected-chromedriver and you should create an issue here.

import undetected_chromedriver as uc
proxy = "socks5://127.0.0.1:9050"
resolver_rules = "MAP * 0.0.0.0 , EXCLUDE myproxy"
chrome_options.add_argument("--headless")  # maybe try without this line too
chrome_options.add_argument("--proxy-server=" + proxy)
chrome_options.add_argument("--host-resolver-rules=" + resolver_rules)
driver = uc.Chrome(options=chrome_options)
driver.get("https://check.torproject.org/api/ip")
driver.page_source

tonyelhabr commented 2 years ago

The major thing I had to change in your snippets was to replace myproxy with the actual address of the proxy, 127.0.0.1. Is that supposed to be an environment variable?

The first worked for me on my second try. My first try was blocked, so I see why you might prefer undetected_chromedriver.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
proxy = "socks5://127.0.0.1:9050"
resolver_rules = "MAP * 0.0.0.0 , EXCLUDE 127.0.0.1"
chrome_options.add_argument("--proxy-server=" + proxy)
chrome_options.add_argument("--host-resolver-rules=" + resolver_rules)
driver = webdriver.Chrome('c:\\users\\antho\\downloads\\chromedriver.exe', options=chrome_options)
driver.get("https://check.torproject.org/api/ip")
driver.page_source
'<html><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">{"IsTor":true,"IP":"..."}</pre></body></html>'
import json
driver.get('https://www.whoscored.com/Matches/1485477/Live/England-Premier-League-2020-2021-Crystal-Palace-Manchester-City')
element = driver.find_element(by='xpath',value='//*[@id="layout-wrapper"]/script[1]')
script_content = element.get_attribute('innerHTML')
script_ls = script_content.split(sep="  ")
script_ls = list(filter(None, script_ls))
script_ls = [name for name in script_ls if name.strip()]
dictstring = script_ls[2][17:-2]
matchdict = json.loads(dictstring)
matchdict['score']
'0 : 2'

The second snippet with undetected_chromedriver worked, after the replacement of myproxy.

import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
proxy = "socks5://127.0.0.1:9050"
resolver_rules = "MAP * 0.0.0.0 , EXCLUDE 127.0.0.1"
chrome_options.add_argument("--proxy-server=" + proxy)
chrome_options.add_argument("--host-resolver-rules=" + resolver_rules)
driver = uc.Chrome(options=chrome_options)
driver.get("https://check.torproject.org/api/ip") ## worked
driver.get('https://www.whoscored.com/Matches/1485477/Live/England-Premier-League-2020-2021-Crystal-Palace-Manchester-City')
element = driver.find_element(by='xpath',value='//*[@id="layout-wrapper"]/script[1]')

tonyelhabr commented 2 years ago

This gist seems to indicate that the actual address of the proxy needs to be specified in resolver_rules.

probberechts commented 2 years ago

Oh yes, that makes sense! I copy-pasted the resolver rules and forgot to change myproxy to 127.0.0.1. Actually, it is odd that it works on my system.

Thanks for debugging this! I'll push a fix in a couple of minutes.

tonyelhabr commented 2 years ago

happy to help!