shaikhsajid1111 / twitter-scraper-selenium

Python package to scrape Twitter's front-end easily
https://pypi.org/project/twitter-scraper-selenium
MIT License
305 stars 47 forks source link

scrap topic #30

Closed rachmadaniHaryono closed 1 year ago

rachmadaniHaryono commented 1 year ago

resolve #29

Before merging the PR, please label it as hacktoberfest-accepted.

https://hacktoberfest.com/participation/#spam

selenium error

```python --------------------------------------------------------------------------- gaierror Traceback (most recent call last) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connection.py:174, in HTTPConnection._new_conn(self) 173 try: --> 174 conn = connection.create_connection( 175 (self._dns_host, self.port), self.timeout, **extra_kw 176 ) 178 except SocketTimeout: File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/util/connection.py:72, in create_connection(address, timeout, source_address, socket_options) 68 return six.raise_from( 69 LocationParseError(u"'%s', label empty or too long" % host), None 70 ) ---> 72 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM): 73 af, socktype, proto, canonname, sa = res File ~/.pyenv/versions/3.10.0/lib/python3.10/socket.py:955, in getaddrinfo(host, port, family, type, proto, flags) 954 addrlist = [] --> 955 for res in _socket.getaddrinfo(host, port, family, type, proto, flags): 956 af, socktype, proto, canonname, sa = res gaierror: [Errno -3] Temporary failure in name resolution During handling of the above exception, another exception occurred: NewConnectionError Traceback (most recent call last) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 702 # Make the request on the httplib connection object. --> 703 httplib_response = self._make_request( 704 conn, 705 method, 706 url, 707 timeout=timeout_obj, 708 body=body, 709 headers=headers, 710 chunked=chunked, 711 ) 713 # If we're going to release the connection in ``finally:``, then 714 # the response doesn't need to know about the connection. 
Otherwise 715 # it will also try to release it and we'll have a double-release 716 # mess. File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connectionpool.py:386, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 385 try: --> 386 self._validate_conn(conn) 387 except (SocketTimeout, BaseSSLError) as e: 388 # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout. File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connectionpool.py:1042, in HTTPSConnectionPool._validate_conn(self, conn) 1041 if not getattr(conn, "sock", None): # AppEngine might not have `.sock` -> 1042 conn.connect() 1044 if not conn.is_verified: File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connection.py:358, in HTTPSConnection.connect(self) 356 def connect(self): 357 # Add certificate verification --> 358 self.sock = conn = self._new_conn() 359 hostname = self.host File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connection.py:186, in HTTPConnection._new_conn(self) 185 except SocketError as e: --> 186 raise NewConnectionError( 187 self, "Failed to establish a new connection: %s" % e 188 ) 190 return conn NewConnectionError: : Failed to establish a new connection: [Errno -3] Temporary failure in name resolution During handling of the above exception, another exception occurred: MaxRetryError Traceback (most recent call last) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/requests/adapters.py:489, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 488 if not chunked: --> 489 resp = conn.urlopen( 490 method=request.method, 491 url=url, 492 body=request.body, 493 headers=request.headers, 494 redirect=False, 495 assert_same_host=False, 496 preload_content=False, 497 decode_content=False, 
498 retries=self.max_retries, 499 timeout=timeout, 500 ) 502 # Send the request. 503 else: File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/connectionpool.py:787, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 785 e = ProtocolError("Connection aborted.", e) --> 787 retries = retries.increment( 788 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2] 789 ) 790 retries.sleep() File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/urllib3/util/retry.py:592, in Retry.increment(self, method, url, response, error, _pool, _stacktrace) 591 if new_retry.is_exhausted(): --> 592 raise MaxRetryError(_pool, url, error or ResponseError(cause)) 594 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry) MaxRetryError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/mozilla/geckodriver/releases/latest (Caused by NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) During handling of the above exception, another exception occurred: ConnectionError Traceback (most recent call last) File /mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/keyword.py:112, in Keyword.scrap(self) 111 try: --> 112 self.__start_driver() 113 self.__driver.get(self.URL) File /mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/keyword.py:35, in Keyword.__start_driver(self) 33 """changes the class member __driver value to driver on call""" 34 self.__driver = Initializer( ---> 35 self.browser, self.headless, self.proxy).init() File /mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/driver_initialization.py:82, in Initializer.init(self) 
81 """returns driver instance""" ---> 82 driver = self.set_driver_for_browser(self.browser_name) 83 return driver File /mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/driver_initialization.py:75, in Initializer.set_driver_for_browser(self, browser_name) 74 # automatically installs geckodriver and initialize it and returns the instance ---> 75 return webdriver.Firefox(service=FirefoxService(executable_path=GeckoDriverManager().install()), options=self.set_properties(browser_option)) 76 else: 77 # if browser_name is not chrome neither firefox than raise an exception File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/webdriver_manager/firefox.py:28, in GeckoDriverManager.install(self) 27 def install(self): ---> 28 return self._get_driver_path(self.driver) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/webdriver_manager/manager.py:22, in DriverManager._get_driver_path(self, driver) 21 os_type = driver.get_os_type() ---> 22 driver_version = driver.get_version() 24 binary_path = self.driver_cache.find_driver(browser_version, driver_name, os_type, 25 driver_version) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/webdriver_manager/driver.py:40, in Driver.get_version(self) 39 if driver_version == "latest": ---> 40 return self.get_latest_release_version() 41 return self._version File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/webdriver_manager/driver.py:87, in GeckoDriver.get_latest_release_version(self) 85 def get_latest_release_version(self): 86 # type: () -> str ---> 87 resp = requests.get(url=self.latest_release_url, 88 headers=self.auth_header) 89 validate_response(resp) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/requests/api.py:73, in get(url, params, **kwargs) 63 r"""Sends a GET request. 
64 65 :param url: URL for the new :class:`Request` object. (...) 70 :rtype: requests.Response 71 """ ---> 73 return request("get", url, params=params, **kwargs) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/requests/api.py:59, in request(method, url, **kwargs) 58 with sessions.Session() as session: ---> 59 return session.request(method=method, url=url, **kwargs) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/requests/sessions.py:587, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 586 send_kwargs.update(settings) --> 587 resp = self.send(prep, **send_kwargs) 589 return resp File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/requests/sessions.py:701, in Session.send(self, request, **kwargs) 700 # Send the request --> 701 r = adapter.send(request, **kwargs) 703 # Total elapsed time of the request (approximately) File ~/.pyenv/versions/3.10.0/envs/twitter-scraper-selenium/lib/python3.10/site-packages/requests/adapters.py:565, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 563 raise SSLError(e, request=request) --> 565 raise ConnectionError(e, request=request) 567 except ClosedPoolError as e: ConnectionError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/mozilla/geckodriver/releases/latest (Caused by NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) During handling of the above exception, another exception occurred: AttributeError Traceback (most recent call last) Cell In [9], line 1 ----> 1 import twitter_scraper_selenium; twitter_scraper_selenium.scrap_topic('steamdeck2', 'https://twitter.com/i/topics/1415728297065861123') File 
/mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/topic.py:37, in scrap_topic(filename, url, browser, proxy, tweets_count, output_format, directory, headless) 33 directory = pathlib.Path.cwd() 34 keyword_bot = Keyword( 35 keyword=filename, browser=browser, url=url, headless=headless, proxy=proxy, tweets_count=tweets_count 36 ) ---> 37 data = keyword_bot.scrap() 38 if output_format == 'json': 39 output_path = directory / '{}.json'.format(filename) File /mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/keyword.py:124, in Keyword.scrap(self) 121 return json.dumps(data) 123 except Exception as ex: --> 124 self.__close_driver() 125 print(ex) File /mnt/ac54dceb-73a5-4f94-b52c-cb7a426c0f29/Documents/twitter-scraper-selenium/twitter_scraper_selenium/keyword.py:38, in Keyword.__close_driver(self) 37 def __close_driver(self): ---> 38 self.__driver.close() 39 self.__driver.quit() AttributeError: 'Keyword' object has no attribute '_Keyword__driver' ```