Closed calvinomiguel closed 8 months ago
I have installed Chrome 118.0.5993.70 and installed the matching chromedriver too. I got both Chrome and the chromedriver from https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json.
However, I can see that it keeps looping, and after each loop the CLI logs this:
```
ERROR:li:scraper:('[][Switzerland][1]', JavascriptException(), 'Traceback (most recent call last):\n File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/linkedin_jobs_scraper/strategies/anonymous_strategy.py", line 267, in run\n job_id, job_link, job_title, job_company, job_place, job_date = driver.execute_script(\n File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 407, in execute_script\n return self.execute(command, {"script": script, "args": converted_args})["value"]\n File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 347, in execute\n self.error_handler.check_response(response)\n File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response\n raise exception_class(message, screen, stacktrace)\nselenium.common.exce...

Traceback (most recent call last):
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/linkedin_jobs_scraper/strategies/anonymous_strategy.py", line 267, in run
    job_id, job_link, job_title, job_company, job_place, job_date = driver.execute_script(
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 407, in execute_script
    return self.execute(command, {"script": script, "args": converted_args})["value"]
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 347, in execute
    self.error_handler.check_response(response)
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.JavascriptException: Message: javascript error: Cannot read properties of undefined (reading 'querySelector')
  (Session info: headless chrome=119.0.6045.105)
Stacktrace:
0   chromedriver                 0x00000001075afd28 chromedriver + 4795688
1   chromedriver                 0x00000001075a72b3 chromedriver + 4760243
2   chromedriver                 0x000000010718088d chromedriver + 407693
3   chromedriver                 0x0000000107186d6f chromedriver + 433519
4   chromedriver                 0x00000001071894b6 chromedriver + 443574
5   chromedriver                 0x0000000107213543 chromedriver + 1008963
6   chromedriver                 0x00000001071f6ca2 chromedriver + 892066
7   chromedriver                 0x0000000107212b2b chromedriver + 1006379
8   chromedriver                 0x00000001071f6a73 chromedriver + 891507
9   chromedriver                 0x00000001071c1143 chromedriver + 672067
10  chromedriver                 0x00000001071c231e chromedriver + 676638
11  chromedriver                 0x0000000107570795 chromedriver + 4536213
12  chromedriver                 0x0000000107575853 chromedriver + 4556883
13  chromedriver                 0x0000000107556001 chromedriver + 4427777
14  chromedriver                 0x000000010757659d chromedriver + 4560285
15  chromedriver                 0x000000010754748c chromedriver + 4367500
16  chromedriver                 0x00000001075960e8 chromedriver + 4690152
17  chromedriver                 0x000000010759629e chromedriver + 4690590
18  chromedriver                 0x00000001075a6eee chromedriver + 4759278
19  libsystem_pthread.dylib      0x00007ff807e27259 _pthread_start + 125
20  libsystem_pthread.dylib      0x00007ff807e22c7b thread_start + 15
```
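Note that the session info in the traceback reports headless chrome=119.0.6045.105, not the installed 118.0.5993.70, so the scraper may be launching a different Chrome than the one downloaded from the Chrome for Testing page. A minimal check like the one below (a sketch; the commented-out paths are placeholders, not from the original post) prints which Chrome and chromedriver Selenium actually ends up using:

```python
# Sketch: report the browser and driver versions Selenium picks up.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--headless=true")
# options.binary_location = "/path/to/chrome-for-testing/chrome"  # placeholder path
# service = Service(executable_path="/path/to/chromedriver")      # placeholder path

driver = webdriver.Chrome(options=options)  # or webdriver.Chrome(service=service, options=options)
print("browserVersion:     ", driver.capabilities["browserVersion"])
print("chromedriverVersion:", driver.capabilities["chrome"]["chromedriverVersion"])
driver.quit()
```

If the two versions printed there disagree on the major version, pointing binary_location and the Service at the downloaded 118 binaries should remove the mismatch.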
This is what my Python file looks like:
```python
import logging
from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData, EventMetrics
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, \
    OnSiteOrRemoteFilters
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--remote-allow-origins=*")
options.add_argument("--headless=true")

header = ['Country', 'Company', 'Job title', 'URL', 'Company logo', 'Place', 'Work type', 'Description',
          'Publication date']
workType = 'Hybrid'
countryCode = 'ch'
countryName = 'Switzerland'

# List of blacklisted companies
blackList = ['telus', 'telus international', 'appen', 'telus international ai data solutions', 'OpenTalent',
             'Open Talent', 'Euro Labora']


# Method to check if company isn't blacklisted
def isnt_black_listed(company):
    lowercase = company.lower()
    if lowercase in blackList:
        return False
    else:
        return True


# Method to replace h1 to h2 tag
def replace_html_tag(html, tag, replacer):
    string = html.replace(tag, replacer)
    return string


# Method to remove attributes from HTML tags
def remove_attributes(html):
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.find_all(True):
        tag.attrs = {}
    return soup


# Method to remove buttons and a tags
def remove_a_and_button(html):
    soup = BeautifulSoup(html)
    for href in soup.find_all('a'):
        href.replace_with('')
    for button in soup.find_all('button'):
        button.replace_with('')
    return soup


# Method to get city
def get_city(html):
    city = html.split(",")[0]
    return city


# Method to remove country and work type from place string
def remove_country_and_work_type(country, place, worktype):
    string = place
    if country == 'Germany' or country == 'Deutschland':
        string = string.replace(', Germany', '')
        string = string.replace(', Deutschland', '')
        string = string.replace(worktype, '')
        return string
    if country == 'Switzerland' or country == 'Schweiz':
        string = string.replace(', Switzerland', '')
        string = string.replace(', Schweiz', '')
        string = string.replace(worktype, '')
        return string
    if country == 'Austria' or country == 'Österreich':
        string = string.replace(', Austria', '')
        string = string.replace(', Österreich', '')
        string = string.replace(worktype, '')
        return string


# Method to translate country name
def translate_country_name(location):
    if location == 'Schweiz':
        return 'Switzerland'
    if location == 'Deutschland':
        return 'Germany'
    if location == 'Österreich':
        return 'Austria'
    return location


# Method to clean HTML
def html_cleanse(html):
    string = remove_attributes(html)
    return string


# Change root logger level (default is WARN)
logging.basicConfig(level=logging.INFO)

# open the file in the write mode
file = open(f'jobs_{workType.lower()}_{countryCode}.csv', 'w', encoding='UTF8', newline='')
writer = csv.writer(file)

# write the header
writer.writerow(header)


# Fired once for each successfully processed job
def on_data(data: EventData):
    if isnt_black_listed(data.company) and data.apply_link:
        writer.writerow([translate_country_name(data.location), data.company, data.title, data.apply_link,
                         data.company_img_link, remove_country_and_work_type(data.location, data.place, workType),
                         workType, html_cleanse(data.description_html), data.date])
        print('[ON_DATA]', data.company, data.apply_link, data.company_link)


# Fired once for each page (25 jobs)
def on_metrics(metrics: EventMetrics):
    print('[ON_METRICS]', str(metrics))


def on_error(error):
    print('[ON_ERROR]', error)


def on_end():
    file.close()
    print('[ON_END]')


scraper = LinkedinScraper(
    chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_options=options,  # Custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=4,  # Slow down the scraper to avoid 'Too many requests 429' errors (in seconds)
    page_load_timeout=90,  # Page load timeout (in seconds)
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='',
        options=QueryOptions(
            locations=[countryName],
            apply_link=True,
            skip_promoted_jobs=True,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.WEEK,
                on_site_or_remote=[OnSiteOrRemoteFilters.HYBRID],  # supported only with authenticated session
            )
        )
    ),
]

scraper.run(queries)
```
And this is my folder/files setup:
Hi there, anonymous_strategy is not supported anymore.
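As far as I can tell from the project README (this is not stated in the reply above, so treat it as an assumption), the supported path now is the authenticated strategy, which is enabled by exposing your LinkedIn li_at session cookie through the LI_AT_COOKIE environment variable before the script runs. A minimal sketch, with the cookie value as a placeholder:

```python
# Sketch, assuming the library enables its authenticated strategy when the LinkedIn
# li_at session cookie is provided via the LI_AT_COOKIE environment variable
# (typically set in the shell: LI_AT_COOKIE=<your li_at cookie> python my_script.py).
import os

os.environ["LI_AT_COOKIE"] = "<your li_at cookie value>"  # placeholder, set before the scraper imports

from linkedin_jobs_scraper import LinkedinScraper  # the rest of the script stays unchanged
```

With an authenticated session, the on_site_or_remote filter used in the query above should also be supported, per the comment already in the script.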