spinlud / py-linkedin-jobs-scraper


Javascript exception #75

Closed calvinomiguel closed 8 months ago

calvinomiguel commented 8 months ago

I have installed Chrome 118.0.5993.70 and the matching chromedriver. I got both Chrome and the chromedriver from here: https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json

However, I can see it looping, yet after each loop the CLI logs this:

ERROR:li:scraper:('[][Switzerland][1]', JavascriptException(), 'Traceback (most recent call last):\n  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/linkedin_jobs_scraper/strategies/anonymous_strategy.py", line 267, in run\n    job_id, job_link, job_title, job_company, job_place, job_date = driver.execute_script(\n  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 407, in execute_script\n    return self.execute(command, {"script": script, "args": converted_args})["value"]\n  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 347, in execute\n    self.error_handler.check_response(response)\n  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response\n    raise exception_class(message, screen, stacktrace)\nselenium.common.exce...
Traceback (most recent call last):
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/linkedin_jobs_scraper/strategies/anonymous_strategy.py", line 267, in run
    job_id, job_link, job_title, job_company, job_place, job_date = driver.execute_script(
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 407, in execute_script
    return self.execute(command, {"script": script, "args": converted_args})["value"]
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 347, in execute
    self.error_handler.check_response(response)
  File "/Users/calvinomiguel/.conda/envs/pythonProject/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.JavascriptException: Message: javascript error: Cannot read properties of undefined (reading 'querySelector')
  (Session info: headless chrome=119.0.6045.105)
Stacktrace:
0   chromedriver                        0x00000001075afd28 chromedriver + 4795688
1   chromedriver                        0x00000001075a72b3 chromedriver + 4760243
2   chromedriver                        0x000000010718088d chromedriver + 407693
3   chromedriver                        0x0000000107186d6f chromedriver + 433519
4   chromedriver                        0x00000001071894b6 chromedriver + 443574
5   chromedriver                        0x0000000107213543 chromedriver + 1008963
6   chromedriver                        0x00000001071f6ca2 chromedriver + 892066
7   chromedriver                        0x0000000107212b2b chromedriver + 1006379
8   chromedriver                        0x00000001071f6a73 chromedriver + 891507
9   chromedriver                        0x00000001071c1143 chromedriver + 672067
10  chromedriver                        0x00000001071c231e chromedriver + 676638
11  chromedriver                        0x0000000107570795 chromedriver + 4536213
12  chromedriver                        0x0000000107575853 chromedriver + 4556883
13  chromedriver                        0x0000000107556001 chromedriver + 4427777
14  chromedriver                        0x000000010757659d chromedriver + 4560285
15  chromedriver                        0x000000010754748c chromedriver + 4367500
16  chromedriver                        0x00000001075960e8 chromedriver + 4690152
17  chromedriver                        0x000000010759629e chromedriver + 4690590
18  chromedriver                        0x00000001075a6eee chromedriver + 4759278
19  libsystem_pthread.dylib             0x00007ff807e27259 _pthread_start + 125
20  libsystem_pthread.dylib             0x00007ff807e22c7b thread_start + 15
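
One thing I notice: the session info in the stacktrace says headless chrome=119.0.6045.105, not the 118.0.5993.70 I downloaded, so Selenium is apparently launching a different Chrome binary. Here is a minimal sketch to check which browser and driver versions Selenium actually picks up (the binary path is a placeholder for wherever the Chrome for Testing build lives):

import os
from selenium import webdriver

options = webdriver.ChromeOptions()
# Placeholder path: point this at the downloaded Chrome for Testing binary;
# without it, Selenium falls back to the system-wide Chrome install
options.binary_location = "/path/to/chrome-for-testing/chrome"

driver = webdriver.Chrome(options=options)
# Selenium 4 exposes both versions via the session capabilities
print("browser:", driver.capabilities["browserVersion"])
print("driver: ", driver.capabilities["chrome"]["chromedriverVersion"])
driver.quit()

The two major versions should match; the 118 vs 119 mismatch in my log suggests the system Chrome is being used instead of the downloaded one.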

This is what my Python file looks like:

import logging
from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData, EventMetrics
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, \
    OnSiteOrRemoteFilters
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--remote-allow-origins=*")
options.add_argument("--headless=true")

header = ['Country', 'Company', 'Job title', 'URL', 'Company logo', 'Place', 'Work type',
          'Description', 'Publication date']

workType = 'Hybrid'
countryCode = 'ch'
countryName = 'Switzerland'

# List of blacklisted companies (matched case-insensitively)
blackList = ['telus', 'telus international', 'appen', 'telus international ai data solutions', 'OpenTalent',
             'Open Talent', 'Euro Labora']

# Method to check if a company isn't blacklisted (lowercase both sides,
# otherwise mixed-case entries like 'OpenTalent' could never match)
def isnt_black_listed(company):
    return company.lower() not in (entry.lower() for entry in blackList)

# Method to replace one HTML tag with another (e.g. h1 with h2)
def replace_html_tag(html, tag, replacer):
    string = html.replace(tag, replacer)
    return string

# Method to remove attributes from HTML tags
def remove_attributes(html):
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.find_all(True):
        tag.attrs = {}
    return soup

# Method to remove buttons and a tags
def remove_a_and_button(html):
    soup = BeautifulSoup(html, 'lxml')  # specify the parser explicitly, as in remove_attributes
    for href in soup.find_all('a'):
        href.replace_with('')

    for button in soup.find_all('button'):
        button.replace_with('')

    return soup

# Method to get city
def get_city(html):
    city = html.split(",")[0]
    return city

# Method to remove country and work type from place string
def remove_country_and_work_type(country, place, worktype):
    string = place
    if country == 'Germany' or country == 'Deutschland':
        string = string.replace(', Germany', '')
        string = string.replace(', Deutschland', '')
        string = string.replace(worktype, '')
        return string

    if country == 'Switzerland' or country == 'Schweiz':
        string = string.replace(', Switzerland', '')
        string = string.replace(', Schweiz', '')
        string = string.replace(worktype, '')
        return string

    if country == 'Austria' or country == 'Österreich':
        string = string.replace(', Austria', '')
        string = string.replace(', Österreich', '')
        string = string.replace(worktype, '')
        return string

    # Fall back to the unmodified place string for any other country
    # (instead of implicitly returning None)
    return string

# Method to translate country name
def translate_country_name(location):
    if location == 'Schweiz':
        return 'Switzerland'

    if location == 'Deutschland':
        return 'Germany'

    if location == 'Österreich':
        return 'Austria'

    return location

# Method to clean HTML
def html_cleanse(html):
    string = remove_attributes(html)
    return string

# Change root logger level (default is WARN)
logging.basicConfig(level=logging.INFO)

# Open the output file in write mode
file = open(f'jobs_{workType.lower()}_{countryCode}.csv', 'w', encoding='UTF8', newline='')
writer = csv.writer(file)

# write the header
writer.writerow(header)

# Fired once for each successfully processed job
def on_data(data: EventData):
    if isnt_black_listed(data.company) and data.apply_link:
        writer.writerow([translate_country_name(data.location), data.company, data.title, data.apply_link,
                         data.company_img_link, remove_country_and_work_type(data.location, data.place, workType),
                         workType, html_cleanse(data.description_html), data.date])
        print('[ON_DATA]', data.company, data.apply_link, data.company_link)

# Fired once for each page (25 jobs)
def on_metrics(metrics: EventMetrics):
    print('[ON_METRICS]', str(metrics))

def on_error(error):
    print('[ON_ERROR]', error)

def on_end():
    file.close()
    print('[ON_END]')

scraper = LinkedinScraper(
    chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_options=options,  # Custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=4,  # Slow down the scraper to avoid 'Too many requests 429' errors (in seconds)
    page_load_timeout=90,  # Page load timeout (in seconds)
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.METRICS, on_metrics)  # on_metrics was defined above but never registered
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='',
        options=QueryOptions(
            locations=[countryName],
            apply_link=True,
            skip_promoted_jobs=True,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.WEEK,
                on_site_or_remote=[OnSiteOrRemoteFilters.HYBRID],  # supported only with authenticated session
            )
        )
    ),
]
scraper.run(queries)

And this is my folder/file setup:

[screenshot: project folder structure]
spinlud commented 8 months ago

Hi there, anonymous_strategy is not supported anymore.
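
You need an authenticated session instead: the scraper switches to the authenticated strategy when the LI_AT_COOKIE environment variable holds the value of your LinkedIn li_at session cookie. A minimal sketch (the cookie value is a placeholder):

import os

# Placeholder value: copy the li_at cookie from a logged-in
# www.linkedin.com browser session. Set it before the scraper is
# created (same effect as exporting it in the shell).
os.environ['LI_AT_COOKIE'] = '<your li_at cookie>'

With the cookie set, filters like OnSiteOrRemoteFilters.HYBRID (which, as the comment in your query already notes, is supported only with an authenticated session) will also work.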