Closed leonpawelzik closed 1 year ago
Could you share the code?
Yes, sorry! And thank you for your time and work — this code helps a lot during my job search! I run it with: LI_AT_COOKIE=[...] python maincopy.py
` import os import logging import numpy as np import pandas as pd
from linkedin_jobs_scraper import LinkedinScraper from linkedin_jobs_scraper.events import Events, EventData, EventMetrics from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, RemoteFilters
logging.basicConfig(level = logging.INFO)
def on_data(data: EventData): print('[ON_DATA]', data.title, data.company, data.company_link, data.date, data.link, data.insights, len(data.description))
def on_metrics(metrics: EventMetrics): print('[ON_METRICS]', str(metrics))
def on_error(error): print('[ON_ERROR]', error)
def on_end(): print('[ON_END]')
def on_data(data: EventData): job_postings=[] job_postings.append([data.job_id, data.location, data.title, data.company, data.date, data.link, data.description])
df = pd.DataFrame(job_postings, columns=['Job_ID','Location','Title', 'Company','Date', 'Link', 'Description'])
df.to_csv('14082022PM-1.csv', sep='\t', mode='a', header=False)
scraper = LinkedinScraper( chrome_executable_path=None, # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver) chrome_options=None, # Custom Chrome options here headless=None, # Ogooglverrides headless mode only if chrome_options is None max_workers=1, # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread) slow_mo=4, # Slow down th5e scraper to avoid 'Too many requests 429' errors (in seconds) page_load_timeout=20, # Page load timeout (in seconds) )
scraper.on(Events.DATA, on_data) scraper.on(Events.ERROR, on_error) scraper.on(Events.END, on_end)
queries = [
Query(
query='Product Manager',
options=QueryOptions(
locations=['Germany'],
apply_link = False, # Try to uextract apply link (easy applies are skipped). Default to False.
limit=1000,
filters=QueryFilters(
#company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000', # Filter by companies.
#relevance=RelevanceFilters.RECENT,
time=TimeFilters.DAY,
type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
)
)
),
]
scraper.run(queries)
`
Seems to work fine here
import logging
from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData, EventMetrics
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters

# Show INFO-level output, including the scraper's own 'li:scraper' logger.
logging.basicConfig(level=logging.INFO)
logging.getLogger('li:scraper').setLevel(logging.INFO)

# One search: Product Manager roles in Germany, posted within the last day,
# limited to full-time and internship positions, up to 1000 results.
queries = [
    Query(
        query='Product Manager',
        options=QueryOptions(
            locations=['Germany'],
            apply_link=False,
            limit=1000,
            filters=QueryFilters(
                time=TimeFilters.DAY,
                type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
            ),
        ),
    ),
]

# Single-worker headless scraper with a half-second delay between requests
# and a 40 s page-load timeout.
scraper = LinkedinScraper(
    headless=True,
    max_workers=1,
    slow_mo=0.5,
    page_load_timeout=40,
)

scraper.run(queries)
3.7.13
104.0.5112.79
For some reason my selenium version was off. Now everything works fine. Thanks a lot!
Everything is working fine (also finally got the authenticated session to work), since I updated to 15.3. For some reason, the scraper is not able to go to the second page. Any idea why?
INFO:li:scraper:('[Product Manager][Germany][1]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][2]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][3]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][4]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][5]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][6]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][7]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][8]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][9]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][10]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][11]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][12]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][13]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][14]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][15]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][16]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][17]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][18]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][19]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][20]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][21]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][22]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][23]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][24]', 'Processed') INFO:li:scraper:('[Product Manager][Germany][25]', 'Processed') INFO:li:scraper:('[Product Manager][Germany]', 'No more jobs to process in this page') INFO:li:scraper:('[Product Manager][Germany]', 'Metrics:', '{ processed: 25, failed: 0, missed: 0 }') INFO:li:scraper:('[Product Manager][Germany]', 'Pagination requested [1]') INFO:li:scraper:('[Product Manager][Germany]', 'Opening 
https://www.linkedin.com/jobs/search?keywords=Product+Manager&location=Germany&f_TPR=r86400&f_JT=F%2CI&start=25') INFO:li:scraper:('[Product Manager][Germany]', 'Waiting for new jobs to load') INFO:li:scraper:('[Product Manager][Germany]', 'Session is valid') INFO:li:scraper:('[Product Manager][Germany]', 'No jobs found, skip')