I have built a spider that has to scroll a page multiple times to fetch all of its content. This works when I scrape a single URL. As soon as I add more URLs, the data scraped from the page I have to scroll is duplicated. The response.request.url and the content I scrape via XPath are correct, but what I scrape through the driver is sometimes duplicated, which makes this plugin unusable for me.
I guess that the driver exposed in response.request.meta['driver'] is the same across different requests because the scraping takes so long.
Here is some simplified code of what I'm doing:
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from ..base_spider import BaseSpider, enough_objects_loaded
from ..items import ExampleResult
import os
from pprint import pprint
from dataclasses import asdict

class ExampleSpider(BaseSpider):
    name = 'example_spider'

    def start_requests(self):
        self.start_urls = [
            'https://www.example.com/company/company-1/',
            'https://www.example.com/company/company-2',
        ]

        self.login_script = 'document.getElementById("username").value = "' + os.getenv('EMAIL') + '";' \
            'document.getElementById("password").value = "' + os.getenv('PASSWORD') + '";' \
            'document.querySelector(\'button[data-control="login-submit"]\').click();'

        request = SeleniumRequest(
            url='https://example.com/login',
            callback=self.check_login,
            script=self.login_script,
            dont_filter=True,
        )
        request.meta['login_attempts'] = 0
        yield request

    # Checks if the login was successful
    def check_login(self, response):
        title = response.selector.xpath('//title/text()').get()
        print("Login attempts: " + str(response.meta['login_attempts']))

        if "home" in title.lower():
            print('START SCRAPING')
            # Yield the actual requests for start_urls
            for i, url in enumerate(self.start_urls):
                request = SeleniumRequest(
                    url=url,
                    callback=self.parse_basic_company_info,
                    wait_time=5,
                    wait_until=EC.element_to_be_clickable((By.CSS_SELECTOR, 'p.class_name'))
                )
                request.meta['site_url'] = url
                self.random_delay()
                yield request
        elif response.meta['login_attempts'] < 5:
            request = SeleniumRequest(
                url='https://example.com/login',
                callback=self.check_login,
                script=self.login_script,
                dont_filter=True,
            )
            request.meta['login_attempts'] = response.meta['login_attempts'] + 1
            yield request
        else:
            print('Number of login attempts exceeded')

    def parse_basic_company_info(self, response):
        example_result = ExampleResult()
        example_result.url = response.meta['site_url']
        example_result.company_description = self.cleanup_string(response.xpath("//p[contains(@class, 'class_name')]/text()").get())

        # Fetch number of jobs
        jobs_url = self.jobs_url(example_result.url)
        jobs_request = SeleniumRequest(
            url=jobs_url,
            callback=self.parse_job_page,
            wait_time=5,
            wait_until=EC.element_to_be_clickable((By.CSS_SELECTOR, 'h4.class_name'))
        )
        jobs_request.meta['example_result'] = example_result
        self.random_delay()
        yield jobs_request

    def parse_job_page(self, response):
        example_result = response.meta['example_result']
        raw_jobs_count = response.xpath("//h4[contains(@class, 'class_name')]/text()").get()
        example_result.open_jobs = self.extract_number(raw_jobs_count)

        people_url = self.people_url(example_result.url)
        print(['PEOPLE URL: ', people_url])
        people_request = SeleniumRequest(
            url=people_url,
            callback=self.parse_people_sample,
            wait_time=5,
            wait_until=EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.class_name'))
        )
        people_request.meta['example_result'] = example_result
        self.random_delay()
        yield people_request

    def parse_people_sample(self, response):
        example_result = response.meta['example_result']
        driver = response.request.meta['driver']

        print("example_result:")
        pprint(example_result)
        print("PEOPLE URL PEOPLE PARSER: " + response.request.url)
        print("DRIVER")
        pprint(response.request.meta['driver'])
        print("DRIVER URL " + driver.current_url)

        # Scroll to the bottom of the page repeatedly
        scroll_depth = self.random_number(35, 40)
        # Get the initial scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for the page to load
            self.random_delay(2, 5)
            # Calculate the new scroll height and compare it with the last one
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or scroll_depth == 0:
                break
            last_height = new_height
            scroll_depth = scroll_depth - 1

        # Scrape the data that was loaded by scrolling
        elements = driver.find_elements_by_xpath("//div[contains(@class, 'class_name')]")
        raw_employee_names = []
        if isinstance(elements, list):
            for el in elements:
                raw_employee_names.append(el.text)

        print("raw_employee_names")
        pprint(raw_employee_names)
        example_result.employee_names = list(map(self.cleanup_string, raw_employee_names))
        yield example_result
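For reference, the helper methods from BaseSpider used above are small utilities roughly like this (simplified sketches, not the exact implementations):

import random
import re
import time

import scrapy

class BaseSpider(scrapy.Spider):
    def random_delay(self, minimum=1, maximum=3):
        # Sleep for a random number of seconds to mimic human browsing
        time.sleep(random.uniform(minimum, maximum))

    def random_number(self, minimum, maximum):
        return random.randint(minimum, maximum)

    def cleanup_string(self, value):
        # Collapse whitespace; tolerate None for missing elements
        return re.sub(r'\s+', ' ', value).strip() if value else value

    def extract_number(self, value):
        # Pull the first integer out of a string like "123 open jobs"
        match = re.search(r'\d+', (value or '').replace(',', ''))
        return int(match.group()) if match else None

    def jobs_url(self, company_url):
        # e.g. .../company/company-1/ -> .../company/company-1/jobs/
        return company_url.rstrip('/') + '/jobs/'

    def people_url(self, company_url):
        return company_url.rstrip('/') + '/people/'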
The results for example_result and for response.request.url are correct and distinct, but the output for response.request.meta['driver'] is identical, and I'm not sure that is correct. Furthermore, the values of driver.current_url are identical in both iterations of parse_people_sample.
The corresponding log output:
2022-03-01 15:26:25 PEOPLE URL PEOPLE PARSER: https://www.example.com/company/company-1/people
DRIVER <selenium.webdriver.chrome.webdriver.WebDriver (session="5715cce26b4d45dc6dd5db875fb90418")>
DRIVER URL https://www.example.com/company/company-2/people/
2022-03-01 15:28:27 PEOPLE URL PEOPLE PARSER: https://www.example.com/company/company-2/people
DRIVER <selenium.webdriver.chrome.webdriver.WebDriver (session="5715cce26b4d45dc6dd5db875fb90418")>
DRIVER URL https://www.example.com/company/company-2/people/
As you can see, driver.current_url is in both cases the URL of company 2, whereas it should be company 1 first and then company 2. I don't know why it isn't company 1 in both cases, but I guess that has to do with the way the requests are processed.
By the way, I disabled concurrent requests in my BaseSpider:
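It is just the standard setting (simplified; I left out the rest of the class):

custom_settings = {
    # Process only one request at a time
    'CONCURRENT_REQUESTS': 1,
}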
I'm not sure if I'm missing something, or whether there is a better way to make the spider wait and scroll repeatedly than time.sleep() the way I'm using it. But it looks like the way the driver is passed around does not work when scraping takes longer.
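For clarity, this explicit-wait version of the scroll loop is the kind of alternative to the fixed random delays I have in mind, though I have not verified that it changes anything about how the driver is shared (untested sketch, scroll_to_bottom is just a name I made up):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

def scroll_to_bottom(driver, max_scrolls=40, timeout=10):
    # Scroll until the page height stops growing or max_scrolls is reached,
    # waiting for new content to appear instead of sleeping for a fixed time.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script("return document.body.scrollHeight") > last_height
            )
        except TimeoutException:
            # The height did not grow within the timeout, assume we reached the bottom
            break
        last_height = driver.execute_script("return document.body.scrollHeight")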