clemfromspace / scrapy-selenium

Scrapy middleware to handle javascript pages using selenium
Do What The F*ck You Want To Public License

Driver in meta seems to be incorrect if parsing takes too long #111

Open ospaarmann opened 2 years ago

ospaarmann commented 2 years ago

I have built a spider where I have to scroll multiple times to fetch all the content. This works when I scrape a single URL. As soon as I add more URLs, the data scraped from the page I have to scroll gets duplicated: response.request.url and the content I scrape via XPath on the response are correct, but what I scrape through the driver is sometimes duplicated, which makes this plugin unusable for my case.

My guess is that the driver exposed in response.request.meta['driver'] is the same instance across different requests, so when parsing takes long enough, it has already moved on to another page by the time my callback uses it.

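For context, my understanding (paraphrased from memory, so this is only a rough sketch and not the exact code) is that the middleware creates a single webdriver when it is instantiated and reuses that one instance for every SeleniumRequest, attaching it to each request's meta:

# rough paraphrase of what SeleniumMiddleware.process_request does, from memory
from scrapy.http import HtmlResponse
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.support.ui import WebDriverWait

def process_request(self, request, spider):
    if not isinstance(request, SeleniumRequest):
        return None

    self.driver.get(request.url)  # the single shared driver navigates to the new URL
    if request.wait_until:
        WebDriverWait(self.driver, request.wait_time).until(request.wait_until)
    if request.script:
        self.driver.execute_script(request.script)

    body = str.encode(self.driver.page_source)
    # the same driver instance ends up in every request's meta
    request.meta.update({'driver': self.driver})
    return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)

If that is accurate, then by the time a slow callback touches meta['driver'], the driver may already have navigated to the next scheduled URL.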
Here is some simplified code of what I'm doing:

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from ..base_spider import BaseSpider, enough_objects_loaded
from ..items import ExampleResult
import os
from pprint import pprint
from dataclasses import asdict

class ExampleSpider(BaseSpider):
    name = 'example_spider'

    def start_requests(self):
      self.start_urls = [
        'https://www.example.com/company/company-1/',
        'https://www.example.com/company/company-2',
      ]

      self.login_script = 'document.getElementById("username").value = "' + os.getenv('EMAIL') + '";' \
                  'document.getElementById("password").value = "' + os.getenv('PASSWORD') + '";' \
                  'document.querySelector(\'button[data-control="login-submit"]\').click();'

      request = SeleniumRequest(
        url='https://example.com/login',
        callback=self.check_login,
        script=self.login_script,
        dont_filter=True,
      )

      request.meta['login_attempts'] = 0

      yield request

    # Checks if the login was successful
    def check_login(self, response):
      title = response.selector.xpath('//title/text()').get()
      print("Login attempts: " + str(response.meta['login_attempts']))

      if "home" in title.lower():
        print('START SCRAPING')
        # Yield actual requests for start_urls
        for i, url in enumerate(self.start_urls):
          request = SeleniumRequest(
            url=url,
            callback=self.parse_basic_company_info,
            wait_time=5,
            wait_until=EC.element_to_be_clickable((By.CSS_SELECTOR, 'p.class_name'))
          )
          request.meta['site_url'] = url

          self.random_delay()
          yield request

      elif response.meta['login_attempts'] < 5:
        request = SeleniumRequest(
          url='https://example.com/login',
          callback=self.check_login,
          script=self.login_script,
          dont_filter=True,
        )

        request.meta['login_attempts'] = response.meta['login_attempts'] + 1

        yield request
      else:
        print('Number of login attempts exceeded')

    def parse_basic_company_info(self, response):
      example_result = ExampleResult()
      example_result.url = response.meta['site_url']
      example_result.company_description = self.cleanup_string(response.xpath("//p[contains(@class, 'class_name')]/text()").get())

      # Fetch number of jobs
      jobs_url = self.jobs_url(example_result.url)
      jobs_request = SeleniumRequest(
        url=jobs_url,
        callback=self.parse_job_page,
        wait_time=5,
        wait_until=EC.element_to_be_clickable((By.CSS_SELECTOR, 'h4.class_name'))
      )
      jobs_request.meta['example_result'] = example_result

      self.random_delay()
      yield jobs_request

    def parse_job_page(self, response):
      example_result = response.meta['example_result']
      raw_jobs_count = response.xpath("//h4[contains(@class, 'class_name')]/text()").get()
      example_result.open_jobs = self.extract_number(raw_jobs_count)

      people_url = self.people_url(example_result.url)

      print(['PEOPLE URL: ', people_url])

      people_request = SeleniumRequest(
        url=people_url,
        callback=self.parse_people_sample,
        wait_time=5,
        wait_until=EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.class_name'))
      )
      people_request.meta['example_result'] = example_result

      self.random_delay()
      yield people_request

    def parse_people_sample(self, response):
      example_result = response.meta['example_result']
      driver = response.request.meta['driver']

      print("example_result:")
      pprint(example_result)
      print("PEOPLE URL PEOPLE PARSER: " + response.request.url)
      print("DRIVER")
      pprint(response.request.meta['driver'])

      # Scroll to the bottom of page repeatedly
      scroll_depth = self.random_number(35, 40)
      # Get scroll height
      last_height = driver.execute_script("return document.body.scrollHeight")

      while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        self.random_delay(2, 5)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height or scroll_depth == 0:
            break
        last_height = new_height
        scroll_depth = scroll_depth - 1

      # Scrape data

      elements = driver.find_elements_by_xpath("//div[contains(@class, 'class_name')]")
      raw_employee_names = []

      if(isinstance(elements, list)):
        for el in elements:
          raw_employee_names.append(el.text)

      print("raw_employee_names")
      pprint(raw_employee_names)

      example_result.employee_names = list(map(self.cleanup_string, raw_employee_names))

      yield example_result

What stands out are these lines:

    def parse_people_sample(self, response):
      example_result = response.meta['example_result']
      driver = response.request.meta['driver']

      print("example_result:")
      pprint(example_result)
      print("PEOPLE URL PEOPLE PARSER: " + response.request.url)
      print("DRIVER")
      pprint(response.request.meta['driver'])

The values for example_result and for response.request.url are correct and distinct. But the output for response.request.meta['driver'] is identical (same session ID), which doesn't seem right. Furthermore, driver.current_url is identical in both invocations of parse_people_sample.

The corresponding log output:

2022-03-01 15:26:25 PEOPLE URL PEOPLE PARSER: https://www.example.com/company/company-1/people
DRIVER <selenium.webdriver.chrome.webdriver.WebDriver (session="5715cce26b4d45dc6dd5db875fb90418")>
DRIVER URL https://www.example.com/company/company-2/people/

2022-03-01 15:28:27 PEOPLE URL PEOPLE PARSER: https://www.example.com/company/company-2/people
DRIVER <selenium.webdriver.chrome.webdriver.WebDriver (session="5715cce26b4d45dc6dd5db875fb90418")>
DRIVER URL https://www.example.com/company/company-2/people/

As you can see, driver.current_url points to company 2 in both cases, where it should be company 1 first and then company 2. I don't know why it isn't company 1 in both cases, but I guess that has to do with the order in which the requests are processed: the driver apparently navigates to the next URL before the previous callback has finished.

By the way, I disabled concurrent requests in my BaseSpider:

class BaseSpider(scrapy.Spider):
    name = "base_spider"
    start_urls = []
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_selenium.SeleniumMiddleware': 800
        },
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS': 1
    }

I'm not sure whether I'm missing something, or whether there is a better way to make the spider wait between scrolls than time.sleep() as I use it here:

    def random_delay(self, min = 10, max = 21):
      delay = random.choice(range(min, max))
      print("WAITING: " + str(delay))
      time.sleep(delay)
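
If it helps, one alternative to a fixed time.sleep() after each scroll would be to wait on the scroll height itself with a WebDriverWait. This is just a sketch (the helper name scroll_and_wait is made up, and it still uses the same shared driver, so it doesn't address the underlying problem):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

def scroll_and_wait(driver, timeout=10):
    # scroll to the bottom and wait until the page actually grows (or time out)
    last_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script("return document.body.scrollHeight") > last_height
        )
        return True   # new content was loaded
    except TimeoutException:
        return False  # height did not change, probably reached the end of the page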

But it looks like the way the driver is passed around breaks down once scraping takes longer.
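
One workaround I'm considering is to move the scrolling into the downloader middleware itself, so it happens while the driver is still on the right page, and then scrape the names from the response instead of the live driver. A rough, untested sketch (the class name ScrollingSeleniumMiddleware and the scroll/max_scrolls meta keys are my own; it assumes the middleware keeps the driver as self.driver and builds an HtmlResponse from page_source):

import time

from scrapy.http import HtmlResponse
from scrapy_selenium import SeleniumMiddleware


class ScrollingSeleniumMiddleware(SeleniumMiddleware):
    def process_request(self, request, spider):
        response = super().process_request(request, spider)
        if response is None or not request.meta.get('scroll'):
            return response

        # scroll while the driver is still on this request's page
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        for _ in range(request.meta.get('max_scrolls', 40)):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # rebuild the response so the body contains the lazily loaded content
        return HtmlResponse(
            self.driver.current_url,
            body=str.encode(self.driver.page_source),
            encoding='utf-8',
            request=request,
        )

With that, parse_people_sample could read the employee names with response.xpath(...) instead of touching the driver, and DOWNLOADER_MIDDLEWARES would point to the custom class instead of scrapy_selenium.SeleniumMiddleware.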

ospaarmann commented 2 years ago

Maybe a solution is to handle requests in a new tab. This SO thread might be of interest: https://stackoverflow.com/questions/28431765/open-web-in-new-tab-selenium-python
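
For reference, opening and switching to a new tab in Selenium would look roughly like this (only a sketch; the middleware would still need changes so that each request actually gets its own tab):

# open a blank tab and make the driver point at it
driver.execute_script("window.open('');")
driver.switch_to.window(driver.window_handles[-1])
driver.get('https://www.example.com/company/company-1/people')

# ... scrape ...

# close the tab and switch back to the original one
driver.close()
driver.switch_to.window(driver.window_handles[0])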