scrapy-plugins / scrapy-splash

Scrapy+Splash for JavaScript integration
BSD 3-Clause "New" or "Revised" License
3.15k stars 450 forks source link

Scrapy + Splash not rendering page correctly #137

Closed chairam closed 4 years ago

chairam commented 7 years ago

I'm using Scrapy + Splash, and I have problems downloading this page: http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText=

It seems that Splash cannot execute the JavaScript correctly: in the list, the spinner never goes away and the page numbers are not loaded. Here is a stripped-down, working, self-contained version of my program (sorry if it is not stripped down as much as it could be).

# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import sys
import io
import os
import base64

def saveFile(ss, fileNameExt, folderName):
    """Write the text *ss* to folderName/fileNameExt and return the file name.

    :param ss: text content to write
    :param fileNameExt: file name (with extension) to create inside *folderName*
    :param folderName: existing destination directory
    :returns: *fileNameExt*, unchanged, for the caller's bookkeeping
    """
    # Context manager guarantees the handle is closed even if write() raises;
    # explicit UTF-8 avoids a locale-dependent default encoding.
    with open(os.path.join(folderName, fileNameExt), 'w', encoding='utf-8') as f:
        f.write(ss)
    return fileNameExt

def savePng(png_bytes, fileNameExt, folderName):
    """Write raw PNG bytes to folderName/fileNameExt and return the file name.

    :param png_bytes: decoded binary PNG payload
    :param fileNameExt: file name (with extension) to create inside *folderName*
    :param folderName: existing destination directory
    :returns: *fileNameExt*, unchanged, for the caller's bookkeeping
    """
    # 'wb' + context manager: binary write, handle closed even on error.
    with open(os.path.join(folderName, fileNameExt), 'wb') as f:
        f.write(png_bytes)
    return fileNameExt

def savePageOriginalInFolder(response, folderName, chiave='pag1'):
    """Save the rendered HTML stored under key *chiave* of the Splash response.

    The Lua script returns the page source under keys like 'pag1'; the value
    is decoded as UTF-8 and written to site.html inside *folderName*.
    """
    fileName = "site.html"
    testo = response.data[chiave].decode('utf8')
    return saveFile(testo, fileName, folderName)


# NOTE(review): in the original paste this `def` was fused onto the previous
# function's return line — re-split here so the module parses.
def savePagePng(response, folderName, pngDataName):
    """Decode the base64 PNG stored under *pngDataName* and save it as site.png.

    Splash returns splash:png() base64-encoded, hence the b64decode step.
    Returns None when the response carries no .data attribute.
    """
    fileName = 'site.png'
    if hasattr(response, 'data'):
        png_bytes = base64.b64decode(response.data[pngDataName])
        return savePng(png_bytes, fileName, folderName)

class GenericoSpider(scrapy.Spider):
    """Render the ABB job-listing page through Splash's 'execute' endpoint
    and hand the returned HTML/PNG payloads to the module-level save helpers.
    """

    name = 'provaAbb'

    # Output folder is named after the spider.
    outDir = name
    db_name = ""

    # Lua program run by Splash: load the page, wait for the JS to settle,
    # then hand back the rendered HTML, a screenshot and the cookies.
    script = """
    function main(splash)
      local url = splash.args.url
      splash:set_viewport_size(1280, 2500)      
      splash:init_cookies(splash.args.cookies)
      assert(splash:go(url))
      assert(splash:wait(10))
      return {
        url  = splash:url(),
        pag1 = splash:html(),
        png1  = splash:png(),
        id_elenco = splash.args.id_elenco,
        id_sessione = splash.args.id_sessione,

        cookies = splash:get_cookies(),
        tt = splash.args
      }
    end
    """

    def asSplashRequest(self, url, callback, id_elenco="no_id", id_sessione="no_id_sessione"):
        """Build a SplashRequest that runs ``self.script`` against *url*."""
        splash_args = {
            'lua_source': self.script,
            'id_elenco': id_elenco,
            'id_sessione': id_sessione,
        }
        return SplashRequest(
            url=url,
            endpoint='execute',
            args=splash_args,
            callback=callback,
        )

    def start_requests(self):
        """Kick off the crawl with a single Splash-rendered request."""
        start_url = 'http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText='
        yield self.asSplashRequest(start_url, self.parse_list, 'id_mio_elenco')

    def parse_list(self, response):
        """Persist every 'pag*' (HTML) and 'png*' (screenshot) payload
        returned by the Lua script."""
        for key in response.data:
            # Keys shorter than 4 chars cannot carry a 'pagN'/'pngN' name.
            if len(key) < 4:
                continue
            prefix = key[0:3]
            if prefix == 'pag':
                fileName = savePageOriginalInFolder(response, self.outDir, key)
            elif prefix == 'png':
                fileName = savePagePng(response, self.outDir, key)

A part of the settings.py (nothing exotic)

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, }

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, }

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
Gallaecio commented 5 years ago

Could you please close this issue and report it on https://github.com/scrapinghub/splash?