It seems that Splash cannot execute the javascript correctly: in the list the spinner never goes away and the page numbers are not loaded. Here is a stripped down, working, self contanied, version of my program (sorry if not stripped down at best)
# -*- coding: utf-8 -*-
import scrapy from scrapy_splash
import SplashRequest from scrapy.selector
import Selector from scrapy.http
import HtmlResponse
import sys
import io
import os
import base64
def saveFile(ss, fileNameExt, folderName):
f = open(folderName + '/' + fileNameExt, 'w')
f.write(ss)
f.close()
return fileNameExt
def savePng(png_bytes, fileNameExt, folderName):
f = open( folderName +'/' + fileNameExt, 'wb')
f.write(png_bytes)
f.close()
return fileNameExt
def savePageOriginalInFolder(response, folderName, chiave='pag1'):
fileName = "site.html"
testo = response.data[chiave].decode('utf8')
return saveFile(testo, fileName, folderName) def savePagePng(response, folderName, pngDataName):
fileName = 'site.png'
if hasattr(response, 'data'):
png_bytes = base64.b64decode(response.data[pngDataName])
return savePng(png_bytes, fileName, folderName)
class GenericoSpider(scrapy.Spider):
name = 'provaAbb'
def asSplashRequest(self, url, callback, id_elenco="no_id", id_sessione="no_id_sessione"):
return SplashRequest(
url = url,
endpoint='execute',
args={'lua_source': self.script, 'id_elenco': id_elenco, 'id_sessione': id_sessione},
callback=callback,
)
outDir = name # prendo in nome della cartella dal nome dello spider
db_name = ""
def start_requests(self):
sito = 'http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText='
yield self.asSplashRequest(sito, self.parse_list, 'id_mio_elenco')
script = """
function main(splash)
local url = splash.args.url
splash:set_viewport_size(1280, 2500)
splash:init_cookies(splash.args.cookies)
assert(splash:go(url))
assert(splash:wait(10))
return {
url = splash:url(),
pag1 = splash:html(),
png1 = splash:png(),
id_elenco = splash.args.id_elenco,
id_sessione = splash.args.id_sessione,
cookies = splash:get_cookies(),
tt = splash.args
}
end
"""
def parse_list(self, response):
for ss in response.data:
if len(ss) >= 4:
if ss[0:3] == 'pag':
fileName = savePageOriginalInFolder(response, self.outDir, ss)
elif ss[0:3] == 'png':
fileName = savePagePng(response, self.outDir,ss)
Im' using Scrapy + Splash, I have problems downloading this page: http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText=
It seems that Splash cannot execute the javascript correctly: in the list the spinner never goes away and the page numbers are not loaded. Here is a stripped down, working, self contanied, version of my program (sorry if not stripped down at best)
A part of the settings.py (nothing exotic)