function use_crawlera(splash)
  -- Route browser requests through the Crawlera proxy.
  -- Aborts requests to known ad/tracking domains, bypasses the proxy for
  -- static assets, and records the last seen session header value.
  local user = splash.args.crawlera_user
  local host = 'proxy.crawlera.com'
  local port = 8010
  local session_header = 'X-Crawlera-Profile'
  local session_id = 'desktop'

  -- Lua patterns ('%' escapes magic chars) for advertising and tracking
  -- hosts whose requests are discarded outright.
  local blocked_patterns = {
    'doubleclick%.net',
    'api%-g%.weedmaps%.co',
    'collector%-pxp',
    'pixel%.sitescout%.com',
    'api%.honeybadger%.io',
    'google%-analytics%.com',
    'clickserv%.sitescout%.com',
    'trafficjunky%.net',
    'client%.perimeterx%.net',
    'evs%.segment%.weedmaps%.com',
    'analytics%.google%.com',
  }

  splash:on_request(function (request)
    -- Discard requests to advertising and tracking domains.
    for _, pattern in ipairs(blocked_patterns) do
      if string.find(request.url, pattern) then
        request.abort()
        return
      end
    end
    -- Avoid using Crawlera for URLs starting with 'static.' and for
    -- solid-fill image requests (URL ends in 'fill=solid').
    if string.find(request.url, '://static%.') ~= nil or
       string.find(request.url, 'fill=solid$') ~= nil then
      return
    end
    -- request:set_header('X-Crawlera-Cookies', 'disable')
    request:set_header('X-Crawlera-Profile', 'desktop')
    -- request:set_header(session_header, session_id)
    request:set_proxy{host, port, username=user, password=''}
  end)

  splash:on_response_headers(function (response)
    -- BUG FIX: the original tested `type(...) ~= nil`, which is ALWAYS true
    -- because type() returns a string ("nil", "string", ...), never nil.
    -- Compare the header value itself so session_id only updates when the
    -- header is actually present.
    if response.headers[session_header] ~= nil then
      session_id = response.headers[session_header]
    end
  end)
end
function main(splash)
  -- Entry point called by Splash. Wires up the Crawlera proxy, navigates to
  -- splash.args.url using the caller-supplied HTTP method, headers and form
  -- data, waits for asynchronous requests to settle, and returns the
  -- rendered HTML.
  use_crawlera(splash)

  -- splash:go returns ok, reason; the original discarded them, so a failed
  -- navigation silently rendered an empty page. Fail loudly instead.
  local ok, reason = splash:go{
    splash.args.url,
    http_method = splash.args.http_method,
    headers = splash.args.headers,
    formdata = splash.args.formdata,
  }
  if not ok then
    error('splash:go failed for ' .. tostring(splash.args.url) ..
          ': ' .. tostring(reason))
  end

  -- Give in-page XHRs time to complete before snapshotting the DOM.
  splash:wait(2)
  return splash:html()
end
Error:
2021-03-28 23:14:51 [scrapy_splash.middleware] WARNING: Bad request to Splash: {'error': 400, 'type': 'ScriptError', 'description': 'Error happened while executing Lua script', 'info': {'type': 'SPLASH_LUA_ERROR', 'message': '[string "<python>"]:50: Can\'t convert Lua object to Python: depth limit is reached', 'source': '[string "<python>"]', 'line_number': 50, 'error': "Can't convert Lua object to Python: depth limit is reached"}}
Another thought: if I instead execute the scripts on the page at https://www.iheartjane.com/embed/stores/1201/menu, where the request URL originates, the page itself should generate the XHR objects I'm trying to scrape, rather than me hitting the XHR endpoint directly with headers and formdata. However, when I tried that I ran into runjs errors, as Splash thinks I'm just trying to load the object.
function main(splash)
local url = splash.args.url
splash:autoload(splash.args.js_source)
assert(splash:go(url))
This looks like a similar problem to the following issue: https://github.com/scrapinghub/splash/issues/149
SplashRequest:
Lua Script:
Error:
Another thought: if I instead execute the scripts on the page at https://www.iheartjane.com/embed/stores/1201/menu, where the request URL originates, the page itself should generate the XHR objects I'm trying to scrape, rather than me hitting the XHR endpoint directly with headers and formdata. However, when I tried that I ran into runjs errors, as Splash thinks I'm just trying to load the object.