scrapinghub / splash

Lightweight, scriptable browser as a service with an HTTP API
BSD 3-Clause "New" or "Revised" License
4.08k stars 514 forks source link

Can't convert Lua object to Python: depth limit is reached #1116

Open chipzzz opened 3 years ago

chipzzz commented 3 years ago

Similar problem to the following https://github.com/scrapinghub/splash/issues/149

SplashRequest:

    def start_requests(self):
        yield SplashRequest("https://vfm4x0n23a-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.5.1)%3B%20Browser%20(lite)%3B%20JS%20Helper%20(3.1.1)%3B%20react%20(16.13.1)%3B%20react-instantsearch%20(6.4.0)&x-algolia-api-key=b499e29eb7542dc373ec0254e007205d&x-algolia-application-id=VFM4X0N23A", callback=self.parse,
                    endpoint='execute',
                    meta={},
                    method='POST',
                    args={'wait': 1,
                          'lua_source': self.lua_script,
                          'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                          'formdata': {"requests":[{"indexName":"menu-products-production","params":"highlightPreTag=<ais-highlight-0000000000>&highlightPostTag=</ais-highlight-0000000000>&filters=store_id = 1201&hitsPerPage=0&userToken=12vUrVueHlXKkW1kAtMjV&enablePersonalization=true&personalizationImpact=50&maxValuesPerFacet=1000&query=&facets=[\"kind\",\"category\",\"percent_thc\",\"percent_cbd\",\"applicable_special_ids\",\"root_types\",\"aggregate_rating\",\"bucket_price\",\"available_weights\",\"brand\"]&tagFilters="}]}},
                    headers={'Accept': '*/*',
                             'Accept-Encoding': 'gzip, deflate, br',
                             'Accept-Language': 'en-US,en;q=0.9',
                             'Connection': 'keep-alive',
                             'content-type': 'application/x-www-form-urlencoded',
                             'DNT': '1',
                             'Content-Length': '530',
                             'Host': 'vfm4x0n23a-dsn.algolia.net',
                             'Origin': 'https://www.iheartjane.com',
                             'Referer': 'https://www.iheartjane.com/',
                             'sec-ch-ua': '\"Google Chrome\";v=\"89\", \"Chromium\";v=\"89\", \";Not A Brand\";v=\"99\"',
                             'sec-ch-ua-mobile': '?0',
                             'Sec-Fetch-Dest': 'empty',
                             'Sec-Fetch-Mode': 'cors',
                             'Sec-Fetch-Site': 'cross-site'},
                    splash_headers={'Authorization': basic_auth_header(self.settings['SPLASH_USER'], self.settings['SPLASH_PASS'])},
                    cache_args=['lua_source']
                    )

Lua Script:

function use_crawlera(splash)
    local user = splash.args.crawlera_user
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Profile'
    local session_id = 'desktop'

    splash:on_request(function (request)
        -- Discard requests to advertising and tracking domains.
        if string.find(request.url, 'doubleclick%.net') or
           string.find(request.url, 'api%-g%.weedmaps%.co') or
           string.find(request.url, 'collector%-pxp') or
           string.find(request.url, 'pixel%.sitescout%.com') or
           string.find(request.url, 'api%.honeybadger%.io') or
           string.find(request.url, 'google%-analytics%.com') or
           string.find(request.url, 'clickserv%.sitescout%.com') or
           string.find(request.url, 'trafficjunky%.net') or
           string.find(request.url, 'client%.perimeterx%.net') or
           string.find(request.url, 'evs%.segment%.weedmaps%.com') or
           string.find(request.url, 'analytics%.google%.com') then
              request.abort()
           return
        end

        -- Avoids using Crawlera for URLS starting
        -- with 'static.' and the ones ending with '.png'.
        if string.find(request.url, '://static%.') ~= nil or
           string.find(request.url, 'fill=solid$') ~= nil then
           return
        end

        -- request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header('X-Crawlera-Profile','desktop')
        -- request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        if type(response.headers[session_header]) ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    -- splash.request_body = 1
    -- splash.response_body = 1
    -- splash.response_body_enabled = true
    use_crawlera(splash)
    splash:go({http_method=splash.args.http_method,splash.args.url,headers=splash.args.headers,formdata=splash.args.formdata})
    splash:wait(2)
    return splash:html()
end

Error:

2021-03-28 23:14:51 [scrapy_splash.middleware] WARNING: Bad request to Splash: {'error': 400, 'type': 'ScriptError', 'description': 'Error happened while executing Lua script', 'info': {'type': 'SPLASH_LUA_ERROR', 'message': '[string "<python>"]:50: Can\'t convert Lua object to Python: depth limit is reached', 'source': '[string "<python>"]', 'line_number': 50, 'error': "Can't convert Lua object to Python: depth limit is reached"}}

Another thought if I do something like execute the scripts on the page instead, from https://www.iheartjane.com/embed/stores/1201/menu where the request url originates from, it should generate the xhr objects i'm trying to scrape, instead of me trying to hit the xhr with headers and formdata. However doing so I was running into runjs errors as splash things im just trying to load the object.

function main(splash)
  local url = splash.args.url

  splash:autoload(splash.args.js_source)
  assert(splash:go(url))