codemanki / cloudscraper

--DEPRECATED -- 🛑 🛑 Node.js library to bypass cloudflare's anti-ddos page
MIT License
601 stars 139 forks source link

Got an error on first attempt #164

Closed naorye closed 5 years ago

naorye commented 5 years ago

I am scraping 2000 urls and some of them got Cloudflare's page. I started my scraper and once I got Cloudflare's page, an error occurred:

Unhandled rejection StatusCodeError: 429 - "<!DOCTYPE HTML>\n<html lang=\"en-US\">\n<head>\n  <meta charset=\"UTF-8\" />\n  <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n  <meta http-equiv=\"X-UA-Compatible\" content=\"IE=Edge,chrome=1\" />\n  <meta name=\"robots\" content=\"noindex, nofollow\" />\n  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1, maximum-scale=1\" />\n  <title>Just a moment...</title>\n  <style type=\"text/css\">\n    html, body {width: 100%; height: 100%; margin: 0; padding: 0;}\n    body {background-color: #ffffff; font-family: Helvetica, Arial, sans-serif; font-size: 100%;}\n    h1 {font-size: 1.5em; color: #404040; text-align: center;}\n    p {font-size: 1em; color: #404040; text-align: center; margin: 10px 0 0 0;}\n    #spinner {margin: 0 auto 30px auto; display: block;}\n    .attribution {margin-top: 20px;}\n    @-webkit-keyframes bubbles { 33%: { -webkit-transform: translateY(10px); transform: translateY(10px); } 66% { -webkit-transform: translateY(-10px); transform: translateY(-10px); } 100% { -webkit-transform: translateY(0); transform: translateY(0); } }\n    @keyframes bubbles { 33%: { -webkit-transform: translateY(10px); transform: translateY(10px); } 66% { -webkit-transform: translateY(-10px); transform: translateY(-10px); } 100% { -webkit-transform: translateY(0); transform: translateY(0); } }\n    .bubbles { background-color: #404040; width:15px; height: 15px; margin:2px; border-radius:100%; -webkit-animation:bubbles 0.6s 0.07s infinite ease-in-out; animation:bubbles 0.6s 0.07s infinite ease-in-out; -webkit-animation-fill-mode:both; animation-fill-mode:both; display:inline-block; }\n  </style>\n\n    <script type=\"text/javascript\">\n  //<![CDATA[\n  (function(){\n    var a = function() {try{return !!window.addEventListener} catch(e) {return !1} },\n    b = function(b, c) {a() ? 
document.addEventListener(\"DOMContentLoaded\", b, c) : document.attachEvent(\"onreadystatechange\", b)};\n    b(function(){\n      var a = document.getElementById('cf-content');a.style.display = 'block';\n      setTimeout(function(){\n        var s,t,o,p,b,r,e,a,k,i,n,g,f, rRfHMdw={\"QJgUiFSc\":+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+!![]))};\n        t = document.createElement('div');\n        t.innerHTML=\"<a href='/'>x</a>\";\n        t = t.firstChild.href;r = t.match(/https?:\\/\\//)[0];\n        t = t.substr(r.length); t = t.substr(0,t.length-1); \n        a = document.getElementById('jschl-answer');\n        f = document.getElementById('challenge-form');\n        
;rRfHMdw.QJgUiFSc-=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]))/+((!+[]+!![]+!![]+[])+(!+[]+!![])+(+!![])+(+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]));rRfHMdw.QJgUiFSc-=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]));rRfHMdw.QJgUiFSca.value = (+rRfHMdw.QJgUiFSc + t.length).toFixed(10); '; 121'\n        f.action += location.hash;\n        f.submit();\n      }, 4000);\n    }, false);\n  })();\n  //]]>\n</script>\n\n\n</head>\n<body>\n  <table width=\"100%\" height=\"100%\" cellpadding=\"20\">\n    <tr>\n      <td align=\"center\" valign=\"middle\">\n          <div class=\"cf-browser-verification cf-im-under-attack\">\n  <noscript><h1 data-translate=\"turn_on_js\" style=\"color:#bd2426;\">Please turn JavaScript on and reload the page.</h1></noscript>\n  <div id=\"cf-content\" style=\"display:none\">\n    \n    <div>\n      <div class=\"bubbles\"></div>\n      <div class=\"bubbles\"></div>\n      <div class=\"bubbles\"></div>\n    </div>\n    <h1><span data-translate=\"checking_browser\">Checking your browser before accessing</span> coinmarketcap.com.</h1>\n    \n    <p data-translate=\"process_is_automatic\">This process is automatic. 
Your browser will redirect to your requested content shortly.</p>\n    <p data-translate=\"allow_5_secs\">Please allow up to 5 seconds&hellip;</p>\n  </div>\n   \n  <form id=\"challenge-form\" action=\"/cdn-cgi/l/chk_jschl\" method=\"get\">\n    <input type=\"hidden\" name=\"s\" value=\"d341f3ce070cdcebd4a2f66166ae27fc5b1817fb-1553589396-1800-AYyUBxmsc2D+SJcJIdDh1Xg1WxDUxhFrYTD53N9o++dluuXCBGXzEdNruJwaA1fvm3HWVySlnt6uAwvvR/wlQ6bT1DLvvsSU08MRLdYKSjWOqBwuj4DDvMoN4Mk2Xlz9FKBS1o6QfdNaORff8LeeUtVgUUBphbKoXbfPnT64zQ9Z\"></input>\n    <input type=\"hidden\" name=\"jschl_vc\" value=\"1fa554b4b911a7f4faa8c23c0848ba78\"/>\n    <input type=\"hidden\" name=\"pass\" value=\"1553589400.048-AY+6BjyyjV\"/>\n    <input type=\"hidden\" id=\"jschl-answer\" name=\"jschl_answer\"/>\n  </form>\n</div>\n\n          \n          <div class=\"attribution\">\n            <a href=\"https://www.cloudflare.com/5xx-error-landing?utm_source=iuam\" target=\"_blank\" style=\"font-size: 12px;\">DDoS protection by Cloudflare</a>\n            <br>\n            Ray ID: 4bd7cc3d4f486373\n          </div>\n      </td>\n     \n    </tr>\n  </table>\n  \n</body>\n</html>\n"
    at new StatusCodeError (/my-app/node_modules/request-promise-core/lib/errors.js:32:15)
    at Request.plumbing.callback (/my-app/node_modules/request-promise-core/lib/plumbing.js:104:33)
    at Request.RP$callback [as _callback] (/my-app/node_modules/request-promise-core/lib/plumbing.js:46:31)
    at self.callback (/my-app/node_modules/request/request.js:185:22)
    at processResponseBody (/my-app/node_modules/cloudscraper/index.js:344:3)
    at processRequestResponse (/my-app/node_modules/cloudscraper/index.js:162:12)
    at Object.onceWrapper (events.js:317:30)
    at emitTwo (events.js:126:13)
    at Request.emit (events.js:214:7)
    at Request.<anonymous> (/my-app/node_modules/request/request.js:1161:10)
    at emitOne (events.js:116:13)
    at Request.emit (events.js:211:7)
    at IncomingMessage.<anonymous> (/my-app/node_modules/request/request.js:1083:12)
    at Object.onceWrapper (events.js:313:30)
    at emitNone (events.js:111:20)
    at IncomingMessage.emit (events.js:208:7)
    at endReadableNT (_stream_readable.js:1064:12)
    at _combinedTickCallback (internal/process/next_tick.js:138:11)
    at process._tickCallback (internal/process/next_tick.js:180:9)
From previous event:
    at Request.plumbing.init (/my-app/node_modules/request-promise-core/lib/plumbing.js:36:28)
    at Request.RP$initInterceptor [as init] (/my-app/node_modules/request-promise-core/configure/request2.js:41:27)
    at new Request (/my-app/node_modules/request/request.js:127:8)
    at request (/my-app/node_modules/request/index.js:53:10)
    at performRequest (/my-app/node_modules/cloudscraper/index.js:94:17)
    at /my-app/node_modules/cloudscraper/index.js:41:14
    at Function.get (/my-app/node_modules/request/index.js:100:12)
    at Promise (/my-app/app/utils/scraping.js:88:26)
    at new Promise (<anonymous>)
    at scrapeSinglePage (/my-app/app/utils/scraping.js:79:12)
    at scrapeCurrencyPage (/my-app/app/services/coinMarketCap/scrapeCurrenciesUrls.js:39:31)
    at currencies.map (/my-app/app/services/coinMarketCap/scrapeCurrenciesUrls.js:82:35)
    at <anonymous>

Any idea what's wrong?

codemanki commented 5 years ago

hi @naorye . Are you sure that this was your first request? 429 sounds like you made some prior requests from that IP. Could you also please post the response headers for further investigation?

naorye commented 5 years ago

Hi @codemanki, I am sorry, I wasn't clear. I meant that I tried cloudscraper for the first time and it threw an error. I performed 100 GET requests; the 7th request got Cloudflare's page and that error was thrown. ... Now that I think about it, it is possible that earlier requests also got Cloudflare's page and were handled successfully.

What causes this exception?

ghost commented 5 years ago

@naorye There was a very slim chance that the Cloudflare page could be returned. I raised that issue in #162 and @codemanki closed/fixed it with #163.

The status code error happens whenever the status code is not 200 OK. If that should not be an error, you may pass the simple: false option to cloudscraper.

cloudscraper.get({ uri, simple: false }).then(console.log);

For more information on the simple option, see request-promise's docs: https://github.com/request/request-promise#migration-from-v2-to-v3

Here is a gist of the HTML from the status code error above: https://gist.github.com/pro-src/54893b5153bcb9afcd1d3205b1bb3db2

naorye commented 5 years ago

@pro-src Is there a chance that Cloudflare returns a 429 code along with its error page?

ghost commented 5 years ago

@naorye Yes but the challenge is always sent with 503 status so that error is mysterious. I ran the following test for a count of 50 and it didn't error at all. I can't reproduce this on the master branch so maybe we've already fixed it.

# Run the test script 50 times in a row to try to trigger the intermittent failure.
# -x traces each command as it runs; -e aborts on the first non-zero exit status.
set -xe
for num in {1..50}
do
  node test.js
done
// test.js
// Minimal reproduction attempt: load the package from the current directory,
// fetch the page once and print the resolved value.
// A failure is written to stderr and turned into a non-zero exit status so
// that whatever invoked this script can detect it.
const cloudscraper = require('.');
cloudscraper.get('http://coinmarketcap.com').then(console.log, error => {
  console.error(error);
  process.exit(1);
});
ghost commented 5 years ago

@naorye What version of cloudscraper are you using? Can you test with the master branch of this repository? rm -rf node_modules/cloudscraper && npm install --save 'https://github.com/codemanki/cloudscraper'

naorye commented 5 years ago

@pro-src I tried with simple: false, but then when I get Cloudflare's page, it doesn't manage to reach the page behind the protection. I am using "cloudscraper": "^3.3.0". I'll try to install directly from GitHub.

Edit: Installing directly from GitHub with npm gives the same result.

ghost commented 5 years ago

@naorye btw, thanks for reporting this. Can you please provide a minimal test case?

Be sure to remove cloudscraper from node_modules before installing directly from github and maybe remove the "^3.3.0" from package.json. I still can't reproduce this using the master branch

naorye commented 5 years ago

I'll try to create a reproduction.

naorye commented 5 years ago

Running the following threw an error on the 15th scrape. It might take longer.

const cloudscraper = require('cloudscraper');

// Scrape targets for the reproduction: 100 currency detail pages on
// coinmarketcap.com, fetched in the order listed.
const urls = [
    'https://coinmarketcap.com/currencies/bitcoin/',
    'https://coinmarketcap.com/currencies/ethereum/',
    'https://coinmarketcap.com/currencies/ripple/',
    'https://coinmarketcap.com/currencies/litecoin/',
    'https://coinmarketcap.com/currencies/eos/',
    'https://coinmarketcap.com/currencies/bitcoin-cash/',
    'https://coinmarketcap.com/currencies/binance-coin/',
    'https://coinmarketcap.com/currencies/tether/',
    'https://coinmarketcap.com/currencies/stellar/',
    'https://coinmarketcap.com/currencies/cardano/',
    'https://coinmarketcap.com/currencies/tron/',
    'https://coinmarketcap.com/currencies/bitcoin-sv/',
    'https://coinmarketcap.com/currencies/monero/',
    'https://coinmarketcap.com/currencies/iota/',
    'https://coinmarketcap.com/currencies/dash/',
    'https://coinmarketcap.com/currencies/maker/',
    'https://coinmarketcap.com/currencies/neo/',
    'https://coinmarketcap.com/currencies/ontology/',
    'https://coinmarketcap.com/currencies/ethereum-classic/',
    'https://coinmarketcap.com/currencies/tezos/',
    'https://coinmarketcap.com/currencies/nem/',
    'https://coinmarketcap.com/currencies/zcash/',
    'https://coinmarketcap.com/currencies/vechain/',
    'https://coinmarketcap.com/currencies/basic-attention-token/',
    'https://coinmarketcap.com/currencies/waves/',
    'https://coinmarketcap.com/currencies/usd-coin/',
    'https://coinmarketcap.com/currencies/dogecoin/',
    'https://coinmarketcap.com/currencies/omisego/',
    'https://coinmarketcap.com/currencies/qtum/',
    'https://coinmarketcap.com/currencies/crypto-com-chain/',
    'https://coinmarketcap.com/currencies/bitcoin-gold/',
    'https://coinmarketcap.com/currencies/trueusd/',
    'https://coinmarketcap.com/currencies/decred/',
    'https://coinmarketcap.com/currencies/lisk/',
    'https://coinmarketcap.com/currencies/0x/',
    'https://coinmarketcap.com/currencies/augur/',
    'https://coinmarketcap.com/currencies/chainlink/',
    'https://coinmarketcap.com/currencies/zilliqa/',
    'https://coinmarketcap.com/currencies/bitshares/',
    'https://coinmarketcap.com/currencies/maximine-coin/',
    'https://coinmarketcap.com/currencies/ravencoin/',
    'https://coinmarketcap.com/currencies/icon/',
    'https://coinmarketcap.com/currencies/holo/',
    'https://coinmarketcap.com/currencies/digibyte/',
    'https://coinmarketcap.com/currencies/bytecoin-bcn/',
    'https://coinmarketcap.com/currencies/steem/',
    'https://coinmarketcap.com/currencies/bittorrent/',
    'https://coinmarketcap.com/currencies/nano/',
    'https://coinmarketcap.com/currencies/bitcoin-diamond/',
    'https://coinmarketcap.com/currencies/enjin-coin/',
    'https://coinmarketcap.com/currencies/huobi-token/',
    'https://coinmarketcap.com/currencies/paxos-standard-token/',
    'https://coinmarketcap.com/currencies/kucoin-shares/',
    'https://coinmarketcap.com/currencies/aeternity/',
    'https://coinmarketcap.com/currencies/verge/',
    'https://coinmarketcap.com/currencies/komodo/',
    'https://coinmarketcap.com/currencies/pundi-x/',
    'https://coinmarketcap.com/currencies/bytom/',
    'https://coinmarketcap.com/currencies/siacoin/',
    'https://coinmarketcap.com/currencies/iostoken/',
    'https://coinmarketcap.com/currencies/aurora/',
    'https://coinmarketcap.com/currencies/theta/',
    'https://coinmarketcap.com/currencies/abbc-coin/',
    'https://coinmarketcap.com/currencies/stratis/',
    'https://coinmarketcap.com/currencies/dai/',
    'https://coinmarketcap.com/currencies/insight-chain/',
    'https://coinmarketcap.com/currencies/golem-network-tokens/',
    'https://coinmarketcap.com/currencies/status/',
    'https://coinmarketcap.com/currencies/populous/',
    'https://coinmarketcap.com/currencies/ardor/',
    'https://coinmarketcap.com/currencies/project-pai/',
    'https://coinmarketcap.com/currencies/ark/',
    'https://coinmarketcap.com/currencies/revain/',
    'https://coinmarketcap.com/currencies/mixin/',
    'https://coinmarketcap.com/currencies/cryptonex/',
    'https://coinmarketcap.com/currencies/gemini-dollar/',
    'https://coinmarketcap.com/currencies/gxchain/',
    'https://coinmarketcap.com/currencies/hypercash/',
    'https://coinmarketcap.com/currencies/digitex-futures/',
    'https://coinmarketcap.com/currencies/factom/',
    'https://coinmarketcap.com/currencies/maidsafecoin/',
    'https://coinmarketcap.com/currencies/electroneum/',
    'https://coinmarketcap.com/currencies/wax/',
    'https://coinmarketcap.com/currencies/decentraland/',
    'https://coinmarketcap.com/currencies/loom-network/',
    'https://coinmarketcap.com/currencies/waltonchain/',
    'https://coinmarketcap.com/currencies/crypto-com/',
    'https://coinmarketcap.com/currencies/qash/',
    'https://coinmarketcap.com/currencies/loopring/',
    'https://coinmarketcap.com/currencies/pivx/',
    'https://coinmarketcap.com/currencies/zcoin/',
    'https://coinmarketcap.com/currencies/aelf/',
    'https://coinmarketcap.com/currencies/waykichain/',
    'https://coinmarketcap.com/currencies/thorecoin/',
    'https://coinmarketcap.com/currencies/qubitica/',
    'https://coinmarketcap.com/currencies/moac/',
    'https://coinmarketcap.com/currencies/repo/',
    'https://coinmarketcap.com/currencies/power-ledger/',
    'https://coinmarketcap.com/currencies/kyber-network/',
    'https://coinmarketcap.com/currencies/wanchain/',
];

/**
 * Fetches a single page via cloudscraper's callback API and exposes the
 * outcome as a Promise.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<*>} Fulfils with the third callback argument (the page
 *   body) when the callback reports no error; rejects with the reported
 *   error otherwise.
 */
function scrapePage(url) {
    return new Promise((resolve, reject) => {
        const onResponse = (failure, _response, body) => {
            if (failure) {
                reject(failure);
                return;
            }
            resolve(body);
        };

        cloudscraper.get(url, onResponse);
    });
}

// Scrape the URLs strictly one after another, logging the index of each page
// once it has been fetched successfully.
// The loop stops at the first failure; the error is reported explicitly and
// reflected in the exit status instead of being left as an unhandled
// promise rejection.
(async () => {
    for (const [index, url] of urls.entries()) {
        await scrapePage(url);
        console.log(index);
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
ghost commented 5 years ago

@naorye I was able to reproduce this and we'll have a fix for this soon.

@codemanki I think we should handle the case of 429 Too Many Requests by respecting the Retry-After header and partially revert #163 to re-include the isChallengePresent check irrespective of statusCode but only if isCloudflare === true. This may have been the one and only exception but to be safe...

As a side note: Anorov/cloudflare-scrape has this bug too

ghost commented 5 years ago

@naorye The fix in #165 successfully scrapes all 100 of them. Can you confirm?

npm install git://github.com/pro-src/cloudscraper.git#164_too_many_requests

naorye commented 5 years ago

Actually I have 2000 :) I'll check it in two hours and will update.

Thanks!

ghost commented 5 years ago

@naorye I just thought to mention that your test case can be simplified by replacing:

/**
 * Fetches a single page via cloudscraper's callback API and exposes the
 * outcome as a Promise.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<*>} Fulfils with the third callback argument (the page
 *   body) when the callback reports no error; rejects with the reported
 *   error otherwise.
 */
function scrapePage(url) {
    return new Promise((resolve, reject) => {
        const onResponse = (failure, _response, body) => {
            if (failure) {
                reject(failure);
                return;
            }
            resolve(body);
        };

        cloudscraper.get(url, onResponse);
    });
}

With its equivalent:

/**
 * Fetches a single page by handing the URL straight to cloudscraper.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {*} Whatever `cloudscraper.get(url)` returns, unchanged.
 */
function scrapePage(url) {
    const pendingPage = cloudscraper.get(url);
    return pendingPage;
}

Edit: And if by chance you need the response object, try the resolveWithFullResponse: true option; more info is in request-promise's docs.

naorye commented 5 years ago

Scraping 2128/2128 [============================================================] 100%

Done successfully! Thanks!!

When do you plan to merge it to master branch?

ghost commented 5 years ago

Awesome! Once the PR is reviewed and if everything is good. (Probably less than 24H until a new NPM release) Thanks for your contribution!!!

naorye commented 5 years ago

You are awesome! 🥇

codemanki commented 5 years ago

Thank you guys for taking care of this. I will look through the PR tomorrow morning and will release a new version :)

codemanki commented 5 years ago

Done! 3.4.0 has been just published. Thank you @pro-src :)