Closed iliiliiliili closed 4 years ago
Experiencing the same error. Looks like new Captcha is in place
Cloudflare is blocking the requests. The http client has to be updated :(
An alternative is to replace axios with puppeteer by setting the user agent in scraper.js `const axios = require('axios'); const axiosCloudflare = require('axios-cloudflare'); axiosCloudflare(axios); const mapper = require('./mapper'); const puppeteer = require('puppeteer'); var userAgent = require('user-agents'); const POSTS_PER_PAGE = 10; const BASE_POSTS_URL = 'https://m.9gag.com/v1/group-posts/group/default/type/'; const BASE_COMMENTS_URL = 'https://comment-cdn.9gag.com/v1/topComments.json?appId=a_dd8f2b7d304a10edaf6f29517ea0ca4100a43d1b&urls=http%3A%2F%2F9gag.com%2Fgag%2F';
/**
 * Fetches 9GAG posts for one section, page by page, and optionally attaches
 * the top comments of every post.
 *
 * Relies on names defined outside this class in the same file: `axios`,
 * `mapper`, `puppeteer`, `userAgent`, `POSTS_PER_PAGE`, `BASE_POSTS_URL` and
 * `BASE_COMMENTS_URL`.
 */
class Scraper {
  /**
   * @param {number} postCount - Number of posts to return; must be positive.
   * @param {string} [section='hot'] - Section name appended to the posts URL.
   * @param {number} [commentCount=0] - Comments requested per post; 0 skips
   *   the comment requests entirely.
   * @throws {Error} When postCount is not positive or commentCount is negative.
   */
  constructor(postCount, section = 'hot', commentCount = 0) {
    if (postCount <= 0) throw new Error('Post count must be positive');
    if (commentCount < 0) throw new Error('Comment count cannot be negative');
    this.postCount = postCount;
    this.section = section;
    this.commentCount = commentCount;
  }

  /**
   * Builds the URL of one page of posts.
   *
   * @param {string} [lastPostId] - Id of the last post already fetched; when
   *   given it is sent as the `id` query parameter.
   * @returns {string} The posts URL for this scraper's section.
   */
  postsUrl(lastPostId) {
    let url = BASE_POSTS_URL + this.section;
    if (lastPostId) url += '?id=' + lastPostId;
    return url;
  }

  /**
   * Builds the top-comments URL for a post.
   *
   * @param {string} postId - Id of the post whose comments are requested.
   * @returns {string} The comments URL, ordered by score.
   */
  commentsUrl(postId) {
    return `${BASE_COMMENTS_URL}${postId}&order=score&commentL1=${this.commentCount}&commentL2=1`;
  }

  /**
   * Loads `url` in a headless browser and parses the JSON shown on the page.
   *
   * The JSON text is read from the first `<pre>` element of the rendered page.
   *
   * @param {string} url - URL that responds with JSON.
   * @returns {Promise<*>} The parsed JSON payload.
   * @throws Rejects when the browser fails to launch or navigate, when the
   *   page has no `<pre>` element, or when its text is not valid JSON.
   */
  async getPost9gag(url) {
    const browser = await puppeteer.launch({ headless: true });
    try {
      const page = await browser.newPage();
      // `userAgent` holds the class exported by the user-agents package; a
      // user-agent string only comes from an instance, not from the class.
      const UserAgent = userAgent;
      await page.setUserAgent(new UserAgent().toString());
      await page.goto(url);
      const text = await page.evaluate(
        () => document.getElementsByTagName('pre')[0].outerText
      );
      return JSON.parse(text);
    } finally {
      // Always release the browser process, including on failure.
      await browser.close();
    }
  }

  /**
   * Scrapes up to `postCount` posts, following pagination from `lastPostId`.
   *
   * @param {string} [lastPostId] - Last scrapped post id
   * @returns {Promise<Array>} At most `postCount` posts; fewer when a page
   *   comes back empty before enough posts were collected.
   */
  async scrap(lastPostId) {
    let result = [];
    let cursor = lastPostId;
    const pages = Math.ceil(this.postCount / POSTS_PER_PAGE);
    for (let i = 0; i < pages; i++) {
      // To fetch through the headless browser instead of axios, replace the
      // two statements below with:
      //   let response = await this.getPost9gag(this.postsUrl(cursor));
      let response = await axios.get(this.postsUrl(cursor), {
        headers: { 'X-Requested-With': 'XMLHttpRequest' },
      });
      response = response.data;
      const posts = mapper.readPosts(response);
      // An empty page has no last post to paginate from, so stop here
      // instead of failing on posts[-1].
      if (posts.length === 0) break;
      if (this.commentCount) {
        for (const post of posts) {
          // NOTE(review): readComments receives the whole axios response,
          // while readPosts receives response.data - confirm against mapper.
          response = await axios.get(this.commentsUrl(post.id));
          post.comments = mapper.readComments(response);
        }
      }
      cursor = posts[posts.length - 1].id;
      result = result.concat(posts);
    }
    return result.slice(0, this.postCount);
  }
}
module.exports = Scraper; `
@luiscruzga followed your puppeteer suggestion :)
@iliiliiliili thank you for reporting the issue, version 3.1.0 fixes the problem
I don't have much time to maintain this but I can accept pull requests for a faster fix
Hello guys, I've updated to 9gag version 3.1.0. When I try to use the scraper I get
"ReferenceError: HttpClient is not defined". Could you please help me fix it? I'm a newbie, btw :)
const httpClient = new HttpClient();
await httpClient.init();
const scraper = new Scraper(httpClient, 10, 'hot', 0);``
Hello guys, I've updated to 9gag version 3.1.0. When I'm trying to use the scrapper I'm getting "ReferenceError: HttpClient is not defined". Could you pls help me to fix? I'm newbie btw :)
const httpClient = new HttpClient(); await httpClient.init(); const scraper = new Scraper(httpClient, 10, 'hot', 0);``
I tried passing axios as follows
const scraper = new Scraper(axios, 10, 'hot', 0)
Now it fails with Error:
Request failed with status code 403
I get the captcha response in data
@hareendras I forgot to export the new HttpClient
I've updated the README and bumped the package version
Hello @ruial, thank you for fixing that. I'm using this on Heroku, and it looks like puppeteer.launch requires the arguments --no-sandbox and --disable-setuid-sandbox to work there. I created a pull request too, but I could not test it because I don't have a working local instance of my app. Attached is the current error I'm getting on the Heroku instance
@hareendras I've merged the changes and everything should be working now.
I'm curious about how you guys use this package :) I created it initially because I had limited mobile data and browsing 9gag consumed too much, so I saved a page offline to view during commute
@ruial. Yey! now it works. Check this bot I have created using this. :) Live FB messenger link source
Hello!
Starting from 26.02.2020 (today) I am receiving 403 errors (both from a PC on Wi-Fi and from a phone with Termux on 4G) while trying to use this package. Possibly it's because of a captcha. Error text below.
`Error: Request failed with status code 403 at createError (D:\FILES\$JAVASCRIPT_PROJECTS\Node.js\node_modules\axios\lib\core\createError.js:16:15) at settle (D:\FILES\$JAVASCRIPT_PROJECTS\Node.js\node_modules\axios\lib\core\settle.js:18:12) at IncomingMessage.handleStreamEnd (D:\FILES\$JAVASCRIPT_PROJECTS\Node.js\node_modules\axios\lib\adapters\http.js:201:11) at IncomingMessage.emit (events.js:215:7) at endReadableNT (_stream_readable.js:1184:12) at processTicksAndRejections (internal/process/task_queues.js:80:21) { config: { adapter: [Function: httpAdapter], transformRequest: { '0': [Function: transformRequest] }, transformResponse: { '0': [Function: transformResponse] }, timeout: 0, xsrfCookieName: 'XSRF-TOKEN', xsrfHeaderName: 'X-XSRF-TOKEN', maxContentLength: -1, validateStatus: [Function: validateStatus], headers: { Accept: 'application/json, text/plain, /', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': 'axios/0.18.0' }, method: 'get', url: 'https://m.9gag.com/v1/group-posts/group/default/type/trending', data: undefined }, request: ClientRequest { _events: [Object: null prototype] { socket: [Function], abort: [Function], aborted: [Function], error: [Function], timeout: [Function], prefinish: [Function: requestOnPrefinish] }, _eventsCount: 6, _maxListeners: undefined, outputData: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedConnection: false, _removedContLen: false, _removedTE: false, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: TLSSocket { _tlsOptions: [Object], _secureEstablished: true, _securePending: false, _newSessionPending: false, _controlReleased: true, _SNICallback: null, servername: 'm.9gag.com', alpnProtocol: false, authorized: true, authorizationError: null, encrypted: true, _events: [Object: null prototype], _eventsCount: 9, connecting: false, _hadError: false, _parent: null, _host: 
'm.9gag.com', _readableState: [ReadableState], readable: true, _maxListeners: undefined, _writableState: [WritableState], writable: false, allowHalfOpen: false, _sockname: null, _pendingData: null, _pendingEncoding: '', server: undefined, _server: null, ssl: [TLSWrap], _requestCert: true, _rejectUnauthorized: true, parser: null, _httpMessage: [Circular],
}, response: { status: 403, statusText: 'Forbidden', headers: { date: 'Wed, 26 Feb 2020 21:09:50 GMT', 'content-type': 'text/html; charset=UTF-8', 'transfer-encoding': 'chunked', connection: 'close', 'cf-chl-bypass': '1', 'set-cookie': [Array], 'cache-control': 'no-cache', 'x-frame-options': 'SAMEORIGIN', 'expect-ct': 'max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"', server: 'cloudflare', 'cf-ray': '56b4e6fff8bcd895-CPH' }, config: { adapter: [Function: httpAdapter], transformRequest: [Object], transformResponse: [Object], timeout: 0, xsrfCookieName: 'XSRF-TOKEN', xsrfHeaderName: 'X-XSRF-TOKEN', maxContentLength: -1, validateStatus: [Function: validateStatus], headers: [Object], method: 'get', url: 'https://m.9gag.com/v1/group-posts/group/default/type/trending', data: undefined }, request: ClientRequest { _events: [Object: null prototype], _eventsCount: 6, _maxListeners: undefined, outputData: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedConnection: false, _removedContLen: false, _removedTE: false, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: [TLSSocket], connection: [TLSSocket], _header: 'GET /v1/group-posts/group/default/type/trending HTTP/1.1\r\n' + 'Accept: application/json, text/plain, /\r\n' + 'X-Requested-With: XMLHttpRequest\r\n' + 'User-Agent: axios/0.18.0\r\n' + 'Host: m.9gag.com\r\n' + 'Connection: close\r\n' + '\r\n', _onPendingData: [Function: noopPendingOutput], agent: [Agent], socketPath: undefined, method: 'GET', path: '/v1/group-posts/group/default/type/trending', _ended: true, res: [IncomingMessage], aborted: false, timeoutCb: null, upgradeOrConnect: false, parser: null, maxHeadersCount: null, _redirectable: [Writable],
} }`