Crawler is a web spider written in Node.js. It gives you the full power of jQuery on the server to parse a large number of pages as they are downloaded, asynchronously.
var Crawler = require("node-webcrawler");
var url = require('url');
var jsdom = require('jsdom');

var c = new Crawler({
    maxConnections: 10,
    jQuery: jsdom,
    // This will be called for each crawled page
    callback: function (error, result, $) {
        // $ is Cheerio by default:
        // a lean implementation of core jQuery designed specifically for the server
        if (error) {
            console.log(error);
        } else {
            try {
                console.log($("title").text());
                $('a').each(function (index, a) {
                    var toQueueUrl = $(a).prop('href');
                    // console.log(toQueueUrl);
                    c.queue(toQueueUrl);
                });
            } catch (e) {
                console.log(e);
            }
        }
    }
});

c.queue('http://www.wandoujia.com/');
With this code, memory usage climbs to about 800 MB after around 10 minutes.

Am I doing something wrong, or is this a problem with the module itself?

Thanks.
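For context on why memory might grow: the callback above queues every href it finds on every crawled page, so on a large site the pending queue itself can grow without bound. As a point of comparison, here is a minimal sketch (not the module's documented approach; the seen map, the same-host check, and the absolute-URL filter are my own assumptions added for this test) that queues each URL at most once and only follows links on the start host:

var Crawler = require("node-webcrawler");
var url = require('url');

var startHost = url.parse('http://www.wandoujia.com/').host;
var seen = {};  // URLs already queued (assumption: an in-memory map is acceptable for this test)

var c = new Crawler({
    maxConnections: 10,
    callback: function (error, result, $) {
        if (error) {
            return console.log(error);
        }
        console.log($("title").text());
        $('a').each(function (index, a) {
            var href = $(a).attr('href');
            // This sketch only follows absolute http(s) links on the start host,
            // and queues each URL at most once, so the queue stays bounded.
            if (!href || !/^https?:\/\//.test(href)) { return; }
            if (url.parse(href).host !== startHost) { return; }
            if (seen[href]) { return; }
            seen[href] = true;
            c.queue(href);
        });
    }
});

c.queue('http://www.wandoujia.com/');

If memory still climbs to hundreds of MB with the queue bounded like this, that would point more toward the module itself (or toward jsdom, if the jQuery: jsdom option is kept) than toward the queueing logic.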