
robots.js

robots.js is a parser for robots.txt files for node.js.

Installation

It's recommended to install via npm:

$ npm install -g robots
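
When robots.js is used as a dependency of your own project rather than as a command-line tool, a plain local install works just as well:

$ npm install robots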

Usage

Here's an example of using robots.js:

var robots = require('robots')
  , parser = new robots.RobotsParser();

parser.setUrl('http://nodeguide.ru/robots.txt', function(parser, success) {
  if(success) {
    parser.canFetch('*', '/doc/dailyjs-nodepad/', function (access) {
      if (access) {
        // parse url
      }
    });
  }
});

The default crawler user-agent is:

Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/20100101 Firefox/5.0

Here's an example of using a different user-agent and a more detailed callback:

var robots = require('robots')
  , parser = new robots.RobotsParser(
                'http://nodeguide.ru/robots.txt',
                'Mozilla/5.0 (compatible; RobotTxtBot/1.0)',
                after_parse
            );

function after_parse(parser, success) {
  if(success) {
    parser.canFetch('*', '/doc/dailyjs-nodepad/', function (access, url, reason) {
      if (access) {
        console.log(' url: '+url+', access: '+access);
        // parse url ...
      }
    });
  }
}

Here's an example of getting a list of sitemaps:

var robots = require('robots')
  , parser = new robots.RobotsParser();

parser.setUrl('http://nodeguide.ru/robots.txt', function(parser, success) {
  if(success) {
    parser.getSitemaps(function(sitemaps) {
      // sitemaps is an array of sitemap URLs
    });
  }
});
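
The sitemaps argument is a plain array of the Sitemap: URLs found in the file, so inside the success branch above it can be used directly, for example:

parser.getSitemaps(function(sitemaps) {
  sitemaps.forEach(function(url) {
    console.log('sitemap: ' + url);   // each entry is a sitemap URL string
  });
});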

Here's an example of getCrawlDelay usage:

var robots = require('robots')
  , parser = new robots.RobotsParser();

// for example:
//
// $ curl -s http://nodeguide.ru/robots.txt
//
// User-agent: Google-bot
// Disallow: /
// Crawl-delay: 2
//
// User-agent: *
// Disallow: /
// Crawl-delay: 2

parser.setUrl('http://nodeguide.ru/robots.txt', function(parser, success) {
  if(success) {
    var GoogleBotDelay = parser.getCrawlDelay("Google-bot");
    // ...
  }
});
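
The value returned by getCrawlDelay is the Crawl-delay from robots.txt, expressed in seconds, so it can be used to space out requests. Below is a minimal sketch under that assumption; the page list and the console.log stand in for a real crawl:

var robots = require('robots')
  , parser = new robots.RobotsParser();

var pages = ['/doc/a/', '/doc/b/', '/doc/c/'];   // pages we intend to fetch (illustrative)

parser.setUrl('http://nodeguide.ru/robots.txt', function(parser, success) {
  if (success) {
    // Crawl-delay is given in seconds; fall back to 0 if none is set
    var delay = parser.getCrawlDelay('Google-bot') || 0;
    pages.forEach(function(path, i) {
      // schedule the requests "delay" seconds apart
      setTimeout(function() {
        console.log('fetching ' + path);   // replace with a real HTTP request
      }, i * delay * 1000);
    });
  }
});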

An example of passing options to the HTTP request:

var options = {
  headers: {
    Authorization: "Basic " + Buffer.from("username:password").toString("base64")
  }
};

var robots = require('robots')
  , parser = new robots.RobotsParser(null, options);

parser.setUrl('http://nodeguide.ru/robots.txt', function(parser, success) {
  // ...
});

API

RobotsParser is the main class. It provides a set of methods to read, parse, and answer questions about a single robots.txt file.
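
If the robots.txt content is already in memory (fetched or cached elsewhere), it can be handed to the parser directly. The sketch below assumes a parse(lines) method that accepts the file split into lines, in the spirit of Python's robotparser; this method is not shown in the examples above, so check your installed version before relying on it:

var robots = require('robots')
  , parser = new robots.RobotsParser();

// parse(lines) is assumed here (not demonstrated in the examples above)
parser.parse([
  'User-agent: *',
  'Disallow: /private/'
]);

parser.canFetch('*', '/private/secret.html', function (access) {
  console.log('access: ' + access);   // expected: false
});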

License

See LICENSE file.

Resources