ekalinin / robots.js

Parser for robots.txt for node.js
MIT License
66 stars 21 forks source link

init parser from robot.txt string or allow to serialize the parser #12

Closed scherler closed 10 years ago

scherler commented 10 years ago

Hi,

I am trying to reduce the number of requests for a domain's robots.txt. I am writing a crawler supervisor that should fetch the robots.txt of each domain only once and save the result in a Redis database. Later, whenever we need the parser, I read the stored result back and rebuild the parser from it. So I use the RobotsParser in the following way:

/**
 * Copy the own enumerable properties of `x` onto this parser, so that a
 * RobotsParser representation stored in redis can populate a new instance.
 *
 * The copy is shallow: nested values are assigned by reference, so objects
 * that came out of JSON.parse stay plain objects without any methods.
 *
 * @param {Object} x source object, e.g. the result of JSON.parse
 */
robots.RobotsParser.prototype.extend = function(x) {
  for (var key in x) {
    // Skip inherited enumerable properties; only x's own data is wanted.
    if (Object.prototype.hasOwnProperty.call(x, key)) {
      this[key] = x[key];
    }
  }
};

1) I create a parser and save it to Redis. 2) The next time, I fetch the stored result from Redis (reply) and do:

// Start from a fresh parser, then copy the stored fields onto it.
var parser = new robots.RobotsParser();
var stored = JSON.parse(reply);
parser.extend(stored);

3) then later I do something like

// Ask whether user agent '*' may fetch the path and print the answer.
parser.canFetch('*', '/es/success-stories', function (isAllowed) {
  console.log(isAllowed);
});

However, when I do this with the restored (cloned) parser, I get:

/node_modules/robots/lib/parser.js:340
      callback(self.defaultEntry.allowance(url), url, {
                                 ^
TypeError: Object #<Object> has no method 'allowance'
    at RobotsParser.canFetch

I would love a pair of functions like parser.toJson() and parser.fromJson("{...}"), or to be able to pass the robots.txt content directly to new robots.RobotsParser("robot.txtString").

Any tips on how to implement such a thing?

scherler commented 10 years ago

I came up with

/**
 * Rebuild a working RobotsParser from its plain-object (JSON) representation.
 *
 * Objects produced by JSON.parse carry data only, so the nested entries and
 * rules have no methods. This re-creates an `Entry` for `defaultEntry` and
 * for every item of `entries`, and a `Rule` for every item of an entry's
 * `rules`; every other property is copied over unchanged.
 *
 * @param {Object} parserToClone JSON representation of a prior created parser
 * @returns {RobotsParser} new parser populated from `parserToClone`
 */
var clone = exports.clone = function(parserToClone) {
  var parser = new RobotsParser();

  // Turn the serialized rules of one entry back into Rule instances.
  var parseRules = function(orgRules) {
    return Object.keys(orgRules || {}).map(function(key) {
      var orgRule = orgRules[key];
      return new Rule(orgRule.path, orgRule.allowance);
    });
  };

  // Turn one serialized entry back into an Entry instance.
  var parseEntry = function(value) {
    var entry = new Entry();
    Object.keys(value || {}).forEach(function(sub) {
      entry[sub] = sub === "rules" ? parseRules(value[sub]) : value[sub];
    });
    return entry;
  };

  Object.keys(parserToClone || {}).forEach(function(key) {
    var orgValue = parserToClone[key];
    if (key === "defaultEntry") {
      parser[key] = parseEntry(orgValue);
    } else if (key === "entries") {
      parser[key] = Object.keys(orgValue || {}).map(function(index) {
        return parseEntry(orgValue[index]);
      });
    } else {
      // Everything else is plain data and can be copied as-is.
      parser[key] = orgValue;
    }
  });

  return parser;
};

Will prepare a pull request.