ruipgil / scraperjs

A complete and versatile web scraper.
MIT License
3.71k stars 188 forks source link

scraperPromise.request should support custom function #17

Closed atian25 closed 6 years ago

atian25 commented 9 years ago

Sometimes we need to use the last scrape result to create the request options.

// Mocha patch: sugar aliases for the exclusive (.only) and skipped (.skip)
// variants of describe/it.
var ddescribe = describe.only;
var xdescribe = describe.skip;
var iit = it.only;
var xit = it.skip;

var expect = require('chai').expect;
var _ = require('lodash');
var URL = require('url');
var scraperjs = require('scraperjs');

// Feature-request illustration: chain two fetch/scrape rounds, building the
// second request from the result of the first scrape.
ddescribe('scraperjs', function(){
  it('should chain', function(done){
    var scraperPromise = scraperjs.StaticScraper.create();
    scraperPromise
      .get('http://echo.jsontest.com/k1/v1')
      .scrape(function($){
        return $.html();
      }, function(result){
        return result;
      })
      .request(function(result, utils){
        // FEATURE REQUEST: let request() accept a function that returns the
        // request options, so they can be derived from the previous result.
        return {
          url: 'http://echo.jsontest.com/k2/v1',
          method: 'post',
          json: {xxx: result}
        }
      })
      .scrape(function($) {
        return $.html()
      }, function(result, utils){
        // Open question: how can the first scrape result be obtained here?
        // Could utils.last be used to get the request result?
        result;
        done();
      });
  });
});
atian25 commented 9 years ago

After reading the source, I solved it by using async:

The question is whether the library should provide a sugar method, scraperPromise.request(optionFunction).

var expect = require('chai').expect;
var _ = require('lodash');
var URL = require('url');
var scraperjs = require('scraperjs');
var request = require('request');

// Workaround: issue the second request manually inside an async() step
// instead of relying on a function-valued request().
describe('scraperjs', function(){
  it('should chain', function(done){
    var scraperPromise = scraperjs.StaticScraper.create();
    scraperPromise
      .get('http://echo.jsontest.com/k1/v1')
      .scrape(function($){
        return $.html();
      }, function(result){
        return result;
      })
      .async(function(callback, utils) {
        // Read the value stored in utils.lastReturn before it is overwritten
        // below (presumably the first scrape result -- confirm against the
        // scraperjs docs).
        var result = utils.lastReturn;
        request({
          url: 'http://echo.jsontest.com/k2/' + result,
          method: 'get'
        }, function(error, response, body){
          // Keep both values so the next step can read them from
          // utils.lastReturn. Note that `error` is not checked here.
          utils.lastReturn = {
            first: result,
            last: body
          };
          callback();
        });
      })
      .then(function(utils){
        // utils.lastReturn now holds the { first, last } object set above.
        var result = utils.lastReturn;
        done();
      })
  });
});
ruipgil commented 9 years ago

It could be resolved by "unrelaxing" the promise chain. Right now the request and get promises can be declared at any point in the chain and will be executed first. The API would look like this,

scraper
  .request(..) // or get
  .scrape(..)
  .request(..) // or get
  .scrape(..)

I'll look into it.

atian25 commented 9 years ago

After researching all of the source, I found that I may have been wrong.

    // Intended flow: fetch the first URL and scrape it, then fetch the
    // second URL and scrape that.
    var scraperPromise = scraperjs.StaticScraper.create();
    scraperPromise
      .get('http://echo.jsontest.com/k1/v1')
      .scrape(function($){
        return $.html();
      })
      .get('http://echo.jsontest.com/k2/v2')
      .scrape(function($){
        return $.html();
      })

What I want the code to do is: get url1 -> scrape -> get url2 -> scrape. But what actually happens is: get url1 -> exec scrape1 -> exec scrape2 -> get url2 -> exec scrape1 -> exec scrape2

so:

  // Register the processing handlers.
  scraperPromise.scrape(fn, fn).onStatusCode(fn).onError(fn);
  // Start the work.
  scraperPromise.get(url);

**The documentation is misleading.**

So we don't need to use the last scrape result to create the request options; we can just create another scraper. The documentation should also separate the two kinds of methods (handler-registration methods vs. trigger methods).

Correct me if I am wrong.