matthewmueller / x-ray

The next web scraper. See through the <html> noise.
MIT License
5.87k stars 349 forks source link

Xray(...).abort is not a function #270

Closed Raidus closed 5 years ago

Raidus commented 7 years ago

Subject of the issue

The documentation says that the abort method accepts a callback function with two arguments. I've tried a minimal example but it didn't work. I'm not sure how to use the "abort" function.

Could someone provide a minimal example of how to use this function?

I've tried the following code to understand the abort method, but I'm getting an error.

'use strict';

// Minimal reproduction: tries to register an abort callback and then scrape
// a paginated listing into results.json.
var Xray = require('x-ray');
// NOTE: `.abort(...)` is chained directly onto the value returned by `Xray()`.
// This is the expression named in the reported error
// "TypeError: Xray(...).abort is not a function".
var x = Xray().abort((result, next) => {
  console.log('result', result);
  console.log('next', next);
  return false;
});

// Select every `li.group` on the page and collect a title and an image
// attribute from each, then chain paginate / limit / write on the result.
x('https://dribbble.com', 'li.group', [
  {
    title: '.dribbble-img strong',
    image: '.dribbble-img [data-src]@data-src'
  }
])
  .paginate('.next_page@href')
  .limit(3)
  .write('results.json');

Error:

Error TypeError: Xray(...).abort is not a function

My environment

Raidus commented 7 years ago

I couldn't figure out how the abort function works, but in the meantime I came up with my own solution. It's not really straightforward, but at least it works :-)

const rp = require('request-promise');
const Xray = require('x-ray');

// x-ray instance with one custom filter, `correctURI`, which is applied to the
// next-page link selector in getPage.
const x = Xray({
  filters: {
    // Prefix string values with the amazon.de origin; anything that is not a
    // string (e.g. a missing link) is passed through untouched.
    correctURI(value) {
      if (typeof value !== 'string') {
        return value;
      }
      return `https://www.amazon.de${value}`;
    }
  }
});

// Upper bound on the number of pages scrapeNpages will request.
const MAX_PAGES = 3;

/**
 * Download one page with request-promise and scrape it with x-ray.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Object>} Resolves with the scraped object, which has the
 *   keys `pagnResult` (one `{ asin }` entry per result item) and
 *   `pagnNextLink` (next-page link run through the `correctURI` filter).
 *   Rejects if the download fails or x-ray reports an error.
 */
const getPage = url =>
  rp({ url }).then(html => {
    const schema = {
      pagnResult: x(html, '.s-result-item.celwidget', [
        {
          asin: '@data-asin'
        }
      ]),
      pagnNextLink: '#pagnNextLink@href | correctURI'
    };
    const scrape = x(html, schema);

    // Adapt x-ray's node-style callback to a promise.
    return new Promise((resolve, reject) => {
      scrape((err, result) => {
        if (err) {
          reject(err);
        } else {
          resolve(result);
        }
      });
    });
  });

/**
 * Placeholder stop condition checked after each scraped page.
 *
 * @returns {boolean} `true` to stop paginating; currently always `false`.
 */
function abort() {
  // Replace with some useful condition.
  const shouldStop = false;
  return shouldStop;
}

/**
 * Scrape up to MAX_PAGES consecutive search-result pages for a keyword.
 *
 * Pages are fetched one after another because each page supplies the link to
 * the next one. Scraping stops early when `abort()` returns true, when a page
 * has no next-page link, or when fetching a page fails. In the failure case
 * the error is logged and the results gathered so far are returned.
 *
 * @param {string} book - Search keywords; URL-encoded into the first request.
 * @returns {Promise<Array>} The `pagnResult` entries of every scraped page,
 *   flattened into a single array in page order.
 */
async function scrapeNpages(book) {
  const results = [];
  let nextUrl = `https://www.amazon.de/s?&field-keywords=${encodeURIComponent(
    book
  )}`;

  // `nextUrl` becomes falsy on the last page; without this guard the loop
  // would go on to request `undefined`.
  for (let page = 0; page < MAX_PAGES && nextUrl; ++page) {
    try {
      const result = await getPage(nextUrl);
      results.push(result.pagnResult);
      nextUrl = result.pagnNextLink;
      if (abort()) break;
    } catch (err) {
      console.log(err);
      // The next link is only known from a successful scrape, so there is
      // nothing left to follow after a failure.
      break;
    }
  }
  return [].concat(...results);
}

// Run the scraper for a sample query and print the collected results.
scrapeNpages('harry potter').then(books => {
  console.log(books);
});
dfcowell commented 6 years ago

In case anyone else comes up against this problem (I just came back to this repo after many months out of the scraping world and the "abort" method was my work), here's a working sample from my implementation of abort:

var xray = require('x-ray'),
    x = xray(),
    moment = require('moment');

/**
 * Build a paginated x-ray scraper for the reviews at `data.url` that stops
 * paginating once a page contains a review dated before today.
 *
 * The configured chain is returned so the caller can actually run it (for
 * example by invoking it with a callback or calling `.write()` on it).
 * Previously the chain was built and then discarded.
 *
 * @param {{url: string}} data - Object whose `url` is the first page to scrape.
 * @returns {*} The value returned by `.abort(...)`, i.e. the configured
 *   x-ray chain.
 */
function scrape(data) {
    return x(data.url, '.review', [{
        title: '.review-title',
        content: '.review-text',
        id: '@id',
        rating: '.review-rating',
        date: '.review-date',
        reviewer: {
            name: '.author',
            id: '.author@href'
        }
    }])
        .paginate('.a-pagination .a-last a@href')
        // The validator receives the page's scraped results and a URL
        // (unused here); returning true aborts pagination.
        .abort((result, url) => {
            for (const review of result) {
                // Dates are scraped as e.g. "on January 5, 2021"; strip the prefix.
                const dateStr = review.date.replace('on ', '');
                const date = moment(dateStr, 'MMMM D, YYYY');

                if (date.isBefore(moment().startOf('day'))) {
                    return true;
                }
            }

            return false;
        });
}

If this doesn't work, make sure you have the correct version of x-ray installed. :)