segment-boneyard / nightmare

A high-level browser automation library.
https://open.segment.com

Why is one Nightmare session seemingly cannibalizing the performance of others? #1198

Closed zeluspudding closed 6 years ago

zeluspudding commented 6 years ago

I have a scraping job I'd like to multithread because I have several thousand urls to test. The code below 1) reads in a csv with my url targets, 2) chunks those targets, 3) distributes the chunks to nightmare sessions, which 4) visit each url in their chunk after logging into a website. Finally, each worker 5) writes its results to csv. The script seems to work as desired, except that one worker always scrapes most of its allotted urls (say 35 of 40) while the others don't (say 8 of 40). The same behavior appears whether I have 2 workers or 15. Why?
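(For context, here is a minimal sketch of steps 2–3 in isolation, assuming chunkArray simply slices the list into fixed-size pieces — the helper bodies are omitted in the full snippet further down — and with the actual scraping stubbed out:)

var async = require('async');

// Hypothetical stand-in for the chunkArray helper used below:
// split an array into fixed-size slices.
function chunkArray(myArray, chunk_size) {
  var chunks = [];
  for (var i = 0; i < myArray.length; i += chunk_size) {
    chunks.push(myArray.slice(i, i + chunk_size));
  }
  return chunks;
}

var chunked_urls = chunkArray(['u1', 'u2', 'u3', 'u4', 'u5'], 2); // [['u1','u2'], ['u3','u4'], ['u5']]

// mapLimit runs at most `workers` iteratee calls at a time;
// each call here stands in for one nightmare session working through its chunk.
async.mapLimit(chunked_urls, 2, function (chunk, done) {
  // a real worker would log in and visit every url in `chunk` here
  done(null, chunk.length);
}, function (err, counts) {
  if (err) return console.error(err);
  console.log(counts); // e.g. [ 2, 2, 1 ]
});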

At first I thought it was because the first worker would finish and then somehow terminate the other sessions. But that doesn't seem likely, since other sessions keep saving their csv results up to a minute after the first one is done. What's more, each session gets its own memory space... so that can't be it.

In general, running multiple workers in any application starves all of them of resources. But if that were the issue here, I'd expect all the workers to have similar throughput... not one worker with high throughput and the others with very little.

Here's something weird: workers scrape the same urls multiple times. I'm not sure why, but the rate of duplication seems to increase with the number of workers... duplicating work and totally wasting scrape cycles. In some cases I've had the same url scraped 170 times.
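(One sanity check, sketched under the assumption that repeats could already be present in url_targets.csv, would be to dedupe the list before chunking:)

// hypothetical guard: drop duplicate urls before handing them to chunkArray
var uniqueUrls = [...new Set(urls)];
var chunked_urls = chunkArray(uniqueUrls, chunk_size);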

What am I doing wrong?

var Nightmare = require('nightmare')
var vo = require('vo')
const fs = require('fs');
var async = require('async');
var csv = require('fast-csv');

function transpose(a) {...}
function make_csv(arr) {...}
function parseDate(arr, Nth) {...}
function getRandomInt(min, max) {...}
function chunkArray(myArray, chunk_size){...}

var urls = []
var chunk_size = 40; // number of urls scraped per nightmare session
var workers = 2; // number of async nightmare sessions
var targets_path = 'url_targets.csv';

fs.createReadStream(targets_path)
  .pipe(csv())
  .on('data',function (data){
    // Do stuff with data as it loads
    urls.push(data.toString())
  })
  .on('end', function (data) {
    // Do stuff with data once it's entirely loaded
    var chunked_urls = chunkArray(urls, chunk_size);
    // distribute chunks of urls (i.e. 40 at a time) across the nightmare workers (i.e. 2)
    try {
      async.mapLimit(chunked_urls, workers, vo(main), function (e, result) {
        console.error(e)
      });
    } catch (e) {
      console.error(e)
      console.log('here2');
    }
  })

function* main(urls) {
  var nightmare = new Nightmare({
    show: false
  })
  try {
    // Go to landing page to press "accept" button
    nightmare // to yield or not to yield?
      .goto('https://landingpage_with_login.com') 
      .wait('input[name="submit"][value="Login"]')
      .click('input[name="submit"][value="Login"]')
      .wait('#search_results')
    // if login succeeded, serially visit the urls that were chunked to this worker
    try {
      var results = []
      for (var i = 0; i < urls.length; i++) {
        try {
          var table_exists = yield nightmare
            .wait(getRandomInt(3, 15) * 1000) // give the server breathing room.
            .goto(urls[i])
            .wait('#right_column')
            .wait(500)
            .exists('#searchResultsTable')
          if (table_exists) {
            // extract/parse thing1 of interest
            try { 
              var thing1 = yield nightmare
              .evaluate(function() {
                return [...document.querySelectorAll('#resultsTable tbody')]
                .map(el => el.innerText.substring(10));
              })
              // console.log(thing1);
            } catch (e) {
              console.error(e)
              console.log('here4');
            }
            try {   // extract/parse thing2 of interest
              var thing2 = yield nightmare
              .evaluate(function() {
                return [...document.querySelectorAll('#searchResultsTable tbody td a')]
                .map(el => el.href);
              })
              // console.log(thing2);
            } catch (e) {
              console.error(e)
              console.log('here5');
            }
            try { // Collect results to append to csv later
              var payload = [thing1, thing2, Array(thing1.length).fill(urls[i])];
              results.push(transpose(payload)); // reorient results for csv
            } catch (e) {
              console.error(e);
              console.log('here9');
            }
          }
          // Check if record doesn't exist
          else if (yield nightmare.exists('.noResultsMessage')) {
            try {
              results.push([[[''],[''],[''], [urls[i]]]])
            } catch (e) {
              console.error(e);
              console.log('here7');
            }
          }
          else { 
            try {
              results.push([[[''],[''],[''], [urls[i]], ['fetch error: unknown']]])
            } catch (e) {
              console.error(e);
              console.log('here8');
            }
          }
        } catch(e) {
          console.log('here3 - ');
          console.error(e)
        }
      }
      // Append CSV to file
      try {
        var flatResultsArray = [].concat(...results) // flatten array
        // Save data across several files so as not to form an io bottleneck
        var fileName = '.\\scrape\\mortgage' + getRandomInt(1, 50).toString() + '.csv'
        fs.appendFile(fileName, make_csv(flatResultsArray) + '\n', function () {
          return null
        });
      } catch (e) {
        console.error(e);
        console.log('here10');
      }
    } catch (e) {
      // Handle else exception
      console.error(e)
      console.log('here11');
    }
  } catch(e) {
    console.error(e)
    console.log('total_failure');
  }
  yield nightmare.end();
}
matthewmueller commented 6 years ago

Hi @zeluspudding, nothing in your code looks really off. Without having everything set up on my end, it's a bit hard to help you with the problem. I would definitely yield up front before starting on each URL.
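Roughly, something like this for the login chain (a sketch based only on the snippet above; the selectors and landing-page url are copied from it):

function* main(urls) {
  var nightmare = new Nightmare({ show: false })
  // yield the login navigation before entering the per-url loop,
  // so any failure here surfaces immediately instead of racing the loop
  yield nightmare
    .goto('https://landingpage_with_login.com')
    .wait('input[name="submit"][value="Login"]')
    .click('input[name="submit"][value="Login"]')
    .wait('#search_results')
  // ...then visit urls[i] serially as in the original snippet...
  yield nightmare.end()
}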

If you're still having this problem, please reopen with some additional details on how we can run this ourselves. Thanks!