luin / readability

📚 Turn any web page into a clean view
2.49k stars 312 forks source link

Memory is not freed after article.close() #44

Open masihyeganeh opened 9 years ago

masihyeganeh commented 9 years ago

Memory usage

Readability or jsdom uses a huge amount of RAM (10+ MB) to parse a small web page (500 KB) and never frees the memory it used. This prevents us from using node-readability in our web scraper.

I'm not sure whether this is caused by jsdom. If it is, would there be an easy way to replace jsdom with cheerio? It would be great if there were a config option for that.

My environment

var read = require('node-readability');

/**
 * Fetches a fixed news page through `read` and closes the resulting article
 * as soon as the callback fires. Called repeatedly by the timers below so
 * that memory usage can be observed across many parse/close cycles.
 *
 * On a fetch error the process is terminated with exit status 1.
 */
function useNodeReadability() {
    read('http://farsnews.com/newstext.php?nn=13930926000105', function(error, article, meta) {
        if (error) {
            // Log the underlying error as well, so the failure can be diagnosed.
            console.error('Fetch Error', error);
            // A failed fetch is not a successful run: exit with a non-zero status.
            process.exit(1);
            // Make it explicit that `article` is never touched on the error path.
            return;
        }

        console.log('Readability work done here');
        // Close the article; whether this lets its memory be reclaimed is
        // exactly what this script is meant to show.
        article.close();
    });
}

// Print a snapshot of the process memory counters once per second.
function reportMemoryUsage() {
    var snapshot = process.memoryUsage();
    console.log(snapshot);
}
setInterval(reportMemoryUsage, 1000);

// Run one fetch/parse/close cycle every five seconds.
setInterval(useNodeReadability, 5000);

Expected result

Memory should be freed after each execution

Actual result

Heap and RSS memory are increasing

My results:

{ rss: 93995008, heapTotal: 74054656, heapUsed: 44526608 }
{ rss: 94306304, heapTotal: 74054656, heapUsed: 44886264 }
{ rss: 94310400, heapTotal: 74054656, heapUsed: 44932392 }
{ rss: 94310400, heapTotal: 74054656, heapUsed: 44940960 }
{ rss: 94576640, heapTotal: 75074560, heapUsed: 45400320 }
{ rss: 94986240, heapTotal: 76106496, heapUsed: 45895440 }
{ rss: 95064064, heapTotal: 76106496, heapUsed: 46026496 }
{ rss: 95072256, heapTotal: 76106496, heapUsed: 46059808 }
Readability work done here
{ rss: 107675648, heapTotal: 86389760, heapUsed: 61067240 }
{ rss: 107819008, heapTotal: 86389760, heapUsed: 61295024 }
{ rss: 107819008, heapTotal: 86389760, heapUsed: 61302864 }
{ rss: 107819008, heapTotal: 86389760, heapUsed: 61313048 }
{ rss: 107925504, heapTotal: 86389760, heapUsed: 61485680 }
{ rss: 107933696, heapTotal: 86389760, heapUsed: 61532520 }
{ rss: 108036096, heapTotal: 86389760, heapUsed: 61665536 }
Readability work done here
{ rss: 115023872, heapTotal: 90505472, heapUsed: 56241112 }
{ rss: 115023872, heapTotal: 90505472, heapUsed: 56259632 }
{ rss: 115052544, heapTotal: 90505472, heapUsed: 56660656 }
{ rss: 115130368, heapTotal: 90505472, heapUsed: 56715088 }
{ rss: 115376128, heapTotal: 90505472, heapUsed: 56893912 }
Readability work done here
{ rss: 115843072, heapTotal: 90505472, heapUsed: 63118632 }
{ rss: 115843072, heapTotal: 90505472, heapUsed: 63210512 }
{ rss: 115851264, heapTotal: 90505472, heapUsed: 63264552 }
Readability work done here
{ rss: 116563968, heapTotal: 92569344, heapUsed: 69165648 }
{ rss: 116572160, heapTotal: 92569344, heapUsed: 69255816 }
{ rss: 116936704, heapTotal: 92569344, heapUsed: 69453480 }
Readability work done here
{ rss: 120549376, heapTotal: 95653120, heapUsed: 74474080 }
...

thanks

luin commented 9 years ago

Hi! I modified your code to invoke gc() before measuring memory usage, and the "leak" is fixed. I think it's just because V8's garbage collection didn't kick in.

It's possible to use cheerio instead of jsdom and I will work on it this week.

Here is the code:

var read = require('./src/readability');

/**
 * Fetches a fixed news page through `read` and closes the resulting article
 * as soon as the callback fires. Called repeatedly by the timer below so
 * that memory usage can be observed across many parse/close cycles.
 *
 * On a fetch error the process is terminated with exit status 1.
 */
function useNodeReadability() {
  read('http://farsnews.com/newstext.php?nn=13930926000105', function(error, article, meta) {
    if (error) {
      // Log the underlying error as well, so the failure can be diagnosed.
      console.error('Fetch Error', error);
      // A failed fetch is not a successful run: exit with a non-zero status.
      process.exit(1);
      // Make it explicit that `article` is never touched on the error path.
      return;
    }

    console.log('Readability work done here');
    // Close the article so that a later collection can show whether its
    // memory is reclaimed.
    article.close();
  });
}

// Once per second: force a garbage collection, then print the memory
// counters, so the numbers reflect memory that is still held after a
// collection. `gc` is only defined when node is started with --expose-gc.
function collectThenReport() {
  gc();
  var snapshot = process.memoryUsage();
  console.log(snapshot);
}
setInterval(collectThenReport, 1000);

// Start a new fetch/parse/close cycle every second.
setInterval(useNodeReadability, 1000);

Here is the result:

node-readablity git:master ❯ node --expose-gc memory.js
{ rss: 95911936, heapTotal: 74054656, heapUsed: 32973616 }
{ rss: 92536832, heapTotal: 70958848, heapUsed: 32188488 }
{ rss: 92807168, heapTotal: 70958848, heapUsed: 31538008 }
{ rss: 91865088, heapTotal: 70958848, heapUsed: 29624024 }
{ rss: 90894336, heapTotal: 68894976, heapUsed: 29717464 }
{ rss: 90894336, heapTotal: 68894976, heapUsed: 29694872 }
{ rss: 90906624, heapTotal: 68894976, heapUsed: 30046536 }
{ rss: 90390528, heapTotal: 68894976, heapUsed: 30022824 }
{ rss: 90906624, heapTotal: 68894976, heapUsed: 30026312 }
{ rss: 90906624, heapTotal: 68894976, heapUsed: 29951448 }
{ rss: 90316800, heapTotal: 68894976, heapUsed: 29947728 }
{ rss: 90943488, heapTotal: 68894976, heapUsed: 30016232 }
{ rss: 90177536, heapTotal: 68894976, heapUsed: 29731560 }
{ rss: 90943488, heapTotal: 68894976, heapUsed: 29731504 }
{ rss: 90943488, heapTotal: 68894976, heapUsed: 29858584 }
{ rss: 90251264, heapTotal: 68894976, heapUsed: 29854184 }
Readability work done here
{ rss: 100114432, heapTotal: 77114368, heapUsed: 36983144 }
{ rss: 97644544, heapTotal: 74042624, heapUsed: 36669056 }
{ rss: 97677312, heapTotal: 74042624, heapUsed: 32258792 }
{ rss: 94117888, heapTotal: 70946816, heapUsed: 31517080 }
{ rss: 93282304, heapTotal: 69926912, heapUsed: 30693072 }
{ rss: 92876800, heapTotal: 69926912, heapUsed: 30666992 }
{ rss: 93290496, heapTotal: 69926912, heapUsed: 30291792 }
Readability work done here
{ rss: 99975168, heapTotal: 76094464, heapUsed: 36653544 }
{ rss: 98095104, heapTotal: 74042624, heapUsed: 36299872 }
{ rss: 98455552, heapTotal: 74042624, heapUsed: 32254072 }
{ rss: 94498816, heapTotal: 69926912, heapUsed: 31272872 }
{ rss: 94265344, heapTotal: 69926912, heapUsed: 30434600 }
{ rss: 93818880, heapTotal: 69926912, heapUsed: 30645680 }
Readability work done here
{ rss: 101142528, heapTotal: 77114368, heapUsed: 36876512 }
{ rss: 97959936, heapTotal: 74042624, heapUsed: 32215192 }
{ rss: 95494144, heapTotal: 70946816, heapUsed: 32491976 }
{ rss: 95969280, heapTotal: 70946816, heapUsed: 31518840 }
{ rss: 94175232, heapTotal: 69926912, heapUsed: 30679464 }
{ rss: 93573120, heapTotal: 69926912, heapUsed: 30605224 }
{ rss: 92794880, heapTotal: 69926912, heapUsed: 30211272 }
{ rss: 93577216, heapTotal: 69926912, heapUsed: 30192224 }
{ rss: 93601792, heapTotal: 69926912, heapUsed: 30377776 }
{ rss: 92954624, heapTotal: 69926912, heapUsed: 30370184 }
{ rss: 93626368, heapTotal: 69926912, heapUsed: 30396936 }
{ rss: 93626368, heapTotal: 69926912, heapUsed: 30677936 }
{ rss: 93626368, heapTotal: 69926912, heapUsed: 30690752 }
{ rss: 93626368, heapTotal: 69926912, heapUsed: 30720816 }
{ rss: 93630464, heapTotal: 69926912, heapUsed: 30839056 }
{ rss: 93634560, heapTotal: 69926912, heapUsed: 30857640 }
{ rss: 93634560, heapTotal: 69926912, heapUsed: 30857880 }
{ rss: 93634560, heapTotal: 69926912, heapUsed: 30450040 }
{ rss: 92991488, heapTotal: 69926912, heapUsed: 30497912 }
{ rss: 93376512, heapTotal: 68894976, heapUsed: 30771344 }
{ rss: 93380608, heapTotal: 68894976, heapUsed: 30836104 }
Readability work done here
{ rss: 101642240, heapTotal: 77114368, heapUsed: 37034992 }
{ rss: 98238464, heapTotal: 74042624, heapUsed: 32222168 }
{ rss: 95629312, heapTotal: 70946816, heapUsed: 32405536 }
{ rss: 96325632, heapTotal: 70946816, heapUsed: 31658664 }
{ rss: 94560256, heapTotal: 69926912, heapUsed: 30827824 }
{ rss: 93474816, heapTotal: 68894976, heapUsed: 30798544 }
{ rss: 93499392, heapTotal: 68894976, heapUsed: 30772608 }
{ rss: 93114368, heapTotal: 69926912, heapUsed: 30766664 }
{ rss: 93523968, heapTotal: 69926912, heapUsed: 30770896 }
{ rss: 93552640, heapTotal: 69926912, heapUsed: 30859096 }
{ rss: 93179904, heapTotal: 70958848, heapUsed: 30852856 }
{ rss: 93601792, heapTotal: 70958848, heapUsed: 30834992 }
{ rss: 93646848, heapTotal: 71990784, heapUsed: 30940136 }
{ rss: 93659136, heapTotal: 71990784, heapUsed: 30947448 }
{ rss: 93671424, heapTotal: 71990784, heapUsed: 30956032 }
{ rss: 93675520, heapTotal: 71990784, heapUsed: 30556752 }
{ rss: 93233152, heapTotal: 73022720, heapUsed: 30700192 }
{ rss: 93749248, heapTotal: 73022720, heapUsed: 30608880 }
{ rss: 93216768, heapTotal: 74054656, heapUsed: 30612440 }
{ rss: 93777920, heapTotal: 74054656, heapUsed: 30428480 }
{ rss: 93380608, heapTotal: 75086592, heapUsed: 30683944 }
{ rss: 93925376, heapTotal: 75086592, heapUsed: 30912720 }
{ rss: 93925376, heapTotal: 75086592, heapUsed: 30913408 }
{ rss: 93949952, heapTotal: 75086592, heapUsed: 30960192 }
{ rss: 93982720, heapTotal: 75086592, heapUsed: 30995944 }
{ rss: 94027776, heapTotal: 75086592, heapUsed: 31090280 }
{ rss: 94126080, heapTotal: 75086592, heapUsed: 31040504 }
{ rss: 93790208, heapTotal: 76118528, heapUsed: 31029536 }
Readability work done here
{ rss: 102293504, heapTotal: 80210176, heapUsed: 37298736 }
Readability work done here
Readability work done here
{ rss: 110424064, heapTotal: 84325888, heapUsed: 39052136 }
Readability work done here
{ rss: 109940736, heapTotal: 84325888, heapUsed: 44442608 }
{ rss: 107823104, heapTotal: 81254144, heapUsed: 39483320 }
{ rss: 101552128, heapTotal: 75074560, heapUsed: 34321984 }
{ rss: 99885056, heapTotal: 74054656, heapUsed: 31560392 }
{ rss: 96325632, heapTotal: 71990784, heapUsed: 30611856 }
{ rss: 96911360, heapTotal: 71990784, heapUsed: 30764144 }
{ rss: 96403456, heapTotal: 73022720, heapUsed: 30677512 }