ghost opened this issue 3 years ago
I was able to use the hyperdrive API directly to import this archive, so I'm pretty sure this memory issue is in hyp, in one of the layers on top of hyperdrive that hyp uses, or in how hyp traverses the files.
This script imports the archive with very little memory use:
```js
#!/usr/bin/env node
var Hyperdrive = require('hyperdrive')
var fs = require('fs')
var path = require('path')
var pump = require('pump')
var minimist = require('minimist')
var argv = minimist(process.argv.slice(2))
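// 'import' walks a local directory and copies every non-hidden file into a hyperdrive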
if (argv._[0] === 'import') {
  var hdir = argv.o || '.hyperdrive'
  var drive = new Hyperdrive(hdir)
  var cursors = [argv._[1]]
  var count = { files: 0, directories: 1 }
  var errors = []
  var prevLines = 0
  var iv = setInterval(print, 1000)
  function error(err) {
    errors.push({ time: Date.now(), error: err })
  }
  function print() {
    var clear = '\x1b[K'
    for (var i = 0; i < prevLines; i++) {
      clear += '\x1b[1A\x1b[K'
    }
    prevLines = 1 + errors.length
    var nFile = fmt(count.files).padStart(9)
    var nDir = fmt(count.directories).padStart(9)
    console.log(`${clear}${nFile} files, ${nDir} directories`)
    errors.forEach(function (e) {
      console.log('Error: ' + e.error.message)
    })
    errors = errors.filter(e => Date.now() - e.time < 5*60_000) // show for 5 minutes
  }
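  // drain the cursor queue one directory at a time: only the current directory's
  // entries are in flight at once, which is what keeps memory use low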
  ;(function next() {
    if (cursors.length === 0) {
      clearInterval(iv)
      print()
      return
    }
    var c = cursors.shift()
    fs.readdir(c, function (err, files) {
      if (err) {
        error(err)
        return next()
      }
      files = files.filter(file => !/^\./.test(file))
      var pending = 1 + files.length
      files.forEach(function (f) {
        var file = path.join(c,f)
        fs.stat(file, function (err, stat) {
          if (err) {
            error(err)
            if (--pending === 0) next()
            return
          } else if (stat.isDirectory()) {
            drive.mkdir(file, function (err) {
              if (err) error(err)
              else {
                count.directories++
                cursors.push(file)
              }
              if (--pending === 0) next()
            })
          } else {
            fs.readFile(file, function (err, src) {
              if (err) {
                error(err)
                if (--pending === 0) next()
                return
              }
              drive.writeFile(file, src, function (err) {
                if (err) error(err)
                else count.files++
                if (--pending === 0) next()
              })
            })
          }
        })
      })
      if (--pending === 0) next()
    })
  })()
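// 'addr' prints the drive's public key as a hyper:// URL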
} else if (argv._[0] === 'addr') {
  var drive = new Hyperdrive(argv.o || argv._[1] || '.hyperdrive')
  drive.once('ready', function () {
    console.log(`hyper://${drive.key.toString('hex')}`)
  })
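// 'share' announces the drive on the swarm and replicates it to connecting peers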
} else if (argv._[0] === 'share') {
  var drive = new Hyperdrive(argv.o || argv._[1] || '.hyperdrive')
  drive.once('ready', function () {
    console.log(`hyper://${drive.key.toString('hex')}`)
    var swarm = require('hyperswarm')()
    swarm.join(drive.discoveryKey, { lookup: true, announce: true })
    swarm.on('connection', (socket, info) => {
      socket.on('error', err => console.error(err))
      pump(socket, drive.replicate(info.client), socket)
    })
    swarm.on('error', err => console.error(err))
  })
}
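// add underscore separators every three digits, e.g. 2600000 -> 2_600_000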
function fmt(x) {
  return x.toString()
    .split('')
    .reverse()
    .join('')
    .replace(/(\d{3})/g,'$1_')
    .split('')
    .reverse()
    .join('')
    .replace(/^_/,'')
}
```
When I try to seed a directory with 99GB of data spread across 2.6 million files using hyp, I get a memory allocation error after 2 minutes. Watching `htop` the whole time, the highest I saw the memory usage go was up to 10% on this machine with 60GB of memory. To reproduce, the data is also seeded to /ipfs/QmVCYUK51Miz4jEjJxCq3bA6dfq5FXD6s2EYp6LjHQhGmh
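To reproduce with the script above, something like this should work (assuming it's saved as `import.js`; the output paths `data` and `archive.hyperdrive` are just placeholders):

```sh
# fetch the test data from IPFS (~99GB across 2.6 million files)
ipfs get QmVCYUK51Miz4jEjJxCq3bA6dfq5FXD6s2EYp6LjHQhGmh -o data

# import it into a hyperdrive, print its address, then seed it
node import.js import ./data -o archive.hyperdrive
node import.js addr -o archive.hyperdrive
node import.js share -o archive.hyperdrive
```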