MTG / essentia.js

JavaScript library for music/audio analysis and processing powered by Essentia WebAssembly
https://essentia.upf.edu/essentiajs
GNU Affero General Public License v3.0
645 stars 42 forks source link

PitchContoursMultiMelody #31

Open gpeles opened 4 years ago

gpeles commented 4 years ago

Hello,

I'm trying to use the PitchContoursMultiMelody algorithm to implement real-time multi pitch detection. I'm following the recommended processing chain mentioned in the description of the PitchContours algorithm, which seems to work fine, but after I pass the outputs to the PitchContoursMultiMelody the audio worklet seems to freeze either immediately or after a couple of seconds (no errors are thrown). I'm unsure why this happens and would appreciate any help.

Note that I do try to explicitly delete the C++ objects every time the worklet process runs, as suggested by the Emscripten documentation (in case that's a memory issue), but this doesn't seem to make any difference (I don't have much experience with Emscripten so I might have got that wrong).

See my code below:

import { EssentiaModule } from '/essentia-wasm.module.js'
import Essentia from '/essentia.js-core.es.js'

/**
 * Audio worklet processor that runs an Essentia.js analysis chain on
 * overlapping 2048-sample frames assembled from successive input blocks.
 *
 * Sixteen frame buffers are kept, each staggered by one block, so one frame
 * becomes full on every `process` call once all buffers are active. Each full
 * frame is reduced to pitch-salience peaks, which are aggregated; after 128
 * aggregated frames the batch is passed through PitchContours and
 * PitchContoursMultiMelody and the results are logged.
 *
 * Every vector returned by Essentia is released explicitly with `.delete()`
 * (in `finally` blocks, so they are also released when a step throws).
 */
class AnalyserProcessor extends AudioWorkletProcessor {
  constructor() {
    super()
    this.essentia = new Essentia(EssentiaModule)
    this.initFrames()
    this.initAggregate()
  }

  /**
   * Creates the 16 staggered frame buffers.
   *
   * Frame `i` starts at offset `-i - 1`; offsets are incremented once per
   * `process` call and a frame only receives samples once its offset is
   * non-negative, so frame `i` starts filling `i` calls after frame 0.
   */
  initFrames() {
    this.frames = []
    for (let i = 0; i < 16; i++) {
      this.frames[i] = {
        array: new Float32Array(2048),
        offset: -i - 1,
      }
    }
  }

  /**
   * (Re)creates the empty aggregate of salience peak bins and values.
   * The previous vectors, if any, must be deleted by the caller first.
   */
  initAggregate() {
    this.aggregate = {
      bins: new this.essentia.module.VectorVectorFloat(),
      saliences: new this.essentia.module.VectorVectorFloat(),
      length: 0,
    }
  }

  /**
   * Copies the current input block into every active frame buffer and
   * analyses each buffer that has just been filled.
   *
   * @param {Float32Array[][]} inputs - only `inputs[0][0]` is read (mono is assumed).
   * @param {Float32Array[][]} outputs - unused.
   * @param {Object} parameters - unused.
   * @returns {boolean} always `true`.
   */
  process(inputs, outputs, parameters) {
    // assume mono
    const input = inputs[0] && inputs[0][0]

    // Nothing to copy when the first input has no channel data; skip this
    // call instead of passing `undefined` to `Float32Array.prototype.set`.
    if (!input) {
      return true
    }

    for (const frame of this.frames) {
      frame.offset += 1

      // Negative offsets mean this frame has not started filling yet.
      if (frame.offset < 0) {
        continue
      }

      frame.offset %= 16
      // Each block is written at a 128-sample stride within the frame.
      frame.array.set(input, frame.offset * 128)

      // The 16th block completes the 2048-sample frame.
      if (frame.offset === 15) {
        this.analyseFrame(frame.array)
      }
    }

    return true
  }

  /**
   * Runs the per-frame chain (EqualLoudness -> Windowing -> Spectrum ->
   * SpectralPeaks -> PitchSalienceFunction -> PitchSalienceFunctionPeaks),
   * appends the resulting peaks to the aggregate, and flushes the aggregate
   * once it holds 128 entries.
   *
   * @param {Float32Array} samples - one full frame of samples.
   */
  analyseFrame(samples) {
    const essentia = this.essentia

    // Every vector handle created below is recorded here so that all of them
    // are released in `finally`, whichever way this method exits.
    const owned = []
    const track = handle => {
      owned.push(handle)
      return handle
    }

    try {
      // start processing chain
      const signal = track(essentia.arrayToVector(samples))
      const eqloud = track(essentia.EqualLoudness(signal).signal)
      const windowed = track(
        essentia.Windowing(eqloud, true, 2048, 'hann', 4, true).frame
      )
      const spectrum = track(essentia.Spectrum(windowed).spectrum)
      const spectralPeaks = essentia.SpectralPeaks(spectrum)
      track(spectralPeaks.frequencies)
      track(spectralPeaks.magnitudes)

      // check sizes before converting to arrays
      if (
        !spectralPeaks.frequencies.size()
        || !spectralPeaks.magnitudes.size()
      ) {
        return
      }

      // check peaks before salience function: skip frames containing a
      // non-positive frequency or a negative magnitude
      const freq = essentia.vectorToArray(spectralPeaks.frequencies)
      const mag = essentia.vectorToArray(spectralPeaks.magnitudes)
      if (freq.some(f => f <= 0) || mag.some(m => m < 0)) {
        return
      }

      // pitch salience
      // NOTE: spreading Object.values relies on the result object's property
      // order matching the parameter order of the next algorithm.
      const salienceFunction = track(
        essentia.PitchSalienceFunction(
          ...Object.values(spectralPeaks)
        ).salienceFunction
      )
      const salienceFunctionPeaks =
        essentia.PitchSalienceFunctionPeaks(salienceFunction)
      track(salienceFunctionPeaks.salienceBins)
      track(salienceFunctionPeaks.salienceValues)

      this.aggregate.bins.push_back(salienceFunctionPeaks.salienceBins)
      this.aggregate.saliences.push_back(salienceFunctionPeaks.salienceValues)
      this.aggregate.length++

      if (this.aggregate.length === 128) {
        this.flushAggregate()
      }
    } finally {
      // Release in reverse order of creation.
      for (let i = owned.length - 1; i >= 0; i--) {
        owned[i].delete()
      }
    }
  }

  /**
   * Passes the aggregated salience peaks through PitchContours and
   * PitchContoursMultiMelody, logs both results, then releases the result
   * vectors and the aggregate and starts a fresh aggregate.
   */
  flushAggregate() {
    let pitchContours
    let pitchContoursMultiMelody

    try {
      pitchContours = this.essentia.PitchContours(
        this.aggregate.bins, this.aggregate.saliences
      )

      console.log('PitchContours', pitchContours)

      // NOTE: relies on the property order of `pitchContours` matching the
      // parameter order of PitchContoursMultiMelody.
      pitchContoursMultiMelody =
        this
        .essentia
        .PitchContoursMultiMelody(...Object.values(pitchContours))
        .pitch

      console.log('MultiMelody', pitchContoursMultiMelody)
    } finally {
      // Release whatever was created, then reset the aggregate so the next
      // batch starts empty even if one of the algorithms threw.
      if (pitchContoursMultiMelody) {
        pitchContoursMultiMelody.delete()
      }
      if (pitchContours) {
        pitchContours.contoursStartTimes.delete()
        pitchContours.contoursSaliences.delete()
        pitchContours.contoursBins.delete()
      }
      this.aggregate.bins.delete()
      this.aggregate.saliences.delete()
      this.initAggregate()
    }
  }
}

// Register the processor class under the name 'AnalyserProcessor'.
registerProcessor('AnalyserProcessor', AnalyserProcessor)
dbogdanov commented 4 years ago

PitchContours* melody algorithms aren't designed for real-time, because they require a segment of audio to gather pitch contours and apply statistics. You can compute the chain up to PitchSalienceFunctionPeaks in real-time though.

For real-time Pitch detection we have a number of PitchYin* algorithms, but those are suited for monophonic signals.

gpeles commented 4 years ago

Thanks for the reply!

What is the minimum duration of the required audio segment?

In the code above I'm trying to implement a hybrid approach, so it's not really 'real-time'. The salience function peaks are calculated in real time using a frame size of 2048 and a hop size of 128 as suggested in the recommended processing chain. But I then store the resulting bins and saliences in two VectorVectorFloat objects (in this.aggregate), and wait to have 128 of those before passing them to the PitchContours and PitchContoursMultiMelody algorithms (then, I reset this.aggregate and repeat).

The PitchContours algorithm seems to work fine and generates output. The chain seems to also work well if I replace PitchContoursMultiMelody with PitchContoursMonoMelody. Now I've tried to increase the size of the aggregate from 128 to 256. This works for a bit longer, but also eventually freezes. The same happens if I increase it to 512: it works for some time but eventually freezes.

dbogdanov commented 4 years ago

We'll have to look at the freezes. @albincorreya

dbogdanov commented 4 years ago

If the contours look fine, you can try working on shorter segments and see.