refactor audio loading - Githubissues

skeydan commented 1 year ago

As discussed with @dfalbel, this should involve evaluation of the following ideas:

- R-wise, keep the option of having several backends, but go with a single one for now
- also throw out the external library
- while cleaning up, make backend-specific loaders private
- while cleaning up, think of more descriptive function names
- choose a backend prioritizing performance and robustness (e.g., what happens when files have the wrong extension)

skeydan commented 1 year ago


library(bench)
library(tidyverse)

# Fetch the sample recording used throughout this script and derive an MP3
# version of it; both files live in the session's temporary directory.
url <- "http://www.physics.uio.no/pow/wavbirds/chaffinch.wav"
wav <- fs::path(tempdir(), "chaffinch.wav")
download.file(
  url,
  destfile = wav,
  # WAV is binary data: the default text mode ("w") corrupts it on Windows.
  mode = "wb"
)
# The value returned by av_audio_convert() is used as the MP3 path below.
mp3 <- av::av_audio_convert(wav, fs::path(tempdir(), "chaffinch.mp3"))

############################ usage ############################

# --- WAV, read with tuneR: printing the Wave object shows its header info ---
w <- tuneR::readWave(wav)
w
  # Wave Object
  # Number of Samples:      1864548
  # Duration (seconds):     42.28
  # Samplingrate (Hertz):   44100
  # Channels (Mono/Stereo): Stereo
  # PCM (integer format):   TRUE
  # Bit (8/16/24/32/64):    16

# --- WAV, read with av: duration comes from the media info ---
av::av_media_info(wav)$duration
# 42.28
# The object returned by read_audio_bin() carries `channels` and `sample_rate`
# attributes (see the recorded output of attributes() below).
w <- av::read_audio_bin(wav)
attributes(w)
# $channels
# [1] 2
#
# $sample_rate
# [1] 44100
# Total length counts the samples of both channels together ...
length(w)
# 3729096
# ... so dividing by the channel count gives the per-channel sample count,
# which matches the 1864548 samples reported by tuneR above.
length(w)/attr(w, "channels")
# 1864548

# --- MP3, read with tuneR: slightly more samples than the WAV (1866240) ---
m <- tuneR::readMP3(mp3)
m
  # Wave Object
  # Number of Samples:      1866240
  # Duration (seconds):     42.32
  # Samplingrate (Hertz):   44100
  # Channels (Mono/Stereo): Stereo
  # PCM (integer format):   TRUE
  # Bit (8/16/24/32/64):    16

# --- MP3, via av: the media info reports a duration close to tuneR's 42.32 ---
av::av_media_info(mp3)$duration
# 42.31837

# ... but the decoded audio has 1864548 samples per channel: the same count as
# the WAV, and fewer than the 1866240 samples tuneR returns for the MP3.
m <- av::read_audio_bin(mp3)
attributes(m)
length(m)/attr(m, "channels")
# 1864548

# Cross-check the MP3 length with SoX. The path is shell-quoted because
# tempdir() may contain spaces or other characters the shell would interpret.
system(paste("soxi", shQuote(mp3)))
# Duration       : 00:00:42.32 = 1866224 samples = 3173.85 CDDA sectors

# ffprobe -show_format
# chaffinch.wav:
# Duration: 00:00:42.28
# chaffinch.mp3:
# Duration: 00:00:42.32, start: 0.025057

# It seems like ffmpeg strips off the superfluous samples.
# The numbers don't quite match, though - unless the end is trimmed as well:
# samples covered by the reported start offset ...
(sup_samples <- 44100 * 0.025057) # 1105.014
# ... versus the difference between tuneR's and av's MP3 sample counts.
(diff_samples <- 1866240 - 1864548) # 1692

############################ performance ############################

# Draw a ridge plot of benchmark timings, one ridge per benchmarked expression.
#
# df: a benchmark result as produced by mark() (see the calls below).
# Returns the plot object with a minimal theme applied.
viz_benchmark <- function(df) {
  # Turn the expression labels into a factor ordered (descending) by `min`.
  # NOTE(review): `min` is presumably the minimum-time column of the benchmark
  # result, picked up through mutate()'s data masking - confirm.
  relabelled <- mutate(
    df,
    expression = forcats::fct_reorder(
      as.character(expression),
      min,
      .desc = TRUE
    )
  )
  # The result is converted back with as_bench_mark() before plotting.
  autoplot(as_bench_mark(relabelled), type = "ridge") + theme_minimal()
}

# Time WAV decoding with tuneR vs. av. `check = FALSE` because the two readers
# return different kinds of objects (a Wave object vs. a vector with
# attributes), so their results cannot be compared for equality.
compare_wav <- mark(
  tuneR::readWave(wav),
  av::read_audio_bin(wav),
  iterations = 100,
  check = FALSE
)
viz_benchmark(compare_wav)

# Same comparison for MP3 decoding.
compare_mp3 <- mark(
  tuneR::readMP3(mp3),
  av::read_audio_bin(mp3),
  iterations = 100,
  check = FALSE
)
# theme() takes named theme elements, not the name of a theme; the complete
# "void" theme is applied with theme_void().
viz_benchmark(compare_mp3) +
  theme_void()

############################ error handling ############################

notawav <- fs::path(tempdir(), "iamnotawav.wav")
notanmp3 <- fs::path(tempdir(), "iamnotanmp3.mp3")

# Create the mislabelled fixtures so this section can be run on its own.
# Which content goes where is inferred from the recorded output below:
# `notanmp3` is reported as pcm_s16le with a duration of 42.28 (the WAV data),
# and `notawav` is compared against the MP3 read, so it gets the MP3 data.
stopifnot(file.copy(wav, notanmp3, overwrite = TRUE))
stopifnot(file.copy(mp3, notawav, overwrite = TRUE))

# no error!!
nm <- tuneR::readMP3(notanmp3)
# Wave Object
# Number of Samples:      19227264
# Duration (seconds):     435.99

# This call is expected to fail; try() reports the error without aborting the
# rest of the script when it is sourced. `nw` is reassigned further down.
nw <- try(tuneR::readWave(notawav))
# Error in readBin(con, int, n = 4, size = 1, endian = "little", signed = FALSE) %*%  : non-conformable arguments

###

# no error/warning, but correct content is returned
nm <- av::read_audio_bin(notanmp3)
length(nm)/attr(nm, "channels")
# 1864548
av::av_media_info(notanmp3)
# $duration
# [1] 42.28
#
# $video
# NULL
#
# $audio
# channels sample_rate     codec frames bitrate     layout
# 1        2       44100 pcm_s16le     NA 1411200 2 channels

# Count the samples that agree with a regular read of the WAV file.
w <- av::read_audio_bin(wav)
(w == nm) %>% sum()

# same here: no warning, but content correct
nw <- av::read_audio_bin(notawav)
# Report the length of the object just read (`nw`), not `nm` from above.
length(nw)/attr(nw, "channels")
# 1864548

# Count the samples that agree with a regular read of the MP3 file.
m <- av::read_audio_bin(mp3)
(m == nw) %>% sum()

skeydan commented 1 year ago

https://github.com/mlverse/torchaudio/pull/62

mlverse / torchaudio

refactor audio loading #61