jhudsl / ari

:dancers: The Automated R Instructor
https://jhudatascience.org/ari/
Other
146 stars 37 forks source link

Bug in `ari_stitch()` #39

Closed howardbaek closed 1 year ago

howardbaek commented 1 year ago

When I run ari_stitch() on the images/audio generated from this test set of Google slides, I get a video file where the last slide is missing.

For example, if I have 6 PNG images and 6 WAV audio files, I only get 5 of them weaved in the final output.

Reproducible Example:

utils.R

pad_wav <- function(wav, duration = NULL) {
  # See if wav inherits from "Wave" class
  is_Wave <- inherits(wav, "Wave")
  if (is_Wave) {
    wav <- list(wav)
  }
  if (is.null(duration)) {
    duration <- rep(NA, length(wav))
  }
  stopifnot(length(duration) == length(wav))
  # Iterate over wav and find "ideal duration"
  duration <- map2_int(.x = wav, .y = duration,
                       .f = function(wav, dur) {
                         ideal_duration <- ceiling(length(wav@left) / wav@samp.rate)
                         if (!is.na(dur)) {
                           ideal_duration <- max(ideal_duration, dur)
                         }
                         ideal_duration
                       })
  # Iterate over wav and create end_wav that binds to existing wav
  out_wav <- map2(.x = wav,
                  .y = duration,
                  .f = function(wav, ideal_duration) {
                    left <- rep(0, wav@samp.rate * ideal_duration - length(wav@left))
                    right <- numeric(0)
                    if (wav@stereo) {
                      right <- left
                    }
                    end_wav <- tuneR::Wave(
                      left = left,
                      right = right,
                      bit = wav@bit,
                      samp.rate = wav@samp.rate, 
                      pcm = wav@pcm
                    )
                    wav <- tuneR::bind(wav, end_wav)
                    wav
                  })

  if (is_Wave) {
    out_wav <- out_wav[[1]]
  }

  return(out_wav)
}

match_sample_rate <- function(audio, verbose = TRUE) {
  if (inherits(audio, "Wave")) {
    return(audio)
  }
  # iterate over audio and extract sampling rate
  sample_rate <- sapply(audio, function(r) r@samp.rate)

  if (!all(sample_rate == sample_rate[[1]]) && verbose) {
    message("enforcing same sample rate, using minimum")
  }
  # get minimum sampling rate
  sample_rate <- min(sample_rate, na.rm = TRUE)
  if (verbose) {
    message(paste0("Sample rate downsampled to ", sample_rate))
  }
  # downsample wave object to sample_rate
  audio <- lapply(audio, function(x) {
    if (x@samp.rate == sample_rate) {
      return(x)
    }
    tuneR::downsample(x, samp.rate = sample_rate)
  })
  # iterate over audio and extract out sampling rate 
  sample_rate <- sapply(audio, function(r) r@samp.rate)
  # check if all the values in sample_rate are equal to the first value in sample_rate
  stopifnot(all(sample_rate == sample_rate[[1]]))

  return(audio)
}

# get random string
get_random_string <- function() {
  paste(sample(c(seq(10), letters, LETTERS),
               size = 12, replace = TRUE
  ), collapse = "")
}

wav_length <- function(wav) {
  stopifnot(is_Wave(wav))
  length(wav@left) / wav@samp.rate
}

is_Wave <- function(x) {
  identical(suppressWarnings(as.character(class(x))), "Wave")
}

set_encoders.R

get_os <- function() {
  sys_info <- Sys.info()
  os <- tolower(sys_info[["sysname"]])
  return(os)
}

#' Set Default Audio and Video Codecs
#'
#' @param codec The codec to use or get for audio/video.  Uses the
#' `ffmpeg_audio_codec` and `ffmpeg_video_codec` options
#' to store this information.
#' @seealso [ffmpeg_codecs()] for options
#' @return A `NULL` output
#'
#'
#' @rdname codecs
#' @export
#'
#' @examples
#' \dontrun{
#' if (have_ffmpeg_exec()) {
#'   print(ffmpeg_version())
#'   get_audio_codec()
#'   set_audio_codec(codec = "libfdk_aac")
#'   get_audio_codec()
#'   set_audio_codec(codec = "aac")
#'   get_audio_codec()
#' }
#' if (have_ffmpeg_exec()) {
#'   get_video_codec()
#'   set_video_codec(codec = "libx265")
#'   get_video_codec()
#'   set_video_codec(codec = "libx264")
#'   get_video_codec()
#' }
#' ## empty thing
#' if (have_ffmpeg_exec()) {
#'   video_codec_encode("libx264")
#'
#'   audio_codec_encode("aac")
#' }
#' }
set_audio_codec <- function(codec) {
  if (missing(codec)) {
    os <- get_os()
    codec <- switch(os,
                    darwin = "libfdk_aac",
                    windows = "ac3",
                    linux = "aac"
    )
  }
  options(ffmpeg_audio_codec = codec)
}

#' @export
#' @rdname codecs
set_video_codec <- function(codec = "libx264") {
  options(ffmpeg_video_codec = codec)
}

#' @export
#' @rdname codecs
get_audio_codec <- function() {
  codec <- getOption("ffmpeg_audio_codec")
  if (is.null(codec)) {
    os <- get_os()
    res <- ffmpeg_audio_codecs()
    if (is.null(res)) {
      fdk_enabled <- FALSE
    } else {
      fdk_enabled <- grepl("fdk", res[res$codec == "aac", "codec_name"])
    }
    if (fdk_enabled) {
      os_audio_codec <- "libfdk_aac"
    } else {
      os_audio_codec <- "aac"
    }
    codec <- switch(os,
                    darwin = os_audio_codec,
                    windows = "ac3",
                    linux = "aac"
    )
    set_audio_codec(codec = codec)
  }
  return(codec)
}

#' @export
#' @rdname codecs
get_video_codec <- function() {
  codec <- getOption("ffmpeg_video_codec")
  if (is.null(codec)) {
    codec <- "libx264"
    set_video_codec(codec = codec)
  }
  return(codec)
}

#' @rdname codecs
#' @export
audio_codec_encode <- function(codec) {
  res <- ffmpeg_audio_codecs()
  if (is.null(res)) {
    warning("Codec could not be checked")
    return(NA)
  }
  stopifnot(length(codec) == 1)
  res <- res[res$codec %in% codec |
               grepl(codec, res$codec_name), ]
  res$encoding_supported
}

#' @rdname codecs
#' @export
video_codec_encode <- function(codec) {
  res <- ffmpeg_video_codecs()
  if (is.null(res)) {
    warning("Codec could not be checked")
    return(NA)
  }
  stopifnot(length(codec) == 1)
  res <- res[res$codec %in% codec |
               grepl(codec, res$codec_name), ]
  res$encoding_supported
}

ffmpeg_codecs.R

#' Get Codecs for ffmpeg
#'
#' @return A `data.frame` of codec names and capabilities
#' @export
#'
#' @examples
#' \dontrun{
#' if (ffmpeg_version_sufficient()) {
#'   ffmpeg_codecs()
#'   ffmpeg_video_codecs()
#'   ffmpeg_audio_codecs()
#' }
#' }
ffmpeg_codecs <- function() {
  ffmpeg <- ari::ffmpeg_exec(quote = TRUE)
  cmd <- paste(ffmpeg, "-codecs")
  result <- system(cmd, ignore.stderr = TRUE, ignore.stdout = TRUE)
  res <- system(cmd, intern = TRUE, ignore.stderr = TRUE)
  res <- trimws(res)
  if (length(res) == 0) {
    res <- ""
  }
  if (result != 0 & all(res %in% "")) {
    warning("No codecs output from ffmpeg for codecs")
    return(NULL)
  }
  # extract elements of res that start with either a period or "D"
  res <- res[grepl("^([.]|D)", res)]
  # split by " "
  res <- strsplit(res, " ")
  res <- t(vapply(res, function(x) {
    # trims any leading or trailing white space
    x <- trimws(x)
    # removes any empty strings in each element
    x <- x[x != ""]
    # concatenate the elements that come after the second
    if (length(x) >= 3) {
      x[3:length(x)] <- paste(x[3:length(x)], collapse = " ")
    }
    # return the first 3 elements
    return(x[seq(3)])
  }, FUN.VALUE = character(3)))
  # name the 3 columns
  colnames(res) <- c("capabilities", "codec", "codec_name")
  # convert matrix to dataframe
  res <- as.data.frame(res, stringsAsFactors = FALSE)

  if (nrow(res) == 0) {
    warning("No codecs output from ffmpeg for codecs")
    return(NULL)
  }
  res$capabilities <- trimws(res$capabilities)

  cap_defns <- res[res$codec == "=", ]
  res <- res[res$codec != "=", ]
  # split each character and rbind 
  cap <- do.call("rbind", strsplit(res$capabilities, split = ""))

  cap_defns$codec_name <- tolower(cap_defns$codec_name)
  cap_defns$codec_name <- gsub(" ", "_", cap_defns$codec_name)
  cap_defns$codec_name <- gsub("-", "_", cap_defns$codec_name)
  cap_def <- do.call("rbind", strsplit(cap_defns$capabilities, split = ""))

  # create NA matrix
  mat <- matrix(NA, ncol = nrow(cap_defns), nrow = nrow(cap))
  colnames(mat) <- cap_defns$codec_name

  icol <- 4
  indices <- apply(cap_def, 1, function(x) which(x != "."))
  # output: vector of indices corresponding to non-"." values in each row of cap_def.
  for (icol in seq(nrow(cap_def))) {
    x <- cap[, indices[icol]]
    mat[, icol] <- x %in% cap_def[icol, indices[icol]]
  }
  mat <- as.data.frame(mat, stringsAsFactors = FALSE)

  res <- cbind(res, mat)
  if (any(rowSums(
    res[, c("video_codec", "audio_codec", "subtitle_codec")]
  )
  > 1)) {
    warning("Format may have changed, please post this issue")
  }

  # L = list(capabilities = cap_defns,
  #          codecs = res)
  # return(L)
  return(res)
}

#' @rdname ffmpeg_codecs
#' @export
ffmpeg_video_codecs <- function() {
  res <- ffmpeg_codecs()
  if (is.null(res)) {
    return(NULL)
  }
  res <- res[res$video_codec, ]
  res$video_codec <- NULL
  res$audio_codec <- NULL
  res$subtitle_codec <- NULL
  res
}

#' @rdname ffmpeg_codecs
#' @export
ffmpeg_audio_codecs <- function() {
  res <- ffmpeg_codecs()
  if (is.null(res)) {
    return(NULL)
  }
  res <- res[res$audio_codec, ]
  res$video_codec <- NULL
  res$audio_codec <- NULL
  res$subtitle_codec <- NULL
  res
}

#' @rdname ffmpeg_codecs
#' @export
ffmpeg_muxers <- function() {
  ffmpeg <- ffmpeg_exec(quote = TRUE)
  cmd <- paste(ffmpeg, "-muxers")
  result <- system(cmd, ignore.stderr = TRUE, ignore.stdout = TRUE)
  res <- system(cmd, intern = TRUE, ignore.stderr = TRUE)
  res <- trimws(res)
  if (length(res) == 0) {
    res <- ""
  }
  if (result != 0 & all(res %in% "")) {
    warning("No codecs output from ffmpeg for muxers")
    return(NULL)
  }
  res <- res[grepl("^E", res)]
  res <- strsplit(res, " ")
  res <- t(vapply(res, function(x) {
    x <- trimws(x)
    x <- x[x != ""]
    if (length(x) >= 3) {
      x[3:length(x)] <- paste(x[3:length(x)], collapse = " ")
    }
    return(x[seq(3)])
  }, FUN.VALUE = character(3)))
  colnames(res) <- c("capabilities", "muxer", "muxer_name")
  res <- as.data.frame(res, stringsAsFactors = FALSE)
  if (nrow(res) == 0) {
    warning("No codecs output from ffmpeg for muxers")
    return(NULL)
  }
  res$capabilities <- trimws(res$capabilities)

  return(res)
}

#' @rdname ffmpeg_codecs
#' @export
ffmpeg_version <- function() {
  ffmpeg <- ffmpeg_exec(quote = TRUE)
  cmd <- paste(ffmpeg, "-version")
  result <- system(cmd, ignore.stderr = TRUE, ignore.stdout = TRUE)
  res <- system(cmd, intern = TRUE, ignore.stderr = TRUE)
  res <- trimws(res)
  if (length(res) == 0) {
    res <- ""
  }
  if (result != 0 & all(res %in% "")) {
    warning("No codecs output from ffmpeg for version")
    return(NULL)
  }
  res <- res[grepl("^ffmpeg version", res)]
  res <- sub("ffmpeg version (.*) Copyright .*", "\\1", res)
  res <- sub("(ubuntu|debian).*", "", res)
  res <- sub("-.*", "", res)
  res <- sub("[+].*", "", res)
  res <- trimws(res)
  return(res)
}

#' @rdname ffmpeg_codecs
#' @export
ffmpeg_version_sufficient <- function() {
  if (have_ffmpeg_exec()) {
    ver <- package_version("3.2.4")
    ff_ver <- ffmpeg_version()
    if (is.null(ff_ver)) {
      warning(paste0(
        "Cannot get ffmpeg version from ",
        "ffmpeg_version, returning FALSE"
      ))
      return(FALSE)
    }
    ff_ver_char <- ff_ver
    ff_ver <- package_version(ff_ver, strict = FALSE)
    if (is.na(ff_ver)) {
      warning(
        paste0(
          "ffmpeg version is not parsed, probably a development version,",
          "version was ", ff_ver_char, ", make sure you have >= ",
          as.character(ver)
        )
      )
      return(TRUE)
    }
    res <- ff_ver >= ver
  } else {
    res <- FALSE
  }
  res
}

#' @rdname ffmpeg_codecs
#' @export
check_ffmpeg_version <- function() {
  if (!ffmpeg_version_sufficient()) {
    ff <- ffmpeg_version()
    stop(paste0(
      "ffmpeg version is not high enough,",
      " ffmpeg version is: ", ff
    ))
  }
  return(invisible(NULL))
}

Setup (Source above three R scripts and define ari_stitch()

library(ariExtra)
library(googledrive)
library(pdftools)
library(purrr)

source("utils.R")
source("set_encoders.R")
source("ffmpeg_codecs.R")

## Weave audio and images together
ari_stitch <- function(images, 
                       audio,
                       output = tempfile(fileext = ".mp4"),
                       verbose = FALSE,
                       cleanup = TRUE,
                       ffmpeg_opts = "",
                       divisible_height = TRUE,
                       audio_codec = get_audio_codec(),
                       video_codec = get_video_codec(),
                       video_sync_method = "2",
                       audio_bitrate = NULL,
                       video_bitrate = NULL,
                       pixel_format = "yuv420p",
                       fast_start = FALSE,
                       deinterlace = FALSE,
                       stereo_audio = TRUE,
                       duration = NULL,
                       video_filters = NULL,
                       frames_per_second = NULL,
                       check_inputs = TRUE) {
  # Stop if there are no images
  stopifnot(length(images) > 0)
  # Normalize paths of images and output (return absolute path)
  images <- normalizePath(images)
  output_dir <- normalizePath(dirname(output))
  output <- file.path(output_dir, basename(output))
  # Stop if there is no audio 
  stopifnot(
    length(audio) > 0,
    dir.exists(output_dir)
  )
  # Stop if images and audio are the same length
  if (check_inputs) {
    stopifnot(
      identical(length(images), length(audio)),
      all(file.exists(images))
    )
  }
  # Read in wav file using tuneR::readWave()
  audio <- map(audio, tuneR::readWave)
  # pad wav file
  audio <- pad_wav(audio, duration = duration)

  if (verbose > 0) {
    message("Writing out Wav for audio")
  }
  if (verbose > 1) {
    print(audio)
  }
  audio <- match_sample_rate(audio, verbose = verbose)
  # reduce audio (list) to single value
  wav <- purrr::reduce(audio, tuneR::bind)
  # create path to store wave file
  wav_path <- file.path(output_dir, paste0("ari_audio_", get_random_string(), ".wav"))
  # write wave file
  tuneR::writeWave(wav, filename = wav_path)
  # output: wav file that contains a voiceover of the entire script

  if (cleanup) {
    on.exit(unlink(wav_path, force = TRUE), add = TRUE)
  }

  # converting all images to gif (if there any gif images)
  img_ext <- tolower(tools::file_ext(images))
  any_gif <- any(img_ext %in% "gif")
  if (any_gif & !all(img_ext %in% "gif")) {
    if (verbose > 0) {
      message("Converting All files to gif!")
    }
    for (i in seq_along(images)) {
      iext <- img_ext[i]
      if (iext != "gif") {
        tfile <- tempfile(fileext = ".gif")
        ffmpeg_convert(images[i], outfile = tfile)
        images[i] <- tfile
      }
    }
  }

  # create txt path
  input_txt_path <- file.path(output_dir, 
                              paste0("ari_input_", get_random_string(), ".txt"))

  ## on windows ffmpeg cancats names adding the working directory, so if
  ## complete url is provided it adds it twice.
  if (.Platform$OS.type == "windows") {
    new_image_names <- file.path(output_dir, basename(images))
    if (!any(file.exists(new_image_names))) {
      file.copy(images, to = new_image_names)
    } else {
      warning("On windows must make basename(images) for ffmpeg to work")
    }
    images <- basename(images)
  }

  # adds "file 'IMAGE_PATH'" and duration 
  # in a .txt file located at input_txt_path
  for (ii in seq_along(images)) {
    cat(paste0("file ", "'", images[ii], "'", "\n"),
        file = input_txt_path, 
        append = TRUE)
    cat(paste0("duration ", wav_length(audio[[ii]]), "\n"),
        file = input_txt_path, 
        append = TRUE)
  }
  # winslash: the separator to be used on Windows
  input_txt_path <- normalizePath(input_txt_path, winslash = "/")

  # needed for users as per
  # https://superuser.com/questions/718027/
  # ffmpeg-concat-doesnt-work-with-absolute-path
  # input_txt_path = normalizePath(input_txt_path, winslash = "\\")

  # find path to ffmpeg exectuable
  ffmpeg <- ari::ffmpeg_exec(quote = TRUE)

  # set video filters
  if (!is.null(frames_per_second)) {
    video_filters <- c(video_filters, paste0("fps=", frames_per_second))
  } else {
    video_filters <- c(video_filters, "fps=5")
  }
  if (divisible_height) {
    video_filters <- c(video_filters, '"scale=trunc(iw/2)*2:trunc(ih/2)*2"')
  }

  # workaround for older ffmpeg
  # https://stackoverflow.com/questions/32931685/
  # the-encoder-aac-is-experimental-but-experimental-codecs-are-not-enabled
  experimental <- FALSE
  if (!is.null(audio_codec)) {
    if (audio_codec == "aac") {
      experimental <- TRUE
    }
  }
  if (deinterlace) {
    video_filters <- c(video_filters, "yadif")
  }
  video_filters <- paste(video_filters, collapse = ",")
  video_filters <- paste0("-vf ", video_filters)

  if (any(grepl("-vf", ffmpeg_opts))) {
    warning("Found video filters in ffmpeg_opts, may not be used correctly!")
  }
  ffmpeg_opts <- c(video_filters, ffmpeg_opts)
  ffmpeg_opts <- paste(ffmpeg_opts, collapse = " ")
  # output: options to input into ffmpeg

  # shQuote should seankross/ari#5
  command <- paste(
    ffmpeg, "-y",
    "-f concat -safe 0 -i", shQuote(input_txt_path),
    "-i", shQuote(wav_path),
    ifelse(!is.null(video_codec), paste("-c:v", video_codec),
           ""
    ),
    ifelse(!is.null(audio_codec), paste("-c:a", audio_codec),
           ""
    ),
    ifelse(stereo_audio, "-ac 2", ""),
    ifelse(!is.null(audio_bitrate), paste("-b:a", audio_bitrate),
           ""
    ),
    ifelse(!is.null(video_bitrate), paste("-b:v", video_bitrate),
           ""
    ),
    " -shortest",
    # ifelse(deinterlace, "-vf yadif", ""),
    ifelse(!is.null(video_sync_method), paste("-vsync", video_sync_method),
           ""
    ),
    ifelse(!is.null(pixel_format), paste("-pix_fmt", pixel_format),
           ""
    ),
    ifelse(fast_start, "-movflags +faststart", ""),
    ffmpeg_opts,
    ifelse(!is.null(frames_per_second), paste0("-r ", frames_per_second), ""),
    ifelse(experimental, "-strict experimental", ""),
    "-max_muxing_queue_size 9999",
    "-threads 2",
    shQuote(output)
  )
  if (verbose > 0) {
    message(command)
  }
  if (verbose > 1) {
    message("Input text path is:")
    cat(readLines(input_txt_path), sep = "\n")
  }

  # IMPORTANT: run command in system
  res <- system(command)

  if (res != 0) {
    warning("Result was non-zero for ffmpeg")
  }

  if (cleanup) {
    on.exit(unlink(input_txt_path, force = TRUE), add = TRUE)
  }
  res <- file.exists(output) && file.size(output) > 0
  if (!cleanup) {
    attr(res, "txt_path") <- input_txt_path
    attr(res, "wav_path") <- wav_path
    attr(res, "cmd") <- command
  }
  attr(res, "outfile") <- output
  attr(res, "images") <- images

  # return a (temporarily) invisible copy of res
  invisible(res)
}

Image Files

slide1.png

slide1

slide2.png

slide2

slide3.png

slide3

slide4.png

slide4

slide5.png

slide5

slide6.png

slide6

Audio Files

audio_files.zip

Run ari_stitch()


# run ari_stitch() and save output
output <- ari_stitch(images = c("slide1.png",
                                "slide2.png",
                      "slide3.png",
                      "slide4.png",
                      "slide5.png",
                      "slide6.png"),
           audio = c("tts_output1.wav",
                     "tts_output2.wav",
                     "tts_output3.wav",
                     "tts_output4.wav",
                     "tts_output5.wav",
                     "tts_output6.wav"))

# get file path of mp4 file
attr(output, "outfile")

Navigate to this mp4 in Finder (Mac) and open the video

Message when running res <- system(command):


ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with Apple clang version 14.0.0 (clang-1400.0.29.202)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/5.1.2_6 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox --enable-neon
  libavutil      57. 28.100 / 57. 28.100
  libavcodec     59. 37.100 / 59. 37.100
  libavformat    59. 27.100 / 59. 27.100
  libavdevice    59.  7.100 / 59.  7.100
  libavfilter     8. 44.100 /  8. 44.100
  libswscale      6.  7.100 /  6.  7.100
  libswresample   4.  7.100 /  4.  7.100
  libpostproc    56.  6.100 / 56.  6.100
-vsync is deprecated. Use -fps_mode
Passing a number to -vsync is deprecated, use a string argument as described in the manual.
Input #0, concat, from '/private/var/folders/bb/m2b0ry595ys7bfs1r397lnf40000gp/T/RtmpphX5JC/ari_input_gPwzkFnlk3HL.txt':
  Duration: 00:00:37.00, start: 0.000000, bitrate: 0 kb/s
  Stream #0:0: Video: png, rgb24(pc), 6000x3375 [SAR 23622:23622 DAR 16:9], 25 fps, 25 tbr, 25 tbn
Input #1, wav, from '/private/var/folders/bb/m2b0ry595ys7bfs1r397lnf40000gp/T/RtmpphX5JC/ari_audio_S5ZSsBGmGzBU.wav':
  Duration: 00:00:37.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, 1 channels (FL), s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (png (native) -> h264 (libx264))
  Stream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native))
Press [q] to stop, [?] for help
[libx264 @ 0x14c63a330] using SAR=3374/3375
[libx264 @ 0x14c63a330] using cpu capabilities: ARMv8 NEON
[libx264 @ 0x14c63a330] profile High, level 6.0, 4:2:0, 8-bit
[libx264 @ 0x14c63a330] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=2 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=5 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00
Output #0, mp4, to '/private/var/folders/bb/m2b0ry595ys7bfs1r397lnf40000gp/T/RtmpphX5JC/file3c302d23b261.mp4':
  Metadata:
    encoder         : Lavf59.27.100
  Stream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 6000x3374 [SAR 3374:3375 DAR 16:9], q=2-31, 5 fps, 10240 tbn
    Metadata:
      encoder         : Lavc59.37.100 libx264
    Side data:
      cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A
  Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 22050 Hz, stereo, fltp, 128 kb/s
    Metadata:
      encoder         : Lavc59.37.100 aac
frame=  150 fps= 11 q=-1.0 Lsize=     658kB time=00:00:30.00 bitrate= 179.7kbits/s speed=2.16x    
video:286kB audio:364kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.219465%
[libx264 @ 0x14c63a330] frame I:2     Avg QP: 2.27  size: 77096
[libx264 @ 0x14c63a330] frame P:38    Avg QP:12.33  size:  1831
[libx264 @ 0x14c63a330] frame B:110   Avg QP:12.66  size:   622
[libx264 @ 0x14c63a330] consecutive B-frames:  2.0%  0.0%  2.0% 96.0%
[libx264 @ 0x14c63a330] mb I  I16..4: 95.0%  2.4%  2.5%
[libx264 @ 0x14c63a330] mb P  I16..4:  0.0%  0.0%  0.0%  P16..4:  0.1%  0.0%  0.0%  0.0%  0.0%    skip:99.9%
[libx264 @ 0x14c63a330] mb B  I16..4:  0.0%  0.0%  0.0%  B16..8:  0.1%  0.0%  0.0%  direct: 0.0%  skip:99.9%  L0:60.3% L1:39.7% BI: 0.0%
[libx264 @ 0x14c63a330] 8x8 transform intra:3.0% inter:3.7%
[libx264 @ 0x14c63a330] coded y,uvDC,uvAC intra: 1.7% 0.4% 0.4% inter: 0.0% 0.0% 0.0%
[libx264 @ 0x14c63a330] i16 v,h,dc,p: 99%  1%  1%  0%
[libx264 @ 0x14c63a330] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 79%  2% 19%  0%  0%  0%  0%  0%  0%
[libx264 @ 0x14c63a330] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 47% 16% 24%  2%  2%  3%  3%  2%  2%
[libx264 @ 0x14c63a330] i8c dc,h,v,p: 99%  1%  0%  0%
[libx264 @ 0x14c63a330] Weighted P-Frames: Y:0.0% UV:0.0%
[libx264 @ 0x14c63a330] ref P L0: 87.6%  1.9%  9.4%  1.1%
[libx264 @ 0x14c63a330] ref B L0: 56.8% 41.9%  1.3%
[libx264 @ 0x14c63a330] ref B L1: 97.9%  2.1%
[libx264 @ 0x14c63a330] kb/s:77.92
[aac @ 0x14c63b650] Qavg: 60691.250
```_
muschellij2 commented 1 year ago

Can you post a reprex/MCVE

howardbaek commented 1 year ago

Thanks for the quick reply, @muschellij2

Since our last meeting with @cansavvy, we decided to use the coqui-ai/TTS library as our free TTS engine. Using this, I generated .wav files and input them and the PNG image files into ari_stitch(), but ran into this aforementioned bug.

I edited my original post with a reprex, which contains the R scripts + ari_stitch() function body + PNG files + WAV files needed to reproduce my problem.

muschellij2 commented 1 year ago

OK - just FYI, I'm not working on this currently, so probably won't look at it for a few weeks.

howardbaek commented 1 year ago

I figured out the bug!

When I was testing out ari_stitch(), I had removed line 195-197. I thought that was a typo because we were using the i from the previous for-loop. I didn't know that for assigns the i to the current environment, overwriting any existing variable with the same name (https://adv-r.hadley.nz/control-flow.html#loops).

Instead of images[i], I think this is a more direct expression: images[length(images)]. So, future self will not be confused again.

This change is committed here: https://github.com/jhudsl/ari/pull/40/commits/247a33b10c1c260985606b51408e7d9511b31d85#diff-61326062bb54dd0efa36b7319032beda8de959a05922fd7d5904a36ba407e46cR198