jdtuck / fdasrvf_R

Functional Data Analysis using Square-Root Slope Framework

Following up on integration error with `q_to_curve()` #44

Closed: astamm closed this issue 4 months ago

astamm commented 5 months ago

I have been trying to improve the functions curve_to_q() and q_to_curve() to minimise integration error.

Here is a first draft of an improvement:

library(tidyverse)
library(fdasrvf)

curve_to_q_alt <- function(beta, scale = TRUE) {
  L <- nrow(beta)
  M <- ncol(beta)
  grd <- seq(0, 1, length.out = M)

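  # Differentiate each coordinate with a natural spline fit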
  v <- matrix(nrow = L, ncol = M)
  for (l in 1:L) {
    fun <- splinefun(grd, beta[l, ], method = "natural")
    v[l, ] <- fun(grd, deriv = 1)
  }

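  # SRVF: q(t) = f'(t) / sqrt(||f'(t)||), with a floor on the denominator to avoid dividing by zero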
  q <- matrix(nrow = L, ncol = M)
  for (m in 1:M) {
    denom <- max(sqrt(fdasrvf:::pvecnorm(v[, m], p = 2)), 1e-4)
    q[, m] <- v[, m] / denom
  }

  len <- fdasrvf:::innerprod_q2(q, q)
  len_q <- sqrt(len)
  if (scale) q <- q / len_q

  list(q = q, len = len, lenq = len_q)
}

q_to_curve_alt <- function(q, f0, scale = 1) {
  L <- nrow(q)
  M <- ncol(q)
  grd <- seq(0, 1, length.out = M)

  q <- scale * q
  qnorm <- numeric(M)
  for (m in 1:M)
    qnorm[m] <- fdasrvf:::pvecnorm(q[, m], p = 2)

  beta <- matrix(nrow = L, ncol = M)
  for (l in 1:L) {
    values <- q[l, ] * qnorm
    fun <- splinefun(grd, values, method = "natural")
    beta[l, ] <- sapply(grd, \(.t) integrate(fun, lower = 0, upper = .t, subdivisions = 1000L)$value)
    beta[l, ] <- beta[l, ] + f0[l]
  }

  beta
}

N <- dim(beta)[4]
error1 <- numeric(N)
error2 <- numeric(N)
for (n in 1:N) {
  feval <- beta[, , 1, n]
  qeval1 <- curve_to_q(feval, scale = FALSE)$q
  feval1 <- q_to_curve(qeval1) + feval[, 1]

  qeval2 <- curve_to_q_alt(feval, scale = FALSE)$q
  feval2 <- q_to_curve_alt(qeval2, f0 = feval[, 1])

  error1[n] <- norm(feval - feval1, "F")
  error2[n] <- norm(feval - feval2, "F")
}

plot(qeval1[1, ], type = "l")
lines(qeval2[1, ], col = "red")

plot(feval[1, ], type = "l")
lines(feval1[1, ], col = "red")
plot(feval[1, ], type = "l")
lines(feval2[1, ], col = "blue")

tibble(error1 = error1, error2 = error2) |> 
  pivot_longer(everything(), names_to = "method", values_to = "error") |> 
  ggplot(aes(x = method, y = error)) +
  geom_boxplot() +
  labs(title = "Proposal for new curve_to_q and q_to_curve",
       x = "Method",
       y = "Frobenius norm of error") +
  theme_minimal()

tibble(error1 = error1, error2 = error2) |> 
  mutate(diff = error1 - error2) |>
  select(diff) |>
  ggplot(aes(y = 1, x = diff)) +
  geom_boxplot() +
  scale_x_continuous(breaks = 0:15) +
  scale_y_continuous(breaks = NULL) +
  labs(title = "Difference between methods",
       x = "Error of old method minus error of new method",
       y = "") +
  theme_minimal()

Let me know what you think.

astamm commented 5 months ago

We can almost make the error vanish entirely if we use R function representations all the way through. The following code demonstrates that; it is not at all optimised from a computational point of view.

library(tidyverse)
library(fdasrvf)

curve_to_q_alt <- function(beta, scale = TRUE) {
  L <- nrow(beta)
  M <- ncol(beta)
  grd <- seq(0, 1, length.out = M)

  funs <- list()
  for (l in 1:L)
    funs[[l]] <- splinefun(grd, beta[l, ], method = "natural")

  qfun <- function(t) {
    Mi <- length(t)
    q <- matrix(nrow = L, ncol = Mi)
    for (m in 1:Mi) {
      fprime <- sapply(funs, \(.fun) .fun(t[m], deriv = 1))
      denom <- max(sqrt(sqrt(sum(fprime^2))), 1e-4)
      q[, m] <- fprime / denom
    }
    q
  }

  len <- 0
  for (l in 1:L)
    len <- len + integrate(\(.t) qfun(.t)[l, ]^2, lower = 0, upper = 1, subdivisions = 1000L)$value
  len_q <- sqrt(len)
  if (scale) {
    qfun_unscaled <- qfun
    # Scale the functional representation itself; no array `q` exists here
    qfun <- function(t) qfun_unscaled(t) / len_q
  }

  list(q = qfun, len = len, lenq = len_q)
}

q_to_curve_alt <- function(q, f0, scale = 1) {
  L <- length(f0)
  function(t) {
    Mi <- length(t)
    beta <- matrix(nrow = L, ncol = Mi)
    for (l in 1:L) {
      fun <- function(s) {
        qvals <- scale * q(s)
        qnorm <- sqrt(colSums(qvals^2))
        qvals[l, ] * qnorm
      }
      beta[l, ] <- sapply(t, \(.t) integrate(fun, lower = 0, upper = .t, 
                                             subdivisions = 1000L, stop.on.error = FALSE)$value)
      beta[l, ] <- beta[l, ] + f0[l]
    }
    beta 
  }
}

N <- dim(beta)[4]
error1 <- numeric(N)
error2 <- numeric(N)
for (n in 1:N) {
  cli::cli_alert_info("Processing curve {n} of {N}...")

  feval <- beta[, , 1, n]
  qeval1 <- curve_to_q(feval, scale = FALSE)$q
  feval1 <- q_to_curve(qeval1) + feval[, 1]

  qeval2 <- curve_to_q_alt(feval, scale = FALSE)$q
  feval2 <- q_to_curve_alt(qeval2, f0 = feval[, 1])(seq(0, 1, len=100))

  error1[n] <- norm(feval - feval1, "F")
  error2[n] <- norm(feval - feval2, "F")
}

plot(qeval1[1, ], type = "l")
lines(qeval2(seq(0, 1, len=100))[1, ], col = "blue")

plot(feval[1, ], type = "l")
lines(feval1[1, ], col = "red")
plot(feval[1, ], type = "l")
lines(feval2[1, ], col = "blue")

error1
error2

tibble(error1 = error1, error2 = error2) |> 
  pivot_longer(everything(), names_to = "method", values_to = "error") |> 
  ggplot(aes(x = method, y = error)) +
  geom_boxplot() +
  labs(title = "Proposal for new curve_to_q and q_to_curve",
       x = "Method",
       y = "Frobenius norm of error") +
  theme_minimal()

tibble(error1 = error1, error2 = error2) |> 
  mutate(diff = error1 - error2) |>
  select(diff) |>
  ggplot(aes(y = 1, x = diff)) +
  geom_boxplot() +
  scale_x_continuous(breaks = 0:15) +
  scale_y_continuous(breaks = NULL) +
  labs(title = "Difference between methods",
       x = "Error of old method minus error of new method",
       y = "") +
  theme_minimal()
jdtuck commented 5 months ago

Both look good; some of these R representations were not available 15 years ago, and it would be good to use them now.

astamm commented 5 months ago

The best way to appropriately control the integration error is, as shown above, to view curves and their corresponding SRVFs as R functions rather than as arrays of function evaluations on a grid of $[0,1]$. How would that be compatible with the rest of the package? I mean, how big of a change would that represent?

That being said, if q and beta are functions, we can always, if need be, do

q(seq(0, 1, len = T1))

to get the array.

jdtuck commented 5 months ago

It would actually be a very large change. When I have done this in other programming languages, I output the sampled version since a lot of functions expect an array. So, for now, I would keep the array as the output of curve_to_q().

astamm commented 5 months ago

OK, so the best we can achieve if we stick to an array as the output of curve_to_q() is the following:

library(tidyverse)
library(fdasrvf)

curve_to_q_alt <- function(beta, scale = TRUE) {
  L <- nrow(beta)
  M <- ncol(beta)
  grd <- seq(0, 1, length.out = M)

  funs <- list()
  for (l in 1:L)
    funs[[l]] <- splinefun(grd, beta[l, ], method = "natural")

  qfun <- function(t) {
    Mi <- length(t)
    q <- matrix(nrow = L, ncol = Mi)
    for (m in 1:Mi) {
      fprime <- sapply(funs, \(.fun) .fun(t[m], deriv = 1))
      denom <- max(sqrt(sqrt(sum(fprime^2))), 1e-4)
      q[, m] <- fprime / denom
    }
    q
  }

  len <- 0
  for (l in 1:L)
    len <- len + integrate(\(.t) qfun(.t)[l, ]^2, lower = 0, upper = 1, subdivisions = 1000L)$value
  len_q <- sqrt(len)
  if (scale) {
    qfun_unscaled <- qfun
    # Scale the functional representation itself; no array `q` exists here
    qfun <- function(t) qfun_unscaled(t) / len_q
  }

  list(q = qfun(grd), len = len, lenq = len_q)
}

q_to_curve_alt <- function(q, f0 = rep(0, nrow(q)), scale = 1) {
  L <- dim(q)[1]
  M <- dim(q)[2]
  grd <- seq(0, 1, length.out = M)
  qfuns <- list()
  for (l in 1:L)
    qfuns[[l]] <- splinefun(grd, q[l, ], method = "natural")
  beta <- matrix(nrow = L, ncol = M)
  for (l in 1:L) {
    fun <- function(s) {
      Mi <- length(s)
      qvals <- matrix(nrow = L, ncol = Mi)
      for (ll in 1:L)
        qvals[ll, ] <- scale * qfuns[[ll]](s)
      qnorm <- sqrt(colSums(qvals^2))
      qvals[l, ] * qnorm
    }
    beta[l, ] <- sapply(grd, \(.t) integrate(fun, lower = 0, upper = .t, 
                                           subdivisions = 1000L, stop.on.error = FALSE)$value)
    beta[l, ] <- beta[l, ] + f0[l]
  }
  beta
}

N <- dim(beta)[4]
error1 <- numeric(N)
error2 <- numeric(N)
for (n in 1:N) {
  cli::cli_alert_info("Processing curve {n} of {N}...")

  feval <- beta[, , 1, n]
  qeval1 <- curve_to_q(feval, scale = FALSE)$q
  feval1 <- q_to_curve(qeval1) + feval[, 1]

  qeval2 <- curve_to_q_alt(feval, scale = FALSE)$q
  feval2 <- q_to_curve_alt(qeval2, f0 = feval[, 1])

  error1[n] <- norm(feval - feval1, "F")
  error2[n] <- norm(feval - feval2, "F")
}

plot(qeval1[1, ], type = "l")
lines(qeval2[1, ], col = "blue")

plot(feval[1, ], type = "l")
lines(feval1[1, ], col = "red")
plot(feval[1, ], type = "l")
lines(feval2[1, ], col = "blue")

error1
error2

tibble(error1 = error1, error2 = error2) |> 
  pivot_longer(everything(), names_to = "method", values_to = "error") |> 
  ggplot(aes(x = method, y = error)) +
  geom_boxplot() +
  labs(title = "Proposal for new curve_to_q and q_to_curve",
       x = "Method",
       y = "Frobenius norm of error") +
  theme_minimal()

tibble(error1 = error1, error2 = error2) |> 
  mutate(diff = error1 - error2) |>
  select(diff) |>
  ggplot(aes(y = 1, x = diff)) +
  geom_boxplot() +
  scale_x_continuous(breaks = 0:15) +
  scale_y_continuous(breaks = NULL) +
  labs(title = "Difference between methods",
       x = "Error of old method minus error of new method",
       y = "") +
  theme_minimal()

This is better than solution 1 above but still presents non-negligible integration error compared to solution 2, which represents curves as R functions.

In fact, what makes the integration error vanish in solution 2 is that q is a function: in q_to_curve(), we can therefore define the integrand function $q(s) || q(s) ||$ and use stats::integrate() to compute the integral via quadrature.

We could output both the array and the function in curve_to_q() and only use the function version internally in q_to_curve(). That would mean, however, passing the entire output of curve_to_q() to q_to_curve(), which could also make sense.
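A hedged sketch of that calling pattern, assuming curve_to_q_alt() is extended to also return its functional representation in a qf field (as done in the later snippets):

res <- curve_to_q_alt(feval, scale = FALSE)       # both $q (array) and $qf (function)
feval2 <- q_to_curve_alt(res$qf, f0 = feval[, 1]) # integrate via the functional form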

jdtuck commented 5 months ago

Using the R functions and passing the entire output could make sense. It would be some refactoring, but it might be the way to go to start the transition.

jdtuck commented 5 months ago

One thing we need to balance with the error is computational time. It would be good to compare that as well.

astamm commented 5 months ago

Here is what I propose:

library(tidyverse)
library(fdasrvf)

curve_to_q_alt <- function(beta, scale = TRUE) {
  L <- nrow(beta)
  M <- ncol(beta)
  grd <- seq(0, 1, length.out = M)

  funs <- lapply(1:L, \(.l) splinefun(grd, beta[.l, ], method = "natural"))

  qft <- function(.t) {
    Mt <- length(.t)
    res <- matrix(nrow = L, ncol = Mt)
    for (m in 1:Mt) {
      fprime <- sapply(funs, \(.fun) .fun(.t[m], deriv = 1))
      denom <- max(sqrt(sqrt(sum(fprime^2))), 1e-4)
      res[, m] <- fprime / denom
    }
    res
  }

  len <- sum(cubature::cubintegrate(
    f = \(.t) qft(.t)^2, fDim = L,
    lower = 0, upper = 1, 
    method = "pcubature"
  )$integral)
  len_q <- sqrt(len)

  if (scale)
    qf <- function(.t) return(qft(.t) / len_q)
  else
    qf <- qft

  list(q = qf(grd), len = len, lenq = len_q, qf = qf)
}

q_to_curve_alt <- function(qf, f0 = rep(0, environment(qf)$L), scale = 1) {
  L <- environment(qf)$L
  M <- environment(qf)$M
  grd <- seq(0, 1, length.out = M)
  beta <- matrix(nrow = L, ncol = M)
  fun <- function(s) {
    qvals <- scale * qf(s)
    qnorm <- sqrt(colSums(qvals^2))
    qvals * matrix(qnorm, nrow = L, ncol = length(s), byrow = TRUE)
  }
  out <- lapply(grd, \(.t) cubature::cubintegrate(
    f = fun, fDim = L, 
    lower = 0, upper = .t, 
    method = "pcubature"
  )$integral)
  beta <- do.call(cbind, out)
  beta + matrix(f0, nrow = L, ncol = M, byrow = FALSE)
}

N <- dim(beta)[4]
error1 <- numeric(N)
error2 <- numeric(N)
for (n in 1:N) {
  cli::cli_alert_info("Processing curve {n} of {N}...")

  feval <- beta[, , 1, n]
  qeval1 <- curve_to_q(feval, scale = FALSE)$q
  feval1 <- q_to_curve(qeval1) + feval[, 1]

  qeval2 <- curve_to_q_alt(feval, scale = FALSE)$qf
  feval2 <- q_to_curve_alt(qeval2, f0 = feval[, 1])

  error1[n] <- norm(feval - feval1, "F")
  error2[n] <- norm(feval - feval2, "F")
}

plot(qeval1[1, ], type = "l")
lines(qeval2(seq(0, 1, len=100))[1, ], col = "blue")

plot(feval[1, ], type = "l")
lines(feval1[1, ], col = "red")
plot(feval[1, ], type = "l")
lines(feval2[1, ], col = "blue")

error1
error2

tibble(error1 = error1, error2 = error2) |> 
  pivot_longer(everything(), names_to = "method", values_to = "error") |> 
  ggplot(aes(x = method, y = error)) +
  geom_boxplot() +
  labs(title = "Proposal for new curve_to_q and q_to_curve",
       x = "Method",
       y = "Frobenius norm of error") +
  theme_minimal()

The use of the cubature package is important for multidimensional integration and for the stability of the quadrature integral vs. stats::integrate(). Note that you will also be able to use it for images on $[0,1]^2$ when the time comes.
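To illustrate the multidimensional point with a toy example of mine (not from the package): cubintegrate() accepts vector bounds, so the same call pattern extends to $[0,1]^2$. Here the exact value is 1/4.

cubature::cubintegrate(
  f = \(.x) .x[1] * .x[2],          # f(x, y) = x * y on the unit square
  lower = c(0, 0), upper = c(1, 1),
  method = "pcubature"
)$integral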

In the proposed implementation, the function q_to_curve() only asks for the function representation of q but outputs the final curve beta as an array. If you prefer, I can pass the entire output of curve_to_q(), but that seems unnecessary.

astamm commented 5 months ago

> One thing we need to balance with the error is computational time. It would be good to compare that as well.

Definitely slower. I will work on that a bit to see if I can improve.

astamm commented 5 months ago

OK, I used doParallel to speed up q_to_curve() when evaluating $\int_0^t q(s) || q(s) || ds$, which is the longest step. curve_to_q() is actually fast.

q_to_curve() remains significantly slower than the current version, but I guess it is not called that often given that all the stats are performed in $L^2$ or $\mathbb{S}_\infty$. Does that make sense?
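A possible shape for the doParallel variant (a sketch of mine, reusing fun, grd and L from q_to_curve_alt() above; not the exact code I ran):

library(doParallel)  # also loads foreach

cl <- parallel::makeCluster(2)
registerDoParallel(cl)
# Parallelise the per-grid-point quadrature, the dominant cost
out <- foreach(.t = grd) %dopar% {
  cubature::cubintegrate(f = fun, fDim = L, lower = 0, upper = .t,
                         method = "pcubature")$integral
}
parallel::stopCluster(cl)
beta <- do.call(cbind, out)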

jdtuck commented 5 months ago

They are both called more often than you think, and q_to_curve() could itself be called in parallel, so I am not sure having another parallel layer inside it is a good idea.

astamm commented 5 months ago

Here is a possible solution:

q_to_curve_alt <- function(qf, f0 = rep(0, environment(qf)$L), scale = 1, tol = 1e-4) {
  tol <- min(1e-3, tol)

  L <- environment(qf)$L
  M <- environment(qf)$M
  grd <- seq(0, 1, length.out = M)

  fun <- function(s) {
    qvals <- scale * qf(s)
    qnorm <- sqrt(colSums(qvals^2))
    qvals * matrix(qnorm, nrow = L, ncol = length(s), byrow = TRUE)
  }

  out <- lapply(grd, \(.t) cubature::cubintegrate(
    f = fun, fDim = L,
    lower = 0, upper = .t,
    method = "pcubature", 
    nVec = 1024L, 
    relTol = tol
  )$integral)

  beta <- do.call(cbind, out)
  beta + matrix(f0, nrow = L, ncol = M, byrow = FALSE)
}

The default tolerance for the integration routine is set to 1e-4, which makes the function run in about 1.3 seconds on one curve of your beta dataset and leads to an integration error of about 0.02 (Frobenius norm compared to the original curve).

After playing a bit with this tolerance, I found that setting it too large (larger than 1e-3) makes the integration unstable. So in the function I do tol = min(tol, 1e-3). The computation time for tol = 1e-3 is about 0.6 seconds on a beta curve.
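A hedged way to reproduce this trade-off, assuming qeval2 and feval from the earlier test loop are still in scope:

for (tol in c(1e-3, 1e-4)) {
  tm <- system.time(
    feval2 <- q_to_curve_alt(qeval2, f0 = feval[, 1], tol = tol)
  )
  cat(sprintf("tol = %g: %.2f s, error = %.4f\n",
              tol, tm["elapsed"], norm(feval - feval2, "F")))
}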

Does that sound acceptable?

astamm commented 5 months ago

Another solution would be to switch to Rcpp for implementing these two functions, or at least q_to_curve().

jdtuck commented 5 months ago

That can work for now; to keep this integration scheme, yes, Rcpp would be the next solution.

jdtuck commented 5 months ago

One thing to think about: the beta dataset is small compared to what a lot of people use in real applications. I am currently analyzing a curve set that has over 600 points per curve and 30 curves.

astamm commented 5 months ago

I could benchmark with an upsampled version of the beta dataset to match this grid size.
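A minimal way to do the upsampling via splines (the function name and M_target are my own, for illustration):

upsample_curve <- function(beta1, M_target) {
  L <- nrow(beta1)
  grd <- seq(0, 1, length.out = ncol(beta1))
  out <- matrix(nrow = L, ncol = M_target)
  for (l in 1:L)
    out[l, ] <- spline(grd, beta1[l, ], n = M_target, method = "natural")$y
  out
}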

jdtuck commented 5 months ago

It would be good to benchmark how things scale as the number of sample points increases on one curve.

jdtuck commented 5 months ago

Some data sets people have used have gone as high as 5000 points, which was required for the analysis and couldn't be subsampled.

astamm commented 5 months ago

I did some benchmarking in the way you suggested, using the beta dataset upsampled to 4096 points.

First comment: the current approach to integration via cumtrapz() is the most computationally effective. It integrates the SRVF in half a second when it is evaluated on a grid of 4096 points.

Also, the Frobenius distance between the original beta curve (possibly upsampled) and the integrated SRVF decreases as the grid size increases. I tried up to $2^{15} = 32768$ points; integration is still performed within 1 second, and the error matches the one obtained with cubature integration.
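As a quick sanity check of this convergence, here is a toy trapezoidal example of mine on $\int_0^1 e^s ds$ (not part of the benchmark): the error shrinks roughly as $O(1/M^2)$.

trap_error <- sapply(2^c(7, 10, 13), function(M) {
  grd <- seq(0, 1, length.out = M)
  y <- exp(grd)
  est <- sum(diff(grd) * (head(y, -1) + tail(y, -1)) / 2)  # trapezoidal rule
  abs(est - (exp(1) - 1))                                  # exact value is e - 1
})
trap_error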

About the second approach: for curves evaluated on 5000 points, it can take up to 20 minutes to integrate the SRVF. Moreover, the quadrature approaches somehow do not behave well as the number of sample points increases: instead of decreasing, the Frobenius norm actually increases.

My current conclusion is that the ideal scenario is to start by upsampling the raw curves to something like $2^{15}$ points, at which point trapezoidal integration as currently performed in the package is almost error-free and still blazing fast. But how would the other functions in the package scale with the number of sample points?

jdtuck commented 5 months ago

The problem with increasing the grid size that much is that the dynamic programming routine for computing the warping function is $O(n^2)$, and grids that large will make it extremely slow. Additionally, for closed curves, the projection will also become computationally expensive at large dimension.

One idea might be to upsample to integrate and then downsample.

astamm commented 5 months ago

I tried downsampling just after curve_to_q() and upsampling again for q_to_curve(), but it does not achieve the same low error as when the upsampling is maintained all the way through.

Needs some more thought.

jdtuck commented 5 months ago

I kind of figured that would happen.

astamm commented 5 months ago

This is a working solution based on additionally keeping the SRVF as an R function in the output of curve_to_q() so that we can use it in q_to_curve() to build an R function for the integrand, evaluate the integrand on an upsampled grid, use cumtrapz(), and downsample the resulting curve.

curve_to_q_alt <- function(beta, scale = TRUE) {
  L <- nrow(beta)
  M <- ncol(beta)
  grd <- seq(0, 1, length.out = M)

  funs <- lapply(1:L, \(.l) splinefun(grd, beta[.l, ], method = "natural"))

  qft <- function(.t) {
    Mt <- length(.t)
    res <- matrix(nrow = L, ncol = Mt)
    for (m in 1:Mt) {
      fprime <- sapply(funs, \(.fun) .fun(.t[m], deriv = 1))
      denom <- max(sqrt(sqrt(sum(fprime^2))), 1e-4)
      res[, m] <- fprime / denom
    }
    res
  }

  len <- sum(cubature::cubintegrate(
    f = \(.t) qft(.t)^2, fDim = L,
    lower = 0, upper = 1, 
    method = "pcubature"
  )$integral)
  len_q <- sqrt(len)

  if (scale)
    qf <- function(.t) return(qft(.t) / len_q)
  else
    qf <- qft

  list(q = qf(grd), len = len, lenq = len_q, qf = qf)
}

q_to_curve_alt <- function(qf, f0 = rep(0, environment(qf)$L), scale = 1, Mup = 20000L) {
  L <- environment(qf)$L
  M <- environment(qf)$M
  grd <- seq(0, 1, length.out = M)

  fun <- function(s) {
    qvals <- scale * qf(s)
    qnorm <- sqrt(colSums(qvals^2))
    qvals * matrix(qnorm, nrow = L, ncol = length(s), byrow = TRUE)
  }

  grd_up <- seq(0, 1, length.out = Mup)
  integrand <- fun(grd_up)  # fun() already applies `scale`; multiplying again would double it

  beta_up <- fdasrvf:::cumtrapz(1:Mup, integrand, 2) / (Mup - 1)  # spacing of grd_up is 1 / (Mup - 1)

  beta <- matrix(nrow = L, ncol = M)
  for (l in 1:L)
    beta[l, ] <- spline(grd_up, beta_up[l, ], n = M)$y

  beta + matrix(f0, nrow = L, ncol = M, byrow = FALSE)
}

N <- dim(beta)[4]
error1 <- numeric(N)
error2 <- numeric(N)
for (n in 1:N) {
  cli::cli_alert_info("Processing curve {n} of {N}...")

  feval <- beta[, , 1, n]
  qeval1 <- curve_to_q(feval, scale = FALSE)$q
  feval1 <- q_to_curve(qeval1) + feval[, 1]

  qeval2 <- curve_to_q_alt(feval, scale = FALSE)$qf
  feval2 <- q_to_curve_alt(qeval2, f0 = feval[, 1], Mup = 100000L)

  error1[n] <- norm(feval - feval1, "F")
  error2[n] <- norm(feval - feval2, "F")
}

plot(qeval1[1, ], type = "l")
lines(qeval2[1, ], col = "blue")

plot(feval[1, ], type = "l")
lines(feval1[1, ], col = "red")
plot(feval[1, ], type = "l")
lines(feval2[1, ], col = "blue")

error1
error2

# Varying the number of sample points -------------------------------------

beta1 <- beta[, , 1, 1]
L <- dim(beta1)[1]
Morig <- dim(beta1)[2]
M <- 2^(4:17)
grd <- seq(0, 1, length.out = Morig)
res1 <- purrr::map(M, \(.M) {
  cli::cli_alert_info("Processing M = { .M}...")
  qeval1 <- curve_to_q_alt(beta1, scale = FALSE)$qf
  start_time <- Sys.time()
  feval1 <- q_to_curve_alt(qeval1, f0 = beta1[, 1], Mup = .M)
  end_time <- Sys.time()
  # Force seconds so difftime() cannot silently switch units across runs
  list(error = norm(beta1 - feval1, "F"),
       time = as.numeric(difftime(end_time, start_time, units = "secs")))
})
plot(log2(M), map_dbl(res1, "error"), type = "l", xlab = "log2(M)", ylab = "Frobenius norm of error")
tms <- map_dbl(res1, "time")  # already in seconds thanks to difftime(units = "secs")
plot(log2(M), tms, type = "l", xlab = "log2(M)", ylab = "Time (s)")

Benchmarking reveals that the error goes to 0 as the upsampling size goes to infinity. Computation time is not that bad, even at very high upsampling sizes.

We could even provide the user with a utility function which, given a curve or a sample of curves, computes the optimal upsampling size to achieve a given (average in the case of a sample) reconstruction error.
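A hypothetical sketch of such a utility (name, stopping rule, and defaults are all assumptions of mine): double the upsampling size until the reconstruction error drops below a target.

optimal_upsampling_size <- function(beta1, target = 1e-2, Mmax = 2^17) {
  qf <- curve_to_q_alt(beta1, scale = FALSE)$qf
  Mup <- 1024L
  repeat {
    rec <- q_to_curve_alt(qf, f0 = beta1[, 1], Mup = Mup)
    if (norm(beta1 - rec, "F") < target || Mup >= Mmax) return(Mup)
    Mup <- 2L * Mup
  }
}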

What do you think?

astamm commented 5 months ago

The cubature call in curve_to_q() that computes the norm of the SRVF would also be replaced by innerprod_q2() as before, again with the upsampling trick to increase the precision of the trapezoidal integration.
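Inside curve_to_q_alt(), that replacement could look something like this (a sketch, assuming innerprod_q2() expects evaluations on a uniform grid of $[0,1]$ and reusing Mup from q_to_curve_alt()):

grd_up <- seq(0, 1, length.out = Mup)
qvals <- qft(grd_up)
len <- fdasrvf:::innerprod_q2(qvals, qvals)  # trapezoidal inner product on the upsampled grid
len_q <- sqrt(len)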

jdtuck commented 5 months ago

I got one error

Error in qeval2[1, ] : object of type 'closure' is not subsettable

But this could work well, and a utility function that also provides a reasonable default would be smart. It would balance computation with error without further increasing downstream processing.

astamm commented 5 months ago

After one more night of thought, here is my summary of all this.

There are two distinct (yet related) things.

**One is how numerical integration should be performed**

I tested trapezoidal integration as currently performed in the package via cumtrapz(), quadrature integration as performed by stats::integrate(), and cubature integration as performed by cubature::cubintegrate() with its various options. From a given $f$ (successively each curve of the beta dataset), I computed the corresponding SRVF $q$ and then integrated:

$$ f(0) + \int_0^t q(s) || q(s) || ds $$

to get the curve $f$ back. The best, both in terms of computation time and integration error, was trapezoidal integration. The error goes to 0 as the size of the grid on which the integrand $q(s) || q(s) ||$ is evaluated goes to infinity. Even grid sizes as large as 100,000 lead to integration in less than a minute.

**The other is how curves should be represented in the package**

The reason why the trapezoidal integration error goes to 0 as the grid size goes to infinity is precisely that curve_to_q() also outputs the SRVF in a functional representation qf, a vectorized R function representing

$$ s \mapsto q(s) = \frac{f^\prime(s)}{\sqrt{|| f^\prime(s) ||}}. $$

This trick then allows us to also define the integrand as an R function, integrand_f(s) = qf(s) || qf(s) ||, which is precisely:

$$ s \mapsto \frac{f^\prime(s)}{\sqrt{|| f^\prime(s) ||}} \cdot \left\| \frac{f^\prime(s)}{\sqrt{|| f^\prime(s) ||}} \right\| = f^\prime(s). $$

Then, when we take evaluations of integrand_f on a grid for trapezoidal integration, we are in fact evaluating $f^\prime$ and thus getting $f$ after integration.
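A quick numerical check of this identity on the toy curve $f(t) = (t, t^2)$ (my own example, not from the package):

f_prime <- function(t) rbind(rep(1, length(t)), 2 * t)  # f(t) = (t, t^2)
qf <- function(t) {
  v <- f_prime(t)
  sweep(v, 2, pmax(sqrt(sqrt(colSums(v^2))), 1e-4), "/")  # q = f' / sqrt(||f'||)
}
integrand_f <- function(s) {
  qv <- qf(s)
  sweep(qv, 2, sqrt(colSums(qv^2)), "*")  # q(s) || q(s) ||
}
s <- seq(0, 1, length.out = 5)
max(abs(integrand_f(s) - f_prime(s)))  # ~ 0, i.e. the integrand is exactly f'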

**Proposal**

I think the proper way to go would be for all functions dealing with either $f$ or $q$ as input (probably almost all functions in the package) to allow the input to be either an array or an R function. If it is an R function, proper implementation is of course needed, and an extra optional argument called something like gridsize should also allow the user to ask for an output of type array. This argument could default to the original number of sampled points so that the user experience is unchanged. But if one sets gridsize to NULL, then the function would output another R function instead of an array.
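A minimal sketch of that interface, reusing curve_to_q_alt() from above (the function name and dispatch are assumptions of mine, not package API):

curve_to_q_proposed <- function(beta, scale = TRUE, gridsize = ncol(beta)) {
  res <- curve_to_q_alt(beta, scale = scale)
  if (is.null(gridsize))
    return(list(q = res$qf, len = res$len, lenq = res$lenq))  # functional output
  grd <- seq(0, 1, length.out = gridsize)
  list(q = res$qf(grd), len = res$len, lenq = res$lenq)  # array output, unchanged user experience
}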

This is a major refactor but, in this way, it would not change current user experience, and I strongly believe this is the best way to achieve a good compromise between integration error and computation time.

jdtuck commented 5 months ago

I agree with your proposal of handling both types of inputs. There is always a balance between speed and error when performing computations on a manifold. Can you please check why I got the error above with your latest code?

astamm commented 5 months ago

It is because qeval2 is an R function. Instead of qeval2[1, ], it should be something like qeval2(seq(0, 1, len = M))[1, ].
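Concretely, the plotting calls from the last snippet become (with M = 100, the grid size of the beta dataset):

plot(qeval1[1, ], type = "l")
lines(qeval2(seq(0, 1, len = 100))[1, ], col = "blue")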