stephenslab / susieR

R package for "sum of single effects" regression.
https://stephenslab.github.io/susieR
Other
176 stars 45 forks source link

Illustrate in vignettes how computation scales with sample size (n) #46

Open pcarbo opened 5 years ago

pcarbo commented 5 years ago

@KaiqianZhang In the sparse_matrix_strategy.Rmd and sparse_susie_eval.Rmd vignettes, it would be more insightful to show how changing the size of the data set (specifically, the number of rows of X) affects the computation time when using a sparse vs. dense matrix X.

Here's some example code that roughly illustrates what I'm thinking for sparse_matrix_strategy.Rmd:

library(ggplot2)
library(cowplot)
library(microbenchmark)

# Benchmark the scaled matrix-vector product for a dense vs. a sparse X as
# the number of rows n grows, then show the mean runtimes in a bar chart.
#
# NOTE: p (the number of columns) and create_sparsity_mat() are not defined
# in this snippet; they are assumed to be available from earlier in the
# vignette.
b <- rnorm(p)
N <- c(100,200,400,800,1600)

# One row per value of n. Column 1 holds the "dense" mean runtime and
# column 2 the "sparse" one, matching the order of the expressions passed
# to microbenchmark below.
timings <- matrix(NA_real_,length(N),2)
for (i in seq_along(N)) {
  n <- N[i]

  # The first argument shrinks as n*p grows.
  # NOTE(review): presumably it is the fraction of zero entries, which
  # would keep about 1e5 nonzeros at every n -- confirm against
  # create_sparsity_mat.
  X.dense  <- create_sparsity_mat(1 - 1e5/(n*p),n,p)
  X.sparse <- as(X.dense,"dgCMatrix")
  X.tilde  <- susieR:::safe_colScale(X.dense)

  # Rebuild the sparse input for *this* n. Without this, compute_Xb is
  # called on an X that is unrelated to the matrices generated in this
  # iteration, so the sparse timings would not reflect n.
  # NOTE(review): set_X_attributes is assumed to be the susieR internal
  # that attaches the centering/scaling attributes compute_Xb reads --
  # confirm against the installed susieR version.
  X <- susieR:::set_X_attributes(X.sparse)

  print(dim(X.tilde))
  compute_Xb_benchmark <- microbenchmark(
    dense  = (use.normal.Xb <- X.tilde %*% b),
    sparse = (use.sparse.Xb <- susieR:::compute_Xb(X,b)),
      times = 10,unit = "s")
  timings[i,] <- summary(compute_Xb_benchmark)[,"mean"]
}

# Reshape the timings to long format: one row per (n, matrix type) pair.
dat <- rbind(data.frame(n = N,timing = timings[,1],matrix.type = "dense"),
             data.frame(n = N,timing = timings[,2],matrix.type = "sparse"))

# Treat n as a categorical variable so the bars are evenly spaced.
dat <- transform(dat,n = factor(n))
g <- ggplot(dat,aes(x = n,y = timing,fill = matrix.type)) +
    geom_col(position = "dodge",width = 0.5) +
    scale_fill_manual(values = c("dodgerblue","darkorange")) +
    labs(y = "mean runtime (s)")
print(g)

And here's some example code giving a rough idea of what I'm thinking for sparse_susie_eval.Rmd:

library(ggplot2)
library(cowplot)
library(microbenchmark)

# Time a complete susie() fit on a dense vs. a sparse X for increasing
# numbers of rows n, then plot the runtimes side by side.
#
# NOTE: p, beta and create_sparsity_mat() are not defined in this snippet;
# they are assumed to be available from earlier in the vignette.
N       <- c(100,400,1600,6400,25600)
timings <- matrix(NA_real_,length(N),2)
for (i in seq_along(N)) {
  n        <- N[i]
  X.dense  <- create_sparsity_mat(1 - 1e5/(n*p),n,p)
  X.sparse <- as(X.dense,"dgCMatrix")

  # Simulate the response from the dense matrix; c() drops the matrix
  # dimensions so that y is a plain vector.
  y <- c(X.dense %*% beta + rnorm(n))
  print(dim(X.dense))

  # A single run per setting (times = 1), so the "mean" is that one
  # measurement.
  susie_benchmark <- microbenchmark(
    dense  = (susie.dense <- susie(X.dense,y)),
    sparse = (susie.sparse <- susie(X.sparse,y)),
      times = 1,unit = "s")
  timings[i,] <- summary(susie_benchmark)[,"mean"]
}

# Long format: the dense timings (column 1) come first, followed by the
# sparse timings (column 2).
dat <- data.frame(n           = rep(N,2),
                  timing      = c(timings[,1],timings[,2]),
                  matrix.type = rep(c("dense","sparse"),each = length(N)))
dat$n <- factor(dat$n)

g <- ggplot(dat,aes(x = n,y = timing,fill = matrix.type))
g <- g + geom_col(position = "dodge",width = 0.5)
g <- g + scale_fill_manual(values = c("dodgerblue","darkorange"))
g <- g + labs(y = "mean runtime (s)")
print(g)