JohnsonHsieh / iNEXT

R package for interpolation and extrapolation
https://JohnsonHsieh.github.com/iNEXT
57 stars 26 forks source link

making lower-level functions of iNEXT directly accessible #17

Open FabianRoger opened 7 years ago

FabianRoger commented 7 years ago

Hej Johnson,

thanks for the iNEXT package, it has many useful features. However, I work with microbial datasets where it tends to have (very) long runtimes. A typical dataset of mine has maybe 50 to 100 samples with anywhere from 1000 to 10 000 (or more) "species".

Right now I just want to use iNEXT to plot the rarefaction curves with q = 1, but the without manually digging deep into the code I have to call the iNEXT() function for this - which calculates a whole suite of things that I don't need.

So in this example I end up doing the following in order to just calculate what I need.

RareDF_nifh <- mclapply(nifH_rar, function(x) {
    m <-  round(seq(1, ifelse(sum(x) < 15000, sum(x), 15000), length = 40))
    rar <- iNEXT:::Dqhat.Ind(x, q = 1, m)
    out <- data.frame(m = m, qD = rar)
    return(out)})

This code alone takes >10 min to run.

So here and in general it would be very useful if besides the iNEXT function, the lower level functions could be made accessible and written in such a way that they can be called as stand-alone functions. This would allow for a much more flexible usage of the package.

Would you agree?

thanks!

Fabian

jslefche commented 6 years ago

@FabianRoger you've undoubtedly solved this by now, but I ran into a similar issue. Part of the problem is the function calculates 3 orders of q and also the confidence intervals around those estimates, which vastly inflates the system downtime.

Below is a function estimateD.new that returns abundance-based estimates of diversity for only a single value of q that is specified by the user, and omits the calculation of the CIs.

#' estimateD.new = faster coverage-based rarefaction 
#' @param x a community (rows) by species (cols) matrix
#' @param q the level of q to be used for calculating diversity (default is richness)
#' @param level a user-defined level of coverage
#' @return a data.frame with the sample size, method of estimation, sample coverage, and estimated diversity
#' 
estimateD.new <- function(x, q = 0, level=NULL){

  C <- level

  if(is.null(C))

    C <- min(unlist(apply(x, 1, function(x) Chat.Ind(x, sum(x)))))

  do.call(rbind, apply(x, 1, function(x) invChat.Ind(x, q, C)))

}

#' helper functions
Chat.Ind <- function(x, m){
  x <- x[x>0]
  n <- sum(x)
  f1 <- sum(x == 1)
  f2 <- sum(x == 2)
  f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2)  #estimation of unseen species via Chao1
  A <- ifelse(f1>0, n*f0.hat/(n*f0.hat+f1), 1)
  Sub <- function(m){
    #if(m < n) out <- 1-sum(x / n * exp(lchoose(n - x, m)-lchoose(n - 1, m)))
    if(m < n) {
      xx <- x[(n-x)>=m]
      out <- 1-sum(xx / n * exp(lgamma(n-xx+1)-lgamma(n-xx-m+1)-lgamma(n)+lgamma(n-m)))
    }
    if(m == n) out <- 1-f1/n*A
    if(m > n) out <- 1-f1/n*A^(m-n+1)
    out
  }
  sapply(m, Sub)        
}

invChat.Ind <- function(x, q, C){

  m <- NULL # no visible binding for global variable 'm'

  n <- sum(x)

  refC <- Chat.Ind(x,n)

  f <- function(m, C) abs(Chat.Ind(x,m)-C)

  if(refC > C) {
    opt <- optimize(f, C=C, lower=0, upper=sum(x))
    mm <- opt$minimum
    mm <- round(mm)
  }

  if(refC <= C) {
    f1 <- sum(x==1)
    f2 <- sum(x==2)
    if(f1>0 & f2>0){A <- (n-1)*f1/((n-1)*f1+2*f2)}
    if(f1>1 & f2==0){A <- (n-1)*(f1-1)/((n-1)*(f1-1)+2)}
    if(f1==1 & f2==0){A <- 1}
    if(f1==0 & f2==0){A <- 1}
    mm <- (log(n/f1)+log(1-C))/log(A)-1
    mm <- n+mm
    mm <- round(mm)
  }

  if(mm > 2*n) warning("The maximum size of the extrapolation exceeds double reference sample size, the results for q = 0 may be subject to large prediction bias.")

  method <- ifelse(mm<n, "interpolated", ifelse(mm==n, "observed", "extrapolated"))

  out <- data.frame(m=mm,
                    method=method, 
                    coverage=round(Chat.Ind(x,mm),3),
                    diversity=round(Dqhat.Ind(x,q,mm),3)
                    )

  out

}

Dqhat.Ind <- function(x, q, m){
  x <- x[x > 0]
  n <- sum(x)

  fk.hat <- function(x, m){
    x <- x[x > 0]
    n <- sum(x)
    if(m <= n){
      Sub <- function(k)    sum(exp(lchoose(x, k) + lchoose(n - x, m -k) - lchoose(n, m)))
      sapply(1:m, Sub)
    }

    else {
      f1 <- sum(x == 1)
      f2 <- sum(x == 2)
      A <- ifelse(f2 > 0, (n-1)*f1/((n-1)*f1+2*f2), (n-1)*f1/((n-1)*f1+2))
      C.hat <- 1 - f1 / n * A
      p.hat <- x / n * C.hat            
      Sub <- function(k)    sum((choose(m, k) * p.hat^k * (1 - p.hat)^(m - k)) / (1 - (1 - p.hat)^n))
      sapply(1:m, Sub)
    }
  }

  D0.hat <- function(x, m){
    x <- x[x > 0]
    n <- sum(x)
    Sub <- function(m){
      if(m <= n){
        Fun <- function(x){
          if(x <= (n - m)) exp(lgamma(n - x + 1) + lgamma(n - m + 1) - lgamma(n - x - m + 1) - lgamma(n + 1))
          else 0
        }
        sum(1 - sapply(x, Fun))
      }
      else {
        Sobs <- sum(x > 0)
        f1 <- sum(x == 1)
        f2 <- sum(x == 2)
        f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2)    #estimation of unseen species via Chao1
        A <- n*f0.hat/(n*f0.hat+f1)
        ifelse(f1 ==0, Sobs ,Sobs + f0.hat * (1 - A ^ (m - n))) 
      }
    }
    sapply(m, Sub)
  }

  D1.hat <- function(x, m){
    x <- x[x > 0]
    n <- sum(x)
    Sub <- function(m){
      if(m < n){
        k <- 1:m
        exp(-sum(k / m * log(k / m) * fk.hat(x, m)))
      }
      else{
        #UE=sum(sapply(1:(n-1),function(k){sum(1/k*x/n*exp(lchoose(n-x,k)-lchoose(n-1,k)))}))
        UE <- sum(x/n*(digamma(n)-digamma(x)))
        f1 <- sum(x == 1)
        f2 <- sum(x == 2)
        A <- 1 - ifelse(f2 > 0, (n-1)*f1/((n-1)*f1+2*f2), (n-1)*f1/((n-1)*f1+2))
        #A=2*sum(x==2)/((n-1)*sum(x==1)+2*sum(x==2))
        B <- ifelse(A<1, sum(x==1)/n*(1-A)^(-n+1)*(-log(A)-sum(sapply(1:(n-1),function(k){1/k*(1-A)^k}))), 0)
        D.hat <- exp(UE+B)
        Dn <- exp(-sum(x / n * log(x / n)))

        a <- 1:(n-1)
        b <- 1:(n-2)
        Da <-  exp(-sum(a / (n-1) * log(a / (n-1)) * fk.hat(x, (n-1))))
        Db <-  exp(-sum(b / (n-2) * log(b / (n-2)) * fk.hat(x, (n-2))))
        Dn1 <- ifelse(Da!=Db, Dn + (Dn-Da)^2/(Da-Db), Dn)
        b <- ifelse(D.hat>Dn, (Dn1-Dn)/(D.hat-Dn), 0)
        # b <- A
        ifelse(b!=0, Dn + (D.hat-Dn)*(1-(1-b)^(m-n)), Dn)
      }
    }
    sapply(m, Sub)
  }

  D2.hat <- function(x, m){
    Sub <- function(m) 1 / (1 / m + (1 - 1 / m) * sum(x * (x - 1) / n / (n - 1)))
    sapply(m, Sub)
  }

  Dq.hat <- function(x, m){
    Sub <- function(m){
      k <- 1:m
      sum( (k / m)^q * fk.hat(x, m))^(1 / (1 - q))
    }
    sapply(m, Sub)
  }
  if(q == 0) D0.hat(x, m)
  else if(q == 1) D1.hat(x, m)
  else if(q == 2) D2.hat(x, m)
  else Dq.hat(x, m)
}
FabianRoger commented 3 years ago

@jslefche I had, but just came back here because now I need it again :-)

Thanks!