dirkschumacher / rcppglm

GLM implementation in c++ with rcpp and armadillo
GNU Lesser General Public License v2.1
5 stars 2 forks source link

Error Expecting an external pointer: [type=list]. #1

Open Vanderser opened 1 year ago

Vanderser commented 1 year ago

Hi! first of all thanks for the great support of doing glm in Rcpp!

I am using the rcppglm to modify MASS:polr to enhance the speeds over a normal GLM, but i encountered an error like above in my attempts, here is the full code:

original polr

polr <- function (formula, data, weights, start, ..., subset, na.action, 
    contrasts = NULL, Hess = FALSE, model = TRUE, method = c("logistic", 
        "probit", "loglog", "cloglog", "cauchit")) 
{
    m <- match.call(expand.dots = FALSE)
    method <- match.arg(method)
    if (is.matrix(eval.parent(m$data))) 
        m$data <- as.data.frame(data)
    m$start <- m$Hess <- m$method <- m$model <- m$... <- NULL
    m[[1L]] <- quote(stats::model.frame)
    m <- eval.parent(m)
    Terms <- attr(m, "terms")
    x <- model.matrix(Terms, m, contrasts)
    xint <- match("(Intercept)", colnames(x), nomatch = 0L)
    n <- nrow(x)
    pc <- ncol(x)
    cons <- attr(x, "contrasts")
    if (xint > 0L) {
        x <- x[, -xint, drop = FALSE]
        pc <- pc - 1L
    }
    else warning("an intercept is needed and assumed")
    wt <- model.weights(m)
    if (!length(wt)) 
        wt <- rep(1, n)
    offset <- model.offset(m)
    if (length(offset) <= 1L) 
        offset <- rep(0, n)
    y <- model.response(m)
    if (!is.factor(y)) 
        stop("response must be a factor")
    lev <- levels(y)
    llev <- length(lev)
    if (llev <= 2L) 
        stop("response must have 3 or more levels")
    y <- unclass(y)
    q <- llev - 1L
    if (missing(start)) {
        q1 <- llev%/%2L
        y1 <- (y > q1)
        X <- cbind(Intercept = rep(1, n), x)
        fit <- switch(method, logistic = glm.fit(X, y1, wt, family = binomial(), 
            offset = offset), probit = glm.fit(X, y1, wt, family = binomial("probit"), 
            offset = offset), loglog = glm.fit(X, y1, wt, family = binomial("probit"), 
            offset = offset), cloglog = glm.fit(X, y1, wt, family = binomial("probit"), 
            offset = offset), cauchit = glm.fit(X, y1, wt, family = binomial("cauchit"), 
            offset = offset))
        if (!fit$converged) 
            stop("attempt to find suitable starting values failed")
        coefs <- fit$coefficients
        if (any(is.na(coefs))) {
            warning("design appears to be rank-deficient, so dropping some coefs")
            keep <- names(coefs)[!is.na(coefs)]
            coefs <- coefs[keep]
            x <- x[, keep[-1L], drop = FALSE]
            pc <- ncol(x)
        }
        logit <- function(p) log(p/(1 - p))
        spacing <- logit((1L:q)/(q + 1L))
        if (method != "logistic") 
            spacing <- spacing/1.7
        gammas <- -coefs[1L] + spacing - spacing[q1]
        start <- c(coefs[-1L], gammas)
    }
    else if (length(start) != pc + q) 
        stop("'start' is not of the correct length")
    ans <- polr.fit(x, y, wt, start, offset, method, hessian = Hess, 
        ...)
    beta <- ans$coefficients
    zeta <- ans$zeta
    deviance <- ans$deviance
    res <- ans$res
    niter <- c(f.evals = res$counts[1L], g.evals = res$counts[2L])
    eta <- if (pc) 
        offset + drop(x %*% beta)
    else offset + rep(0, n)
    pfun <- switch(method, logistic = plogis, probit = pnorm, 
        loglog = pgumbel, cloglog = pGumbel, cauchit = pcauchy)
    cumpr <- matrix(pfun(matrix(zeta, n, q, byrow = TRUE) - eta), 
        , q)
    fitted <- t(apply(cumpr, 1L, function(x) diff(c(0, x, 1))))
    dimnames(fitted) <- list(row.names(m), lev)
    fit <- list(coefficients = beta, zeta = zeta, deviance = deviance, 
        fitted.values = fitted, lev = lev, terms = Terms, df.residual = sum(wt) - 
            pc - q, edf = pc + q, n = sum(wt), nobs = sum(wt), 
        call = match.call(), method = method, convergence = res$convergence, 
        niter = niter, lp = eta)
    if (Hess) {
        dn <- c(names(beta), names(zeta))
        H <- res$hessian
        dimnames(H) <- list(dn, dn)
        fit$Hessian <- H
    }
    if (model) 
        fit$model <- m
    fit$na.action <- attr(m, "na.action")
    fit$contrasts <- cons
    fit$xlevels <- .getXlevels(Terms, m)
    class(fit) <- "polr"
    fit
}
<bytecode: 0x000000002263b7f8>
<environment: namespace:MASS>

modified polr (adding the rcppglm in fit section)

polr_cpp <- function(formula, data, weights, start, ..., subset, na.action, 
                      contrasts = NULL, Hess = FALSE, model = TRUE, 
                      method = c("logistic", "probit", "loglog", "cloglog", "cauchit")){
  library(Rcpp)
  library(RcppGLM)
  library(RcppArmadillo)
  cppFunction('NumericMatrix mmult(const NumericMatrix& m1, const NumericMatrix& m2){
  if (m1.ncol() != m2.nrow()) stop ("Incompatible matrix dimensions");
  NumericMatrix out(m1.nrow(),m2.ncol());
  NumericVector rm1, cm2;
  for (size_t i = 0; i < m1.nrow(); ++i) {
      rm1 = m1(i,_);
      for (size_t j = 0; j < m2.ncol(); ++j) {
        cm2 = m2(_,j);
        out(i,j) = std::inner_product(rm1.begin(), rm1.end(), cm2.begin(), 0.);              
      }
    }
  return out;
  }')

  polr.fit.enhanced <- function(x, y, wt, start, offset, method, ...){
    fmin <- function(beta){
      theta <- beta[pc + ind_q]
      gamm <- c(-Inf , cumsum(c(theta[1L], exp(theta[-1L]))), Inf)
      eta <- offset
      if (pc) eta <- eta + drop(x %*% beta[ind_pc])
      pr <- pfun(pmin(100, gamm[y + 1] - eta)) -
        pfun(pmax(-100, gamm[y] - eta))
      if (all(pr > 0)) -sum(wt * log(pr)) else Inf
    }

    gmin <- function(beta){
      jacobian <- function(theta) { ## dgamma by dtheta matrix
        k <- length(theta)
        etheta <- exp(theta)
        mat <- matrix(0 , k, k)
        mat[, 1L] <- rep(1, k)
        for (i in 2L:k) mat[i:k, i] <- etheta[i]
        mat
      }
      theta <- beta[pc + ind_q]
      gamm <- c(-Inf, cumsum(c(theta[1L], exp(theta[-1L]))), Inf)
      eta <- offset
      if(pc) eta <- eta + drop(x %*% beta[ind_pc])
      z1 <- pmin(100, gamm[y+1L] - eta)
      z2 <- pmax(-100, gamm[y] - eta)
      pr <- pfun(z1) - pfun(z2)
      p1 <- dfun(z1); p2 <- dfun(z2)
      g1 <- if(pc) t(x) %*% (wt*(p1 - p2)/pr) else numeric()
      xx <- .polrY1*p1 - .polrY2*p2
      g2 <- - t(xx) %*% (wt/pr)
      g2 <- t(g2) %*% jacobian(theta)
      if(all(pr > 0)) c(g1, g2) else rep(NA_real_, pc+q)
    }

    pfun <- switch(method, logistic = plogis, probit = pnorm,
                   loglog = pgumbel, cloglog = pGumbel, cauchit = pcauchy)
    dfun <- switch(method, logistic = dlogis, probit = dnorm,
                   loglog = dgumbel, cloglog = dGumbel, cauchit = dcauchy)
    n <- nrow(x)
    pc <- ncol(x)
    ind_pc <- seq_len(pc)
    lev <- levels(y)
    if(length(lev) <= 2L) stop("response must have 3 or more levels")
    y <- unclass(y)
    q <- length(lev) - 1L
    ind_q <- seq_len(q)
    Y <- matrix(0, n, q)
    .polrY1 <- col(Y) == y; .polrY2 <- col(Y) == (y - 1L)

    # pc could be 0.
    s0 <- if(pc) c(start[seq_len(pc+1L)], log(diff(start[-seq_len(pc)])))
    else c(start[1L], log(diff(start)))

    library(optimParallel)
    res <- optimParallel(s0, fmin, gmin, method="BFGS", ...)
    beta <- res$par[seq_len(pc)]
    theta <- res$par[pc + ind_q]
    zeta <- cumsum(c(theta[1L], exp(theta[-1L])))
    deviance <- 2 * res$value
    names(zeta) <- paste(lev[-length(lev)], lev[-1L], sep="|")
    if(pc) names(beta) <- colnames(x)
    list(coefficients = beta, zeta = zeta, deviance = deviance, res = res)
  }

  m <- match.call(expand.dots = FALSE)
  method <- match.arg(method)
  if (is.matrix(eval.parent(m$data))) 
    m$data <- as.data.frame(data)
  m$start <- m$Hess <- m$method <- m$model <- m$... <- NULL
  m[[1L]] <- quote(stats::model.frame)
  m <- eval.parent(m)
  Terms <- attr(m, "terms")
  x <- model.matrix(Terms, m, contrasts)
  xint <- match("(Intercept)", colnames(x), nomatch = 0L)
  n <- nrow(x)
  pc <- ncol(x)
  cons <- attr(x, "contrasts")
  if (xint > 0L) {
    x <- x[, -xint, drop = FALSE]
    pc <- pc - 1L
  }
  else warning("an intercept is needed and assumed")
  wt <- model.weights(m)
  if (!length(wt)) 
    wt <- rep(1, n)
  offset <- model.offset(m)
  if (length(offset) <= 1L) 
    offset <- rep(0, n)
  y <- model.response(m)
  if (!is.factor(y)) 
    stop("response must be a factor")
  lev <- levels(y)
  llev <- length(lev)
  if (llev <= 2L) 
    stop("response must have 3 or more levels")
  y <- unclass(y)
  q <- llev - 1L
  if (missing(start)) {
    q1 <- llev%/%2L
    y1 <- (y > q1)
    X <- cbind(Intercept = rep(1, n), x)
    fit <- switch(method, logistic = glm_fit(X, y1, family = binomial()), 
                  probit = glm_fit(X, y1, family = binomial("probit")), 
                  loglog = glm_fit(X, y1, family = binomial("probit")), 
                  cloglog = glm_fit(X, y1, family = binomial("probit")), 
                  cauchit = glm_fit(X, y1, family = binomial("cauchit")))

    if (!fit$converged) 
      stop("attempt to find suitable starting values failed")
    coefs <- fit$coefficients
    if (any(is.na(coefs))) {
      warning("design appears to be rank-deficient, so dropping some coefs")
      keep <- names(coefs)[!is.na(coefs)]
      coefs <- coefs[keep]
      x <- x[, keep[-1L], drop = FALSE]
      pc <- ncol(x)
    }
    logit <- function(p) log(p/(1 - p))
    spacing <- logit((1L:q)/(q + 1L))
    if (method != "logistic") 
      spacing <- spacing/1.7
    gammas <- -coefs[1L] + spacing - spacing[q1]
    start <- c(coefs[-1L], gammas)
  }
  else if (length(start) != pc + q) 
    stop("'start' is not of the correct length")

  ans <- polr.fit.enhanced(x, y, wt, start, offset, method, hessian = Hess, ...)
  beta <- ans$coefficients
  zeta <- ans$zeta
  deviance <- ans$deviance
  res <- ans$res
  niter <- c(f.evals = res$counts[1L], g.evals = res$counts[2L])
  eta <- if (pc){
    beta <- matrix(beta, nrow = length(beta), ncol = 1)
    offset + mmult(x, beta)
    offset <- as.numeric(offset)
    names(offset) <- 1:length(offset)
  }
  else offset + rep(0, n)

  return(eta)

  pfun <- switch(method, logistic = plogis, probit = pnorm, 
                 loglog = pgumbel, cloglog = pGumbel, cauchit = pcauchy)
  cumpr <- matrix(pfun(matrix(zeta, n, q, byrow = TRUE) - eta), , q)
  fitted <- t(apply(cumpr, 1L, function(x) diff(c(0, x, 1))))
  dimnames(fitted) <- list(row.names(m), lev)
  fit <- list(coefficients = beta, zeta = zeta, deviance = deviance, 
              fitted.values = fitted, lev = lev, terms = Terms, df.residual = sum(wt) - 
                pc - q, edf = pc + q, n = sum(wt), nobs = sum(wt), 
              call = match.call(), method = method, convergence = res$convergence, 
              niter = niter, lp = eta)
  if (Hess) {
    dn <- c(names(beta), names(zeta))
    H <- res$hessian
    dimnames(H) <- list(dn, dn)
    fit$Hessian <- H
  }
  if (model) 
    fit$model <- m

  fit$na.action <- attr(m, "na.action")
  fit$contrasts <- cons
  fit$xlevels <- .getXlevels(Terms, m)
  class(fit) <- "polr"
  fit
}
Vanderser commented 1 year ago

I use this custom dataset to reproduce the issues: https://file.io/ZzYBx4csNcMw

custom_dataset <- read.csv(file.choose()) custom_dataset$LH_info <- factor(custom_dataset$LH_info, order = TRUE, levels=c("Common Low High (<50)","Uncommon Low High (50-80)", "Extreme Low High (>80)")) test <- polr(LH_info ~ ., data=custom_dataset) #original test2 <- polr_cpp(LH_info ~ ., data=custom_dataset) #modified