simsem / semTools

Useful tools for structural equation modeling
75 stars 36 forks source link

Function for comparing coefficients #24

Open mronkko opened 7 years ago

mronkko commented 7 years ago

semTools currently includes function compareFit for comparing fit indices between multiple models. Users may also be interested in comparing coefficients across different models. I created a function for that. Please let me know whether you think that the following would be a useful addition to the package. Parts of the code have been copied from the compareFit function and the print function of the lavaan summary method.

require(plyr)

compareEstimates <- function (..., 
                              ci           = FALSE,
                              fmi          = FALSE,
                              standardized = FALSE,
                              rsquare      = FALSE,
                              std.nox      = FALSE,
                              nd = 3L) 
{
  arg <- match.call()
  mods <- list(...)
  if (any(sapply(mods, is, "list"))) {
    temp <- list()
    for (i in seq_along(mods)) {
      if (!is(mods[[i]], "list")) {
        temp <- c(temp, list(mods[[i]]))
      }
      else {
        temp <- c(temp, mods[[i]])
      }
    }
    mods <- temp
  }
  if (any(!sapply(mods, is, "lavaan"))) 
    stop("Some models specified here are not lavaan outputs or list of lavaan outputs")

  PElist <- lapply(mods,function(object){

    PE <- parameterEstimates(object, ci = ci, standardized = standardized,
                             rsquare = rsquare, fmi = fmi,
                             remove.eq = FALSE, remove.system.eq = TRUE,
                             remove.ineq = FALSE, remove.def = FALSE,
                             add.attributes = TRUE)
    if(standardized && std.nox) {
      PE$std.all <- PE$std.nox
    }
    PE

  })

  PEframe <- join_all(PElist, by = c("lhs","op","rhs","exo"), match = "first") 
  class(PEframe) <- c("CoefDiff","data.frame")

  PEframe
}

#' @param cols which columns to print. Defaults to \code{NULL} for all columns

print.CoefDiff <- function(x, ..., nd = 3L, cols = NULL) {

  # format for numeric values
  num.format  <- paste("%", max(8, nd + 5), ".", nd, "f", sep = "")
  char.format <- paste("%", max(8, nd + 5), "s", sep="") 

  # output sections
  GSECTIONS <- c("Latent Variables", 
                 "Composites", 
                 "Regressions", 
                 "Covariances",
                 "Intercepts", 
                 "Thresholds", 
                 "Variances", 
                 "Scales y*",
                 "Group Weight",
                 "R-Square")
  ASECTIONS <- c("Defined Parameters", 
                 "Constraints")

  cat("\nParameter Estimates:\n\n")

  # number of groups
  if(is.null(x$group)) {
    ngroups <- 1L
    x$group <- rep(1L, length(x$lhs))
  } else {
    ngroups <- lav_partable_ngroups(x)
  }

  # number of levels
  if(is.null(x$level)) {
    nlevels <- 1L
    x$level <- rep(1L, length(x$lhs))
  } else {
    nlevels <- lav_partable_nlevels(x)
  }

  # block column
  if(is.null(x$block)) {
    x$block <- rep(1L, length(x$lhs))
  }

  # round to 3 digits after the decimal point
  y <- as.data.frame(
    lapply(x, function(x) {
      if(is.numeric(x)) {
        sprintf(num.format, x)   
      } else {
        x
      }
    }),
    stringsAsFactors = FALSE, optional = TRUE)

  # always remove /block/level/group/op/rhs/label/exo columns
  y$op <- y$group <- y$rhs <- y$label <- y$exo <- NULL
  y$block <- y$level <- NULL

  # if standardized, remove std.nox column (space reasons only)
  y$std.nox <- NULL

  # convert to character matrix
  m <- as.matrix(format.data.frame(y, na.encode = FALSE, 
                                   justify = "right"))

  # use empty row names
  rownames(m) <- rep("", nrow(m))

  # missing and zero parameter values, one model at at time

  xStartIndices <- which(colnames(x)=="est")
  xEndIndices <- c(xStartIndices[-1]-1, length(colnames(x)))
  mStartIndices <- which(colnames(m)=="est")
  mEndIndices <- c(mStartIndices[-1]-1, length(colnames(m)))

  for(model in 1:length(xStartIndices)){

    xblock <- xStartIndices[model]:xEndIndices[model]
    mblock <- mStartIndices[model]:mEndIndices[model]

    # handle missing estimates
    m[is.na(x[,xblock]$est),mblock] <- ""

    # handle se == 0.0
    if(!is.null(x[,xblock]$se)) {
      se.idx <- which(x[,xblock]$se == 0)
      if(length(se.idx) > 0L) {
        m[,mblock][se.idx, "se"] <- ""
        if(!is.null(x[,xblock]$z)) {
          m[,mblock][se.idx, "z"] <- ""
        }
        if(!is.null(x[,xblock]$pvalue)) {
          m[,mblock][se.idx, "pvalue"] <- ""
        }
      }

      # handle se == NA
      se.idx <- which(is.na(x[,xblock]$se))
      if(length(se.idx) > 0L) {
        if(!is.null(x[,xblock]$z)) {
          m[,mblock][se.idx, "z"] <- ""
        }
        if(!is.null(x[,xblock]$pvalue)) {
          m[,mblock][se.idx, "pvalue"] <- ""
        }
      }
    }

    # handle fmi
    if(!is.null(x[,xblock]$fmi)) {
      se.idx <- which(x[,xblock]$se == 0)
      if(length(se.idx) > 0L) {
        m[,mblock][se.idx, "fmi"] <- ""
      }

      not.idx <- which(x$op %in% c(":=", "<", ">", "=="))
      if(length(not.idx) > 0L) {
        if(!is.null(x[,xblock]$fmi)) {
          m[,mblock][not.idx, "fmi"] <- ""
        }
      }
    }

    # for blavaan, handle Post.SD and PSRF
    if(!is.null(x[,xblock]$Post.SD)) {
      se.idx <- which(x[,xblock]$Post.SD == 0)
      if(length(se.idx) > 0L) {
        m[,mblock][se.idx, "Post.SD"] <- ""
        if(!is.null(x[,xblock]$psrf)) {
          m[,mblock][se.idx, "psrf"] <- ""
        }
        if(!is.null(x[,xblock]$PSRF)) {
          m[,mblock][se.idx, "PSRF"] <- ""
        }
      }

      # handle psrf for defined parameters
      not.idx <- which(x$op %in% c(":=", "<", ">", "=="))
      if(length(not.idx) > 0L) {
        if(!is.null(x[,xblock]$psrf)) {
          m[,mblock][not.idx, "psrf"] <- ""
        }
        if(!is.null(x[,xblock]$PSRF)) {
          m[,mblock][not.idx, "PSRF"] <- ""
        }
      }
    }
  }

  # Print only subset of columns if the cols argument is not null
  if(!is.null(cols)){
    m <- m[,colnames(m) %in% c("lhs",cols)]
  }

  # rename some column names
  colnames(m)[ colnames(m) ==    "lhs" ] <- ""
  colnames(m)[ colnames(m) ==     "op" ] <- ""
  colnames(m)[ colnames(m) ==    "rhs" ] <- ""
  colnames(m)[ colnames(m) ==    "est" ] <- "Estimate"
  colnames(m)[ colnames(m) ==     "se" ] <- "Std.Err"
  colnames(m)[ colnames(m) ==      "z" ] <- "z-value"
  colnames(m)[ colnames(m) == "pvalue" ] <- "P(>|z|)"
  colnames(m)[ colnames(m) == "std.lv" ] <- "Std.lv"
  colnames(m)[ colnames(m) == "std.all"] <- "Std.all"
  colnames(m)[ colnames(m) == "std.nox"] <- "Std.nox"
  colnames(m)[ colnames(m) == "prior"  ] <- "Prior"
  colnames(m)[ colnames(m) == "fmi"    ] <- "FMI"

  # format column names
  colnames(m) <- sprintf(char.format, colnames(m))

  # exceptions for blavaan: Post.Mean (width = 9), Prior (width = 14)
  if(!is.null(x$Post.Mean)) {
    tmp <- gsub("[ \t]+", "", colnames(m), perl=TRUE)

    # reformat "Post.Mean" column
    col.idx <- which(tmp == "Post.Mean")
    if(length(col.idx) > 0L) {
      tmp.format <- paste("%", max(9, nd + 5), "s", sep="")
      colnames(m)[col.idx] <- sprintf(tmp.format, colnames(m)[col.idx])
      m[,col.idx] <- sprintf(tmp.format, m[,col.idx])
    }

    # reformat "Prior" column
    col.idx <- which(tmp == "Prior")
    if(length(col.idx) > 0L) {
      MAX <- max( nchar( m[,col.idx] ) ) + 1L
      tmp.format <- paste("%", max(MAX, nd + 5), "s", sep="")
      colnames(m)[col.idx] <- sprintf(tmp.format, colnames(m)[col.idx])
      m[,col.idx] <- sprintf(tmp.format, m[,col.idx])
    }
  }

  b <- 0L
  # group-specific sections
  for(g in 1:ngroups) {

    # group header
    if(ngroups > 1L) {
      group.label <- attr(x, "group.label")
      cat("\n\n")
      cat("Group ", g, " [", group.label[g], "]:\n", sep="")
    }

    for(l in 1:nlevels) {

      # block number
      b <- b + 1L

      # ov/lv names
      ov.names <- lavNames(x, "ov", block = b)
      lv.names <- lavNames(x, "lv", block = b)

      # level header
      if(nlevels > 1L) {
        level.label <- attr(x, "level.label")
        cat("\n\n")
        cat("Level ", l, " [", level.label[l], "]:\n", sep="")
      }

      # group-specific sections
      for(s in GSECTIONS) {
        if(s == "Latent Variables") {
          row.idx <- which( x$op == "=~" & !x$lhs %in% ov.names & 
                              x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else if(s == "Composites") {
          row.idx <- which( x$op == "<~" & x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else if(s == "Regressions") {
          row.idx <- which( x$op == "~" & x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else if(s == "Covariances") {
          row.idx <- which(x$op == "~~" & x$lhs != x$rhs & !x$exo &
                             x$block == b)
          if(length(row.idx) == 0L) next
          # make distinction between residual and plain
          y.names <- unique( c(lavNames(x, "eqs.y"),
                               lavNames(x, "ov.ind")) )
          PREFIX <- rep("", length(row.idx))
          PREFIX[ x$rhs[row.idx] %in% y.names ] <- "  ."
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx],
                                              PREFIX = PREFIX)
          #m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else if(s == "Intercepts") {
          row.idx <- which(x$op == "~1" & !x$exo & x$block == b)
          if(length(row.idx) == 0L) next
          # make distinction between intercepts and means
          y.names <- unique( c(lavNames(x, "eqs.y"),
                               lavNames(x, "ov.ind")) )
          PREFIX <- rep("", length(row.idx))
          PREFIX[ x$lhs[row.idx] %in% y.names ] <- "  ."
          m[row.idx,1] <- lavaan:::.makeNames(x$lhs[row.idx], x$label[row.idx],
                                              PREFIX = PREFIX)
          #m[row.idx,1] <- lavaan:::.makeNames(x$lhs[row.idx], x$label[row.idx])
        } else if(s == "Thresholds") {
          row.idx <- which(x$op == "|" & x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(paste(x$lhs[row.idx], "|", 
                                                    x$rhs[row.idx], sep=""), x$label[row.idx])
        } else if(s == "Variances") {
          row.idx <- which(x$op == "~~" & x$lhs == x$rhs & !x$exo &
                             x$block == b)
          if(length(row.idx) == 0L) next
          # make distinction between residual and plain
          y.names <- unique( c(lavNames(x, "eqs.y"),
                               lavNames(x, "ov.ind")) )
          PREFIX <- rep("", length(row.idx))
          PREFIX[ x$rhs[row.idx] %in% y.names ] <- "  ."
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx],
                                              PREFIX = PREFIX)
        } else if(s == "Scales y*") {
          row.idx <- which(x$op == "~*~" & x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else if(s == "Group Weight") {
          row.idx <- which(x$lhs == "group" & x$op == "%" & x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else if(s == "R-Square") {
          row.idx <- which(x$op == "r2" & x$block == b)
          if(length(row.idx) == 0L) next
          m[row.idx,1] <- lavaan:::.makeNames(x$rhs[row.idx], x$label[row.idx])
        } else {
          row.idx <- integer(0L)
        }

        # do we need special formatting for this section?
        # three types:
        #  - regular (nothing to do, except row/colnames)
        #  - R-square
        #  - Latent Variables (and Composites), Regressions and Covariances
        #    'bundle' the output per lhs element

        # bundling
        if(s %in% c("Latent Variables", "Composites", 
                    "Regressions", "Covariances")) {
          nel <- length(row.idx)
          M <- matrix("", nrow = nel*2, ncol = ncol(m))
          colnames(M) <- colnames(m)
          rownames(M) <- rep("", NROW(M))
          #colnames(M)[1] <- sprintf("%-17s", paste(s, ":", sep = ""))
          LHS <- paste(x$lhs[row.idx], x$op[row.idx])
          lhs.idx <- seq(1, nel*2L, 2L)
          rhs.idx <- seq(1, nel*2L, 2L) + 1L
          if(s == "Covariances") {
            # make distinction between residual and plain
            y.names <- unique( c(lavNames(x, "eqs.y"),
                                 lavNames(x, "ov.ind")) )
            PREFIX <- rep("", length(row.idx))
            PREFIX[ x$lhs[row.idx] %in% y.names ] <- "."
          } else {
            PREFIX <- rep("", length(LHS))
          }
          M[lhs.idx, 1] <- sprintf("%1s%-15s", PREFIX, LHS)
          M[rhs.idx,  ] <- m[row.idx,]
          # avoid duplicated LHS labels
          if(nel > 1L) {
            del.idx <- integer(0)
            old.lhs <- ""
            for(i in 1:nel) {
              if(LHS[i] == old.lhs) {
                del.idx <- c(del.idx, lhs.idx[i])
              }
              old.lhs <- LHS[i]
            }
            if(length(del.idx) > 0L) {
              M <- M[-del.idx,,drop=FALSE]
            }
          }
          cat("\n", s, ":\n", sep = "")
          #cat("\n")
          print(M, quote = FALSE)
          # R-square
        } else if(s == "R-Square") {
          M <- m[row.idx,1:2,drop=FALSE]
          colnames(M) <- colnames(m)[1:2]
          rownames(M) <- rep("", NROW(M))
          #colnames(M)[1] <- sprintf("%-17s", paste(s, ":", sep = ""))
          cat("\n", s, ":\n", sep = "")
          #cat("\n")
          print(M, quote = FALSE)

          # Regular
        } else {
          #M <- rbind(matrix("", nrow = 1L, ncol = ncol(m)),
          #           m[row.idx,])
          M <- m[row.idx,,drop=FALSE]
          colnames(M) <- colnames(m)
          rownames(M) <- rep("", NROW(M))
          #colnames(M)[1] <- sprintf("%-17s", paste(s, ":", sep = ""))
          cat("\n", s, ":\n", sep = "")
          #cat("\n")
          print(M, quote = FALSE)
        }
      } # GSECTIONS

    } # levels

  } # groups

  # asections
  for(s in ASECTIONS) {
    if(s == "Defined Parameters") {
      row.idx <- which(x$op == ":=")
      m[row.idx,1] <- lavaan:::.makeNames(x$lhs[row.idx], "")
      M <- m[row.idx,,drop=FALSE]
      colnames(M) <- colnames(m)
    } else if(s == "Constraints") {
      row.idx <- which(x$op %in% c("==", "<", ">"))
      if(length(row.idx) == 0) next
      m[row.idx,1] <- .makeConNames(x$lhs[row.idx], x$op[row.idx], 
                                    x$rhs[row.idx], nd = nd)
      m[row.idx,2] <- sprintf(num.format, abs(x$est[row.idx]))
      M <- m[row.idx,1:2,drop=FALSE]
      colnames(M) <- c("", sprintf(char.format, "|Slack|"))
    } else {
      row.idx <- integer(0L)
    }

    if(length(row.idx) == 0L) {
      next
    }

    rownames(M) <- rep("", NROW(M))
    #colnames(M)[1] <- sprintf("%-17s", paste(s, ":", sep = ""))
    #cat("\n")
    cat("\n", s, ":\n", sep = "")
    print(M, quote = FALSE)
  }
  cat("\n")

  invisible(m)
}
TDJorgensen commented 6 years ago

Hi Mikko,

Thanks for the suggestion! Sorry for the delay, but I have not had much time for programming in the past year (new daughter, job pressures, etc.). I would like to include this in the next version, and I think it can share a documentation page with compareFit(). Does it still work with the latest development version of lavaan?

It would help me out if you could add some roxygen2 comments to indicate which functions are imported that aren't in the base package, as well as defining arguments that are not already part of compareFit(). For example, you use is() from the methods package, which can be indicated with:

#' @import methods is

And to share the documentation page, you can use this comment before @export:

#' @rdname compareFit

If you are unfamiliar with roxygen2, here is a good tutorial: http://r-pkgs.had.co.nz/man.html And you can see examples in my source code: https://github.com/simsem/semTools/blob/master/semTools/R/clipboard.R

If you need me to handle the documentation part, I don't think I would have time to do so before the next version goes to CRAN (hopefully end of April, if Yves can get lavaan out by then).

If you can add this to the https://github.com/simsem/semTools/blob/master/semTools/R/compareFit.R file and create a pull request, that works too. But you can also just post the source-code file here and I can add it myself. You can also add an @example application for the help page to the https://github.com/simsem/semTools/blob/master/semTools/R/compareFit.R file, or I can add one myself.

Thanks!

mronkko commented 6 years ago

Hi Terry,

I can relate to your situation because it is quite similar to mine ;) I will try to check this over the easter weekend or the week after easter.

I have experience using oxygen2 and can add documentation. I agree that the two functions should probably reside in the same file and share the documentation page. Also, I would like to refactor the code a bit because the function is very long and there is some duplicate code.

Would it make sense to have one function (e.g. compareModels) that compares coefficients and indices, and then just two wrappers (compareFit and compareEstimates) that call this one function to print just the coefficients or indices depending on which function is called?

Should I do this as a pull request?

Mikko

TDJorgensen commented 6 years ago

I would like to refactor the code a bit because the function is very long and there is some duplicate code.

Great!

Would it make sense to have one function (e.g. compareModels) that compares coefficients and indices, and then just two wrappers (compareFit and compareEstimates) that call this one function to print just the coefficients or indices depending on which function is called?

I don't think so. Then all the same work is being done, regardless of which wrapper is called. I like the 2 functions separate. But perhaps compareModels() could be a wrapper that calls both compareFit() and compareEstimates().

Should I do this as a pull request?

Yes, please -- whenever it is ready. That way I can see github's automatic check for conflicts with my current version.

Enjoy your holiday weekend! Terrence

sfcheung commented 3 years ago

This function will be a very useful companion to compareFit()! I tried it myself and it worked very well. It can also serve as a pedagogical tool useful for students to see how parameter estimates change as the model specification changes.

May I suggest a few minor changes to compareEstimates()?

Add

mod.names <- sapply(substitute(list(...))[-1], deparse)

to capture the names of the fit objects (as in compareFit()) (expand the names if necessary if any of fit object is a list) and then store the names as an attribute to the output:

attr(PEframe, "mod.names") <- mod.names

print.CoefDiff() can then retrieve these names

mod.names <- attr(x, "mod.names")

and add them as prefixes (or suffixes). E.g.,

colnames(m)[ colnames(m) ==    "est" ] <- paste0(mod.names, ":", "Estimate")
colnames(m)[ colnames(m) ==     "se" ] <- paste0(mod.names, ":", "Std.Err")
colnames(m)[ colnames(m) ==      "z" ] <- paste0(mod.names, ":", "z-value")

I tried these changes based on the version posted by @mronkko and this is the sample output:

Parameter Estimates:

Latent Variables:
                   fit1:Estimate fit2:Estimate
  visual =~
    x1                1.000         1.000
    x2                0.554         0.527
    x3                0.729         0.647

The following changes may also help to merge parameter estimates whether the models have more than one group (e.g., the measure invariance example in compareFit())

  by_names <- c("lhs","op","rhs","exo")
  if (any(sapply(PElist, function(x) "block" %in% colnames(x)))) {
      by_names <- c(by_names, "block")
    }
  if (any(sapply(PElist, function(x) "group" %in% colnames(x)))) {
      by_names <- c(by_names, "group")
    }
  PEframe <- join_all(PElist, by = by_names, match = "first")

I did not post here the modified code I tired because I am not sure whether the version in the first post of this issue is the latest version. (And I am also not the author so it may not be OK for me to modify the code directly.)

I look forward to seeing this function in semTools.

P.S.: I am not familiar with join_all(). It seems that the current version does not work when the two or more models are not nested, and some parameters are only in one model while some other parameters are only in another model. E.g.,

# textual =~ x8 only in A
HS.model.A <- ' visual  =~ x1 + x2 + x3
              textual =~ x4 + x5 + x6 + x8
              speed   =~ x7 + x8 + x9 '
# visual =~ x9 only in B
HS.model.B <- ' visual  =~ x1 + x2 + x3 + x9
              textual =~ x4 + x5 + x6
              speed   =~ x7 + x8 + x9 '

I tried type = "full" but it only uses one copy of the columns in the ouput. The default, type = "left", will only include parameters in the first model, i.e., visual =~ x9 will be excluded if the fit of HS.model.B is entered as the second object.