amices / ggmice

Visualize incomplete and imputed data with the R package `ggmice`
http://amices.org/ggmice
GNU General Public License v3.0
31 stars 9 forks source link

Proposal to visualize missingness proportion of categorical variable levels #60

Open rjverheijden opened 2 years ago

rjverheijden commented 2 years ago

Dear ggmice contributers, thanks a lot for the usefull package. Based on the plot_corr, I made a function to visualise the number of cases per level compared to other variables (a sort of contigency table for all desired variables in heatmap format). Especially with missing data and handling these with mice or smcfcs it might come in handy. I am curious what your thoughts are, there are for sure ways to make it more efficient. Please feel free to modify it and (if you like it) implement it in the package.

plot_groups function

With this function, you can quickly check the number op subjects per level for categorical variables and in total for continuous data. It creates a heat map in which red means little subjects per group and blue numerous subjects per group. Note that groups with 0 were set to 0.1 because of the log10 transfromation, so groups with value 0.1 represent groups with 0 subjects. Options for useNA are "no" to exclude them from the plot, "both" to include both variables and missings and "only" to only include the missings.

plot_groups <- function(data, vrb = "all", label = FALSE, square = TRUE, diagonal = FALSE, rotate = FALSE, useNA = "no") {
  if (!is.data.frame(data) & !is.matrix(data)) {
    stop("Dataset should be a 'data.frame' or 'matrix'.")
  }
  if (vrb[1] == "all") {
    vrb <- names(data)
  }

groups <- data.frame(vrb=NA, lvl=NA)
for(i in vrb){
  if("factor" %in% class(data[,i])){
    tmp <- data.frame(vrb=rep(i,times=length(levels(data[,i]))),lvl=levels(data[,i]))
  } else{
    tmp <- data.frame(vrb=i,lvl=NA)
  }
  tmp <- rbind(tmp, cbind(vrb=i, lvl="NA"))
  groups <- rbind(groups,tmp)
}
groups<-groups[-1,]

groups_compl <- data.frame(groups[rep(seq_len(nrow(groups)), each = nrow(groups)), ],
                          groups, compl=NA, x=NA, y=NA)
for(i in 1:nrow(groups_compl)){
  if(groups_compl[i,"vrb"]==groups_compl[i,"vrb.1"]){
    groups_compl[i,"compl"] <- NA
  }else if(is.na(groups_compl[i,"lvl"]) & is.na(groups_compl[i,"lvl.1"])){
    groups_compl[i,"compl"] <- nrow(data[!is.na(data[,groups_compl[i,"vrb"]]) & !is.na(data[,groups_compl[i,"vrb.1"]]),])
  }else if (is.na(groups_compl[i,"lvl"])&groups_compl[i,"lvl.1"]=="NA"){
    groups_compl[i,"compl"] <- nrow(data[!is.na(data[,groups_compl[i,"vrb"]]) & is.na(data[,groups_compl[i,"vrb.1"]]),])
  }else if (groups_compl[i,"lvl"]=="NA" & is.na(groups_compl[i,"lvl.1"])){
    groups_compl[i,"compl"] <- nrow(data[is.na(data[,groups_compl[i,"vrb"]]) & !is.na(data[,groups_compl[i,"vrb.1"]]),])
  }else if(is.na(groups_compl[i,"lvl"])){
    groups_compl[i,"compl"] <- nrow(data[!is.na(data[,groups_compl[i,"vrb"]]) & data[,groups_compl[i,"vrb.1"]]%in%groups_compl[i,"lvl.1"],])
  }else if(is.na(groups_compl[i,"lvl.1"])){
    groups_compl[i,"compl"]<- nrow(data[data[,groups_compl[i,"vrb"]]%in%groups_compl[i,"lvl"] & !is.na(data[,groups_compl[i,"vrb.1"]]),])
  }else if(groups_compl[i,"lvl"]=="NA"){
    groups_compl[i,"compl"]<- nrow(data[is.na(data[,groups_compl[i,"vrb"]]) & data[,groups_compl[i,"vrb.1"]]%in%groups_compl[i,"lvl.1"],])
  }else if(groups_compl[i,"lvl.1"]=="NA"){
    groups_compl[i,"compl"]<- nrow(data[data[,groups_compl[i,"vrb"]]%in%groups_compl[i,"lvl"] & is.na(data[,groups_compl[i,"vrb.1"]]) ,])
  }else{
    groups_compl[i,"compl"]<- nrow(data[data[,groups_compl[i,"vrb"]]%in%groups_compl[i,"lvl"] & data[,groups_compl[i,"vrb.1"]]%in%groups_compl[i,"lvl.1"],])
  }
  groups_compl[i,"y"]<-ifelse(is.na(groups_compl[i,"lvl.1"]), paste0(groups_compl[i,"vrb.1"]),paste0(groups_compl[i,"vrb.1"], " | ", groups_compl[i,"lvl.1"]))
  groups_compl[i,"x"]<-ifelse(is.na(groups_compl[i,"lvl"]), paste0(groups_compl[i,"vrb"]),paste0(groups_compl[i,"vrb"], " | ", groups_compl[i,"lvl"]))
}
groups_compl[groups_compl$compl%in%0,"compl"]<-0.1
if(useNA=="both"){
  groups_plot<-groups_compl
} else if(useNA=="only"){
  groups_plot <- groups_compl[groups_compl$lvl%in%"NA",]
} else{
  groups_plot <- groups_compl[(is.na(groups_compl$lvl)|groups_compl$lvl!="NA")&(is.na(groups_compl$lvl.1)|groups_compl$lvl.1!="NA"),]
}

gg <- ggplot2::ggplot(groups_plot, ggplot2::aes(x = .data$x, y = .data$y, label = .data$compl, fill = .data$compl)) +
        ggplot2::geom_tile(color = "black") +
        ggplot2::scale_x_discrete(limits = unique(groups_plot$x), position = "top") +
        ggplot2::scale_y_discrete(limits = rev(unique(groups_plot$y))) +
        # ggplot2::scale_fill_viridis_c("number\nof\ncases", option = "viridis", na.value = "white", direction = -1, trans = "log10")+
        # ggplot2::scale_fill_viridis_b("number\nof\ncases", option = "viridis", na.value = "white", direction = -1, trans = "log10", breaks = c(10,100,1000))+
        ggplot2::scale_fill_gradient2("number\nof\ncases", low = "orangered", mid = "lightyellow", high = "deepskyblue", midpoint = log10(10), na.value = "grey50", trans = "log10") +
        ggplot2::labs(title = "Number of cases per category")

  if (label) {
    gg <- gg + ggplot2::geom_text(color = "black", show.legend = FALSE, na.rm = TRUE)
  }
  if (square) {
    gg <- gg + ggplot2::coord_fixed(expand = FALSE)
  } else {
    gg <- gg + ggplot2::coord_cartesian(expand = FALSE)
  }
  if (rotate) {
    gg <- gg + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
  }
  return(gg)
}

Example

plot_groups(mice::boys, rotate = T, useNA = "no")
plot_groups(mice::boys, rotate = T, label = TRUE, useNA = "only")
plot_groups(mice::boys, rotate = T, useNA = "both")

This function was based on the plot_cor() function of package ggmice v0.0.1 available on https://github.com/amices/ggmice which was licensed under the GNU General Public Licence v3.0.

Hence this function is also licensed under the GNU GPLv3.0.

    Copyright (C) 2022  Rik J Verheijden

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
hanneoberman commented 1 year ago

Reprex:

plot_groups <- function(data, vrb = "all", label = FALSE, square = TRUE, diagonal = FALSE, rotate = FALSE, useNA = "no") {
    if (!is.data.frame(data) & !is.matrix(data)) {
        stop("Dataset should be a 'data.frame' or 'matrix'.")
    }
    if (vrb[1] == "all") {
        vrb <- names(data)
    }

    groups <- data.frame(vrb=NA, lvl=NA)
    for(i in vrb){
        if("factor" %in% class(data[,i])){
            tmp <- data.frame(vrb=rep(i,times=length(levels(data[,i]))),lvl=levels(data[,i]))
        } else{
            tmp <- data.frame(vrb=i,lvl=NA)
        }
        tmp <- rbind(tmp, cbind(vrb=i, lvl="NA"))
        groups <- rbind(groups,tmp)
    }
    groups<-groups[-1,]

    groups_compl <- data.frame(groups[rep(seq_len(nrow(groups)), each = nrow(groups)), ],
                               groups, compl=NA, x=NA, y=NA)
    for(i in 1:nrow(groups_compl)){
        if(groups_compl[i,"vrb"]==groups_compl[i,"vrb.1"]){
            groups_compl[i,"compl"] <- NA
        }else if(is.na(groups_compl[i,"lvl"]) & is.na(groups_compl[i,"lvl.1"])){
            groups_compl[i,"compl"] <- nrow(data[!is.na(data[,groups_compl[i,"vrb"]]) & !is.na(data[,groups_compl[i,"vrb.1"]]),])
        }else if (is.na(groups_compl[i,"lvl"])&groups_compl[i,"lvl.1"]=="NA"){
            groups_compl[i,"compl"] <- nrow(data[!is.na(data[,groups_compl[i,"vrb"]]) & is.na(data[,groups_compl[i,"vrb.1"]]),])
        }else if (groups_compl[i,"lvl"]=="NA" & is.na(groups_compl[i,"lvl.1"])){
            groups_compl[i,"compl"] <- nrow(data[is.na(data[,groups_compl[i,"vrb"]]) & !is.na(data[,groups_compl[i,"vrb.1"]]),])
        }else if(is.na(groups_compl[i,"lvl"])){
            groups_compl[i,"compl"] <- nrow(data[!is.na(data[,groups_compl[i,"vrb"]]) & data[,groups_compl[i,"vrb.1"]]%in%groups_compl[i,"lvl.1"],])
        }else if(is.na(groups_compl[i,"lvl.1"])){
            groups_compl[i,"compl"]<- nrow(data[data[,groups_compl[i,"vrb"]]%in%groups_compl[i,"lvl"] & !is.na(data[,groups_compl[i,"vrb.1"]]),])
        }else if(groups_compl[i,"lvl"]=="NA"){
            groups_compl[i,"compl"]<- nrow(data[is.na(data[,groups_compl[i,"vrb"]]) & data[,groups_compl[i,"vrb.1"]]%in%groups_compl[i,"lvl.1"],])
        }else if(groups_compl[i,"lvl.1"]=="NA"){
            groups_compl[i,"compl"]<- nrow(data[data[,groups_compl[i,"vrb"]]%in%groups_compl[i,"lvl"] & is.na(data[,groups_compl[i,"vrb.1"]]) ,])
        }else{
            groups_compl[i,"compl"]<- nrow(data[data[,groups_compl[i,"vrb"]]%in%groups_compl[i,"lvl"] & data[,groups_compl[i,"vrb.1"]]%in%groups_compl[i,"lvl.1"],])
        }
        groups_compl[i,"y"]<-ifelse(is.na(groups_compl[i,"lvl.1"]), paste0(groups_compl[i,"vrb.1"]),paste0(groups_compl[i,"vrb.1"], " | ", groups_compl[i,"lvl.1"]))
        groups_compl[i,"x"]<-ifelse(is.na(groups_compl[i,"lvl"]), paste0(groups_compl[i,"vrb"]),paste0(groups_compl[i,"vrb"], " | ", groups_compl[i,"lvl"]))
    }
    groups_compl[groups_compl$compl%in%0,"compl"]<-0.1
    if(useNA=="both"){
        groups_plot<-groups_compl
    } else if(useNA=="only"){
        groups_plot <- groups_compl[groups_compl$lvl%in%"NA",]
    } else{
        groups_plot <- groups_compl[(is.na(groups_compl$lvl)|groups_compl$lvl!="NA")&(is.na(groups_compl$lvl.1)|groups_compl$lvl.1!="NA"),]
    }

    gg <- ggplot2::ggplot(groups_plot, ggplot2::aes(x = .data$x, y = .data$y, label = .data$compl, fill = .data$compl)) +
        ggplot2::geom_tile(color = "black") +
        ggplot2::scale_x_discrete(limits = unique(groups_plot$x), position = "top") +
        ggplot2::scale_y_discrete(limits = rev(unique(groups_plot$y))) +
        # ggplot2::scale_fill_viridis_c("number\nof\ncases", option = "viridis", na.value = "white", direction = -1, trans = "log10")+
        # ggplot2::scale_fill_viridis_b("number\nof\ncases", option = "viridis", na.value = "white", direction = -1, trans = "log10", breaks = c(10,100,1000))+
        ggplot2::scale_fill_gradient2("number\nof\ncases", low = "orangered", mid = "lightyellow", high = "deepskyblue", midpoint = log10(10), na.value = "grey50", trans = "log10") +
        ggplot2::labs(title = "Number of cases per category")

    if (label) {
        gg <- gg + ggplot2::geom_text(color = "black", show.legend = FALSE, na.rm = TRUE)
    }
    if (square) {
        gg <- gg + ggplot2::coord_fixed(expand = FALSE)
    } else {
        gg <- gg + ggplot2::coord_cartesian(expand = FALSE)
    }
    if (rotate) {
        gg <- gg + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
    }
    return(gg)
}
plot_groups(mice::boys, rotate = T, useNA = "no")

Created on 2023-06-27 with reprex v2.0.2

hanneoberman commented 1 year ago

Hi @rjverheijden, sorry for the late response. Thank you very much for your contribution. I'm not adding your function in the current form to the ggmice package. I will be developing a function to visualize the missingness proportion per variable (#66), in the style/color scheme of mice. If you create a pull request with this function, I can use parts of your code as inspiration to visualize categorical variables split by groups and automatically provide credit to you. Otherwise I'll just add you to the acknowledgements.

rjverheijden commented 1 year ago

Hi @hanneoberman, great that you are developing this function! I have now created a pull request (I hope). Let me know if something went wrong and feel free to deny the pull request if something went wrong and just use the function as you wish. As long as other's can make use of it, it is fine to me.

hanneoberman commented 1 year ago

Alternative: add optional argument to plot_corr etc. to use model.matrix instead of variables.