Discrepancy between including factors as covariates directly vs hard-coding dummy variables

jmchan88 commented 3 years ago

Dear MAST team,

I encountered unexpected behavior when I was including a covariate (tissue) to adjust in my formula.

suppressMessages(library(MAST))
suppressMessages(library(data.table))
suppressMessages(require("Matrix"))
suppressMessages(library(stringr))

args <- commandArgs(trailingOnly = TRUE)

# command line arguments
mtx_file <- args[1] 
g_file <- args[2] 
bc_file <- args[3] 
obs_file <- args[4] 
clust1 = args[5]
clust2 = args[6]
out_dir <- args[7] 

# Clusters
obs_df = read.csv(obs_file,header=T,stringsAsFactors=F, sep = '\t', row.names=1)
cell_clusters = as.integer(obs_df$phenograph); names(cell_clusters) <- rownames(obs_df)

tissue0 = as.character(obs_df[,'tissue']); names(tissue0) <- rownames(obs_df)
tissue0[!grepl('lung|LN',tissue0)] = 'distant'
tissue0 = factor(tissue0, levels = c('lung','LN','distant'))

i = as.integer(str_split(clust1,',')[[1]])
j = as.integer(str_split(clust2,',')[[1]])

ind1 = cell_clusters %in% i
ind2 = cell_clusters %in% j

# Normalized data frame
counts = as.matrix(readMM(mtx_file))
bc = read.csv(bc_file,header=F,col.names = 'Cell', stringsAsFactors = F)
g = read.csv(g_file,header=F,col.names = 'Gene', stringsAsFactors = F)
rownames(counts) = bc$Cell
norm_df <- counts/ms * median(ms)
data_df <- log(norm_df + 1)

cell_clusters = cell_clusters[ind1 | ind2]
tissue0 = tissue0[ind1 | ind2]

ind1 = cell_clusters %in% i
ind2 = cell_clusters %in% j

df1 <- data_df[ind1, ]
df2 <- data_df[ind2, ]

tissue1 = tissue0[ ind1 ]
tissue2 = tissue0[ ind2 ]

# Prepare for MAST
o_df <- rbind(df1, df2)
tissue <- factor(c(as.character(tissue1), as.character(tissue2))) #rbind(tissue1, tissue2)
tissue = relevel(tissue, 'lung')

condition <- rep(c(1, 2), times=c(nrow(df1), nrow(df2)))

wellKey <- rownames(o_df)
cdata <- data.frame(cbind(wellKey=wellKey, condition=condition))
fdata <- data.frame(primerid=colnames(o_df))

# SCa data
sca <- FromMatrix( t(o_df), cdata, fdata)
cdr2 <-colSums(assay(sca)>0)
colData(sca)$cngeneson <- scale(cdr2)

colData(sca)$condition <- factor(unlist(as.list(condition)))
colData(sca)$condition <- relevel(colData(sca)$condition, 2)

colData(sca)$treatment = treatment
colData(sca)$tissue = tissue

    # Fits
    zlmCond <- zlm(~ condition + tissue + cngeneson, sca)

summaryDt <- summary(zlmCond, doLRT='condition1')$datatable

# Significance table
fcHurdle <- merge(summaryDt[contrast=='condition1' & component=='H',.(primerid, `Pr(>Chisq)`)],
                summaryDt[contrast=='condition1' & component=='logFC', .(primerid,coef, ci.hi, ci.lo)], by='primerid')

        # FDR
        fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')]
        setorder(fcHurdle, fdr)

        # Export data
        write.csv(as.data.frame(fcHurdle), sprintf("%s/deg.mast.%s_%s.adjustFE_tissue.csv", out_dir, clust1, clust2), quote=FALSE)

I noticed that changing the reference level of tissue changes the results!

However, when I instead hard-code the dummy variables for tissue, I not only get different results, but these results don't change based on which reference level is considered. For example:

suppressMessages(library(MAST))
suppressMessages(library(data.table))
suppressMessages(require("Matrix"))
suppressMessages(library(stringr))

args <- commandArgs(trailingOnly = TRUE)

# command line arguments
mtx_file <- args[1] #/home/chanj3/data/HTA.myeloid.SCLC_samples_only.010920/out.myeloid.SCLC_samples_only.010920/myeloid.SCLC_samples_only.mtx
g_file <- args[2] #/home/chanj3/data/HTA.myeloid.SCLC_samples_only.010920/out.myeloid.SCLC_samples_only.010920/myeloid.SCLC_samples_only.genes.csv
bc_file <- args[3] #/home/chanj3/data/HTA.myeloid.SCLC_samples_only.010920/out.myeloid.SCLC_samples_only.010920/myeloid.SCLC_samples_only.cells.csv
obs_file <- args[4] #/home/chanj3/data/HTA.myeloid.SCLC_samples_only.010920/out.myeloid.SCLC_samples_only.010920/obs.myeloid.SCLC_samples_only.csv
clust1 = args[5]
clust2 = args[6]
out_dir <- args[7] #/home/chanj3/data/HTA.myeloid.SCLC_samples_only.010920/out.myeloid.SCLC_samples_only.010920

# Clusters
obs_df = read.csv(obs_file,header=T,stringsAsFactors=F, sep = '\t', row.names=1)
cell_clusters = as.integer(obs_df$phenograph); names(cell_clusters) <- rownames(obs_df)

tissue0 = as.character(obs_df[,'tissue']); names(tissue0) <- rownames(obs_df)
treatment0 = as.character(obs_df[,'treatment_mast']); names(treatment0) <- rownames(obs_df)

treatment0 = gsub('Platinum Doublet,Immunotherapy','ChemoIO',treatment0)
treatment0 = gsub('Platinum Doublet','Chemo_1L',treatment0)
tissue0[!grepl('lung|LN',tissue0)] = 'distant'

treatment0 = factor(treatment0, levels = c('Naive',"Chemo_1L","ChemoIO","TMZ"))
tissue0 = factor(tissue0, levels = c('lung','LN','distant'))

i = as.integer(str_split(clust1,',')[[1]])
j = as.integer(str_split(clust2,',')[[1]])

ind1 = cell_clusters %in% i
ind2 = cell_clusters %in% j

# Normalized data frame
counts = as.matrix(readMM(mtx_file))
bc = read.csv(bc_file,header=F,col.names = 'Cell', stringsAsFactors = F)
g = read.csv(g_file,header=F,col.names = 'Gene', stringsAsFactors = F)
rownames(counts) = bc$Cell
colnames(counts) = g$Gene
counts = counts[,!grepl("^MT-|^MTMR|^MTND|NEAT1|TMSB4X|TMSB10|^RPS|^RPL|^MRP|^FAU$|UBA52", colnames(counts))]
counts = counts[ind1 | ind2,]
ms <- rowSums(counts)
norm_df <- counts/ms * median(ms)
data_df <- log(norm_df + 1)

cell_clusters = cell_clusters[ind1 | ind2]
tissue0 = tissue0[ind1 | ind2]

ind1 = cell_clusters %in% i
ind2 = cell_clusters %in% j

df1 <- data_df[ind1, ]
df2 <- data_df[ind2, ]

tissue1 = tissue0[ ind1 ]
tissue2 = tissue0[ ind2 ]

# Prepare for MAST
o_df <- rbind(df1, df2)
tissue <- factor(c(as.character(tissue1), as.character(tissue2))) #rbind(tissue1, tissue2)
tissue = relevel(tissue, 'lung')

tissue_df = data.frame(model.matrix(~tissue0))
tissue_df = tissue_df[,2:ncol(tissue_df)]
colnames(tissue_df) = gsub('tissue0','',colnames(tissue_df))
rownames(tissue_df) = names(tissue0)

condition <- rep(c(1, 2), times=c(nrow(df1), nrow(df2)))

wellKey <- rownames(o_df)
cdata <- data.frame(cbind(wellKey=wellKey, condition=condition))
fdata <- data.frame(primerid=colnames(o_df))

# SCa data
sca <- FromMatrix( t(o_df), cdata, fdata)
cdr2 <-colSums(assay(sca)>0)
colData(sca)$cngeneson <- scale(cdr2)

colData(sca)$condition <- factor(unlist(as.list(condition)))
colData(sca)$condition <- relevel(colData(sca)$condition, 2)

    for (i in colnames(tissue_df)) {
        colData(sca)[i] <- factor(unlist(as.list(tissue_df[,i])))
    }

    # Fits
    formula = as.formula(paste('~ ', paste(colnames(colData(sca))[2:ncol(colData(sca))], collapse = '+')))
    zlmCond <- zlm(formula, sca)

summaryDt <- summary(zlmCond, doLRT='condition1')$datatable

# Significance table
fcHurdle <- merge(summaryDt[contrast=='condition1' & component=='H',.(primerid, `Pr(>Chisq)`)],
                summaryDt[contrast=='condition1' & component=='logFC', .(primerid,coef, ci.hi, ci.lo)], by='primerid')

        # FDR
        fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')]
        setorder(fcHurdle, fdr)

        # Export data
        write.csv(as.data.frame(fcHurdle), sprintf("%s/deg.mast.%s_%s.adjustFE_tissue.test1.csv", out_dir, clust1, clust2), quote=FALSE)

Does zlm in MAST allow for factors to be entered in the formula directly? My understanding is that when adjusting for a categorical variable, the reference level chosen should not change results, right? Is hard-coding the dummy variables of the factor the right way to do this? Any help/ideas would be greatly appreciated! Thank you!

Joe

gfinak commented 3 years ago

I'm not going to debug your wall of code. Can you be more specific? "Change the results" how? Maybe you can provide a minimum reproducible example or show the model output and be more specific about what your see vs what you think you should see?

amcdavid commented 3 years ago

I agree with Greg, a self-contained example would be helpful. How can you reproduce this issue using included data(vbetaFA) experiment?

But if I had to take a guess based on what wrote, this may relate to how shrinkage is done in the logistic regression (see ?arm::bayesglm and ?MAST::defaultPrior for some details). I expect that setting zlm(, method='glm') should give invariance to the factor encoding. I also expect that variation in bayesglm due the shrinkage should not be substantial (ie, compared to the standard error of the coefficient).

Yes, factors can be entered into the formula directly, the formula uses model.matrix to generate the design matrix like most of R.

jmchan88 commented 3 years ago

Apologies! I guess my question was more high-level rather than asking to debug. My presupposition is that it should not matter what reference level you choose for a factor to be adjusted. Indeed, that's what seems to be the case if I hard-code the one-hot encodings using model.matrix. On the other hand, if I just use the factor as is in the zlm formula, I get different results depending on what is the reference level.

I'll try to highlight the relevant code below:

In the following, the covariate that I would like to adjust is "tissue" that I feed into the zlm directly as a factor below.

# Prepare for MAST
tissue <- factor(tissue, levels = c('lung','LN','distant')) #Reference level is lung
colData(sca)$treatment = treatment

# Fits
zlmCond <- zlm(~ condition + tissue + cngeneson, sca)

The resulting output changes based on how I set the reference level:

tissue <- factor(tissue, levels = c('lung','LN','distant')) #Reference level is lung
tissue <- factor(tissue, levels = c('LN','lung','distant')) #Reference level is LN
tissue <- factor(tissue, levels = c('distant','LN','lung')) #Reference level is distant

I get the following output:

#Reference level is lung
    primerid    Pr(>Chisq)  coef    ci.hi   ci.lo   fdr
327 FN1 1.27397590106933e-09    0.147043063768172   0.249096390813669   0.0449897367226746  7.51918498184648e-08

#Reference level is LN
    primerid    Pr(>Chisq)  coef    ci.hi   ci.lo   fdr
327 FN1 1.26543559466497e-09    0.154152928165985   0.294230575593857   0.0140752807381125  7.46877889205928e-08

#Reference level is distant
    primerid    Pr(>Chisq)  coef    ci.hi   ci.lo   fdr
327 FN1 1.25377081171509e-09    0.463576811768406   0.675322508262732   0.251831115274079   7.39993170217161e-08

Mainly I see differences in the coef ranging from 0.15 to 0.46. However, if I hard-code the one-hot dummy encoding using model.matrix as follows

tissue <- factor(tissue, levels = c('lung','LN','distant'))  #Reference level is lung
tissue_df = data.frame(model.matrix(~tissue))
tissue_df = tissue_df[,2:ncol(tissue_df)]
colnames(tissue_df) = gsub('tissue0','',colnames(tissue_df))
rownames(tissue_df) = names(tissue)

for (i in colnames(tissue_df)) {
     colData(sca)[i] <- factor(unlist(as.list(tissue_df[,i])))
}

# Fits
formula = as.formula(paste('~ ', paste(colnames(colData(sca))[2:ncol(colData(sca))], collapse = '+')))
zlmCond <- zlm(formula, sca)

I get the same answer no matter how I choose the reference level.

    primerid    Pr(>Chisq)  coef    ci.hi   ci.lo   fdr
79  FN1 1.1892286821448e-19 0.292507098640155   0.433012762308486   0.152001434971824   2.90533083106261e-17

The idea that variation might occur with bayesglm sounds promising, but I was able to achieve invariance to factor encoding when using model.matrix. I guess my question really is, is that expected? Should I not be feeding factors directly in to the zlm formula for bayesglm, but it should be ok for glm?

Thanks so much!

amcdavid commented 3 years ago

I can't speculate what the "one hot coding" you implemented actually doing without code that I can run myself, ie, I have no idea what factor(unlist(as.list(tissue_df[,i])) does, but it doesn't look idiomatic for a one-hot encoding which I'd expect would use something like model.matrix(~ tissue + 0). Nor can I tell what the output you included is supposed to represent, is it a log fold change? Those have their own issues, see ?getLogFC, and you do need to be careful what the reference group is because the contrasts are not a linear function of the coefficients.

Manually generating a design matrix doesn't do anything that you can't do with a factor for glm or bayesglm.

gfinak commented 3 years ago

@jmchan88 do you have a reprex for us to follow up on?

RGLab / MAST

Discrepancy between including factors as covariates directly vs hard-coding dummy variables #144