statOmics / msqrob2

Implementation of the MSqRob analysis of differentially expressed proteins using the Features infrastructure
9 stars 10 forks source link

We have a problem using the mzTab file input type #49

Open Douerww opened 1 year ago

Douerww commented 1 year ago

We refer to the sample code in msqrob2Example and rewrite it into a code that can input . mzTab format, and successfully run the PXD000279 dataset (data can be downloaded from here. It is worth mentioning that the file we use at runtime changed the peptide_abundance_std_error_study_variable[X] column name to the sumIntensity_X column name ).

However, we encountered an error that we could not solve when running another dataset PXD007145. (data can be downloaded from here. It is worth mentioning that the file we use at runtime changed the peptide_abundance_std_error_study_variable[X] column name to the sumIntensity_X column name ). When running pe <- msqrob2::msqrob(object = pe, i = "protein", formula = ~condition), an error occurred Error in if (m == 0) { : Missing values cannot be used where TRUE/FALSE values are required When we try to use msqrob2gui to run the same dataset, the following prompt appears in the lower right corner of the Model tab: The model is overparameterized. The residual degrees of freedom is 0. Variances and standard errors can not be estimated from data with this design. An error occurred on the Inference tab, but we use msqrob2gui to run PXD000279 very smoothly.

Annotations file contents are as follows 4LIT2OR7U(WNHKA$MX8TW_1

Any possible help is needed, thank you very much!

The codes to run PXD000279 is as follows

mzTab_pep = read.csv("./PXD000279/onlyPEP-filter-PXD000279.dynamic.sdrf_openms_design_openms.mzTab", sep="\t")

#mzTab_pep = mzTab_pep[mzTab_pep$opt_global_cv_MS.1002217_decoy_peptide == 0,]
mzTab_pep = mzTab_pep[!grepl("DECOY", mzTab_pep$accession),]

#peptide_abundance_study variable[1] == sumIntensity_1
mzTab_pep = mzTab_pep[,c("sequence", "accession", "sumIntensity_1", "sumIntensity_2",
                       "sumIntensity_3", "sumIntensity_4", "sumIntensity_5", "sumIntensity_6",
                       "sumIntensity_7", "sumIntensity_8")]
mzTab_pep_sum <- mzTab_pep %>% group_by(sequence, accession) %>% summarise(sumIntensity_1 = sum(sumIntensity_1),
                                                                           sumIntensity_2 = sum(sumIntensity_2),
                                                                           sumIntensity_3 = sum(sumIntensity_3),
                                                                           sumIntensity_4 = sum(sumIntensity_4),
                                                                           sumIntensity_5 = sum(sumIntensity_5),
                                                                           sumIntensity_6 = sum(sumIntensity_6),
                                                                           sumIntensity_7 = sum(sumIntensity_7),
                                                                           sumIntensity_8 = sum(sumIntensity_8))

mzTab_pep_sum <- mzTab_pep_sum[which((rowSums(mzTab_pep_sum[,3:6]) > 0)|(rowSums(mzTab_pep_sum[7:10]) > 0)),]

names(mzTab_pep_sum)[names(mzTab_pep_sum) =="accession"] <-"Proteins"

ecols <- grep("sumIntensity_", names(mzTab_pep_sum))

  pe <- readQFeatures(
  table = mzTab_pep_sum, fnames = 1, ecol = ecols,
  name = "peptideRaw", sep = "\t"
)

colData(pe)$condition <- as.factor(c("UPS1", "UPS1", "UPS1", "UPS1", "UPS2", "UPS2", "UPS2", "UPS2"))

rowData(pe[["peptideRaw"]])$nNonZero <- rowSums(assay(pe[["peptideRaw"]]) > 0)
pe <- zeroIsNA(pe, "peptideRaw") # convert 0 to NA

MSnbase::plotNA(assay(pe[["peptideRaw"]])) +
  xlab("Peptide index (ordered by data completeness)")

pe <- logTransform(pe, base = 2, i = "peptideRaw", name = "peptideLog")
limma::plotDensities(assay(pe[["peptideLog"]]))

Protein_filter <- rowData(pe[["peptideLog"]])$Proteins %in% smallestUniqueGroups(rowData(pe[["peptideLog"]])$Proteins)
pe <- pe[Protein_filter,,]

pe <- filterFeatures(pe, ~ nNonZero >= 2)

pe <- normalize(pe,
                i = "peptideLog",
                name = "peptideNorm",
                method = "center.median"
)

limma::plotDensities(assay(pe[["peptideNorm"]]))
boxplot(assay(pe[["peptideNorm"]]),
        col = palette()[-1],
        main = "Peptide distribtutions after normalisation", ylab = "intensity"
)

limma::plotMDS(assay(pe[["peptideNorm"]]), col = as.numeric(colData(pe)$condition))

# Summarization to protein level
pe <- QFeatures::aggregateFeatures(pe, i = "peptideNorm", fcol = "Proteins", na.rm = TRUE, name = "protein")

plotMDS(assay(pe[["protein"]]), col = as.numeric(colData(pe)$condition))

pe <- msqrob2::msqrob(object = pe, i = "protein", formula = ~condition)
getCoef(rowData(pe[["protein"]])$msqrobModels[[1]])

L <- makeContrast("conditionUPS2=0", parameterNames = c("conditionUPS2"))
pe <- hypothesisTest(object = pe, i = "protein", contrast = L)

volcano <- ggplot(
  rowData(pe[["protein"]])$conditionUPS2,
  aes(x = logFC, y = -log10(pval), color = adjPval < 0.05)
) +
  geom_point(cex = 2.5) +
  scale_color_manual(values = alpha(c("black", "red"), 0.5)) +
  theme_minimal() +
  ggtitle("Default workflow")

volcano

write.csv(rowData(pe[["protein"]])$conditionUPS2, file = "ups-quantms-dep-CM.csv")

The codes to run PXD007145 is as follows

mzTab_pep = read.csv("./PXD007145/only_pep_filterPXD007145-Th.sdrf_openms_design_openms.mzTab", sep="\t")
#mzTab_pep = mzTab_pep[mzTab_pep$opt_global_cv_MS.1002217_decoy_peptide == 0,]
mzTab_pep = mzTab_pep[!grepl("DECOY", mzTab_pep$accession),]

mzTab_pep = mzTab_pep[,c("sequence", "accession", "sumIntensity_1", "sumIntensity_2",
                         "sumIntensity_3")]
mzTab_pep_sum <- mzTab_pep %>% group_by(sequence, accession) %>% summarise(sumIntensity_1 = sum(sumIntensity_1),
                                                                           sumIntensity_2 = sum(sumIntensity_2),
                                                                           sumIntensity_3 = sum(sumIntensity_3))

mzTab_pep_sum <- mzTab_pep_sum[which((rowSums(mzTab_pep_sum[,3]) > 0)&(rowSums(mzTab_pep_sum[,4]) > 0)&(rowSums(mzTab_pep_sum[,5]) > 0)),]

names(mzTab_pep_sum)[names(mzTab_pep_sum) =="accession"] <-"Proteins"

ecols <- grep("sumIntensity_", names(mzTab_pep_sum))

pe <- readQFeatures(
  table = mzTab_pep_sum, fnames = 1, ecol = ecols,
  name = "peptideRaw", sep = "\t"
)

colData(pe)$condition <- as.factor(c("1", "4", "10"))

rowData(pe[["peptideRaw"]])$nNonZero <- rowSums(assay(pe[["peptideRaw"]]) > 0)
pe <- zeroIsNA(pe, "peptideRaw") # convert 0 to NA

MSnbase::plotNA(assay(pe[["peptideRaw"]])) +
  xlab("Peptide index (ordered by data completeness)")

pe <- logTransform(pe, base = 2, i = "peptideRaw", name = "peptideLog")
limma::plotDensities(assay(pe[["peptideLog"]]))

Protein_filter <- rowData(pe[["peptideLog"]])$Proteins %in% smallestUniqueGroups(rowData(pe[["peptideLog"]])$Proteins)
pe <- pe[Protein_filter,,]

pe <- filterFeatures(pe, ~ nNonZero >= 2)

pe <- normalize(pe,
                i = "peptideLog",
                name = "peptideNorm",
                method = "center.median"
)

limma::plotDensities(assay(pe[["peptideNorm"]]))
boxplot(assay(pe[["peptideNorm"]]),
        col = palette()[-1],
        main = "Peptide distribtutions after normalisation", ylab = "intensity"
)

limma::plotMDS(assay(pe[["peptideNorm"]]), col = as.numeric(colData(pe)$condition))

# Summarization to protein level
pe <- QFeatures::aggregateFeatures(pe, i = "peptideNorm", fcol = "Proteins", na.rm = TRUE, name = "protein")

plotMDS(assay(pe[["protein"]]), col = as.numeric(colData(pe)$condition))

pe <- msqrob2::msqrob(object = pe, i = "protein", formula = ~condition)

getCoef(rowData(pe[["protein"]])$msqrobModels[[1]])

L4 <- makeContrast("condition4=0", parameterNames = c("condition4"))
pe4 <- hypothesisTest(object = pe, i = "protein", contrast = L4)

volcano <- ggplot(
  rowData(pe4[["protein"]])$condition4,
  aes(x = logFC, y = -log10(pval), color = adjPval < 0.05)
) +
  geom_point(cex = 2.5) +
  scale_color_manual(values = alpha(c("black", "red"), 0.5)) +
  theme_minimal() +
  ggtitle("Default workflow")

volcano

write.csv(rowData(pe4[["protein"]])$condition4, file = "PXD007145-4FC-quantms-dep-CM.csv")

L10 <- makeContrast("condition10=0", parameterNames = c("condition10"))
pe10 <- hypothesisTest(object = pe, i = "protein", contrast = L10)

volcano <- ggplot(
  rowData(pe10[["protein"]])$condition10,
  aes(x = logFC, y = -log10(pval), color = adjPval < 0.05)
) +
  geom_point(cex = 2.5) +
  scale_color_manual(values = alpha(c("black", "red"), 0.5)) +
  theme_minimal() +
  ggtitle("Default workflow")

volcano

write.csv(rowData(pe10[["protein"]])$condition10, file = "PXD007145-10FC-quantms-dep-CM.csv")