Open vravik opened 3 years ago
Hi, thanks a lot for the super nice resources! I'm also interested in the mouse DB. It would be helpful if you could kindly point us to it.
Just chiming in to say that I am interested in annotating my mouse data using scType, too.
Hello, I'm also interested in the mouse database.
Or maybe you use the same database for humans and mice, and convert the gene names?
Thank you!
I would download databases from CellMaker and make mouse version by myself http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download.html
I would download databases from CellMaker and make mouse version by myself http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download.html
How to make mouse version database
For those interested, here is a mouse immune cell Excel that I compiled from the CellMarker 2.0 mouse genes.
I categorized the cells to the progenitor lineage, into just 11 cell types: Macrophages, monocytes, eosinophils, neutrophils, basophils, DC, pDC, NK, T-cell, B-cell, and platelets. This is not tissue-type specific, so I compiled all the marker genes based on the cell-type.
I also attach the Excel file of the filtered CellMarker 2.0 of the cell-type of interest.
sc_type_mouse_alltumours.xlsx immune_cell_no_progenitors.xlsx
I also edited the functions a bit for the mouse genes:
# Modified gene_sets_prepare
gene_sets_prepare <- function(path_to_db_file, cell_type){
cell_markers = openxlsx::read.xlsx(path_to_db_file)
cell_markers = cell_markers[cell_markers$tissueType == cell_type,]
cell_markers$geneSymbolmore1 = gsub(" ",
"",
cell_markers$geneSymbolmore1)
cell_markers$geneSymbolmore2 = gsub(" ",
"",
cell_markers$geneSymbolmore2)
# correct gene symbols from the given DB (up-genes)
cell_markers$geneSymbolmore1 = sapply(1:nrow(cell_markers), function(i){
markers_all = gsub(" ",
"",
unlist(strsplit(cell_markers$geneSymbolmore1[i],",")))
# markers_all = toupper(markers_all[markers_all != "NA" & markers_all != ""]) # Only for human
markers_all = sort(markers_all)
if(length(markers_all) > 0){
# suppressMessages({markers_all = unique(na.omit(checkGeneSymbols(markers_all, species = "mouse")$Suggested.Symbol))}) # This is used if the symbols are not in the format of HGNC
paste0(markers_all, collapse = ",")
} else {
""
}
})
# correct gene symbols from the given DB (down-genes)
cell_markers$geneSymbolmore2 = sapply(1:nrow(cell_markers), function(i){
markers_all = gsub(" ", "", unlist(strsplit(cell_markers$geneSymbolmore2[i],",")))
# markers_all = toupper(markers_all[markers_all != "NA" & markers_all != ""]) # Only for human
markers_all = sort(markers_all)
if (length(markers_all) > 0){
# suppressMessages({markers_all = unique(na.omit(checkGeneSymbols(markers_all)$Suggested.Symbol))}) # This is used if the symbols are not in the format of HGNC
paste0(markers_all, collapse = ",")
} else {
""
}
})
cell_markers$geneSymbolmore1 = gsub("///",",",cell_markers$geneSymbolmore1)
cell_markers$geneSymbolmore1 = gsub(" ","",cell_markers$geneSymbolmore1)
cell_markers$geneSymbolmore2 = gsub("///",",",cell_markers$geneSymbolmore2)
cell_markers$geneSymbolmore2 = gsub(" ","",cell_markers$geneSymbolmore2)
gs = lapply(1:nrow(cell_markers), function(j) {
gsub(" ",
"",
unlist(strsplit(toString(cell_markers$geneSymbolmore1[j]),","))
)
})
names(gs) = cell_markers$cellName
gs2 = lapply(1:nrow(cell_markers), function(j) {
gsub(" ",
"",
unlist(strsplit(toString(cell_markers$geneSymbolmore2[j]),","))
)
})
names(gs2) = cell_markers$cellName
list(gs_positive = gs, gs_negative = gs2)
}
# Modified sctype_score
sctype_score <- function(scRNAseqData, scaled = !0, gs, gs2 = NULL, gene_names_to_uppercase = !0, ...){
# For testing
# gs = gs_list$gs_positive
# check input matrix
if(!is.matrix(scRNAseqData)){
warning("scRNAseqData doesn't seem to be a matrix")
} else {
if(sum(dim(scRNAseqData))==0){
warning("The dimension of input scRNAseqData matrix equals to 0, is it an empty matrix?")
}
}
# marker sensitivity
marker_stat = sort(table(unlist(gs)), decreasing = T)
marker_sensitivity = data.frame(
score_marker_sensitivity = scales::rescale(as.numeric(marker_stat),
to = c(0,1),
from = c(length(gs),1)),
gene_ = names(marker_stat),
stringsAsFactors = !1)
# convert gene names to Uppercase
if (gene_names_to_uppercase){
rownames(scRNAseqData) = toupper(rownames(scRNAseqData));
}
# subselect genes only found in data
names_gs_cp = names(gs)
names_gs_2_cp = names(gs2)
gs = lapply(1:length(gs), function(d_){
GeneIndToKeep = rownames(scRNAseqData) %in% as.character(gs[[d_]])
rownames(scRNAseqData)[GeneIndToKeep]
})
gs2 = lapply(1:length(gs2), function(d_){
GeneIndToKeep = rownames(scRNAseqData) %in% as.character(gs2[[d_]])
rownames(scRNAseqData)[GeneIndToKeep]
})
names(gs) = names_gs_cp
names(gs2) = names_gs_2_cp
cell_markers_genes_score = marker_sensitivity[marker_sensitivity$gene_ %in% unique(unlist(gs)),]
# z-scale if not
if (!scaled) {
Z <- t(scale(t(scRNAseqData)))
} else {
Z <- scRNAseqData
}
# multiple by marker sensitivity
for (jj in 1:nrow(cell_markers_genes_score)){
Z[cell_markers_genes_score[jj,"gene_"], ] = Z[cell_markers_genes_score[jj,"gene_"], ] * cell_markers_genes_score[jj, "score_marker_sensitivity"]
}
# subselect only with marker genes
Z = Z[unique(c(unlist(gs),unlist(gs2))), ]
# combine scores
es = do.call("rbind", lapply(names(gs), function(gss_){
sapply(1:ncol(Z), function(j) {
gs_z = Z[gs[[gss_]], j]
gz_2 = Z[gs2[[gss_]], j] * -1
sum_t1 = (sum(gs_z) / sqrt(length(gs_z)))
sum_t2 = sum(gz_2) / sqrt(length(gz_2))
if (is.na(sum_t2)){
sum_t2 = 0;
}
sum_t1 + sum_t2
})
}))
dimnames(es) = list(names(gs), colnames(Z))
es.max <- es[!apply(is.na(es) | es == "", 1, all),] # remove NA rows
es.max
}
For those interested, here is a mouse immune cell Excel that I compiled from the CellMarker 2.0 mouse genes.
I categorized the cells to the progenitor lineage, into just 11 cell types: Macrophages, monocytes, eosinophils, neutrophils, basophils, DC, pDC, NK, T-cell, B-cell, and platelets. This is not tissue-type specific, so I compiled all the marker genes based on the cell-type.
I also attach the Excel file of the filtered CellMarker 2.0 of the cell-type of interest.
sc_type_mouse_alltumours.xlsx immune_cell_no_progenitors.xlsx
I also edited the functions a bit for the mouse genes:
# Modified gene_sets_prepare gene_sets_prepare <- function(path_to_db_file, cell_type){ cell_markers = openxlsx::read.xlsx(path_to_db_file) cell_markers = cell_markers[cell_markers$tissueType == cell_type,] cell_markers$geneSymbolmore1 = gsub(" ", "", cell_markers$geneSymbolmore1) cell_markers$geneSymbolmore2 = gsub(" ", "", cell_markers$geneSymbolmore2) # correct gene symbols from the given DB (up-genes) cell_markers$geneSymbolmore1 = sapply(1:nrow(cell_markers), function(i){ markers_all = gsub(" ", "", unlist(strsplit(cell_markers$geneSymbolmore1[i],","))) # markers_all = toupper(markers_all[markers_all != "NA" & markers_all != ""]) # Only for human markers_all = sort(markers_all) if(length(markers_all) > 0){ # suppressMessages({markers_all = unique(na.omit(checkGeneSymbols(markers_all, species = "mouse")$Suggested.Symbol))}) # This is used if the symbols are not in the format of HGNC paste0(markers_all, collapse = ",") } else { "" } }) # correct gene symbols from the given DB (down-genes) cell_markers$geneSymbolmore2 = sapply(1:nrow(cell_markers), function(i){ markers_all = gsub(" ", "", unlist(strsplit(cell_markers$geneSymbolmore2[i],","))) # markers_all = toupper(markers_all[markers_all != "NA" & markers_all != ""]) # Only for human markers_all = sort(markers_all) if (length(markers_all) > 0){ # suppressMessages({markers_all = unique(na.omit(checkGeneSymbols(markers_all)$Suggested.Symbol))}) # This is used if the symbols are not in the format of HGNC paste0(markers_all, collapse = ",") } else { "" } }) cell_markers$geneSymbolmore1 = gsub("///",",",cell_markers$geneSymbolmore1) cell_markers$geneSymbolmore1 = gsub(" ","",cell_markers$geneSymbolmore1) cell_markers$geneSymbolmore2 = gsub("///",",",cell_markers$geneSymbolmore2) cell_markers$geneSymbolmore2 = gsub(" ","",cell_markers$geneSymbolmore2) gs = lapply(1:nrow(cell_markers), function(j) { gsub(" ", "", unlist(strsplit(toString(cell_markers$geneSymbolmore1[j]),",")) ) }) names(gs) = cell_markers$cellName gs2 = lapply(1:nrow(cell_markers), function(j) { gsub(" ", "", unlist(strsplit(toString(cell_markers$geneSymbolmore2[j]),",")) ) }) names(gs2) = cell_markers$cellName list(gs_positive = gs, gs_negative = gs2) }
# Modified sctype_score sctype_score <- function(scRNAseqData, scaled = !0, gs, gs2 = NULL, gene_names_to_uppercase = !0, ...){ # For testing # gs = gs_list$gs_positive # check input matrix if(!is.matrix(scRNAseqData)){ warning("scRNAseqData doesn't seem to be a matrix") } else { if(sum(dim(scRNAseqData))==0){ warning("The dimension of input scRNAseqData matrix equals to 0, is it an empty matrix?") } } # marker sensitivity marker_stat = sort(table(unlist(gs)), decreasing = T) marker_sensitivity = data.frame( score_marker_sensitivity = scales::rescale(as.numeric(marker_stat), to = c(0,1), from = c(length(gs),1)), gene_ = names(marker_stat), stringsAsFactors = !1) # convert gene names to Uppercase if (gene_names_to_uppercase){ rownames(scRNAseqData) = toupper(rownames(scRNAseqData)); } # subselect genes only found in data names_gs_cp = names(gs) names_gs_2_cp = names(gs2) gs = lapply(1:length(gs), function(d_){ GeneIndToKeep = rownames(scRNAseqData) %in% as.character(gs[[d_]]) rownames(scRNAseqData)[GeneIndToKeep] }) gs2 = lapply(1:length(gs2), function(d_){ GeneIndToKeep = rownames(scRNAseqData) %in% as.character(gs2[[d_]]) rownames(scRNAseqData)[GeneIndToKeep] }) names(gs) = names_gs_cp names(gs2) = names_gs_2_cp cell_markers_genes_score = marker_sensitivity[marker_sensitivity$gene_ %in% unique(unlist(gs)),] # z-scale if not if (!scaled) { Z <- t(scale(t(scRNAseqData))) } else { Z <- scRNAseqData } # multiple by marker sensitivity for (jj in 1:nrow(cell_markers_genes_score)){ Z[cell_markers_genes_score[jj,"gene_"], ] = Z[cell_markers_genes_score[jj,"gene_"], ] * cell_markers_genes_score[jj, "score_marker_sensitivity"] } # subselect only with marker genes Z = Z[unique(c(unlist(gs),unlist(gs2))), ] # combine scores es = do.call("rbind", lapply(names(gs), function(gss_){ sapply(1:ncol(Z), function(j) { gs_z = Z[gs[[gss_]], j] gz_2 = Z[gs2[[gss_]], j] * -1 sum_t1 = (sum(gs_z) / sqrt(length(gs_z))) sum_t2 = sum(gz_2) / sqrt(length(gz_2)) if (is.na(sum_t2)){ sum_t2 = 0; } sum_t1 + sum_t2 }) })) dimnames(es) = list(names(gs), colnames(Z)) es.max <- es[!apply(is.na(es) | es == "", 1, all),] # remove NA rows es.max }
I think this would work.Can you please submit this as a pull request so the authors will accept it (hopefully) for the general public to access this code?
Thanks, Shani.
Dear Aleksandr,
Thank you for this developing this very useful tool. I am trying to annotate a mouse intestinal single cell dataset, but I'm not able to access the ScType database to generate the marker genesets. Could you please tell me how to proceed?
The example database file you have provided has only few genesets for Human tissues.