This repository contains the R code used to analyse the single-cell RNA-seq dataset shown in:
Delile, J., Rayon, T., Melchionda, M., Edwards, A., Briscoe, J., & Sagner, A. (2019). Single cell transcriptomics reveals spatial and temporal dynamics of gene expression in the developing mouse spinal cord. Development, dev-173807. https://doi.org/10.1242/dev.173807
Those interested in processing the dataset independently should consider downloading:
Figure 1 | Figure 2 | Figure 3 | Figure 4 |
---|---|---|---|
B C D-E | A-D E | A B | A |
Figure 5 | Figure 6 | Figure 7 | Figure S1 |
---|---|---|---|
A | A | A-B C D | A-C D E-F G |
Figure S2 | Figure S3 | Figure S4 | Figure S6 |
---|---|---|---|
A | A | A-H | A B-D |
To reproduce the analysis, the files contained in R_files, input_files and dataset must to be downloaded from this repository (most conveniently using git clone). The UMI count matrix is zipped and must be uncompressed into the dataset folder.
unzip("./dataset/UMI_count.tsv.zip", exdir="./dataset/")
The Antler package as it was at the time of publication is required and can be installed with devtools.
devtools::install_github("juliendelile/Antler", ref = "Development2019")
library(Antler)
Most functions not provided by Antler are stored in MouseSpinalCordAtlas_tools.R
source('./R_files/MouseSpinalCordAtlas_tools.R')
The output path can be changed to any existing directory path
output_path = './output/'
m = Antler$new(plot_folder=output_path, num_cores=4)
m$loadDataset(folderpath="./dataset/", phenoData_filename="phenoData.csv", assayData_filename="UMI_count.tsv")
Annotate gene names
m$setCurrentGeneNames(geneID_mapping_file=system.file("extdata", "Annotations/biomart_ensemblid_genename_mmusculus.csv", package="Antler"))
Display counts pre-QC
m$plotReadcountStats(data_status="Raw", by="timepoint", category="replicate_id", basename="preQC")
Remove cells having more than 6% of mitochondrial UMI counts
m$removeGenesFromRatio(
candidate_genes=grep('^mt-', grep('^mt-', m$getGeneNames(), value=T), value=T),
threshold = 0.06,
)
Remove outliers genes and cells
m$removeOutliers( lowread_thres = -Inf,
highread_thres = Inf,
genesmin = 500,
cellmin = 3,
data_status = 'Raw')
Display counts post-QC
m$plotReadcountStats(data_status="Raw", by="timepoint", category="replicate_id", basename="postQC")
Cell identities are determined by associating each cell to the closest target population state defined by a list of known marker genes
cell_partition = doCellPartition(known_template_file="./input_files/partitioning_table.csv", readcounts=m$getReadcounts(data_status='Raw'))
pop_colors = getPopulationColors(known_template_file="./input_files/partitioning_table.csv")
for(md in c("Type_step1", "Type_step2", "Type_step2_unique", "DV")){pData(m$expressionSet)[[md]] <- cell_partition[[md]]}
cellcluster_sizes_cumsum_step1 = setNames(
c(0, cumsum(table(pData(m$expressionSet)$Type_step1))),
c(levels(pData(m$expressionSet)$Type_step1), ""))
m$readcounts_norm=m$readcounts_raw
m$readcounts_norm[m$readcounts_norm >= 2] <- 1 # threshold used in doCellPartition
pop_def_mask_step1 = markersToMask(cell_partition$step1_markers)
m$dR$genemodules = as.list(rownames(pop_def_mask_step1))
m$plotGeneModules(
basename='FIG1_Map_Step1',
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA,
use.dendrogram=NA,
display.clusters=NULL,
file_settings=list(list(type='pdf', width=10, height=3)),
data_status=c('Normalized'),
gene_transformations='none',
display.legend=TRUE,
cell.ordering=order(pData(m$expressionSet)$Type_step1), # works iff Type_step1 are factors
extra_colors=cbind(
NA
"UMI counts"=colorRampPalette(c("white", "black"))(n = 101)[as.integer(1+100*(colSums(m$readcounts_raw) / max(colSums(m$readcounts_raw))))]
),
extra_legend=list("text"=c("", levels(pData(m$expressionSet)$Type_step1)), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m$expressionSet)$Type_step1)))])),
genemodules.palette=rep("white", length(m$dR$genemodules)),
rect_overlay=apply(which(pop_def_mask_step1==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_step1[[x[2]]],
xright=cellcluster_sizes_cumsum_step1[[x[2]+1]],
ytop=length(unlist(m$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m$dR$genemodules)) - x[1]+1 + 0.5
)
}),
pretty.params=list("size_factor"=3, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
white_rects = c(lapply(
list('Neuron', c('Blood', 'Hematopoeitic', 'Erythropoeitic', 'Erythrocytes', 'Erythrocytes II'), c('Mesoderm I', 'Mesoderm II', 'Mesoderm III', 'Mesoderm IV', 'Mesoderm V', 'Mesoderm VI', 'Myoblast'), c('Neural Crest I', 'Neural Crest II'), c('Neural crest neurons I', 'Neural crest neurons II', 'Neural crest neurons III')),
# list('Progenitor', 'Neuron', c('Blood', 'Hematopoeitic', 'Erythropoeitic', 'Erythrocytes', 'Erythrocytes II'), c('Mesoderm I', 'Mesoderm II', 'Mesoderm III', 'Mesoderm IV', 'Mesoderm V', 'Mesoderm VI', 'Myoblast'), c('Neural Crest I', 'Neural Crest II'), c('Neural crest neurons I', 'Neural crest neurons II', 'Neural crest neurons III'), c('Skin', 'NULL')),
function(l){
list(
xleft=min(which(names(cellcluster_sizes_cumsum_step1) %in% l)) %>% cellcluster_sizes_cumsum_step1[[.]],
xright= (max(which(names(cellcluster_sizes_cumsum_step1) %in% l))+1) %>% cellcluster_sizes_cumsum_step1[.],
ytop=0.5,
ybottom=length(unlist(m$dR$genemodules)) + 0.5,
color="white",
width=1
)
}),
lapply(
list(c('Tubb3', 'Elavl3'), c('Sox17', 'Fermt3', 'Klf1', 'Hemgn', 'Car2'), c('Foxc1', 'Foxc2', 'Twist1', 'Twist2', 'Meox1', 'Meox2', 'Myog'), 'Sox10', c('Tlx2', 'Six1')), function(l){
# list('Sox2', c('Tubb3', 'Elavl3'), c('Sox17', 'Fermt3', 'Klf1', 'Hemgn', 'Car2'), c('Foxc1', 'Foxc2', 'Twist1', 'Twist2', 'Meox1', 'Meox2', 'Myog'), 'Sox10', c('Tlx2', 'Six1'), 'Krt8'), function(l){
list(
xleft=0,
xright=max(cellcluster_sizes_cumsum_step1),
ytop=min(which(unlist(m$dR$genemodules) %in% l)) %>% {length(unlist(m$dR$genemodules)) - . + 1.5},
ybottom=max(which(unlist(m$dR$genemodules) %in% l)) %>% {length(unlist(m$dR$genemodules)) - . + 2.5},
color="white",
width=1
)
}))
m$plotGeneModules(
basename='FIG1_Map_Step1_whitelines',
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA,
use.dendrogram=NA,
display.clusters=NULL,
file_settings=list(list(type='pdf', width=10, height=3)),
data_status=c('Normalized'),
gene_transformations='none',
display.legend=TRUE,
cell.ordering=order(pData(m$expressionSet)$Type_step1), # works iff Type_step1 are factors
extra_colors=cbind(
NA
),
extra_legend=list("text"=c("", levels(pData(m$expressionSet)$Type_step1)), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m$expressionSet)$Type_step1)))])),
genemodules.palette=rep("white", length(m$dR$genemodules)),
rect_overlay=c(white_rects, apply(which(pop_def_mask_step1==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_step1[[x[2]]],
xright=cellcluster_sizes_cumsum_step1[[x[2]+1]],
ytop=length(unlist(m$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m$dR$genemodules)) - x[1]+1 + 0.5,
color="white", #"gray90",
width=0.5
)
})),
pretty.params=list("size_factor"=3, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
cellcluster_sizes_cumsum_step2 = setNames(
c(0, cumsum(table(pData(m$expressionSet)$Type_step2_unique))),
c(levels(pData(m$expressionSet)$Type_step2_unique), ""))
bothSteps_mask = markersToMask(cell_partition$bothSteps_markers)
m$dR$genemodules = as.list(rownames(bothSteps_mask))
m$plotGeneModules(
basename='FIGS1_Map_Step2',
displayed.gms = c('dR.genemodules'),
displayed.geneset="naked", # NA, plot all genes
use.dendrogram=NA,
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=15, height=8)),
data_status=c('Raw'),
gene_transformations='logscaled',
display.legend=TRUE,
cell.ordering=order(pData(m$expressionSet)$Type_step2_unique),
extra_colors=cbind(
"Timepoint"=pop_colors$timepoint[as.character(pData(m$expressionSet)$timepoint)],
"Step 1 Type"=pop_colors$Step1[as.character(pData(m$expressionSet)$Type_step1)],
"Step 2 Type"=pop_colors$Step2[as.character(pData(m$expressionSet)$Type_step2)]
),
extra_legend=list("text"=c("", levels(pData(m$expressionSet)$Type_step2)), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m$expressionSet)$Type_step2)))])),
rect_overlay=apply(which(bothSteps_mask==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_step2[[x[2]]],
xright=cellcluster_sizes_cumsum_step2[[x[2]+1]],
ytop=length(unlist(m$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m$dR$genemodules)) - x[1]+1 + 0.5
)
}),
pretty.params=list("size_factor"=3, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
Estimate quantity of cell doublets by calculating the ratio of the expressed cell type markers in other populations
dcounts = estimateDoublets()
write.csv(dcounts, file=paste0(output_path, "/FIGS1_doublet_estimates.csv"))
Export markers to csv
write.table(markersToMask(cell_partition$step2_markers) %>% cbind("Gene"=rownames(.), .), file=paste0(output_path,'/Table_S1.csv'), row.names=F, sep=";", quote=FALSE)
Select and create structures for 1. neural cells, 2. neuronal progenitors and 3. Neurons
# 1. Neural cells
m_neural = m$copy()
m_neural$excludeCellsFromIds(which(!pData(m_neural$expressionSet)$Type_step1 %in% c('Progenitor', 'Neuron')))
m_neural$excludeCellsFromIds(which(pData(m_neural$expressionSet)$Type_step2 %in% c('Null_Progenitor', 'Null_Neuron')))
m_neural$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
pData(m_neural$expressionSet)$Type_step1 = factor(pData(m_neural$expressionSet)$Type_step1, levels=levels(droplevels(pData(m_neural$expressionSet)$Type_step1)))
pData(m_neural$expressionSet)$Type_step2 = factor(pData(m_neural$expressionSet)$Type_step2, levels=levels(droplevels(pData(m_neural$expressionSet)$Type_step2)))
pData(m_neural$expressionSet)$Type_step2_unique = factor(pData(m_neural$expressionSet)$Type_step2_unique, levels=levels(droplevels(pData(m_neural$expressionSet)$Type_step2_unique)))
# 2. Neuronal progenitors
m_prog = m_neural$copy()
m_prog$excludeCellsFromIds(which(!pData(m_prog$expressionSet)$Type_step1 == 'Progenitor'))
m_prog$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
pData(m_prog$expressionSet)$Type_step1 = factor(pData(m_prog$expressionSet)$Type_step1, levels=levels(droplevels(pData(m_prog$expressionSet)$Type_step1)))
pData(m_prog$expressionSet)$Type_step2 = factor(pData(m_prog$expressionSet)$Type_step2, levels=levels(droplevels(pData(m_prog$expressionSet)$Type_step2)))
m_prog$dR$genemodules = as.list(unique(unlist(cell_partition$bothSteps_markers_neural[levels(droplevels(pData(m_prog$expressionSet)$Type_step2))])))
# 3. Neurons
m_neuron = m_neural$copy()
m_neuron$excludeCellsFromIds(which(!pData(m_neuron$expressionSet)$Type_step1 == 'Neuron'))
m_neuron$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
pData(m_neuron$expressionSet)$Type_step1 = factor(pData(m_neuron$expressionSet)$Type_step1, levels=levels(droplevels(pData(m_neuron$expressionSet)$Type_step1)))
pData(m_neuron$expressionSet)$Type_step2 = factor(pData(m_neuron$expressionSet)$Type_step2, levels=levels(droplevels(pData(m_neuron$expressionSet)$Type_step2)))
m_neuron$dR$genemodules = as.list(unique(unlist(cell_partition$bothSteps_markers_neural[levels(droplevels(pData(m_neuron$expressionSet)$Type_step2))])))
# Save structures to file
saveRDS(m_neural, file=paste0(output_path, '/m_neural.rds'))
# m_neural=readRDS(paste0(output_path, '/m_neural.rds'))
saveRDS(m_neuron, file=paste0(output_path, '/m_neuron.rds'))
# m_neuron=readRDS(paste0(output_path, '/m_neuron.rds'))
saveRDS(m_prog, file=paste0(output_path, '/m_prog.rds'))
# m_prog=readRDS(paste0(output_path, '/m_prog.rds'))
saveRDS(m, file=paste0(output_path, '/m_typed.rds'))
# m = readRDS(paste0(output_path, '/m_typed.rds'))
bubble_chart.df = data.frame(t(m_neural$getReadcounts('Raw')[unique(unlist(cell_partition$bothSteps_markers_neural)), ]), check.names=F) %>%
tibble::rownames_to_column('cellname') %>%
tidyr::gather(genename, value, -cellname) %>%
# dplyr::mutate(genename=factor(genename, levels=rev(unique(unlist(cell_partition$bothSteps_markers_neural))))) %>%
dplyr::mutate(genename=factor(genename, levels=(unique(unlist(cell_partition$bothSteps_markers_neural))))) %>%
dplyr::left_join(pData(m_neural$expressionSet) %>% tibble::rownames_to_column("cellname"), by="cellname") %>%
dplyr::group_by(Type_step2, genename) %>%
dplyr::summarise(mean=mean(value)) %>%
dplyr::group_by(genename) %>%
dplyr::mutate(mean_norm_max=mean/max(mean),
Type_step2 = factor(Type_step2, levels=rev(levels(Type_step2)))) %>%
dplyr::ungroup() %>%
dplyr::mutate(partitionning=factor(
(function(x,y){
unlist(lapply(seq(length(x)),
function(i){
markersToMask(cell_partition$bothSteps_markers_neural)[x[[i]],y[i]]}))
}
)(as.character(genename), as.character(Type_step2)),
levels=c(0,1))) %>%
dplyr::mutate(fillcol = (as.integer(partitionning)-1) * as.integer(Type_step2))
unfocus_grey = "grey"
pdf(paste0(output_path, '/FIG1_Progenitors_bubble_chart.pdf'), height=3.5, width=7, useDingbats=FALSE)
p <- ggplot(bubble_chart.df %>% dplyr::filter(Type_step2 %in% levels(droplevels(pData(m_prog$expressionSet)$Type_step2)), genename %in% unlist(m_prog$dR$genemodules))) +
geom_count(aes(x=genename, y=Type_step2, size=mean_norm_max, fill=factor(fillcol)), color="black", shape=21) + scale_size_area(max_size=5) + scale_fill_manual(breaks=seq(0, length(levels(bubble_chart.df$Type_step2))), values=c(unfocus_grey, rev(unname(pop_colors$Step2[levels(pData(m_prog$expressionSet)$Type_step2)])))) + scale_x_discrete(position = "top") + xlab("") + ylab("") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 0))
print(p)
graphics.off()
pdf(paste0(output_path, '/FIG1_Neurons_bubble_chart.pdf'), height=3.5, width=10, useDingbats=FALSE)
p <- bubble_chart.df %>%
dplyr::filter(Type_step2 %in% levels(droplevels(pData(m_neuron$expressionSet)$Type_step2)), genename %in% unlist(m_neuron$dR$genemodules)) %>%
dplyr::mutate(genename=factor(genename, levels=unique(unlist(cell_partition$bothSteps_markers_neural[levels(droplevels(pData(m_neuron$expressionSet)$Type_step2))])))) %>%
ggplot(.) + geom_count(aes(x=genename, y=Type_step2, size=mean_norm_max, fill=factor(fillcol)), color="black", shape=21) + scale_size_area(max_size=5) + scale_fill_manual(breaks=seq(0, length(levels(bubble_chart.df$Type_step2))), values=c(unfocus_grey, rev(unname(pop_colors$Step2[levels(pData(m_neuron$expressionSet)$Type_step2)])))) +
# scale_fill_manual(breaks=c(0,1), values=c(unfocus_grey, pop_colors$Step2)) +
scale_x_discrete(position = "top") + xlab("") + ylab("") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 0))
print(p)
graphics.off()
tSNE computation takes about 30min, skip to next section if the code has already run once.
m$normalize("geometric_mean_sizeFactors")
step1.tsne.data = m$getReadcounts('Normalized')[unique(unlist(cell_partition$step1_markers)), ]
set.seed(1)
t0=Sys.time()
step1.tsne.coords = Rtsne::Rtsne(t(step1.tsne.data), pca=FALSE, perplexity=100, max_iter=2000, verbose=T, check_duplicates = FALSE)$Y
print(Sys.time()-t0)
saveRDS(step1.tsne.coords, file=paste0(output_path, '/step1.tsne.coords.rds'))
Load pre calculated tsne coordinates
step1.tsne.coords = readRDS(paste0(output_path, '/step1.tsne.coords.rds'))
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Types.png'), width=4000, height=4000, pointsize=75)
op <- par(mfrow = c(1,1),
oma = c(5,4,0,0) + 0.1,
mar = c(0,0,1,1) + 0.1)
vertex.plot.order = rev(order(pData(m$expressionSet)$Type_step1))
plot(step1.tsne.coords[vertex.plot.order,], , col=pop_colors$Step1[as.character(pData(m$expressionSet)$Type_step1)][vertex.plot.order], pch=16, cex=.7, main='Step 1 Types', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
par(op)
par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n")
legend(x="left",
legend=names(pop_colors$Step1),
col=pop_colors$Step1,
pch=15,
y.intersp = 1.2, cex=.8,
bg='transparent',
box.col='transparent'
)
graphics.off()
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Types_naked.png'), width=4000, height=4000, pointsize=75)
vertex.plot.order = rev(order(pData(m$expressionSet)$Type_step1))
plot(step1.tsne.coords[vertex.plot.order,], , col=pop_colors$Step1[as.character(pData(m$expressionSet)$Type_step1)][vertex.plot.order], pch=16, cex=.7, xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
Progenitor only / Neuron only
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Progenitor_Focus.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords, col=ifelse(pData(m$expressionSet)$Type_step1=="Progenitor", "#D63624", "grey"), pch=16, cex=.7, main='Progenitors', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
png(paste0(output_path, '/FIG1_Map_Step1_tSNE_Normalized_Neuron_Focus.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords, col=ifelse(pData(m$expressionSet)$Type_step1=="Neuron", "#D63624", "grey"), pch=16, cex=.7, main='Neurons', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
Replicate per timepoint plots
# relabel replicate id to remove gaps
rep_colors = RColorBrewer::brewer.pal(9, "Set1")[1:3]
for(tp in unique(pData(m$expressionSet)$timepoint)) {
rep_ids = unique(pData(m$expressionSet)$replicate_id[pData(m$expressionSet)$timepoint==tp])
cell_colors = ifelse(
pData(m$expressionSet)$timepoint != tp,
'grey',
rep_colors[pData(m$expressionSet)$replicate_id]
)
tp_cell_ids = sample(which((m$expressionSet)$timepoint == tp))
vertex.plot.order=c(setdiff(seq(m$getNumberOfCells()), tp_cell_ids), tp_cell_ids)
png(paste0(output_path, '/FIGS1_Map_Step1_tSNE_Normalized_ReplicateIDs_', tp, '.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords[vertex.plot.order, ], col=cell_colors[vertex.plot.order], pch=16, cex=.7, main=paste0('Replicates E', tp, ' (n=', length(rep_ids), ')'), xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
}
Plot Sex
sex.df=cbind.data.frame(Xist=m$getReadcounts('Raw')['Xist',], pData(m$expressionSet)) %>%
dplyr::group_by(replicate_id, timepoint) %>%
dplyr::summarize(Xist_mean=mean(Xist)) %>%
dplyr::mutate(Sex=ifelse(Xist_mean < 1, "Male", "Female")) %>%
dplyr::arrange(timepoint)
cell_sex = pData(m$expressionSet) %>%
dplyr::select(replicate_id, timepoint) %>%
dplyr::left_join(sex.df, by=c("timepoint"="timepoint", "replicate_id"="replicate_id")) %>%
.$Sex
sex_colors = c("Male"="cornflowerblue", "Female"="coral1")
png(paste0(output_path, '/FIGS1_Map_Step1_tSNE_Normalized_Sex.png'), width=2000, height=2000, pointsize=40)
plot(step1.tsne.coords, col=sex_colors[cell_sex], pch=16, cex=.4, main='Sex', xlab = "", ylab = "", xaxt='n', yaxt='n', bty = "n")
graphics.off()
count.df = pData(m_neural$expressionSet) %>%
dplyr::select(timepoint, Type_step1, Type_step2) %>%
dplyr::mutate(
timepoint=factor(timepoint, levels=sort(unique(timepoint))),
Type_step1=factor(Type_step1, levels=levels(droplevels(Type_step1))),
Type_step2=factor(Type_step2, levels=levels(droplevels(Type_step2))),
) %>%
dplyr::group_by(timepoint, Type_step1, Type_step2) %>%
dplyr::tally() %>%
dplyr::group_by(timepoint) %>%
dplyr::add_tally(name="nn") %>%
dplyr::mutate(freq_norm=n/nn) %>%
dplyr::group_by(timepoint, Type_step1) %>%
dplyr::add_tally(name="nnn") %>%
dplyr::mutate(freq_norm_type=n/nnn) %>%
dplyr::ungroup()
Display population size dynamics
legacy_color = setNames(
c('#d3aed6ff', '#b0cbd8ff', '#c0d2d9ff', '#d1d9daff', '#e3e2dbff', '#f4e9dcff', '#ffeeddff', '#f3e8d4ff', '#daddc3ff', '#c2d1b1ff', '#a7c59eff', '#8fb98cff', '#b2aed6ff', '#FED908', '#b0cbd8ff', '#c0d2d9ff', '#d1d9daff', '#e3e2dbff', '#f4e9dcff', '#ffeeddff', '#f3e8d4ff', '#daddc3ff', '#c2d1b1ff', '#c2d1b1ff', '#a7c59eff', '#8fb98cff'),
c('RP', 'dp1', 'dp2', 'dp3', 'dp4', 'dp5', 'dp6', 'p0', 'p1', 'p2', 'pMN', 'p3', 'FP', 'Null_Progenitor', 'dl1', 'dl2', 'dl3', 'dl4', 'dl5', 'dl6', 'V0', 'V1', 'V2a', 'V2b', 'MN', 'V3')
)
p2 <- ggplot(count.df[which(count.df$Type_step1=="Progenitor"),], aes(x=timepoint, y=freq_norm, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Progenitors") + ylab("") + xlab('') + theme(legend.position="none") + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_minimal() + theme(legend.position="none")
p3 <- ggplot(count.df[which(count.df$Type_step1=="Neuron"),], aes(x=timepoint, y=freq_norm, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Neurons") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_classic() + theme_minimal() + theme(legend.position="none")
p4 <- ggplot(count.df[which(count.df$Type_step1=="Progenitor"),], aes(x=timepoint, y=freq_norm_type, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Progenitor Ratios") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_minimal() + theme(legend.position="none")
p5 <- ggplot(count.df[which(count.df$Type_step1=="Neuron"),], aes(x=timepoint, y=freq_norm_type, fill=Type_step2, group=interaction(Type_step2, Type_step1))) + geom_area() + ggtitle("Neuron Ratios") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + theme_minimal() + theme(legend.position="none")
p1 <- ggplot(count.df[which(count.df$Type_step1 %in% c("Progenitor", "Neuron")),], aes(x=timepoint, y=freq_norm, fill=Type_step2, group=interaction(Type_step1, Type_step2), alpha=Type_step1)) + geom_area(color='black', size=.05) + ggtitle("Neural Ratios") + ylab("") + xlab('') + scale_fill_manual(breaks=levels(count.df$Type_step2), values=legacy_color[levels(count.df$Type_step2)], drop=F) + scale_alpha_manual(breaks=c("Progenitor", "Neuron"), values=c(0.5, 1)) + theme(legend.key.width=unit(0.15,"cm"), legend.key.height=unit(0.15,"cm"), legend.position = c(0.6, 0.6)) + guides(fill=guide_legend(ncol=3))
# print(p1)
# graphics.off()
g_legend <- function(a.gplot){
pdf(file=NULL)
tmp <- ggplot_gtable(ggplot_build(a.gplot))
graphics.off()
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)}
lgd <- g_legend(p1)
pdf(paste0(output_path, 'FIG2_DV_Population_Ratios.pdf'), width=7, height=4, useDingbats=FALSE)
gridExtra::grid.arrange(grobs=list(p1 + theme(legend.position="none"),p2,p3,lgd,p4,p5), layout_matrix=matrix(seq(6), ncol=3, byrow=T))
graphics.off()
Comparison with Kicheva et al., 2014
kicheva_vals = read.table(file='./input_files/kicheva2014_Fig1G.csv', sep='\t', stringsAsFactors=F, header=FALSE, skipNul=T)
colnames(kicheva_vals) <- kicheva_vals[1,]
kicheva_vals <- kicheva_vals[-1,]
kicheva_vals <- kicheva_vals %>%
dplyr::filter(Type %in% c("average", "sd")) %>%
tidyr::gather(Timepoint, value, -Type, -Domain) %>%
dplyr::group_by(Timepoint) %>%
dplyr::mutate(value=value/max(value, na.rm=T)) %>% # works because max is always a mean value, not a sd
dplyr::ungroup() %>%
dplyr::mutate(dataset="Kicheva et al., 2014", Timepoint=as.numeric(Timepoint)) %>%
dplyr::filter(Domain != "total") %>%
tidyr::spread(Type, value)
baseline=90
both_dataset.df = count.df %>%
dplyr::filter(Type_step1=="Progenitor" & Type_step2 %in% levels(pData(m_prog$expressionSet)$Type_step2)) %>%
dplyr::filter(!Type_step2 %in% c(NULL, "RP")) %>%
dplyr::select(timepoint, Type_step2, freq_norm_type) %>%
dplyr::rowwise() %>%
dplyr::mutate(
# Timepoint=switch(as.numeric(timepoint), "E9.5"=60, "10.5"=80, "11.5"=100), # James email values
Timepoint=switch(as.numeric(timepoint), "E9.5"=baseline-48, "10.5"=baseline-24, "11.5"=baseline, "12.5"=baseline+24, "13.5"=baseline+48), # extrapolated from manuscript (E11.5 ~ 90hph)
Domain=switch(as.character(Type_step2), "FP"="FP", "p3"="p3", "pMN"="pMN", "p2"="pI", "p1"="pI", "p0"="pI", "dp6"="pD", "dp5"="pD", "dp4"="pD", "dp3"="pD", "dp2"="pD", "dp1"="pD"),
average=freq_norm_type,
sd=NA,
dataset="scRNA-seq"
) %>%
dplyr::ungroup() %>%
dplyr::group_by(Domain, Timepoint) %>%
dplyr::summarise(average=sum(average), dataset=unique(dataset)) %>%
dplyr::bind_rows(kicheva_vals) %>%
dplyr::mutate(dataset=factor(dataset, levels=c('scRNA-seq', 'Kicheva et al., 2014')))
p <- ggplot(both_dataset.df) +
geom_line(aes(x=Timepoint, y=average, color=Domain, linetype=dataset, group=interaction(dataset, Domain)), size=.2) +
geom_point(aes(x=Timepoint, y=average, color=Domain), size=.5) +
geom_errorbar(aes(x=Timepoint, ymin=average-sd, ymax=average+sd, color=Domain, linetype=dataset)) +
xlab("Time (hph)") + ylab("Progenitors (fraction)") + xlim(0,150) +
scale_colour_manual(breaks=c("FP", "p3", "pMN", "pI", "pD"), values=c("#B2AED6", "#82B383", "#F59498", "#D4D4D4", "#A7C7D7")) +
theme_minimal()
pdf(paste0(output_path, 'FIG2_DV_Population_Ratios_Comparison_Kicheva.pdf'), width=7, height=4, useDingbats=FALSE)
print(p)
graphics.off()
Resample dataset and select populations
m_neuron_combDE = resampleDataForCombinationDE(
orig_m_neural_path = paste0(output_path, '/m_neural.rds'),
num_min_cells_per_type2 = 40, # remove cell types with less than X cells
selected_type1 = "Neuron", # "Both", "Neuron", "Progenitor"
sampling_categories = c('Type_step2'),
num_cells_per_category = 200, # number of cells sampled per timepoint X Type_step2
dispersion_prefiltering_zscore = 0 # NULL, pre_select genes with sufficient dispersion score
)
neuron_partitionning_genes = unique(unlist(cell_partition$step2_markers[levels(pData(m_neuron_combDE$expressionSet)$Type_step2)]))
Pre-select type patterned genes
neuron_patterned_genes_test = m_neuron_combDE$runMonocle2DETest(
fullModelFormulaStr="~Type_step2",
reducedModelFormulaStr = "~1",
expressionFamily=VGAM::negbinomial.size()
)
neuron_patterned_genes = neuron_patterned_genes_test %>% as_tibble %>% dplyr::arrange(qval) %>% dplyr::filter(qval < 1e-5) %>% .$gene_short_name %>% as.character
neuron_partitionning_genes %>% {length(intersect(., neuron_patterned_genes))/length(.)} %>% {sprintf("%.2f%% of the markers used to partition neurons are included in the patterned gene list", .*100)}
m_neuron_combDE$excludeGenesFromIds(which(!m_neuron_combDE$getGeneNames() %in% neuron_patterned_genes))
Generate tested models
neuronCombinatorialModels = generateCombinatorialModels(m_neuron_combDE)
Run tests (4h45 with 8 cores)
Monocle_run_neuron = runCombinatorialDETests(m_neuron_combDE, neuronCombinatorialModels, numcores=8)
save(Monocle_run_neuron, file=paste0(output_path,'/Monocle_run_neuron_NegBinFamilyResample'))
# load(paste0(output_path,'/Monocle_run_neuron_NegBinFamilyResample.Rdata'))
Filter best model, for each gene
Monocle_run_neuron_topModel = getTopModel(m_neuron_combDE, Monocle_run_neuron, temporal_pattern_thres=.2)
saveRDS(Monocle_run_neuron_topModel, file=paste0(output_path, '/Monocle_run_neuron_topModel.rds'))
# Monocle_run_neuron_topModel = readRDS(file=paste0(output_path, '/Monocle_run_neuron_topModel.rds'))
Add additional measures and filter models (fold change, average level, time correlation…)
pval_thres_neuron = 1e-9
neuron_models_annotations = getModelAnnotations(m_neuron_combDE, model_list=unique(Monocle_run_neuron_topModel$model))
m_neuron_combDE$dR$genemodules = makeGeneModules(Monocle_run_neuron_topModel, neuron_models_annotations,
pval_thres = pval_thres_neuron, pos_pop_level_threshold = .15,
pos_pop_ratio_threshold = .08, log2fc_thres = 2,
gm_grouping_vars = c("model", "temporal_pattern"),
gm_ordering_vars = c("temporal_pattern", "first_domain_factor", "num_domain")
)
length(unlist(m_neuron_combDE$dR$genemodules))
setdiff(neuron_partitionning_genes, unlist(m_neuron_combDE$dR$genemodules))
print(paste0("Number of neuronal categories: ", length(which(unlist(lapply(m_neuron_combDE$dR$genemodules, length)) > 0))))
saveRDS(m_neuron_combDE$dR$genemodules, file=paste0(output_path, '/neurons_combDE_genelists.rds'))
save.image(paste0(output_path, '/workspace_postCombDE_neurons.Rdata'))
# For figure 3A, we do not use temporal pattern categories
gms.ordered = makeGeneModules(Monocle_run_neuron_topModel, neuron_models_annotations,
pval_thres = pval_thres_neuron, pos_pop_level_threshold = .15,
pos_pop_ratio_threshold = .08, log2fc_thres = 2,
gm_grouping_vars = c("model"),
gm_ordering_vars = c("first_domain_factor", "num_domain")
)
saveRDS(gms.ordered, file=paste0(output_path, '/neurons_combDE_genelists_ordered.rds'))
Plot predicted genes
neuron_combDE_by_domain = list()
for(x in setdiff(names(m_neuron_combDE$dR$genemodules), "Missing")) {
for(p in strsplit(strsplit(x, " / ")[[1]][1], '-')[[1]]) {
neuron_combDE_by_domain[[p]] <- c(neuron_combDE_by_domain[[p]], m_neuron_combDE$dR$genemodules[[x]])
}
}
neuron_combDE_by_domain_mask = markersToMask(neuron_combDE_by_domain[rev(levels(pData(m_neuron_combDE$expressionSet)$Type_step2))])[unlist(m_neuron_combDE$dR$genemodules),]
gene_pValues = (Monocle_run_neuron_topModel %>% {"names<-"(.$pval, .$genename)})[unlist(m_neuron_combDE$dR$genemodules)]
gene_pValues_colors = colorRampPalette(c("white", "black"))(n = 10)[cut(log10(1e-100 + gene_pValues), 10)]
gene_pValues_colors[which(gene_pValues == 0)] <- "orange"
cellcluster_sizes_combDE = table(pData(m_neuron_combDE$expressionSet)$Type_step2)
cellcluster_sizes_cumsum_neuron_combDE = c(0, cumsum(cellcluster_sizes_combDE))
names(cellcluster_sizes_cumsum_neuron_combDE) <- c(names(cellcluster_sizes_combDE), "")
tf_list = getTFs()
interesting_genes = sort(setdiff(unlist(m_neuron_combDE$dR$genemodules), sort(unique(unlist(cell_partition$bothSteps_markers_neural)))))
plot_basename_neuron = paste0('FIGS3_Neural_DE_Combinatorial_Neurons_NegBinResampled')
m_neuron_combDE$plotGeneModules(
basename=plot_basename_neuron,
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA, # plot all genes
use.dendrogram=NA,
display.clusters=NULL, #"State",
# file_settings=list(list(type='png', width=1500, height=4000)),
file_settings=list(list(type='pdf', width=7, height=as.integer(0.01*length(unlist(m_neuron_combDE$dR$genemodules)) + 6))),
data_status='Raw',
gene_transformations=c('log', 'logscaled'),
display.legend=TRUE,
cell.ordering=order(rev(pData(m_neuron_combDE$expressionSet)$Type_step2), pData(m_neuron_combDE$expressionSet)$timepoint),
extra_colors=cbind(
"Neural types"=pop_colors$Step2[as.character(pData(m_neuron_combDE$expressionSet)$Type_step2)],
"timepoint"=pop_colors$timepoint[as.character(pData(m_neuron_combDE$expressionSet)$timepoint)]
),
genemodules.palette=rep(c("gray80", "gray90"), length(m_neuron_combDE$dR$genemodules)),
genes.extra_colors=cbind(
"Partitionning"=unname(unlist(lapply(unlist(m_neuron_combDE$dR$genemodules), function(l) if(l %in% neuron_partitionning_genes) 'red' else 'white'))),
"pval"=gene_pValues_colors,
"TF"=unname(unlist(lapply(unlist(m_neuron_combDE$dR$genemodules), function(l) if(l %in% tf_list) 'yellow' else 'white'))),
"Time behavior"=c("blue", "red", "white")[(Monocle_run_neuron_topModel %>% {"names<-"(.$temporal_pattern, .$genename)}) [unlist(m_neuron_combDE$dR$genemodules)]],
"+"=unname(unlist(lapply(unlist(m_neuron_combDE$dR$genemodules), function(l) if(l %in% interesting_genes) 'green' else 'white')))
),
extra_legend=list("text"=c("", levels(droplevels(pData(m_neuron_combDE$expressionSet)$Type_step2))), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m_neuron_combDE$expressionSet)$Type_step2)))])),
rect_overlay=c(lapply(seq(rev(levels(pData(m_neuron_combDE$expressionSet)$Type_step2))), function(i){
list(
xleft=cellcluster_sizes_cumsum_neuron_combDE[[i]],
xright=cellcluster_sizes_cumsum_neuron_combDE[[i+1]],
ytop=0.5,
ybottom=length(unlist(m_neuron_combDE$dR$genemodules)) + 0.5,
width=.2,
color="grey"
)
}),
apply(which(neuron_combDE_by_domain_mask==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_neuron_combDE[[x[2]]],
xright=cellcluster_sizes_cumsum_neuron_combDE[[x[2]+1]],
ytop=length(unlist(m_neuron_combDE$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m_neuron_combDE$dR$genemodules)) - x[1]+1 + 0.5,
color='black'
)
})
),
pretty.params=list("size_factor"=1, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
Export predicted genes to file
m_neuron_combDE$writeGeneModules(basename=plot_basename_neuron, gms='dR.genemodules')
Resample dataset and select populations
m_prog_combDE = resampleDataForCombinationDE(
orig_m_neural_path = paste0(output_path, '/m_neural.rds'),
num_min_cells_per_type2 = 40, # remove cell types with less than X cells
selected_type1 = "Progenitor", # "Both", "Neuron", "Progenitor"
sampling_categories = c('Type_step2'),
num_cells_per_category = 200, # number of cells sampled per timepoint X Type_step2
dispersion_prefiltering_zscore = NULL # 0, pre_select genes with sufficient dispersion score
)
prog_partitionning_genes = unique(unlist(cell_partition$step2_markers[levels(pData(m_prog_combDE$expressionSet)$Type_step2)]))
Pre-select type patterned genes
prog_patterned_genes_test = m_prog_combDE$runMonocle2DETest(
fullModelFormulaStr="~Type_step2",
reducedModelFormulaStr = "~1",
expressionFamily=VGAM::negbinomial.size()
)
prog_patterned_genes = prog_patterned_genes_test %>% as_tibble %>% dplyr::arrange(qval) %>% dplyr::filter(qval < 1e-5) %>% .$gene_short_name %>% as.character
prog_partitionning_genes %>% {length(intersect(., prog_patterned_genes)) / length(.)} %>% {sprintf("%.2f%% of the markers used to partition progenitors are included in the patterned gene list", .*100)}
m_prog_combDE$excludeGenesFromIds(which(!m_prog_combDE$getGeneNames() %in% prog_patterned_genes))
Generate tested models
progCombinatorialModels = generateCombinatorialModels(m_prog_combDE)
Run tests (about 24h with 8 cores)
Monocle_run_prog = runCombinatorialDETests(m_prog_combDE, progCombinatorialModels, numcores=8)
save(Monocle_run_prog, file=paste0(output_path,'/Monocle_run_prog_NegBinFamilyResample.Rdata'))
# load(paste0(output_path,'/Monocle_run_prog_NegBinFamilyResample.Rdata'))
Filter best model, for each gene
Monocle_run_prog_topModel = getTopModel(m_prog_combDE, Monocle_run_prog, temporal_pattern_thres=.2)
saveRDS(Monocle_run_prog_topModel, file=paste0(output_path, '/Monocle_run_prog_topModel.rds'))
# Monocle_run_prog_topModel = readRDS(file=paste0(output_path, '/Monocle_run_prog_topModel.rds'))
Add additional measures and filter models (fold change, average level, time correlation…)
pval_thres_prog = 1e-9
prog_models_annotations = getModelAnnotations(m_prog_combDE, model_list=unique(Monocle_run_prog_topModel$model))
m_prog_combDE$dR$genemodules = makeGeneModules(Monocle_run_prog_topModel, prog_models_annotations,
pval_thres = pval_thres_prog, pos_pop_level_threshold = .2,
pos_pop_ratio_threshold = .1, log2fc_thres = 2,
gm_grouping_vars = c("model", "temporal_pattern"),
gm_ordering_vars = c("temporal_pattern", "first_domain_factor", "num_domain")
)
length(unlist(m_prog_combDE$dR$genemodules))
setdiff(prog_partitionning_genes, unlist(m_prog_combDE$dR$genemodules))
print(paste0("Number of progenitor categories: ", length(which(unlist(lapply(m_prog_combDE$dR$genemodules, length)) > 0))))
save.image(paste0(output_path, '/workspace_postCombDE_progenitors.rds'))
saveRDS(m_prog_combDE$dR$genemodules, file=paste0(output_path, '/progenitors_combDE_genelists.Rdata'))
Plot predicted genes
prog_combDE_by_domain = list()
for(x in setdiff(names(m_prog_combDE$dR$genemodules), "Missing")) {
for(p in strsplit(strsplit(x, " / ")[[1]][1], '-')[[1]]) {
prog_combDE_by_domain[[p]] <- c(prog_combDE_by_domain[[p]], m_prog_combDE$dR$genemodules[[x]])
}
}
# we drop the Type_step2 in case a type has been excluded from the tests
prog_combDE_by_domain_mask = markersToMask(prog_combDE_by_domain[rev(levels(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2)))])[unlist(m_prog_combDE$dR$genemodules),]
gene_pValues = (Monocle_run_prog_topModel %>% {"names<-"(.$pval, .$genename)})[unlist(m_prog_combDE$dR$genemodules)]
gene_pValues_colors = colorRampPalette(c("white", "black"))(n = 10)[cut(log10(1e-100 + gene_pValues), 10)]
gene_pValues_colors[which(gene_pValues == 0)] <- "orange"
cellcluster_sizes_combDE = table(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2))
cellcluster_sizes_cumsum_prog_combDE = c(0, cumsum(cellcluster_sizes_combDE))
names(cellcluster_sizes_cumsum_prog_combDE) <- c(names(cellcluster_sizes_combDE), "")
tf_list = getTFs()
interesting_genes = sort(setdiff(unlist(m_prog_combDE$dR$genemodules), sort(unique(unlist(cell_partition$bothSteps_markers_neural)))))
plot_basename_prog = paste0('FIGS2_Neural_DE_Combinatorial_Progenitors_NegBinResampled')
m_prog_combDE$plotGeneModules(
basename=plot_basename_prog,
displayed.gms = c('dR.genemodules'),
displayed.geneset=NA, # plot all genes
use.dendrogram=NA,
display.clusters=NULL, #"State",
# file_settings=list(list(type='png', width=1500, height=4000)),
file_settings=list(list(type='pdf', width=7, height=as.integer(0.01*length(unlist(m_prog_combDE$dR$genemodules)) + 6))),
data_status='Raw',
gene_transformations=c('log', 'logscaled'),
display.legend=TRUE,
cell.ordering=order(rev(pData(m_prog_combDE$expressionSet)$Type_step2), pData(m_prog_combDE$expressionSet)$timepoint),
extra_colors=cbind(
"Neural types"=pop_colors$Step2[as.character(pData(m_prog_combDE$expressionSet)$Type_step2)],
"timepoint"=pop_colors$timepoint[as.character(pData(m_prog_combDE$expressionSet)$timepoint)]
),
genemodules.palette=rep(c("gray80", "gray90"), length(m_prog_combDE$dR$genemodules)),
genes.extra_colors=cbind(
"Partitionning"=unname(unlist(lapply(unlist(m_prog_combDE$dR$genemodules), function(l) if(l %in% prog_partitionning_genes) 'red' else 'white'))),
"pval"=gene_pValues_colors,
"TF"=unname(unlist(lapply(unlist(m_prog_combDE$dR$genemodules), function(l) if(l %in% tf_list) 'yellow' else 'white'))),
"Time behavior"=c("blue", "red", "white")[(Monocle_run_prog_topModel %>% {"names<-"(.$temporal_pattern, .$genename)}) [unlist(m_prog_combDE$dR$genemodules)]],
"+"=unname(unlist(lapply(unlist(m_prog_combDE$dR$genemodules), function(l) if(l %in% interesting_genes) 'green' else 'white')))
),
extra_legend=list("text"=c("", levels(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2))), "colors"=c('white', getClusterColors()[seq(length(unique(pData(m_prog_combDE$expressionSet)$Type_step2)))])),
rect_overlay=c(lapply(seq(rev(levels(droplevels(pData(m_prog_combDE$expressionSet)$Type_step2)))), function(i){
list(
xleft=cellcluster_sizes_cumsum_prog_combDE[[i]],
xright=cellcluster_sizes_cumsum_prog_combDE[[i+1]],
ytop=0.5,
ybottom=length(unlist(m_prog_combDE$dR$genemodules)) + 0.5,
width=.2,
color="grey"
)
}),
apply(which(prog_combDE_by_domain_mask==1, arr.ind=T), 1, function(x){
list(
xleft=cellcluster_sizes_cumsum_prog_combDE[[x[2]]],
xright=cellcluster_sizes_cumsum_prog_combDE[[x[2]+1]],
ytop=length(unlist(m_prog_combDE$dR$genemodules)) - x[1] + 0.5,
ybottom=length(unlist(m_prog_combDE$dR$genemodules)) - x[1]+1 + 0.5,
color='black'
)
})
),
pretty.params=list("size_factor"=1, "ngenes_per_lines" = 8, "side.height.fraction"=.3),
curr_plot_folder=output_path
)
Export predicted genes to file
m_prog_combDE$writeGeneModules(basename=plot_basename_prog, gms='dR.genemodules')
# ###############################
library(MSigDB)
gene_categories = list(
"Adhesion" = m$getGeneNames() %>% .[toupper(.) %in% unique(unlist(MSigDB[["C5_GENE_ONTOLOGY"]][grep('ADHESION', names(MSigDB[["C5_GENE_ONTOLOGY"]]), value=T)]["GO_CELL_CELL_ADHESION"]))],
"TF" = tf_list,
"NeuroTransmitters" = m$getGeneNames() %>% .[toupper(.) %in% unique(unlist(MSigDB[["C5_GENE_ONTOLOGY"]][["GO_NEUROTRANSMITTER_TRANSPORT"]]))]
)
neuron_markers_selection = names(gene_categories) %>% {setNames(lapply(., function(gc){intersect(unname(unlist(readRDS(paste0(output_path, '/neurons_combDE_genelists_ordered.rds')))), gene_categories[[gc]])}), .)}
# remove gene whose topmodel is in dl6 (population is very small)
dl6_only_genes = Monocle_run_neuron_topModel %>% dplyr::filter(model=="m000001000000" & genename %in% unname(unlist(readRDS(paste0(output_path, '/neurons_combDE_genelists_ordered.rds'))))) %>% .$genename
neuron_markers_selection$Adhesion <- setdiff(neuron_markers_selection$Adhesion, unique(c(tf_list, unlist(cell_partition$bothSteps_markers_neural), dl6_only_genes)))
neuron_markers_selection$TF <- setdiff(neuron_markers_selection$TF, unique(c(unlist(cell_partition$bothSteps_markers_neural), dl6_only_genes)))
neuron_markers_selection$NeuroTransmitters <- setdiff(neuron_markers_selection$NeuroTransmitters, unlist(neuron_markers_selection[c('Adhesion', 'TF')]))
# Order neurotransmitter by category (inhibitory, excitatory, ...etc)
neuro_trans_order = c("Gad1", "Gad2", "Slc6a5", "Slc32a1", "Sv2c", "Slc5a7", "Stx1a", "Slc18a3", "Slc17a6", "Rims4", "Rph3a")
neuron_markers_selection$NeuroTransmitters <- c(intersect(neuro_trans_order, neuron_markers_selection$NeuroTransmitters), setdiff(neuron_markers_selection$NeuroTransmitters, neuro_trans_order))
write(jsonlite::toJSON(neuron_markers_selection, pretty=TRUE), paste0(output_path, '/FIG3_Prediction_NeuronHighlights.json'))
# genelist is a 3-level list: 1. plot 2. plot categories 3. genes
plotPredictedSelection(currm=m_neuron,
genelist=list("FIG3_Neuron_Marker_Prediction"=neuron_markers_selection),
selected_genes=unname(unlist(readRDS(paste0(output_path, '/neurons_combDE_genelists.rds')))),
monocle_run_topModel=Monocle_run_neuron_topModel,
pc = pop_colors$Step2,
zscore=T,
height=7, width=15
)
Plot Claudin3 (gene average per type1 X Dv X timepoint)
plotAveragePerTypeTimeDV(m_neural, gene = "Cldn3", basename="FIG3_Gene_expression_dynamics_")
processed_subsets = list()
denovo_folder = paste0(output_path, "/subClustering_Neuron_by_TypeStep2/")
dir.create(denovo_folder, showWarnings = FALSE)
for(p in levels(droplevels(pData(m$expressionSet) %>% dplyr::filter(Type_step1=="Neuron") %>% .$Type_step2))){
selected_cells = which(pData(m$expressionSet)$Type_step2 %in% p)
if(length(selected_cells) >= 50)
processed_subsets[[p]] <- selected_cells
}
Identify gene modules in each domain
m_pops = list()
for(p in names(processed_subsets)) {
currm = m$copy()
currm$plot_folder = denovo_folder
currm$excludeCellsFromIds(setdiff(seq(m$getNumberOfCells()), processed_subsets[[p]]))
currm$excludeUnexpressedGenes(min.cells=20, min.level=1, verbose=T, data_status='Raw')
corr.curr.mat = fastCor(t(currm$getReadcounts(data_status='Raw')), method="spearman")
# Hack identifyGeneModules as it requires Normalized readcounts (TODO: remove this requirement)
currm$readcounts_norm = currm$readcounts_raw
currm$identifyGeneModules(
method="TopCorr_DR",
corr=corr.curr.mat,
corrgenes_t=2000,
topcorr_num_initial_gms=200,
topcorr_num_final_gms=200,
topcorr_mod_min_cell=5,
topcorr_mod_consistency_thres=0.4,
topcorr_mod_skewness_thres=-Inf,
topcorr_min_cell_level=1,
data_status='Raw'
)
if(length(unlist(currm$topCorr_DR$genemodules)) > 0){
m_pops[[p]] <- currm$copy()
}
}
saveRDS(m_pops, file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons.rds'))
# m_pops = readRDS(file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons.rds'))
subclustering_params = rjson::fromJSON(file='./input_files/subclustering_typestep2_181106.json')
# Add cluster names / colors
for(d in names(subclustering_params)){
subclustering_params[[d]]$populations_names = paste0(d, '.', seq(subclustering_params[[d]]$num_clusters))
subclustering_params[[d]]$populations_colors = colorRampPalette(c("gray80", "black"))(subclustering_params[[d]]$num_clusters)
}
# Extra gene level are normalized globally
extra_genelevels <- apply(m$getReadcounts('Raw')[unique(c(unlist(cell_partition$bothSteps_markers_neural_unique), unlist(lapply(subclustering_params, function(x) x$extra_markers)))), ], 1, function(x) log(1+x)/max(log(1+x)))
extra_genelevels_colors = apply(extra_genelevels, 2, function(x) colorRampPalette(c("#0464DF", "#FFE800"))(n = 1000)[1+as.integer(999*x)])
rownames(extra_genelevels_colors) <- rownames(extra_genelevels)
plotSubC = TRUE
m_pops2 = list()
for(p in names(subclustering_params)){
print(p)
m_pops[[p]]$identifyCellClusters(method='hclust', used_genes="topCorr_DR.genemodules", data_status='Raw')
clustering_order = m_pops[[p]]$cellClusters[['hclust']]$res$order
if(plotSubC)
m_pops[[p]]$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Hclust'),
displayed.gms = c('topCorr_DR.genemodules'),
displayed.geneset=getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), #NA
use.dendrogram=NA, #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=10)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
cell.ordering=unlist(clustering_order),
extra_colors=cbind(
"Neural types"=getClusterColors()[pData(m_pops[[p]]$expressionSet)$Type_step2],
"DV position"=c(colorRampPalette(c("grey", "darkgreen"))(n = 13), 'white')[pData(m_pops[[p]]$expressionSet)$DV],
extra_genelevels_colors[m_pops[[p]]$getCellsNames(), ]
),
genemodules.text_colors=unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules, function(l) if(any(unique(unlist(cell_partition$bothSteps_markers_neural)) %in% l)) 'red' else 'black')),
pretty.params=(list("size_factor"=4, "ngenes_per_lines" = 10, "side.height.fraction"=.6))
)
if(p %in% names(subclustering_params)){
m_pops$cellClusters[['curatedSelection']] <- NULL
m_pops[[p]]$dR$genemodules <- Filter(function(x){any(unlist(subclustering_params[[p]]$populations_markers) %in% x)}, m_pops[[p]]$topCorr_DR$genemodules)
if(length(m_pops[[p]]$dR$genemodules) > 1){
m_pops[[p]]$identifyCellClusters(method='hclust', clust_name="curatedSelection" , used_genes="dR.genemodules", data_status='Raw', numclusters = subclustering_params[[p]][["num_clusters"]])
timepoint_clusters_stats = cbind.data.frame(
cell_ids=m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids,
timepoint=pData(m_pops[[p]]$expressionSet)$timepoint,
timepoint_norm=
(pData(m_pops[[p]]$expressionSet)$timepoint-min(pData(m$expressionSet)$timepoint)) /
(max(pData(m$expressionSet)$timepoint)-min(pData(m$expressionSet)$timepoint))
) %>%
dplyr::group_by(cell_ids) %>%
dplyr::summarise(
mean=mean(timepoint),
sd=sd(timepoint),
mean_norm=mean(timepoint_norm),
sd_norm=sd(timepoint_norm)
)
cell_ids_timepoint_color = timepoint_clusters_stats %>% {"names<-"(hsv(1, .$mean_norm, 1, 1), .$cell_ids)}
if(plotSubC)
m_pops[[p]]$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Curated_Hclust'),
displayed.gms = c('dR.genemodules'),
displayed.geneset=unique(c(getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), unlist(subclustering_params[[p]]$populations_markers))), #NA
use.dendrogram="curatedSelection", #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=10)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
extra_legend=list("text"=c("", subclustering_params[[p]]$populations_names), "colors"=c('white', subclustering_params[[p]]$populations_colors)),
extra_colors=cbind(
cell_ids_timepoint_color[m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids],
subclustering_params[[p]]$populations_colors[m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids],
extra_genelevels_colors[m_pops[[p]]$getCellsNames(), unlist(subclustering_params[[p]]$extra_markers)]
),
pretty.params=(list("size_factor"=4, "ngenes_per_lines" = 5, "side.height.fraction"=.6))
)
if(plotSubC)
m_pops[[p]]$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Hclust_CuratedOrdering'),
displayed.gms = c('topCorr_DR.genemodules'),
displayed.geneset=getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), #NA
use.dendrogram="curatedSelection", #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=15)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
# cell.ordering=,
extra_colors=cbind(
"Neural types"=getClusterColors()[pData(m_pops[[p]]$expressionSet)$Type_step2],
"DV position"=c(colorRampPalette(c("grey", "darkgreen"))(n = 13), 'white')[pData(m_pops[[p]]$expressionSet)$DV],
extra_genelevels_colors[m_pops[[p]]$getCellsNames(), ]
),
genemodules.text_colors=unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules, function(l) if(any(unique(unlist(cell_partition$bothSteps_markers_neural)) %in% l)) 'red' else 'black')),
genemodules.extra_colors = cbind(
# colorRampPalette(c("white", "black"))(n = 10)[cut(genemodules.skewness, 10)],
# c("white", "green")[1+1*(genemodules.skewness > 2)]
c("white", "green")[1+1*unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules,function(x){length(intersect(x, unlist(subclustering_params[[p]]$populations_markers))) > 0}))],
c("white", "green")[1+1*unlist(lapply(m_pops[[p]]$topCorr_DR$genemodules,function(x){length(intersect(x, unlist(subclustering_params[[p]]$populations_markers))) > 0}))]
# c("white", "red")[1+1*(seq(length(m_pops[[p]]$topCorr_DR$genemodules)) %in% curated_list)]
),
pretty.params=(list("size_factor"=2, "ngenes_per_lines" = 15, "side.height.fraction"=.6))
)
# average dataset from hclust
# ! we should logscale the dataset globally before doing a domain specific gm/cluster average
domain_clust_gm_avg = data.frame(t(m_pops[[p]]$getReadcounts('Raw')[unlist(m_pops[[p]]$dR$genemodules), ]), check.names=F) %>%
tibble::rownames_to_column('cellname') %>%
tidyr::gather(genename, value, -cellname) %>%
dplyr::left_join(cbind.data.frame(cell_ids=m_pops[[p]]$cellClusters[['curatedSelection']]$cell_ids, cellname=m_pops[[p]]$getCellsNames()), by="cellname") %>%
dplyr::left_join(m_pops[[p]]$dR$genemodules %>% {cbind.data.frame("gm_id"=rep(seq(length(.)), lapply(.,length)), genename=unlist(.))}, by="genename") %>%
dplyr::group_by(genename) %>%
dplyr::mutate(logscaled = scale(log(1+value), center=T, scale=T)) %>%
dplyr::group_by(cell_ids, gm_id) %>% # average per gm
dplyr::summarise(mean_logscaled=mean(logscaled)) %>%
tidyr::spread(cell_ids, mean_logscaled) %>%
tibble::column_to_rownames("gm_id")
# Remove "excluded" cell clusters
domain_clust_gm_avg <- domain_clust_gm_avg[, which(subclustering_params[[p]]$populations_names != "Excluded")]
colnames(domain_clust_gm_avg) <- setdiff(subclustering_params[[p]]$populations_names, "Excluded")
m_pops2[[p]] <- m_pops[[p]]$copy()
}
# if(is.null(m_pops$cellClusters[['curatedSelection']])){
# m_pops$cellClusters[['curatedSelection']] <- list("num_clusters"=1)
# }
}
m_pops[[p]]$writeGeneModules(p, gms="topCorr_DR.genemodules")
m_pops[[p]]$writeGeneModules(p, gms="dR.genemodules")
}
saveRDS(m_pops2, file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons_v2.rds'))
# m_pops2 = readRDS(file=paste0(output_path, '/All_Clusters_topCorrDR_Neurons_v2.rds'))
Export curated gene module list
curated_gms.df = do.call(rbind.data.frame, lapply(names(m_pops2), function(p){
do.call(rbind.data.frame,
lapply(seq(length(m_pops2[[p]]$dR$genemodules)), function(i){
list("domain"=p, "GM_id"=i, "Genes"=paste0(m_pops2[[p]]$dR$genemodules[[i]], collapse=","))
}))
}))
write.table(curated_gms.df, file=paste0(output_path,'/Table_S2.csv'), row.names=F, sep=";", quote=FALSE)
Clean-up V2a and MN subclusters
excl_subtypes = list(
"V2a"=list("cl"=c(5), "gm"=2:5),
"MN"=list("cl"=c(5), "gm"=1:10)
)
excl_subtypes_cl = unlist(lapply(names(excl_subtypes), function(i){paste0(i,'.',excl_subtypes[[i]]$cl)}))
# average cluster sample time
cluster_avg_sample_time = setNames(
lapply(names(m_pops2), function(p){
unlist(lapply(seq(m_pops2[[p]]$cellClusters[['curatedSelection']]$num_clusters), function(i){
mean(pData(m_pops[[p]]$expressionSet)$timepoint[which(m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids == i)])
})) %>% {(.-min(pData(m$expressionSet)$timepoint)) / (max(pData(m$expressionSet)$timepoint)-min(pData(m$expressionSet)$timepoint))}
}),
names(m_pops2))
for(p in names(excl_subtypes)){
print(p)
currm = m_pops2[[p]]$copy()
excl_cells = which(currm$cellClusters[['curatedSelection']]$cell_ids %in% excl_subtypes[[p]]$cl)
currm$excludeCellsFromIds(excl_cells)
currm$cellClusters[['curatedSelection']]$cell_ids <- currm$cellClusters[['curatedSelection']]$cell_ids[-excl_cells]
currm$cellClusters[['curatedSelection']]$res <- currm$cellClusters[['curatedSelection']]$res %>% as.dendrogram %>% dendextend::prune(m_pops2[[p]]$getCellsNames()[excl_cells]) %>% as.hclust
currm$dR$genemodules.selected <- currm$dR$genemodules[excl_subtypes[[p]]$gm]
currm$plotGeneModules(
main=p,
basename=paste0(p, '_deNOVO_Curated_Hclust_CLEAN'),
displayed.gms = c('dR.genemodules.selected'),
displayed.geneset=unique(c(getGenesFromGOterms(c('GO:0007399', 'GO:0030154', 'GO:0016477')), unlist(subclustering_params[[p]]$populations_markers))), #NA
use.dendrogram="curatedSelection", #'hclust',
display.clusters=NULL, #"State",
file_settings=list(list(type='pdf', width=10, height=10)),
data_status='Raw',
gene_transformations=c('logscaled'), # 'log'
display.legend=TRUE,
extra_legend=list("text"=c("", subclustering_params[[p]]$populations_names), "colors"=c('white', subclustering_params[[p]]$populations_colors)),
extra_colors=cbind(
cluster_avg_sample_time[[p]] %>% {.[currm$cellClusters[['curatedSelection']]$cell_ids]} %>% {hsv(1, ., 1, 1)},
subclustering_params[[p]]$populations_colors[currm$cellClusters[['curatedSelection']]$cell_ids],
extra_genelevels_colors[currm$getCellsNames(), unlist(subclustering_params[[p]]$extra_markers)]
),
pretty.params=(list("size_factor"=4, "ngenes_per_lines" = 5, "side.height.fraction"=.6))
)
}
# Summary table indicating the number of genes and modules for each step of the gene module identification method used to cluster the neuronal subtypes.
domain_stats = cbind.data.frame(
"Domain"=names(m_pops2),
"Cells"=unlist(lapply(m_pops2, function(x) x$getNumberOfCells())),
"Expressed Genes"=unlist(lapply(m_pops2, function(x) x$getNumberOfGenes())),
"Correlated Genes"=unlist(lapply(m_pops2, function(x) length(unlist(x$topCorr_DR$genemodules.history[['corrgenes']])))),
"Unbiased genes"=unlist(lapply(m_pops2, function(x) length(unlist(x$topCorr_DR$genemodules)))),
"Curated genes"=unlist(lapply(m_pops2, function(x) length(unlist(x$dR$genemodules)))),
"Unbiased gene modules"=unlist(lapply(m_pops2, function(x) length(x$topCorr_DR$genemodules))),
"Curated gene modules"=unlist(lapply(m_pops2, function(x) length(x$dR$genemodules))),
"Neuron subtypes"=unlist(lapply(m_pops2, function(x) x$cellClusters[['curatedSelection']]$num_clusters))
)
write.table(domain_stats, file=paste0(output_path,'/Table_S4.csv'), row.names=F, sep=";", quote=FALSE)
Build aggregated dendrogram
data_time = unlist(lapply(names(m_pops2), function(p){
print(p)
m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids %>% .[!duplicated(.)] %>% {setNames(cluster_avg_sample_time[[p]][.], paste0(p, '.', .))}
})) %>% rbind(.,.)
data_col = unlist(lapply(names(m_pops2), function(p){
print(p)
m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids %>% .[!duplicated(.)] %>% subclustering_params[[p]]$populations_colors[.]
}))
library(ape)
pruned_dendrograms = lapply(names(m_pops2), function(p){
print(p)
redondant_cell_names = m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids %>% .[duplicated(.)] %>% names
m_pops2[[p]]$cellClusters[['curatedSelection']]$res %>% as.phylo %>% ape::drop.tip(redondant_cell_names) %>% as.dendrogram
})
dend_merge_all = Reduce(function(x,y) merge(x, y, height=1000, adjust="auto"), pruned_dendrograms)
pdf(paste0(output_path, '/FIG4_subclusters_dendrogram_full.pdf'), height=3, pointsize=2, useDingbats=FALSE)
heatmap.3(
data_time,
Colv=dend_merge_all,
Rowv=FALSE,
dendrogram="column",
labRow=FALSE,
ColSideColors=t(rbind(
data_col,
hsv(1, data_time[1,], 1, 1)
)),
useRaster=FALSE,
lmat = rbind(c(5, 4, 0), c(0,1,0), c(3, 2,0)),
lhei = c(1, .5, .2),
lwid = c(1, 10, 1),
key=F
)
graphics.off()
Aggregate all domains heatmaps in a single pdf
agg_subclust_name = paste0(output_path, "/FIG4_Domain_sub_clustering.pdf")
subclust_path = grep("pdf", list.files(m_pops2[[1]]$plot_folder, full.names=T), value=T)
system2(command="gs", args=sprintf("-dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=%s %s", agg_subclust_name, paste0(subclust_path, collapse=" ")))
Download all neuronal subclusters
Gene average per cell type
subtypes.df = do.call(rbind, lapply(names(subclustering_params), function(p){
cbind.data.frame(
subtype_name = paste0(p, '.', m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids),
cellname = names(m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids),
subtype_cluster_id = m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids,
type = p
)
}))
subtype_map_genes = list(
"all"=unlist(lapply(subclustering_params, function(x) unlist(x$populations_markers))),
"custom"=c("Onecut2","Pou2f2","Zfhx2","Zfhx3","Zfhx4","Pou3f1","Nfia","Nfib","Nfix","Neurod2","Neurod6","Tcf4","Enc1","Olig3","Hes6","Neurog1","Neurog2","Ascl1","Gadd45g","Neurod1","Neurod4")
)
subtypes_gene_avg = data.frame(t(m$getReadcounts('Raw')[unique(unlist(subtype_map_genes)), which(pData(m$expressionSet)$Type_step2 %in% intersect(names(subclustering_params), names(m_pops2)))]), check.names=F) %>%
tibble::rownames_to_column('cellname') %>%
tidyr::gather(genename, value, -cellname) %>%
dplyr::left_join(subtypes.df, by="cellname") %>%
dplyr::group_by(genename, subtype_name) %>%
dplyr::mutate(mean=mean(value)) %>%
dplyr::distinct(genename, subtype_name, .keep_all=TRUE) %>%
dplyr::group_by(genename) %>%
dplyr::mutate(mean_norm_max=mean/max(mean),
type = factor(type, levels=names(subclustering_params))
) %>%
dplyr::ungroup()
mgnames = "custom"
print(mgnames)
p <- subtypes_gene_avg %>%
dplyr::mutate(
genename=factor(genename, levels=unique(subtype_map_genes[[mgnames]]))
) %>%
dplyr::filter(
genename %in% unique(subtype_map_genes[[mgnames]]) &
!subtype_name %in% excl_subtypes_cl
) %>%
ggplot(.) +
geom_count(aes(x=subtype_name, y=genename, size=mean_norm_max, fill=type), color='gray80', stroke=0, shape=21) +
scale_size_area(max_size=5) +
scale_fill_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
# scale_color_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
xlab("") + ylab("") +
coord_flip() +
# scale_x_discrete(position = "bottom") +
scale_y_discrete(position = "bottom") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 0))#, size=8))
pdf(paste0(output_path, '/FIG5_subclustering_map_', mgnames,'_vertical.pdf'), height=10, width=6, useDingbats=FALSE)
print(p)
graphics.off()
mgnames = "all"
print(mgnames)
p <- subtypes_gene_avg %>%
dplyr::mutate(
genename=factor(genename, levels=unique(subtype_map_genes[[mgnames]]))
) %>%
dplyr::filter(
genename %in% unique(subtype_map_genes[[mgnames]]) &
!subtype_name %in% excl_subtypes_cl
) %>%
ggplot(.) +
geom_count(aes(x=genename, y=subtype_name, size=mean_norm_max, fill=type), color='gray80', stroke=0, shape=21) +
scale_size_area(max_size=3) +
scale_fill_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
# scale_color_manual(breaks=names(pop_colors$Step2), values=pop_colors$Step2) +
scale_x_discrete(position = "top") + xlab("") + ylab("") +
theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 0, size=7))
pdf(paste0(output_path, '/FIG5_subclustering_map_', mgnames,'.pdf'), height=7, width=12, useDingbats=FALSE)
print(p)
graphics.off()
Export subcluster ids in new m_neural structure
m_neural = readRDS(paste0(output_path, '/m_neural.rds'))
pData(m_neural$expressionSet)$subtypes = rep(NA, m_neural$getNumberOfCells())
subtypes=unlist(lapply(grep("Null_", names(subclustering_params), value=T, invert=T), function(p){
# subtypes=unlist(lapply(grep("Null_", names(m_pops2), value=T, invert=T), function(p){
setNames(paste0(p, '.', m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids), names(m_pops2[[p]]$cellClusters[['curatedSelection']]$cell_ids))
}))
pData(m_neural$expressionSet)[names(subtypes), "subtypes"] <- subtypes
pData(m_neural$expressionSet)$subtypes[which(pData(m_neural$expressionSet)$subtypes %in% excl_subtypes_cl)] <- NA
saveRDS(m_neural, file=paste0(output_path, '/m_neural_subtypes.rds'))
# m_neural = readRDS(file=paste0(output_path, '/m_neural_subtypes.rds'))
Plot the two waves of neurogenesis in selected domains (Figure 6A)
gene_list = c('Onecut2', 'Pou2f2', 'Zfhx3', 'Zfhx4', 'Nfia', 'Nfib', 'Neurod2', 'Neurod6')
domain_list = rev(c('V3', 'MN', 'V2b', 'V2a', 'V1', 'V0', "dl6", "dl5", "dl4", "dl3", "dl2", "dl1"))
dot.input2 = pData(m_neural$expressionSet) %>%
tibble::rownames_to_column("cell_name") %>%
dplyr::filter(Type_step2 %in% domain_list) %>%
dplyr::select(cell_name, timepoint, Type_step2) %>%
dplyr::left_join(
as.data.frame(t(m_neural$getReadcounts('Raw')[gene_list,])) %>% tibble::rownames_to_column("cell_name") %>% tidyr::gather(gene_name, level, -cell_name),
by="cell_name"
) %>%
dplyr::group_by(timepoint, gene_name, Type_step2) %>%
dplyr::summarize(mean=mean(level), count=n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(
gene_name=factor(gene_name, levels=gene_list),
Type_step2=factor(Type_step2, levels=domain_list)
)
# dot.input2[which(dot.input2$count < min.cells), "mean"] <- NA
dot.input2 <- dot.input2 %>% dplyr::filter(Type_step2 %in% domain_list)
gg <- ggplot(dot.input2, aes(x = factor(timepoint), y = gene_name)) +
geom_count(aes(size = mean, fill = factor(timepoint)), color="black", shape=21) +
facet_wrap(~ unlist(dot.input2$Type_step2), nrow=2) +
scale_size_area(max_size = 5) +
scale_fill_manual(values=pop_colors$timepoint) +
theme_bw() +
ylab("Genes") +
xlab("Embryonic days") +
guides(color = 'none')
pdf(paste0(output_path, "FIG6_Domain_dynamics.pdf"), width = 10, height = 4, useDingbats = FALSE)
print(gg)
graphics.off()
Resample dataset to have as many cells at each pairs of (timepoint x domain) conditions
m_RP = m_neural$copy()
RNGkind("Mersenne-Twister")
set.seed(1)
equal_sample_size = 500
selected_cell_ids = unlist(lapply(
2:12, function(dv){
dv_ids = which(pData(m_RP$expressionSet)$DV==dv)
p = 1 / table(pData(m_RP$expressionSet)$timepoint[dv_ids])
sample(
dv_ids,
equal_sample_size,
prob=p[as.character(pData(m_RP$expressionSet)$timepoint[dv_ids])],
replace=T)
}))
m_RP$excludeCellsFromIds(setdiff(seq(m_RP$getNumberOfCells()), selected_cell_ids))
m_RP$excludeUnexpressedGenes(min.cells=1, min.level=1, verbose=TRUE, data_status="Raw")
# ### Run GM identification with subsample cells
Identify gene modules with Antler’s topCorr dimensional reduction technique
corr.mat.RP = fastCor(t(m_RP$getReadcounts(data_status='Raw')), method="spearman")
# Hack identifyGeneModules as it requires Normalized readcounts (TODO: remove this requirement)
m_RP$readcounts_norm = m_RP$readcounts_raw
m_RP$identifyGeneModules(
method="TopCorr_DR",
corr=corr.mat.RP,
topcorr_corr_min=1, # lower than usual
corr_t = 0.3,
topcorr_num_min_final_gms=200,
topcorr_mod_min_cell=10, # default
topcorr_mod_consistency_thres=0.4, # default
topcorr_mod_skewness_thres=-Inf, # default
topcorr_min_cell_level=.5, # default
data_status='Raw'
)
Select gene modules with common neuro-glio bifurcation markers
bif_genelist = list(
"early"=c('Lin28a'),
"glial"=c('Fabp7'),
"progenitor"=c('Sox2'),
"neuron"=c("Tubb3", "Elavl3")
)
m_RP$dR$genemodules = lapply(bif_genelist, function(l){
unlist(Filter(function(x){any(l %in% x)}, m_RP$topCorr_DR$genemodules))
})
names(m_RP$dR$genemodules) = names(bif_genelist)
m_RP$plotGeneModules(
basename='GM_RP_byDV_Resampled_Selected',
displayed.gms = c('dR.genemodules'),
displayed.geneset="naked",
use.dendrogram=NA,
display.clusters=NULL,
display.legend=T,
file_settings=
list(
list(type='pdf', width=30, height=15)
# list(type='png', width=2000, height=1500)
),
cell.ordering = order(pData(m_RP$expressionSet)$DV),
data_status='Raw',
gene_transformations='logscaled',
extra_colors=cbind(
pop_colors$timepoint[as.character(pData(m_RP$expressionSet)$timepoint)],
"Neuron/Progenitor"=ifelse(pData(m_RP$expressionSet)$Type_step1 == "Progenitor", pop_colors$Step1[['Progenitor']], pop_colors$Step1[['Neuron']]),
"DV position"=pop_colors$DV[as.character(pData(m_RP$expressionSet)$DV)]
),
pretty.params=(list("size_factor"=6, "ngenes_per_lines" = 10, "side.height.fraction"=.3))
)
# Remove DV bias with monocle
candidate_gms = m_RP$dR$genemodules
gm_genes_DV_test = m_RP$runMonocle2DETest(
fullModelFormulaStr="~DV",
reducedModelFormulaStr = "~1",
expressionFamily=VGAM::negbinomial.size(),
tested_genes=unlist(candidate_gms)
)
logpval_thres = 15
pdf(paste0(output_path, '/gene_DV_pval.pdf'), useDingbats=FALSE)
plot(-log10(gm_genes_DV_test$pval), seq(nrow(gm_genes_DV_test)), cex=0)
text(-log10(gm_genes_DV_test$pval), seq(nrow(gm_genes_DV_test)), gm_genes_DV_test$current_gene_names, cex=.3)
abline(v=logpval_thres, col="red")
dev.off()
biased_genes = gm_genes_DV_test %>% dplyr::mutate(logpval=-log10(pval)) %>% dplyr::filter(logpval > logpval_thres) %>% .$gene_short_name %>% as.character
selected_gms = candidate_gms %>% {lapply(., function(l) setdiff(l, biased_genes))}
saveRDS(selected_gms, paste0(output_path, '/DV_debiased_GMs.rds'))
# selected_gms = readRDS(paste0(output_path, '/DV_debiased_GMs.rds'))
m_RP$normalize("geometric_mean_sizeFactors")
m_neural$normalize("geometric_mean_sizeFactors")
pca.input=log(m_RP$getReadcounts('Normalized')[unlist(selected_gms), ]+1)
pca_res= prcomp(t(pca.input), center=TRUE, scale=TRUE)
pca_coord = pca_res$x[,1:2]
summary(pca_res)$importance[,1:2]
apply(pca_coord, 2, var)
table(abs(t(t(pca_res$x %*% t(pca_res$rotation)) * pca_res$scale + pca_res$center) - t(pca.input)) < 1e-10)
table(abs( pca_res$x - t((pca.input - pca_res$center) / pca_res$scale) %*% pca_res$rotation ) < 1e-10)
# Rotate all cells with PCA
pca.input_all=log(m_neural$getReadcounts('Normalized')[unlist(selected_gms), ]+1)
pca_res_all_x = t((pca.input_all - pca_res$center) / pca_res$scale) %*% pca_res$rotation
# Manual x-axis flip to have neuron on the right hand side
pca_res_all_x[, 1] <- - pca_res_all_x[, 1]
pData(m_neural$expressionSet)$Pseudotime = pca_res_all_x[, 1] %>% {100*((. - min(.))/(max(.) - min(.)))}
plotNeuroSpace(cell_domains=2:12, genelist=c('Lin28a', 'Fabp7', 'Sox2', "Tubb3", "Elavl3"), basename="Fig7A_")
# plotNeuroSpace(cell_domains=2, genelist=c('Sox2', "Tubb3"), basename="Fig7B_2_")
# plotNeuroSpace(cell_domains=3, genelist=c('Sox2', "Tubb3"), basename="Fig7B_3_")
# plotNeuroSpace(cell_domains=12, genelist=c('Sox2', "Tubb3"), basename="Fig7B_12_")
library(monocle)
df_pheno = pData(m_neural$expressionSet)
df_feature = cbind(fData(m_neural$expressionSet), 'gene_short_name'=fData(m_neural$expressionSet)$current_gene_names) # "gene_short_name" may be required by monocle
rownames(df_feature) <- df_feature$gene_short_name
currHSMM <- monocle::newCellDataSet(
as.matrix(m_neural$getReadcounts(data_status='Raw')),
phenoData = new("AnnotatedDataFrame", data = df_pheno),
featureData = new('AnnotatedDataFrame', data = df_feature),
lowerDetectionLimit = .5,
expressionFamily=VGAM::negbinomial.size()
)
currHSMM <- estimateSizeFactors(currHSMM)
currHSMM <- estimateDispersions(currHSMM) # only with VGAM::negbinomial.size()
smoothed_genes = getDispersedGenes(m_neural$getReadcounts('Normalized'), 0)
# smoothed_genes = sort(unique(c(
# (pt_genes.df %>% .$gene_list %>% unlist %>% unique %>% sort),
# unlist(lapply(highlighted_genes, function(x) x[[2]]))
# )))
num_pt = 100
mincells = 20
Smoothing runtime takes about 20 mins
t0=Sys.time()
bif_dataset_vgam = do.call(cbind, lapply(2:12, function(dv){
print(dv)
cds_subset <- currHSMM[smoothed_genes, which(pData(currHSMM)$DV == dv)]
newdata <- data.frame(Pseudotime = seq(min(pData(cds_subset)$Pseudotime), max(pData(cds_subset)$Pseudotime), length.out = num_pt))
sc <- monocle::genSmoothCurves(cds_subset, cores=1, trend_formula = '~sm.ns(Pseudotime, df=3)', relative_expr = T, new_data = newdata)
insufficiently_covered_genes = apply(m_neural$getReadcounts('Normalized')[rownames(exprs(cds_subset)), colnames(exprs(cds_subset))], 1, function(x){length(which(x>0))<mincells})
sc[insufficiently_covered_genes, ] <- NA
colnames(sc) <- paste0(dv, "_1_", seq(num_pt))
sc
}))
t1=Sys.time()
print(t1-t0)
bif_dataset_whole_log = log(1+bif_dataset_vgam)
pt_genes.df = read.table(file='./input_files/pseudotime_genes_short.csv', sep='\t', stringsAsFactors=F, header=TRUE, skipNul=T) %>%
tidyr::gather(gene_type, gene_list, -domain, -DV) %>%
dplyr::rowwise() %>%
dplyr::mutate(
gene_list=strsplit(gene_list, split=", ")
)
pt_genes.list = pt_genes.df %>% dplyr::arrange(desc(DV)) %>% .$gene_list %>% unlist %>% unique %>% as.list
plotNeurogenesisHeatmap(
bif_dataset_whole_log %>% is.na %>% replace(bif_dataset_whole_log, ., 0),
pt_clusters_reordered = pt_genes.list,
plotfullname=paste0(output_path, '/FIG7C_Neurogenesis_heatmap.pdf'),
num_pt=num_pt,
plotted_DV=2:12,
zscore_cap = NULL,
dv_colors=pop_colors$DV
)
highlighted_genes = list(
"dp2-dl2"=list(11, c("Foxd3", "Neurog1", "Olig3", "Msx1", "Sox2", "Tubb3")),
"dp3-dl3"=list(10, c("Isl1", "Neurog2", "Olig3", "Msx1", "Pax3", "Tubb3")),
"p0-V0"=list(6, c("Dbx1", "Evx1", "Neurog1", "Pax6", "Sox2", "Tubb3")),
"p3-V3"=list(2, c("Nkx2-2", "Neurog3", "Sim1", "Olig3", "Sox2"))
)
plot.data = t(apply(bif_dataset_whole_log[, grep(paste0('_1_'), colnames(bif_dataset_whole_log), value=T)], 1, function(x) x/max(x, na.rm=T)))
plot.data[is.na(plot.data)] <- 0
for(n in names(highlighted_genes)){
print(n)
NA
selected_domain_cols = paste0(highlighted_genes[[n]][[1]],'_1_', 1:num_pt)
# select genes
genes = highlighted_genes[[n]][[2]]
print(paste0("Missing genes:", paste0(genes %>% .[!. %in% rownames(plot.data)], collapse=', ')))
genes <- genes %>% .[. %in% rownames(plot.data)]
# set up dataset for ggplot
data = as.data.frame(plot.data[genes, selected_domain_cols]) %>%
tibble::rownames_to_column("genename") %>%
tidyr::gather(PT, level, -genename) %>%
dplyr::mutate(PT=factor(PT, levels=selected_domain_cols))
data <- data %>%
dplyr::group_by(genename) %>%
dplyr::mutate(level=level/max(level))
# ggplot
p <- ggplot(data) +
geom_line(aes(x=as.numeric(PT), y=level, color=genename, group=genename)) +
# scale_x_continuous(pretty(adata$PT)) +
scale_colour_manual(values = getClusterColors(v=2)) + ggtitle(n) + xlab("Pseudotime") +
ylab("Relative expression") + theme_classic() +
theme(axis.text.x = element_blank())
pdf(paste0(output_path, '/PT_profile_', n, '.pdf'), height=4, useDingbats=FALSE)
print(p)
graphics.off()
}
Export complete dataset meta-data, including all cell types
full_pData = read.table('./dataset/phenoData.csv', header=TRUE, sep="\t", row.names=1, as.is=TRUE, stringsAsFactors=F, check.names=FALSE)
full_pData$Type_step1 = rep("Outliers", nrow(full_pData))
full_pData[rownames(pData(m$expressionSet)), "Type_step1"] <- as.character(pData(m$expressionSet)$Type_step1)
full_pData$Type_step2 = rep("Outliers", nrow(full_pData))
full_pData[rownames(pData(m$expressionSet)), "Type_step2"] <- as.character(pData(m$expressionSet)$Type_step2)
full_pData[rownames(pData(m$expressionSet)), "Type_step2_unique"] <- as.character(pData(m$expressionSet)$Type_step2_unique)
full_pData$DV = rep(NA, nrow(full_pData))
full_pData[rownames(pData(m_neural$expressionSet)), "DV"] <- as.character(pData(m_neural$expressionSet)$DV)
full_pData$Neuron_subtypes = rep(NA, nrow(full_pData))
full_pData[rownames(pData(m_neural$expressionSet)), "Neuron_subtypes"] <- as.character(pData(m_neural$expressionSet)$subtypes)
full_pData$Pseudotime = rep(NA, nrow(full_pData))
full_pData[rownames(pData(m_neural$expressionSet)), "Pseudotime"] <- as.character(pData(m_neural$expressionSet)$Pseudotime)
write.table(x=full_pData[, c('timepoint', 'replicate_id', 'Type_step1', 'Type_step2', 'Type_step2_unique', 'DV', 'Neuron_subtypes', 'Pseudotime')], file='./output/phenoData_annotated.csv', sep='\t', row.names=TRUE, quote=TRUE, col.names=NA)