Closed sabrinacamp2 closed 2 years ago
Can you show the full code you used?
Sure!
tumor_barcodes = rownames(combined[[]] %>% dplyr::filter(identity == 'tumor'))
other_barcodes = rownames(combined[[]] %>% dplyr::filter(identity != 'tumor'))
tumor_vs_all_da_peaks = FindMarkers(
object = combined,
ident.1 = tumor_barcodes,
ident.2 = other_barcodes,
test.use = 'LR',
latent.vars = 'peak_region_fragments'
)
tumor_vs_all_da_peaks$query_region = rownames(tumor_vs_all_da_peaks)
tumor_vs_all_da_peaks %>% filter(query_region == 'chr6-29827265-29828849')
No result found
combined$identity2 = ifelse(combined$identity == 'tumor', ' tumor', 'non_tumor')
CoveragePlot(group.by = 'identity2', object = combined,
region = '###')
[redacted, project sensitive]
FindMarkers(features = 'chr6-29827265-29828849',
object = combined,
logfc.threshold = 0, ident.1 = 'tumor', min.pct = 0)
Since that seems to be the case, evaluate if pct.1 and pct.2 make sense
rowSums(combined['chr6-29827265-29828849', tumor_barcodes]) / length(tumor_barcodes)
rowSums(combined['chr6-29827265-29828849', other_barcodes]) / length(other_barcodes)
log2((rowSums(combined['chr6-29827265-29828849', tumor_barcodes]) / length(tumor_barcodes))/
(rowSums(combined['chr6-29827265-29828849', other_barcodes]) / length(other_barcodes)))
sessionInfo()
Can you show the full code you used to create the object?
## load in barcodes from cellranger that are cellular ones
barcodes <- read.table(glue(‘data/cell_ranger_qc/rcc_{sample_id}/barcodes.tsv’))
## this is the right file, this is the # estimated cells in cellrangers web summary output, use this to filter my fragments
cellular_fragments <- FilterCells(glue(‘data/fragments/rcc_{sample_id}/{sample_id}.fragments.tsv.gz’), barcodes$V1, outfile = glue(‘data/fragments/rcc_{sample_id}/{sample_id}.fragments.filtered.tsv.gz’))
cellular_fragments
peaks <- CallPeaks(outdir = ‘/home/jupyter/scATAC_analysis/edit/data/fragments/rcc_ccrcc_12_no855T3_aggr_may262022’, fragment.tempdir = ‘/home/jupyter/scATAC_analysis/edit/data/fragments/rcc_ccrcc_12_no855T3_aggr_may262022’,
object = ‘/home/jupyter/scATAC_analysis/edit/data/fragments/rcc_ccrcc_12_no855T3_aggr_may262022/ccrcc_12_no855T3_aggr_may262022.fragments.filtered.tsv.gz’,
macs2.path = ‘/home/jupyter/.local/bin/macs2’)
filtered_fragments <- CreateFragmentObject(glue(‘data/fragments/rcc_{sample_id}/{sample_id}.fragments.filtered.tsv.gz’))
counts <- FeatureMatrix(filtered_fragments, peaks)
## create the chromatin assay from the count matrix, include peaks detected in at least 10 cells, include cells with at least 200 peaks called. not sure if this is too lenient yet
chrom_assay <- CreateChromatinAssay(
counts = counts,
sep = c(“:”, “-”),
genome = ‘GRCh38’,
fragments = filtered_fragments,
min.cells = 10,
min.features = 200)
rcc_atac_obj <- CreateSeuratObject(
counts = chrom_assay,
assay = “peaks”,
)
rcc_atac_obj <- subset(
x = rcc_atac_obj,
subset = scDblFinder.class == ‘singlet’
)
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotations) <- ‘UCSC’
genome(annotations) <- ‘GRCh38’
Annotation(rcc_atac_obj) <- annotations
rcc_atac_obj <- FRiP(
object = rcc_atac_obj,
assay = ‘peaks’,
total.fragments = ‘fragments’)
rcc_atac_obj$blacklist_fraction <- FractionCountsInRegion(
object = rcc_atac_obj,
assay = ‘peaks’,
regions = blacklist_hg38)
rcc_atac_obj <- NucleosomeSignal(object = rcc_atac_obj)
rcc_atac_obj <- TSSEnrichment(object = rcc_atac_obj, fast = FALSE)
rcc_atac_obj$peak_region_fragments <- colSums(GetAssayData(rcc_atac_obj))
rcc_atac_obj <- subset(
x = rcc_atac_obj,
subset = peak_region_fragments < 20498.37 &
peak_region_fragments > 3000 &
FRiP > 0.04 &
blacklist_fraction < 0.01 &
nucleosome_signal < 2.2 &
TSS.enrichment > 1.02
)
rcc_atac_obj <- RunTFIDF(rcc_atac_obj)
rcc_atac_obj <- FindTopFeatures(rcc_atac_obj, min.cutoff = ‘q0’)
rcc_atac_obj <- RunSVD(rcc_atac_obj)
DefaultAssay(rcc_atac_obj) <- ‘peaks’
rcc_atac_obj <- RunUMAP(object = rcc_atac_obj, reduction = ‘lsi’, dims = 2:30)
rcc_atac_obj <- FindNeighbors(object = rcc_atac_obj, reduction = ‘lsi’, dims = 2:30)
rcc_atac_obj <- FindClusters(object = rcc_atac_obj, verbose = FALSE, algorithm = 3, resolution = 0.15)
harmony_onevar <- RunHarmony(
object = rcc_atac_obj,
group.by.vars = c(‘dataset’),
reduction = ‘lsi’,
assay.use = ‘peaks’,
project.dim = FALSE
)
# re-compute the UMAP using corrected LSI embeddings
harmony_onevar <- RunUMAP(harmony_onevar, dims = 2:30, reduction = ‘harmony’)
DefaultAssay(harmony_onevar) <- ‘peaks’
harmony_onevar <- FindNeighbors(object = harmony_onevar, reduction = ‘harmony’, dims = 2:30)
harmony_onevar <- FindClusters(object = harmony_onevar, verbose = FALSE, algorithm = 3, resolution = 0.1)
Idents(harmony_onevar) = 'seurat_clusters'
harmony_onevar <- RenameIdents(
object = harmony_onevar,
'0' = 'Putative tumor',
'1' = 'T cells',
'2' = 'Myeloid',
'3' = 'Normal endothelial',
'4' = 'Fibroblasts',
'5' = 'pDCs',
'6' = 'Putative tumor',
'7' = 'B cells',
'8' = 'Alveolar cells',
'9' = 'Putative tumor'
)
harmony_onevar$identity = Idents(harmony_onevar)
peaks <- CallPeaks(
object = harmony_onevar,
group.by = 'identity',
macs2.path = '/home/jupyter/.local/bin/macs2')
fragments = CreateFragmentObject(path = glue('data/fragments/rcc_{sample_id}/{sample_id}.fragments.filtered.tsv.gz'), cells = rownames(harmony_onevar[[]]))
Computing hash
counts <- FeatureMatrix(
fragments = fragments,
features = peaks,
cells = rownames(harmony_onevar[[]]))
assay <- CreateChromatinAssay(counts, fragments = fragments)
pre_int_clusterpeaks_obj <- CreateSeuratObject(assay, assay = 'peaks')
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- 'GRCh38'
Annotation(pre_int_clusterpeaks_obj) <- annotations
total_fragments <- CountFragments(glue("data/fragments/rcc_{sample_id}/{sample_id}.fragments.filtered.tsv.gz"))
row.names(total_fragments) <- total_fragments$CB
pre_int_clusterpeaks_obj$fragments <- total_fragments[colnames(pre_int_clusterpeaks_obj), "frequency_count"]
pre_int_clusterpeaks_obj <- FRiP(
object = pre_int_clusterpeaks_obj,
assay = 'peaks',
total.fragments = 'fragments')
pre_int_clusterpeaks_obj$blacklist_fraction <- FractionCountsInRegion(
object = pre_int_clusterpeaks_obj,
assay = 'peaks',
regions = blacklist_hg38)
pre_int_clusterpeaks_obj <- NucleosomeSignal(object = pre_int_clusterpeaks_obj)
# compute TSS enrichment score per cell
pre_int_clusterpeaks_obj <- TSSEnrichment(object = pre_int_clusterpeaks_obj, fast = FALSE)
pre_int_clusterpeaks_obj$peak_region_fragments <- colSums(GetAssayData(pre_int_clusterpeaks_obj))
pre_int_clusterpeaks_obj <- RunTFIDF(pre_int_clusterpeaks_obj)
pre_int_clusterpeaks_obj <- FindTopFeatures(pre_int_clusterpeaks_obj, min.cutoff = 'q0')
pre_int_clusterpeaks_obj <- RunSVD(pre_int_clusterpeaks_obj)
DefaultAssay(pre_int_clusterpeaks_obj) <- 'peaks'
pre_int_clusterpeaks_obj <- RunUMAP(object = pre_int_clusterpeaks_obj, reduction = 'lsi', dims = 2:30)
pre_int_clusterpeaks_obj <- FindNeighbors(object = pre_int_clusterpeaks_obj, reduction = 'lsi', dims = 2:30)
pre_int_clusterpeaks_obj <- FindClusters(object = pre_int_clusterpeaks_obj, verbose = FALSE, algorithm = 3, resolution = 0.15)
harmony_onevar <- RunHarmony(
object = pre_int_clusterpeaks_obj,
group.by.vars = c('dataset'),
reduction = 'lsi',
assay.use = 'peaks',
project.dim = FALSE
)
harmony_onevar = RunUMAP(harmony_onevar, dims = 2:30, reduction = 'harmony')
harmony_onevar = FindNeighbors(object = harmony_onevar, reduction = 'harmony', dims = 2:30)
harmony_onevar = FindClusters(object = harmony_onevar, verbose = FALSE, algorithm = 3, resolution = 0.1)
combined = subset(harmony_onevar, idents = c('0', '1', '2'))
combined <- RenameIdents(
object = combined,
'0' = 'tumor',
'1' = 't_cells',
'2' = 'myeloid',
)
combined$identity = Idents(combined)
The difference could be due to the psuedocount that's added when computing the log. This example gives the same fold change from FindMarkers and manually calculating:
library(Seurat)
library(Signac)
cells1 <- WhichCells(atac_small, expression = seurat_clusters == 1)
cells2 <- WhichCells(atac_small, expression = seurat_clusters == 0)
results1 <- FindMarkers(atac_small, ident.1 = 1, ident.2 = 0, group.by = 'seurat_clusters')
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# chr1-2471903-2481288 0.003694153 0.4647995 0.78 0.52 1
results1['chr1-2471903-2481288', ]
dat <- GetAssayData(atac_small, assay = 'peaks', slot = 'data')
log2(mean(dat['chr1-2471903-2481288', cells1]) + 1) - log2(mean(dat['chr1-2471903-2481288', cells2] + 1))
# 0.4647995
Your code does replicate the FindMarkers result. If the calculation is correct, perhaps the suggested logfc.threshold should be lowered for ATAC? I'm providing an example of a peak below that is mainly accessible in one group, yet the avg_log2FC is below the default cutoff of 0.25.
highlight.this = StringToGRanges('chr17-27635461-27635786')
CoveragePlot(extend.upstream = 3000, extend.downstream = 3000, object = combined, region = highlight.this, region.highlight = highlight.this,
peaks = FALSE)& theme(plot.margin = unit(c(0,3,0,0), "cm"),
text = element_text(size = 15),
legend.text = element_text(size = 15),
legend.title = element_text(size = 15),
legend.position = c(1.15,.8))
[redacted, project sensitive]
cells1 <- WhichCells(combined, expression = identity == 'myeloid')
cells2 <- WhichCells(combined, expression = identity != 'myeloid')
DefaultAssay(combined) = 'peaks'
results1 <- FindMarkers(combined, ident.1 = 'myeloid', features = 'chr17-27635461-27635786', min.pct = 0, logfc.threshold = 0)
results1['chr17-27635461-27635786', ]
dat <- GetAssayData(combined, assay = 'peaks', slot = 'data')
log2(mean(dat['chr17-27635461-27635786', cells1]) + 1) - log2(mean(dat['chr17-27635461-27635786', cells2] + 1))
I agree, the defaults are not very good here for ATAC-seq. We could either change the minimum log fold change cutoff, or make the default pseudocount used smaller, or maybe add the pseudocount at a different point (eg, (sum(x) + 1) / length(x)
rather than mean(x) + 1
).
For now I'd suggest just setting logfc.threshold = 0
for ATAC, I will explore the different options in more detail and try to work out which will be the best approach to change the defaults in a future release.
I looked into this a little more, and the use of mean(x) + 1 makes a very big difference. Taking (sum(x) + 1)/ length(x) is much more similar to the real fold change.
Example using the PBMC data from the Signac vignette:
library(Signac)
library(Seurat)
mf_sum <- function(x) {
return(log(x = (rowSums(x = x) + 1) / length(x = x), base = 2))
}
mf_orig <- function(x) {
return(log(x = (rowSums(x = x)) / length(x = x), base = 2))
}
obj <- readRDS("vignette_data/pbmc.rds")
res1 <- FoldChange(obj, ident.1 = 'CD14 Mono')
res2 <- FoldChange(obj, ident.1 = 'CD14 Mono', mean.fxn = mf_sum)
res3 <- FoldChange(obj, ident.1 = 'CD14 Mono', mean.fxn = mf_orig)
plot(res1$avg_log2FC, res3$avg_log2FC)
plot(res2$avg_log2FC, res3$avg_log2FC)
Pseudocount still needed to prevent Inf values:
sum(!is.finite(res3$avg_log2FC)) # 550
sum(!is.finite(res2$avg_log2FC)) # 0
Thanks for highlighting this issue, I'll update the fold change calculation on the develop branch
That's great to hear! I appreciate you looking into this.
Closing this now since it should be fixed on the develop branch
Hi @sabrinacamp2, someone pointed out that using sum(x) + 1 / length(x)
would create a fold change when the number of cells in different groups is different. I also tested simply making the pseudocount much smaller (1/10000), and that also solves the issue, so I'm updating Signac to use 1/10000 as the fold change pseudocount
1/10000 pseudocount plot comparison:
Hi, thanks for the update!
If the res1$avg_log2FC
the fold-change calculation in the above plot uses (sum(x) + 1)/ length(x)
and not the Signac v1.3.0 fold-change calculation represented as res1 here, that makes sense.
Thanks again for looking into this and implementing a change.
Hi @timoast how can I load the updated and fixed version with this changes?
Hello, I have updated both Seurat and Signac to the latest versions and yet see this result where the log2FC values don't match the coverage plot. Can you please clarify what the solution to circumvent this issue is?
Thank you, Iva
I have the same issue. I run the FindAllMarkers for the ATAC assay, slot = "data" with the default options. The avg_log2FC is 1.86 for AMC. In the fragment plot, we can barely see fragments for another group. However, they have the same peak height. I am appreciated of any advice. Thank you.
Hi,
I saw previous issues mentioning this, but also saw that the change to FoldChange() was implemented in v1.3.0. I am currently using Signac v1.4.0. I'm not sure if the change is working as intended, at least for my analysis.
I have a coverage plot for a particular peak region that looks like the following: [redacted, project sensitive]
However, this peak does not come up as differentially accessible using FindMarkers(), when Idents() is set to the tumor, non-tumor annotations because the logfc.threshold is 0.25. To investigate this and see if this region is somehow below 0.25, I did the following:
FindMarkers(features = 'chr6-29827265-29828849', object = combined, logfc.threshold = 0, ident.1 = 'tumor', min.pct = 0)
Does this log2FC make sense? Additionally, when I was trying to see how pct1 and pct2 are calculated I could not find the same results as FindMarkers did.
Lastly, I tried using the #163 log2.ratio calculation and got the following:
All the best, Sabrina