Open robertkrautz opened 3 months ago
Seems to have been part of the subsample.R
script in some versions of the extensions. Maybe that's a good place to add it as well?
poolReplicates now included in pool.R
Add Hjolli's alternative subsampling function from 1.1.2.CTSSs.processing.Rmd:
subsampleTarget <- function(object, inputAssay = "counts", target) {
assertthat::assert_that(
methods::is(object, "SummarizedExperiment"),
inputAssay %in% SummarizedExperiment::assayNames(object),
base::is.numeric(target),
target > 0
)
a <- SummarizedExperiment::assay(object,inputAssay)
n <- base::ncol(a)
nz <- base::lapply(
X = 1:n,
FUN = function(i){
.GlobalEnv$nonzero(a[,i,drop=FALSE])[,1]
}
)
s <- Matrix::colSums(a)
d <- base::unlist(
base::lapply(
X = 1:n,
FUN = function(i){
if (s[i]<=target){
a[nz[[i]],i]
} else {
stats::rbinom(base::length(nz[[i]]),a[nz[[i]],i], target/s[i])
}
}
)
)
keep <- base::which(
base::sapply(nz,length)>0
)
# Create a matrix to store the subsampled counts, ensuring all rows are retained
new_matrix <- Matrix::sparseMatrix(
i = base::unlist(nz),
j = base::unlist(
base::lapply(
X = keep,
FUN = function(i){
base::rep(i, base::length(nz[[i]]))
}
)
),
x = d,
dims = base::dim(a), # Preserve the original dimensions
dimnames = base::list(
base::rownames(a),
base::colnames(a)[keep]
)
)
# Replace the original assay with the new matrix
SummarizedExperiment::assay(object, inputAssay) <- new_matrix
# Recalculate the total count of reads for each sample
object <- CAGEfightR::calcTotalTags(object)
return(object)
}
Add writeBw() function for storing CTSSs of individual replicates / samples from a RangedSummarizedExperiment (CAGEfightR object) as bigWig files:
writeBw <- function(object, dir, replicates = "all", inputAssay = "counts", splitByStrand = TRUE){
if(replicates == "all"){
replicates <- base::rownames(SummarizedExperiment::colData(object))
}
## Check consistency of replicates
assertthat::assert_that(
base::length(SummarizedExperiment::rowRanges(object)) == base::nrow(SummarizedExperiment::assay(object,inputAssay)),
base::all(replicates %in% base::rownames(SummarizedExperiment::colData(object))),
inputAssay %in% base::names(SummarizedExperiment::assays(object))
)
base::stopifnot("Chosen directory doesn't exist!" = base::dir.exists(dir))
gr <- GenomicRanges::GRanges(SummarizedExperiment::rowRanges(object))
## Parallel export across all chosen replicates
foreach::foreach(i=1:base::length(replicates)) %dopar% {
## Print replicate when exiting
base::on.exit(base::cat(base::paste0("Exit due to ",replicates[i],".\n")), add = TRUE)
## Convert individual replicate to GRanges object
S4Vectors::mcols(gr)["score"] <- SummarizedExperiment::assay(object,inputAssay)[,replicates[i]]
## Export replicate's plus and minus strands separately
if(splitByStrand){
for (s in base::c("+","-")){
grs <- gr[BiocGenerics::strand(gr) == s,]
s <- base::ifelse(s == "+","plus","minus")
fn <- base::paste0(dir,replicates[i],".",inputAssay,".",s,".bw")
rtracklayer::export(
object = grs,
con = fn,
format = "bigWig"
)
base::message(base::paste(fn,"done.",sep = " "))
}
} else {
# Aggregate counts from both strands
grred <- IRanges::reduce(gr, ignore.strand = TRUE, with.revmap = TRUE, min.gapwidth = 0)
S4Vectors::mcols(grred) <- S4Vectors::aggregate(
gr, S4Vectors::mcols(grred)$revmap, score = base::sum(score), drop = FALSE
)
# Keep only score metadata column
grred@elementMetadata <- grred@elementMetadata[,"score",drop = FALSE]
fn <- base::paste0(dir,replicates[i],".",inputAssay,".bw")
rtracklayer::export(
object = grred,
con = fn,
format = "bigWig"
)
base::message(base::paste(fn,"done.",sep = " "))
}
}
}
Maybe we can leave this issue track open as more functions and amendments are to come?
For subsampleTarget above, the difference to the current version is the addition of recalculating the total tags in the end. Correct?
object <- CAGEfightR::calcTotalTags(object)
This makes a lot of sense to do and should likley be added to subsampleProportion
as well.
Correct. The only other change is the more explicit setting of the dimensions in the
dims = base::dim(a), # Preserve the original dimensions
All credits to @HjolliEin!
Add helper functions for genomic track plotting together with writeBw() from above:
readRange <- function(files, range){
## Prepare range for rtracklayer
ran_sel <- rtracklayer::BigWigSelection(
ranges = range
)
## Reading in scores in range from files serially
raw <- purrr::map(
.x = files,
.f = function(f){
s <- base::basename(f) %>%
stringr::str_replace(.,"^(.*?).bw$","\\1")
return(
base::suppressWarnings(
rtracklayer::import(
con = f,
as = "GRanges",
selection = ran_sel
) %>%
tibble::as_tibble() %>%
dplyr::mutate(
sample = s,
strand = stringr::str_replace(
s, "^(.*)\\.(.*?)$", "\\2"
),
score = dplyr::case_when(
strand == "minus" ~ (-1)*score,
TRUE ~ score
)
)
)
)
}
) %>%
dplyr::bind_rows()
## Fill missing scores across samples with zeros
out <- .GlobalEnv$fillGaps(raw)
return(out)
}
fillGaps <- function(data){
columns <- data %>%
dplyr::distinct(sample) %>%
dplyr::pull(sample)
df_filled <- data %>%
tidyr::pivot_wider(
names_from = "sample",
values_from = "score"
) %>%
dplyr::mutate(
dplyr::across(
.cols = columns,
.fns = function(x){
tidyr::replace_na(x, 0)
}
)
) %>%
tidyr::pivot_longer(
cols = tidyselect::all_of(columns),
names_to = "sample",
values_to = "score"
) %>%
dplyr::mutate(
sample = stringr::str_replace(
sample, ".minus|.plus", ""
)
)
return(df_filled)
}
plotTracks <- function(data, range){
start_pos <- BiocGenerics::start(range)
end_pos <- BiocGenerics::end(range)
plot <- ggplot2::ggplot(
data = data,
mapping = aes(
x = start,
y = score,
colour = strand,
linetype = strand
)
) +
ggplot2::geom_bar(
stat = "identity",
size = 0.5
) +
ggplot2::scale_x_continuous(
limits = base::c(start_pos,end_pos),
position = "top"
) +
ggplot2::scale_color_manual(
limits = base::c("minus","plus"),
values = base::c("#FF6400","#632770")
) +
ggplot2::scale_linetype_manual(
limits = base::c("minus","plus"),
breaks = base::c("minus","plus"),
values = base::c("solid","solid")
) +
ggplot2::facet_wrap(
facets = . ~ sample,
ncol = 1,
strip.position = "right"
) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
axis.title.x = element_blank(),
panel.spacing = grid::unit(0.05,"lines"),
panel.grid.minor = element_blank(),
plot.margin = grid::unit(base::c(0,0,0,0),"cm")
)
return(plot)
}
Additional function for cumulative.R. We might want to think whether to include the code to generate ucumfrac and locifrac, if we don't include it the acquired data in our own figures and analysis, thus not documenting it sufficiently for others to understand the use. On the other hand, it's great to have these analytical steps in there (also for our own purpose of using PRIME' in the future).
cumulativeFractions <- function(loci, ctss, max_dist = 1000, inputAssay = "counts") {
g <- GenomicRanges::GRanges(
seqnames = GenomeInfoDb::seqnames(loci),
ranges = loci$thick,
strand = "*"
)
t <- methods::as(SummarizedExperiment::rowRanges(ctss), "GRanges")
base::message("finding nearest loci...")
g_nearest <- IRanges::nearest(t, g)
base::message("calculating distances...")
d <- base::rep(Inf, base::length(t))
non.na <- base::which(!base::is.na(g_nearest))
d[non.na] <- IRanges::distance(t[non.na], g[g_nearest[non.na]])
base::message("calculating totalTags and unique Tags...")
ctss <- base::suppressWarnings(
CAGEfightR::calcTotalTags(
object = ctss,
inputAssay = "counts"
)
)
SummarizedExperiment::colData(ctss)$uniqTags <- Matrix::colSums(
SummarizedExperiment::assay(ctss,inputAssay="counts") > 0
)
base::message("calculating cumulative fractions...")
focus <- base::which(d <= max_dist)
d <- d[focus]
ctss <- ctss[focus]
o <- base::order(d)
d <- d[o]
unique_loc_frac <- base::sapply(
X = base::colnames(ctss),
FUN = function(n) {
base::cat(".")
a <- base::as.numeric(
SummarizedExperiment::assay(ctss, inputAssay)[o, n]
)
a[a>0] <- 1
nz <- base::which(a > 0)
da <- d[nz]
a_sub <- a[nz]
return(
base::sapply(
X = base::seq(0, max_dist, by = 1),
FUN = function(i){base::sum(a_sub[da == i])}
)
)
}
)
base::cat("\n")
fracs <- base::sapply(
X = base::colnames(ctss),
FUN = function(n) {
base::cat(".")
a <- base::as.numeric(
SummarizedExperiment::assay(ctss, inputAssay)[o, n]
)
nz <- base::which(a > 0)
da <- d[nz]
a_sub <- a[nz]
return(
base::sapply(
X = base::seq(0, max_dist, by = 1),
FUN = function(i){base::sum(a_sub[da == i])}
)
)
}
)
base::cat("\n")
t <- methods::as(SummarizedExperiment::rowRanges(ctss), "GRanges")
loc_fracs <- base::sapply(
X = base::colnames(ctss),
FUN = function(n) {
base::cat(".")
a <- base::as.numeric(
SummarizedExperiment::assay(ctss, inputAssay)[o, n]
)
nz <- t[base::which(a > 0)]
dist <- IRanges::distanceToNearest(g,nz)
dist <- base::as.data.frame(dist)[,3]
return(
base::sapply(
X = base::seq(0, max_dist, by = 1),
FUN = function(i){base::sum(dist==i)}
)
)
}
)
unique_loc_frac <- base::sapply(
X = base::colnames(unique_loc_frac),
FUN = function(n){
base::cumsum(unique_loc_frac[, n] / ctss$uniqTags[n])
}
)
fracs <- base::sapply(
X = base::colnames(fracs),
FUN = function(n){
base::cumsum(fracs[, n] / ctss$totalTags[n])
}
)
loc_fracs <- base::sapply(
X = base::colnames(loc_fracs),
FUN = function(n){
base::cumsum(loc_fracs[, n] / base::length(g))
}
)
ucumfrac <- base::cbind(
base::data.frame(
distance = base::seq(0, max_dist, by = 1),
unique_loc_frac
)
)
cumfrac <- base::cbind(
base::data.frame(
distance = base::seq(0, max_dist, by = 1),
fracs
)
)
locifrac <- base::cbind(
base::data.frame(
distance = base::seq(0, max_dist, by = 1),
loc_fracs
)
)
fraclist <- base::list(cumfrac,ucumfrac,locifrac)
return(fraclist)
}
Additional function to annotate CTSSs as used in 1.4.2.CTSSs.annFracs.Rmd:
calcAnnoCTSS <- function(data, txModels, uniqueCTSS = FALSE) {
annos <- CAGEfightR::assignTxType(
object = data,
txModels = txModels
)
#Identify all txTypes
types <- base::unique(SummarizedExperiment::rowData(annos)$txType)
# Initialize list to store total tags for each txType
totalTags_list <- base::vector("list", length = base::length(types))
# Loop through each txType
for (i in base::seq_along(types)) {
# Subset data based on current txType
current_subset <- S4Vectors::subset(annos, txType %in% types[i])
if(uniqeCTSS){
# Calculate totalTags for current subset
current_subset <- Matrix::colSums(
SummarizedExperiment::assay(current_subset) > 0
)
totalTags_list[[i]] <- current_subset
} else {
# Calculate totalTags for current subset
current_subset <- SummarizedExperiment::colData(
CAGEfightR::calcTotalTags(current_subset)
)
totalTags_list[[i]] <- current_subset$totalTags
}
}
# Combine totalTags into a data frame
ann_counts <- base::as.data.frame(
base::do.call(base::cbind, totalTags_list)
)
base::colnames(ann_counts) <- types
return(ann_counts)
}
Create fingerPrint plots for CTSSs rather than genome-wide:
fingerPrint <- function(object,replicates,inputAssay = "counts",n = 100000){
all <- base::data.frame()
for(i in 1:base::length(replicates)){
sam <- replicates[i]
col <- base::sort(
SummarizedExperiment::assay(object,)[,sam]
)
## Calculate cumSum as a function of CTSSs rank
csum <- base::cumsum(col)
totalLength <- base::length(csum)
totalCTSSs <- csum[totalLength]
csumRel <- csum / totalCTSSs
rank <- base::seq(1L,totalLength,1L) / totalLength
## Subset to memory-efficient 100k values
subset <- base::c(
base::seq(1,totalLength,base::floor(totalLength/n)),
totalLength
)
df <- tibble::tibble(
sample = sam,
raw = col[subset],
csum = csum[subset],
csumRel = csumRel[subset],
rank = rank[subset]
)
## Combine
all <- dplyr::bind_rows(all,df)
}
return(all)
}
Add Hjolli's poolReplicates function, here in an edited, streamlined version: