Include additional functions

robertkrautz commented 3 months ago

Add Hjolli's poolReplicates function, here in an edited, streamlined version:

poolReplicates <- function(object, replicates, inputAssay="counts") {

    ## dimensions
    assertthat::assert_that(
        base::ncol(object) == base::length(replicates),
        base::length(base::unique(replicates)) > 1
      )

    base::message("Calculating pooled counts...")
    a <- base::do.call(
        what = "cbind",
        args = base::lapply(
          X = base::unique(replicates),
          FUN = function(b){
            Matrix::rowSums(
                SummarizedExperiment::assay(object,inputAssay)[,replicates==b,drop=FALSE]
              )
            }
          )
        )
    a <- methods::as(a, "sparseMatrix")
    base::colnames(a) <- base::unique(replicates)

    base::message("Create a pasted design matrix...")
    c <- base::do.call(
      what = "rbind",
      args = base::lapply(
        X = base::unique(replicates),
        FUN = function(b){ 
          base::apply(
              X = SummarizedExperiment::colData(object)[replicates==b,],
              MARGIN = 2 ,
              FUN = base::paste ,
              collapse = ","
            )
          }
        )
      )

    rse <- SummarizedExperiment::SummarizedExperiment(
        assays = S4Vectors::SimpleList(counts = a),
        rowRanges = SummarizedExperiment::rowRanges(object),
        colData = base::as.data.frame(c)
      )
    rse <- CAGEfightR::calcTotalTags(rse)
    return(rse)
  }

robertkrautz commented 3 months ago

Seems to have been part of the subsample.R script in some versions of the extensions. Maybe that's a good place to add it as well?

anderssonrobin commented 3 months ago

poolReplicates now included in pool.R

robertkrautz commented 3 months ago

Add Hjolli's alternative subsampling function from 1.1.2.CTSSs.processing.Rmd:

subsampleTarget <- function(object, inputAssay = "counts", target) {

    assertthat::assert_that(
        methods::is(object, "SummarizedExperiment"),
        inputAssay %in% SummarizedExperiment::assayNames(object),
        base::is.numeric(target),
        target > 0
      )

    a <- SummarizedExperiment::assay(object,inputAssay)
    n <- base::ncol(a)
    nz <- base::lapply(
      X = 1:n,
      FUN = function(i){
          .GlobalEnv$nonzero(a[,i,drop=FALSE])[,1]
        }
      )
    s <- Matrix::colSums(a)
    d <- base::unlist(
      base::lapply(
        X = 1:n,
        FUN = function(i){
        if (s[i]<=target){
              a[nz[[i]],i]
            } else {
              stats::rbinom(base::length(nz[[i]]),a[nz[[i]],i], target/s[i])
            }
          }
        )
      )
    keep <- base::which(
        base::sapply(nz,length)>0
      )

    # Create a matrix to store the subsampled counts, ensuring all rows are retained
    new_matrix <- Matrix::sparseMatrix(
      i = base::unlist(nz),
      j = base::unlist(
        base::lapply(
          X = keep,
          FUN = function(i){
              base::rep(i, base::length(nz[[i]]))
            }
          )
        ),
      x = d, 
      dims = base::dim(a),  # Preserve the original dimensions
      dimnames = base::list(
          base::rownames(a),
          base::colnames(a)[keep]
        )
      )

    # Replace the original assay with the new matrix
    SummarizedExperiment::assay(object, inputAssay) <- new_matrix

    # Recalculate the total count of reads for each sample
    object <- CAGEfightR::calcTotalTags(object)

    return(object)
  }

robertkrautz commented 3 months ago

Add writeBw() function for storing CTSSs of individual replicates / samples from a RangedSummarizedExperiment (CAGEfightR object) as bigWig files:

writeBw <- function(object, dir, replicates = "all", inputAssay = "counts", splitByStrand = TRUE){

  if(replicates == "all"){
      replicates <- base::rownames(SummarizedExperiment::colData(object))
    }

  ## Check consistency of replicates
  assertthat::assert_that(
      base::length(SummarizedExperiment::rowRanges(object)) == base::nrow(SummarizedExperiment::assay(object,inputAssay)),
      base::all(replicates %in% base::rownames(SummarizedExperiment::colData(object))),
      inputAssay %in% base::names(SummarizedExperiment::assays(object))
    )
  base::stopifnot("Chosen directory doesn't exist!" = base::dir.exists(dir))

  gr <- GenomicRanges::GRanges(SummarizedExperiment::rowRanges(object))

  ## Parallel export across all chosen replicates
  foreach::foreach(i=1:base::length(replicates)) %dopar% {

    ## Print replicate when exiting
    base::on.exit(base::cat(base::paste0("Exit due to ",replicates[i],".\n")), add = TRUE)

    ## Convert individual replicate to GRanges object
    S4Vectors::mcols(gr)["score"] <- SummarizedExperiment::assay(object,inputAssay)[,replicates[i]]

    ## Export replicate's plus and minus strands separately
    if(splitByStrand){
      for (s in base::c("+","-")){
          grs <- gr[BiocGenerics::strand(gr) == s,]
          s <- base::ifelse(s == "+","plus","minus")

          fn <- base::paste0(dir,replicates[i],".",inputAssay,".",s,".bw")
          rtracklayer::export(
              object = grs,
              con = fn,
              format = "bigWig"
            )
          base::message(base::paste(fn,"done.",sep = " "))
        }
      } else {

        # Aggregate counts from both strands
        grred <- IRanges::reduce(gr, ignore.strand = TRUE, with.revmap = TRUE, min.gapwidth = 0)
        S4Vectors::mcols(grred) <- S4Vectors::aggregate(
            gr, S4Vectors::mcols(grred)$revmap, score = base::sum(score), drop = FALSE
          )

        # Keep only score metadata column
        grred@elementMetadata <- grred@elementMetadata[,"score",drop = FALSE]

        fn <- base::paste0(dir,replicates[i],".",inputAssay,".bw")
        rtracklayer::export(
              object = grred,
              con = fn,
              format = "bigWig"
            )
        base::message(base::paste(fn,"done.",sep = " "))
      }
    }
 }

robertkrautz commented 3 months ago

Maybe we can leave this issue track open as more functions and amendments are to come?

anderssonrobin commented 3 months ago

For subsampleTarget above, the difference to the current version is the addition of recalculating the total tags in the end. Correct?

object <- CAGEfightR::calcTotalTags(object)

This makes a lot of sense to do and should likley be added to subsampleProportion as well.

robertkrautz commented 3 months ago

Correct. The only other change is the more explicit setting of the dimensions in the matrix:

dims = base::dim(a), # Preserve the original dimensions

All credits to @HjolliEin!

robertkrautz commented 3 months ago

Add helper functions for genomic track plotting together with writeBw() from above:

(1.) readRange()

readRange <- function(files, range){
    ## Prepare range for rtracklayer
    ran_sel <- rtracklayer::BigWigSelection(
        ranges = range
      )
    ## Reading in scores in range from files serially
    raw <- purrr::map(
      .x = files,
      .f = function(f){
          s <- base::basename(f) %>%
            stringr::str_replace(.,"^(.*?).bw$","\\1")
          return(
            base::suppressWarnings(
              rtracklayer::import(
                  con = f,
                  as = "GRanges",
                  selection = ran_sel
                ) %>%
              tibble::as_tibble() %>%
              dplyr::mutate(
                sample = s,
                strand = stringr::str_replace(
                    s, "^(.*)\\.(.*?)$", "\\2"
                  ),
                score = dplyr::case_when(
                    strand == "minus" ~ (-1)*score,
                    TRUE ~ score
                  )
                )
              )
            )
          }
        ) %>%
      dplyr::bind_rows()

    ## Fill missing scores across samples with zeros
    out <- .GlobalEnv$fillGaps(raw)

    return(out)
  }

(2.) fillGaps() is supposed to be an internal function not to be exported with the package

fillGaps <- function(data){
    columns <- data %>% 
      dplyr::distinct(sample) %>% 
      dplyr::pull(sample)

    df_filled <- data %>% 
      tidyr::pivot_wider(
          names_from = "sample",
          values_from = "score"
        ) %>%
      dplyr::mutate(
        dplyr::across(
          .cols = columns,
          .fns = function(x){
              tidyr::replace_na(x, 0)
            }
          )
        ) %>%
      tidyr::pivot_longer(
          cols = tidyselect::all_of(columns),
          names_to = "sample",
          values_to = "score"
        ) %>%
      dplyr::mutate(
        sample = stringr::str_replace(
            sample, ".minus|.plus", ""
          )
        )
    return(df_filled)
  }

(3.) plotTracks()

plotTracks <- function(data, range){
    start_pos <- BiocGenerics::start(range)
    end_pos <- BiocGenerics::end(range)

    plot <- ggplot2::ggplot(
        data = data,
        mapping = aes(
          x = start,
          y = score,
          colour = strand,
          linetype = strand
        )
      ) +
    ggplot2::geom_bar(
        stat = "identity",
        size = 0.5
      ) +
    ggplot2::scale_x_continuous(
        limits = base::c(start_pos,end_pos),  
        position = "top"
      ) +
    ggplot2::scale_color_manual(
        limits = base::c("minus","plus"),
        values = base::c("#FF6400","#632770")
      ) +
    ggplot2::scale_linetype_manual(
        limits = base::c("minus","plus"),
        breaks = base::c("minus","plus"),
        values = base::c("solid","solid")
      ) +
    ggplot2::facet_wrap(
        facets = . ~ sample,
        ncol = 1,
        strip.position = "right"
      ) +
    ggplot2::theme_bw() +
    ggplot2::theme(
        legend.position = "none",
        axis.title.x = element_blank(),
        panel.spacing = grid::unit(0.05,"lines"),
        panel.grid.minor = element_blank(),
        plot.margin = grid::unit(base::c(0,0,0,0),"cm")
      )
    return(plot)
  }

robertkrautz commented 3 months ago

Additional function for cumulative.R. We might want to think whether to include the code to generate ucumfrac and locifrac, if we don't include it the acquired data in our own figures and analysis, thus not documenting it sufficiently for others to understand the use. On the other hand, it's great to have these analytical steps in there (also for our own purpose of using PRIME' in the future).

cumulativeFractions <- function(loci, ctss, max_dist = 1000, inputAssay = "counts") {

    g <- GenomicRanges::GRanges(
        seqnames = GenomeInfoDb::seqnames(loci),
        ranges = loci$thick,
        strand = "*"
      )
    t <- methods::as(SummarizedExperiment::rowRanges(ctss), "GRanges")

    base::message("finding nearest loci...")
    g_nearest <- IRanges::nearest(t, g)

    base::message("calculating distances...")
    d <- base::rep(Inf, base::length(t))
    non.na <- base::which(!base::is.na(g_nearest))
    d[non.na] <- IRanges::distance(t[non.na], g[g_nearest[non.na]])

    base::message("calculating totalTags and unique Tags...")
    ctss <- base::suppressWarnings(
      CAGEfightR::calcTotalTags(
          object = ctss,
          inputAssay = "counts"
        )
      )
    SummarizedExperiment::colData(ctss)$uniqTags <- Matrix::colSums(
        SummarizedExperiment::assay(ctss,inputAssay="counts") > 0
      )

    base::message("calculating cumulative fractions...")
    focus <- base::which(d <= max_dist)
    d <- d[focus]
    ctss <- ctss[focus]

    o <- base::order(d)
    d <- d[o]

    unique_loc_frac <- base::sapply(
        X = base::colnames(ctss),
        FUN = function(n) {
          base::cat(".")
          a <- base::as.numeric(
              SummarizedExperiment::assay(ctss, inputAssay)[o, n]
            )
          a[a>0] <- 1
          nz <- base::which(a > 0)
          da <- d[nz]
          a_sub <- a[nz]
          return(
            base::sapply(
              X = base::seq(0, max_dist, by = 1),
              FUN = function(i){base::sum(a_sub[da == i])}
            )
          )
        }
      )
    base::cat("\n")

    fracs <- base::sapply(
      X = base::colnames(ctss),
      FUN = function(n) {
        base::cat(".")
        a <- base::as.numeric(
            SummarizedExperiment::assay(ctss, inputAssay)[o, n]
          )
        nz <- base::which(a > 0)
        da <- d[nz]
        a_sub <- a[nz]
        return(
          base::sapply(
              X = base::seq(0, max_dist, by = 1),
              FUN = function(i){base::sum(a_sub[da == i])}
            )
          )
        }
      )
    base::cat("\n")

    t <- methods::as(SummarizedExperiment::rowRanges(ctss), "GRanges")

    loc_fracs <- base::sapply(
        X = base::colnames(ctss),
        FUN = function(n) {
          base::cat(".")
          a <- base::as.numeric(
              SummarizedExperiment::assay(ctss, inputAssay)[o, n]
            )
        nz <- t[base::which(a > 0)]
        dist <- IRanges::distanceToNearest(g,nz)
        dist <- base::as.data.frame(dist)[,3]
        return(
            base::sapply(
              X = base::seq(0, max_dist, by = 1),
              FUN = function(i){base::sum(dist==i)}
            )
          )
        }
      )

    unique_loc_frac <- base::sapply(
        X = base::colnames(unique_loc_frac),
        FUN = function(n){
          base::cumsum(unique_loc_frac[, n] / ctss$uniqTags[n])
        }
      )
    fracs <- base::sapply(
      X = base::colnames(fracs),
      FUN = function(n){
          base::cumsum(fracs[, n] / ctss$totalTags[n])
        }
      )
    loc_fracs <- base::sapply(
      X = base::colnames(loc_fracs),
      FUN = function(n){
          base::cumsum(loc_fracs[, n] / base::length(g))
        }
      )

    ucumfrac <- base::cbind(
      base::data.frame(
          distance = base::seq(0, max_dist, by = 1),
          unique_loc_frac
        )
      )
    cumfrac <- base::cbind(
      base::data.frame(
          distance = base::seq(0, max_dist, by = 1),
          fracs
        )
      )
    locifrac <- base::cbind(
      base::data.frame(
          distance = base::seq(0, max_dist, by = 1),
          loc_fracs
        )
      )

    fraclist <- base::list(cumfrac,ucumfrac,locifrac)
    return(fraclist)
  }

robertkrautz commented 3 months ago

Additional function to annotate CTSSs as used in 1.4.2.CTSSs.annFracs.Rmd:

calcAnnoCTSS <- function(data, txModels, uniqueCTSS = FALSE) {

    annos <- CAGEfightR::assignTxType(
        object = data,
        txModels = txModels
      )

    #Identify all txTypes
    types <- base::unique(SummarizedExperiment::rowData(annos)$txType)

    # Initialize list to store total tags for each txType
    totalTags_list <- base::vector("list", length = base::length(types))

    # Loop through each txType
    for (i in base::seq_along(types)) {

        # Subset data based on current txType
        current_subset <- S4Vectors::subset(annos, txType %in% types[i])
        if(uniqeCTSS){

            # Calculate totalTags for current subset
            current_subset <- Matrix::colSums(
                SummarizedExperiment::assay(current_subset) > 0
              ) 
            totalTags_list[[i]] <- current_subset

          } else {

            # Calculate totalTags for current subset
            current_subset <- SummarizedExperiment::colData(
                CAGEfightR::calcTotalTags(current_subset)
              )
            totalTags_list[[i]] <- current_subset$totalTags

          }
      }

    # Combine totalTags into a data frame
    ann_counts <- base::as.data.frame(
        base::do.call(base::cbind, totalTags_list)
      )
    base::colnames(ann_counts) <- types
    return(ann_counts)
  }

robertkrautz commented 3 months ago

Create fingerPrint plots for CTSSs rather than genome-wide:

fingerPrint <- function(object,replicates,inputAssay = "counts",n = 100000){
      all <- base::data.frame()
      for(i in 1:base::length(replicates)){
          sam <- replicates[i]
          col <- base::sort(
              SummarizedExperiment::assay(object,)[,sam]
            )

          ## Calculate cumSum as a function of CTSSs rank
          csum <- base::cumsum(col)
          totalLength <- base::length(csum)
          totalCTSSs <- csum[totalLength]
          csumRel <- csum / totalCTSSs
          rank <- base::seq(1L,totalLength,1L) / totalLength

          ## Subset to memory-efficient 100k values
          subset <- base::c(
              base::seq(1,totalLength,base::floor(totalLength/n)),
              totalLength
            )

          df <- tibble::tibble(
              sample = sam,
              raw = col[subset],
              csum = csum[subset],
              csumRel = csumRel[subset],
              rank = rank[subset]
            )
          ## Combine
          all <- dplyr::bind_rows(all,df)
        }
    return(all)
  }

anderssonlab / PRIME