BenaroyaResearch / bripipetools

tools for managing genomics pipelines at BRI
MIT License
1 stars 0 forks source link

Clean up epigenetic tools #123

Open spribitzer opened 2 years ago

spribitzer commented 2 years ago

Currently there are a few steps in the pipeline for ATAC (and possibly CnR) that are no longer required: for example splitting bedgraphs in ATAC (instead we are generating bigwigs now). These scripts could be removed. A second issue with the epigenetic tools is that they are all script based and need to be run within the globus processed folder. This means that if there are multiple projects on a flowcell, it is necessary to run the commands/script in each directory individually.

spribitzer commented 2 years ago

additionally we should add the capability to generate fragmentsize distribution plots, this could either be an R script or with deeptools https://deeptools.readthedocs.io/en/develop/content/tools/bamPEFragmentSize.html

library(tidyr)
library(stringr)
library(tidyverse)
library(apird)

setwd("/Volumes/Bioinformatics/pipeline/Illumina/220711_M06794_0010_ADH8FDXX/Project_P425-3Processed_globus_220712")

theme_set(
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_rect(colour="black", fill=NA, size=1),
        axis.text = element_text(colour="black"),
        axis.ticks = element_line(colour="black"),
        legend.key = element_blank(),
        text = element_text(size=20),
        strip.text.x = element_text(size = 10,margin = margin( b = 2, t = 2) ),
        strip.background = element_rect(fill="white", colour="black"),
        panel.background = element_rect(fill="white"),
        )
  )

libs = getProjectLibs("P425-3")

anno = getAnno(libs)
metrics = getMetrics(libs)

design <- full_join(anno, metrics, by = c("libid"))

fragmentDir <-  file.path(getwd(),"insertSizes")
fragmentFiles <- list.files(fragmentDir, full.names = T)

unpackFragData <- function(fragmentFile, annotation, includeShortFrags = F){
  libid <- str_extract(fragmentFile, "lib[0-9]+")
  cat(sprintf("Counting fragments of %s... \n", libid))

  fragmentDistribution <- read.table(fragmentFile)
  colnames(fragmentDistribution) <- c("nReads", "fragLen")

  fragmentDistribution$shortReads = fragmentDistribution$nReads[fragmentDistribution$fragLen == 0]

  if (!includeShortFrags) {
    fragmentDistribution = dplyr::filter(fragmentDistribution, fragLen > 15)
  }

  fragmentDistribution$fragmentLengths <- abs(fragmentDistribution$fragLen)
  fragmentDistribution$normReads <- fragmentDistribution$nReads/sum(fragmentDistribution$nReads) *10^3

  fragmentDistribution$nFragments = sum(fragmentDistribution$nReads)

  fragmentDistribution$libid <- libid
  return(fragmentDistribution)

}

fragmentList <- lapply(fragmentFiles, function(x) unpackFragData(x, annotation))
fragmentDistributions <- bind_rows(fragmentList)

fragmentDistributions <- merge(fragmentDistributions, design, by="libid")

xrange = c(0,750)

fragmentDistributionPlot  <- fragmentDistributions %>%
  dplyr::filter(fragmentLengths > xrange[1] & fragmentLengths < xrange[2]) %>%
  ggplot(aes(x = fragmentLengths, y = normReads, color = libid)) +
#   scale_color_manual(values = colors.sampleName) + 
  geom_line() +
  xlim(c(xrange[1],xrange[2])) +
  labs(x = "Fragment length, bp", y = expression(paste("Normalized read density (a.u.)")), color = "") + 
  theme(text = element_text(size = 14)) +
  theme(legend.position="right") + 
  theme(legend.direction='vertical')
  #   facet_wrap(. ~ libid, scales="free")

plot(fragmentDistributionPlot)

ggsave("fragmentSizeDistributions.pdf", fragmentDistributionPlot, width = 7, height = 4)