Open spribitzer opened 2 years ago
additionally we should add the capability to generate fragmentsize distribution plots, this could either be an R script or with deeptools https://deeptools.readthedocs.io/en/develop/content/tools/bamPEFragmentSize.html
library(tidyr)
library(stringr)
library(tidyverse)
library(apird)
setwd("/Volumes/Bioinformatics/pipeline/Illumina/220711_M06794_0010_ADH8FDXX/Project_P425-3Processed_globus_220712")
theme_set(
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour="black", fill=NA, size=1),
axis.text = element_text(colour="black"),
axis.ticks = element_line(colour="black"),
legend.key = element_blank(),
text = element_text(size=20),
strip.text.x = element_text(size = 10,margin = margin( b = 2, t = 2) ),
strip.background = element_rect(fill="white", colour="black"),
panel.background = element_rect(fill="white"),
)
)
libs = getProjectLibs("P425-3")
anno = getAnno(libs)
metrics = getMetrics(libs)
design <- full_join(anno, metrics, by = c("libid"))
fragmentDir <- file.path(getwd(),"insertSizes")
fragmentFiles <- list.files(fragmentDir, full.names = T)
unpackFragData <- function(fragmentFile, annotation, includeShortFrags = F){
libid <- str_extract(fragmentFile, "lib[0-9]+")
cat(sprintf("Counting fragments of %s... \n", libid))
fragmentDistribution <- read.table(fragmentFile)
colnames(fragmentDistribution) <- c("nReads", "fragLen")
fragmentDistribution$shortReads = fragmentDistribution$nReads[fragmentDistribution$fragLen == 0]
if (!includeShortFrags) {
fragmentDistribution = dplyr::filter(fragmentDistribution, fragLen > 15)
}
fragmentDistribution$fragmentLengths <- abs(fragmentDistribution$fragLen)
fragmentDistribution$normReads <- fragmentDistribution$nReads/sum(fragmentDistribution$nReads) *10^3
fragmentDistribution$nFragments = sum(fragmentDistribution$nReads)
fragmentDistribution$libid <- libid
return(fragmentDistribution)
}
fragmentList <- lapply(fragmentFiles, function(x) unpackFragData(x, annotation))
fragmentDistributions <- bind_rows(fragmentList)
fragmentDistributions <- merge(fragmentDistributions, design, by="libid")
xrange = c(0,750)
fragmentDistributionPlot <- fragmentDistributions %>%
dplyr::filter(fragmentLengths > xrange[1] & fragmentLengths < xrange[2]) %>%
ggplot(aes(x = fragmentLengths, y = normReads, color = libid)) +
# scale_color_manual(values = colors.sampleName) +
geom_line() +
xlim(c(xrange[1],xrange[2])) +
labs(x = "Fragment length, bp", y = expression(paste("Normalized read density (a.u.)")), color = "") +
theme(text = element_text(size = 14)) +
theme(legend.position="right") +
theme(legend.direction='vertical')
# facet_wrap(. ~ libid, scales="free")
plot(fragmentDistributionPlot)
ggsave("fragmentSizeDistributions.pdf", fragmentDistributionPlot, width = 7, height = 4)
Currently there are a few steps in the pipeline for ATAC (and possibly CnR) that are no longer required: for example splitting bedgraphs in ATAC (instead we are generating bigwigs now). These scripts could be removed. A second issue with the epigenetic tools is that they are all script based and need to be run within the globus processed folder. This means that if there are multiple projects on a flowcell, it is necessary to run the commands/script in each directory individually.