shortRNAhub / shortRNA

short RNA-seq analysis package
GNU General Public License v3.0
1 stars 2 forks source link

shortRNA project

This is a project for the development of short RNA analysis R package. It was initiated by Pierre-Luc in 2017.

shortRNA analysis steps in R (a short vignette)

Package installation

install.packages("BiocManager")
BiocManager::install("remotes")
remotes::install_github("shortRNAhub/shortRNA")
library(shortRNA)

Quality check and trimming

qc_SE(file = fq_files, outdir = "output/", ad1 = adapter_sequence)

FastQ files to sequence by counts matrix

m <- fastq2SeqCountMatrix(files = trimmed_fastq_files)

Unique sequences for alignment

fa <- DNAStringSet(row.names(m))
names(fa) <- paste0("S", 1:length(fa))
writeXStringSet(fa, fasta_file)

Obtaining databases for analysis

db <- getDB()

Annotation preparation and genome index generation for alignment

a <- prepareAnnotation(
  ensdb = db$ensdb,
  output_dir = genomeDir,
  extra.gr = list(piRNA = db$piRNA_GR, miRNA = db$miRNA_GR),
  extra.seqs = list(rRNA = db$rRNA_fa, tRNA = db$tRNA_fa),
  resolveSplicing = NULL,
  rules = defaultAssignRules(),
  tRNAEnsembleRemove = FALSE,
  clusterMiRNA = TRUE
)

Alignment

alignShortRNA(
  fastq = "unique.fasta",
  index = "genomeDir/customGenome",
  outDir = "align", GTF = exonsBy(db$ensdb),
  GTF.featureType = "exon", GTF.attrType = "gene_id"
)

Overlapping aligned data with annotations

o <- overlapWithTx2(
  bamFile = align_file, annotation = a, uniqueFasta = "unique.fasta", 
  ignoreStrand = TRUE, nbthreads = 16
)

Assignment of overlapping reads to features with assignment rules

ar <- assignReads(sources = o, rules = defaultAssignRules())

Converting features to FactorList

fl <- featuresAnnoToFL(a)

Assignment of reads to the features tree

ar_tree <- addReadsToTree(
  fL = fl,
  mappedFeaturesDF = ar,
  unassigned = FALSE,
  extraTreeBranch = NULL
)

Creation of TreeSummarizedExperiment object

library(TreeSummarizedExperiment)

rt <- ar_tree
as <- list(counts = m)
cd <- DataFrame(samples = c(s1, s2), group - c("group1", "group2"))

tse <- TreeSummarizedExperiment(
  assays = as,
  rowTree = rt,
  colData = cd,
  rowData = ar[row.names(m), ],
  metadata = list(
    assignedReads = ar,
    counts = m,
    notAligned = getReadsFromBam(
      bam = align_file
    )
  )
)

Differential analysis

library(treeClimbR)
dea <- runDA(TSE = tse, filter_min_count = 20)
out <- nodeResult(object = dea, n = Inf)

cand <- getCand(
  tree = dea$tree,
  score_data = out, node_column = "node",
  p_column = "PValue", sign_column = "logFC",
  message = FALSE, threshold = 0.05
)

candB <- evalCand(
  tree = dea$tree,
  levels = cand$candidate_list,
  score_data = out, node_column = "node",
  p_column = "PValue", sign_column = "logFC",
  method = "BH", limit_rej = 0.05,
  use_pseudo_leaf = FALSE,
  message = FALSE
)

result <- topNodes(object = candB, n = Inf, p_value = 0.05)