greenelab / 2022-microberna

A pipeline to generate a compendium of bacterial and archaeal RNA-seq data
BSD 3-Clause "New" or "Revised" License
4 stars 1 forks source link

check md5sums of downloaded fastq files #9

Closed taylorreiter closed 2 years ago

taylorreiter commented 2 years ago

This would need to be done in Python, alongside the download rules.

taylorreiter commented 2 years ago

Instead, I matched read numbers against the fastp output to evaluate the downloads.

library(dplyr)
library(readr)
library(purrr)
source("scripts/utils.R")
# Sanity-check downloaded FASTQ files: compare the number of reads fastp saw
# before filtering with the read count recorded in the run metadata table.

# NOTE(review): hard-coded, machine-specific working directory. Kept so the
# relative paths below resolve exactly as before; consider running the script
# from the repository root (or using here::here()) instead of calling setwd().
setwd("~/github/2022-microberna/")

# Combine every fastp JSON report into a single data frame.
# read_fastp is not defined in this script -- presumably it comes from
# scripts/utils.R; confirm there which columns it returns.
fastp <- Sys.glob("outputs/rnaseq_fastp/*json") %>%
  map_dfr(read_fastp)

# Drop the ".fastp.json" suffix so file_name can be used as the join key
# against run_accession below.
fastp <- fastp %>%
  mutate(file_name = gsub("\\.fastp\\.json", "", file_name))
head(fastp$file_name)
ena <- read_tsv("inputs/20220407_runinfo.tsv.gz")

# Left join keeps every fastp report; reports with no matching run_accession
# get NA in all metadata columns (including read_count).
all <- left_join(fastp, ena, by = c("file_name" = "run_accession"))
colnames(all)

# Classify each run:
#   "missing" - one of the two counts is NA, so the run could not be checked
#               (e.g. no metadata row matched in the join above)
#   "ok"      - fastp's pre-filtering total equals read_count, or exactly
#               twice read_count (presumably paired-end runs where fastp
#               counts both mates -- TODO confirm)
#   "bad"     - counts disagree
# Previously NA comparisons propagated through ifelse() and were silently
# dropped by table(), hiding runs that were never actually verified.
check_readcount <- all %>%
  select(file_name, before_filtering_total_reads, read_count) %>%
  mutate(
    read_count_ok = case_when(
      is.na(before_filtering_total_reads) | is.na(read_count) ~ "missing",
      before_filtering_total_reads == read_count ~ "ok",
      before_filtering_total_reads == read_count * 2 ~ "ok",
      TRUE ~ "bad"
    )
  )

# useNA = "ifany" is a safety net so no row can disappear from the summary.
table(check_readcount$read_count_ok, useNA = "ifany")