dib-lab / charcoal

Remove contaminated contigs from genomes using k-mers and taxonomies.
Other
52 stars 1 forks source link

R parsers for stage1 and stage2 json files #209

Open taylorreiter opened 2 years ago

taylorreiter commented 2 years ago

Varying levels of goodness

stage1/*contam_summary.json:

library(jsonlite)
library(dplyr)
json <- fromJSON('outputs/gtdb_rs202_charcoal1/stage1/GCA_900762445.1_genomic.fna.gz.contam_summary.json')
all_contam_summary <- data.frame()
for(i in 1:length(json[[1]])){
  genome_taxonomy <- unlist(json[[1]][[i]][1])
  genome_taxonomy <- genome_taxonomy[5:8]
  genome_taxonomy <- paste(genome_taxonomy, collapse = ";")
  contig_taxonomy <- unlist(json[[1]][[i]][2])
  contig_taxonomy <- contig_taxonomy[5:8]
  contig_taxonomy <- paste(contig_taxonomy, collapse = ";")
  counts <- unlist(json[[1]][[i]][3])
  contam_summary <- data.frame(genome_taxonomy, contig_taxonomy, counts)
  all_contam_summary <- bind_rows(all_contam_summary, contam_summary)
}

stage1/*contigs-tax.json: This one doesn't have good labels for what the numbers are...because it was hard to figure it out :)

library(jsonlite)
library(tibble)
library(stringr)
read_contigs_tax <- function(contigs_tax_path){  
  json <- fromJSON(contigs_tax_path)
  contig_tax_all <- data.frame()
  for(i in 1:length(json)){
    contig_name <- names(json)[i]
    num1 <- json[[i]][[1]]
    num2 <- json[[i]][[2]]
    if(length(json[[i]][[3]]) > 0){
      lineage <- json[[i]][[3]][[1]][[1]][,2]
      lineage = paste(lineage, collapse = ";", sep = ";")
      num3 <- json[[i]][[3]][[1]][[2]]
    } else {
      lineage = NA
      num3 = NA
    }
    contig_tax <- data.frame(contig_name, num1, num2, lineage, num3)
    contig_tax_all <- bind_rows(contig_tax_all, contig_tax)
  }
  return(contig_tax_all)
}

stage2/*matches.json:

library(jsonlite)
library(dplyr)
library(tidyr)
read_matches_json <- function(matches_json_path){
  read_json(matches_json_path, simplifyVector = F) %>%
    as.data.frame() %>%
    mutate(across(everything(), as.character)) %>%
    pivot_longer(cols = !starts_with("query_info"), names_to = "name", values_to = "value") %>%
    separate(col = name, into = c("match", "match_accession", "version", "name"), sep = "\\.", remove = T) %>%
    mutate(accession = paste(match_accession, version, sep = ".")) %>%
    select(-version, -match) %>%
    pivot_wider(id_cols = c("query_info.genome", "query_info.genome_lineage", "query_info.match_rank", 
                            "query_info.scaled", "match_accession"), names_from = name, values_from = value) %>%
    mutate(counts = as.numeric(counts),
           query_info.scaled = as.numeric(query_info.scaled)) %>%
    arrange(match_type, desc(counts)) %>%
    separate(col = lineage, into = c("domain", "phylum", "class", "order", "family", "genus", "species"), sep = ';')
}
taylorreiter commented 2 years ago

Ok I think I figured out what the num* were for stage1/*contigs-tax.json:

 num1 = basepairs
 num2 = hashes, 
 num3 = matched_hashes

and matched_hashes should match counts in stage2/*matches.json