Open taylorreiter opened 2 years ago
Varying levels of goodness
stage1/*contam_summary.json:
stage1/*contam_summary.json
library(jsonlite) library(dplyr) json <- fromJSON('outputs/gtdb_rs202_charcoal1/stage1/GCA_900762445.1_genomic.fna.gz.contam_summary.json') all_contam_summary <- data.frame() for(i in 1:length(json[[1]])){ genome_taxonomy <- unlist(json[[1]][[i]][1]) genome_taxonomy <- genome_taxonomy[5:8] genome_taxonomy <- paste(genome_taxonomy, collapse = ";") contig_taxonomy <- unlist(json[[1]][[i]][2]) contig_taxonomy <- contig_taxonomy[5:8] contig_taxonomy <- paste(contig_taxonomy, collapse = ";") counts <- unlist(json[[1]][[i]][3]) contam_summary <- data.frame(genome_taxonomy, contig_taxonomy, counts) all_contam_summary <- bind_rows(all_contam_summary, contam_summary) }
stage1/*contigs-tax.json: This one doesn't have good labels for what the numbers are...because it was hard to figure it out :)
stage1/*contigs-tax.json
library(jsonlite) library(tibble) library(stringr) read_contigs_tax <- function(contigs_tax_path){ json <- fromJSON(contigs_tax_path) contig_tax_all <- data.frame() for(i in 1:length(json)){ contig_name <- names(json)[i] num1 <- json[[i]][[1]] num2 <- json[[i]][[2]] if(length(json[[i]][[3]]) > 0){ lineage <- json[[i]][[3]][[1]][[1]][,2] lineage = paste(lineage, collapse = ";", sep = ";") num3 <- json[[i]][[3]][[1]][[2]] } else { lineage = NA num3 = NA } contig_tax <- data.frame(contig_name, num1, num2, lineage, num3) contig_tax_all <- bind_rows(contig_tax_all, contig_tax) } return(contig_tax_all) }
stage2/*matches.json:
stage2/*matches.json
library(jsonlite) library(dplyr) library(tidyr) read_matches_json <- function(matches_json_path){ read_json(matches_json_path, simplifyVector = F) %>% as.data.frame() %>% mutate(across(everything(), as.character)) %>% pivot_longer(cols = !starts_with("query_info"), names_to = "name", values_to = "value") %>% separate(col = name, into = c("match", "match_accession", "version", "name"), sep = "\\.", remove = T) %>% mutate(accession = paste(match_accession, version, sep = ".")) %>% select(-version, -match) %>% pivot_wider(id_cols = c("query_info.genome", "query_info.genome_lineage", "query_info.match_rank", "query_info.scaled", "match_accession"), names_from = name, values_from = value) %>% mutate(counts = as.numeric(counts), query_info.scaled = as.numeric(query_info.scaled)) %>% arrange(match_type, desc(counts)) %>% separate(col = lineage, into = c("domain", "phylum", "class", "order", "family", "genus", "species"), sep = ';') }
Ok I think I figured out what the num* were for stage1/*contigs-tax.json:
num*
num1 = basepairs num2 = hashes, num3 = matched_hashes
and matched_hashes should match counts in stage2/*matches.json
matched_hashes
counts
Varying levels of goodness
stage1/*contam_summary.json
:stage1/*contigs-tax.json
: This one doesn't have good labels for what the numbers are...because it was hard to figure it out :)stage2/*matches.json
: