Closed evajimenezguri closed 3 weeks ago
Thanks Eva,
OK, so we fixed (hopefully) this issue with the following branch https://github.com/Eco-Flow/pollen-metabarcoding/tree/kingdom_fix
Which now uses an input database that does not have a kingdom classification, starting at p (phylum) instead. So hopefully it should now run until the end of the pipeline.
We could add a flag to specify the type of DB you use with the pipeline so that this does not fail in the future
This should be fixed now, in this branch: https://github.com/Eco-Flow/pollen-metabarcoding/tree/Fix_database_autmation_if_different
If will automatically detect what the database has and create the relevant columns in the final file. So now any of the categories can be missing from the input database.
Could you please test that it works with your database.
git clone https://github.com/Eco-Flow/pollen-metabarcoding.git --branch Fix_database_autmation_if_different
Then cd into this directory and run the same as before, pointing to your new database
This is now merged into main code: d5883a9
My Database did not have Kingdom data and the pipeline failed for it.
Workflow execution completed unsuccessfully! The exit status of the task that caused the workflow execution to fail was: 1.
The full error message was:
Error executing process > 'R_PROCESSING (10793_100_S50)'
Caused by: Process
R_PROCESSING (10793_100_S50)
terminated with an error exit status (1)Command executed:
!/usr/bin/Rscript
library(dplyr)
data <- read.table("cut.10793_100_S50.tsv", header=F, sep=" ") data <- data %>% mutate_all(na_if,"")
id <- data.frame(do.call('rbind', strsplit(as.character(data$V1), ';', fixed=TRUE)))[1]
size <- data.frame(do.call('rbind', strsplit(as.character(data$V1), '=', fixed=TRUE)))[2] size$X2 <- as.numeric(size$X2)
classif <- data.frame(do.call('rbind', strsplit(as.character(data$V2),',',fixed=TRUE)))
k <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X1),'(',fixed=TRUE))) k$X1 <- gsub("k:", "", k$X1) k$X2 <- as.numeric(k$X2)
p <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X2),'(',fixed=TRUE))) p$X1 <- gsub("p:", "", p$X1) p$X2 <- as.numeric(p$X2)
c <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X3),'(',fixed=TRUE))) c$X1 <- gsub("c:", "", c$X1) c$X2 <- as.numeric(c$X2)
o <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X4),'(',fixed=TRUE))) o$X1 <- gsub("o:", "", o$X1) o$X2 <- as.numeric(o$X2)
f <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X5),'(',fixed=TRUE))) f$X1 <- gsub("f:", "", f$X1) f$X2 <- as.numeric(f$X2)
g <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X6),'(',fixed=TRUE))) g$X1 <- gsub("g:", "", g$X1) g$X2 <- as.numeric(g$X2)
s <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif$X7),'(',fixed=TRUE))) s$X1 <- gsub("s:", "", s$X1) s$X2 <- as.numeric(s$X2)
outdf <- cbind(id, k, p, c, o, f, g, s, size) colnames(outdf)[c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)] <- c("sample", "kingdom", "prob_kingdom", "division", "prob_division", "clade", "prob_clade", "order", "prob_order", "family", "prob_family", "genus", "prob_genus", "species", "prob_species", "size")
write.table(outdf, file=paste("10793_100_S50", ".classified.tsv", sep=""), quote=FALSE, sep=' ', row.names = FALSE)
Function for making pie charts
pie_plots <- function(r){ sums <- aggregate(size ~ outdf[[r]], outdf, sum) colnames(sums) <- c("tax", "size") sums <- sums[order(sums$size, decreasing = TRUE),] sums <- sums %>% mutate(perc = size/sum(size)) cutoff <- sum(sums$perc > 0.03) top_val <- head(sums$tax, n = cutoff) sums <- sums %>% mutate(legend_value = case_when(tax %in% top_val ~ tax, !(tax %in% top_val) ~ "OTHER" )) pie_table <- aggregate(size~legend_value,sums,sum) pdf (paste("10793_100_S50", ".", r, ".pdf", sep = ""), width=6, height=6) pie(pie_table$size, pie_table$legend_value, clockwise = T) dev.off() }
Make pie charts for order, family and genus
for (tax in c("family", "order", "genus")) { pie_plots(tax) }
Generate summary stats
total_mapped_reads <- sum(aggregate(size~genus, outdf, sum)[2]) #just need total so does not matter which column used unique_samples= nrow(outdf) output <- cbind(unique_samples, total_mapped_reads) write.table(output,"summary.tsv", quote=F, sep=" ", row.names = FALSE)
Command exit status: 1
Command output: (empty)
Command error:
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
The following objects are masked from ‘package:base’:
Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 1807, 0 Calls: cbind -> cbind -> data.frame Execution halted