SteMIDIfactory / P-DOR

Quick and easy outbreak reconstruction pipeline
GNU General Public License v3.0
5 stars 2 forks source link

Error when generating annotated phylogenetic tree #5

Open riccabolla opened 2 months ago

riccabolla commented 2 months ago

Hi,

I have tried P-DOR over a batch of 347 FASTA samples. It went all smooth, till the end were I faced this error:

Date and Time: Wed Sep 18 15:06:05 2024

Possible outbreak detected!

Generating annotated phylogenetic tree...

Error in names(color_vec) <- unique(rr$cluster) : 'names' attribute [19] must be the same length as the vector [8] Execution halted

Is there any way to overcome it? As with this error no .svg files are generated for the phylogenetic tree. Am I using too many samples or it has generated too many clusters? I did not find any warning on the page so this is why I am asking you.

Thanks in advance

gbatbiff commented 1 month ago

Hi,

It is not a problem of the dataset, but rather a bug regarding the number of possible clusters found. Thanks for point this out.

Although P-DOR can easily handle hundreds of samples, it was conceived mainly to deal with relatively small datasets (e.g. N<100), as this resembles a plausible outbreak population in reality, with the chance of having a bunch of clusters. I attach below a new version of the script annotated_tree.R which takes into account more clusters (the first 11 clusters in by decreasing order) indicating as "Other" the other ones. This should give you the chance to handle a bigger dataset. I have also updated the help raising a warning about this issue.

You can just run "Rscript annotated_tree.R [SNP alignment file name]" in a folder where you have the also cluster file (and/or the summary of resistance file, If you have used that option when running P-DOR).

Please, let me know if that works now.

gbatbiff commented 1 month ago

rm(list=ls())

options(warn=-1)

suppressPackageStartupMessages({

library(svglite) library(pheatmap) library(RColorBrewer) library(ggtree) library(ggnewscale) library(ggtreeExtra) library(ggplot2)

})

args=commandArgs(trailingOnly = TRUE) snp_phylotree<-args[1]

summary_resistance_virulence<-"summary_resistance_virulence.txt"

tree <- read.tree(snp_phylotree)

add<-as.data.frame(tree$tip.label) colnames(add)<-"STRAIN_ID"

#######################################CHECK CLUSTER FILE

check_cluster_file<-list.files(pattern = ".csv")

if (length(check_cluster_file)>0) {

cluster_file<-read.csv(list.files(pattern = "csv"),sep='') cluster_file$cluster<-as.character(cluster_file$cluster) cluster_file$STRAIN_ID<-as.character(cluster_file$STRAIN_ID)

cat("\n\nPossible outbreak detected!\n\nGenerating annotated phylogenetic tree...\n\n")

} else {

cluster_file<-data.frame(matrix(ncol=2)) colnames(cluster_file)<-c("STRAIN_ID","cluster")

}

mm<-merge(cluster_file,add,by="STRAIN_ID",all = T)

mm<-mm[which(mm$STRAIN_ID %in% tree$tip.label),]

rr<-unique(rbind(mm,cluster_file)) rr<-rr[-which(is.na(rr$STRAIN_ID)),]

rownames(rr)<-rr$STRAIN_ID rr$STRAIN_ID<-NULL

add<-as.data.frame(tree$tip.label) colnames(add)<-"STRAIN_ID"

query_out_lab<-c() for (i in cluster_file$cluster){ cc<-cluster_file[which(clusterfile$cluster==i),] if (length(grep("BD",cc$STRAIN_ID,invert = T,value = T))>=1) query_out_lab<-unique(c(query_out_lab,i))

}

cluster_file_query_out<-cluster_file[which(cluster_file$cluster %in% query_out_lab),]

mm<-merge(cluster_file_query_out,add,by="STRAIN_ID",all = T) mm<-mm[which(mm$STRAIN_ID %in% tree$tip.label),]

rr<-unique(rbind(mm,cluster_file_query_out))

num_samples <- nrow(rr)

num_clusters <- length(unique(rr$cluster))

base_size <- max(1, 20 / log10(num_samples))0.25 # Example formula for size legend_font_size <- max(1, 10 / log10(num_clusters))0.25 # Adjusting the legend size

rr<-rr[order(rr$cluster),]

ncols<-nrow(table(rr$cluster))

rr$cluster<-ifelse(is.na(rr$cluster),"",rr$cluster)

rr<-rr[which(!is.na(rr$cluster)),]

p<-ggtree(tree,branch.length = "none",layout = "rectangular",size=base_size*0.5)

most_clade<-rownames(as.matrix(head(sort(table(rr$cluster),decreasing = T),11))) rr$cluster<-ifelse(rr$cluster %in% most_clade, rr$cluster, "Others")

cols<-suppressWarnings(brewer.pal(n = nrow(table(rr$cluster)), name = "Paired"))

if (nrow(table(rr$cluster))==1){ cols<-cols[1] }

color_vec <- cols names(color_vec) <- unique(rr$cluster) color_vec[names(color_vec) == "Others"] <- "lightgrey"

offset_tips<-mean(nchar(tree$tip.label))/base_size

###########NO CLUSTER NO RESISTANCE

if (length(unique(rr$cluster))==1 && is.na(unique(rr$cluster))){

pp <- p %<+% rr + geom_tiplab(aes(fontface=base_size),alpha=1, size=base_size,offset = 0.1,show.legend = FALSE)+ theme_tree2()+theme(legend.text=element_text(size=legend_font_size1.5),legend.title = element_text(size=legend_font_size1.5))

rownames(rr)<-rr$STRAIN_ID rr$STRAIN_ID<-NULL colnames(rr)<-" "

suppressMessages(

hm_cluster <- gheatmap(pp, rr, offset = offset_tips, width=0.02, font.size = base_size)+
  scale_fill_manual(values = color_vec,na.translate = F,name="Cluster")+
  scale_y_continuous(limits=c(-1, NA))+theme(legend.key.size = unit(base_size/legend_font_size, 'cm'),
                                             legend.spacing.y = unit(base_size/legend_font_size,"cm"))

)

ggsave(hm_cluster, filename = "annotated_tree.svg",width = log(num_samples)15, height=log(num_samples)10, units = "cm",scale=1.2,limitsize = F)

###########CLUSTER YES RESISTANCE YES

} else if (file.exists("summary_resistance_virulence.txt") && length(unique(rr$cluster)>=1)) {

report<-read.delim("summary_resistance_virulence.txt",sep='\t')

refname<-unique(report[-which(report$Name %in% tree$tip.label),"Name"])

report<-unique(report)

colnames(report)[17:18]<-c("COVERAGE","IDENTITY")

report$COVERAGE<-as.character(report$COVERAGE) report$IDENTITY<-as.character(report$IDENTITY)

report<-report[which(report$Name!="Name"),] genes_list<-grep("Gene",report$Gene.symbol,value = T,invert = T)

report<-report[which(as.character(report$Gene.symbol) %in% genes_list),]

report<-report[grep("fna|fasta",report$Name),]

report$Name<-gsub(".fna","",report$Name) report$Name<-gsub(".fasta","",report$Name)

report_res<-report[which(report$COVERAGE=="100.00" & report$IDENTITY=="100.00"),] report_res<-report_res[which(report_res$Element.type=="AMR" | report_res$Element.type=="STRESS"),] report_vir<-report[which(as.numeric(report$COVERAGE)>80 & as.numeric(report$IDENTITY)>80),] report_vir<-report_vir[which(report_vir$Element.type=="VIRULENCE"),]

mat_widh<-length(unique(report_res$Gene.symbol))*0.075

report_summary<-rbind(report_res,report_vir)

pp<-table(report_summary$Name,report_summary$Gene.symbol)

res_tab<-table(report_res$Name,report_res$Gene.symbol) res_mtab<-as.matrix(res_tab) res_mtab[res_mtab>1]<-1

res_mtab1<-matrix(as.character(res_mtab),ncol = ncol(res_mtab)) rownames(res_mtab1)<-rownames(res_mtab) colnames(res_mtab1)<-colnames(res_mtab)

res_pres<-unique(report_res$Name) missed_res<-unique(report[-which(report$Name %in% res_pres),"Name"])

ss<-data.frame(matrix(NA,nrow=length(missed_res),ncol=ncol(res_mtab1))) colnames(ss)<-colnames(res_mtab1) rownames(ss)<-missed_res ss[is.na(ss)]<-0

res_mtab1<-rbind(res_mtab1,ss)

res_mtab1<-as.matrix(res_mtab1)

res_mtab1[res_mtab1=="1"]<-"AMR/STRESS"

vir_tab<-table(report_vir$Name,report_vir$Gene.symbol) vir_mtab<-as.matrix(vir_tab) vir_mtab[vir_mtab>1]<-1 vir_mtab1<-matrix(as.character(vir_mtab),ncol = ncol(vir_mtab)) rownames(vir_mtab1)<-rownames(vir_mtab) colnames(vir_mtab1)<-colnames(vir_mtab)

vir_pres<-unique(report_vir$Name) missed_vir<-unique(report[-which(report$Name %in% vir_pres),"Name"])

ss<-data.frame(matrix(NA,nrow=length(missed_vir),ncol=ncol(vir_mtab1))) colnames(ss)<-colnames(vir_mtab1) rownames(ss)<-missed_vir ss[is.na(ss)]<-0

vir_mtab1<-rbind(vir_mtab1,ss)

vir_mtab1$Gene symbol<-NULL vir_mtab1<-as.matrix(vir_mtab1) vir_mtab1[vir_mtab1=="1"]<-"VIR"

if (dim(vir_mtab1)[1]==0){ summary_resvir<-res_mtab1 summary_resvir[summary_resvir=="0"]<-NA

} else if (dim(res_mtab1)[1]==0){ summary_resvir<-vir_mtab1 summary_resvir[summary_resvir=="0"]<-NA

}else{

vir_mtab1<-vir_mtab1[rownames(res_mtab1),]
summary_resvir<-cbind(vir_mtab1,res_mtab1)
summary_resvir[summary_resvir=="0"]<-NA

}

pp <- p %<+% rr + geom_tiplab(aes(fontface=base_size),alpha=1, size=base_size,offset = 0.1,show.legend = FALSE)+ theme_tree2()+theme(legend.text=element_text(size=legend_font_size1.5),legend.title = element_text(size=legend_font_size1.5))

suppressMessages(

res_hm <- gheatmap(pp,summary_resvir, offset = offset_tips+1, width=log10(ncol(summary_resvir)*2),
                   font.size=base_size, colnames_position= "top",
                   colnames_angle = 90, colnames_offset_y = -0.2, hjust = 0) +
  scale_y_continuous(limits=c(-1, NA))+
  scale_fill_manual(breaks=c("AMR/STRESS", "VIR"),
                    values=c("#6495ED", "firebrick"), name="Category",na.translate = F)

) rownames(rr)<-rr$STRAIN_ID rr$STRAIN_ID<-NULL

cluster_hm_levels <-res_hm + new_scale_fill()

suppressMessages(

hm_cluster <- gheatmap(cluster_hm_levels, rr, offset = offset_tips, width=0.02, font.size = base_size)+
  scale_fill_manual(values = color_vec,na.translate = F,name="Cluster")+
  scale_y_continuous(limits=c(-1, NA))+theme(legend.key.size = unit(base_size/legend_font_size, 'cm'),
                                             legend.spacing.y = unit(base_size/legend_font_size,"cm"))

)

ggsave(hm_cluster, filename = "annotated_tree.svg",width = log(num_samples)15, height=log(num_samples)15, units = "cm",scale=1.2,limitsize = F)

########### CLUSTER AND NO RESISTANCE

} else if (file.exists("summary_resistance_virulence.txt")==FALSE) {

pp <- p %<+% rr + geom_tiplab(aes(fontface=base_size),alpha=1, size=base_size,offset = 0.1,show.legend = FALSE)+ theme_tree2()+theme(legend.text=element_text(size=legend_font_size1.5),legend.title = element_text(size=legend_font_size1.5))

rownames(rr)<-rr$STRAIN_ID

rr$STRAIN_ID<-NULL

offset_tips<-mean(nchar(tree$tip.label))/base_size

suppressMessages(

hm_cluster <- gheatmap(pp, rr, offset = offset_tips, width=0.02, font.size = base_size)+
  scale_fill_manual(values = color_vec,na.translate = F,name="Cluster")+
  scale_y_continuous(limits=c(-1, NA))+theme(legend.key.size = unit(base_size/legend_font_size, 'cm'),
                                             legend.spacing.y = unit(base_size/legend_font_size,"cm"))

)

ggsave(hm_cluster, filename = "annotated_tree.svg",width = log(num_samples)15, height=log(num_samples)10, units = "cm",scale=1.2,limitsize = F)

######### NO CLUSTER / YES RESISTANCE } else if (file.exists("summary_resistance_virulence.txt")) {

report<-read.delim("summary_resistance_virulence.txt",sep='\t')

refname<-unique(report[-which(report$Name %in% tree$tip.label),"Name"])

report<-unique(report)

colnames(report)[17:18]<-c("COVERAGE","IDENTITY")

report$COVERAGE<-as.character(report$COVERAGE) report$IDENTITY<-as.character(report$IDENTITY)

ref_name<-sub(".*\/", "", ref_name)

report<-report[which(report$Name!="Name"),] genes_list<-grep("Gene",report$Gene.symbol,value = T,invert = T)

report<-report[which(as.character(report$Gene.symbol) %in% genes_list),]

report<-report[grep("fna|fasta",report$Name),]

report$Name<-gsub(".fna","",report$Name) report$Name<-gsub(".fasta","",report$Name)

report_res<-report[which(report$COVERAGE=="100.00" & report$IDENTITY=="100.00"),] report_res<-report_res[which(report_res$Element.type=="AMR" | report_res$Element.type=="STRESS"),] report_vir<-report[which(as.numeric(report$COVERAGE)>80 & as.numeric(report$IDENTITY)>80),] report_vir<-report_vir[which(report_vir$Element.type=="VIRULENCE"),]

res_tab<-table(report_res$Name,report_res$Gene.symbol) res_mtab<-as.matrix(res_tab) res_mtab[res_mtab>1]<-1

res_mtab1<-matrix(as.character(res_mtab),ncol = ncol(res_mtab)) rownames(res_mtab1)<-rownames(res_mtab) colnames(res_mtab1)<-colnames(res_mtab)

res_pres<-unique(report_res$Name) missed_res<-unique(report[-which(report$Name %in% res_pres),"Name"])

ss<-data.frame(matrix(NA,nrow=length(missed_res),ncol=ncol(res_mtab1))) colnames(ss)<-colnames(res_mtab1) rownames(ss)<-missed_res ss[is.na(ss)]<-0

res_mtab1<-rbind(res_mtab1,ss)

res_mtab1<-as.matrix(res_mtab1)

res_mtab1[res_mtab1=="1"]<-"AMR/STRESS"

vir_tab<-table(report_vir$Name,report_vir$Gene.symbol) vir_mtab<-as.matrix(vir_tab) vir_mtab[vir_mtab>1]<-1 vir_mtab1<-matrix(as.character(vir_mtab),ncol = ncol(vir_mtab)) rownames(vir_mtab1)<-rownames(vir_mtab) colnames(vir_mtab1)<-colnames(vir_mtab)

vir_pres<-unique(report_vir$Name) missed_vir<-unique(report[-which(report$Name %in% vir_pres),"Name"])

ss<-data.frame(matrix(NA,nrow=length(missed_vir),ncol=ncol(vir_mtab1))) colnames(ss)<-colnames(vir_mtab1) rownames(ss)<-missed_vir ss[is.na(ss)]<-0

vir_mtab1<-rbind(vir_mtab1,ss)

vir_mtab1$Gene symbol<-NULL vir_mtab1<-as.matrix(vir_mtab1) vir_mtab1[vir_mtab1=="1"]<-"VIR"

if (dim(vir_mtab1)[1]==0){ summary_resvir<-res_mtab1 summary_resvir[summary_resvir=="0"]<-NA

} else if (dim(res_mtab1)[1]==0){ summary_resvir<-vir_mtab1 summary_resvir[summary_resvir=="0"]<-NA

}else{ vir_mtab1<-vir_mtab1[rownames(res_mtab1),] summary_resvir<-cbind(vir_mtab1,res_mtab1) summary_resvir[summary_resvir=="0"]<-NA

}

pp <- p %<+% rr + geom_tiplab(aes(fontface=base_size),alpha=1, size=base_size,offset = 0.1,show.legend = FALSE)+ theme_tree2()+theme(legend.text=element_text(size=legend_font_size1.5),legend.title = element_text(size=legend_font_size1.5))

rownames(rr)<-rr$STRAIN_ID

rr$STRAIN_ID<-NULL

offset_tips<-mean(nchar(tree$tip.label))/base_size

suppressMessages(

res_hm <- gheatmap(pp,summary_resvir, offset = offset_tips+1, width=log10(ncol(summary_resvir)*2),
                   font.size=base_size, colnames_position= "top",
                   colnames_angle = 90, colnames_offset_y = -0.2, hjust = 0) +
  scale_y_continuous(limits=c(-1, NA))+
  scale_fill_manual(breaks=c("AMR/STRESS", "VIR"),
                    values=c("#6495ED", "firebrick"), name="Category",na.translate = F)

)

ggsave(res_hm, filename = "annotated_tree.svg",width = log(num_samples)15, height=log(num_samples)10, units = "cm",scale=1.2,limitsize = F)

}