Closed ArtPoon closed 1 year ago
This line is the culprit:
itemp <- iTotal[[i]][iTotal[[i]]$vloop==vloop,]
> str(iTotal)
'data.frame': 5146000 obs. of 8 variables:
$ header : chr "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_1_a" "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_2_a" "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_3_a" "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_4_a" ...
$ vloop : int 1 2 3 4 5 1 2 3 4 5 ...
$ vlen : int 72 138 105 93 33 75 147 105 93 33 ...
$ count : num 0 0 0 0 0 0 0 0 0 0 ...
$ pat : chr "108869" "108869" "108869" "108869" ...
$ index : int 99 99 99 99 99 157 157 157 157 157 ...
$ length : num 11.1 11.1 11.1 11.1 11.1 ...
$ rtt.mid: num 55.8 55.8 55.8 55.8 55.8 ...
> str(iTotal[[i]])
chr [1:5146000] "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_1_a" ...
Looks like this code was expecting iTotal
to be a list storing a collection of data.frame
objects, but instead it is a single data.frame
.
Fixed with following changes:
diff --git a/2_within-host/9_5_indel_rates.r b/2_within-host/9_5_indel_rates.r
index 67aabee..c8b8a45 100644
--- a/2_within-host/9_5_indel_rates.r
+++ b/2_within-host/9_5_indel_rates.r
@@ -3,25 +3,34 @@ require(stringr)
require(phangorn)
require(data.table)
require(bbmle)
-source("~/vindels/2_within-host/utils.r")
+source("~/git/vindels/2_within-host/utils.r")
# INSERTION PARSING ----------
-path <- "~/PycharmProjects/hiv-withinhost/"
-ifolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/ins/*.tsv"))
-dfolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/del/*.tsv"))
+#path <- "~/PycharmProjects/hiv-withinhost/"
+path <- "~/git/vindels/2_within-host/data/"
+
+## new-skygrid is not present in original path
+#ifolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/ins/*.tsv"))
+#dfolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/del/*.tsv"))
+ifolder <- Sys.glob(paste0(path, "9Indels/new-final/ins/*.tsv"))
+dfolder <- Sys.glob(paste0(path, "9Indels/new-final/del/*.tsv"))
+
sep <- "\t"
-trefolder <- paste0(path,"7SampleTrees/prelim200/")
+## prelim200 does not exist at original path PycharmProjects/hiv-withinhost
+#trefolder <- paste0(path,"7SampleTrees/prelim200/")
+trefolder <- paste0(path, "7SampleTrees/new-final/prelim/")
# CASE: Removed patient 56552 because could not complete Historian runs
# CASE: Removed patients 49641 and 56549 because they are SUBTYPE B
# CASE: Removed patient 28376 and B because of very bad Rsquared value
-reg <- "56552|49641|56549|28376"
-newreg <- "30647"
+#reg <- "56552|49641|56549|28376"
+reg <- "56552|28376"
+#newreg <- "30647"
ifolder <- ifolder[!grepl(reg,ifolder)]
dfolder <- dfolder[!grepl(reg,dfolder)]
-ifolder <- ifolder[grepl(newreg,ifolder)]
-dfolder <- dfolder[grepl(newreg,dfolder)]
+#ifolder <- ifolder[grepl(newreg,ifolder)]
+#dfolder <- dfolder[grepl(newreg,dfolder)]
tally <- function(infolder){
name <- basename(infolder)
@@ -41,7 +50,7 @@ dtip <- list()
for (file in 1:length(ifolder)){
- print(file)
+ if (file %% 100 == 0) print(file)
filename <- basename(ifolder[file])
full.id <- gsub("_\\d+\\.tsv$","",filename)
@@ -88,8 +97,8 @@ for (file in 1:length(ifolder)){
tips <- which(grepl("^[^\\(\\):\n]+$", iCSV$header))
nodes <- which(!grepl("^[^\\(\\):\n]+$", iCSV$header))
- iCSV <- iCSV[,-c(2,5,6,8)]
- dCSV <- dCSV[,-c(2,5,6,8)]
+ iCSV <- iCSV[,-c(2,5,6)] #,8)]
+ dCSV <- dCSV[,-c(2,5,6)] #,8)]
if (is.null(iint[[id]])){
iint[[id]] <- iCSV[nodes,]
@@ -105,7 +114,7 @@ for (file in 1:length(ifolder)){
}
-
+#save(iint, itip, dint, dtip, maxes, file="9_5_indel_rates_AP.RData")
# CHECKPOINT : 9_6_finished.RData
patnames <- unname(sapply(names(iint), function(x){strsplit(x, "-")[[1]][1]}))
@@ -122,18 +131,22 @@ counts <- lapply(unique(patnames), function(x){
})
-iint <- as.data.frame(rbindlist(iint))
-itip <- as.data.frame(rbindlist(itip))
-dint <- as.data.frame(rbindlist(dint))
-dtip <- as.data.frame(rbindlist(dtip))
+#iint <- as.data.frame(rbindlist(iint))
+#itip <- as.data.frame(rbindlist(itip))
+#dint <- as.data.frame(rbindlist(dint))
+#dtip <- as.data.frame(rbindlist(dtip))
-iTotal <- iint
-dTotal <- dint
+#iTotal <- iint
+#dTotal <- dint
require(data.table)
-iTotal <- as.data.frame(rbindlist(iTotal))
-dTotal <- as.data.frame(rbindlist(dTotal))
+#iTotal <- as.data.frame(rbindlist(iTotal))
+#dTotal <- as.data.frame(rbindlist(dTotal))
+
+iTotal <- as.data.frame(rbindlist(iint))
+dTotal <- as.data.frame(rbindlist(dint))
+
# all.ins <- as.data.frame(rbindlist(csv.ins))
# all.del <- as.data.frame(rbindlist(csv.del))
@@ -154,14 +167,14 @@ require(BSDA)
ins.list <- list()
del.list <- list()
-for (i in 1:length(iTotal)){
+for (i in 1:length(itip)){
print(i)
irates <- c()
drates <- c()
for (vloop in 1:5){
- itemp <- iTotal[[i]][iTotal[[i]]$vloop==vloop,]
- dtemp <- dTotal[[i]][dTotal[[i]]$vloop==vloop,]
+ itemp <- itip[[i]][itip[[i]]$vloop==vloop,]
+ dtemp <- dtip[[i]][dtip[[i]]$vloop==vloop,]
iFinal <- itemp[itemp$count < 3 & itemp$length > 0,]
dFinal <- dtemp[dtemp$count < 3 & dtemp$length > 0,]
@@ -175,16 +188,16 @@ for (i in 1:length(iTotal)){
#print(nrow(current) - nrow(iFinal))
# INSERTION RATES
- cons <- 0
- vals <- c()
- for (i in 2:length(vec1)){
- if (vec1[i] == (vec1[i-1]+1)){
- cons = cons + 1
- }else{
- vals <- c(vals, cons)
- cons <- 0
- }
- }
+ #cons <- 0
+ #vals <- c()
+ #for (i in 2:length(vec1)){
+ # if (vec1[i] == (vec1[i-1]+1)){
+ # cons = cons + 1
+ # }else{
+ # vals <- c(vals, cons)
+ # cons <- 0
+ # }
+ #}
# DELETION RATES
tryCatch(
{
@@ -220,18 +233,20 @@ for (i in 1:length(iTotal)){
irates <- irates*10^3
drates <- drates*10^3
- pat <- str_split(names(iTotal)[i], "-")[[1]][1]
- run <- str_split(names(dTotal)[i], "-")[[1]][2]
+ pat <- str_split(names(itip)[i], "-")[[1]][1]
+ run <- str_split(names(itip)[i], "-")[[1]][2]
# contain the 20
- ins.list[[i]] <- data.frame(pat=pat, run=run, V1=irates[1],V2=irates[2],V3=irates[3],V4=irates[4],V5=irates[5])
- del.list[[i]] <- data.frame(pat=pat, run=run,V1=drates[1],V2=drates[2],V3=drates[3],V4=drates[4],V5=drates[5])
+ ins.list[[i]] <- data.frame(pat=pat, run=run, V1=irates[1], V2=irates[2],
+ V3=irates[3], V4=irates[4], V5=irates[5])
+ del.list[[i]] <- data.frame(pat=pat, run=run, V1=drates[1], V2=drates[2],
+ V3=drates[3], V4=drates[4], V5=drates[5])
}
-ins.list <- as.data.frame(rbindlist(ins.list))
-ins.list <- as.data.frame(rbindlist(del.list))
+ins.df <- as.data.frame(rbindlist(ins.list))
+del.df <- as.data.frame(rbindlist(del.list))
-ins.final <- split(ins.list, ins.list$pat)
-del.final <- split(del.list, del.list$pat)
+ins.final <- split(ins.df, ins.df$pat)
+del.final <- split(del.df, del.df$pat)
pat <- names(ins.final)
@@ -257,8 +272,8 @@ for (i in 1:26){
dev.off()
-ins.list <- split(ins.df, ins.df$pat)
-del.list <- split(del.df, del.df$pat)
+#ins.list <- split(ins.df, ins.df$pat)
+#del.list <- split(del.df, del.df$pat)
Given the number of changes I've had to make, I'm starting to wonder if this script was deprecated @jpalmer37 ?
This arose when trying to run this code block: https://github.com/PoonLab/vindels/blob/03ac625896f931a6ea9c3a723c991a55e89310d2/2_within-host/9_5_indel_rates.r#L153-L229