PoonLab / vindels

Developing an empirical model of sequence insertion and deletion in virus genomes
1 stars 0 forks source link

9_5_indel_rates.r - Error: $ operator is invalid for atomic vectors #102

Closed ArtPoon closed 1 year ago

ArtPoon commented 2 years ago

This arose when trying to run this code block: https://github.com/PoonLab/vindels/blob/03ac625896f931a6ea9c3a723c991a55e89310d2/2_within-host/9_5_indel_rates.r#L153-L229

ArtPoon commented 2 years ago

This line is the culprit:

itemp <- iTotal[[i]][iTotal[[i]]$vloop==vloop,]
ArtPoon commented 2 years ago
> str(iTotal)
'data.frame':   5146000 obs. of  8 variables:
 $ header : chr  "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_1_a" "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_2_a" "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_3_a" "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_4_a" ...
 $ vloop  : int  1 2 3 4 5 1 2 3 4 5 ...
 $ vlen   : int  72 138 105 93 33 75 147 105 93 33 ...
 $ count  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ pat    : chr  "108869" "108869" "108869" "108869" ...
 $ index  : int  99 99 99 99 99 157 157 157 157 157 ...
 $ length : num  11.1 11.1 11.1 11.1 11.1 ...
 $ rtt.mid: num  55.8 55.8 55.8 55.8 55.8 ...
> str(iTotal[[i]])
 chr [1:5146000] "(A1.RW.2008.108869.MF565944.10.3_122:0.000146051,A1.RW.2008.108869.MF565950.10.3_122:0.000146051)_1_a" ...

Looks like this code was expecting iTotal to be a list storing a collection of data.frame objects, but instead it is a single data.frame.

ArtPoon commented 2 years ago

Fixed with following changes:

diff --git a/2_within-host/9_5_indel_rates.r b/2_within-host/9_5_indel_rates.r
index 67aabee..c8b8a45 100644
--- a/2_within-host/9_5_indel_rates.r
+++ b/2_within-host/9_5_indel_rates.r
@@ -3,25 +3,34 @@ require(stringr)
 require(phangorn)
 require(data.table)
 require(bbmle)
-source("~/vindels/2_within-host/utils.r")
+source("~/git/vindels/2_within-host/utils.r")

 # INSERTION PARSING ----------
-path <- "~/PycharmProjects/hiv-withinhost/"
-ifolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/ins/*.tsv"))
-dfolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/del/*.tsv"))
+#path <- "~/PycharmProjects/hiv-withinhost/"
+path <- "~/git/vindels/2_within-host/data/"
+
+## new-skygrid is not present in original path
+#ifolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/ins/*.tsv"))
+#dfolder <- Sys.glob(paste0(path,"9Indels/new-skygrid/del/*.tsv"))
+ifolder <- Sys.glob(paste0(path, "9Indels/new-final/ins/*.tsv"))
+dfolder <- Sys.glob(paste0(path, "9Indels/new-final/del/*.tsv"))
+
 sep <- "\t"
-trefolder <- paste0(path,"7SampleTrees/prelim200/")
+## prelim200 does not exist at original path PycharmProjects/hiv-withinhost
+#trefolder <- paste0(path,"7SampleTrees/prelim200/")
+trefolder <- paste0(path, "7SampleTrees/new-final/prelim/")

 # CASE: Removed patient 56552 because could not complete Historian runs 
 # CASE: Removed patients 49641 and 56549 because they are SUBTYPE B
 # CASE: Removed patient 28376 and B because of very bad Rsquared value 
-reg <- "56552|49641|56549|28376"
-newreg <- "30647"
+#reg <- "56552|49641|56549|28376"
+reg <- "56552|28376"
+#newreg <- "30647"

 ifolder <- ifolder[!grepl(reg,ifolder)]
 dfolder <- dfolder[!grepl(reg,dfolder)]
-ifolder <- ifolder[grepl(newreg,ifolder)]
-dfolder <- dfolder[grepl(newreg,dfolder)]
+#ifolder <- ifolder[grepl(newreg,ifolder)]
+#dfolder <- dfolder[grepl(newreg,dfolder)]

 tally <- function(infolder){
   name <- basename(infolder)
@@ -41,7 +50,7 @@ dtip <- list()

 for (file in 1:length(ifolder)){
-  print(file)
+  if (file %% 100 == 0) print(file)
   filename <- basename(ifolder[file])
   full.id <- gsub("_\\d+\\.tsv$","",filename)

@@ -88,8 +97,8 @@ for (file in 1:length(ifolder)){
   tips <-  which(grepl("^[^\\(\\):\n]+$", iCSV$header))
   nodes <- which(!grepl("^[^\\(\\):\n]+$", iCSV$header))

-  iCSV <- iCSV[,-c(2,5,6,8)]
-  dCSV <- dCSV[,-c(2,5,6,8)]
+  iCSV <- iCSV[,-c(2,5,6)]  #,8)]
+  dCSV <- dCSV[,-c(2,5,6)]  #,8)]

   if (is.null(iint[[id]])){
     iint[[id]] <- iCSV[nodes,]
@@ -105,7 +114,7 @@ for (file in 1:length(ifolder)){
 }

-
+#save(iint, itip, dint, dtip, maxes, file="9_5_indel_rates_AP.RData")
 # CHECKPOINT : 9_6_finished.RData 

 patnames <- unname(sapply(names(iint), function(x){strsplit(x, "-")[[1]][1]}))
@@ -122,18 +131,22 @@ counts <- lapply(unique(patnames), function(x){
 })

-iint <- as.data.frame(rbindlist(iint))
-itip <- as.data.frame(rbindlist(itip))
-dint <- as.data.frame(rbindlist(dint))
-dtip <- as.data.frame(rbindlist(dtip))
+#iint <- as.data.frame(rbindlist(iint))
+#itip <- as.data.frame(rbindlist(itip))
+#dint <- as.data.frame(rbindlist(dint))
+#dtip <- as.data.frame(rbindlist(dtip))

-iTotal <- iint
-dTotal <- dint
+#iTotal <- iint
+#dTotal <- dint

 require(data.table)
-iTotal <- as.data.frame(rbindlist(iTotal))
-dTotal <- as.data.frame(rbindlist(dTotal))
+#iTotal <- as.data.frame(rbindlist(iTotal))
+#dTotal <- as.data.frame(rbindlist(dTotal))
+
+iTotal <- as.data.frame(rbindlist(iint))
+dTotal <- as.data.frame(rbindlist(dint))
+
 # all.ins <- as.data.frame(rbindlist(csv.ins))
 # all.del <- as.data.frame(rbindlist(csv.del))

@@ -154,14 +167,14 @@ require(BSDA)
 ins.list <- list()
 del.list <- list()

-for (i in 1:length(iTotal)){
+for (i in 1:length(itip)){
   print(i)
   irates <- c()
   drates <- c()

   for (vloop in 1:5){
-    itemp <- iTotal[[i]][iTotal[[i]]$vloop==vloop,]
-    dtemp <- dTotal[[i]][dTotal[[i]]$vloop==vloop,]
+    itemp <- itip[[i]][itip[[i]]$vloop==vloop,]
+    dtemp <- dtip[[i]][dtip[[i]]$vloop==vloop,]

     iFinal <- itemp[itemp$count < 3 & itemp$length > 0,]
     dFinal <- dtemp[dtemp$count < 3 & dtemp$length > 0,]
@@ -175,16 +188,16 @@ for (i in 1:length(iTotal)){
     #print(nrow(current) - nrow(iFinal))

     # INSERTION RATES 
-    cons <- 0
-    vals <- c()
-    for (i in 2:length(vec1)){
-      if (vec1[i] == (vec1[i-1]+1)){
-        cons = cons + 1
-      }else{
-        vals <- c(vals, cons)
-        cons <- 0
-      }
-    }
+    #cons <- 0
+    #vals <- c()
+    #for (i in 2:length(vec1)){
+    #  if (vec1[i] == (vec1[i-1]+1)){
+    #    cons = cons + 1
+    #  }else{
+    #    vals <- c(vals, cons)
+    #    cons <- 0
+    #  }
+    #}
     # DELETION RATES
     tryCatch(
       {
@@ -220,18 +233,20 @@ for (i in 1:length(iTotal)){
   irates <- irates*10^3
   drates <- drates*10^3

-  pat <- str_split(names(iTotal)[i], "-")[[1]][1]
-  run <- str_split(names(dTotal)[i], "-")[[1]][2]
+  pat <- str_split(names(itip)[i], "-")[[1]][1]
+  run <- str_split(names(itip)[i], "-")[[1]][2]

   # contain the 20 
-  ins.list[[i]] <- data.frame(pat=pat, run=run, V1=irates[1],V2=irates[2],V3=irates[3],V4=irates[4],V5=irates[5])
-  del.list[[i]] <- data.frame(pat=pat, run=run,V1=drates[1],V2=drates[2],V3=drates[3],V4=drates[4],V5=drates[5])
+  ins.list[[i]] <- data.frame(pat=pat, run=run, V1=irates[1], V2=irates[2],
+                              V3=irates[3], V4=irates[4], V5=irates[5])
+  del.list[[i]] <- data.frame(pat=pat, run=run, V1=drates[1], V2=drates[2],
+                              V3=drates[3], V4=drates[4], V5=drates[5])
 }
-ins.list <- as.data.frame(rbindlist(ins.list))
-ins.list <- as.data.frame(rbindlist(del.list))
+ins.df <- as.data.frame(rbindlist(ins.list))
+del.df <- as.data.frame(rbindlist(del.list))

-ins.final <- split(ins.list, ins.list$pat)
-del.final <- split(del.list, del.list$pat)
+ins.final <- split(ins.df, ins.df$pat)
+del.final <- split(del.df, del.df$pat)

 pat <- names(ins.final)
@@ -257,8 +272,8 @@ for (i in 1:26){

 dev.off()

-ins.list <- split(ins.df, ins.df$pat)
-del.list <- split(del.df, del.df$pat)
+#ins.list <- split(ins.df, ins.df$pat)
+#del.list <- split(del.df, del.df$pat)

Given the number of changes I've had to make, I'm starting to wonder if this script was deprecated @jpalmer37 ?