PoonLab / tn

Optimization of genetic clustering methods by predictive modeling
GNU General Public License v3.0
0 stars 0 forks source link

Build trees for CN subtypes with fasttree and run clmp #6

Closed ArtPoon closed 5 years ago

ArtPoon commented 6 years ago

I'd like to see what happens!

ConnorChato commented 6 years ago

There are 4 .nwk files along with a text file summarizing the clmp output in ~/beijing/CNclmpData on my profile.

ConnorChato commented 6 years ago

3 Root to tip trees and their associated clump RData are now in ~/beijing/CNclmpData on my profile. In addition, the associated R-script for tracking purposes

ArtPoon commented 6 years ago

Thanks! Focus on the TN model analysis for now, but if you're still interested in tinkering with these data, here is an extended R script that prepares files for LSD analysis:

require(clmp)

nwk <- read.tree('~/work/beijing/data/01AE-BeijingFT.nwk')
years <- as.integer(gsub(".+_([0-9]+)$", "\\1", nwk$tip.label))

rooted <- rtt(nwk, tip.dates=years)
write.tree(rooted, file='~/work/beijing/data/01AE-rtt.nwk')

n.days <- function(year, month) {
  # @param year: string (number) or integer for full year, e.g., '1997'
  # @param month: string (number) or integer for month

  start <- as.Date(paste(year, month, '01', sep='-'))
  if (month == '12') {
    # increment to start of next year
    end <- as.Date(paste(as.integer(year)+1, '01', '01', sep='-'))
  } else {
    end <- as.Date(paste(year, as.integer(month)+1, '01', sep='-'))
  }
  return(as.integer(difftime(end, start)))
}
mid.date <- function(lo, hi){
  mid <- as.integer(as.Date(lo,origin = "1970-01-01")) + as.integer(as.Date(hi,origin = "1970-01-01")-as.Date(lo,origin = "1970-01-01"))/2
  return (mid)
}

date.to.decimal <- function(dt) {
  return (as.double(dt) / 365.25 + 1970)
}

get.range <- function(x) {
  x = as.character(x)
  items <- strsplit(x, '-')[[1]]
  year <- items[1]
  if (length(items) == 1) {
    ## year only
    low <- as.Date(paste(year, '01', '01', sep='-'))
    high <- as.Date(paste(year, '12', '31', sep='-'))
  } 
  else if (length(items) == 2) {
    ## year and month
    month <- items[2]

    # determine number of days in this month
    days <- n.days(year, month)
    low <- as.Date(paste(year, month, '01', sep='-'))
    high <- as.Date(paste(year, month, days, sep='-'))
  } 
  else {
    # year, month, day
    days <- items[3]
    month <- items[2]
    low <- as.Date(paste(year, month, days, sep='-'))
    high <- low
  }
  return (c(low, high))
}

# write date file for LSD
setwd('~/work/beijing/data')

temp.str <- write.tree(rooted)
write(gsub("NA;$", ":0;", temp.str), 'rooted.nwk')  # replace NA root length

write(length(years), file='date_file.txt')
for (i in 1:length(years)) {
  min.max <- get.range(years[i])
  write(paste(nwk$tip.label[i], 
              date.to.decimal(mid.date(min.max[1], min.max[2]))),
              #paste0("b(", 
              #       date.to.decimal(min.max[1]), ",", 
              #       date.to.decimal(min.max[2]), ")")),
        file="date_file.txt", append=TRUE)
}

# run:
# lsd -i rooted.nwk -d date_file.txt -o lsd.nwk -c

res <- clmp(rooted)

save(res, file='foobar.Rdata')

You would have to install lsd and then we need to find a method to resolve the zero-length branches in the result. This is going to require some digging in the literature, e.g., https://academic.oup.com/bioinformatics/article/28/20/2689/203074