palautatan / mrRevBayes

translator from MrBayes to RevBayes
0 stars 0 forks source link

Library Assembly and Testing #4

Open palautatan opened 6 years ago

palautatan commented 6 years ago

Prepare the mrRevBayes R Package

  1. Edit all necessary functions (specificially edit addMCMCMC() to read in scripts correctly
  2. Create .R files for each function and library folder
  3. Test on 5 or 6 datasets for now
  4. Create a tutorial
palautatan commented 6 years ago

From S1115 testing.

setwd("/Users/EDIE/Downloads/S1115")

scheme_csv = "S1115.csv"
subsets_folder = "data/subsets/S1115"

# * PARTITIONED ANALYSIS
partitioned_model = function(scheme_csv, subsets_folder, name) {

  # * 0.
  # * BEGIN SCRIPT
  script = c()

  # * 1A.
  # * READ IN SCHEME CSV
  scheme = read.csv(scheme_csv, as.is=TRUE)
  num_subsets = length(scheme[1,])-1

  # * 2.
  # * READ IN NEXUS FILES
  subsets = colnames(scheme)[2:ncol(scheme)]
  subsets = gsub("^X", "", subsets)
  full_paths = paste0(subsets_folder, "/", subsets)

  data_reading_lines = c()
  for (ix in 1:length(subsets)) {
    read_in_data = "data = readDiscreteCharacterData("
    read_in_data = gsub("data", paste0("data_", subsets[ix]), read_in_data)
    read_in_data = paste0(read_in_data,"\"", full_paths[ix], ".nex\")")
    data_reading_lines = c(data_reading_lines, read_in_data)
  }
  script = c(script, data_reading_lines)

  var_1      = strsplit(data_reading_lines[1], " =")[[1]][1]
  taxa_line  = paste0("taxa = ", var_1, ".taxa()")
  ntax_line  = paste0("num_taxa = ", var_1, ".ntaxa()")
  branches   = "num_branches = 2 * num_taxa - 3"
  tree_stats = c(taxa_line, ntax_line, branches)
  script     = c(script, "\n", tree_stats)

  # * 3.
  # * MOVES
  move_start = "move_index = 0"
  script     = c(script, move_start)

  # * 4.
  # * PARTITION VERIFICATION
  if (ncol(scheme) > 2) {
    partition = 1
  } else {
    stop("This is not a partitioned model.")
  }

  # * 5.
  # * SUBSTITUTION MODELS

  # * GO ROW WISE
  models = as.character(scheme[1,2:ncol(scheme)])

  # * IF THE RELATIVE RATES AND STATE FREQS ARE IN THE SAME PARTITIONS
  if (all(as.numeric(scheme[2,2:ncol(scheme)]) == as.numeric(scheme[3,2:ncol(scheme)]))) {

    # * GET THE PARTITIONS
    mod_partitions = unique(as.numeric(scheme[2,2:ncol(scheme)]))
    mod_partitions = mod_partitions[!mod_partitions==0]

    # * FOR EACH PARTITION,
    for (ix in mod_partitions) {
      # * GET THE SUBSETS THAT ARE IN THE PARTITION
      this_partition = which(scheme[2,]==ix)

      # * MAKE SURE THEY'RE ALL THE SAME MODEL
      same_mods = length(unique(models[this_partition]))==1

      # * ASSIGN THE MODELS
      if (same_mods) {
        this_mod = models[this_partition[1]-1]

        if (this_mod == "GTR") {
          pi_1 = "pi ~ dnDirichlet(v(1,1,1,1))"
          pi_2 = "moves[++move_index] = mvBetaSimplex(pi, weight=1.0)"
          er_1 = "er ~ dnDirichlet(v(1,1,1,1,1,1))"
          er_2 = "moves[++move_index] = mvBetaSimplex(er, weight=1.0)"

          pi_1 = gsub("pi", paste0("pi_",ix), pi_1)
          pi_2 = gsub("pi", paste0("pi_",ix), pi_2)
          er_1 = gsub("er", paste0("er_",ix), er_1)
          er_2 = gsub("er", paste0("er_",ix), er_2)
          script = c(script, (c("\n", pi_1, pi_2, er_1, er_2)))
        }

      }

      if (this_mod != "GTR") {
        print(paste0("Need to write code to accomodate another model."))
      }

    } 
  }

  # * DO SIMILAR STEPS FOR GAMMA PARTITIONS
  gam_partitions = unique(as.numeric(scheme[4,2:ncol(scheme)]))
  gam_partitions = gam_partitions[!gam_partitions==0]

  for (ix in gam_partitions){
    # this_partition = which(scheme[2,]==ix)

    # scheme[4,this_partition]
    gam_1 = "alpha ~ dnExponential(1)"
    gam_2 = "moves[++move_index] = mvScale(alpha, weight=1.0)"
    gam_3 = "site_rates := fnDiscretizeGamma(alpha, alpha, 4)"

    gam_1 = gsub("alpha", paste0("alpha_",ix), gam_1)
    gam_2 = gsub("alpha", paste0("alpha_",ix), gam_2)
    gam_3 = gsub("alpha", paste0("alpha_",ix), gam_3)
    gam_3 = gsub("site_rates", paste0("site_rates_",ix), gam_3)

    script = c(script, c("\n", gam_1, gam_2, gam_3))
  }

  # * ALSO FOR PROPORTION OF INVARIANT SITES
  pinv_partitions = unique(as.numeric(scheme[2,2:ncol(scheme)]))
  pinv_partitions = pinv_partitions[!pinv_partitions==0]

  for (ix in pinv_partitions){
    # this_partition = which(scheme[2,]==ix)

    # scheme[4,this_partition]
    pinv_1 = "pinvar ~ dnBeta(1,1)"
    pinv_2 = "moves[++move_index] = mvSlide(pinvar, weight=1.0)"

    pinv_1 = gsub("pinvar", paste0("pinvar_",ix), pinv_1)
    pinv_2 = gsub("pinvar", paste0("pinvar_",ix), pinv_2)

    script = c(script, c("\n", pinv_1, pinv_2))
  }

  # * 5.5
  # * MAKE THE Q-MATRICES
  script = c(script, "\n\n")
  subsets = colnames(scheme)[2:ncol(scheme)]

  for (ix in 1:length(subsets)) {
    # subsets[ix]

    # * CHECK IF THEY'RE THE SAME
    if (var(scheme[2:3,ix+1]) == 0) {
      part_num = scheme[2:3,ix+1][1]

      if (models[1] == "GTR") {
        script = c(script, (paste0("Q_", subsets[ix], " := fnGTR(er_", part_num, ", pi_", part_num,")")))
      }
    }
  }

  # * 6.
  # * BRANCH RATES
  br_rate_lvs = unique(as.numeric(scheme[6,2:ncol(scheme)]))

  if (var(as.numeric(scheme[6,2:ncol(scheme)]))!=0) {
    br_rates = TRUE
    script = c(script, "\n\n")

    for (ix in br_rate_lvs) {
      this_partition = which(scheme[6,]==ix)

      sum = paste0("num_sites[", ix, "] = data_", paste0(subsets[this_partition-1], ".nchar()", collapse=" + "))
      script = c(script, sum)
    }

    br_1 = paste0("relative_rates ~ dnDirichlet(v(", paste(rep(1, length(this_partition)), collapse=","), "))")
    br_2 = "moves[++move_index] = mvBetaSimplex(relative_rates, weight=1.0)"
    br_3 = "subset_rates := relative_rates * sum(num_sites) / num_sites"

    script = c(script, c("\n", br_1, br_2, br_3))
  }

  # * TOPOLOGY AND BRANCH LENGTHS

  top_1 = "topology ~ dnUniformTopology(taxa)"
  top_2 = "moves[++move_index] = mvNNI(topology, weight=10.0)"
  top_3 = "moves[++move_index] = mvSPR(topology, weight=10.0)"

  br_lens_1 = "for(i in 1:num_branches){"
  br_lens_2 = "  br_lens[i] ~ dnExponential(10.0)"
  br_lens_3 = "  moves[++move_index] = mvScale(br_lens[i], weight=1.0)"
  br_lens_4 = "}"
  br_lens_5 = "TL := sum(br_lens)"

  phylogeny = "phylogeny := treeAssembly(topology, br_lens)"

  phylo_lines = c(top_1, top_2, top_3, br_lens_1, br_lens_2, br_lens_3, br_lens_4, br_lens_5, phylogeny)
  script = c(script, "\n", phylo_lines)

  # * 7.
  # * LIKELIHOOD FUNCTIONS

  for (ix in 2:ncol(scheme)) {
    lf_1 = paste0("seq_", subsets[ix-1], " ~ dnPhyloCTMC(tree=phylogeny, Q=Q_", subsets[ix-1], ")")
    lf_2 = paste0("seq_", subsets[ix-1], ".clamp(data_", subsets[ix-1], ")")

    # * SITE RATES
    if (scheme[4,ix] != 0) {
      lf_1 = gsub(")$", paste0(", siteRates=site_rates_", scheme[4,ix], ")"), lf_1)
    }

    # * INVARIANT SITES
    if (scheme[5,ix] != 0) {
      lf_1 = gsub(")$", paste0(", pInv=pinv_", scheme[5,ix], ")"), lf_1)
    }

    # * RATE MULTIPLIER
    if (br_rates) {
      if (scheme[6,ix] != 0) {
        lf_1 = gsub(")$", paste0(", branchRates=subset_rates[", scheme[6,ix], "])"), lf_1)
      }
    }

    script = c(script, c("\n", lf_1, lf_2))
  }

  # * 8.
  # * MODEL
  model = "my_model = model(phylogeny)"
  script = c(script, "\n", model)

  # * 9.
  # * SCRIPT
  return(script)

}

write.table(script, file="testing.txt", sep="\n", quote=FALSE, col.names=FALSE, row.names=FALSE)

model_script = "testing.txt"
analysis_name = "wtf"
output_folder = "wtf2"

addMCMC = function(model_script, analysis_name, output_folder, generations=10000, burnin=250, tuning=100) {

  # * 1.
  # * STARTERS
  # source_line = paste0("source(\"", model_script, "\")")
  name_line = paste0("name = \"", analysis_name, "\"")
  output_line = paste0("output = \"", output_folder, "/\"")

  # * 2.
  # * MONITORS
  mon_1 = paste0("monitors[1] = mnModel(filename = output + name + \"/\" + name + \"_posterior_samples.log\", printgen=10, separator=TAB)")
  mon_2 = paste0("monitors[2] = mnFile(filename = output + name + \"/\" + name + \"_tree_samples.trees\", printgen=10, separator=TAB, phylogeny)")
  mon_3 = "monitors[3] = mnScreen(printgen=100, TL)"
  mon_4 = paste0("monitors[4] = mnStochasticVariable(filename = output + name + \"/\" + name + \"_posterior_stoch.var\", separator=TAB)")
  mon_lines = c(mon_1, mon_2, mon_3, mon_4)

  # * 3.
  # * MCMC
  an_1 = "analysis = mcmc(my_model, monitors, moves)"
  burn_line = paste0("analysis.burnin(generations=", burnin, ", tuningInterval=", tuning,")")
  an_2 = "analysis.operatorSummary()"
  gen_line = paste0("analysis.run(generations=", generations, ")")
  mcmc_lines = c(an_1, burn_line, an_2, gen_line)

  # * 4.
  # * MAP TREE
  trace   = paste0("treetrace = readTreeTrace(output + name + \"/\" + name + \"_tree_samples.trees\", treetype=\"non-clock\")")
  maptree = paste0("map_tree = mapTree(treetrace, output + name + \"/\" + name + \"_map_tree.tre\")")

  quit = "q()"

  # * Z.
  # * SCRIPT
  model_script = scan(model_script, what="character", sep="\n", blank.lines.skip=FALSE)
  script = c(model_script, name_line, output_line, "\n\n",  mon_lines, "\n\n", mcmc_lines, "\n\n", trace, maptree,"\n\n", quit)

  return(script)
}

write.table(script, file="testing2.txt", sep="\n", quote=FALSE, col.names=FALSE, row.names=FALSE)
palautatan commented 6 years ago

Changing the MCMC(MC) functions.

addMCMC = function(model_script, analysis_name, output_folder, generations=10000, burnin=250, tuning=100) {

  # * 1.
  # * STARTERS
  # source_line = paste0("source(\"", model_script, "\")")
  name_line = paste0("name = \"", analysis_name, "\"")
  output_line = paste0("output = \"", output_folder, "/\"")

  # * 2.
  # * MONITORS
  mon_1 = paste0("monitors[1] = mnModel(filename = output + name + \"/\" + name + \"_posterior_samples.log\", printgen=10, separator=TAB)")
  mon_2 = paste0("monitors[2] = mnFile(filename = output + name + \"/\" + name + \"_tree_samples.trees\", printgen=10, separator=TAB, phylogeny)")
  mon_3 = "monitors[3] = mnScreen(printgen=100, TL)"
  mon_4 = paste0("monitors[4] = mnStochasticVariable(filename = output + name + \"/\" + name + \"_posterior_stoch.var\", separator=TAB)")
  mon_lines = c(mon_1, mon_2, mon_3, mon_4)

  # * 3.
  # * MCMC
  an_1 = "analysis = mcmc(my_model, monitors, moves)"
  burn_line = paste0("analysis.burnin(generations=", burnin, ", tuningInterval=", tuning,")")
  an_2 = "analysis.operatorSummary()"
  gen_line = paste0("analysis.run(generations=", generations, ")")
  mcmc_lines = c(an_1, burn_line, an_2, gen_line)

  # * 4.
  # * MAP TREE
  trace   = paste0("treetrace = readTreeTrace(output + name + \"/\" + name + \"_tree_samples.trees\", treetype=\"non-clock\")")
  maptree = paste0("map_tree = mapTree(treetrace, output + name + \"/\" + name + \"_map_tree.tre\")")

  quit = "q()"

  # * Z.
  # * SCRIPT
  model_script = scan(model_script, what="character", sep="\n", blank.lines.skip=FALSE)
  script = c(model_script, name_line, output_line, "\n\n",  mon_lines, "\n\n", mcmc_lines, "\n\n", trace, maptree,"\n\n", quit)

  return(script)
}

addMCMCMC = function(model_script, analysis_name, output_folder, nchains=4, deltaheat=0.2, generations=10000, burnin=250, tuning=100) {

  # * 1.
  # * STARTERS
  # source_line = paste0("source(\"", model_script, "\")")
  name_line = paste0("name = \"", analysis_name, "\"")
  output_line = paste0("output = \"", output_folder, "/\"")

  # * 2.
  # * MONITORS
  mon_1 = paste0("monitors[1] = mnModel(filename = output + name + \"/\" + name + \"_posterior_samples.log\", printgen=10, separator=TAB)")
  mon_2 = paste0("monitors[2] = mnFile(filename = output + name + \"/\" + name + \"_tree_samples.trees\", printgen=10, separator=TAB, phylogeny)")
  mon_3 = "monitors[3] = mnScreen(printgen=100, TL)"
  mon_4 = paste0("monitors[4] = mnStochasticVariable(filename = output + name + \"/\" + name + \"_posterior_stoch.var\", separator=TAB)")
  mon_lines = c(mon_1, mon_2, mon_3, mon_4)

  # * 3.
  # * MCMC
  an_1 = paste0("analysis = mcmcmc(my_model, monitors, moves, nchains=", nchains,", deltaHeat=",deltaheat,")")
  burn_line = paste0("analysis.burnin(generations=", burnin, ", tuningInterval=", tuning,")")
  an_2 = "analysis.operatorSummary()"
  gen_line = paste0("analysis.run(generations=", generations, ")")
  mcmc_lines = c(an_1, burn_line, an_2, gen_line)

  # * 4.
  # * MAP TREE
  trace   = paste0("treetrace = readTreeTrace(output + name + \"/\" + name + \"_tree_samples.trees\", treetype=\"non-clock\")")
  maptree = paste0("map_tree = mapTree(treetrace, output + name + \"/\" + name + \"_map_tree.tre\")")

  quit = "q()"

  # * Z.
  # * SCRIPT
  model_script = scan(model_script, what="character", sep="\n")
  script = c(model_script, name_line, output_line, mon_lines, mcmc_lines, trace, maptree, quit)

  return(script)
}
palautatan commented 6 years ago

To-Do

  1. Fix the descriptions for all the functions 2. Finish tutorial
  2. Check how JC would be shared via identical Q-matrices specified in dnPhyloCTMC()