Open palautatan opened 6 years ago
From S1115 testing.
setwd("/Users/EDIE/Downloads/S1115")
scheme_csv = "S1115.csv"
subsets_folder = "data/subsets/S1115"
# * PARTITIONED ANALYSIS
partitioned_model = function(scheme_csv, subsets_folder, name) {
# * 0.
# * BEGIN SCRIPT
script = c()
# * 1A.
# * READ IN SCHEME CSV
scheme = read.csv(scheme_csv, as.is=TRUE)
num_subsets = length(scheme[1,])-1
# * 2.
# * READ IN NEXUS FILES
subsets = colnames(scheme)[2:ncol(scheme)]
subsets = gsub("^X", "", subsets)
full_paths = paste0(subsets_folder, "/", subsets)
data_reading_lines = c()
for (ix in 1:length(subsets)) {
read_in_data = "data = readDiscreteCharacterData("
read_in_data = gsub("data", paste0("data_", subsets[ix]), read_in_data)
read_in_data = paste0(read_in_data,"\"", full_paths[ix], ".nex\")")
data_reading_lines = c(data_reading_lines, read_in_data)
}
script = c(script, data_reading_lines)
var_1 = strsplit(data_reading_lines[1], " =")[[1]][1]
taxa_line = paste0("taxa = ", var_1, ".taxa()")
ntax_line = paste0("num_taxa = ", var_1, ".ntaxa()")
branches = "num_branches = 2 * num_taxa - 3"
tree_stats = c(taxa_line, ntax_line, branches)
script = c(script, "\n", tree_stats)
# * 3.
# * MOVES
move_start = "move_index = 0"
script = c(script, move_start)
# * 4.
# * PARTITION VERIFICATION
if (ncol(scheme) > 2) {
partition = 1
} else {
stop("This is not a partitioned model.")
}
# * 5.
# * SUBSTITUTION MODELS
# * GO ROW WISE
models = as.character(scheme[1,2:ncol(scheme)])
# * IF THE RELATIVE RATES AND STATE FREQS ARE IN THE SAME PARTITIONS
if (all(as.numeric(scheme[2,2:ncol(scheme)]) == as.numeric(scheme[3,2:ncol(scheme)]))) {
# * GET THE PARTITIONS
mod_partitions = unique(as.numeric(scheme[2,2:ncol(scheme)]))
mod_partitions = mod_partitions[!mod_partitions==0]
# * FOR EACH PARTITION,
for (ix in mod_partitions) {
# * GET THE SUBSETS THAT ARE IN THE PARTITION
this_partition = which(scheme[2,]==ix)
# * MAKE SURE THEY'RE ALL THE SAME MODEL
same_mods = length(unique(models[this_partition]))==1
# * ASSIGN THE MODELS
if (same_mods) {
this_mod = models[this_partition[1]-1]
if (this_mod == "GTR") {
pi_1 = "pi ~ dnDirichlet(v(1,1,1,1))"
pi_2 = "moves[++move_index] = mvBetaSimplex(pi, weight=1.0)"
er_1 = "er ~ dnDirichlet(v(1,1,1,1,1,1))"
er_2 = "moves[++move_index] = mvBetaSimplex(er, weight=1.0)"
pi_1 = gsub("pi", paste0("pi_",ix), pi_1)
pi_2 = gsub("pi", paste0("pi_",ix), pi_2)
er_1 = gsub("er", paste0("er_",ix), er_1)
er_2 = gsub("er", paste0("er_",ix), er_2)
script = c(script, (c("\n", pi_1, pi_2, er_1, er_2)))
}
}
if (this_mod != "GTR") {
print(paste0("Need to write code to accomodate another model."))
}
}
}
# * DO SIMILAR STEPS FOR GAMMA PARTITIONS
gam_partitions = unique(as.numeric(scheme[4,2:ncol(scheme)]))
gam_partitions = gam_partitions[!gam_partitions==0]
for (ix in gam_partitions){
# this_partition = which(scheme[2,]==ix)
# scheme[4,this_partition]
gam_1 = "alpha ~ dnExponential(1)"
gam_2 = "moves[++move_index] = mvScale(alpha, weight=1.0)"
gam_3 = "site_rates := fnDiscretizeGamma(alpha, alpha, 4)"
gam_1 = gsub("alpha", paste0("alpha_",ix), gam_1)
gam_2 = gsub("alpha", paste0("alpha_",ix), gam_2)
gam_3 = gsub("alpha", paste0("alpha_",ix), gam_3)
gam_3 = gsub("site_rates", paste0("site_rates_",ix), gam_3)
script = c(script, c("\n", gam_1, gam_2, gam_3))
}
# * ALSO FOR PROPORTION OF INVARIANT SITES
pinv_partitions = unique(as.numeric(scheme[2,2:ncol(scheme)]))
pinv_partitions = pinv_partitions[!pinv_partitions==0]
for (ix in pinv_partitions){
# this_partition = which(scheme[2,]==ix)
# scheme[4,this_partition]
pinv_1 = "pinvar ~ dnBeta(1,1)"
pinv_2 = "moves[++move_index] = mvSlide(pinvar, weight=1.0)"
pinv_1 = gsub("pinvar", paste0("pinvar_",ix), pinv_1)
pinv_2 = gsub("pinvar", paste0("pinvar_",ix), pinv_2)
script = c(script, c("\n", pinv_1, pinv_2))
}
# * 5.5
# * MAKE THE Q-MATRICES
script = c(script, "\n\n")
subsets = colnames(scheme)[2:ncol(scheme)]
for (ix in 1:length(subsets)) {
# subsets[ix]
# * CHECK IF THEY'RE THE SAME
if (var(scheme[2:3,ix+1]) == 0) {
part_num = scheme[2:3,ix+1][1]
if (models[1] == "GTR") {
script = c(script, (paste0("Q_", subsets[ix], " := fnGTR(er_", part_num, ", pi_", part_num,")")))
}
}
}
# * 6.
# * BRANCH RATES
br_rate_lvs = unique(as.numeric(scheme[6,2:ncol(scheme)]))
if (var(as.numeric(scheme[6,2:ncol(scheme)]))!=0) {
br_rates = TRUE
script = c(script, "\n\n")
for (ix in br_rate_lvs) {
this_partition = which(scheme[6,]==ix)
sum = paste0("num_sites[", ix, "] = data_", paste0(subsets[this_partition-1], ".nchar()", collapse=" + "))
script = c(script, sum)
}
br_1 = paste0("relative_rates ~ dnDirichlet(v(", paste(rep(1, length(this_partition)), collapse=","), "))")
br_2 = "moves[++move_index] = mvBetaSimplex(relative_rates, weight=1.0)"
br_3 = "subset_rates := relative_rates * sum(num_sites) / num_sites"
script = c(script, c("\n", br_1, br_2, br_3))
}
# * TOPOLOGY AND BRANCH LENGTHS
top_1 = "topology ~ dnUniformTopology(taxa)"
top_2 = "moves[++move_index] = mvNNI(topology, weight=10.0)"
top_3 = "moves[++move_index] = mvSPR(topology, weight=10.0)"
br_lens_1 = "for(i in 1:num_branches){"
br_lens_2 = " br_lens[i] ~ dnExponential(10.0)"
br_lens_3 = " moves[++move_index] = mvScale(br_lens[i], weight=1.0)"
br_lens_4 = "}"
br_lens_5 = "TL := sum(br_lens)"
phylogeny = "phylogeny := treeAssembly(topology, br_lens)"
phylo_lines = c(top_1, top_2, top_3, br_lens_1, br_lens_2, br_lens_3, br_lens_4, br_lens_5, phylogeny)
script = c(script, "\n", phylo_lines)
# * 7.
# * LIKELIHOOD FUNCTIONS
for (ix in 2:ncol(scheme)) {
lf_1 = paste0("seq_", subsets[ix-1], " ~ dnPhyloCTMC(tree=phylogeny, Q=Q_", subsets[ix-1], ")")
lf_2 = paste0("seq_", subsets[ix-1], ".clamp(data_", subsets[ix-1], ")")
# * SITE RATES
if (scheme[4,ix] != 0) {
lf_1 = gsub(")$", paste0(", siteRates=site_rates_", scheme[4,ix], ")"), lf_1)
}
# * INVARIANT SITES
if (scheme[5,ix] != 0) {
lf_1 = gsub(")$", paste0(", pInv=pinv_", scheme[5,ix], ")"), lf_1)
}
# * RATE MULTIPLIER
if (br_rates) {
if (scheme[6,ix] != 0) {
lf_1 = gsub(")$", paste0(", branchRates=subset_rates[", scheme[6,ix], "])"), lf_1)
}
}
script = c(script, c("\n", lf_1, lf_2))
}
# * 8.
# * MODEL
model = "my_model = model(phylogeny)"
script = c(script, "\n", model)
# * 9.
# * SCRIPT
return(script)
}
write.table(script, file="testing.txt", sep="\n", quote=FALSE, col.names=FALSE, row.names=FALSE)
model_script = "testing.txt"
analysis_name = "wtf"
output_folder = "wtf2"
addMCMC = function(model_script, analysis_name, output_folder, generations=10000, burnin=250, tuning=100) {
# * 1.
# * STARTERS
# source_line = paste0("source(\"", model_script, "\")")
name_line = paste0("name = \"", analysis_name, "\"")
output_line = paste0("output = \"", output_folder, "/\"")
# * 2.
# * MONITORS
mon_1 = paste0("monitors[1] = mnModel(filename = output + name + \"/\" + name + \"_posterior_samples.log\", printgen=10, separator=TAB)")
mon_2 = paste0("monitors[2] = mnFile(filename = output + name + \"/\" + name + \"_tree_samples.trees\", printgen=10, separator=TAB, phylogeny)")
mon_3 = "monitors[3] = mnScreen(printgen=100, TL)"
mon_4 = paste0("monitors[4] = mnStochasticVariable(filename = output + name + \"/\" + name + \"_posterior_stoch.var\", separator=TAB)")
mon_lines = c(mon_1, mon_2, mon_3, mon_4)
# * 3.
# * MCMC
an_1 = "analysis = mcmc(my_model, monitors, moves)"
burn_line = paste0("analysis.burnin(generations=", burnin, ", tuningInterval=", tuning,")")
an_2 = "analysis.operatorSummary()"
gen_line = paste0("analysis.run(generations=", generations, ")")
mcmc_lines = c(an_1, burn_line, an_2, gen_line)
# * 4.
# * MAP TREE
trace = paste0("treetrace = readTreeTrace(output + name + \"/\" + name + \"_tree_samples.trees\", treetype=\"non-clock\")")
maptree = paste0("map_tree = mapTree(treetrace, output + name + \"/\" + name + \"_map_tree.tre\")")
quit = "q()"
# * Z.
# * SCRIPT
model_script = scan(model_script, what="character", sep="\n", blank.lines.skip=FALSE)
script = c(model_script, name_line, output_line, "\n\n", mon_lines, "\n\n", mcmc_lines, "\n\n", trace, maptree,"\n\n", quit)
return(script)
}
write.table(script, file="testing2.txt", sep="\n", quote=FALSE, col.names=FALSE, row.names=FALSE)
Changing the MCMC(MC) functions.
addMCMC = function(model_script, analysis_name, output_folder, generations=10000, burnin=250, tuning=100) {
# * 1.
# * STARTERS
# source_line = paste0("source(\"", model_script, "\")")
name_line = paste0("name = \"", analysis_name, "\"")
output_line = paste0("output = \"", output_folder, "/\"")
# * 2.
# * MONITORS
mon_1 = paste0("monitors[1] = mnModel(filename = output + name + \"/\" + name + \"_posterior_samples.log\", printgen=10, separator=TAB)")
mon_2 = paste0("monitors[2] = mnFile(filename = output + name + \"/\" + name + \"_tree_samples.trees\", printgen=10, separator=TAB, phylogeny)")
mon_3 = "monitors[3] = mnScreen(printgen=100, TL)"
mon_4 = paste0("monitors[4] = mnStochasticVariable(filename = output + name + \"/\" + name + \"_posterior_stoch.var\", separator=TAB)")
mon_lines = c(mon_1, mon_2, mon_3, mon_4)
# * 3.
# * MCMC
an_1 = "analysis = mcmc(my_model, monitors, moves)"
burn_line = paste0("analysis.burnin(generations=", burnin, ", tuningInterval=", tuning,")")
an_2 = "analysis.operatorSummary()"
gen_line = paste0("analysis.run(generations=", generations, ")")
mcmc_lines = c(an_1, burn_line, an_2, gen_line)
# * 4.
# * MAP TREE
trace = paste0("treetrace = readTreeTrace(output + name + \"/\" + name + \"_tree_samples.trees\", treetype=\"non-clock\")")
maptree = paste0("map_tree = mapTree(treetrace, output + name + \"/\" + name + \"_map_tree.tre\")")
quit = "q()"
# * Z.
# * SCRIPT
model_script = scan(model_script, what="character", sep="\n", blank.lines.skip=FALSE)
script = c(model_script, name_line, output_line, "\n\n", mon_lines, "\n\n", mcmc_lines, "\n\n", trace, maptree,"\n\n", quit)
return(script)
}
addMCMCMC = function(model_script, analysis_name, output_folder, nchains=4, deltaheat=0.2, generations=10000, burnin=250, tuning=100) {
# * 1.
# * STARTERS
# source_line = paste0("source(\"", model_script, "\")")
name_line = paste0("name = \"", analysis_name, "\"")
output_line = paste0("output = \"", output_folder, "/\"")
# * 2.
# * MONITORS
mon_1 = paste0("monitors[1] = mnModel(filename = output + name + \"/\" + name + \"_posterior_samples.log\", printgen=10, separator=TAB)")
mon_2 = paste0("monitors[2] = mnFile(filename = output + name + \"/\" + name + \"_tree_samples.trees\", printgen=10, separator=TAB, phylogeny)")
mon_3 = "monitors[3] = mnScreen(printgen=100, TL)"
mon_4 = paste0("monitors[4] = mnStochasticVariable(filename = output + name + \"/\" + name + \"_posterior_stoch.var\", separator=TAB)")
mon_lines = c(mon_1, mon_2, mon_3, mon_4)
# * 3.
# * MCMC
an_1 = paste0("analysis = mcmcmc(my_model, monitors, moves, nchains=", nchains,", deltaHeat=",deltaheat,")")
burn_line = paste0("analysis.burnin(generations=", burnin, ", tuningInterval=", tuning,")")
an_2 = "analysis.operatorSummary()"
gen_line = paste0("analysis.run(generations=", generations, ")")
mcmc_lines = c(an_1, burn_line, an_2, gen_line)
# * 4.
# * MAP TREE
trace = paste0("treetrace = readTreeTrace(output + name + \"/\" + name + \"_tree_samples.trees\", treetype=\"non-clock\")")
maptree = paste0("map_tree = mapTree(treetrace, output + name + \"/\" + name + \"_map_tree.tre\")")
quit = "q()"
# * Z.
# * SCRIPT
model_script = scan(model_script, what="character", sep="\n")
script = c(model_script, name_line, output_line, mon_lines, mcmc_lines, trace, maptree, quit)
return(script)
}
dnPhyloCTMC()
Prepare the mrRevBayes R Package
addMCMCMC()
to read in scripts correctly.R
files for each function and library folder