Open yiberkeley opened 1 year ago
Below is the changed simulation.r:
# Assemble the list of DAG-related attributes that gets attached to a simulated
# dataset: the DAG itself, the parent map, the data format (wide/long), the
# LTCF flag, the time points used, the network-index class, plus the
# node/action/attribute metadata stored on the DAG object.
FormAttributes <- function(DAG, parents, dataform, LTCF, tvals, netind_cl) {
  out.attrs <- list(
    DAG       = DAG,
    parents   = parents,
    dataform  = dataform,
    LTCF      = LTCF,
    tvals     = tvals,
    netind_cl = netind_cl,
    # node names pulled from the "name" entry of each DAG node:
    node_nms  = sapply(N(DAG), '[[', "name"),
    actnodes  = attr(DAG, "actnodes"), # the action nodes (if defined)
    acttimes  = attr(DAG, "acttimes"), # action time points (if defined)
    attnames  = attr(DAG, "attnames"), # the attribute names (if defined)
    attrs     = attr(DAG, "attrs")     # the attribute values (if defined)
  )
  out.attrs
}
# copy all DAG related attributes from a full data.frame attribute list (skipping row names and such)
# can modify some of the attributes, s.a., dataform, tvals & LTCF if provided
# Copy all DAG-related attributes from a full data.frame attribute list
# (skipping row names and such). The dataform, tvals & LTCF entries can be
# overridden by supplying non-NULL arguments; everything else is copied as-is.
CopyAttributes <- function(attrslist, dataform=NULL, tvals=NULL, LTCF=NULL) {
  # helper: use the override when supplied, otherwise fall back to the stored value
  pick <- function(override, key) {
    if (!is.null(override)) override else attrslist[[key]]
  }
  copied <- list(
    DAG      = attrslist[["DAG"]],
    parents  = attrslist[["parents"]],
    dataform = pick(dataform, "dataform"),
    LTCF     = pick(LTCF, "LTCF"),
    tvals    = pick(tvals, "tvals"),
    node_nms = attrslist[["node_nms"]],
    actnodes = attrslist[["actnodes"]],
    acttimes = attrslist[["acttimes"]],
    attnames = attrslist[["attnames"]],
    attrs    = attrslist[["attrs"]],
    target   = attrslist[["target"]]
  )
  copied
}
# Calculate the actual t values used in the simulated data (wide format).
# Column names are assumed to follow the "Var_t" convention; the leading ID
# column is skipped, names without a numeric "_t" suffix produce NA and are
# dropped.
# BUG FIX: the original ended with an assignment (`all_ts <- sort(...)`),
# which returns its value *invisibly*; the result is now returned explicitly.
get_allts <- function(obs.df) {
  node_nms <- names(obs.df)[-1]  # drop the leading ID column
  # "Var_t" -> "t"; take the 2nd piece of each underscore-split name
  all_ts <- as.integer(unique(sapply(strsplit(node_nms, "_"), '[', 2)))
  sort(all_ts[!is.na(all_ts)])  # remove NA (non-time-varying nodes) and sort
}
# Grab generic node names (without time points) from simulated data (wide
# format): "Var_t" -> "Var", skipping the leading ID column.
# BUG FIX: the original ended with an assignment (`gennames <- ...`), which
# returns its value *invisibly*; the result is now returned explicitly.
get_gennames <- function(obs.df) {
  node_nms <- names(obs.df)[-1]  # drop the leading ID column
  # take the 1st piece of each underscore-split name, de-duplicated in order
  as.character(unique(sapply(strsplit(node_nms, "_"), '[', 1)))
}
# given names of actions, extract the corresponding DAG.action objects from DAG
# Given names of actions, extract the corresponding DAG.action objects from DAG.
# Each requested name must match exactly one action previously registered with
# DAG + action; otherwise an informative error is raised.
# BUG FIX: the original ended with an assignment (`actions <- ...`), which
# returns its value *invisibly*; the result is now returned explicitly.
getactions <- function(DAG, actions) {
  if (!is.character(actions)) stop("actions must be a character vector of action names")
  # check the actions actually exist in the DAG, then grab them
  all_action_DAGs <- A(DAG)
  if (length(all_action_DAGs) < 1) stop("DAG objects contains no actions, define actions with DAG+action before simulating action-specific data...")
  all_action_names <- sapply(all_action_DAGs, function(DAG) attr(DAG, "actname"))
  # verify that each action name in actions has exactly 1 unique match
  action_idx_sel <- integer(0)
  for (action in actions) {
    action_idx <- which(all_action_names %in% action)
    if (length(action_idx) < 1) {
      stop("Couldn't locate action: "%+% action %+% " , first define action by adding it to the DAG object with DAG+action")
    } else if (length(action_idx) > 1) {
      stop("Found more than one action matching action name: " %+% action)
    }
    action_idx_sel <- c(action_idx_sel, action_idx)
  }
  # explicit (visible) return of the selected DAG.action objects
  all_action_DAGs[action_idx_sel]
}
# Internal function for simulation data from any DAG
# If prev.data is not NULL all the nodes in DAG will be evaluated in the environment of prev.data alone
# Internal function for simulating data from any DAG.
# If prev.data is not NULL all the nodes in DAG will be evaluated in the
# environment of prev.data alone.
# oldNet / oldDat: when oldNet is non-NULL, the network structure stored in
# attributes(oldDat)$netind_cl is re-used instead of re-sampling the network
# node (presumably to hold the network fixed across repeated simulations --
# TODO confirm intended usage with the caller).
# Returns a wide-format data.frame (or long format when wide=FALSE and more
# than one time point is present) with DAG attributes attached.
simFromDAG <- function(DAG, Nsamp, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, prev.data = NULL, verbose = getOption("simcausal.verbose"), oldNet = NULL, oldDat = NULL) {
  assertthat::assert_that(assertthat::is.count(Nsamp) || as.integer(Nsamp)==0L)
  if (!is.null(rndseed)) {
    set.seed(rndseed)
  }
  if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")
  # SAMPLING FROM ARBITRARY NODE DISTR FUN:
  # Evaluates the node's distribution parameters, locates the distribution
  # function, samples, and expands the sample back to Nsamp rows (NA for
  # observations past their end-of-follow-up).
  sampleNodeDistr <- function(newNodeParams, distr, EFUP.prev, cur.node, expr_str, asis.samp = FALSE) {
    N_notNA_samp <- sum(!EFUP.prev)  # number of observations not yet past EFU
    user.env <- cur.node$node.env
    asis.flags <- attr(newNodeParams$dist_params, "asis.flags")
    # Bring one distribution parameter to standard form:
    # length-1 vector -> replicate to N_notNA_samp; length-Nsamp -> subset to
    # non-EFU rows; list -> per-element check then cbind into a matrix;
    # matrix -> subset rows; anything else is passed through with a message.
    standardize_param <- function(distparam) {
      check_len <- function(param) {
        len <- length(param)
        if (len==1) {
          param <- rep.int(param, N_notNA_samp)
        } else if (len==Nsamp) {
          param <- param[!EFUP.prev]
        } else {
          # non-standard length: pass through unchanged, let the distribution fun deal with it
          if (verbose) message("evaluating node "%+% cur.node$name %+%" expression(s): "%+%expr_str%+%
            ".\n One of the distribution parameters evaluated to non-standard vector length (its neither 1 nor n), make sure the distribution function knows how to handle it.")
        }
        param
      }
      if (is(distparam,"list")) {
        distparam <- lapply(distparam, check_len)
        distparam <- do.call("cbind", distparam)
      } else if (is.vector(distparam)) {
        distparam <- check_len(distparam)
      } else if (is.matrix(distparam)) {
        if (nrow(distparam)==1) {
          # mtx with 1 row: left as-is (row replication intentionally disabled)
        } else if (nrow(distparam)==Nsamp) { # mtx with Nsamp rows -> subset mtx[!EFUP.prev,]
          distparam <- distparam[!EFUP.prev, , drop=FALSE]
        } else {
          if (verbose) message("evaluating node "%+% cur.node$name %+%
            " expression(s): One of the distribution parameters evaluated to a matrix of non-standard dimensions (its neither 1 nor n rows), make sure the distribution function knows how to handle it.")
        }
      } else {
        # unknown/unsupported parameter type: pass through unchanged
        if (verbose) message("The node " %+% cur.node$name %+% " expression(s): "%+%expr_str%+%
          ".\n One of the formulas evaluated to unknown data type. Make sure the distribution function knowns how to handle it...")
      }
      distparam
    }
    # Locate the distribution function:
    # 1) search path / package namespace; 2) the user's calling environment;
    # 3) explicit "pkg::fun" lookup; otherwise raise an error.
    if (is.null(distr.fun <- get0(distr, mode = "function"))) {
      if (!is.null(distr.fun <- get0(distr, envir = user.env, mode = "function"))) {
        if (verbose) message(distr %+% ": note this distribution could not be located in package namespace, simulating from user-defined distribution found under the same name")
      } else {
        namespace_distr <- unlist(strsplit(distr, "::"))
        # BUG FIX: this was tryCatch() with *no handlers*, which catches
        # nothing -- a lookup failure propagated as a raw error and the
        # "try-error" check below was unreachable. try(..., silent = TRUE)
        # actually captures the error so the informative stop() can fire.
        distr.fun <- try(getFromNamespace(x = namespace_distr[2], ns = namespace_distr[1], envir = user.env), silent = TRUE)
        if (inherits(distr.fun, "try-error")) stop(distr %+% ": this node distribution function could not be located")
      }
    }
    # Go over distribution parameters and standardize each (consistent length;
    # list parameters become matrices):
    dprint("before standardize newNodeParams$dist_params:"); dprint(newNodeParams$dist_params)
    if (!asis.samp) {
      for (idx in seq_along(newNodeParams$dist_params)) {
        # as-is flagged params are left alone unless they are full-length vectors
        if (!asis.flags[[idx]] || (is.vector(newNodeParams$dist_params[[idx]]) && length(newNodeParams$dist_params[[idx]]) == Nsamp))
          newNodeParams$dist_params[[idx]] <- standardize_param(newNodeParams$dist_params[[idx]])
      }
    }
    dprint("after standardize newNodeParams$dist_params:"); dprint(newNodeParams$dist_params)
    # Add the sample size n as the first argument and call the distribution function:
    newNodeParams$dist_params <- append(list(n=N_notNA_samp), newNodeParams$dist_params)
    newVar_Samp <- try(do.call(distr.fun, newNodeParams$dist_params))
    if (inherits(newVar_Samp, "try-error")) {
      stop("failed while sampling node '" %+% cur.node$name %+% "', from distribution '" %+% distr %+% "'")
    }
    if (asis.samp) {
      # return the sampled object untouched (used for network sampling, where
      # the output is a matrix of network indices)
      return(newVar_Samp)
    } else if (is.null(dim(newVar_Samp)) || is.vector(newVar_Samp)) {
      # output is a vector: expand back to length Nsamp, NA for EFU observations
      newVar <- rep.int(NA, Nsamp)
      if (is.factor(newVar_Samp)) {
        # preserve factor levels by converting through the numeric level values
        newVar[!EFUP.prev] <- as.numeric(levels(newVar_Samp))[newVar_Samp]
        newVar <- factor(newVar)
      } else {
        newVar[!EFUP.prev] <- newVar_Samp
      }
      return(newVar)
    } else if (is.matrix(newVar_Samp)) {
      # output is a matrix (multivariate node) -> column count must match mv.names
      if (ncol(newVar_Samp)!=length(cur.node$mv.names)) stop("dimensionality (number of columns) sampled by multivariate node is not equal to length of `name` argument for node: " %+% cur.node$name)
      colnames(newVar_Samp) <- cur.node$mv.names
      return(newVar_Samp)
    } else {
      stop("unrecognized sampled variable output type, only vectors or matrices are allowed, node: " %+% cur.node$name)
    }
  }
  #---------------------------------------------------------------------------------
  # Iteratively create observed DAG nodes (wide format data.frame of Nsamp observations)
  #---------------------------------------------------------------------------------
  EFUP.prev <- rep(FALSE, Nsamp)  # per-observation flag: past end-of-follow-up
  LTCF.prev <- rep(FALSE, Nsamp)  # per-observation flag: marked for carry-forward
  # seq_len(Nsamp) replaces the original seq(vector(length = Nsamp)) -- same
  # result (1..Nsamp, empty for Nsamp=0), but idiomatic and direct
  obs.df <- data.frame(ID = seq_len(Nsamp))
  obs.dt <- data.table(obs.df)
  # for efficiency, allocate columns in DT to maximum number of variables, if it exceeds 1025 (default)
  DT.cols <- (length(DAG)+1)
  if (DT.cols > 1025) {
    alloc.col(obs.dt, DT.cols)
  }
  #---------------------------------------------------------------------------------
  # CHECKS PERFORMED DURING EVALUATION:
  # *) (formulas) check all formulas are evaluable R expressions (try(eval))
  # *) (formulas) check each cur.node formula references node names that already
  #    exist, and all node names must have order < cur.node$order
  #---------------------------------------------------------------------------------
  newVar <- vector(length = Nsamp)
  NvarsPerNode <- sapply(DAG, function(node) length(node[["mv.names"]]))
  totalNvars <- sum(NvarsPerNode)
  MVvars <- unlist(lapply(DAG, function(node) node[["mv.names"]]))
  # map each individual (possibly multivariate) variable name to the DAG node
  # that samples it, so parents can be reported by node name
  MVvarMapNode <- NULL
  for (node in DAG) {
    for (mv.name in node[["mv.names"]]) MVvarMapNode <- append(MVvarMapNode, list(list(MVname = mv.name, Node = node$name)))
  }
  names(MVvarMapNode) <- MVvars
  NodeParentsNms <- vector("list", length = length(DAG)) # parents names for each node
  names(NodeParentsNms) <- names(DAG)
  # initialize the node formula evaluator class (Define_sVar):
  node_evaluator <- Define_sVar$new()
  t_pts <- NULL
  netind_cl <- NULL
  for (cur.node in DAG) {
    t <- cur.node$t # current time point t
    t_new <- !(t %in% t_pts) # TRUE when a switch to a new time point t occurs
    t_pts <- c(t_pts, t)
    t_pts <- as.integer(unique(t_pts)) # vector of unique time points t seen so far
    gnodename <- as.character(unlist(strsplit(cur.node$name, "_"))[1]) # generic name, "Var_t" -> "Var"
    #------------------------------------------------------------------------
    # EVALUATE NODE DISTRIBUTION PARAMS PROVIDED IN NODE FORMULA(S)
    #------------------------------------------------------------------------
    eval_expr_res <- node_evaluator$eval.nodeforms(cur.node = cur.node, data.df = if (!is.null(prev.data)) prev.data else obs.dt)
    par.names <- unique(unlist(lapply(eval_expr_res, '[[', "par.nodes")))
    eval_dist_params <- lapply(eval_expr_res, '[[' ,"evaled_expr")
    attr(eval_dist_params, "asis.flags") <- attr(cur.node$dist_params, "asis.flags")
    newNodeParams <- list(dist_params = eval_dist_params, par.names = par.names)
    #------------------------------------------------------------------------
    # SAMPLE NEW COVARIATE BASED ON DISTR PARAMS
    #------------------------------------------------------------------------
    # reset the seed if the current node matches argument rndseed.reset.node
    if (!is.null(rndseed) && !is.null(rndseed.reset.node)) {
      if (rndseed.reset.node %in% cur.node$name) set.seed(NULL)
    }
    distr <- cur.node$distr
    if (is.null(oldNet)) {
      if ("DAG.net" %in% class(cur.node)) {
        # network node: (re-)sample the network index matrix
        if (!is.null(netind_cl) && verbose) message("Network is being re-sampled during the same simulation!")
        message("1,Network is being re-sampled ")
        distr <- cur.node$netfun
        # as-is sampling is used because the network sampler returns a matrix;
        # a cleaner design would let distr return a vector or matrix directly
        NetIndMat <- sampleNodeDistr(newNodeParams = newNodeParams, distr = distr, EFUP.prev = EFUP.prev,
                                     cur.node = cur.node, expr_str = cur.node$dist_params, asis.samp = TRUE)
        cur.node$Kmax <- ncol(NetIndMat)
        netind_cl <- NetIndClass$new(nobs = Nsamp, Kmax = cur.node$Kmax)
        netind_cl$NetInd <- NetIndMat
        node_evaluator$set.net(netind_cl) # set the network for the evaluator
        newVar <- NULL  # network nodes produce no data column
        NodeParentsNms <- NodeParentsNms[-which(names(NodeParentsNms) %in% cur.node$name)]
      } else {
        # regular node: map parent variable names to node names, then sample
        if (!is.null(newNodeParams$par.names)) {
          for (idx in seq_along(newNodeParams$par.names)) {
            choose_node_idx <- names(MVvarMapNode) %in% newNodeParams$par.names[idx]
            if (sum(choose_node_idx)>0) newNodeParams$par.names[idx] <- MVvarMapNode[choose_node_idx][[1]][["Node"]]
          }
          NodeParentsNms[[cur.node$name]] <- c(NodeParentsNms[[cur.node$name]], newNodeParams$par.names)
        }
        newVar <- sampleNodeDistr(newNodeParams = newNodeParams, distr = distr, EFUP.prev = EFUP.prev,
                                  cur.node = cur.node, expr_str = cur.node$dist_params)
      }
    } else {
      if ("DAG.net" %in% class(cur.node)) {
        # re-use the network structure previously stored on oldDat's attributes
        message("Using the old network structure, but just generating covariate")
        distr <- cur.node$netfun
        NetIndMat <- attributes(oldDat)$netind_cl$NetInd
        cur.node$Kmax <- attributes(oldDat)$netind_cl$Kmax
        netind_cl <- attributes(oldDat)$netind_cl
        netind_cl$NetInd <- NetIndMat
        node_evaluator$set.net(netind_cl) # set the network for the evaluator
        newVar <- NULL
        NodeParentsNms <- NodeParentsNms[-which(names(NodeParentsNms) %in% cur.node$name)]
      } else {
        if (!is.null(newNodeParams$par.names)) {
          for (idx in seq_along(newNodeParams$par.names)) {
            choose_node_idx <- names(MVvarMapNode) %in% newNodeParams$par.names[idx]
            if (sum(choose_node_idx)>0) newNodeParams$par.names[idx] <- MVvarMapNode[choose_node_idx][[1]][["Node"]]
          }
          NodeParentsNms[[cur.node$name]] <- c(NodeParentsNms[[cur.node$name]], newNodeParams$par.names)
        }
        newVar <- sampleNodeDistr(newNodeParams = newNodeParams, distr = distr, EFUP.prev = EFUP.prev,
                                  cur.node = cur.node, expr_str = cur.node$dist_params)
      }
    }
    if (!is.null(newVar)) {
      #------------------------------------------------------------------------
      # LTCF is a generic name for a failure/censoring node; after failure
      # occurred (node=1), carry forward (impute) all future covariate values
      #------------------------------------------------------------------------
      if (!is.null(t)) { # time points were specified by the user
        # current t is past the first time-point & LTCF imputation is requested
        if ((sum(LTCF.prev) > 0) & (t > t_pts[1]) & !is.null(LTCF)) {
          # Carry last observation forward for those who reached EFUP
          prevtVarnm <- gnodename %+% "_" %+% (t-1)
          # the variable must exist at the previous time point, otherwise assign NA
          if (!any(names(obs.dt)%in%prevtVarnm)) {
            warning(gnodename%+%": is undefined at t="%+%(t-1)%+%", hence cannot be carried forward to t="%+%t)
            newVar[LTCF.prev] <- NA
          } else {
            newVar[LTCF.prev] <- obs.dt[LTCF.prev, ][[(gnodename %+% "_" %+% (t-1))]]
          }
        }
      }
      #------------------------------------------------------------------------
      # modify the observed data by adding the newly sampled covariate(s)
      #------------------------------------------------------------------------
      if (is.matrix(newVar)) {
        # multivariate node: one data.table column per mv.name
        for (col in cur.node$mv.names) {
          obs.dt[!EFUP.prev, (col) := newVar[, col]]
        }
      } else {
        obs.dt[, (cur.node$name) := newVar]
      }
      if (!is.null(cur.node$EFU)) {
        # EFU is evaluated separately for each observation; only where it
        # evaluates to TRUE does the node act as a censoring variable
        EFU.flag <- node_evaluator$eval.EFU(cur.node = cur.node, data.df = if (!is.null(prev.data)) prev.data else obs.dt)
        EFU.TRUE <- EFU.flag %in% TRUE
        # check for censoring only when at least one observation had EFU=TRUE
        if (sum(EFU.TRUE)>0) {
          EFUP.now <- (obs.dt[[cur.node$name[1]]]%in%1) & (EFU.TRUE)
          EFUP.prev <- (EFUP.prev | EFUP.now)
          # is this node the one to be carried forward (LTCF node)? then mark
          # the newly-failed observations for carry-forward imputation
          if ((!is.null(LTCF)) && (LTCF%in%gnodename[1]) && is.null(prev.data)) {
            LTCF.prev <- (LTCF.prev | EFUP.now)
          }
        }
      }
    }
  }
  # Collect all attributes to be assigned to the obs data
  all_ts <- get_allts(obs.dt) # actual time values used in the simulated data
  # only set the LTCF attribute if carry-forward imputation was actually
  # performed for at least one observation
  if (sum(LTCF.prev)>0) {
    LTCF_flag <- LTCF
  } else {
    LTCF_flag <- NULL
  }
  # -----------------------------------------------------------------------------------
  # Remove latent variables from the simulated dataset (if any were declared as latent)
  # -----------------------------------------------------------------------------------
  latent.v <- attr(DAG, "latent.v")
  if (!is.null(latent.v)) {
    excl.cols <- colnames(obs.dt)%in%latent.v
    if (sum(excl.cols) > 0) {
      obs.dt[,(colnames(obs.dt)[excl.cols]) := NULL]
    }
  }
  newattrs <- FormAttributes(DAG = DAG, parents = NodeParentsNms, dataform = "wide", LTCF = LTCF_flag, tvals = all_ts, netind_cl = netind_cl)
  for (attr_name in names(newattrs)) {
    setattr(x = obs.dt, name = attr_name, value = newattrs[[attr_name]])
  }
  if (!wide) {
    # only perform conversion when there is more than one t in the simulated data
    if (length(all_ts)>1) {
      obs.dt <- DF.to.longDT(obs.dt)
    } else {
      warning("Simulated data returned in wide format. Can't convert to long format, since only one time-point was detected.")
    }
  }
  class(obs.dt) <- "data.frame"
  # reset the seed to NULL if it was previously set
  if (!is.null(rndseed)) {
    set.seed(NULL)
  }
  return(obs.dt)
}
#' Simulate Observed Data
#'
#' This function simulates observed data from a DAG object.
#' @param DAG A DAG objects that has been locked with set.DAG(DAG). Observed data from this DAG will be simulated.
#' @param n Number of observations to sample.
#' @param wide A logical, if TRUE the output data is generated in wide format, if FALSE, the output longitudinal data is generated in long format
#' @param LTCF If forward imputation is desired for the missing variable values, this argument should be set to the name of the node that indicates the end of follow-up event. See the vignette, \code{\link{sim}} and \code{\link{doLTCF}} for additional details.
#' @param rndseed Seed for the random number generator.
#' @param rndseed.reset.node When \code{rndseed} is specified, use this argument to specify the name of the \code{DAG} node at which the random number generator seed is reset back to \code{NULL} (simulation function will call \code{set.seed(NULL)}).
#' Can be useful if one wishes to simulate data using the set seed \code{rndseed} only for the first K nodes of the DAG and use an entirely random sample when simulating the rest of the nodes starting at K+1 and on.
#' The name of such (K+1)th order \code{DAG} node should be then specified with this argument.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console.
#' Turn this off by default using options(simcausal.verbose=FALSE).
#' @return A \code{data.frame} where each column is sampled from the conditional distribution specified by the corresponding \code{DAG} object node.
#' @family simulation functions
#' @seealso \code{\link{simfull}} - a wrapper function for simulating full data only; \code{\link{sim}} - a wrapper function for simulating both types of data; \code{\link{doLTCF}} for forward imputation of the missing values in already simulated data; \code{\link{DF.to.long}}, \code{\link{DF.to.longDT}} - converting longitudinal data from wide to long formats.
#' @export
simobs <- function(DAG, n, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, verbose = getOption("simcausal.verbose"), oldNet = NULL, oldDat = NULL) {
  # Validate the input DAG, then delegate to the internal simulation engine.
  if (!is.DAG(DAG)) {
    stop("DAG argument must be an object of class DAG")
  }
  simFromDAG(
    DAG = DAG, Nsamp = n, wide = wide, LTCF = LTCF,
    rndseed = rndseed, rndseed.reset.node = rndseed.reset.node,
    verbose = verbose, oldNet = oldNet, oldDat = oldDat
  )
}
#' Simulate Full Data (From Action DAG(s))
#'
#' This function simulates full data based on a list of intervention DAGs, returning a list of \code{data.frame}s.
#' @param actions Actions specifying the counterfactual DAG. This argument must be either an object of class DAG.action or a list of DAG.action objects.
#' @param n Number of observations to sample.
#' @param wide A logical, if TRUE the output data is generated in wide format, if FALSE, the output longitudinal data is generated in long format
#' @param LTCF If forward imputation is desired for the missing variable values, this argument should be set to the name of the node that indicates the end of follow-up event. See the vignette, \code{\link{sim}} and \code{\link{doLTCF}} for additional details.
#' @param rndseed Seed for the random number generator.
#' @param rndseed.reset.node When \code{rndseed} is specified, use this argument to specify the name of the \code{DAG} node at which the random number generator seed is reset back to \code{NULL} (simulation function will call \code{set.seed(NULL)}).
#' Can be useful if one wishes to simulate data using the set seed \code{rndseed} only for the first K nodes of the DAG and use an entirely random sample when simulating the rest of the nodes starting at K+1 and on.
#' The name of such (K+1)th order \code{DAG} node should be then specified with this argument.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console.
#' Turn this off by default using options(simcausal.verbose=FALSE).
#' @return A named list, each item is a \code{data.frame} corresponding to an action specified by the actions argument, action names are used for naming these list items.
#' @family simulation functions
#' @seealso \code{\link{simobs}} - a wrapper function for simulating observed data only; \code{\link{sim}} - a wrapper function for simulating both types of data; \code{\link{doLTCF}} for forward imputation of the missing values in already simulated data; \code{\link{DF.to.long}}, \code{\link{DF.to.longDT}} - converting longitudinal data from wide to long formats.
#' @export
simfull <- function(actions, n, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, verbose = getOption("simcausal.verbose"), oldNet = NULL, oldDat = NULL) {
  if (!is.null(rndseed)) {
    set.seed(rndseed)
  }
  # normalize `actions` into a named list of DAG.action objects
  if (("DAG.action" %in% class(actions))) {
    actname <- get.actname(actions)
    actions <- list(actions)
    names(actions) <- actname
  } else if ("list" %in% class(actions)) {
    checkDAGs <- sapply(actions, is.DAG.action)
    if (!all(checkDAGs)) stop("argument 'actions': all elements of the list must be DAG.action objects")
  } else {
    stop("argument 'actions' must be an object of class 'DAG.action' or a list of 'DAG.action' objects")
  }
  # pre-allocated result list: one data.frame per intervention
  fulldf.list <- vector("list", length(actions))
  # BUG FIX: was `for (idx_act in c(1:length(actions)))`, which iterates over
  # c(1, 0) when `actions` is an empty list; seq_along() handles that safely.
  # Also avoids growing the result list with c() on every iteration.
  for (idx_act in seq_along(actions)) { # loop over each action/intervention/regimen DAG
    actname <- names(actions)[idx_act]
    actDAG <- actions[[idx_act]]
    fulldf <- try(simFromDAG(DAG=actDAG, Nsamp=n, wide=wide, LTCF=LTCF, rndseed=rndseed, rndseed.reset.node=rndseed.reset.node, verbose=verbose, oldNet = oldNet, oldDat=oldDat))
    if (inherits(fulldf, "try-error")) {
      stop("simulating full data for action '"%+%actname%+%"' resulted in error...", call. = FALSE)
    }
    fulldf.list[[idx_act]] <- fulldf
  }
  if (length(actions)!=length(fulldf.list)) stop("simfull(): couldn't create separate df for each action DAG")
  names(fulldf.list) <- names(actions)
  return(fulldf.list)
}
#' Simulate Observed or Full Data from \code{DAG} Object
#'
#' This function simulates full data based on a list of intervention DAGs, returning a list of \code{data.frame}s. See the vignette for examples and detailed description.
#' @section Forward Imputation:
#' By default, when LTCF is left unspecified, all variables that follow after any end of follow-up (EFU) event are set to missing (NA).
#' The end of follow-up event occurs when a binary node of type \code{EFU=TRUE} is equal to 1, indicating a failing or right-censoring event.
#' To forward impute the values of the time-varying nodes after the occurrence of the \code{EFU} event, set the LTCF argument to a name of the EFU node representing this event.
#' For additional details and examples see the vignette and \code{\link{doLTCF}} function.
#' @param DAG A DAG objects that has been locked with set.DAG(DAG). Observed data from this DAG will be simulated if actions argument is omitted.
#' @param actions Character vector of action names which will be extracted from the DAG object. Alternatively, this can be a list of action DAGs selected with \code{A(DAG)} function, in which case the argument \code{DAG} is unused. When \code{actions} is omitted, the function returns simulated observed data (see \code{simobs}).
#' @param n Number of observations to sample.
#' @param wide A logical, if TRUE the output data is generated in wide format, if FALSE, the output longitudinal data is generated in long format
#' @param LTCF If forward imputation is desired for the missing variable values, this argument should be set to the name of the node that indicates the end of follow-up event.
#' @param rndseed Seed for the random number generator.
#' @param rndseed.reset.node When \code{rndseed} is specified, use this argument to specify the name of the \code{DAG} node at which the random number generator seed is reset back to \code{NULL} (simulation function will call \code{set.seed(NULL)}).
#' Can be useful if one wishes to simulate data using the set seed \code{rndseed} only for the first K nodes of the DAG and use an entirely random sample when simulating the rest of the nodes starting at K+1 and on.
#' The name of such (K+1)th order \code{DAG} node should be then specified with this argument.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console.
#' Turn this off by default using options(simcausal.verbose=FALSE).
#' @return If actions argument is missing a simulated data.frame is returned, otherwise the function returns a named list of action-specific simulated data.frames with action names giving names to corresponding list items.
#' @family simulation functions
#' @seealso \code{\link{simobs}} - a wrapper function for simulating observed data only; \code{\link{simfull}} - a wrapper function for simulating full data only; \code{\link{doLTCF}} - forward imputation of the missing values in already simulated data; \code{\link{DF.to.long}}, \code{\link{DF.to.longDT}} - converting longitudinal data from wide to long formats.
#' @example tests/examples/sim.impute.examples12.R
#' @references Sofrygin O, van der Laan MJ, Neugebauer R (2017).
#' "simcausal R Package: Conducting Transparent and Reproducible Simulation Studies of Causal Effect Estimation with Complex Longitudinal Data."
#' Journal of Statistical Software, 81(2), 1-47. doi: 10.18637/jss.v081.i02.
#' @export
sim <- function(DAG, actions, n, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, verbose = getOption("simcausal.verbose"), oldNet = NULL, oldDat = NULL) {
  # Dispatcher: when `actions` is missing, simulate observed data from the DAG;
  # when `actions` is a character vector, extract those actions from the DAG
  # and simulate full (counterfactual) data; a list of action DAGs is passed
  # through directly.
  if (!is.integerish(n)) stop("n must be an integer")
  # SIMULATE OBSERVED DATA FROM DAG (no actions given)
  if (missing(actions)) {
    if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")
    if (!is.DAGlocked(DAG)) stop("call set.DAG() before attempting to simulate data from DAG")
    if (verbose) message("simulating observed dataset from the DAG object")
    return(simobs(DAG = DAG, n = n, wide = wide, LTCF = LTCF, rndseed = rndseed, rndseed.reset.node = rndseed.reset.node, verbose = verbose, oldNet = oldNet, oldDat = oldDat))
  }
  # SIMULATE FULL DATA FROM ACTION DAGs (actions are present)
  if (is.character(actions)) {
    # grab the appropriately named actions from the DAG
    if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")
    actions <- getactions(DAG, actions)
  } else if (is.list(actions)) {
    if (!all(sapply(actions, is.DAG))) stop("actions must be either a list of DAGs or a vector of action names")
  } else {
    stop("argument actions must be a character vector of action names or a list of action DAGs")
  }
  if (verbose) message("simulating action-specific datasets for action(s): "%+%paste(names(actions), collapse = " "))
  simfull(actions = actions, n = n, wide = wide, LTCF = LTCF, rndseed = rndseed, rndseed.reset.node = rndseed.reset.node, verbose = verbose, oldNet = oldNet, oldDat = oldDat)
}
#' Missing Variable Imputation with Last Time Point Value Carried Forward (LTCF)
#'
#' Forward imputation for missing variable values in simulated data after a particular end of the follow-up event. The end of follow-up event is defined by the node of type \code{EFU=TRUE} being equal to 1.
#' @section Details:
#' The default behavior of the \code{sim} function consists in setting all nodes that temporally follow an \code{EFU} node whose simulated value is 1 to missing (i.e., \code{NA}).
#' The argument \code{LTCF} of the \code{sim} function can however be used to change this default behavior and impute some of these missing values with \emph{last time point value carried forward} (LTCF).
#' More specifically, only the missing values of time-varying nodes (i.e., those with non-missing \code{t} argument) that follow the end of follow-up event encoded by the \code{EFU} node specified by the \code{LTCF} argument will be imputed.
#' One can use the function \code{doLTCF} to apply the \emph{last time point value carried forward} (LTCF) imputation to an existing simulated dataset obtained from the function \code{sim} that was called with its default imputation setting (i.e., with no \code{LTCF} argument).
#' Illustration of the use of the LTCF imputation functionality are provided in the package vignette.
#'
#' The first example below shows the default data format of the \code{sim} function after an end of the follow-up event and how this behavior can be modified to generate data with LTCF imputation by either using the \code{LTCF} argument of the
#' \code{sim} function or by calling the \code{doLTCF} function. The second example demonstrates how to use the \code{doLTCF} function to perform LTCF imputation on already existing data simulated with the \code{sim} function based on its default non-imputation behavior.
#' @param data Simulated \code{data.frame} in wide format
#' @param LTCF Character string specifying the outcome node that is the indicator of the end of follow-up (observations with value of the outcome variable being 1 indicate that the end of follow-up has been reached). The outcome variable must be a binary node that was declared with \code{EFU=TRUE}.
#' @return Modified \code{data.frame}, all time-varying missing variables after the \code{EFU} outcome specified in \code{LTCF} are forward imputed with their last available non-missing value.
#' @family data manipulation functions
#' @seealso \code{\link{sim}}, \code{\link{simobs}} and \code{\link{simfull}} for simulating data with and without carry forward imputation.
#' @example tests/examples/sim.impute.examples12.R
#' @export
doLTCF <- function(data, LTCF) {
  # Last Time point Carried Forward (LTCF) imputation on wide-format simulated data:
  # once an observation's end-of-follow-up (EFU) outcome named in `LTCF` equals 1,
  # all of its later time-varying node values are imputed with the value from the
  # previous time-point.
  DAG <- attr(data, "DAG") # the DAG object that generated `data` (attached by the simulation)
  Nsamp <- nrow(data)
  # LTCF.prev[i] becomes TRUE once observation i has hit the EFU outcome (== 1)
  # at an earlier time-point; those rows get values carried forward from then on.
  LTCF.prev <- rep(FALSE,Nsamp)
  t_pts <- NULL # unique time-points encountered so far while walking DAG nodes in order
  cur.outcome <- FALSE # TRUE while the current node is the LTCF outcome node
  if (is.longfmt(data)) stop("Last Timepoint Carry Forward (LTCF) can only be executed for data in wide format")
  #------------------------------------------------------------------------
  # Loop over nodes in DAG and keep track of EFUP observations in the data (already sampled)
  #------------------------------------------------------------------------
  for (cur.node in DAG) {
    # generic (time-stripped) node name(s): the part of each column name before the
    # "_<t>" suffix; mv.names may hold several names for a multivariate node
    gnodename <- as.character(unlist(lapply(strsplit(cur.node[["mv.names"]], "_"), "[[", 1)))
    t <- cur.node$t # define variable for a current time point t (NULL for nodes without t)
    t_new <- !(t%in%t_pts) # set to TRUE when switch to new time point t occurs otherwise FALSE
    t_pts <- c(t_pts,t); t_pts <- as.integer(unique(t_pts)) # vector that keeps track of unique timepoints t
    # is this generic node the outcome whose value 1 triggers carry-forward?
    if (gnodename[1]%in%LTCF) {
      cur.outcome <- TRUE
    } else {
      cur.outcome <- FALSE
    }
    if (!is.null(t)) { # check time points were specified by user
      # check current t is past the first time-point & we want to last time-point carried forward (LTCF)
      if ((sum(LTCF.prev) > 0) & (t > t_pts[1])) { # Carry last observation forward for those who reached EFUP (outcome value = 1)
        # the previous time-point's column must exist, otherwise imputation is impossible
        if (!gnodename[1]%+%"_"%+%(t-1) %in% names(data)){
          stop("Cannot perform last time point carry forward (LTCF) imputation. No node name '" %+% gnodename%+%"_"%+%(t-1) %+% "' found. Please make sure to define appropriate time-point for this node.")
        }
        # copy each variable's t-1 value into its t column for flagged observations
        for (name in gnodename)
          data[LTCF.prev, name%+%"_"%+%(t)] <- data[LTCF.prev, (name%+%"_"%+%(t-1))]
      }
    }
    # after imputing this node, flag any observation whose EFU outcome equals 1 so
    # that all of its subsequent time-points will be carried forward
    if (is.EFUP(cur.node)&(cur.outcome)) { # if cur.node is EFUP=TRUE type set all observations that had value=1 to LTCF.prev[indx]=TRUE
      LTCF.prev <- (LTCF.prev | (data[,cur.node$name]%in%1))
    }
  }
  # only change the LTCF attribute if outcome carry forward imputation was carried
  # out for at least one observation
  if (sum(LTCF.prev)>0) {
    LTCF_flag <- LTCF
  } else {
    LTCF_flag <- NULL
  }
  attr(data, "LTCF") <- LTCF_flag
  return(data)
}
#' Convert Data from Wide to Long Format Using \code{reshape}
#'
#' This utility function takes a simulated data.frame in wide format as an input and converts it into a long format (slower compared to \code{\link{DF.to.longDT}}).
#'
#' Keeps all covariates that appear only once and at the first time-point constant (carry-forward).
#'
#' All covariates that appear fewer than range(t) times are imputed with NA for missing time-points.
#'
#' Observations with all NA's for all time-varying covariates are removed.
#'
#' When removing NA's the time-varying covariates that are attributes (attnames) are not considered.
#'
#' @param df_wide A \code{data.frame} in wide format
#' @return A \code{data.frame} object in long format
#' @seealso \code{\link{DF.to.longDT}} - a faster version of \code{DF.to.long} that uses \code{data.table} package
#' @family data manipulation functions
#' @export
DF.to.long <- function(df_wide) {
  # Convert a wide-format simulated data.frame to long format via stats::reshape().
  # Baseline nodes (measured only at the first time-point) are kept constant;
  # time-varying nodes measured at a subset of time-points are NA-padded;
  # rows where all (non-attribute) time-varying covariates are NA are removed.
  if (is.data.table(df_wide)) class(df_wide) <- "data.frame"
  # Save the DAG-related attributes BEFORE df_wide is modified below: cbind()
  # (used for NA-padding) drops custom data.frame attributes, so capturing them
  # at the end would lose them. This mirrors DF.to.longDT, which also saves the
  # attributes up front. Output is marked as long format.
  newattrs <- CopyAttributes(attrslist = attributes(df_wide), dataform = "long")
  Nsamp <- nrow(df_wide)
  all_ts <- get_allts(df_wide) # actual time-point values used in the simulated data
  attnames <- attr(df_wide, "attnames") # time-varying attribute names (excluded from NA-row removal)
  node_nms <- attr(df_wide, "node_nms")
  # split every column name on "_" into (generic node name, time-point)
  node_nms_split <- strsplit(node_nms, "_")
  node_nms_vec <- sapply(node_nms_split, '[', 1)
  node_nms_unq <- unique(node_nms_vec)
  # if there are no time-points (t) attributes, then the long vs. wide format is undefined.
  if (length(all_ts)==0) return(df_wide)
  varying <- list() # per-node character vectors of time-varying column names (for reshape)
  v.names <- NULL   # generic names of the time-varying nodes
  bsl.names <- NULL # generic names of the baseline nodes
  for (cur_node_name in node_nms_unq) {
    # which columns belong to the current generic node name:
    idx_node <- sapply(node_nms_split, function(t_nodename) t_nodename[1]%in%cur_node_name)
    count_ts <- sum(idx_node)
    # the time-points at which this node was measured (NA when no "_t" suffix):
    node_t_vals <- unlist(sapply(node_nms_split, function(t_nodename) if(t_nodename[1]%in%cur_node_name) as.integer(t_nodename[2])))
    # if node name has no "_", assume it is baseline and assign the first time-point:
    if (all(is.na(node_t_vals))) node_t_vals <- all_ts[1]
    dprint("cur_node_name: "%+%cur_node_name);
    dprint("all_ts"); dprint(all_ts)
    dprint("count_ts"); dprint(count_ts)
    dprint("node_t_vals"); dprint(node_t_vals)
    if (count_ts==length(all_ts)) { # measured at every time-point => well-defined time-varying node
      varying <- c(varying, list(node_nms[idx_node]))
      v.names <- c(v.names, cur_node_name)
    } else if ((count_ts==1) & (all_ts[all_ts%in%node_t_vals][1]==all_ts[1])) { # only one time-point and its the first-one
      # node is baseline: rename the column, removing the "_t" suffix
      bsl.dat.name <- node_nms[which(node_nms_vec%in%cur_node_name)][1]
      bsl.names <- c(bsl.names, cur_node_name)
      if (bsl.dat.name!=cur_node_name) {
        names(df_wide)[names(df_wide) %in% bsl.dat.name] <- cur_node_name
      }
    } else { # measured at some but not all time-points => pad missing time-points with NA columns
      ts_to_NAs <- all_ts[!all_ts%in%node_t_vals]
      NA_nodenames <- cur_node_name%+%"_"%+%ts_to_NAs # define these node names as missing and add them to df
      df_NAts <- matrix(NA, nrow=Nsamp, ncol=length(ts_to_NAs))
      colnames(df_NAts) <- NA_nodenames
      df_wide <- cbind(df_wide, df_NAts)
      all_vnames <- cur_node_name%+%"_"%+%all_ts
      varying <- c(varying, list(all_vnames))
      v.names <- c(v.names, cur_node_name)
    }
  }
  simdf_long <- reshape(data=df_wide, varying=varying, v.names=v.names, timevar = "t", times = all_ts, sep="_", direction="long")
  # 1) reshape() output is sorted by t first, then ID -> re-sort by ID then time
  simdf_long <- simdf_long[order(simdf_long$ID, simdf_long$t),]
  # 2) remove the last (row-index) column added by reshape()
  simdf_long <- simdf_long[,-ncol(simdf_long)]
  # attribute nodes are not considered when removing all-NA rows:
  if (length(attnames)>0) v.names <- v.names[!v.names%in%attnames]
  dprint("v.names"); dprint(v.names)
  # 3) remove rows where every time-varying covariate is NA (after censoring or EFUP);
  # drop=FALSE keeps the subset two-dimensional when there is a single time-varying
  # node, otherwise rowSums() would fail on the dropped vector
  simdf_long <- simdf_long[rowSums(is.na(simdf_long[,v.names, drop=FALSE]))<length(v.names),]
  attributes(simdf_long) <- c(attributes(simdf_long), newattrs)
  attr(simdf_long, "v.names") <- v.names
  simdf_long
}
#' @import data.table
NULL
#' Faster Conversion of Data from Wide to Long Format Using \code{dcast.data.table}
#'
#' Faster utility function for converting wide-format \code{data.frame} into a long format.
#' Internally uses \pkg{data.table} package functions \code{melt.data.table} and \code{dcast.data.table}.
#'
#' Keeps all covariates that appear only once and at the first time-point constant (carry-forward).
#'
#' All covariates that appear fewer than range(t) times are imputed with NA for missing time-points.
#'
#' Observations with all NA's for all time-varying covariates are removed.
#'
#' When removing NA's the time-varying covariates that are attributes (attnames) are not considered.
#'
#' @param df_wide A \code{data.frame} or \code{data.table} in wide format
#' @param return_DF \code{TRUE} (default) to return a \code{data.frame}, \code{FALSE} returns a \code{data.table}
#' @return A \code{data.frame} in long format
#' @family data manipulation functions
#' @export
DF.to.longDT <- function(df_wide, return_DF = TRUE) {
  # data.table-based wide-to-long conversion (faster equivalent of DF.to.long):
  # melts one time-varying variable at a time, casts back to one column per
  # variable, then joins the baseline variables in by ID.
  assertthat::assert_that(assertthat::is.flag(return_DF))
  assertthat::assert_that(data.table::is.data.table(df_wide)||is.data.frame(df_wide))
  # wrapping any call inside this function call will suppress all warnings:
  SuppressAllWarnings <- function(expr) {
    h <- function (w) {
      if (!is.null(w$message)) invokeRestart("muffleWarning")
    }
    withCallingHandlers(expr, warning = h)
  }
  # old attributes saved up front, before df_wide is modified by reference below:
  newattrs <- CopyAttributes(attrslist=attributes(df_wide), dataform="long") # save attributes from input wide format data
  Nsamp <- nrow(df_wide)
  all_ts <- get_allts(df_wide) # calculate actual time values used in the simulated data
  attnames <- attr(df_wide, "attnames") # time-varying attribute names (excluded from NA-row removal)
  node_nms <- colnames(df_wide)
  # split every column name on "_" into (generic node name, time-point):
  node_nms_split <- strsplit(node_nms, "_")
  node_nms_vec <- sapply(node_nms_split, '[', 1)
  node_nms_unq <- unique(node_nms_vec)
  varying <- list() # per-node character vectors of time-varying column names
  v.names <- NULL   # generic names of the time-varying nodes
  bsl.names <- NULL # generic names of the baseline nodes
  dprint("all_ts"); dprint(all_ts)
  dprint("node_nms"); dprint(node_nms)
  # force single-threaded data.table; NOTE(review): presumably to keep the
  # melt/dcast below deterministic/stable — confirm the original motivation
  if (exists("setDTthreads")) setDTthreads(1)
  # if there are no time-points (t) attributes, then the long vs. wide format is undefined.
  if (length(all_ts)==0) return(df_wide)
  #******************************************************************************************************
  # this will create a copy of the object, wont modify the original object in the calling environment
  if (!is.data.table(df_wide)) df_wide <- data.table(df_wide)
  # !!!!! NOTE: THIS WILL convert the data.frame in the calling environment to data.table!!!!!!
  # To cancel that run class(df_wide) <- "data.frame" afterwards
  #******************************************************************************************************
  # Inner worker: melt each time-varying variable into (ID, t, value), cast it
  # back into a single column named after the variable, accumulate the columns,
  # then (optionally) join the baseline variables back in, keyed by ID.
  DF.to.longDT_run <- function(dat_df, t_pts, lvars, bslvars=NULL) {
    t_vec <- "_"%+%(t_pts) # time-point suffixes, e.g. "_0", "_1", ...
    for (var_nm in lvars) { # one TV var at a time approach
      value_vars <- var_nm%+%t_vec # all wide columns for this variable
      SuppressAllWarnings(
      DT_melt <- melt(dat_df, id.vars = "ID", measure.vars = value_vars, variable.factor = TRUE, na.rm = FALSE)
      )
      # label each melted row with the generic variable name and its time-point
      var_nm_rep <- rep.int(var_nm, nrow(DT_melt))
      t_rep <- rep(t_pts, each=nrow(dat_df))
      DT_melt[, c("LVname","t") := list(var_nm_rep,t_rep)]
      # cast back so this variable becomes a single column keyed by (t, ID)
      DT_l <- data.table::dcast.data.table(DT_melt, t + ID ~ LVname)
      data.table::setkeyv(DT_l, c("ID", "t"))
      if (which(lvars%in%var_nm)==1) {
        DT_l_fin <- DT_l # first variable initializes the long table
      } else {
        DT_l_fin[, (var_nm) := DT_l[[var_nm]]] # subsequent variables appended as columns
      }
    }
    if (!is.null(bslvars)) {
      DT_l_bsl <- dat_df[,unique(c("ID",bslvars)), with=FALSE]
      data.table::setkeyv(DT_l_bsl, "ID") # needs to be sorted
      DT_l_fin <- DT_l_bsl[DT_l_fin] # keyed join: attach baseline columns to every time-point row
    }
    DT_l_fin
  }
  # for each node, determinine if its baseline or time-varying
  # if time-varying and columns are missing for some of its time-points => impute those time-points as NA (last ifelse)
  for (cur_node_name in node_nms_unq) {
    idx_node <- sapply(node_nms_split, function(t_nodename) t_nodename[1]%in%cur_node_name)
    count_ts <- sum(idx_node)
    node_t_vals <- unlist(sapply(node_nms_split, function(t_nodename) if(t_nodename[1]%in%cur_node_name) as.integer(t_nodename[2])))
    # if node name has no "_", assume its bsl and assign first time-point:
    if (all(is.na(node_t_vals))) node_t_vals <- all_ts[1]
    dprint("cur_node_name: "%+%cur_node_name);
    dprint("all_ts"); dprint(all_ts)
    dprint("count_ts"); dprint(count_ts)
    dprint("node_t_vals"); dprint(node_t_vals)
    if (count_ts==length(all_ts)) { # the number of time-points is equal to all time-points => everything is well defined
      varying <- c(varying, list(node_nms[idx_node]))
      v.names <- c(v.names, cur_node_name)
    } else if ((count_ts==1) & (all_ts[all_ts%in%node_t_vals][1]==all_ts[1])) { # only one time-point and its the first-one
      # node is baseline: everything is fine, do nothing, just save its name
      bsl.dat.name <- node_nms[which(node_nms_vec%in%cur_node_name)][1]
      bsl.names <- c(bsl.names, cur_node_name)
      if (bsl.dat.name!=cur_node_name) {
        setnames(df_wide, bsl.dat.name, cur_node_name) # strip the "_t" suffix (by reference)
      }
    } else { # in between 1 & 2 => need to impute to length(all_ts) filling up with NAs
      ts_to_NAs <- all_ts[!all_ts%in%node_t_vals]
      ts_not_NAs <- all_ts[all_ts%in%node_t_vals]
      NA_nodenames <- cur_node_name%+%"_"%+%ts_to_NAs # define these node names as missing and add them to df
      df_wide[, (NA_nodenames) := NA] # add NA columns by reference
      all_vnames <- cur_node_name%+%"_"%+%all_ts
      varying <- c(varying, list(all_vnames))
      v.names <- c(v.names, cur_node_name)
    }
  }
  dprint("v.names"); dprint(v.names)
  dprint("bsl.names"); dprint(bsl.names)
  #*** the baseline variable names are also passed as last arg
  simDT_long <- DF.to.longDT_run(dat_df=df_wide, t_pts=all_ts, lvars=v.names, bslvars=bsl.names)
  # attribute nodes are not counted when dropping all-NA rows:
  if (length(attnames)>0) v.names <- v.names[!v.names%in%attnames]
  dprint("v.names after rem attrs"); dprint(v.names)
  simDT_long <- simDT_long[rowSums(is.na(simDT_long[,v.names, with=FALSE]))<length(v.names),] # 3) remove last NA rows (after censoring or EFUP)
  # restore the saved DAG attributes; setattr() sets them by reference without copying
  newattrs_names <- names(newattrs)
  for (attr_name in newattrs_names) {
    setattr(x = simDT_long, name = attr_name, value = newattrs[[attr_name]])
  }
  setattr(x = simDT_long, name = "v.names", value = v.names)
  if (return_DF) class(simDT_long) <- "data.frame"
  return(simDT_long)
}
# NOT IMPLEMENTED
# will convert the dataset back to wide format from long, get the necessary attributes from simdf_long
# convert_to_wide <- function(simdf_long) {
# # Nsamp <- nrow(df_wide)
# # all_ts <- get_allts(simdf_long) # calculate actual time values used in the simulated data
# attnames <- attr(simdf_long, "attnames")
# node_nms <- attributes(simdf_long)$names[-1]
# # node_nms_split <- strsplit(node_nms, "_")
# # node_nms_unq <- unique(sapply(node_nms_split, '[', 1))
# varying <- list()
# v.names <- NULL
# simdf_wide <- reshape(data=simdf_long, idvar=, varying=varying, v.names=v.names, timevar = "t", times = all_ts, sep="_", direction="wide")
# simdf_wide
# }
The current simulation only allows us to fix the hyperparameters of the network: whenever we simulate, it first generates a new network and then simulates the covariates.
However, we want to be able to simulate from a fixed network — i.e., simulate the network according to the hyperparameters at most once, then fix that network and simulate many different realizations of the covariates and outcomes.
So, I added this functionality by changing the simFromDAG function.