chjackson / flexsurv

The flexsurv R package for flexible parametric survival and multi-state modelling
http://chjackson.github.io/flexsurv/
53 stars 28 forks source link

estimate shared shape parameter with flexsurvreg and log-logistic distribution #123

Closed esther1262 closed 12 months ago

esther1262 commented 2 years ago

Hi, I have a few questions regarding estimating the shape parameter with flexsurvreg and log-logistic distribution. I have a 3 state markov chain, 6 transitions and 2 covariates. Effects of covariates are the same across all transitions. Each transition however has its own shape.

# define true parameters (delta=shape; gamma=coefficient for covariates)
deltas_test <- matrix(c(3, 5, 2, 3.5, 3.5, 2), nrow=3, byrow=T)
gammas_test <- c(1.2, -0.3, 0.2)

# function to generate transition rate, time-dependent
# input: t = time 
#        delta = scalar for a particular starting state
#        gamma = vector of covariate coefficients for a particular starting state, including intercept
#        cov = vector of covariates, excluding intercept
q_ij_func <- function(t, delta, gamma, cov) {
  lambda_z <- exp(-delta*(gamma%*%c(1,cov)))[1,1]
  h_t <- lambda_z*delta*(t^(delta-1)) / (1+lambda_z*(t^delta))
  return (h_t)
}

# function to generate sojourn time for time-dependent semi markov, using inverse CDF method
# CDF of sojourn = 1 - exp(integral of sum of q_ij(u)), u from 0 to w, i/=j
new_sojourn_semi <- function(deltas, gammas, cov, maxtime){
  d2 <- deltas[1]
  d3 <- deltas[2]
  l2 <- exp(-d2 * gammas%*%c(1,cov))[1,1]
  l3 <- exp(-d3 * gammas%*%c(1,cov))[1,1]
  f2_3 <- function(w){ (l2*w^d2 + 1) * (l3*w^d3 +1) - 1/runif(1,0,1) }
  new_sojourn <- NA  
  try(new_sojourn <- uniroot(f2_3, c(0, maxtime), tol=1e-3, maxiter=2e3)$root, silent=T)
  if(is.na(new_sojourn)) new_sojourn <- 0
  return(new_sojourn)
}

# function to simulate 3 states CTMC (clock forward approach)
# input: maxtime = last observation / censoring time allowed
#        deltas = shape of each transition, in matrix form
#        gammas = covariates effects on the scale of transition 
#        nchain = number of chains/subjects
#        chainsize = maximum length of each chain
sim_reset <- function(maxtime, deltas, gammas, nchain, chainsize){

  nstate <- nrow(deltas) # number of states
  ntrans <- nstate * (nstate-1) # number of transitions
  state_space <- seq(nstate) # start with 1

  # store all subjects' id, time, state, z1, z2, q_ij
  clm <- c('id','globaltime','state', 'z1', 'z2','q_ij')
  all_result <- data.frame(matrix(NA, ncol=length(clm)))
  colnames(all_result) <- clm

  for (k in 1:nchain){

    # generate covariates, initial time, initial state
    z1 <- rnorm(n=1, mean=0, sd=1)
    z2 <- rbinom(n=1, size=1, prob=0.5)
    current_sojourn <- c(2e-4)
    current_state <- sample(state_space, size = 1, prob = c(0.33, 0.33, 0.33))

    # store data for each subject
    one_result <- data.frame(id=k, globaltime=current_sojourn, state=current_state, q_ij=NA)

    while ((max(one_result$globaltime)<maxtime) & (nrow(one_result)<chainsize)){

      # rates q_ij depend on the current time spending on state i
      q_ij <- c()
      for (a in 1:ntrans){
        q_ij <- c(q_ij, q_ij_func(current_sojourn, c(deltas)[a], gammas, c(z1,z2)))
      }
      q_ij_mat <- matrix(q_ij, byrow=F, nrow=nstate) # not yet a square matrix
      q_ii <- - rowSums(q_ij_mat)

      # insert q_ii into q_ij 
      Qmat <- matrix(c(q_ii[1], q_ij_mat[1,1], q_ij_mat[1,2], 
                       q_ij_mat[2,1], q_ii[2], q_ij_mat[2,2], 
                       q_ij_mat[3,1], q_ij_mat[3,2], q_ii[3]), 
                     byrow=T, nrow=nstate) # square matrix with nrow=ncol=nstate

        # generate new transition-specific sojourn time      
      new_sojourn <- 0
      while ((new_sojourn<2e-3)){
        new_sojourn <- new_sojourn_semi(deltas[current_state,], gammas, c(z1,z2), maxtime)
      }

      # generate potential next state
      remaining_state <- setdiff(state_space, current_state)

      # nextpr = probability of the next state given the current state and time
      nextpr <- Qmat[current_state, ]
      nextpr <- nextpr[nextpr>0] # remove the q_ii element
      nextpr <- nextpr / sum(nextpr)

      # sample next state 
      st <- sample(1:2, 1, prob=nextpr) # 1 denotes first element, 2 denotes second element
      next_state <- remaining_state[st]

      # store all observations within each subject
      one_res <- c(k, max(one_result$globaltime)+new_sojourn, next_state, Qmat[current_state, next_state])
      one_result <- rbind(one_result, one_res)

      # update state and sojourn time
      current_state <- next_state
      current_sojourn <- new_sojourn

    } # end while loop for each subject

    one_result$z1 <- z1
    one_result$z2 <- z2

    all_result <- rbind(all_result, one_result)
  } # end for loop for all subjects

  all_result <- all_result[-1,]
  return(all_result) 
}

one_sim <- sim_reset(maxtime=15, deltas_test, gammas_test, nchain=1000, chainsize=50)

I converted the markov chain to survival data using msm2Surv and fitted with flexsurvreg:

dat <- one_sim[,c('id','globaltime','state','z1','z2')]
Q <- rbind(c(NA,1,2),c(3,NA,4), c(5,6,NA))
ms <- msm2Surv(dat, subject='id', time='globaltime', state="state", Q=Q, cov=c('z1','z2'))
ms$trans <- as.factor(ms$trans)
ms$status <- ifelse(ms$Tstop > 15, 0, ms$status) # censored observations after censoring time

> print(fitlog)
Call:
flexsurvreg(formula = Surv(time, status) ~ z1 + z2 + shape(trans), 
    data = ms, dist = "llogis")

Estimates: 
               data mean  est       L95%      U95%      se        exp(est)  L95%      U95%    
shape                NA    5.66475   5.38830   5.95539   0.14461        NA        NA        NA
scale                NA    3.31745   3.27776   3.35762   0.02037        NA        NA        NA
z1              0.25592   -0.30887  -0.31684  -0.30090   0.00407   0.73428   0.72845   0.74015
z2              0.44685    0.21261   0.19590   0.22932   0.00853   1.23690   1.21641   1.25774
shape(trans2)   0.16746    0.04532  -0.02893   0.11958   0.03789   1.04637   0.97148   1.12703
shape(trans3)   0.18296   -0.39852  -0.46635  -0.33070   0.03461   0.67131   0.62729   0.71842
shape(trans4)   0.18296   -0.40651  -0.47673  -0.33629   0.03583   0.66597   0.62081   0.71442
shape(trans5)   0.14957   -0.38196  -0.45825  -0.30568   0.03892   0.68252   0.63239   0.73662
shape(trans6)   0.14957   -0.44248  -0.51398  -0.37098   0.03648   0.64244   0.59811   0.69006

Except shape (for trans 1), z1, z2 coefficients, all other shape parameters are suppressed to close to 0.

I attempted this command which calculated the shape and scale given covariates. I set covariates=0. These shape parameters are different again, but closer to what I expect. Shape parameters are not affected by covariates, but this command doesn't provide SE nor CI.

> pars.fmsm(fitlog, trans=Q, newdata=data.frame(z1=0,z2=0,trans=1:6))
$`1`
        shape   scale
[1,] 3.866309 3.31745

$`2`
        shape   scale
[1,] 3.639285 3.31745

$`3`
        shape   scale
[1,] 3.802804 3.31745

$`4`
        shape   scale
[1,] 3.772563 3.31745

$`5`
        shape   scale
[1,] 5.664752 3.31745

$`6`
        shape   scale
[1,] 5.927412 3.31745

I then simulated another data based on the above fitted model using command sim.fmsm(). Shape for trans 1, z1, z2 coefficients are almost the same as the above model, but the remaining shape parameters are different from the first model too. How's that possible?

# simulate a data based on the fitted model
nsubj <- 1000
simD_subj_list <- vector("list", length=nsubj)
for (i in 1:nsubj){
  z1 <- rnorm(1,0,1)
  z2 <- rbinom(1,1,0.5)
  newdata <- data.frame(z1=z1, z2=z2, trans=1:6) #one row for one transition
  simD <- sim.fmsm(fitlog, M=1, t=20, start=sample(1:3, 1), trans=Q, newdata)
  simD <- data.frame(id=i, state=c(simD$st), time=c(simD$t), z1=z1, z2=z2)
  simD_subj_list[[i]] <- simD
}
resimD <- do.call('rbind', simD_subj_list)

# convert data to survival data 
dat <- resimD[,c('id','time','state','z1','z2')]
ms <- msm2Surv(dat, subject='id', time='time', state="state", Q=Q, cov=c('z1','z2'))
ms$trans <- as.factor(ms$trans)
ms$status <- ifelse(ms$Tstop > 15, 0, ms$status) # censored observations after censoring time

> print(fitlog2)
flexsurvreg(formula = Surv(time, status) ~ z1 + z2 + shape(trans), data = ms, dist = "llogis")

Estimates: 
               data mean  est       L95%      U95%      se        exp(est)  L95%      U95%    
shape                NA    3.60867   3.43105   3.79549   0.09293        NA        NA        NA
scale                NA    3.63517   3.58877   3.68217   0.02383        NA        NA        NA
z1              0.27597   -0.31037  -0.31923  -0.30151   0.00452   0.73318   0.72671   0.73970
z2              0.45670    0.20821   0.19075   0.22568   0.00891   1.23147   1.21015   1.25317
shape(trans2)   0.16621   -0.01752  -0.08806   0.05301   0.03599   0.98263   0.91570   1.05444
shape(trans3)   0.16533    0.05994  -0.01059   0.13047   0.03599   1.06177   0.98946   1.13937
shape(trans4)   0.16533   -0.04517  -0.11572   0.02537   0.03599   0.95583   0.89073   1.02570
shape(trans5)   0.16846    0.37628   0.30558   0.44697   0.03607   1.45685   1.35742   1.56356
shape(trans6)   0.16846    0.44536   0.37470   0.51601   0.03605   1.56105   1.45456   1.67534

> pars.fmsm(fitlog2, trans=Q, newdata=data.frame(z1=0,z2=0,trans=1:6))
$`1`
        shape    scale
[1,] 3.608671 3.635166

$`2`
        shape    scale
[1,] 3.545982 3.635166

$`3`
        shape    scale
[1,] 5.257285 3.635166

$`4`
       shape    scale
[1,] 5.63331 3.635166

$`5`
       shape    scale
[1,] 3.83159 3.635166

$`6`
       shape    scale
[1,] 3.44929 3.635166

Best Regards

chjackson commented 2 years ago

Your pars.fmsm call is on a different model fit, and the previous calls worked on a model called fitlog. Are these different models? If so, the parameter estimates will be different. Likewise if you simulate a different dataset and fit a model to it, then the parameter estimates will be different, because the data are different.

The $coefficients are log-transformed, but the print output is on the natural scale. This is documented in the help page.

There's nothing in flexsurv specifically designed to simulate data from Markov models with flexible time-to-event distributions. But if your transition-specific distributions are all exponential, then the semi-Markov model will also be Markov, so you can use sim.fmsm

esther1262 commented 2 years ago

Hi Chris, I updated my codes a bit. My steps are like this: original simulated data -> fit flexsurvreg -> resimulate data based on the previous model -> fit flexsurvreg based on resimulated data Should both models have similar estimates since the second data is based on the first fitted model?

Why pars.fmsm with covariates=0 will have different estimated shapes than print() and $coefficients? Shape parameters shouldn't be affected by choice of covariates anyway.

Thanks

chjackson commented 2 years ago

The extent of similarity depends on the size of the dataset that you simulate. As the dataset gets larger, the estimates from different simulations will become more consistent with each other and with the true parameter values.

pars.fmsm with covariates=0 should give the same estimates as the print() output, as long as you are using it on the same model. As I said in my previous comment, I suspected you were running it on a different model.

esther1262 commented 2 years ago

I checked the models were the same. But, after I simulated more samples and loosened the stopping time of trajectories, the estimates of two models became similar, not the estimated shape from print() and pars.fmsm though.

chjackson commented 2 years ago

If you suspect functions in package are doing something different from what they are documented to do, you should prepare a minimal example that other people can reproduce (e.g. using the datasets in the package). Otherwise we can only guess what the problem might be.

esther1262 commented 2 years ago

Hi Chris, I updated the codes in the first post. Since I used simulated data, I attached the codes as well. Sorry if it's too long. Basically, I created a function to generate transition rates using the hazard of log-logistic; a function to generate sojourn time using inverse CDF method, and then the main function to simulate a semi markov, given transition shapes, covariates effects, censoring time etc. I converted the markov chain to survival data using msm2Surv. I know in the first model, only the covariates effects covered the true values, which I still haven't figured out. But that should not be the reason that the estimated shapes from print and parsf.msm to be so different. Scale with covariates =0 is correct though. I then resimulated another data (nsample=1000) using sim.fmsm and refitted another flexsurvreg. Not just two flexsurvreg models were different, estimated shapes from parsf.msm and print in model 2 were again different. The censoring time for all data was 15. I extended the t in sim.fmsm to 20 but censored all observations occurred after t=15. I'm not sure if this is related. With markov chains, it shouldn't matter if the chain of observations comes from one subject or different subjects, but in survival analysis, people might consider frailty for subject effect. Can flexsurvreg handle markov chains without including frailty? I appreciate if you can give me a hint on the discrepancy. Thanks.

chjackson commented 2 years ago

That is too long I'm afraid, and will be a lot of work for someone else to debug. I'd advise to focus on one problem at a time, and strip the code down to the absolute minimum necessary needed to diagnose that problem. Hopefully the process of doing that will help you work out where the problems are.

esther1262 commented 2 years ago

Last comment then, I used the cav data from msm package and applied the same codes, the estimated shapes (trans 2 or beyond) from pars.fmsm look like the ones from print shape est * exp(shape(trans 2 or beyond)). This doesnt hold for the models in my first post though.

Q <- rbind(c(NA,1,2),c(3,NA,4), c(5,6,NA))

cav2 <- msm::cav
cav2 <- cav2[cav2$state!=4,] # remove death status
cav2$age_scale <- as.numeric(scale(cav2$age))

ms <- msm2Surv(cav2, subject='PTNUM', time='years', state="state", Q=Q, cov=c('age_scale','sex'))
ms$trans <- as.factor(ms$trans)

> print(fitlog3)
Call:
flexsurvreg(formula = Surv(time, status) ~ age_scale + sex + shape(trans), 
    data = ms, dist = "llogis")

Estimates: 
               data mean  est      L95%     U95%     se       exp(est)  L95%     U95%   
shape               NA     2.5063   2.2884   2.7451   0.1164       NA        NA       NA
scale               NA     4.2797   3.9927   4.5874   0.1516       NA        NA       NA
age_scale      -0.1056    -0.0172  -0.0612   0.0267   0.0224   0.9829    0.9407   1.0271
sex             0.1151     0.1972   0.0358   0.3587   0.0824   1.2180    1.0364   1.4314
shape(trans2)   0.4093     0.4995   0.3850   0.6139   0.0584   1.6478    1.4696   1.8476
shape(trans3)   0.0593    -0.4003  -0.5712  -0.2293   0.0872   0.6701    0.5648   0.7951
shape(trans4)   0.0593    -0.4838  -0.6504  -0.3173   0.0850   0.6164    0.5218   0.7281
shape(trans5)   0.0314     0.1295  -0.1953   0.4544   0.1657   1.1383    0.8226   1.5752
shape(trans6)   0.0314    -0.2930  -0.5386  -0.0474   0.1253   0.7460    0.5835   0.9537

> pars.fmsm(fitlog3, trans=Q, newdata=data.frame(age_scale=0, sex=0, trans=1:6))
$`1`
        shape    scale
[1,] 2.506347 4.279736

$`2`
       shape    scale
[1,] 4.13004 4.279736

$`3`
        shape    scale
[1,] 1.679623 4.279736

$`4`
       shape    scale
[1,] 1.54493 4.279736

$`5`
        shape    scale
[1,] 2.852995 4.279736

$`6`
        shape    scale
[1,] 1.869745 4.279736