mlverse / torch

R Interface to Torch
https://torch.mlverse.org

Issues in manual implementation of the optimization process for ODE parameters #788

Closed diabloyg closed 2 years ago

diabloyg commented 2 years ago

Recently I was trying to implement ODE parameter optimization using the autograd functionality in torch. The data were simulated using the ODEs below (a 2-compartment pharmacokinetic model, written in ODE form rather than in solved/explicit form):

da1/dt = -cl/v1 * amt1 - cld/v1 * amt1 + cld/v2 * amt2
da2/dt = cld/v1 * amt1 - cld/v2 * amt2

The observation is cc = a1/v1, with additive error (~5); initial conditions a1(0) = 1200000, a2(0) = 0; true parameters cl = 100, cld = 200, v1 = 3000, v2 = 5000.

The simulated observations are as follows.

time cc
4 275.2478
8 196.7948
12 153.0787
24 89.82291
36 72.17694
48 61.54845
72 45.67833
96 39.92614
120 25.65196
144 23.36927
168 17.25322
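
For reference, a minimal sketch of how such data can be simulated with the deSolve package (a sketch only; the actual simulation code is not shown here, and the noise realization will differ from the table above). It also sets up the dat data frame that the loss function below uses:

library(deSolve)

# two-compartment model in ODE form
pk_model <- function(t, state, parms) {
  with(as.list(c(state, parms)), {
    da1 <- -cl/v1 * a1 - cld/v1 * a1 + cld/v2 * a2
    da2 <-  cld/v1 * a1 - cld/v2 * a2
    list(c(da1, da2))
  })
}

times <- c(0, 4, 8, 12, 24, 36, 48, 72, 96, 120, 144, 168)
parms <- c(cl = 100, cld = 200, v1 = 3000, v2 = 5000)
state <- c(a1 = 1200000, a2 = 0)

out <- deSolve::ode(y = state, times = times, func = pk_model, parms = parms)
set.seed(1)  # hypothetical seed; the original noise realization is unknown
dat <- data.frame(
  time = times[-1],
  cc = out[-1, "a1"] / parms["v1"] + rnorm(length(times) - 1, sd = 5)
)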

I then tried to build the loss function using an RK4 solver, with the R code below:

grad_amt1<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da1dt=-theta_cl/theta_v1 * amt1-theta_cld/theta_v1 * amt1+theta_cld/theta_v2 * amt2
  return(da1dt)
}

grad_amt2<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da2dt=theta_cld/theta_v1 * amt1-theta_cld/theta_v2 * amt2
  return(da2dt)
}

runge_kutta<-function(dose){

  a1<-dose
  a2<-0
  time_step<-4
  t<-0
  j<-1
  ofv<-0

  while(t<168){

    k1_amt1<-grad_amt1(a1,a2,cl,v1,v2,cld)
    k1_amt2<-grad_amt2(a1,a2,cl,v1,v2,cld)

    k2_amt1<-grad_amt1(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)
    k2_amt2<-grad_amt2(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)

    k3_amt1<-grad_amt1(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)
    k3_amt2<-grad_amt2(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)

    k4_amt1<-grad_amt1(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)
    k4_amt2<-grad_amt2(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)

    slp_amt1<-(k1_amt1+2 * k2_amt1+2 * k3_amt1+k4_amt1)/6
    slp_amt2<-(k1_amt2+2 * k2_amt2+2 * k3_amt2+k4_amt2)/6

    a1<-a1+slp_amt1 * time_step
    a2<-a2+slp_amt2 * time_step

    t<-t+time_step

    for (i in j:nrow(dat))
      if (dat[i,"time"]==t){
        ofv<-ofv+(dat[i,"cc"]-a1/v1)^2
        j<-j+1
        break}
  }

  return(ofv)
}

Then I tried to obtain the gradient of the loss function (ofv) with respect to those 4 parameters (cl, v1, v2, cld) using autograd, and to optimize them with a steepest-descent approach (rather than a Newton or Nelder-Mead approach):

cl <- torch_tensor(c(80),requires_grad=TRUE)
v1 <- torch_tensor(c(2000),requires_grad=TRUE)
v2 <- torch_tensor(c(4000),requires_grad=TRUE)
cld <- torch_tensor(c(100),requires_grad=TRUE)
rate<-0.1

for (x in 1:10){
  loss_function<-runge_kutta(1200000)
  loss_function$backward()

  with_no_grad({

    cl$sub_(rate * cl$grad)
    v1$sub_(rate * v1$grad)
    v2$sub_(rate * v2$grad)
    cld$sub_(rate * cld$grad)

    cl$grad$zero_()
    v1$grad$zero_()
    v2$grad$zero_()
    cld$grad$zero_()

  })

  OBJFV<-runge_kutta(1200000)
  print(paste("Objective function value = ",as.numeric(OBJFV),sep=""))
}
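
For reference, a minimal sketch of how the raw gradients can be inspected right after backward() (not part of the original loop; run before the in-place updates):

loss <- runge_kutta(1200000)
loss$backward()

# gradient of the loss with respect to each parameter
sapply(list(cl = cl, v1 = v1, v2 = v2, cld = cld),
       function(p) as.numeric(p$grad))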

However, from what I observe the gradients for v1 and v2 are very small (~0), so these two parameters cannot be updated even with a large step size. At the same time, when I use the built-in R functions optim or nlm (both the gradient-dependent and the gradient-free approach), they give unbiased estimates:

runge_kutta<-function(x){

  cl<-x[1]
  v1<-x[2]
  v2<-x[3]
  cld<-x[4]

  a1<-1200000
  a2<-0
  time_step<-4
  t<-0
  j<-1
  ofv<-0

  while(t<168){

    k1_amt1<-grad_amt1(a1,a2,cl,v1,v2,cld)
    k1_amt2<-grad_amt2(a1,a2,cl,v1,v2,cld)

    k2_amt1<-grad_amt1(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)
    k2_amt2<-grad_amt2(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)

    k3_amt1<-grad_amt1(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)
    k3_amt2<-grad_amt2(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)

    k4_amt1<-grad_amt1(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)
    k4_amt2<-grad_amt2(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)

    slp_amt1<-(k1_amt1+2 * k2_amt1+2 * k3_amt1+k4_amt1)/6
    slp_amt2<-(k1_amt2+2 * k2_amt2+2 * k3_amt2+k4_amt2)/6

    a1<-a1+slp_amt1 * time_step
    a2<-a2+slp_amt2 * time_step

    t<-t+time_step

    for (i in j:nrow(dat))
      if (dat[i,"time"]==t){
        ofv<-ofv+(dat[i,"cc"]-a1/v1)^2
        j<-j+1
        break}
  }

  ofv
}

optim(par=c(10,2000,2000,10),fn=runge_kutta,method="Nelder-Mead",hessian = TRUE)

nlm(runge_kutta,c(10,2000,2000,10),hessian = TRUE)

So there must be something wrong with my first approach using torch. Could someone help me?

dfalbel commented 2 years ago

Hi @diabloyg,

Thanks for filing this, that's very interesting!

I am a complete newbie in pharmacometrics, but I'd say it's not expected that simple gradient descent would behave the same as other methods like Nelder-Mead. So that's not a problem in your code, just a limitation of plain gradient descent when optimizing this problem.

For instance, if you use optim_lbfgs you get results similar to optim:

library(torch)

dat <- data.frame(
  time = c(4L, 8L, 12L, 24L, 36L, 48L, 72L, 96L, 120L, 144L, 168L),
  cc = c(275.2478, 196.7948, 153.0787, 89.82291, 72.17694, 61.54845,
         45.67833, 39.92614, 25.65196, 23.36927, 17.25322)
)

grad_amt1<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da1dt=-theta_cl/theta_v1 * amt1-theta_cld/theta_v1 * amt1+theta_cld/theta_v2 * amt2
  return(da1dt)
}

grad_amt2<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da2dt=theta_cld/theta_v1 * amt1-theta_cld/theta_v2 * amt2
  return(da2dt)
}

runge_kutta<-function(dose){

  a1<-dose
  a2<-0
  time_step<-4
  t<-0
  j<-1
  ofv<-torch_tensor(0)

  while(t<168){

    k1_amt1<-grad_amt1(a1,a2,cl,v1,v2,cld)
    k1_amt2<-grad_amt2(a1,a2,cl,v1,v2,cld)

    k2_amt1<-grad_amt1(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)
    k2_amt2<-grad_amt2(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)

    k3_amt1<-grad_amt1(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)
    k3_amt2<-grad_amt2(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)

    k4_amt1<-grad_amt1(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)
    k4_amt2<-grad_amt2(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)

    slp_amt1<-(k1_amt1+2 * k2_amt1+2 * k3_amt1+k4_amt1)/6
    slp_amt2<-(k1_amt2+2 * k2_amt2+2 * k3_amt2+k4_amt2)/6

    a1<-a1+slp_amt1 * time_step
    a2<-a2+slp_amt2 * time_step

    t<-t+time_step

    for (i in j:nrow(dat))
      if (dat[i,"time"]==t){
        ofv<-ofv+(torch_tensor(dat[i,"cc"])-a1/v1)^2
        j<-j+1
        break}
  }

  return(ofv)
}

cl <- torch_tensor(c(80),requires_grad=TRUE)
v1 <- torch_tensor(c(2000),requires_grad=TRUE)
v2 <- torch_tensor(c(4000),requires_grad=TRUE)
cld <- torch_tensor(c(100),requires_grad=TRUE)
rate<-0.05

optim <- optim_lbfgs(list(cl, v1, v2, cld), lr = rate)
for (x in 1:50){  
  l <- optim$step(closure = function() {
    optim$zero_grad()
    loss_function <-runge_kutta(1200000)
    loss_function$backward()
    loss_function
  })
  print(paste("Objective function value = ",as.numeric(l),sep=""))
}
diabloyg commented 2 years ago

Hi @dfalbel, as you said, my previous issue was indeed due to the limitation of the simple gradient descent approach when optimizing this kind of problem. The autograd functionality worked well with both plain gradient descent and the more appropriate methods. It seems that plain gradient descent would at least be capable of optimizing the parameters of a single/multiple linear regression.
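
For illustration, a minimal sketch (my own example with hypothetical simulated data) of plain gradient descent via optim_sgd recovering linear-regression coefficients with torch:

library(torch)
torch_manual_seed(1)  # hypothetical seed, for reproducibility

# simulated data: y = 2*x1 - 3*x2 + 1 + noise
x <- torch_randn(100, 2)
y <- x$matmul(torch_tensor(c(2, -3))) + 1 + 0.1 * torch_randn(100)

w <- torch_zeros(2, requires_grad = TRUE)
b <- torch_zeros(1, requires_grad = TRUE)
opt <- optim_sgd(list(w, b), lr = 0.1)

for (i in 1:200) {
  opt$zero_grad()
  loss <- torch_mean((x$matmul(w) + b - y)^2)  # mean squared error
  loss$backward()
  opt$step()
}

as.numeric(w)  # should be close to c(2, -3)
as.numeric(b)  # should be close to 1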

dfalbel commented 2 years ago

Great that you found the problem. I'll close the issue, feel free to reopen or create another topic if you have additional questions.