mlverse / torch

R Interface to Torch
https://torch.mlverse.org

Issues in manual implementation of the optimization process for ODE parameters #788

Closed diabloyg closed 2 years ago

diabloyg commented 2 years ago

Recently I was trying to implement ODE parameter optimization using the autograd functionality in torch. The data were simulated using the ODEs below (a 2-compartment pharmacokinetic model, written in ODE form rather than in solved/explicit form):

da1/dt = -cl/v1 * amt1 - cld/v1 * amt1 + cld/v2 * amt2
da2/dt = cld/v1 * amt1 - cld/v2 * amt2

The observation is cc = a1/v1, with additive error (~5); initial conditions a1(0) = 1200000, a2(0) = 0; true parameters cl = 100, cld = 200, v1 = 3000, v2 = 5000.

The simulated observations are as follows.

time cc
4 275.2478
8 196.7948
12 153.0787
24 89.82291
36 72.17694
48 61.54845
72 45.67833
96 39.92614
120 25.65196
144 23.36927
168 17.25322
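
For reference, a minimal sketch of how such data can be simulated with the deSolve package (a sketch only; the actual simulation code is not shown here, and the noise realization will differ from the table above). It also sets up the dat data frame that the loss function below uses:

library(deSolve)

# two-compartment model in ODE form
pk_model <- function(t, state, parms) {
  with(as.list(c(state, parms)), {
    da1 <- -cl/v1 * a1 - cld/v1 * a1 + cld/v2 * a2
    da2 <-  cld/v1 * a1 - cld/v2 * a2
    list(c(da1, da2))
  })
}

times <- c(0, 4, 8, 12, 24, 36, 48, 72, 96, 120, 144, 168)
parms <- c(cl = 100, cld = 200, v1 = 3000, v2 = 5000)
state <- c(a1 = 1200000, a2 = 0)

out <- deSolve::ode(y = state, times = times, func = pk_model, parms = parms)
set.seed(1)  # hypothetical seed; the original noise realization is unknown
dat <- data.frame(
  time = times[-1],
  cc = out[-1, "a1"] / parms["v1"] + rnorm(length(times) - 1, sd = 5)
)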

I then tried to build the loss function using an RK4 solver, with the R code below:

grad_amt1<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da1dt=-theta_cl/theta_v1 * amt1-theta_cld/theta_v1 * amt1+theta_cld/theta_v2 * amt2
  return(da1dt)
}

grad_amt2<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da2dt=theta_cld/theta_v1 * amt1-theta_cld/theta_v2 * amt2
  return(da2dt)
}

runge_kutta<-function(dose){

  a1<-dose
  a2<-0
  time_step<-4
  t<-0
  j<-1
  ofv<-0

  while(t<168){

    k1_amt1<-grad_amt1(a1,a2,cl,v1,v2,cld)
    k1_amt2<-grad_amt2(a1,a2,cl,v1,v2,cld)

    k2_amt1<-grad_amt1(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)
    k2_amt2<-grad_amt2(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)

    k3_amt1<-grad_amt1(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)
    k3_amt2<-grad_amt2(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)

    k4_amt1<-grad_amt1(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)
    k4_amt2<-grad_amt2(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)

    slp_amt1<-(k1_amt1+2 * k2_amt1+2 * k3_amt1+k4_amt1)/6
    slp_amt2<-(k1_amt2+2 * k2_amt2+2 * k3_amt2+k4_amt2)/6

    a1<-a1+slp_amt1 * time_step
    a2<-a2+slp_amt2 * time_step

    t<-t+time_step

    for (i in j:nrow(dat))
      if (dat[i,"time"]==t){
        ofv<-ofv+(dat[i,"cc"]-a1/v1)^2
        j<-j+1
        break}
  }

  return(ofv)
}

Then I tried to obtain the gradient of the loss function (ofv) with respect to those 4 parameters (cl, v1, v2, cld) using autograd, and to optimize them with a steepest-descent approach (rather than a Newton or Nelder-Mead approach):

cl <- torch_tensor(c(80),requires_grad=TRUE)
v1 <- torch_tensor(c(2000),requires_grad=TRUE)
v2 <- torch_tensor(c(4000),requires_grad=TRUE)
cld <- torch_tensor(c(100),requires_grad=TRUE)
rate<-0.1

for (x in 1:10){
  loss_function<-runge_kutta(1200000)
  loss_function$backward()

  with_no_grad({

    cl$sub_(rate * cl$grad)
    v1$sub_(rate * v1$grad)
    v2$sub_(rate * v2$grad)
    cld$sub_(rate * cld$grad)

    cl$grad$zero_()
    v1$grad$zero_()
    v2$grad$zero_()
    cld$grad$zero_()

  })

  OBJFV<-runge_kutta(1200000)
  print(paste("Objective function value = ",as.numeric(OBJFV),sep=""))
}
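
For reference, a minimal sketch of how the raw gradients can be inspected right after backward() (not part of the original loop; run before the in-place updates):

loss <- runge_kutta(1200000)
loss$backward()

# gradient of the loss with respect to each parameter
sapply(list(cl = cl, v1 = v1, v2 = v2, cld = cld),
       function(p) as.numeric(p$grad))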

However, from what I observe the gradients for v1 and v2 are very small (~0), so these two parameters cannot be updated even with a large step size. At the same time, when I use the built-in R functions optim or nlm (both the gradient-dependent and the gradient-free approach), they give unbiased estimates:

runge_kutta<-function(x){

  cl<-x[1]
  v1<-x[2]
  v2<-x[3]
  cld<-x[4]

  a1<-1200000
  a2<-0
  time_step<-4
  t<-0
  j<-1
  ofv<-0

  while(t<168){

    k1_amt1<-grad_amt1(a1,a2,cl,v1,v2,cld)
    k1_amt2<-grad_amt2(a1,a2,cl,v1,v2,cld)

    k2_amt1<-grad_amt1(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)
    k2_amt2<-grad_amt2(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)

    k3_amt1<-grad_amt1(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)
    k3_amt2<-grad_amt2(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)

    k4_amt1<-grad_amt1(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)
    k4_amt2<-grad_amt2(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)

    slp_amt1<-(k1_amt1+2 * k2_amt1+2 * k3_amt1+k4_amt1)/6
    slp_amt2<-(k1_amt2+2 * k2_amt2+2 * k3_amt2+k4_amt2)/6

    a1<-a1+slp_amt1 * time_step
    a2<-a2+slp_amt2 * time_step

    t<-t+time_step

    for (i in j:nrow(dat))
      if (dat[i,"time"]==t){
        ofv<-ofv+(dat[i,"cc"]-a1/v1)^2
        j<-j+1
        break}
  }

  ofv
}

optim(par=c(10,2000,2000,10),fn=runge_kutta,method="Nelder-Mead",hessian = TRUE)

nlm(runge_kutta,c(10,2000,2000,10),hessian = TRUE)

So there must be something wrong with my first approach using torch. Could someone help me?

dfalbel commented 2 years ago

Hi @diabloyg,

Thanks for filing this, that's very interesting!

I am a complete newbie in pharmacometrics, but I'd say it's not expected that simple gradient descent would behave the same as other methods like Nelder-Mead. So that's not a problem in your code, just a limitation of plain gradient descent when optimizing this problem.

For instance, if you use optim_lbfgs you get results similar to optim:

library(torch)

dat <- data.frame(
  time = c(4L, 8L, 12L, 24L, 36L, 48L, 72L, 96L, 120L, 144L, 168L),
  cc = c(275.2478, 196.7948, 153.0787, 89.82291, 72.17694, 61.54845,
         45.67833, 39.92614, 25.65196, 23.36927, 17.25322)
)

grad_amt1<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da1dt=-theta_cl/theta_v1 * amt1-theta_cld/theta_v1 * amt1+theta_cld/theta_v2 * amt2
  return(da1dt)
}

grad_amt2<-function(amt1,amt2,theta_cl,theta_v1,theta_v2,theta_cld){
  da2dt=theta_cld/theta_v1 * amt1-theta_cld/theta_v2 * amt2
  return(da2dt)
}

runge_kutta<-function(dose){

  a1<-dose
  a2<-0
  time_step<-4
  t<-0
  j<-1
  ofv<-torch_tensor(0)

  while(t<168){

    k1_amt1<-grad_amt1(a1,a2,cl,v1,v2,cld)
    k1_amt2<-grad_amt2(a1,a2,cl,v1,v2,cld)

    k2_amt1<-grad_amt1(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)
    k2_amt2<-grad_amt2(a1+time_step/2 * k1_amt1,a2+time_step/2 * k1_amt2,cl,v1,v2,cld)

    k3_amt1<-grad_amt1(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)
    k3_amt2<-grad_amt2(a1+time_step/2 * k2_amt1,a2+time_step/2 * k2_amt2,cl,v1,v2,cld)

    k4_amt1<-grad_amt1(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)
    k4_amt2<-grad_amt2(a1+time_step * k3_amt1,a2+time_step * k3_amt2,cl,v1,v2,cld)

    slp_amt1<-(k1_amt1+2 * k2_amt1+2 * k3_amt1+k4_amt1)/6
    slp_amt2<-(k1_amt2+2 * k2_amt2+2 * k3_amt2+k4_amt2)/6

    a1<-a1+slp_amt1 * time_step
    a2<-a2+slp_amt2 * time_step

    t<-t+time_step

    for (i in j:nrow(dat))
      if (dat[i,"time"]==t){
        ofv<-ofv+(torch_tensor(dat[i,"cc"])-a1/v1)^2
        j<-j+1
        break}
  }

  return(ofv)
}

cl <- torch_tensor(c(80),requires_grad=TRUE)
v1 <- torch_tensor(c(2000),requires_grad=TRUE)
v2 <- torch_tensor(c(4000),requires_grad=TRUE)
cld <- torch_tensor(c(100),requires_grad=TRUE)
rate<-0.05

optim <- optim_lbfgs(list(cl, v1, v2, cld), lr = rate)
for (x in 1:50){  
  l <- optim$step(closure = function() {
    optim$zero_grad()
    loss_function <-runge_kutta(1200000)
    loss_function$backward()
    loss_function
  })
  print(paste("Objective function value = ",as.numeric(l),sep=""))
}
diabloyg commented 2 years ago

Hi @dfalbel, as you said, my previous issue was indeed due to the limitation of the simple gradient descent approach when optimizing this kind of problem. The autograd functionality worked well with both plain gradient descent and the more appropriate methods. It seems that plain gradient descent would at least be capable of optimizing the parameters of a single/multiple linear regression.
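
For illustration, a minimal sketch (my own example with hypothetical simulated data) of plain gradient descent via optim_sgd recovering linear-regression coefficients with torch:

library(torch)
torch_manual_seed(1)  # hypothetical seed, for reproducibility

# simulated data: y = 2*x1 - 3*x2 + 1 + noise
x <- torch_randn(100, 2)
y <- x$matmul(torch_tensor(c(2, -3))) + 1 + 0.1 * torch_randn(100)

w <- torch_zeros(2, requires_grad = TRUE)
b <- torch_zeros(1, requires_grad = TRUE)
opt <- optim_sgd(list(w, b), lr = 0.1)

for (i in 1:200) {
  opt$zero_grad()
  loss <- torch_mean((x$matmul(w) + b - y)^2)  # mean squared error
  loss$backward()
  opt$step()
}

as.numeric(w)  # should be close to c(2, -3)
as.numeric(b)  # should be close to 1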

dfalbel commented 2 years ago

Great that you found the problem. I'll close the issue, feel free to reopen or create another topic if you have additional questions.