mfasiolo / qgam

Additive quantile regression R package
http://mfasiolo.github.io/qgam/

Better initialisation to avoid divergence of smoothing parameter #22

Open mfasiolo opened 7 years ago

mfasiolo commented 7 years ago
library(qgam)
require(RhpcBLASctl); blas_set_num_threads(1) # optional: single-threaded BLAS

data("UKload")

form <- NetDemand ~ s(wM, k = 20, bs = 'cr') + s(wM_s95, k = 20, bs = 'cr') + 
  s(Posan, bs = 'ad', k = 30, xt = list("bs" = "cc")) + Dow + s(Trend, k = 4) + NetDemand.48 + Holy

# To step through the smoothing parameter optimisation, do library(debug); mtrace(tuneLearn); 
# and then mtrace(newton) once inside, after reaching for( ii in 1:nt ) 

qu <- 0.05
tic <- proc.time()
set.seed(41241)
sigSeq <- seq(3, 8, length.out = 16)
closs1 <- tuneLearn(form = form, data = UKload, err = 0.15,
                   lsig = sigSeq, qu = qu, control = list("K" = 2))
proc.time() - tic
#check(closs1)

# At the first iteration you will see

# Initial log smoothing parameters (lsp)
7.571046   9.556191   2.441944 -19.327599 -12.368925   5.065998   4.854984  -2.279645 

# After one iteration the lsp move to
-1.630176   1.364197 -12.974725 -19.327599 -12.840063 -11.929365  -9.659406 -11.917420

# with gradient
143.748873878   6.825546046   0.367822900   0.005996531  -0.019127832  -0.226958093  -0.786281196  13.154424694
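
# Side check (values copied from the printout above): the first step is already huge for
# several components, which is the divergence this issue is about
lsp0 <- c(7.571046, 9.556191, 2.441944, -19.327599, -12.368925, 5.065998, 4.854984, -2.279645)
lsp1 <- c(-1.630176, 1.364197, -12.974725, -19.327599, -12.840063, -11.929365, -9.659406, -11.917420)
lsp1 - lsp0 # most components move by 8 to 17 log-units in a single iteration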

# Hessian: very concave along the second variable!
[,1]          [,2]         [,3]         [,4]          [,5]          [,6]          [,7]          [,8]
[1,]  2.458789e+01  8.412941e+00 -0.011584408 8.621974e-05  0.0029556391 -0.0265641062 -0.1539408394 -0.3165983797
[2,]  8.412941e+00 -5.877177e+00 -0.001159426 3.529038e-06 -0.0001722144  0.0004438109 -0.0186866835 -0.1314088233
[3,] -1.158441e-02 -1.159426e-03  1.095528270 3.340906e-03  0.4568092625  0.0038351018  0.4755544119  0.0046309245
[4,]  8.621974e-05  3.529038e-06  0.003340906 6.018966e-03  0.0047966777  0.0002868956  0.0006440296  0.0000427432
[5,]  2.955639e-03 -1.722144e-04  0.456809262 4.796678e-03  1.9886273956  0.5466783024  0.0944302443  0.0183346497
[6,] -2.656411e-02  4.438109e-04  0.003835102 2.868956e-04  0.5466783024  0.3882664949  0.3151293611  0.0581675198
[7,] -1.539408e-01 -1.868668e-02  0.475554412 6.440296e-04  0.0944302443  0.3151293611  3.0543169197  0.2696595281
[8,] -3.165984e-01 -1.314088e-01  0.004630925 4.274320e-05  0.0183346497  0.0581675198  0.2696595281  7.4095616814

# Eigenvalues of the reduced 6 x 6 Hessian (hess1 below): one is negative
26.7637733  7.4210911  3.1796840  0.9925382  0.3475061 -8.0462096

# The Hessian is not positive definite, hence the code below (excerpted from mgcv's newton)
# flips the negative eigenvalue.
grad <- c(143.7488739, 6.8255460, 0.3678229, 0.005996531, -0.019127832, -0.2269581, -0.7862812, 13.1544247)

# Reduced gradient: components 4 and 5 are dropped, presumably because they already pass
# newton's uconv.ind convergence filter (see below)
grad1 <- c(143.7488739, 6.8255460, 0.3678229, -0.2269581, -0.7862812, 13.1544247) 

hess1 <- matrix(c(24.58788695,  8.4129414625, -0.011584408, -0.0265641062, -0.15394084, -0.316598380,
           8.41294146, -5.8771772964, -0.001159426,  0.0004438109, -0.01868668, -0.131408823,
           -0.01158441, -0.0011594263,  1.095528270,  0.0038351018,  0.47555441,  0.004630925,
           -0.02656411,  0.0004438109,  0.003835102,  0.3882664949,  0.31512936,  0.058167520,
           -0.15394084, -0.0186866835,  0.475554412,  0.3151293611,  3.05431692,  0.269659528,
           -0.31659838, -0.1314088233,  0.004630925,  0.0581675198,  0.26965953,  7.409561681), 6, 6)
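
# Sanity check: the eigenvalues of hess1 reproduce the ones printed above, with the single
# negative value that makes the matrix indefinite
eigen(hess1, symmetric = TRUE)$values
# 26.7637733  7.4210911  3.1796840  0.9925382  0.3475061 -8.0462096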

## get the trial step ...
eh <- eigen(hess1,symmetric=TRUE)
d <- eh$values;U <- eh$vectors
## set eigen-values to their absolute value - heuristically appealing
## as it avoids very long steps being proposed for indefinite components,
## unlike setting -ve e.v.s to very small +ve constant...
ind <- d < 0
pdef <- if (sum(ind)>0) FALSE else TRUE ## is it positive definite? 
d[ind] <- -d[ind] ## see Gill Murray and Wright p107/8
low.d <- max(d)*.Machine$double.eps^.7
ind <- d < low.d
if (sum(ind)>0) pdef <- FALSE ## not certain it is positive definite
d[ind] <- low.d 
ind <- d != 0
d[ind] <- 1/d[ind]

# This produces the following step: the second component is going in the wrong direction 
# (increasing, rather than decreasing)
-drop(U%*%(d*(t(U)%*%grad1))) # (modified) Newton direction
# -6.0315446  2.2039636 -0.4772446  0.4242607  0.2126593 -2.0296309

# This move ends up lowering the (negative) REML objective quite a lot. Probably because the
# first update (-6.03) corresponds to s(wM) (smoothed temperature), while the problematic
# update (2.20) corresponds to s(wM_s95) (exponentially smoothed temperature). Maybe the two
# covariates are quite correlated and get traded off against each other? See the quick check below.
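
# A quick check of the suspected collinearity (my diagnostic, not part of the original trace):
with(UKload, cor(wM, wM_s95))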

# Now the smoothing parameters are
[1]  -6.630176   3.191227 -13.370349 -19.327599 -12.840063 -11.577664  -9.483116 -13.599933

# While the old were
-1.630176   1.364197 -12.974725 -19.327599 -12.840063 -11.929365  -9.659406 -11.917420
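
# Side note: the new lsp are the old ones plus the (modified) Newton step above, rescaled so
# that its largest component equals 5, which I believe is newton's default maxNstep:
lspOld <- c(-1.630176, 1.364197, -12.974725, -11.929365, -9.659406, -11.917420) # unconverged comps
step <- c(-6.0315446, 2.2039636, -0.4772446, 0.4242607, 0.2126593, -2.0296309)
lspOld + step * 5 / max(abs(step)) # matches the new values above (up to printout rounding)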

# The gradient is now
[1]  4.024835175  0.001849281  0.146872205  0.004788172  0.031598356  0.118494166 -0.287925519  3.607569377

# and the hessian 
[,1]          [,2]         [,3]         [,4]          [,5]          [,6]          [,7]          [,8]
[1,] 3.3560882713  8.002191e-04 4.697608e-03 4.038460e-05  7.289772e-03  1.344963e-02  6.638055e-02  4.888544e-02
[2,] 0.0008002191 -1.842555e-03 6.647676e-06 3.294869e-08 -1.114174e-05 -7.135799e-05 -3.043383e-05 -3.971898e-06
[3,] 0.0046976082  6.647676e-06 5.926385e-01 3.131827e-03  4.160236e-01  3.672006e-03  3.751701e-01  3.499639e-04
[4,] 0.0000403846  3.294869e-08 3.131827e-03 4.818144e-03  5.765253e-03  3.440575e-04  8.994476e-04  9.959019e-06
[5,] 0.0072897719 -1.114174e-05 4.160236e-01 5.765253e-03  2.007663e+00  5.485817e-01  8.480139e-02  2.516710e-03
[6,] 0.0134496271 -7.135799e-05 3.672006e-03 3.440575e-04  5.485817e-01  8.691455e-01  2.940997e-01  1.705993e-02
[7,] 0.0663805549 -3.043383e-05 3.751701e-01 8.994476e-04  8.480139e-02  2.940997e-01  3.495033e+00  6.174737e-02
[8,] 0.0488854368 -3.971898e-06 3.499639e-04 9.959019e-06  2.516710e-03  1.705993e-02  6.174737e-02  3.995061e+00

# So the objective is nearly flat, and negative definite, along the 2nd parameter. Still,
# grad[2] has the right sign. Now this check
uconv.ind <- uconv.ind & abs(grad)>max(abs(grad))*.001 
# gives
[1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# Hence the second lsp is stuck where it is. The next iteration looks much the same (similar
# gradient and Hessian): the 2nd lsp fails the uconv.ind test again and remains stuck. A
# reproduction of this filter is sketched below.
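
# For completeness, the filter can be reproduced from the printed gradient (assuming all
# components of uconv.ind were TRUE on entry):
grad2 <- c(4.024835175, 0.001849281, 0.146872205, 0.004788172, 0.031598356,
           0.118494166, -0.287925519, 3.607569377)
rep(TRUE, 8) & abs(grad2) > max(abs(grad2)) * .001
# [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE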

# In the end convergence is achieved, that is, this is FALSE
abs(old.score-score)>score.scale*conv.tol
# where the gradient is 
[1] 5.936495e-06 2.892744e-03 5.867093e-04 5.995174e-04 1.633661e-03 2.032578e-04 5.398356e-04 1.390044e-04
# and score.scale = 19303.56. Notice that here
score.scale <- abs(log(b$scale.est)) + abs(score)
# but log(b$scale.est) = 0, so score.scale = abs(score). This might explain why convergence is
# slower when the response is normalised: abs(score) shrinks, and the tolerance
# score.scale*conv.tol tightens with it (see the sketch below). Maybe we should set scale.est
# to the learning rate sigma?
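
# Rough scale of the stopping rule (the conv.tol value here is my assumption, not taken
# from the trace):
conv.tol <- 1e-7
19303.56 * conv.tol # ~0.0019: REML changes below this stop the iteration. Normalising the
                    # response shrinks abs(score), hence score.scale, so the test gets much stricter.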
# The final hessian is 

[,1]          [,2]         [,3]         [,4]          [,5]          [,6]          [,7]          [,8]
[1,] 9.906413e-01  9.528050e-05 2.017215e-03 7.067377e-06  2.865816e-02  3.390677e-02  9.211675e-02  2.808279e-02
[2,] 9.528050e-05 -2.851802e-03 2.235050e-06 3.550656e-09 -2.025883e-05 -9.484876e-05 -6.078402e-05 -5.010377e-05
[3,] 2.017215e-03  2.235050e-06 8.990583e-02 2.509880e-04  2.426490e-01  7.070839e-04  1.842829e-01  2.548583e-05
[4,] 7.067377e-06  3.550656e-09 2.509880e-04 6.004118e-04  1.184680e-03  3.313392e-05  2.237754e-04  3.348968e-07
[5,] 2.865816e-02 -2.025883e-05 2.426490e-01 1.184680e-03  2.828701e+00  4.866797e-01  1.712816e-01  6.120458e-04
[6,] 3.390677e-02 -9.484876e-05 7.070839e-04 3.313392e-05  4.866797e-01  4.255299e-01  2.428750e-01  2.172774e-03
[7,] 9.211675e-02 -6.078402e-05 1.842829e-01 2.237754e-04  1.712816e-01  2.428750e-01  4.008821e+00  1.476809e-02
[8,] 2.808279e-02 -5.010377e-05 2.548583e-05 3.348968e-07  6.120458e-04  2.172774e-03  1.476809e-02  9.706237e-01
mfasiolo commented 7 years ago

mgcv uses the method in Section 6.7 of Simon Wood's book (2nd ed.) to initialise the smoothing parameters. Several fixes in qgam and mgcv are needed:

  1. the log smoothing parameters from gausFit cannot possibly be used to initialise the sp of elf, because they are on different scales;
  2. elf should not initialise mustart to y, otherwise the total curvature ( sum(w) ) in mgcv:::initial.sp is much higher than it should be, so the smoothing parameters are over-estimated by initial.sp (which matches trace(X^T W X) against trace(S));
  3. in initial.sp, the line w <- .5 * as.numeric(family$Dd(y,mustart,theta,weights)$Dmu2*family$mu.eta(family$linkfun(mustart))^2) means that the EXPECTED weights are used (see the reformatted version below). These are on average much larger than the actual weights, because the model is misspecified. Hence the initial guess for the sp is too high.
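
For readability, here is the initial.sp line from point 3 reformatted; the annotations are mine, not mgcv's:

w <- .5 * as.numeric(
  family$Dd(y, mustart, theta, weights)$Dmu2 *   # second derivative of the deviance w.r.t. mu,
                                                 # i.e. the EXPECTED (not observed) curvature
  family$mu.eta(family$linkfun(mustart))^2       # (d mu / d eta)^2, mapping to the link scale
)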
mfasiolo commented 6 years ago

Points 1 and 2 are hopefully addressed in these commits:

f87b53779d6eb538bfb63433bcb415cfe2622446 164e9276d8c86d3634418128b8ca3c75cfb70882

but point 3 needs to be addressed in mgcv.