ailinweili / FDboost

Boosting Functional Regression Models. The current release version can be found on CRAN (http://cran.r-project.org/package=FDboost).

Usage examples: bfpco-base FDboost #1

Open ailinweili opened 7 years ago

ailinweili commented 7 years ago

library(FDboost)  # provides the bfpco/bfpc base-learners
library(mgcv)
library(dtw)

Generate data: the toy dataset analyzed by Phillip (2017) is used

Xnl <- matrix(0, 30, 101)
set.seed(813)
tt <- sort(sample(1:90, 30))
for (i in 1:30) {
  Xnl[i, tt[i]:(tt[i] + 4)] <- -1
  Xnl[i, (tt[i] + 5):(tt[i] + 9)] <- 1
}
X.toy <- Xnl + matrix(rnorm(30 * 101, , 0.05), 30)
colnames(X.toy) <- paste("X", 1:101, sep = "")
y.toy <- tt + rnorm(30, 0.05)
y.rainbow <- rainbow(30, end = 0.9)[(y.toy - min(y.toy)) / diff(range(y.toy)) * 29 + 1]

dummy <- rep(1, 30) # dummy covariate for the pco smooth

Display data

par(mfrow = c(2, 2))
matplot((0:100)/100, t(Xnl[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), main = "Noiseless functions")
matplot((0:100)/100, t(X.toy[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), main = "Observed functions")
matplot((0:100)/100, t(X.toy), type = "l", lty = 1, col = y.rainbow, xlab = "t", ylab = "", main = "Rainbow plot")

Obtain DTW distances

D.dtw <- dist(X.toy, method="dtw", window.type="sakoechiba", window.size=5)

Model data

toydata <- list(y.toy = y.toy, X.toy = X.toy, s = 1:101)

Fit the pco-based gam model (m1), the fpco-based boosting model (m2), and the fpc-based boosting model (m3)

m1 <- gam(y.toy ~ s(dummy, bs="pco", k=15, xt=list(D=D.dtw)), method="REML")

m2 <- FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = 15, window.type="sakoechiba", window.size=5), timeformula = ~ bols(1), data = toydata, control = boost_control(mstop = 200))

m3 <- FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)

Model fitted values

fitteds = data.frame(y.toy = toydata$y.toy, fitted_pco = m1$fitted.values, fitted_bfpco = m2$fitted(), fitted_bfpc = m3$fitted())

par(mfrow = c(2, 2))
obs_id = 1:length(toydata$y.toy)
matplot(obs_id, preds, type = "l", lwd = 1.5, col = 1:4, main = "model fitted value")
legend("topleft", legend = c("y.toy", "fitted_pco", "fitted_fpco", "fitted_fpc"), col = 1:4, lwd = 1.5, cex = 0.6)

Average squared residuals

c(resid_pco = mean(m1$residuals^2), resid_fpco = mean(m2$resid()^2), resid_fpc = mean(m3$resid()^2))

Prediction for new data

Case when new data have the same time index

newdd = list(X.toy = Xnl + matrix(rnorm(30*101, 0, 0.05), 30), s = 1:101)

newpred_m2 = predict(m2, newdata = newdd)
newpred_m3 = predict(m3, newdata = newdd)
newpreds = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred_m2, pred_fpc = newpred_m3)

matplot(obs_id, newpreds, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at identical time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, cex = 0.7)

Case when new data have a different time index

new_xtoy = data.frame()
for (i in 1:32) new_xtoy[1:nrow(X.toy), i] = rowMeans(X.toy[1:nrow(X.toy), (3 * i):(3 * i + 2)])

new_s = vector()
for (i in 1:32) new_s[i] = mean(newdd$s[(3 * i):(3 * i + 2)])

newdd_2 = list(X.toy = as.matrix(new_xtoy), s = as.integer(new_s))

newpred2_m2 = predict(m2, newdata = newdd_2)
newpred2_m3 = predict(m3, newdata = newdd_2)
newpreds2 = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred2_m2, pred_fpc = newpred2_m3)

matplot(obs_id, newpreds2, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at different time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, cex = 0.7)

Model performance over the number of principal coordinates

Due to the limited data size, only the training error is computed and displayed.

aveperf_fpco = sapply(1:15, FUN = function(i) {
  mean(FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = i, window.type = "sakoechiba", window.size = 5),
    timeformula = ~ bols(1), data = toydata, control = boost_control(mstop = 1000))$resid()^2)
})

aveperf_fpc = sapply(1:15, FUN = function(i) {
  mean(FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)$resid()^2)
})

gcvdata <- data.frame(aveperf_fpco, aveperf_fpc)
matplot(1:15, gcvdata, type = "b", main = "model performance over number of pc/pco", xlab = "number of pc/pco", ylab = "mean squared residual (training)", pch = c(17, 15))

fabian-s commented 7 years ago

Great!

ailinweili commented 7 years ago

This code can be found in the example of the bfpco function (we will remove it later on).

fabian-s commented 7 years ago

good job, Weili! I've added some comments/modifications below

devtools::load_all(".")  # load the development version of FDboost

library(mgcv)
library(dtw)

# Generate data: the toy dataset analyzed by Phillip (2017) is used
Xnl <- matrix(0, 30, 101)
set.seed(813)
tt <- sort(sample(1:90, 30))
for (i in 1:30) {
  Xnl[i, tt[i]:(tt[i] + 4)] <- -1
  Xnl[i, (tt[i] + 5):(tt[i] + 9)] <- 1
}
X.toy <- Xnl + matrix(rnorm(30 * 101, , 0.05), 30)
colnames(X.toy) <- paste("X", 1:101, sep = "")
y.toy <- tt + rnorm(30, 0.05)
y.rainbow <- rainbow(30, end = 0.9)[(y.toy - min(y.toy))/diff(range(y.toy)) * 29 + 1]

dummy <- rep(1, 30)

## Display data
par(mfrow = c(2, 2))
matplot((0:100)/100, t(Xnl[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), 
  main = "Noiseless functions")
matplot((0:100)/100, t(X.toy[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), 
  main = "Observed functions")
matplot((0:100)/100, t(X.toy), type = "l", lty = 1, col = y.rainbow, xlab = "t", 
  ylab = "", main = "Rainbow plot")

## Obtain DTW distances
D.dtw <- dist(X.toy, method = "dtw", window.type = "sakoechiba", window.size = 5)

## Model data
toydata <- list(y.toy = y.toy, X.toy = X.toy, s = 1:101)

# Fit pco-based gam model(m1), fpco-based boosting model(m2), fpc-based boosting
# model(m3)
m1 <- gam(y.toy ~ s(dummy, bs = "pco", k = 15, xt = list(D = D.dtw)), method = "REML")

m2 <- FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = 15, window.type = "sakoechiba", 
  window.size = 5), timeformula = ~bols(1), data = toydata, control = boost_control(mstop = 200))

m3 <- FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)

## Model fitted values
fitteds = data.frame(y.toy = toydata$y.toy, fitted_pco = m1$fitted.values, fitted_bfpco = m2$fitted(), 
  fitted_bfpc = m3$fitted())

par(mfrow = c(2, 2))
obs_id = 1:length(toydata$y.toy)
# matplot(obs_id, preds, type = "l", lwd = 1.5, col = 1:4, main = "model fitted value")
# legend("topleft", legend = c("y.toy", "fitted_pco", "fitted_fpco", "fitted_fpc"),
#   col = 1:4, lwd = 1.5, cex = 0.6)
## FS: I got "Error in as.matrix(y) : object 'preds' not found",
## should be matplot(obs_id, fitteds, ...
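## Runnable version of the fix FS describes above (plotting the `fitteds` data
## frame instead of the undefined `preds`):
matplot(obs_id, fitteds, type = "l", lwd = 1.5, col = 1:4, main = "model fitted values")
legend("topleft", legend = c("y.toy", "fitted_pco", "fitted_fpco", "fitted_fpc"),
  col = 1:4, lwd = 1.5, cex = 0.6)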

## FS: don't really like this plot -- the x-axis is rather arbitrary; better to
## show agreement in a scatterplot matrix:
pairs(fitteds, panel = function(x, y) {
  points(x, y)
  abline(c(0, 1), col = rgb(0, 0, 0, 0.4))
})

## Average of square residual
c(resid_pco = mean(m1$residuals^2), resid_fpco = mean(m2$resid()^2), resid_fpc = mean(m3$resid()^2))

# Prediction for new data

## Case when new data have the same time index
newdd = list(X.toy = Xnl + matrix(rnorm(30 * 101, 0, 0.05), 30), s = 1:101)

newpred_m2 = predict(m2, newdata = newdd)
newpred_m3 = predict(m3, newdata = newdd)
newpreds = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred_m2, pred_fpc = newpred_m3)

matplot(obs_id, newpreds, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at identical time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, 
  cex = 0.7)

## Case when new data have a different time index
new_xtoy = data.frame()
for (i in 1:32) new_xtoy[1:nrow(X.toy), i] = rowMeans(X.toy[1:nrow(X.toy), (3 * i):(3 * 
  i + 2)])

new_s = vector()
for (i in 1:32) new_s[i] = mean(newdd$s[(3 * i):(3 * i + 2)])

newdd_2 = list(X.toy = as.matrix(new_xtoy), s = as.integer(new_s))

newpred2_m2 = predict(m2, newdata = newdd_2)
newpred2_m3 = predict(m3, newdata = newdd_2)
newpreds2 = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred2_m2, pred_fpc = newpred2_m3)

matplot(obs_id, newpreds2, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at different time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, 
  cex = 0.7)

# Model performance over number of principal coordinates
aveperf_fpco = sapply(1:15, FUN = function(i) {
  mean(FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = i, window.type = "sakoechiba", 
    window.size = 5), timeformula = ~bols(1), data = toydata, control = boost_control(mstop = 1000))$resid()^2)
})

aveperf_fpc = sapply(1:15, FUN = function(i) {
  ## FS: this does not vary the number of PCs ...
  ## FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)$resid()^2)
  m <- FDboost(y.toy ~ bfpc(X.toy, s = s, npc = i), timeformula = NULL, data = toydata)
  mean(m$resid()^2)
})

## FS: this is ok for a showcase here; for an honest evaluation you would need:
## - the test set error, not the training set error (training set error will
##   always decrease in <npc>)
## - to tune mstop for each <npc>, not set it to a fixed value
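## A minimal sketch of the second point (assumptions: cvrisk(), inherited from
## mboost, resamples cleanly with bfpco base-learners; 25 bootstrap folds, the
## cvrisk default). For each <npc>, mstop is tuned on out-of-bag risk instead
## of being fixed:
cv_err <- sapply(1:15, function(i) {
  m <- FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = i,
    window.type = "sakoechiba", window.size = 5), timeformula = ~bols(1),
    data = toydata, control = boost_control(mstop = 500))
  cvr <- cvrisk(m, grid = 1:500)  # out-of-bag empirical risk per candidate mstop
  min(colMeans(cvr))              # risk at the tuned mstop for this npc
})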

gcvdata <- data.frame(aveperf_fpco, aveperf_fpc)
matplot(1:15, gcvdata, type = "b", main = "model performance over number of pc/pco", 
  xlab = "number of pc/pco", ylab = "mean squared residual (training)", pch = c(17, 15))
ailinweili commented 7 years ago

Hi Fabian! I have updated the examples according to your last comment.