ailinweili / FDboost

Boosting Functional Regression Models. The current release version can be found on CRAN (http://cran.r-project.org/package=FDboost).

Usage examples: bfpco-base FDboost #1

Open ailinweili opened 7 years ago

ailinweili commented 7 years ago

library(FDboost)  # provides the bfpco/bfpc base-learners
library(mgcv)
library(dtw)

Generate data: the toy dataset analyzed by Phillip (2017) is used

Xnl <- matrix(0, 30, 101)
set.seed(813)
tt <- sort(sample(1:90, 30))
for (i in 1:30) {
  Xnl[i, tt[i]:(tt[i] + 4)] <- -1
  Xnl[i, (tt[i] + 5):(tt[i] + 9)] <- 1
}
X.toy <- Xnl + matrix(rnorm(30 * 101, , 0.05), 30)
colnames(X.toy) <- paste("X", 1:101, sep = "")
y.toy <- tt + rnorm(30, 0.05)
y.rainbow <- rainbow(30, end = 0.9)[(y.toy - min(y.toy)) / diff(range(y.toy)) * 29 + 1]

dummy <- rep(1, 30) # dummy covariate for the pco smooth

Display data

par(mfrow = c(2, 2))
matplot((0:100)/100, t(Xnl[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), main = "Noiseless functions")
matplot((0:100)/100, t(X.toy[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), main = "Observed functions")
matplot((0:100)/100, t(X.toy), type = "l", lty = 1, col = y.rainbow, xlab = "t", ylab = "", main = "Rainbow plot")

Obtain DTW distances

D.dtw <- dist(X.toy, method="dtw", window.type="sakoechiba", window.size=5)

Model data

toydata <- list(y.toy = y.toy, X.toy = X.toy, s = 1:101)

Fit the pco-based gam model (m1), the fpco-based boosting model (m2), and the fpc-based boosting model (m3)

m1 <- gam(y.toy ~ s(dummy, bs="pco", k=15, xt=list(D=D.dtw)), method="REML")

m2 <- FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = 15, window.type="sakoechiba", window.size=5), timeformula = ~ bols(1), data = toydata, control = boost_control(mstop = 200))

m3 <- FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)

Model fitted values

fitteds = data.frame(y.toy = toydata$y.toy, fitted_pco = m1$fitted.values, fitted_bfpco = m2$fitted(), fitted_bfpc = m3$fitted())

par(mfrow = c(2, 2))
obs_id = 1:length(toydata$y.toy)
matplot(obs_id, preds, type = "l", lwd = 1.5, col = 1:4, main = "model fitted value")
legend("topleft", legend = c("y.toy", "fitted_pco", "fitted_fpco", "fitted_fpc"), col = 1:4, lwd = 1.5, cex = 0.6)

Average squared residuals

c(resid_pco = mean(m1$residuals^2), resid_fpco = mean(m2$resid()^2), resid_fpc = mean(m3$resid()^2))

Prediction for new data

Case when new data have the same time index

newdd = list(X.toy = Xnl + matrix(rnorm(30*101, 0, 0.05), 30), s = 1:101)

newpred_m2 = predict(m2, newdata = newdd)
newpred_m3 = predict(m3, newdata = newdd)
newpreds = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred_m2, pred_fpc = newpred_m3)

matplot(obs_id, newpreds, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at identical time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, cex = 0.7)

Case when new data have a different time index

new_xtoy = data.frame()
for (i in 1:32) new_xtoy[1:nrow(X.toy), i] = rowMeans(X.toy[1:nrow(X.toy), (3 * i):(3 * i + 2)])

new_s = vector()
for (i in 1:32) new_s[i] = mean(newdd$s[(3 * i):(3 * i + 2)])

newdd_2 = list(X.toy = as.matrix(new_xtoy), s = as.integer(new_s))

newpred2_m2 = predict(m2, newdata = newdd_2)
newpred2_m3 = predict(m3, newdata = newdd_2)
newpreds2 = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred2_m2, pred_fpc = newpred2_m3)

matplot(obs_id, newpreds2, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at different time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, cex = 0.7)

Model performance over the number of principal coordinates

Due to the limited data size, only the training error is computed and displayed.

aveperf_fpco = sapply(1:15, FUN = function(i) {
  mean(FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = i, window.type = "sakoechiba", window.size = 5),
    timeformula = ~ bols(1), data = toydata, control = boost_control(mstop = 1000))$resid()^2)
})

aveperf_fpc = sapply(1:15, FUN = function(i) {
  mean(FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)$resid()^2)
})

gcvdata <- data.frame(aveperf_fpco, aveperf_fpc)
matplot(1:15, gcvdata, type = "b", main = "model performance over number of pc/pco", xlab = "number of pc/pco", ylab = "mean squared residual (training)", pch = c(17, 15))

fabian-s commented 7 years ago

Great!

ailinweili commented 7 years ago

This code can be found in the example of the bfpco function (we will remove it later on).

fabian-s commented 7 years ago

good job, Weili! I've added some comments/modifications below

devtools::load_all(".")  # load the development version of FDboost

library(mgcv)
library(dtw)

# Generate data: the toy dataset analyzed by Phillip (2017) is used
Xnl <- matrix(0, 30, 101)
set.seed(813)
tt <- sort(sample(1:90, 30))
for (i in 1:30) {
  Xnl[i, tt[i]:(tt[i] + 4)] <- -1
  Xnl[i, (tt[i] + 5):(tt[i] + 9)] <- 1
}
X.toy <- Xnl + matrix(rnorm(30 * 101, , 0.05), 30)
colnames(X.toy) <- paste("X", 1:101, sep = "")
y.toy <- tt + rnorm(30, 0.05)
y.rainbow <- rainbow(30, end = 0.9)[(y.toy - min(y.toy))/diff(range(y.toy)) * 29 + 1]

dummy <- rep(1, 30)

## Display data
par(mfrow = c(2, 2))
matplot((0:100)/100, t(Xnl[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), 
  main = "Noiseless functions")
matplot((0:100)/100, t(X.toy[c(4, 25), ]), type = "l", xlab = "t", ylab = "", ylim = range(X.toy), 
  main = "Observed functions")
matplot((0:100)/100, t(X.toy), type = "l", lty = 1, col = y.rainbow, xlab = "t", 
  ylab = "", main = "Rainbow plot")

## Obtain DTW distances
D.dtw <- dist(X.toy, method = "dtw", window.type = "sakoechiba", window.size = 5)

## Model data
toydata <- list(y.toy = y.toy, X.toy = X.toy, s = 1:101)

# Fit pco-based gam model(m1), fpco-based boosting model(m2), fpc-based boosting
# model(m3)
m1 <- gam(y.toy ~ s(dummy, bs = "pco", k = 15, xt = list(D = D.dtw)), method = "REML")

m2 <- FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = 15, window.type = "sakoechiba", 
  window.size = 5), timeformula = ~bols(1), data = toydata, control = boost_control(mstop = 200))

m3 <- FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)

## Model fitted values
fitteds = data.frame(y.toy = toydata$y.toy, fitted_pco = m1$fitted.values, fitted_bfpco = m2$fitted(), 
  fitted_bfpc = m3$fitted())

par(mfrow = c(2, 2))
obs_id = 1:length(toydata$y.toy)
# matplot(obs_id, preds, type = "l", lwd = 1.5, col = 1:4, main = "model fitted value")
# legend("topleft", legend = c("y.toy", "fitted_pco", "fitted_fpco", "fitted_fpc"),
#   col = 1:4, lwd = 1.5, cex = 0.6)
## FS: I got "Error in as.matrix(y) : object 'preds' not found",
## should be matplot(obs_id, fitteds, ...
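## Runnable version of the fix FS describes above (plotting the `fitteds` data
## frame instead of the undefined `preds`):
matplot(obs_id, fitteds, type = "l", lwd = 1.5, col = 1:4, main = "model fitted values")
legend("topleft", legend = c("y.toy", "fitted_pco", "fitted_fpco", "fitted_fpc"),
  col = 1:4, lwd = 1.5, cex = 0.6)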

## FS: don't really like this plot -- the x-axis is rather arbitrary; better to
## show agreement in a scatterplot matrix:
pairs(fitteds, panel = function(x, y) {
  points(x, y)
  abline(c(0, 1), col = rgb(0, 0, 0, 0.4))
})

## Average of square residual
c(resid_pco = mean(m1$residuals^2), resid_fpco = mean(m2$resid()^2), resid_fpc = mean(m3$resid()^2))

# Prediction for new data

## Case when new data have the same time index
newdd = list(X.toy = Xnl + matrix(rnorm(30 * 101, 0, 0.05), 30), s = 1:101)

newpred_m2 = predict(m2, newdata = newdd)
newpred_m3 = predict(m3, newdata = newdd)
newpreds = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred_m2, pred_fpc = newpred_m3)

matplot(obs_id, newpreds, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at identical time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, 
  cex = 0.7)

## Case when new data have a different time index
new_xtoy = data.frame()
for (i in 1:32) new_xtoy[1:nrow(X.toy), i] = rowMeans(X.toy[1:nrow(X.toy), (3 * i):(3 * 
  i + 2)])

new_s = vector()
for (i in 1:32) new_s[i] = mean(newdd$s[(3 * i):(3 * i + 2)])

newdd_2 = list(X.toy = as.matrix(new_xtoy), s = as.integer(new_s))

newpred2_m2 = predict(m2, newdata = newdd_2)
newpred2_m3 = predict(m3, newdata = newdd_2)
newpreds2 = data.frame(y.toy = toydata$y.toy, pred_fpco = newpred2_m2, pred_fpc = newpred2_m3)

matplot(obs_id, newpreds2, type = "l", lwd = 1.5, col = 1:4, main = "prediction for new data at different time grids")
legend("topleft", legend = c("y.toy", "pred_fpco", "pred_fpc"), col = 1:4, lwd = 1.5, 
  cex = 0.7)

# Model performance over number of principal coordinates
aveperf_fpco = sapply(1:15, FUN = function(i) {
  mean(FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = i, window.type = "sakoechiba", 
    window.size = 5), timeformula = ~bols(1), data = toydata, control = boost_control(mstop = 1000))$resid()^2)
})

aveperf_fpc = sapply(1:15, FUN = function(i) {
  ## FS: this does not vary the number of PCs ...
  ## FDboost(y.toy ~ bfpc(X.toy, s = s), timeformula = NULL, data = toydata)$resid()^2)
  m <- FDboost(y.toy ~ bfpc(X.toy, s = s, npc = i), timeformula = NULL, data = toydata)
  mean(m$resid()^2)
})

## FS: this is ok for a showcase here; for an honest evaluation you would need:
## - the test set error, not the training set error (training set error will
##   always decrease in <npc>)
## - to tune mstop for each <npc>, not set it to a fixed value
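## A minimal sketch of the second point (assumptions: cvrisk(), inherited from
## mboost, resamples cleanly with bfpco base-learners; 25 bootstrap folds, the
## cvrisk default). For each <npc>, mstop is tuned on out-of-bag risk instead
## of being fixed:
cv_err <- sapply(1:15, function(i) {
  m <- FDboost(y.toy ~ bfpco(X.toy, s = s, distType = "dtw", npc = i,
    window.type = "sakoechiba", window.size = 5), timeformula = ~bols(1),
    data = toydata, control = boost_control(mstop = 500))
  cvr <- cvrisk(m, grid = 1:500)  # out-of-bag empirical risk per candidate mstop
  min(colMeans(cvr))              # risk at the tuned mstop for this npc
})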

gcvdata <- data.frame(aveperf_fpco, aveperf_fpc)
matplot(1:15, gcvdata, type = "b", main = "model performance over number of pc/pco", 
  xlab = "number of pc/pco", ylab = "mean squared residual (training)", pch = c(17, 15))
ailinweili commented 7 years ago

Hi Fabian! I have updated the examples according to your last comment.