ScPoEcon / ScPoApps

Interactive apps for our econometrics course
Other
3 stars 4 forks source link

overfitting app #10

Closed floswald closed 3 years ago

floswald commented 4 years ago

i want an app that illustrates overfitting as we increase flexilibility of the fitting function.

The app should look like this here, but instaed of blue and green functions there should be the user-selected degree of freedom function. a slider selection df will make the function more or less wiggly.

Screenshot 2020-11-13 at 16 04 14

code

here is the function that makes the plot

library(ggplot2)
library(splines)
library(dplyr)

smes1 <- function(fun = function(x) {x*sin(x-2) + 0.2*x},n=90,eps = 1,df1=5, df2=40, ub = 5,nnew = 200){
    set.seed(1234)

    r = data.frame(x = seq(0,ub,length.out = n))
    r$truth = fun(r$x) 
    r$epsi = rnorm(n,mean = 0, sd = eps)
    r$y = r$truth + r$epsi
    # browser()
    mods = list()
    mods$lm = lm(y ~ x,data = r)
    mods$low = smooth.spline(r$x,r$y,df = df1)
    mods$hi = smooth.spline(r$x,r$y, df = df2)
    # mods$ns5 = gam::gam(formula = as.formula(paste0("y ~ s(x,",df1,")")),data = r)
    # mods$ns5 = mgcv::gam(formula = as.formula(paste0("y ~ ns(x,df = ",df1,")")),data = r)
    # mods$ns20 = gam::gam(formula = as.formula(paste0("y ~ ns(x,df = ",df2,")")),data = r)
    # mods$ns20 = mgcv::gam(formula = as.formula(paste0("y ~ ns(x,df = ",df2,")")),data = r)

    names(mods)[2:3] <- paste0("s",c(df1,df2))

    # mses and bias
    mses = list(
        train = colMeans(sapply(mods,residuals)^2)
    )

    # predictions
    preds = data.frame(x = seq(0,ub, length.out = nnew))
    preds = cbind(preds, lm = predict(mods$lm,newdata = preds))
    preds = cbind(preds,sapply(mods[2:3],function(x) predict(x,preds$x)$y))
    preds$truth = fun(preds$x) #+ rnorm(nnew,mean = 0, sd = eps)
    mnames = names(preds)[-1]
    preds$testdata = preds$truth + rnorm(nnew,mean = 0, sd = eps)

    # test mses
    mses$test <- colMeans((preds[,names(mods)] - preds[,"testdata"])^2)

    # bias
    mses$bias <- colMeans((preds[,names(mods)] - preds[,"truth"])^2)
    mses$var <- diag(var(preds[,names(mods)]))

    mp = reshape2::melt(preds[,-ncol(preds)], id.vars = "x")
    names(mp)[2] <- "model" 

    # make plots
    p1 = ggplot() + theme_bw()
    # points
    p1 = p1 + geom_point(data = r,aes(x = x, y = y),shape = 1,size = 2)
    p1 = p1 +  geom_line(data=mp, aes(x = x, y = value, color = model), size = 1)
    # 
    # color scale
    cnames = c("orange", "blue" , "green","black")
    names(cnames) <- mnames
    p1 = p1 + scale_color_manual(values = cnames)

    # flexilibty plot
    d2 = data.frame(flexibility = c(2,df1,df2),model = c("lm",paste0(df1," df"),paste0(df2," df")), train = mses$train, test = mses$test)
    d2 = reshape2::melt(d2, id.vars = c("flexibility","model"))
    names(d2)[3] <- "type"
    p2 = ggplot(data = d2, aes(x = flexibility, y = value,linetype = type)) + geom_path(color = "black") + scale_y_continuous(name = "MSE")
    cnames = c("orange", "blue" , "green")
    names(cnames) <- c("lm",paste0(df1," df"),paste0(df2," df"))
    p2 = p2 + geom_point(data = d2, aes(color = model), size = 3, shape = 15) + scale_color_manual(values = cnames) 
    # add minimal test MSE
    p2 = p2 + geom_hline(yintercept = eps^2, linetype = "dashed", color = "grey")+ theme_bw()
    cowplot::plot_grid(p1,p2)
    # list(p1,p2)
}

it's ugly and way too long so i started refactoring this into smaller pieces:

getmodels <- function(x,y,newx,dfs = 2:20){
    r = data.frame(x=x,y=y)
    o = data.frame(x=newx)
    s = list()
    # browser()
    for (i in 1:length(dfs)){
        if (dfs[i] == 2){
            s[[i]] <- lm(y~x,r)
            o = cbind(o, predict(s[[i]], newdata = o))
        } else {
            s[[i]] <- smooth.spline(x,y,df = dfs[i])
            o = cbind(o, predict(s[[i]], o$x)$y)
        }
    }
    names(o)[-c(1)] <- paste0("df",dfs)
    names(s) <- paste0("df",dfs)
    list(models = s, pred = o)
}

datafig2.12 <- function(fun = function(x) {x*sin(x-2) + 0.2*x},n=90,eps = 1,df1=5, df2=40, ub = 5,nnew = 200){
    set.seed(1234)

    r = data.frame(x = seq(0,ub,length.out = n))
    r$truth = fun(r$x) 
    r$epsi = rnorm(n,mean = 0, sd = eps)
    r$y = r$truth + r$epsi
    # browser()

    mods = getmodels(r$x,r$y,seq(0,ub, length.out = nnew))
    # add test data to predictions
    mods$pred$truth = fun(mods$pred$x)
    mods$pred$testdata = mods$pred$truth + rnorm(nnew,mean = 0, sd = eps)
    # mses and bias
    mses = list(
        train = colMeans(sapply(mods$models,residuals)^2)
    ) # test mses
    mses$test <- colMeans((mods$pred[,names(mods$models)] - mods$pred[,"testdata"])^2)

    # bias
    mses$bias <- colMeans((mods$pred[,names(mods$models)] - mods$pred[,"truth"])^2)
    mses$var <- diag(var(mods$pred[,names(mods$models)]))
    list(mods,mses)

}
x = datafig2.12()

plotfig2.12 <- function(d) {
    stopifnot(is.list(d))
    m = data.frame(d)
    m$x = 2:(nrow(m)+1)
    m = reshape2::melt(m,id.vars = "x")
    m %>% 
        rename(model = variable) %>%
        ggplot(aes(x=x,y = value, color = model)) + geom_point()

}
plot(plotfig2.12(x[[2]]))

task