iangow / han_rm


Collect data for ML exercise #1

Open iangow opened 2 weeks ago

iangow commented 2 weeks ago

See here for starter code.

iangow commented 2 weeks ago

Here is some code. I chose indfmt == "INDL", datafmt == "HIST_STD", consol == "C", popsrc == "I" based on what is common in the data. I used filter(fic == "AUS") to pick out Australian companies. Some of the fields differ from the starter code:

library(tidyverse)
library(DBI)
library(farr)
library(furrr)
library(rpart)
library(glmnet)

db <- dbConnect(duckdb::duckdb())

g_funda <- load_parquet(db, schema = "comp", table = "g_funda")
g_company <- load_parquet(db, schema = "comp", table = "g_company")

g_funda |> count(indfmt, datafmt, consol, popsrc)
g_company |> count(fic) |> arrange(desc(n))

X_vars <- c('act', 'ap', 'at', 'ceq', 'che', 'cogs', 'cshoi', 'dlc', 
            'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst', 
            'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale', 
            'sstk', 'txp', 'txt', 'xint')

y_var <- "misstate"

aus_cos <- 
  g_company |>
  filter(fic == "AUS") |>
  select(gvkey)

features_all <-
  g_funda |>
  semi_join(aus_cos, by = "gvkey") |>
  mutate(ni = ib + xi) |>
  filter(indfmt == "INDL", datafmt == "HIST_STD",
         consol == "C", popsrc == "I") |>
  mutate(across(c(ivao, pstk, ivst, txp), 
                \(x) coalesce(x, 0))) |>
  select(gvkey, datadate, fyear, all_of(X_vars))

features <-
  features_all |>
  collect() 
features
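
Before modelling, it might be worth checking how complete these fields are, since coverage varies a lot by item. A minimal sketch (pivot_longer comes from tidyr, which library(tidyverse) already attaches):

# Share of missing values in each candidate feature, worst first
features |>
  summarise(across(all_of(X_vars), \(x) mean(is.na(x)))) |>
  pivot_longer(everything(), names_to = "var", values_to = "prop_na") |>
  arrange(desc(prop_na))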
iangow commented 2 weeks ago

Here's a "chart of accounts" of sorts:

Fundamentals Annual.pdf

iangow commented 2 weeks ago

I think you can get this in PDF form through the UniMelb library.

HanJiang-c commented 5 days ago

Hi Ian, this is what I've done so far.

I noticed that ni will be NA using the code you provided, since all xi values are missing. As a result, I used ib as the y variable instead.

Additionally, I've dropped dltis and sstk due to a large number of missing values (for one of them, every value is NA).

Given that income is a continuous variable, a logistic model might not be suitable here. However, I'm not sure if I've done the right thing with the tree below.

I'm also unsure how to assess performance for a continuous outcome such as ib. Should we use mean squared error?

db <- dbConnect(duckdb::duckdb())

g_funda <- load_parquet(db, schema = "comp", table = "g_funda")
g_company <- load_parquet(db, schema = "comp", table = "g_company")

g_funda |> count(indfmt, datafmt, consol, popsrc)
g_company |> count(fic) |> arrange(desc(n))

X_vars <- c('act', 'ap', 'at', 'ceq', 'che', 'cogs', 'cshoi', 'dlc',
            'dltt', 'dp', 'invt', 'ivao', 'ivst', 'lct', 'lt',
            'ppegt', 'pstk', 're', 'rect', 'sale', 'txp', 'txt', 'xint')

y_var <- "ib"

aus_cos <- 
  g_company |>
  filter(fic == "AUS") |>
  select(gvkey)

features_all <-
  g_funda |>
  semi_join(aus_cos, by = "gvkey") |>
  filter(indfmt == "INDL", datafmt == "HIST_STD",
         consol == "C", popsrc == "I") |>
  mutate(across(c(ivao, pstk, ivst, txp), 
                \(x) coalesce(x, 0))) |>
  select(gvkey, datadate, fyear, ib, all_of(X_vars))

features <-
  features_all |>
  collect() |>
  na.omit()

data_train <- 
  features |>
  filter(fyear >= 2012 & fyear <= 2017)

formula <- as.formula(str_c(y_var, " ~ ", str_c(X_vars, collapse = " + ")))

fm2 <- rpart(formula, data = data_train, 
             control = rpart.control(cp = 0.001, minbucket = 5))

plot(fm2)
text(fm2)
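
On the last question: for a continuous outcome like ib, mean squared error (or its square root, RMSE) is a standard choice. A minimal in-sample sketch for the tree above (a proper evaluation would use held-out data rather than data_train):

# rpart returns a numeric vector of predictions for a continuous outcome
predicted <- predict(fm2, newdata = data_train)
mean((data_train$ib - predicted)^2)        # MSE
sqrt(mean((data_train$ib - predicted)^2))  # RMSE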
iangow commented 5 days ago

You could use coalesce(xi, 0) in place of xi.
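
That is, the mutate() line would become:

mutate(ni = ib + coalesce(xi, 0))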

HanJiang-c commented 5 days ago

Hi Ian, thanks, that works.

I've also tried the lasso model below and used MSE and R-squared. The lambda plot looks a little bit weird. Could you please help me check whether there's any issue?

n_folds <- 5
folds <- 1:n_folds

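# Assign folds at the firm (gvkey) level so that all years of a firm
# fall in the same fold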
sample_splits <- 
  data_train |>
  select(gvkey) |>
  distinct() |>
  mutate(fold = sample(folds, n(), replace = TRUE))

dft <- data_train |>
  inner_join(sample_splits, by = "gvkey")

fm_lasso_cv <- cv.glmnet(x = as.matrix(dft |> select(all_of(X_vars))),
                         y = dft[[y_var]],
                         alpha = 1,
                         type.measure = "mse",
                         foldid = dft[["fold"]],
                         keep = TRUE)

tibble(lambda = fm_lasso_cv$lambda, mse = fm_lasso_cv$cvm) |> 
  ggplot(aes(x = lambda, y = mse)) +
  geom_line()

idmin <- match(fm_lasso_cv$lambda.min, fm_lasso_cv$lambda)

fit_lasso_cv <- 
  dft |>
  select(ni) |>
  mutate(predicted_ni = fm_lasso_cv$fit.preval[, idmin])

mse <- mean((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2)
r_squared <- 1 - sum((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2) /
  sum((fit_lasso_cv$ni - mean(fit_lasso_cv$ni))^2)
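
For what it's worth, glmnet fits lambda on a log-spaced grid, so the cross-validation curve is much easier to read on a log scale; the package's built-in plot method does this already:

# Built-in diagnostic: CV error against log(lambda)
plot(fm_lasso_cv)

# ggplot equivalent on a log scale
tibble(lambda = fm_lasso_cv$lambda, mse = fm_lasso_cv$cvm) |>
  ggplot(aes(x = lambda, y = mse)) +
  geom_line() +
  scale_x_log10()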
HanJiang-c commented 5 days ago

Hi Ian,

I'm not sure if I've set up the functions correctly, but I couldn't get the last chunk of code running to produce rusboost_results:

# Define fit_rus_model function
fit_rus_model <- function(df, formula, size = 30, rus = TRUE, learn_rate = 1,
                          maxdepth = NULL, minbucket = NULL,
                          ir = 1) {
  # Start from the default controls so control is always defined, then
  # override only the settings that were supplied
  control <- rpart.control()
  if (!is.null(maxdepth)) control$maxdepth <- maxdepth
  if (!is.null(minbucket)) control$minbucket <- minbucket

  fm <- rusboost(formula, df, size = size, ir = ir, learn_rate = learn_rate,
                 rus = rus, control = control)
  return(fm)
}

# Define rus_predict function
rus_predict <- function(fold, ...) {
  dft <- 
    data_train |>
    inner_join(sample_splits, by = "gvkey")

  fm <- 
    dft |>
    filter(fold != !!fold) |>
    # Pass the model formula explicitly; otherwise fit_rus_model's
    # formula argument is missing
    fit_rus_model(formula = formula, ...)

  res <- 
    dft |>
    filter(fold == !!fold) |>
    mutate(predicted_ni = predict(fm, newdata = pick(everything()))$predictions) |>
    select(gvkey, fyear, predicted_ni)

  return(res)
}

# Define get_rmse function
get_rmse <- function(...) {
  set.seed(2021)
  rus_fit <- 
    future_map(folds, rus_predict,
               .options = furrr_options(seed = 2021), ...) |>
    list_rbind() |>
    inner_join(data_train, by = c("gvkey", "fyear")) |>
    select(gvkey, fyear, predicted_ni, ni)

  rmse_val <- sqrt(mean((rus_fit$ni - rus_fit$predicted_ni)^2, na.rm = TRUE))

  return(rmse_val)
}

# Define the parameter grid
params <- expand_grid(
  size = c(30, 50),
  learn_rate = c(0.1, 0.5, 1),
  maxdepth = c(3, 5, 7)
)

# Set up parallel processing
plan(multisession)

# Calculate RMSE for each parameter combination
rusboost_results <- params |>
  mutate(rmse = pmap_dbl(params, get_rmse)) |>
  arrange(rmse)
iangow commented 2 days ago

OK. Maybe supply the complete code and I can take a look. You probably want to dial down the metaparameters (e.g., not too many trees) so that the code runs fairly quickly while you're testing it.
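
For example, something like this (a hypothetical smoke test using the get_rmse defined above) checks that one small model runs end-to-end before launching the full grid:

# Run sequentially with tiny settings first; switch to plan(multisession)
# only once a single call works
plan(sequential)
get_rmse(size = 5, learn_rate = 1, maxdepth = 3)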

HanJiang-c commented 2 days ago

Hi Ian,

Please find the complete code below.

library(tidyverse)
library(DBI)
library(farr)
library(furrr)
library(rpart)
library(glmnet)

db <- dbConnect(duckdb::duckdb())

g_funda <- load_parquet(db, schema = "comp", table = "g_funda")
g_company <- load_parquet(db, schema = "comp", table = "g_company")

g_funda |> count(indfmt, datafmt, consol, popsrc)
g_company |> count(fic) |> arrange(desc(n))

X_vars <- c('act', 'ap', 'at', 'ceq', 'che', 'cogs', 'cshoi', 'dlc',
            'dltt', 'dp', 'invt', 'ivao', 'ivst',
            'lct', 'lt', 'ppegt', 'pstk', 're', 'rect', 'sale',
            'txp', 'txt', 'xint')

y_var <- "ni"

aus_cos <- 
  g_company |>
  filter(fic == "AUS") |>
  select(gvkey)

features_all <-
  g_funda |>
  semi_join(aus_cos, by = "gvkey") |>
  mutate(ni = ib + coalesce(xi, 0)) |>
  filter(indfmt == "INDL", datafmt == "HIST_STD",
         consol == "C", popsrc == "I") |>
  mutate(across(c(ivao, pstk, ivst, txp), 
                \(x) coalesce(x, 0))) |>
  select(gvkey, datadate, fyear, ni, all_of(X_vars))

features <-
  features_all |>
  collect() |>
  na.omit()
features

# dltis & sstk dropped due to unavailable data; ni excluded from X_vars
# since it is the y variable; pstk, ivao, ivst, and txp coalesced to 0 above

data_train <- 
  features |>
  filter(fyear >= 2012 & fyear <= 2017)

# Logistic regression is only for a binary outcome, so not appropriate here
formula <- as.formula(str_c(y_var, " ~ ", str_c(X_vars, collapse = " + ")))

fm2 <- rpart(formula, data = data_train, 
             control = rpart.control(cp = 0.001, minbucket = 5))
plot(fm2)
text(fm2)

within_sample <-
  data_train |>
  mutate(predicted_ni = predict(fm2, newdata = pick(everything())))

# ni is continuous, so in-sample fit is summarised with RMSE rather than
# a probability cutoff and confusion matrix
sqrt(mean((within_sample$ni - within_sample$predicted_ni)^2))

# Train Lasso regression model with cross-validation

n_folds <- 5
folds <- 1:n_folds

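# Assign folds at the firm (gvkey) level so that all years of a firm
# fall in the same fold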
sample_splits <- 
  data_train |>
  select(gvkey) |>
  distinct() |>
  mutate(fold = sample(folds, n(), replace = TRUE))

dft <- data_train |>
  inner_join(sample_splits, by = "gvkey")

fm_lasso_cv <- cv.glmnet(x = as.matrix(dft |> select(all_of(X_vars))),
                         y = dft[[y_var]],
                         alpha = 1,
                         type.measure = "mse",
                         foldid = dft[["fold"]],
                         keep = TRUE)

tibble(lambda = fm_lasso_cv$lambda, mse = fm_lasso_cv$cvm) |> 
  ggplot(aes(x = lambda, y = mse)) +
  geom_line()

idmin <- match(fm_lasso_cv$lambda.min, fm_lasso_cv$lambda)

fit_lasso_cv <- 
  dft |>
  select(ni) |>
  mutate(predicted_ni = fm_lasso_cv$fit.preval[, idmin])

mse <- mean((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2)
r_squared <- 1 - sum((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2) /
  sum((fit_lasso_cv$ni - mean(fit_lasso_cv$ni))^2)

#---------------------Rus-----------------
# Define fit_rus_model function
fit_rus_model <- function(df, formula, size = 30, rus = TRUE, learn_rate = 1,
                          maxdepth = NULL, minbucket = NULL,
                          ir = 1) {
  # Start from the default controls so control is always defined, then
  # override only the settings that were supplied
  control <- rpart.control()
  if (!is.null(maxdepth)) control$maxdepth <- maxdepth
  if (!is.null(minbucket)) control$minbucket <- minbucket

  fm <- rusboost(formula, df, size = size, ir = ir, learn_rate = learn_rate,
                 rus = rus, control = control)
  return(fm)
}

# Define rus_predict function
rus_predict <- function(fold, ...) {
  dft <- 
    data_train |>
    inner_join(sample_splits, by = "gvkey")

  fm <- 
    dft |>
    filter(fold != !!fold) |>
    # Pass the model formula explicitly; otherwise fit_rus_model's
    # formula argument is missing
    fit_rus_model(formula = formula, ...)

  res <- 
    dft |>
    filter(fold == !!fold) |>
    mutate(predicted_ni = predict(fm, newdata = pick(everything()))$predictions) |>
    select(gvkey, fyear, predicted_ni)

  return(res)
}

# Define get_rmse function
get_rmse <- function(...) {
  set.seed(2021)
  rus_fit <- 
    future_map(folds, rus_predict,
               .options = furrr_options(seed = 2021), ...) |>
    list_rbind() |>
    inner_join(data_train, by = c("gvkey", "fyear")) |>
    select(gvkey, fyear, predicted_ni, ni)

  rmse_val <- sqrt(mean((rus_fit$ni - rus_fit$predicted_ni)^2, na.rm = TRUE))

  return(rmse_val)
}

# Define the parameter grid
params <- expand_grid(
  size = c(10, 20),
  learn_rate = c(0.1, 0.5, 1),
  maxdepth = c(3, 5, 7)
)

# Set up parallel processing
plan(multisession)

# Calculate RMSE for each parameter combination
rusboost_results <- params |>
  mutate(rmse = pmap_dbl(params, get_rmse)) |>
  arrange(rmse)
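
Once this runs, the best combination is the first row of rusboost_results; slice_min makes that explicit:

# Best hyperparameter combination by cross-validated RMSE
rusboost_results |> slice_min(rmse, n = 1)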