Open iangow opened 2 weeks ago
Here is some code. I chose indfmt == "INDL", datafmt == "HIST_STD", consol == "C", popsrc == "I"
based on what is common in the data. I used filter(fic == "AUS")
to pick off Australian companies. Some of the fields are different:
cshoi
replaces csho
prcc_f
is absent from g_funda
. We could get that from g_secm
.ni
is missing. We covered that in the first class. I calculated that using the formula we discussed there.library(tidyverse)
library(DBI)
library(farr)
library(furrr)
library(rpart)
library(glmnet)
db <- dbConnect(duckdb::duckdb())
g_funda <- load_parquet(db, schema = "comp", table = "g_funda")
g_company <- load_parquet(db, schema = "comp", table = "g_company")
g_funda |> count(indfmt, datafmt, consol, popsrc)
g_company |> count(fic) |> arrange(desc(n))
X_vars <- c('act', 'ap', 'at', 'ceq', 'che', 'cogs', 'cshoi', 'dlc',
'dltis', 'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst',
'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale',
'sstk', 'txp', 'txt', 'xint')
y_var <- "misstate"
aus_cos <-
g_company |>
filter(fic == "AUS") |>
select(gvkey)
features_all <-
g_funda |>
semi_join(aus_cos, by = "gvkey") |>
mutate(ni = ib + xi) |>
filter(indfmt == "INDL", datafmt == "HIST_STD",
consol == "C", popsrc == "I") |>
mutate(across(c(ivao, pstk, ivst, txp),
\(x) coalesce(x, 0))) |>
select(gvkey, datadate, fyear, all_of(X_vars))
features <-
features_all |>
collect()
features
Here's a "chart of accounts" of sorts:
Hi Ian, this is what I've done so far.
I noticed that ni
will be NA using the code you provided since all xi
values are missing. As a result, I used ib
as y variable instead.
Additionally, I've dropped dltis
and sstk
due to a large number of missing values (even one of them all values are NA).
Given that income is a continuous variable, the logistic model might not be suitable here. However, I'm not sure if I've done the right thing using the classification tree below.
I'm unsure how to check continuous variables' performance such as ib
. Should we use mean squared error?
db <- dbConnect(duckdb::duckdb())
g_funda <- load_parquet(db, schema = "comp", table = "g_funda")
g_company <- load_parquet(db, schema = "comp", table = "g_company")
g_funda |> count(indfmt, datafmt, consol, popsrc)
g_company |> count(fic) |> arrange(desc(n))
X_vars <- c('act', 'ap', 'at', 'ceq', 'che', 'dlc', 'cogs','cshoi',
'dltt', 'dp','invt', 'ivao', 'ivst', 'pstk',
'lct', 'lt', 'ppegt', 're', 'rect', 'sale',
'txp', 'txt', 'xint')
y_var <- "ib"
aus_cos <-
g_company |>
filter(fic == "AUS") |>
select(gvkey)
features_all <-
g_funda |>
semi_join(aus_cos, by = "gvkey") |>
filter(indfmt == "INDL", datafmt == "HIST_STD",
consol == "C", popsrc == "I") |>
mutate(across(c(ivao, pstk, ivst, txp),
\(x) coalesce(x, 0))) |>
select(gvkey, datadate, fyear, ib, all_of(X_vars))
features <-
features_all |>
collect() |>
na.omit()
data_train <-
features |>
filter(fyear >= 2012 & fyear <= 2017)
formula <- str_c(y_var, " ~ ", str_c(X_vars, collapse = " + "))
fm2 <- rpart(formula, data = data_train,
control = rpart.control(cp = 0.001, minbucket = 5))
plot(fm2)
text(fm2)
You could use coalesce(xi, 0)
in place of xi
.
Hi Ian, thanks that works.
I've also tried the lasso model below and used MSE and R square. The lambda plot is a little bit wired. Could you please help me to check if there's any issue?
n_folds <- 5
folds <- 1:n_folds
sample_splits <-
data_train |>
select(gvkey) |>
distinct() |>
mutate(fold = sample(folds, nrow(pick(everything())), replace = TRUE))
dft <- data_train |>
inner_join(sample_splits, by = "gvkey")
fm_lasso_cv <- cv.glmnet(x = as.matrix(dft |> select(all_of(X_vars))),
y = dft[[y_var]],
alpha = 1,
type.measure = "mse",
foldid = dft[["fold"]],
keep = TRUE)
tibble(lambda = fm_lasso_cv$lambda, auc = fm_lasso_cv$cvm) |>
ggplot(aes(x = lambda, y = auc)) +
geom_line()
idmin <- match(fm_lasso_cv$lambda.min, fm_lasso_cv$lambda)
fit_lasso_cv <-
dft |>
select(ni) |>
mutate(predicted_ni = fm_lasso_cv$fit.preval[, idmin])
mse <- mean((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2)
r_squared <- 1 - (sum((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2) / sum((fit_lasso_cv$ni - mean(fit_lasso_cv$ni))^2))
Hi Ian,
Not sure if I've done the right thing in setting up functions, but I couldn't get the last chunk of code running for the rusboost_results
:
# Define fit_rus_model function
fit_rus_model <- function(df, formula, size = 30, rus = TRUE, learn_rate = 1,
maxdepth = NULL, minbucket = NULL,
ir = 1) {
if (!is.null(maxdepth)) control <- rpart.control(maxdepth = maxdepth)
if (!is.null(minbucket)) control <- rpart.control(minbucket = minbucket)
fm <- rusboost(formula, df, size = size, ir = ir, learn_rate = learn_rate,
rus = rus, control = control)
return(fm)
}
# Define rus_predict function
rus_predict <- function(fold, ...) {
dft <-
data_train |>
inner_join(sample_splits, by = "gvkey")
fm <-
dft |>
filter(fold != !!fold) |>
fit_rus_model(...)
res <-
dft |>
filter(fold == !!fold) |>
mutate(predicted_ni = predict(fm, newdata = pick(everything()))$predictions) |>
select(gvkey, fyear, predicted_ni)
return(res)
}
# Define get_rmse function
get_rmse <- function(...) {
set.seed(2021)
rus_fit <-
future_map(rus_predict, .options = furrr_options(seed = 2021),
...) |>
list_rbind() |>
inner_join(data_train, by = c("gvkey", "fyear")) |>
select(gvkey, fyear, predicted_ni, ni)
rmse_val <- sqrt(mean((rus_fit$ni - rus_fit$predicted_ni)^2, na.rm = TRUE))
return(rmse_val)
}
# Define the parameter grid
params <- expand_grid(
size = c(30, 50),
learn_rate = c(0.1, 0.5, 1),
maxdepth = c(3, 5, 7)
)
# Set up parallel processing
plan(multisession)
# Calculate RMSE for each parameter combination
rusboost_results <- params |>
mutate(rmse = pmap_dbl(params,get_rmse)) |>
arrange(rmse)
OK. Maybe supply the complete code and I can take a look. You probably want to dial down the metaparameters (e.g., not too many trees) so that the code runs fairly quickly while you're testing it.
Hi Ian,
Please find the complete code below.
library(tidyverse)
library(DBI)
library(farr)
library(furrr)
library(rpart)
library(glmnet)
library(dplyr)
db <- dbConnect(duckdb::duckdb())
g_funda <- load_parquet(db, schema = "comp", table = "g_funda")
g_company <- load_parquet(db, schema = "comp", table = "g_company")
g_funda |> count(indfmt, datafmt, consol, popsrc)
g_company |> count(fic) |> arrange(desc(n))
X_vars <- c('act', 'ap', 'at', 'ceq', 'che', 'cogs', 'cshoi', 'dlc',
'dltt', 'dp', 'invt', 'ivao', 'ivst',
'lct', 'lt', 'ni', 'ppegt', 'pstk', 're', 'rect', 'sale',
'sstk', 'txp', 'txt', 'xint')
y_var <- "ni"
aus_cos <-
g_company |>
filter(fic == "AUS") |>
select(gvkey)
features_all <-
g_funda |>
semi_join(aus_cos, by = "gvkey") |>
mutate(ni = ib + coalesce(xi, 0)) |>
filter(indfmt == "INDL", datafmt == "HIST_STD",
consol == "C", popsrc == "I") |>
mutate(across(c(ivao, pstk, ivst, txp),
\(x) coalesce(x, 0))) |>
select(gvkey, datadate, fyear, all_of(X_vars))
features <-
features_all |>
collect() |>
na.omit()
features
#drop dltis & sstk due to unavailable data; pstk 0 value
data_train <-
features |>
filter(fyear >= 2012 & fyear <= 2017)
# Logistic only for binomial so not appropriate here
formula <- str_c(y_var, " ~ ", str_c(X_vars, collapse = " + "))
fm2 <- rpart(formula, data = data_train,
control = rpart.control(cp = 0.001, minbucket = 5))
plot(fm2)
text(fm2)
within_sample <-
data_train |>
mutate(prob = predict(fm2, newdata = pick(everything()))[, 2],
predicted = prob > 0.5)
table(predicted = within_sample$predicted,
response = within_sample$misstate)
# Train Lasso regression model with cross-validation
n_folds <- 5
folds <- 1:n_folds
sample_splits <-
data_train |>
select(gvkey) |>
distinct() |>
mutate(fold = sample(folds, nrow(pick(everything())), replace = TRUE))
dft <- data_train |>
inner_join(sample_splits, by = "gvkey")
fm_lasso_cv <- cv.glmnet(x = as.matrix(dft |> select(all_of(X_vars))),
y = dft[[y_var]],
alpha = 1,
type.measure = "mse",
foldid = dft[["fold"]],
keep = TRUE)
tibble(lambda = fm_lasso_cv$lambda, auc = fm_lasso_cv$cvm) |>
ggplot(aes(x = lambda, y = auc)) +
geom_line()
idmin <- match(fm_lasso_cv$lambda.min, fm_lasso_cv$lambda)
fit_lasso_cv <-
dft |>
select(ni) |>
mutate(predicted_ni = fm_lasso_cv$fit.preval[, idmin])
mse <- mean((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2)
r_squared <- 1 - (sum((fit_lasso_cv$ni - fit_lasso_cv$predicted_ni)^2) / sum((fit_lasso_cv$ni - mean(fit_lasso_cv$ni))^2))
#---------------------Rus-----------------
# Define fit_rus_model function
fit_rus_model <- function(df, formula, size = 30, rus = TRUE, learn_rate = 1,
maxdepth = NULL, minbucket = NULL,
ir = 1) {
if (!is.null(maxdepth)) control <- rpart.control(maxdepth = maxdepth)
if (!is.null(minbucket)) control <- rpart.control(minbucket = minbucket)
fm <- rusboost(formula, df, size = size, ir = ir, learn_rate = learn_rate,
rus = rus, control = control)
return(fm)
}
# Define rus_predict function
rus_predict <- function(fold, ...) {
dft <-
data_train |>
inner_join(sample_splits, by = "gvkey")
fm <-
dft |>
filter(fold != !!fold) |>
fit_rus_model(...)
res <-
dft |>
filter(fold == !!fold) |>
mutate(predicted_ni = predict(fm, newdata = pick(everything()))$predictions) |>
select(gvkey, fyear, predicted_ni)
return(res)
}
# Define get_rmse function
get_rmse <- function(...) {
set.seed(2021)
rus_fit <-
future_map(rus_predict, .options = furrr_options(seed = 2021),
...) |>
list_rbind() |>
inner_join(data_train, by = c("gvkey", "fyear")) |>
select(gvkey, fyear, predicted_ni, ni)
rmse_val <- sqrt(mean((rus_fit$ni - rus_fit$predicted_ni)^2, na.rm = TRUE))
return(rmse_val)
}
# Define the parameter grid
params <- expand_grid(
size = c(10, 20),
learn_rate = c(0.1, 0.5, 1),
maxdepth = c(3, 5, 7)
)
# Set up parallel processing
plan(multisession)
# Calculate RMSE for each parameter combination
rusboost_results <- params |>
mutate(rmse = pmap_dbl(params,get_rmse)) |>
arrange(rmse)
See here for starter code.