Closed spsanderson closed 2 years ago
Start of function
#' Time Series Feature Clustering
#'
#' @family Clustering
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will return a list object output. The function itself
#' requires that a time series tibble/data.frame get passed to it, along with
#' the `.date_col`, the `.value_col` and a period of data. It uses the underlying
#' function `timetk::tk_tsfeatures()` and takes the output of that and performs
#' a clustering analysis using the `K-Means` algorithm.
#'
#' The function has a parameter of `.features` which can take any of the features
#' listed in the `tsfeatures` package by Rob Hyndman. You can also create custom
#' functions in the `.GlobalEnviron` and it will take them as quoted arguments.
#'
#' So you can make a function as follows
#'
#' my_mean <- function(x){
#' ret <- mean(x, na.rm = TRUE)
#' return(ret)
#' }
#'
#' You can then call this by using `.features = c("my_mean")`.
#'
#' The output of this function includes the following:
#'
#' __Data Section__
#' - ts_feature_tbl
#' - user_item_matrix_tbl
#' - mapped_tbl
#' - scree_data_tbl
#'
#' __Plots__
#' - static_plot
#' - plotly_plot
#'
#' @seealso \url{"https://pkg.robjhyndman.com/tsfeatures/"}
#'
#' @description This function returns an output list of data and plots that
#' come from using the `K-Means` clustering algorithm on a time series data.
#'
#' @param .data The data passed must be a `data.frame/tibble` only.
#' @param .date_col The date column.
#' @param .value_col The column that holds the value of the time series where you
#' want the features and clustering performed on.
#' @param .features This is a quoted string vector using c() of features that you
#' would like to pass. You can pass any feature you make or those from the `tsfeatures`
#' package.
#' @param .scale If TRUE, time series are scaled to mean 0 and sd 1 before features are computed
#' @param .prefix A prefix to prefix the feature columns. Default: "ts_"
#' @param .centers An integer of how many different centers you would like to generate.
#' The default is 3.
#' @param ... This is where you can place grouping variables that are passed off
#' to `dplyr::group_by()`
#'
#' @examples
#' data_tbl <- ts_to_tbl(AirPassengers) %>%
#' mutate(group_id = rep(1:12, 12))
#'
#' ts_feature_cluster(
#' .data = data_tbl,
#' .date_col = date_col,
#' .value_col = value,
#' group_id,
#' .features = c("acf_features","entropy"),
#' .scale = TRUE,
#' .prefix = "ts_",
#' .centers = 3
#' )
#'
#' @return A list output
#'
#' @export
#'
ts_feature_cluster <- function(.data, .date_col, .value_col, ...,
.features = c("frequency","entropy","acf_features"),
.scale = TRUE, .prefix = "ts_", .centers = 3){
# Tidyeval ----
date_col_var_expr <- rlang::enquo(.date_col)
value_col_var_expr <- rlang::enquo(.value_col)
centers <- as.numeric(.centers)
features <- as.character(.features)
scale <- as.logical(.scale)
prefix <- as.character(.prefix)
groups <- rlang::enquos(...)
# Checks ----
# Is .data a time-series object or data.frame/tibble?
if(inherits(.data, "ts") || inherits(.data, "mts") ||
inherits(.data, "xts") || inherits(.data, "zoo")){
rlang::abort(
message = paste0(
"You must pass a 'tibble/data.frame' to this function. You supplied
data of the class: ", class(.data)
),
use_cli_format = TRUE
)
} else if (is.data.frame(.data)){
data_tbl <- tibble::as_tibble(.data)
} else {
rlang::abort(
message = "You did not pass a 'tibble/data.frame'.",
use_cli_format = TRUE
)
}
if (rlang::quo_is_missing(date_col_var_expr)){
rlang::abort(
message = "You must supply '.date_col'.",
use_cli_format = TRUE
)
}
if (rlang::quo_is_missing(value_col_var_expr)){
rlang::abort(
message = "You must supply '.value_col'."
)
}
if (!is.logical(scale)){
rlang::abort(
message = "The parameter of '.scale' must be a logical TRUE/FALSE.",
use_cli_format = TRUE
)
}
if (!is.character(prefix)){
rlang::abort(
message = "The parameter of '.prefix' must be a character.",
use_cli_format = TRUE
)
}
# * Data ----
data_feature_tbl <- data_tbl %>%
dplyr::group_by(!!!groups) %>%
timetk::tk_tsfeatures(
.date_var = {{ date_col_var_expr }},
.value = {{ value_col_var_expr }},
.features = c(features),
.scale = scale,
.prefix = prefix
) %>%
dplyr::ungroup()
# Zero Variance function -- possible export on its own
remove_zero_variance_columns <- function(df){
df[, !sapply(df, function(x) min(x) == max(x))]
}
# * Remove zero var cols ----
data_feature_tbl <- data_feature_tbl[ , colSums(is.na(data_feature_tbl)) < nrow(data_feature_tbl)]
data_feature_tbl <- remove_zero_variance_columns(data_feature_tbl)
# * User Item Matrix ----
ui_tbl <- data_feature_tbl
# * Kmeans Mapped Tibble ----
kmm_tbl <- healthyR::kmeans_mapped_tbl(ui_tbl, .centers = centers)
# * Scree data and plot
kmm_scree_data <- healthyR::kmeans_scree_data_tbl(kmm_tbl)
scree_plt <- healthyR::kmeans_scree_plt(kmm_tbl)
scree_plotly_plt <- plotly::ggplotly(scree_plt)
# * Return ----
output <- list(
data = list(
ts_feature_tbl = data_feature_tbl,
user_item_matrix_tbl = ui_tbl,
mapped_tbl = kmm_tbl,
scree_data_tbl = kmm_scree_data
),
plots = list(
static_plot = scree_plt,
plotly_plot = scree_plotly_plt
)
)
attr(output, "output_type") <- "ts_feature_cluster"
return(output)
}
https://business-science.github.io/timetk/articles/TK09_Clustering.html