spsanderson / healthyR.ts

A time-series companion package to healthyR
https://www.spsanderson.com/healthyR.ts/
Other
18 stars 3 forks source link

Lag and Correlation Function #315

Closed spsanderson closed 2 years ago

spsanderson commented 2 years ago

Make a function that does lagged correlation and outputs the lagged tibble, correlation matrix, correlation long tibble, lagged plot and correlation matrix heatmap

Example:

library(healthyR.ts)
library(dplyr)
library(ggplot2)
library(tidyr)
library(timetk)

df <- ts_to_tbl(AirPassengers) %>% select(-index)
lags <- c(1,3,6,12)

lagged_list <- lapply(seq_along(lags), function(i){
  tibble(
    lagged_value   = df$value %>% dplyr::lag(lags[i]),
    original_value = df$value,
    lag            = factor(lags[i])
  )
})

Gives:

> lagged_list
[[1]]
# A tibble: 144 x 3
   lagged_value original_value lag  
          <dbl>          <dbl> <fct>
 1           NA            112 1    
 2          112            118 1    
 3          118            132 1    
 4          132            129 1    
 5          129            121 1    
 6          121            135 1    
 7          135            148 1    
 8          148            148 1    
 9          148            136 1    
10          136            119 1    
# ... with 134 more rows

[[2]]
# A tibble: 144 x 3
   lagged_value original_value lag  
          <dbl>          <dbl> <fct>
 1           NA            112 3    
 2           NA            118 3    
 3           NA            132 3    
 4          112            129 3    
 5          118            121 3    
 6          132            135 3    
 7          129            148 3    
 8          121            148 3    
 9          135            136 3    
10          148            119 3    
# ... with 134 more rows

[[3]]
# A tibble: 144 x 3
   lagged_value original_value lag  
          <dbl>          <dbl> <fct>
 1           NA            112 6    
 2           NA            118 6    
 3           NA            132 6    
 4           NA            129 6    
 5           NA            121 6    
 6           NA            135 6    
 7          112            148 6    
 8          118            148 6    
 9          132            136 6    
10          129            119 6    
# ... with 134 more rows

[[4]]
# A tibble: 144 x 3
   lagged_value original_value lag  
          <dbl>          <dbl> <fct>
 1           NA            112 12   
 2           NA            118 12   
 3           NA            132 12   
 4           NA            129 12   
 5           NA            121 12   
 6           NA            135 12   
 7           NA            148 12   
 8           NA            148 12   
 9           NA            136 12   
10           NA            119 12   
# ... with 134 more rows

Lag Plot:

plt <- lagged_list %>%
  map_df(as_tibble) %>%
  ggplot(
    aes(x = original_value, y = lagged_value, color = lag)
  ) + 
  geom_point() + 
  facet_wrap(~ lag, scales = "free") +
  theme_minimal() 
plt

image

Correlation Matrix/Tibble:

lagged_cor_matrix <- df %>%
  tk_augment_lags(
    .value = value,
    .lags = lags
  ) %>%
  select(-date_col) %>%
  drop_na() %>%
  cor()

lct_names <- rownames(lagged_cor_matrix)

lagged_cor_tbl <- lagged_cor_matrix %>%
  as_tibble() %>%
  mutate(data_names = lct_names) %>%
  select(data_names, everything()) %>%
  pivot_longer(cols = - data_names)

Gives:

> lagged_cor_matrix
                value value_lag1 value_lag3 value_lag6 value_lag12
value       1.0000000  0.9542938  0.8186636  0.7657001   0.9905274
value_lag1  0.9542938  1.0000000  0.8828054  0.7726530   0.9492382
value_lag3  0.8186636  0.8828054  1.0000000  0.8349550   0.8218493
value_lag6  0.7657001  0.7726530  0.8349550  1.0000000   0.7780911
value_lag12 0.9905274  0.9492382  0.8218493  0.7780911   1.0000000

> lagged_cor_tbl
# A tibble: 25 x 3
   data_names name        value
   <chr>      <chr>       <dbl>
 1 value      value       1    
 2 value      value_lag1  0.954
 3 value      value_lag3  0.819
 4 value      value_lag6  0.766
 5 value      value_lag12 0.991
 6 value_lag1 value       0.954
 7 value_lag1 value_lag1  1    
 8 value_lag1 value_lag3  0.883
 9 value_lag1 value_lag6  0.773
10 value_lag1 value_lag12 0.949
# ... with 15 more rows

Correlation Heatmap:

lagged_cor_tbl %>%
  ggplot(aes(
    x = name, 
    y = data_names
  )) +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal()

image

spsanderson commented 2 years ago

Final Function:

ts_lag_correlation_tbl <- function(.data, .date_col, .value_col, .lags = 1,
                                   .heatmap_color_low = "white",
                                   .heatmap_color_hi = "steelblue"){

  # Tidyeval
  date_col_var_expr <- rlang::enquo(.date_col)
  value_col_var_expr <- rlang::enquo(.value_col)
  lags <- as.numeric(.lags)
  data_length <- nrow(.data)
  heatmap_low <- base::tolower(as.character(.heatmap_color_low))
  heatmap_hi <- base::tolower(as.character(.heatmap_color_hi))

  # Checks
  if (!is.data.frame(.data)){
    rlang::abort(
      message = "'.data' must be a data.frame/tibble.",
      use_cli_format = TRUE
    )
  }

  if (rlang::quo_is_missing(date_col_var_expr)){
    rlang::abort(
      message = "'.date_col' is required and must be a Date class.",
      use_cli_format = TRUE
    )
  }

  if (rlang::quo_is_missing(value_col_var_expr)){
    rlang::abort(
      message = "'.value_col' is required and must be numeric.",
      use_cli_format = TRUE
    )
  }

  # Data
  df <- dplyr::as_tibble(.data) %>%
    dplyr::select({{ date_col_var_expr }}, {{ value_col_var_expr }},
                  dplyr::everything()
    ) %>%
    dplyr::rename(value = {{ value_col_var_expr }})

  # Lagged Tibble List
  lagged_list <- lapply(seq_along(lags), function(i){
    dplyr::tibble(
      lag            = factor(lags[i]),
      original_value = df$value,
      lagged_value   = dplyr::lag(df$value, lags[i])
    ) %>%
      tidyr::drop_na() %>%
      dplyr::rename(
        {{value_col_var_expr}} := original_value
      )
  })

  # Lagged Tibble
  lagged_tibble <- purrr::map_df(lagged_list, dplyr::as_tibble) %>%
    dplyr::mutate(lag_title = paste0("Lag: ", lag) %>%
                    forcats::as_factor())

  # Lagged Correlation Matrix
  lagged_cor_matrix <- df %>%
    timetk::tk_augment_lags(
      .value = value,
      .lags = lags
    ) %>%
    dplyr::select(-date_col) %>%
    tidyr::drop_na() %>%
    stats::cor()

  # Lagged Correlation Long Tibble
  lct_names <- base::rownames(lagged_cor_matrix)

  lagged_cor_tbl <- lagged_cor_matrix %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(data_names = lct_names) %>%
    dplyr::select(data_names, dplyr::everything()) %>%
    tidyr::pivot_longer(cols = -data_names) %>%
    dplyr::mutate(name = forcats::as_factor(name)) %>%
    dplyr::mutate(data_names = forcats::as_factor(data_names)) %>%
    dplyr::select(name, data_names, dplyr::everything())

  # Plots ----
  # Lagged Plot
  plt <- lagged_tibble %>%
    ggplot2::ggplot(
      ggplot2::aes(
        x = {{ value_col_var_expr }}, 
        y = lagged_value, 
        color = lag
      )
    ) + 
    ggplot2::geom_point() + 
    ggplot2::facet_wrap(~ lag_title, scales = "free") +
    ggplot2::theme_minimal() +
    ggplot2::labs(
      x = "Original Value",
      y = "Lagged Value",
      color = "Lags"
    ) +
    ggplot2::theme(legend.position = "none")

  # Correlation Heatmap
  correlation_heatmap <- lagged_cor_tbl %>%
    ggplot2::ggplot(ggplot2::aes(
      x = name, 
      y = data_names
    )) +
    ggplot2::geom_tile(ggplot2::aes(fill = value), color = "white") +
    ggplot2::scale_fill_gradient(
      low = heatmap_low, 
      high = heatmap_hi
    ) +
    ggplot2::theme_minimal() +
    ggplot2::labs(
      x = "",
      y = "",
      fill = "Correlation"
    ) 

  # Return ----
  output <- list(
    data = list(
      lag_list = lagged_list,
      lag_tbl = lagged_tibble,
      correlation_lag_matrix = lagged_cor_matrix,
      correlation_lag_tbl = lagged_cor_tbl
    ),
    plots = list(
      lag_plot = plt,
      plotly_lag_plot = plotly::ggplotly(plt),
      correlation_heatmap = correlation_heatmap,
      plotly_heatmap = plotly::ggplotly(correlation_heatmap)
    )
  )

  attr(output, "input_data_length") <- data_length
  attr(output, ".lags") <- .lags

  return(output)
}