jlivsey / UB-sping24-time-series


Anju Anand #32

anjuanand6 opened this issue 9 months ago

anjuanand6 commented 8 months ago
# Load necessary libraries
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.3.2
#> Warning: package 'readr' was built under R version 4.3.2
library(reprex)
#> Warning: package 'reprex' was built under R version 4.3.2
library(forecast)
#> Warning: package 'forecast' was built under R version 4.3.2
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries)
#> Warning: package 'tseries' was built under R version 4.3.2
library(readr)
library(fredr)
#> Warning: package 'fredr' was built under R version 4.3.2
library(dplyr)
library(lubridate)
library(tsbox)
#> Warning: package 'tsbox' was built under R version 4.3.2
library(fpp3)
#> Warning: package 'fpp3' was built under R version 4.3.2
#> ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
#> ✔ tsibble     1.1.4     ✔ fable       0.3.3
#> ✔ tsibbledata 0.4.1     ✔ fabletools  0.3.4
#> ✔ feasts      0.3.1
#> Warning: package 'tsibble' was built under R version 4.3.2
#> Warning: package 'tsibbledata' was built under R version 4.3.2
#> Warning: package 'feasts' was built under R version 4.3.2
#> Warning: package 'fabletools' was built under R version 4.3.2
#> Warning: package 'fable' was built under R version 4.3.2
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date()    masks base::date()
#> ✖ dplyr::filter()      masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval()  masks lubridate::interval()
#> ✖ dplyr::lag()         masks stats::lag()
#> ✖ tsibble::setdiff()   masks base::setdiff()
#> ✖ tsibble::union()     masks base::union()
library(seasonal)
#> Warning: package 'seasonal' was built under R version 4.3.2
#> 
#> Attaching package: 'seasonal'
#> The following object is masked from 'package:tibble':
#> 
#>     view

# Set the API Key
fredr_set_key("6359e5121a13be3ece5d737e3c242a41")

# Load the ICNSA data
icnsa_data <- fredr(series_id = "ICNSA")

# Check the first few rows to ensure that the data is loaded correctly
head(icnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-22     2024-02-22  
#> 2 1967-01-14 ICNSA     334000 2024-02-22     2024-02-22  
#> 3 1967-01-21 ICNSA     277000 2024-02-22     2024-02-22  
#> 4 1967-01-28 ICNSA     252000 2024-02-22     2024-02-22  
#> 5 1967-02-04 ICNSA     274000 2024-02-22     2024-02-22  
#> 6 1967-02-11 ICNSA     276000 2024-02-22     2024-02-22

# Load the covariate data
wei_data <- fredr(series_id = "WEI")

# Check the first few rows to ensure that the data is loaded correctly
head(wei_data)
#> # A tibble: 6 × 5
#>   date       series_id value realtime_start realtime_end
#>   <date>     <chr>     <dbl> <date>         <date>      
#> 1 2008-01-05 WEI        1.8  2024-02-22     2024-02-22  
#> 2 2008-01-12 WEI        1.78 2024-02-22     2024-02-22  
#> 3 2008-01-19 WEI        1.75 2024-02-22     2024-02-22  
#> 4 2008-01-26 WEI        1.28 2024-02-22     2024-02-22  
#> 5 2008-02-02 WEI        0.99 2024-02-22     2024-02-22  
#> 6 2008-02-09 WEI        0.95 2024-02-22     2024-02-22

# Inspect the structure of the datasets
str(icnsa_data)
#> tibble [2,980 × 5] (S3: tbl_df/tbl/data.frame)
#>  $ date          : Date[1:2980], format: "1967-01-07" "1967-01-14" ...
#>  $ series_id     : chr [1:2980] "ICNSA" "ICNSA" "ICNSA" "ICNSA" ...
#>  $ value         : num [1:2980] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#>  $ realtime_start: Date[1:2980], format: "2024-02-22" "2024-02-22" ...
#>  $ realtime_end  : Date[1:2980], format: "2024-02-22" "2024-02-22" ...
str(wei_data)
#> tibble [841 × 5] (S3: tbl_df/tbl/data.frame)
#>  $ date          : Date[1:841], format: "2008-01-05" "2008-01-12" ...
#>  $ series_id     : chr [1:841] "WEI" "WEI" "WEI" "WEI" ...
#>  $ value         : num [1:841] 1.8 1.78 1.75 1.28 0.99 0.95 1.45 1.27 1.29 0.82 ...
#>  $ realtime_start: Date[1:841], format: "2024-02-22" "2024-02-22" ...
#>  $ realtime_end  : Date[1:841], format: "2024-02-22" "2024-02-22" ...

# Summary statistics of the datasets
summary(icnsa_data)
#>       date             series_id             value         realtime_start      
#>  Min.   :1967-01-07   Length:2980        Min.   : 133000   Min.   :2024-02-22  
#>  1st Qu.:1981-04-16   Class :character   1st Qu.: 265800   1st Qu.:2024-02-22  
#>  Median :1995-07-25   Mode  :character   Median : 326919   Median :2024-02-22  
#>  Mean   :1995-07-25                      Mean   : 365013   Mean   :2024-02-22  
#>  3rd Qu.:2009-11-01                      3rd Qu.: 406725   3rd Qu.:2024-02-22  
#>  Max.   :2024-02-10                      Max.   :6161268   Max.   :2024-02-22  
#>   realtime_end       
#>  Min.   :2024-02-22  
#>  1st Qu.:2024-02-22  
#>  Median :2024-02-22  
#>  Mean   :2024-02-22  
#>  3rd Qu.:2024-02-22  
#>  Max.   :2024-02-22
summary(wei_data)
#>       date             series_id             value        realtime_start      
#>  Min.   :2008-01-05   Length:841         Min.   :-8.100   Min.   :2024-02-22  
#>  1st Qu.:2012-01-14   Class :character   1st Qu.: 1.330   1st Qu.:2024-02-22  
#>  Median :2016-01-23   Mode  :character   Median : 1.990   Median :2024-02-22  
#>  Mean   :2016-01-23                      Mean   : 1.767   Mean   :2024-02-22  
#>  3rd Qu.:2020-02-01                      3rd Qu.: 2.510   3rd Qu.:2024-02-22  
#>  Max.   :2024-02-10                      Max.   :10.270   Max.   :2024-02-22  
#>   realtime_end       
#>  Min.   :2024-02-22  
#>  1st Qu.:2024-02-22  
#>  Median :2024-02-22  
#>  Mean   :2024-02-22  
#>  3rd Qu.:2024-02-22  
#>  Max.   :2024-02-22

# Ensure the date column is of Date type (fredr already returns Date values; this is a safeguard)
icnsa_data$date <- as.Date(icnsa_data$date)
wei_data$date <- as.Date(wei_data$date)

# Check for missing values in both data sets
sum(is.na(icnsa_data))
#> [1] 0
sum(is.na(wei_data))
#> [1] 0

# Check column names for both data frames
names(icnsa_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
names(wei_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

# Ensure the 'date' column exists in both data frames
"date" %in% names(icnsa_data)
#> [1] TRUE
"date" %in% names(wei_data)
#> [1] TRUE

# Ensure that the column is unique (if it is supposed to be a unique identifier)
all(!duplicated(icnsa_data$date))
#> [1] TRUE
all(!duplicated(wei_data$date))
#> [1] TRUE

# Plot the time series for ICNSA data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA",
       x = "Year",
       y = "Number")


# Plot the time series for WEI data
ggplot(wei_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "WEI",
       x = "Year",
       y = "Index")


# Merge the data frames by the 'date' column
merged_data <- merge(icnsa_data, wei_data, by = "date")

# Print the merged data
head(merged_data)
#>         date series_id.x value.x realtime_start.x realtime_end.x series_id.y
#> 1 2008-01-05       ICNSA  522700       2024-02-22     2024-02-22         WEI
#> 2 2008-01-12       ICNSA  547943       2024-02-22     2024-02-22         WEI
#> 3 2008-01-19       ICNSA  415397       2024-02-22     2024-02-22         WEI
#> 4 2008-01-26       ICNSA  369498       2024-02-22     2024-02-22         WEI
#> 5 2008-02-02       ICNSA  380234       2024-02-22     2024-02-22         WEI
#> 6 2008-02-09       ICNSA  377595       2024-02-22     2024-02-22         WEI
#>   value.y realtime_start.y realtime_end.y
#> 1    1.80       2024-02-22     2024-02-22
#> 2    1.78       2024-02-22     2024-02-22
#> 3    1.75       2024-02-22     2024-02-22
#> 4    1.28       2024-02-22     2024-02-22
#> 5    0.99       2024-02-22     2024-02-22
#> 6    0.95       2024-02-22     2024-02-22

# Convert both data sets into weekly time series objects (frequency 52)
icnsa_ts <- ts(icnsa_data$value, frequency = 52)
wei_ts <- ts(wei_data$value, frequency = 52)

# Plot ACF and PACF for ICNSA
ggtsdisplay(icnsa_ts, main = "ICNSA", lag.max = 52)


# Plot ACF and PACF for WEI
ggtsdisplay(wei_ts, main = "WEI", lag.max = 52)


# Fit an ARIMA model for ICNSA and WEI
icnsa_model <- auto.arima(icnsa_ts)
wei_model <- auto.arima(wei_ts)

# Display both the models
icnsa_model
#> Series: icnsa_ts 
#> ARIMA(2,0,2)(1,0,0)[52] with non-zero mean 
#> 
#> Coefficients:
#>          ar1     ar2     ma1     ma2    sar1       mean
#>       0.6146  0.1991  0.8399  0.2916  0.3423  365332.05
#> s.e.  0.0903  0.0812  0.0874  0.0448  0.0180   25487.73
#> 
#> sigma^2 = 6.698e+09:  log likelihood = -37941.27
#> AIC=75896.53   AICc=75896.57   BIC=75938.53
wei_model
#> Series: wei_ts 
#> ARIMA(3,1,2)(0,0,1)[52] 
#> 
#> Coefficients:
#>          ar1      ar2     ar3      ma1     ma2     sma1
#>       0.7280  -0.3299  0.0614  -0.8353  0.4842  -0.2390
#> s.e.  0.1717   0.1251  0.0513   0.1701  0.1273   0.0339
#> 
#> sigma^2 = 0.1632:  log likelihood = -429.09
#> AIC=872.18   AICc=872.31   BIC=905.31

# Display the correlation between ICNSA and WEI
correlation_data <- ccf(icnsa_ts, wei_ts, lag.max = 52, type = "correlation", main = "Correlation between ICNSA and WEI", xlab = "Lag", ylab = "Correlation")


# Find the position of the maximum cross-correlation (note: which.max() returns a vector index, not a lag)
correlation_lag <- which.max(correlation_data$acf)
correlation_val <- correlation_data$acf[correlation_lag]
cat("The correlation is maximum at lag", correlation_lag, "with value", correlation_val, "\n")
#> The correlation is maximum at lag 55 with value 0.1740053
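
# Note (not in the original run): which.max() returns a position in the acf
# vector, not a lag. With lag.max = 52 the vector spans lags -52..52, so
# position 55 corresponds to lag +2; the $lag component (in years, since the
# series frequency is 52) recovers this directly.
correlation_data$lag[correlation_lag] * frequency(icnsa_ts)  # lag in weeks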

# Check the number of objects present in ICNSA and WEI
length(icnsa_ts)
#> [1] 2980
length(wei_ts)
#> [1] 841

#There is a difference in the lengths of ICNSA and WEI data.
#We are trying to combine two time series with different lengths into a single data frame, which is not possible, since data frames require each column to have the same number of rows.

#We need to align them before we can use them together in a model.

#We can try some approaches to make both the datasets of the same length.

#Approach 1:

# Convert the time series to data frames (note: time() yields the numeric ts index, not calendar dates)
icnsa_df <- data.frame(date=time(icnsa_ts), icnsa=as.numeric(icnsa_ts))
wei_df <- data.frame(date=time(wei_ts), wei=as.numeric(wei_ts))

# Find the common date range
start_date <- max(min(icnsa_df$date), min(wei_df$date))
end_date <- min(max(icnsa_df$date), max(wei_df$date))

# Trim the data frames to the common date range
icnsa_df <- subset(icnsa_df, date >= start_date & date <= end_date)
wei_df <- subset(wei_df, date >= start_date & date <= end_date)

# Check the size of ICNSA and WEI after implementing Approach 1
length(icnsa_df)
#> [1] 2
length(wei_df)
#> [1] 2

#length() on a data frame returns the number of columns (here 2), not the number of rows, so this check says nothing about how many observations survived the trim; nrow() is the right tool. The numeric time() index used as "date" also makes the range comparison unreliable.

#So we try another approach.

#Approach 2:

# Find the common date range
common_dates <- intersect(icnsa_data$date, wei_data$date)

# Subset both datasets to the common date range
icnsa_common <- icnsa_data[icnsa_data$date %in% common_dates, ]
wei_common <- wei_data[wei_data$date %in% common_dates, ]

# Check the size of ICNSA and WEI after implementing Approach 2
length(icnsa_common)
#> [1] 5
length(wei_common)
#> [1] 5

#Again, length() counts the five columns of each tibble; nrow() (see the check below) reveals how many weekly observations the two series actually share.

#To work with equal-length ts objects, we also try a third approach.
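
# Hedged sanity check (not in the original run): count rows, not columns, and
# align the two series on their shared dates with an inner join.
nrow(icnsa_common)
nrow(wei_common)
aligned_df <- merge(icnsa_common[, c("date", "value")],
                    wei_common[, c("date", "value")],
                    by = "date", suffixes = c("_icnsa", "_wei"))
nrow(aligned_df)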

#Approach 3:

# Trimming the longer series
min_length <- min(length(icnsa_ts), length(wei_ts))
icnsa_trimmed <- head(icnsa_ts, min_length)
wei_trimmed <- head(wei_ts, min_length)

# Check the number of observations in ICNSA and WEI after implementing Approach 3
length(icnsa_trimmed)
#> [1] 841
length(wei_trimmed)
#> [1] 841

#Now both series have the same length (841), so they can be combined. Note, however, that head() keeps the first 841 ICNSA observations (starting in 1967) while WEI begins in 2008, so the two columns are not aligned in calendar time; joining on the shared dates, as sketched above, would align the periods properly.

# Merge the ICNSA and WEI datasets into a single data frame
icnsa_wei_data <- data.frame(icnsa = icnsa_trimmed, wei = wei_trimmed)

# Fit the regARIMA model
fit_regArima <- auto.arima(icnsa_wei_data[, "icnsa"], xreg = icnsa_wei_data[, "wei"])

# Display the summary of the fitted model
summary(fit_regArima)
#> Series: icnsa_wei_data[, "icnsa"] 
#> Regression with ARIMA(4,0,0)(0,1,1)[52] errors 
#> 
#> Coefficients:
#>          ar1     ar2     ar3     ar4     sma1       xreg
#>       0.4972  0.2566  0.1203  0.0849  -0.3126   582.4508
#> s.e.  0.0364  0.0396  0.0395  0.0362   0.0360  1734.8945
#> 
#> sigma^2 = 1.134e+09:  log likelihood = -9345.02
#> AIC=18704.04   AICc=18704.18   BIC=18736.73
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set 1162.357 32487.42 21482.67 -0.1297713 5.939134 0.3062354
#>                      ACF1
#> Training set -0.001427712

# Check the diagnostics of the fitted model
# This will provide plots for residuals to help assess the model
checkresiduals(fit_regArima)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(4,0,0)(0,1,1)[52] errors
#> Q* = 135.54, df = 99, p-value = 0.00869
#> 
#> Model df: 5.   Total lags used: 104

# Alternative diagnostics

acf_fit <- acf(residuals(fit_regArima), plot = FALSE)
plot(acf_fit, main = "Residuals ACF")


pacf_fit <- pacf(residuals(fit_regArima), plot = FALSE)
plot(pacf_fit, main = "Residuals PACF")


# Coefficients of the fitted model (the ARMA terms plus the regression coefficient on WEI)
fit_regArima$coef
#>          ar1          ar2          ar3          ar4         sma1         xreg 
#>   0.49716436   0.25659887   0.12030769   0.08491272  -0.31260080 582.45075497

# Forecast using the fitted regARIMA model
# Note: forecast() needs future values of the covariate via xreg; here the
# historical WEI series is reused as a stand-in, producing one forecast per
# supplied covariate value.
forecast_result <- forecast(fit_regArima, xreg = wei_ts)

# Plot the forecast
plot(forecast_result)


# Check just the next point forecast value
# (when xreg is supplied, forecast() ignores h and produces one forecast per
# covariate value, so we take the first element of the forecast mean)
future_data <- forecast(fit_regArima, xreg = wei_ts, h = 1)
future_forecast_value <- future_data$mean[1]
cat("The next forecast value is: ", future_forecast_value)
#> The next forecast value is:  490680.7
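
# Hedged alternative (not in the original run): supply an explicit future
# covariate value instead of the historical series. The last observed WEI
# value is used here as a placeholder for the unknown next-week value (the
# unnamed xreg column may trigger a naming warning).
next_wei <- as.numeric(tail(wei_trimmed, 1))
one_step <- forecast(fit_regArima, xreg = matrix(next_wei, ncol = 1))
one_step$mean[1]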

Created on 2024-02-22 with reprex v2.1.0

anjuanand6 commented 8 months ago

HW 1

The following steps have been performed:

  1. Load the most current Initial Claims (ICNSA) data. Not seasonally adjusted.
  2. Download another series that might be related to ICNSA to be used as a covariate (the Weekly Economic Index, WEI, was chosen here).
  3. Do some empirical data analysis to decide on possible covariates and stochastic structure to use for a regARIMA model.
  4. Automatic model identification, such as auto.arima() or ARIMA() without the pdq() argument, may be used, but the final model must be justified with your own words and analysis.
  5. Fit the final model and comment on the regression diagnostics.
  6. Produce a point forecast from the final model.

anjuanand6 commented 8 months ago
# Load necessary libraries
library(tidyverse)
library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries)
library(readr)
library(fredr)
library(dplyr)
library(lubridate)
library(tsbox)
library(fpp3)
#> ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
#> ✔ tsibble     1.1.4     ✔ fable       0.3.3
#> ✔ tsibbledata 0.4.1     ✔ fabletools  0.3.4
#> ✔ feasts      0.3.1
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date()    masks base::date()
#> ✖ dplyr::filter()      masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval()  masks lubridate::interval()
#> ✖ dplyr::lag()         masks stats::lag()
#> ✖ tsibble::setdiff()   masks base::setdiff()
#> ✖ tsibble::union()     masks base::union()
library(seasonal)
#> 
#> Attaching package: 'seasonal'
#> The following object is masked from 'package:tibble':
#> 
#>     view
library(ggplot2)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following object is masked from 'package:tsibble':
#> 
#>     index
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(splines)

#1. Load the most current Initial Claims (ICNSA) data

# Set the API Key
fredr_set_key("6359e5121a13be3ece5d737e3c242a41")

# Fetch the ICNSA data, not seasonally adjusted
icnsa_data <- fredr(series_id = "ICNSA")

#2. Empirical analysis for Covid pandemic period

# Plot the data to visually find the COVID period
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA Data Over Time", x = "Date", y = "Initial Claims")


# After plotting, manually define the start and end dates of the COVID period
start_date <- as.Date("2020-03-01")  # Observation start date
end_date <- as.Date("2021-03-01")    # Observation end date

#3. Cubic Spline Imputation

# Convert the date to a proper Date object
icnsa_data$date <- as.Date(icnsa_data$date)

# Define the time points for the spline excluding the COVID period and create a spline function
time_points <- icnsa_data$date[icnsa_data$date < start_date | icnsa_data$date > end_date]
values <- icnsa_data$value[icnsa_data$date < start_date | icnsa_data$date > end_date]
cs_fit <- splinefun(time_points, values, method = "fmm")

# Apply the spline to impute values for the COVID period
imputed_values <- cs_fit(icnsa_data$date[icnsa_data$date >= start_date & icnsa_data$date <= end_date])
icnsa_data$value[icnsa_data$date >= start_date & icnsa_data$date <= end_date] <- imputed_values

# Plot the data with imputed values
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_line(data = icnsa_data[icnsa_data$date >= start_date & icnsa_data$date <= end_date, ], aes(x = date, y = value), color = "red") +
  labs(title = "ICNSA Data with Imputed COVID Period", x = "Date", y = "Initial Claims")


# Non-COVID data
non_covid_data <- icnsa_data %>%
  filter(date < start_date | date > end_date)

# COVID-period data
covid_data <- icnsa_data %>%
  filter(date >= start_date & date <= end_date)

# Define a range of smoothing parameter values (smooth.spline's spar, used here as the lambda tuning parameter)
lambda_values <- seq(0.1, 1, by = 0.1)

# Initialize an empty list to store plots for each lambda
plots <- list()

# Loop through each lambda value to fit a spline and impute new values
for (i in seq_along(lambda_values)) {
  lambda <- lambda_values[i]

  # Fit the spline model with the current lambda value
  spline_fit <- smooth.spline(x = as.numeric(non_covid_data$date), y = non_covid_data$value, spar = lambda)

  # Impute new values for the COVID period
  imputed_values <- predict(spline_fit, x = as.numeric(covid_data$date))$y
  covid_data$value <- imputed_values

  # Combine non-COVID and COVID data with imputed values
  updated_icnsa_data <- bind_rows(non_covid_data, covid_data) %>%
    arrange(date)

  # Create a plot for the current lambda value
  p <- ggplot() +
    geom_line(data = icnsa_data, aes(x = date, y = value), color = "red") +
    geom_line(data = updated_icnsa_data, aes(x = date, y = value), color = "blue") +
    labs(title = paste("ICNSA Data with Imputed Values (lambda =", lambda, ")"),
         x = "Date", y = "Value") +
    theme_minimal()

  # Store the plot in the list
  plots[[i]] <- p
}

# Display all plots
plots
#> [[1]] ... [[10]]
#> (ten ggplot objects print here, one imputation plot per lambda value; the
#> plot images are not captured in this text output)

# Find the optimal lambda value

rmse_values <- numeric(length(lambda_values))

for (i in seq_along(lambda_values)) {
  # Leave-one-out loop. Note the quirk: j runs over the COVID-period dates but
  # indexes rows of non_covid_data, so only the first length(covid_data$date)
  # non-COVID rows are ever held out (a fuller sketch follows after the loop).
  rmse <- numeric(length(covid_data$date))

  for (j in seq_along(covid_data$date)) {
    # Hold out one observation for validation
    train_data <- non_covid_data[-j, ]
    test_data <- non_covid_data[j, ]

    # Fit spline with current lambda value
    spline_fit <- smooth.spline(x = as.numeric(train_data$date), y = train_data$value, spar = lambda_values[i])

    # Predict the excluded observation
    predicted_value <- predict(spline_fit, x = as.numeric(test_data$date))$y

    # Calculate RMSE for the prediction
    rmse[j] <- sqrt(mean((predicted_value - test_data$value) ^ 2))
  }

  # Average RMSE for current lambda
  rmse_values[i] <- mean(rmse)
}

# Find the lambda value with the smallest RMSE
optimal_lambda <- lambda_values[which.min(rmse_values)]

# Output the optimal lambda value
print(optimal_lambda)
#> [1] 0.2
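
# Aside (a hedged sketch, not in the original run): a full leave-one-out loop
# would hold out every non-COVID row in turn instead of reusing the
# COVID-period index.
loocv_rmse <- function(spar_value, data) {
  errs <- vapply(seq_len(nrow(data)), function(j) {
    fit <- smooth.spline(as.numeric(data$date[-j]), data$value[-j], spar = spar_value)
    predict(fit, x = as.numeric(data$date[j]))$y - data$value[j]
  }, numeric(1))
  sqrt(mean(errs^2))  # root mean squared leave-one-out error
}
# Example (slow): loocv_rmse(0.2, non_covid_data)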

# Taking the optimal_lambda value for spline fit
spline_fit <- smooth.spline(x = as.numeric(non_covid_data$date), y = non_covid_data$value, spar = optimal_lambda)

# Function to impute missing values using spline fit
impute_missing_values <- function(non_covid_data, covid_data, optimal_lambda) {
  spline_fit <- smooth.spline(x = as.numeric(non_covid_data$date),
                                    y = non_covid_data$value, spar = optimal_lambda)
  imputed_values <- predict(spline_fit, x = as.numeric(covid_data$date))$y
  covid_data$value <- imputed_values
  updated_icnsa_data <- rbind(non_covid_data, covid_data) %>% arrange(date)
  return(updated_icnsa_data)
}

# Plotting function for updated data
plot_updated_data <- function(updated_icnsa_data, original_icnsa_data) {
  plot(updated_icnsa_data$date, updated_icnsa_data$value, type = "l", col = "red", lwd = 2,
       main = "Updated Vs Original", xlab = "Year", ylab = "Number")
  lines(original_icnsa_data$date, original_icnsa_data$value, col = "blue", lwd = 2)
  legend("topright", legend = c("Updated", "Original"), col = c("red", "blue"), lty = 1, lwd = 2)
}

# Plotting function for comparison of time series
plot_time_series_comparison <- function(ts_data_old, ts_data) {
  plot(ts_data_old, col = "red", main = "Comparison of Time Series", ylab = "Value")
  lines(ts_data, col = "blue")
  legend("topright", legend = c("Original", "Updated"), col = c("red", "blue"), lty = 1, lwd = 2)
}

#4. Use both a multiplicative and an additive Holt-Winters model to forecast the next ICNSA value using the newly imputed Covid values.

# Function to forecast using Holt-Winters models
holt_winters_forecast <- function(ts_data) {
  hwm_multi <- HoltWinters(ts_data, seasonal = "multiplicative")
  forecasted_value_multi <- forecast(hwm_multi, h = 1)$mean
  hwa_add <- HoltWinters(ts_data, seasonal = "additive")
  forecasted_value_add <- forecast(hwa_add, h = 1)$mean
  return(list(multiplicative = forecasted_value_multi, additive = forecasted_value_add))
}

# Impute missing values
updated_icnsa_data <- impute_missing_values(non_covid_data, covid_data, optimal_lambda)

# Plot updated data
plot_updated_data(updated_icnsa_data, icnsa_data)


# Create time series for comparison
ts_data_old <- ts(icnsa_data$value, frequency = 52)
ts_data <- ts(updated_icnsa_data$value, frequency = 52)

# Plot time series comparison
plot_time_series_comparison(ts_data_old, ts_data)


# Forecast using Holt-Winters models
forecast_results <- holt_winters_forecast(ts_data)
cat("Forecasted Unemployment using Holt-Winters Multiplicative Model:", forecast_results$multiplicative, "\n")
#> Forecasted Unemployment using Holt-Winters Multiplicative Model: 191025.8
cat("Forecasted Unemployment using Holt-Winters Additive Model:", forecast_results$additive, "\n")
#> Forecasted Unemployment using Holt-Winters Additive Model: 197790.4

Created on 2024-02-28 with reprex v2.1.0

anjuanand6 commented 8 months ago

HW - 2

The following steps were performed:

  1. Loaded the most current Initial Claims (ICNSA) data, not seasonally adjusted.
  2. Performed some empirical analysis to decide on start and end dates for the COVID pandemic period.
  3. Used a cubic spline methodology to impute new values for the COVID period. The λ value was chosen by minimizing root mean square error: λ = 0.2 had the lowest RMSE among values from 0.1 to 1 in increments of 0.1.
  4. Used both a multiplicative and an additive Holt-Winters model to forecast the next ICNSA value using the newly imputed COVID values.

The results after forecasting are:
  Forecasted ICNSA using the Holt-Winters multiplicative model: 191025.8
  Forecasted ICNSA using the Holt-Winters additive model: 197790.4

anjuanand6 commented 8 months ago
# Load necessary libraries
library(fredr)
library(ggplot2)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(KFAS)
#> Warning: package 'KFAS' was built under R version 4.3.3
#> Please cite KFAS in publications by using: 
#> 
#>   Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)

# Set API key for FRED data
fredr_set_key("6359e5121a13be3ece5d737e3c242a41")

icnsa_data <- fredr(series_id = "ICNSA")
ccnsa_data <- fredr(series_id = "CCNSA")

# Plot the ICNSA data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA Data Over Time", x = "Date", y = "Initial Claims")


# Plot the CCNSA data
ggplot(ccnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "CCNSA Data Over Time", x = "Date", y = "Continuous Claims")


# Retrieve and format Initial Claims Non-seasonally Adjusted (ICNSA) data
initial_claims_data <- fredr(series_id = "ICNSA") %>%
  mutate(Date = as.Date(date), InitialClaims = value) %>%
  select(Date, InitialClaims)

# Retrieve and format Continued Claims (CCNSA, insured unemployment) data; note the column is named UnemploymentRate although it holds a claims level, not a rate
continuous_claims_data <- fredr(series_id = "CCNSA") %>%
  mutate(Date = as.Date(date), UnemploymentRate = value) %>%
  select(Date, UnemploymentRate)

# Define the period to exclude due to COVID-19 impact
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-03-01")

# Filter out the COVID-19 impact period from the ICNSA data
filtered_initial_claims_data <- initial_claims_data[!(initial_claims_data$Date >= covid_start_date & initial_claims_data$Date <= covid_end_date), ]

# Apply cubic spline interpolation to impute missing data for the COVID-19 period
cubic_spline_model <- smooth.spline(filtered_initial_claims_data$Date, filtered_initial_claims_data$InitialClaims, spar=0.2)
all_dates <- initial_claims_data$Date
cubic_spline_predictions <- predict(cubic_spline_model, x = as.numeric(all_dates))
initial_claims_data$CubicSplineImputed <- cubic_spline_predictions$y

# Merge and prepare data for analysis
merged_data <- merge(initial_claims_data, continuous_claims_data, by = "Date", all.x = TRUE) %>%
  na.locf()

# Plot initial claims time series with COVID-19 period highlighted
ggplot(merged_data, aes(x = Date, y = InitialClaims)) +
  geom_line() + labs(title = "ICNSA Time Series", x = "Date", y = "Initial Claims") +
  geom_vline(xintercept = as.numeric(covid_start_date), color = "red", linetype = "dashed") +
  geom_vline(xintercept = as.numeric(as.Date("2021-03-01")), color = "red", linetype = "dashed")


# Define and fit a state-space model (local linear trend plus weekly dummy seasonal; the variances are fixed at 1 here rather than estimated via NA entries)
model_spec <- SSModel(InitialClaims ~ SSMtrend(degree = 2, Q = list(matrix(1), matrix(1))) + 
                        SSMseasonal(period = 52, sea.type = "dummy"), H = matrix(1), data = merged_data)
fit <- fitSSM(model_spec, inits = rep(0.1, 2))
summary(fit)
#>           Length Class   Mode
#> optim.out  5     -none-  list
#> model     14     SSModel list

# Apply Kalman filter to impute data using the state-space model
state_space_imputation <- KFS(fit$model)
merged_data$StateSpaceImputed <- state_space_imputation$alphahat[,"level"]
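
# Aside (a hedged sketch, not part of the original run): the NA-based
# missing-value mechanism sets the COVID weeks to NA, estimates the variance
# parameters (the NA entries in H and Q below) with fitSSM, and lets the
# smoother fill the gap.
y_na <- merged_data$InitialClaims
covid_idx <- merged_data$Date >= covid_start_date & merged_data$Date <= covid_end_date
y_na[covid_idx] <- NA
na_model <- SSModel(y_na ~ SSMtrend(degree = 2, Q = list(matrix(NA), matrix(NA))) +
                      SSMseasonal(period = 52, sea.type = "dummy", Q = matrix(NA)),
                    H = matrix(NA))
na_fit <- fitSSM(na_model, inits = rep(log(var(y_na, na.rm = TRUE)), 4), method = "BFGS")
na_smooth <- KFS(na_fit$model, smoothing = c("state", "mean"))
y_imputed <- na_smooth$muhat[covid_idx]  # smoothed estimates for the NA weeks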

# Compare original, cubic spline imputed, and state-space model imputed data visually
ggplot(merged_data, aes(x = Date)) +
  geom_line(aes(y = InitialClaims), color = "grey", linetype="solid") +
  geom_line(aes(y = CubicSplineImputed), color = "green", linetype="dotted") +
  geom_line(aes(y = StateSpaceImputed), color = "blue", linetype="dashed") +
  labs(title = "ICNSA: Original vs. Cubic Spline Imputed vs. State-space Imputed", x = "Date", y = "Initial Claims") +
  theme_minimal()


# Focus on comparison between original and state-space imputed data with highlighted COVID-19 period
ggplot(merged_data, aes(x = Date)) +
  geom_line(aes(y = InitialClaims), color = "grey") +
  geom_line(aes(y = StateSpaceImputed), color = "blue") +
  labs(title = "Comparison of Original and State-space Imputed ICNSA Data", x = "Date", y = "Initial Claims") +
  geom_vline(xintercept = as.numeric(covid_start_date), color = "red", linetype = "dashed") +
  geom_vline(xintercept = as.numeric(as.Date("2021-03-01")), color = "red", linetype = "dashed") +
  theme_minimal()


# Forecast the next data point using the state-space model
forecast_results <- predict(fit$model, n.ahead = 1)
forecast_results
#> Time Series:
#> Start = 2984 
#> End = 2984 
#> Frequency = 1 
#>           fit
#> [1,] 212633.3

Created on 2024-03-07 with reprex v2.1.0

anjuanand6 commented 8 months ago

HW - 3

The following steps have been performed:

  1. Load the most current Initial Claims (ICNSA) data. Not seasonally adjusted.
  2. Do some empirical analysis to decide on a start and end date for the Covid pandemic times.
  3. Use the state-space missing value methodology discussed in lecture to impute new values for the Covid period. Compare this imputation method to that of Hw 2 when you used a spline to impute values.
  4. Fit a structural time series model with trend, seasonal and error using state space methodology.
  5. Find a covariate series that is different than you used on Hw 1 or Hw 2 and add it to your model.

The forecasted value is: 212633.3

anjuanand6 commented 7 months ago
# !diagnostics off

# Load necessary libraries
library(readxl)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries)
#> Warning: package 'tseries' was built under R version 4.3.3
library(ggplot2)
library(reprex)
library(vars)
#> Warning: package 'vars' was built under R version 4.3.3
#> Loading required package: MASS
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Loading required package: lmtest
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.3.3
#> Loading required package: lattice

# Read the data
department_stores <- read_excel("C:/Users/Admin/MRTS4521.xlsx", skip = 7)
print(head(department_stores))
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 14134
#> 2 Feb-1992 14472
#> 3 Mar-1992 14543
#> 4 Apr-1992 14529
#> 5 May-1992 14634
#> 6 Jun-1992 14597

clothing_stores <- read_excel("C:/Users/Admin/MRTS4481.xlsx", skip = 7)
print(head(clothing_stores))
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 6758 
#> 2 Feb-1992 6791 
#> 3 Mar-1992 6721 
#> 4 Apr-1992 6954 
#> 5 May-1992 6947 
#> 6 Jun-1992 7091

jewelry_stores <- read_excel("C:/Users/Admin/MRTS44831.xlsx", skip = 7)
print(head(jewelry_stores))
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 1248 
#> 2 Feb-1992 1230 
#> 3 Mar-1992 1283 
#> 4 Apr-1992 1247 
#> 5 May-1992 1222 
#> 6 Jun-1992 1238

retail_trade <- read_excel("C:/Users/Admin/MRTS4400.xlsx", skip = 7)
print(head(retail_trade))
#> # A tibble: 6 × 2
#>   Period   Value 
#>   <chr>    <chr> 
#> 1 Jan-1992 146925
#> 2 Feb-1992 147223
#> 3 Mar-1992 146805
#> 4 Apr-1992 148032
#> 5 May-1992 149010
#> 6 Jun-1992 149800

# Ensure 'Value' column is numeric and replace non-numeric entries with NA
department_stores$Value <- as.numeric(as.character(department_stores$Value))
#> Warning: NAs introduced by coercion
clothing_stores$Value <- as.numeric(as.character(clothing_stores$Value))
#> Warning: NAs introduced by coercion
jewelry_stores$Value <- as.numeric(as.character(jewelry_stores$Value))
#> Warning: NAs introduced by coercion
retail_trade$Value <- as.numeric(as.character(retail_trade$Value))
#> Warning: NAs introduced by coercion

# Check for NA values after coercion
sum(is.na(department_stores$Value))
#> [1] 11
sum(is.na(clothing_stores$Value))
#> [1] 11
sum(is.na(jewelry_stores$Value))
#> [1] 46
sum(is.na(retail_trade$Value))
#> [1] 11

# Remove the NA values
department_stores <- department_stores[!is.na(department_stores$Value), ]
clothing_stores <- clothing_stores[!is.na(clothing_stores$Value), ]
jewelry_stores <- jewelry_stores[!is.na(jewelry_stores$Value), ]
retail_trade <- retail_trade[!is.na(retail_trade$Value), ]

# Convert each series to a monthly time series object
ts_dep_stores <- ts(department_stores$Value, start=c(1992, 1), frequency=12)
ts_clothing_stores <- ts(clothing_stores$Value, start=c(1992, 1), frequency=12)
ts_jewelry_stores <- ts(jewelry_stores$Value, start=c(1992, 1), frequency=12)
ts_retail_trade <- ts(retail_trade$Value, start=c(1992, 1), frequency=12)

# Plot the time series
plot(ts_dep_stores, main="Department Stores Sales", xlab="Year", ylab="Sales")

plot(ts_clothing_stores, main="Clothing Stores Sales", xlab="Year", ylab="Sales")

plot(ts_jewelry_stores, main="Jewelry Stores Sales", xlab="Year", ylab="Sales")

plot(ts_retail_trade, main="Retail Trade", xlab="Year", ylab="Sales")


# Decompose the time series
decomp_dept <- stl(ts_dep_stores, s.window="periodic")
plot(decomp_dept)

decomp_cloth <- stl(ts_clothing_stores, s.window="periodic")
plot(decomp_cloth)

decomp_jewel <- stl(ts_jewelry_stores, s.window="periodic")
plot(decomp_jewel)

decomp_retail <- stl(ts_retail_trade, s.window="periodic")
plot(decomp_retail)


# Check for stationarity
adf.test(ts_dep_stores)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_dep_stores
#> Dickey-Fuller = -3.3768, Lag order = 7, p-value = 0.05812
#> alternative hypothesis: stationary
adf.test(ts_clothing_stores)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_clothing_stores
#> Dickey-Fuller = -3.7445, Lag order = 7, p-value = 0.02201
#> alternative hypothesis: stationary
adf.test(ts_jewelry_stores)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_jewelry_stores
#> Dickey-Fuller = -3.795, Lag order = 7, p-value = 0.01955
#> alternative hypothesis: stationary
adf.test(ts_retail_trade)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_retail_trade
#> Dickey-Fuller = -0.45768, Lag order = 7, p-value = 0.9835
#> alternative hypothesis: stationary
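
# Aside (not in the original run): retail trade fails to reject the unit-root
# null above, so a standard follow-up is to difference once and re-test.
adf.test(diff(ts_retail_trade))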

# Fit an ARIMA model to each series
fit_dept <- auto.arima(ts_dep_stores)
summary(fit_dept)
#> Series: ts_dep_stores 
#> ARIMA(3,2,2)(0,0,2)[12] 
#> 
#> Coefficients:
#>           ar1      ar2      ar3      ma1      ma2     sma1     sma2
#>       -0.9711  -0.4152  -0.2922  -0.2016  -0.7491  -0.1403  -0.1017
#> s.e.   0.1310   0.0712   0.0506   0.1299   0.1266   0.0534   0.0494
#> 
#> sigma^2 = 89245:  log likelihood = -2725.25
#> AIC=5466.5   AICc=5466.88   BIC=5498.08
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE       MPE     MAPE      MASE
#> Training set -16.72261 295.2265 173.5556 -0.132656 1.305143 0.2954698
#>                      ACF1
#> Training set -0.008641679
fit_cloth <- auto.arima(ts_clothing_stores)
summary(fit_cloth)
#> Series: ts_clothing_stores 
#> ARIMA(0,1,3)(0,0,1)[12] with drift 
#> 
#> Coefficients:
#>          ma1      ma2      ma3     sma1    drift
#>       0.3546  -0.3968  -0.3897  -0.1856  31.4015
#> s.e.  0.0480   0.0491   0.0484   0.0521  14.1911
#> 
#> sigma^2 = 355359:  log likelihood = -2996.91
#> AIC=6005.83   AICc=6006.05   BIC=6029.53
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set 0.2376561 591.4563 269.3632 -0.5275825 2.689432 0.3377146
#>                     ACF1
#> Training set 0.008129612
fit_jewel <- auto.arima(ts_jewelry_stores)
summary(fit_jewel)
#> Series: ts_jewelry_stores 
#> ARIMA(1,1,3)(0,0,2)[12] with drift 
#> 
#> Coefficients:
#>          ar1      ma1      ma2      ma3     sma1     sma2   drift
#>       0.3328  -0.1132  -0.4468  -0.2359  -0.1500  -0.1688  4.4865
#> s.e.  0.1377   0.1394   0.0465   0.0850   0.0838   0.0751  1.3311
#> 
#> sigma^2 = 12991:  log likelihood = -2145.6
#> AIC=4307.2   AICc=4307.63   BIC=4338.04
#> 
#> Training set error measures:
#>                     ME    RMSE      MAE        MPE     MAPE      MASE
#> Training set 0.3999299 112.669 66.62347 -0.4103501 3.482894 0.4829962
#>                      ACF1
#> Training set -0.001226958
fit_retail <- auto.arima(ts_retail_trade)
summary(fit_retail)
#> Series: ts_retail_trade 
#> ARIMA(0,1,2)(0,0,2)[12] with drift 
#> 
#> Coefficients:
#>           ma1      ma2     sma1     sma2      drift
#>       -0.1072  -0.1511  -0.0859  -0.0809  1196.3729
#> s.e.   0.0518   0.0541   0.0542   0.0515   204.2875
#> 
#> sigma^2 = 41594718:  log likelihood = -3910.87
#> AIC=7833.73   AICc=7833.96   BIC=7857.44
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE         MPE      MAPE      MASE
#> Training set -5.419509 6398.944 2914.286 -0.08863509 0.8278123 0.1662024
#>                      ACF1
#> Training set -0.002816978

# Diagnostics - check residuals
checkresiduals(fit_dept)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(3,2,2)(0,0,2)[12]
#> Q* = 27.531, df = 17, p-value = 0.05073
#> 
#> Model df: 7.   Total lags used: 24
checkresiduals(fit_cloth)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(0,1,3)(0,0,1)[12] with drift
#> Q* = 17.391, df = 20, p-value = 0.6274
#> 
#> Model df: 4.   Total lags used: 24
checkresiduals(fit_jewel)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(1,1,3)(0,0,2)[12] with drift
#> Q* = 13.148, df = 18, p-value = 0.7827
#> 
#> Model df: 6.   Total lags used: 24
checkresiduals(fit_retail)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(0,1,2)(0,0,2)[12] with drift
#> Q* = 69.593, df = 20, p-value = 2.122e-07
#> 
#> Model df: 4.   Total lags used: 24
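
# Aside (a hedged sketch, not in the original run): the retail-trade residuals
# fail the Ljung-Box test, so one option is to force a seasonal difference and
# let auto.arima() search again.
fit_retail2 <- auto.arima(ts_retail_trade, D = 1)
checkresiduals(fit_retail2)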

# Plot ACF
acf(ts_dep_stores, main="ACF for Department Stores Sales")

acf(ts_clothing_stores, main="ACF for Clothing Stores Sales")

acf(ts_jewelry_stores, main="ACF for Jewelry Stores Sales")

acf(ts_retail_trade, main="ACF for Retail Trade")


# Plot PACF
pacf(ts_dep_stores, main="PACF for Department Stores Sales")

pacf(ts_clothing_stores, main="PACF for Clothing Stores Sales")

pacf(ts_jewelry_stores, main="PACF for Jewelry Stores Sales")

pacf(ts_retail_trade, main="PACF for Retail Trade")


# Determine the shortest length among the series
min_length <- min(length(ts_dep_stores), length(ts_clothing_stores), length(ts_jewelry_stores), length(ts_retail_trade))

# Trim the series to the shortest length (numeric indexing drops the ts attributes, which are restored below)
ts_dep_stores_trimmed <- ts_dep_stores[1:min_length]
ts_clothing_stores_trimmed <- ts_clothing_stores[1:min_length]
ts_jewelry_stores_trimmed <- ts_jewelry_stores[1:min_length]
ts_retail_trade_trimmed <- ts_retail_trade[1:min_length]

# Combine the trimmed series into a data frame
data_combined <- data.frame(department_stores = ts_dep_stores_trimmed,
                            clothing_stores = ts_clothing_stores_trimmed,
                            jewelry_stores = ts_jewelry_stores_trimmed,
                            retail_trade = ts_retail_trade_trimmed)

# Convert to a ts object with appropriate start and frequency
data_ts <- ts(data_combined, start=c(1992, 1), frequency=12)

# Fit a VAR(1) model
var1_model <- VAR(data_ts, p=1, type="both")

# Display summary of VAR(1) model
summary(var1_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: department_stores, clothing_stores, jewelry_stores, retail_trade 
#> Deterministic variables: both 
#> Sample size: 349 
#> Log Likelihood: -10408.866 
#> Roots of the characteristic polynomial:
#> 0.9845 0.9613 0.8336 0.6646
#> Call:
#> VAR(y = data_ts, p = 1, type = "both")
#> 
#> 
#> Estimation results for equation department_stores: 
#> ================================================== 
#> department_stores = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1  9.773e-01  1.377e-02  70.990  < 2e-16 ***
#> clothing_stores.l1   -7.362e-02  1.978e-02  -3.722 0.000231 ***
#> jewelry_stores.l1     3.088e-01  1.872e-01   1.650 0.099952 .  
#> retail_trade.l1      -4.702e-03  1.982e-03  -2.373 0.018208 *  
#> const                 1.235e+03  3.298e+02   3.745 0.000211 ***
#> trend                 4.194e+00  1.473e+00   2.847 0.004676 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 304.1 on 343 degrees of freedom
#> Multiple R-Squared: 0.9869,  Adjusted R-squared: 0.9867 
#> F-statistic:  5180 on 5 and 343 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation clothing_stores: 
#> ================================================ 
#> clothing_stores = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1   0.039301   0.031524   1.247   0.2134    
#> clothing_stores.l1     0.858498   0.045293  18.954   <2e-16 ***
#> jewelry_stores.l1     -0.082018   0.428650  -0.191   0.8484    
#> retail_trade.l1       -0.001558   0.004537  -0.343   0.7315    
#> const                617.932190 755.153651   0.818   0.4138    
#> trend                  6.274845   3.372526   1.861   0.0637 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 696.3 on 343 degrees of freedom
#> Multiple R-Squared: 0.9512,  Adjusted R-squared: 0.9505 
#> F-statistic:  1336 on 5 and 343 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation jewelry_stores: 
#> =============================================== 
#> jewelry_stores = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1  2.102e-02  5.548e-03   3.788 0.000179 ***
#> clothing_stores.l1   -4.509e-03  7.971e-03  -0.566 0.571973    
#> jewelry_stores.l1     6.598e-01  7.543e-02   8.747  < 2e-16 ***
#> retail_trade.l1       2.646e-03  7.985e-04   3.314 0.001018 ** 
#> const                -2.548e+02  1.329e+02  -1.917 0.056037 .  
#> trend                -4.662e-01  5.935e-01  -0.786 0.432695    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 122.5 on 343 degrees of freedom
#> Multiple R-Squared: 0.9298,  Adjusted R-squared: 0.9288 
#> F-statistic: 908.4 on 5 and 343 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation retail_trade: 
#> ============================================= 
#> retail_trade = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1  2.360e-01  2.525e-01   0.935 0.350583    
#> clothing_stores.l1   -1.419e+00  3.628e-01  -3.912 0.000110 ***
#> jewelry_stores.l1    -1.349e+00  3.433e+00  -0.393 0.694692    
#> retail_trade.l1       9.483e-01  3.634e-02  26.097  < 2e-16 ***
#> const                 1.556e+04  6.048e+03   2.573 0.010510 *  
#> trend                 9.722e+01  2.701e+01   3.599 0.000366 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 5577 on 343 degrees of freedom
#> Multiple R-Squared: 0.9963,  Adjusted R-squared: 0.9962 
#> F-statistic: 1.825e+04 on 5 and 343 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>                   department_stores clothing_stores jewelry_stores retail_trade
#> department_stores             92471          153362          23273      1268390
#> clothing_stores              153362          484825          71278      2801721
#> jewelry_stores                23273           71278          15015       487965
#> retail_trade                1268390         2801721         487965     31098936
#> 
#> Correlation matrix of residuals:
#>                   department_stores clothing_stores jewelry_stores retail_trade
#> department_stores            1.0000          0.7243         0.6246       0.7480
#> clothing_stores              0.7243          1.0000         0.8354       0.7215
#> jewelry_stores               0.6246          0.8354         1.0000       0.7141
#> retail_trade                 0.7480          0.7215         0.7141       1.0000

# Determine optimal number of lags for VAR(p) model using AIC
lag_selection <- VARselect(data_ts, lag.max=10, type="both")
optimal_lags <- lag_selection$selection["AIC(n)"]

# Fit a VAR(p) model with the optimal number of lags
varp_model <- VAR(data_ts, p=optimal_lags, type="both")

# Display summary of VAR(p) model
summary(varp_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: department_stores, clothing_stores, jewelry_stores, retail_trade 
#> Deterministic variables: both 
#> Sample size: 346 
#> Log Likelihood: -10074.681 
#> Roots of the characteristic polynomial:
#> 0.9878 0.9878 0.9185 0.7692 0.6972 0.6829 0.6829 0.6431 0.6431 0.5793 0.5793 0.5684 0.5684 0.555 0.5475 0.5475
#> Call:
#> VAR(y = data_ts, p = optimal_lags, type = "both")
#> 
#> 
#> Estimation results for equation department_stores: 
#> ================================================== 
#> department_stores = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + department_stores.l2 + clothing_stores.l2 + jewelry_stores.l2 + retail_trade.l2 + department_stores.l3 + clothing_stores.l3 + jewelry_stores.l3 + retail_trade.l3 + department_stores.l4 + clothing_stores.l4 + jewelry_stores.l4 + retail_trade.l4 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1   0.391793   0.079961   4.900 1.51e-06 ***
#> clothing_stores.l1     0.275423   0.043234   6.370 6.37e-10 ***
#> jewelry_stores.l1      0.714494   0.231380   3.088  0.00219 ** 
#> retail_trade.l1       -0.012353   0.004630  -2.668  0.00800 ** 
#> department_stores.l2   0.300850   0.092458   3.254  0.00126 ** 
#> clothing_stores.l2    -0.490130   0.056327  -8.702  < 2e-16 ***
#> jewelry_stores.l2     -0.797520   0.258746  -3.082  0.00223 ** 
#> retail_trade.l2        0.017460   0.005588   3.125  0.00194 ** 
#> department_stores.l3   0.124866   0.093058   1.342  0.18058    
#> clothing_stores.l3     0.195897   0.061762   3.172  0.00166 ** 
#> jewelry_stores.l3      0.484133   0.259908   1.863  0.06340 .  
#> retail_trade.l3        0.003161   0.005523   0.572  0.56747    
#> department_stores.l4   0.157334   0.077782   2.023  0.04391 *  
#> clothing_stores.l4    -0.022866   0.048953  -0.467  0.64073    
#> jewelry_stores.l4     -0.224192   0.226866  -0.988  0.32378    
#> retail_trade.l4       -0.010552   0.004555  -2.317  0.02115 *  
#> const                939.080318 307.260777   3.056  0.00242 ** 
#> trend                  1.267779   1.323701   0.958  0.33889    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 242.8 on 328 degrees of freedom
#> Multiple R-Squared: 0.992,   Adjusted R-squared: 0.9916 
#> F-statistic:  2396 on 17 and 328 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation clothing_stores: 
#> ================================================ 
#> clothing_stores = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + department_stores.l2 + clothing_stores.l2 + jewelry_stores.l2 + retail_trade.l2 + department_stores.l3 + clothing_stores.l3 + jewelry_stores.l3 + retail_trade.l3 + department_stores.l4 + clothing_stores.l4 + jewelry_stores.l4 + retail_trade.l4 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1  -0.470608   0.187166  -2.514  0.01240 *  
#> clothing_stores.l1     1.471416   0.101199  14.540  < 2e-16 ***
#> jewelry_stores.l1      1.481334   0.541593   2.735  0.00657 ** 
#> retail_trade.l1       -0.022066   0.010837  -2.036  0.04253 *  
#> department_stores.l2   0.283130   0.216416   1.308  0.19170    
#> clothing_stores.l2    -0.849183   0.131844  -6.441 4.23e-10 ***
#> jewelry_stores.l2     -2.490214   0.605647  -4.112 4.97e-05 ***
#> retail_trade.l2        0.020145   0.013080   1.540  0.12447    
#> department_stores.l3   0.301336   0.217820   1.383  0.16748    
#> clothing_stores.l3     0.008905   0.144567   0.062  0.95092    
#> jewelry_stores.l3      1.686023   0.608367   2.771  0.00590 ** 
#> retail_trade.l3        0.029211   0.012928   2.260  0.02450 *  
#> department_stores.l4  -0.092797   0.182064  -0.510  0.61061    
#> clothing_stores.l4     0.246877   0.114584   2.155  0.03192 *  
#> jewelry_stores.l4     -0.548916   0.531026  -1.034  0.30204    
#> retail_trade.l4       -0.031366   0.010662  -2.942  0.00350 ** 
#> const                901.039717 719.206842   1.253  0.21116    
#> trend                  6.571186   3.098394   2.121  0.03469 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 568.4 on 328 degrees of freedom
#> Multiple R-Squared: 0.9682,  Adjusted R-squared: 0.9665 
#> F-statistic: 586.9 on 17 and 328 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation jewelry_stores: 
#> =============================================== 
#> jewelry_stores = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + department_stores.l2 + clothing_stores.l2 + jewelry_stores.l2 + retail_trade.l2 + department_stores.l3 + clothing_stores.l3 + jewelry_stores.l3 + retail_trade.l3 + department_stores.l4 + clothing_stores.l4 + jewelry_stores.l4 + retail_trade.l4 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1 -1.017e-01  3.192e-02  -3.186 0.001579 ** 
#> clothing_stores.l1    1.513e-01  1.726e-02   8.770  < 2e-16 ***
#> jewelry_stores.l1     6.513e-01  9.236e-02   7.052 1.05e-11 ***
#> retail_trade.l1       1.262e-03  1.848e-03   0.683 0.495098    
#> department_stores.l2  1.287e-01  3.691e-02   3.488 0.000554 ***
#> clothing_stores.l2   -2.040e-01  2.248e-02  -9.073  < 2e-16 ***
#> jewelry_stores.l2    -1.251e-01  1.033e-01  -1.211 0.226634    
#> retail_trade.l2       5.435e-04  2.230e-03   0.244 0.807644    
#> department_stores.l3  3.259e-02  3.714e-02   0.877 0.380951    
#> clothing_stores.l3    6.650e-03  2.465e-02   0.270 0.787514    
#> jewelry_stores.l3     2.591e-01  1.037e-01   2.497 0.012998 *  
#> retail_trade.l3       4.426e-03  2.205e-03   2.008 0.045501 *  
#> department_stores.l4 -4.543e-02  3.105e-02  -1.463 0.144344    
#> clothing_stores.l4    2.916e-02  1.954e-02   1.492 0.136533    
#> jewelry_stores.l4     5.568e-03  9.055e-02   0.061 0.951009    
#> retail_trade.l4      -5.015e-03  1.818e-03  -2.758 0.006133 ** 
#> const                -2.625e+01  1.226e+02  -0.214 0.830687    
#> trend                 4.888e-01  5.284e-01   0.925 0.355630    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 96.92 on 328 degrees of freedom
#> Multiple R-Squared: 0.9564,  Adjusted R-squared: 0.9542 
#> F-statistic: 423.6 on 17 and 328 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation retail_trade: 
#> ============================================= 
#> retail_trade = department_stores.l1 + clothing_stores.l1 + jewelry_stores.l1 + retail_trade.l1 + department_stores.l2 + clothing_stores.l2 + jewelry_stores.l2 + retail_trade.l2 + department_stores.l3 + clothing_stores.l3 + jewelry_stores.l3 + retail_trade.l3 + department_stores.l4 + clothing_stores.l4 + jewelry_stores.l4 + retail_trade.l4 + const + trend 
#> 
#>                        Estimate Std. Error t value Pr(>|t|)    
#> department_stores.l1   -4.41938    1.39847  -3.160 0.001724 ** 
#> clothing_stores.l1      5.33965    0.75614   7.062 9.88e-12 ***
#> jewelry_stores.l1       6.63236    4.04668   1.639 0.102180    
#> retail_trade.l1         0.60132    0.08097   7.426 9.71e-13 ***
#> department_stores.l2    5.78024    1.61702   3.575 0.000403 ***
#> clothing_stores.l2    -12.89354    0.98511 -13.088  < 2e-16 ***
#> jewelry_stores.l2      -4.68000    4.52528  -1.034 0.301809    
#> retail_trade.l2         0.43146    0.09773   4.415 1.37e-05 ***
#> department_stores.l3    1.78181    1.62751   1.095 0.274404    
#> clothing_stores.l3      4.76946    1.08018   4.415 1.37e-05 ***
#> jewelry_stores.l3       6.35997    4.54560   1.399 0.162713    
#> retail_trade.l3         0.08492    0.09659   0.879 0.379971    
#> department_stores.l4   -2.94456    1.36035  -2.165 0.031142 *  
#> clothing_stores.l4      1.84239    0.85615   2.152 0.032131 *  
#> jewelry_stores.l4     -10.04188    3.96773  -2.531 0.011844 *  
#> retail_trade.l4        -0.14184    0.07967  -1.780 0.075927 .  
#> const                9587.19334 5373.77622   1.784 0.075337 .  
#> trend                  60.37537   23.15061   2.608 0.009525 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 4247 on 328 degrees of freedom
#> Multiple R-Squared: 0.9979,  Adjusted R-squared: 0.9978 
#> F-statistic:  9028 on 17 and 328 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>                   department_stores clothing_stores jewelry_stores retail_trade
#> department_stores             58961           97648          13653       667909
#> clothing_stores               97648          323040          42851      1608377
#> jewelry_stores                13653           42851           9394       272213
#> retail_trade                 667909         1608377         272213     18034658
#> 
#> Correlation matrix of residuals:
#>                   department_stores clothing_stores jewelry_stores retail_trade
#> department_stores            1.0000          0.7075         0.5801       0.6477
#> clothing_stores              0.7075          1.0000         0.7779       0.6664
#> jewelry_stores               0.5801          0.7779         1.0000       0.6613
#> retail_trade                 0.6477          0.6664         0.6613       1.0000

# AIC and BIC for VAR(1)
aic_var1 <- AIC(var1_model)
bic_var1 <- BIC(var1_model)

# AIC and BIC for VAR(p)
aic_varp <- AIC(varp_model)
bic_varp <- BIC(varp_model)

# Print the information criteria for comparison
cat("VAR(1) AIC:", aic_var1, "BIC:", bic_var1, "\n")
#> VAR(1) AIC: 20865.73 BIC: 20958.25
cat("VAR(p) AIC:", aic_varp, "BIC:", bic_varp, "\n")
#> VAR(p) AIC: 20293.36 BIC: 20570.31

# Decide based on the lower AIC and BIC
if(aic_var1 < aic_varp && bic_var1 < bic_varp) {
  cat("VAR(1) is preferred based on AIC and BIC.\n")
} else if(aic_varp < aic_var1 && bic_varp < bic_var1) {
  cat("VAR(p) is preferred based on AIC and BIC.\n")
} else {
  cat("The choice is not clear-cut. Further diagnostics and considerations may be necessary.\n")
}
#> VAR(p) is preferred based on AIC and BIC.

# One-month-ahead forecast for VAR(1) model
forecast_var1 <- predict(var1_model, n.ahead=1)

# One-month-ahead forecast for VAR(p) model
forecast_varp <- predict(varp_model, n.ahead=1)

# Printing the forecasts
print(forecast_var1)
#> $department_stores
#>                            fcst    lower    upper       CI
#> department_stores.fcst 10012.62 9416.608 10608.62 596.0077
#> 
#> $clothing_stores
#>                          fcst    lower    upper       CI
#> clothing_stores.fcst 13900.56 12535.85 15265.27 1364.711
#> 
#> $jewelry_stores
#>                         fcst    lower    upper       CI
#> jewelry_stores.fcst 3167.979 2927.815 3408.142 240.1633
#> 
#> $retail_trade
#>                       fcst    lower    upper       CI
#> retail_trade.fcst 500323.3 489393.2 511253.3 10930.02
print(forecast_varp)
#> $department_stores
#>                            fcst    lower    upper       CI
#> department_stores.fcst 10149.64 9673.721 10625.55 475.9152
#> 
#> $clothing_stores
#>                          fcst    lower    upper       CI
#> clothing_stores.fcst 14180.69 13066.71 15294.66 1113.977
#> 
#> $jewelry_stores
#>                         fcst   lower    upper       CI
#> jewelry_stores.fcst 3229.724 3039.76 3419.689 189.9647
#> 
#> $retail_trade
#>                       fcst    lower    upper       CI
#> retail_trade.fcst 505945.2 497621.8 514268.6 8323.425

# Plotting the forecasts
# For VAR(1)
plot(forecast_var1, main="One-Month-Ahead Forecast for VAR(1)")


# For VAR(p)
plot(forecast_varp, main="One-Month-Ahead Forecast for VAR(p)")


# Perform Granger causality test on the fitted VAR model
granger_test <- causality(varp_model)
#> Warning in causality(varp_model): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (department_stores) as cause variable.

# Print the results
print(granger_test)
#> $Granger
#> 
#>  Granger causality H0: department_stores do not Granger-cause
#>  clothing_stores jewelry_stores retail_trade
#> 
#> data:  VAR object varp_model
#> F-Test = 3.9509, df1 = 12, df2 = 1312, p-value = 5.148e-06
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: department_stores and
#>  clothing_stores jewelry_stores retail_trade
#> 
#> data:  VAR object varp_model
#> Chi-squared = 123.8, df = 3, p-value < 2.2e-16
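
# Aside (a sketch): the warning above can be avoided by naming the cause
# variable explicitly, testing each series in turn, e.g.:
causality(varp_model, cause = "clothing_stores")$Granger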

# Create data matrix for BigVAR
data_matrix <- as.matrix(data_ts)

# Fit the sparse VAR model with LASSO regularization, lambda = 0.1, and lag-based sparsity
sparse_var_model <- BigVAR.fit(data_matrix, p = 16, lambda = 0.1, struct = "Lag")

# Print summary of the sparse VAR model
print(summary(sparse_var_model))
#>       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
#> -11449.260     -0.118      0.000    -48.351      0.122   1032.755
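
# Aside (a hedged sketch, not in the original run): BigVAR's usual workflow
# selects lambda by rolling cross-validation via constructModel()/cv.BigVAR()
# rather than fixing it at 0.1.
spec <- constructModel(data_matrix, p = 4, struct = "Basic",
                       gran = c(50, 10))  # penalty-grid depth and number of values
cv_fit <- cv.BigVAR(spec)
cv_fit@OptimalLambda  # CV-selected penalty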

Created on 2024-04-10 with reprex v2.1.0

anjuanand6 commented 7 months ago

Homework - 4

The following steps were performed:

  1. Chose 4 seasonally adjusted time series from the US Census Bureau's Monthly Retail Trade Survey (MRTS): MRTS4400 - Retail Trade, MRTS4521 - Department Stores, MRTS4481 - Clothing Stores, and MRTS44831 - Jewelry Stores.
  2. Did some empirical analysis and checked the series' features.
  3. Fit a VAR(1) model and a VAR(p) model with p > 1.
  4. Compared the two fits and checked which is better; here VAR(p) is better on both AIC and BIC.
  5. Produced a one-month-ahead forecast of the series.
  6. Checked the Granger causality between the series.
  7. Used the BigVAR package to fit a sparse VAR model with LASSO regularization.