Milind Rajendra Kulkarni

# Load required packages

library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)

# set the api key
fredr_set_key('48753a49458ab78ef7185cffd58175b6')
data <- fredr(series_id = 'ICNSA')

head(data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-08     2024-02-08  
#> 2 1967-01-14 ICNSA     334000 2024-02-08     2024-02-08  
#> 3 1967-01-21 ICNSA     277000 2024-02-08     2024-02-08  
#> 4 1967-01-28 ICNSA     252000 2024-02-08     2024-02-08  
#> 5 1967-02-04 ICNSA     274000 2024-02-08     2024-02-08
data$DATE <- as.Date(data$date, format="%m/%d/%Y")

# Time series object that stores annual data Create a time series object.
ts_data <- ts(data$value, frequency = 52)

# Seasonal Decomposition Plot of data
plot(stl(ts_data, s.window="periodic"), main = "Seasonal Decomposition")


# Plot of uninsured claims data
plot(ts_data, main = "Date", xlab = "Date", ylab = "Uninsured Claims")

# Plot shows sudden spike during covid except that the uninsured claim increase gradually.

# arima model to build time series model
model <- auto.arima(ts_data)

# predicting unemployment
forecasted_values <- forecast(model, h = 1)
forecasted_values <- forecasted_values$mean

cat("Forecasted Unemployment  (including COVID year):", forecasted_values, "\n")
#> Forecasted Unemployment  (including COVID year): 276396.2

^{Created on 2024-02-08 with reprex v2.1.0}

library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3

# set the api key
fredr_set_key('48753a49458ab78ef7185cffd58175b6')
data <- fredr(series_id = 'ICNSA')

head(data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-09     2024-02-09  
#> 2 1967-01-14 ICNSA     334000 2024-02-09     2024-02-09  
#> 3 1967-01-21 ICNSA     277000 2024-02-09     2024-02-09  
#> 4 1967-01-28 ICNSA     252000 2024-02-09     2024-02-09  
#> 5 1967-02-04 ICNSA     274000 2024-02-09     2024-02-09
data$DATE <- as.Date(data$date, format="%m/%d/%Y")

# set the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2022-12-31")

# filter out the covid-19 period data
data_without_covid <- subset(data, DATE < covid_start | DATE > covid_end)
tail(data_without_covid, n = 60)
#> # A tibble: 60 × 6
#>    date       series_id  value realtime_start realtime_end DATE      
#>    <date>     <chr>      <dbl> <date>         <date>       <date>    
#>  1 2019-12-14 ICNSA     270547 2024-02-09     2024-02-09   2019-12-14
#>  2 2019-12-21 ICNSA     287243 2024-02-09     2024-02-09   2019-12-21
#>  3 2019-12-28 ICNSA     312524 2024-02-09     2024-02-09   2019-12-28
#>  4 2023-01-07 ICNSA     339883 2024-02-09     2024-02-09   2023-01-07
#>  5 2023-01-14 ICNSA     288330 2024-02-09     2024-02-09   2023-01-14
#>  6 2023-01-21 ICNSA     225228 2024-02-09     2024-02-09   2023-01-21
#>  7 2023-01-28 ICNSA     225026 2024-02-09     2024-02-09   2023-01-28
#>  8 2023-02-04 ICNSA     234007 2024-02-09     2024-02-09   2023-02-04
#>  9 2023-02-11 ICNSA     225332 2024-02-09     2024-02-09   2023-02-11
#> 10 2023-02-18 ICNSA     211007 2024-02-09     2024-02-09   2023-02-18
#> # ℹ 50 more rows

ts_data <- ts(data_without_covid$value, frequency = 52)

plot(ts_data, main = "Uninsured Claims without Covid-19 period", xlab = "Date", ylab = "Uninsured Claims")


# acf to detect auto-correlation between the data
acf(ts_data, main = "Autocorrelation Function (ACF)")


# pacf to detect any random spikes
pacf(ts_data, main = "Partial Autocorrelation Function (PACF)")


model <- auto.arima(ts_data)

# Forecast unemployment value
forecasted_values <- forecast(model, h = 1)
forecasted_values <- forecasted_values$mean

cat("Forecasted Unemployment (excluding COVID period):", forecasted_values, "\n")
#> Forecasted Unemployment (excluding COVID period): 232214.8

^{Created on 2024-02-09 with reprex v2.0.2}

# Load required packages
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3

# set the api key
fredr_set_key('48753a49458ab78ef7185cffd58175b6')

# Monthly State Retail Sales: Gasoline Stations in California
data <- fredr(series_id = 'MSRSCA447')
head(data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id value realtime_start realtime_end
#>   <date>     <chr>     <dbl> <date>         <date>      
#> 1 2019-01-01 MSRSCA447   5.3 2024-02-12     2024-02-12  
#> 2 2019-02-01 MSRSCA447   9.7 2024-02-12     2024-02-12  
#> 3 2019-03-01 MSRSCA447   6.8 2024-02-12     2024-02-12  
#> 4 2019-04-01 MSRSCA447  14.1 2024-02-12     2024-02-12  
#> 5 2019-05-01 MSRSCA447   2.2 2024-02-12     2024-02-12

# create time series obj
ts_data <- ts(data$value, frequency = 12)

# plot for detecting trending and seasonality
plot(stl(ts_data, s.window="periodic"), main = "Seasonal Decomposition")

# we can observe seasonality in the data, there is also an hint of cyclic trend

# plot the time series
plot(ts_data, main = "Date", xlab = "Date", ylab = "Retail Sales: Gasoline Stations")


# plot ACF to detect models nature
acf(ts_data, main = "Autocorrelation Function (ACF)")

# ACF plot shows rapid decay indicating positive autocorrelation

# plot PACF
pacf(ts_data, main = "Partial Autocorrelation Function (PACF)")

# there aren't any significant spikes in the PACF plot, except a small one at LAG 1

# to find the best performing Arima model let us ttry and fetch the order with
# the best or lowest AIC score amongst the given range of p, d and q values
p_values <- 0:3
d_values <- 0:3
q_values <- 0:3

# initialize aic and order
top_aic <- Inf
top_order <- c(0, 0, 0)

# iterate
for (p in p_values) {
  for (d in d_values) {
    for (q in q_values) {
      curr_order <- c(p, d, q)
      curr_model <- Arima(ts_data, order=curr_order)
      curr_aic <- AIC(curr_model)
      if (curr_aic < top_aic) {
        # new top order found, overwrite
        top_aic <- curr_aic
        top_order <- curr_order
      }
    }
  }
}

# top performing order
top_order
#> [1] 2 1 2

# AIC score associated with the top order
top_aic
#> [1] 429.6667

# Arima model for the top order
model <- Arima(ts_data, order=top_order)
summary(model)
#> Series: ts_data 
#> ARIMA(2,1,2) 
#> 
#> Coefficients:
#>          ar1      ar2     ma1     ma2
#>       0.0182  -0.5750  0.1435  1.0000
#> s.e.  0.1759   0.1351  0.1035  0.0778
#> 
#> sigma^2 = 91.73:  log likelihood = -209.83
#> AIC=429.67   AICc=430.84   BIC=439.88
#> 
#> Training set error measures:
#>                      ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set -0.3980385 9.155217 7.393018 -0.3433542 85.88841 0.2017793
#>                     ACF1
#> Training set -0.04745976

# check the distribution of residuals
checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(2,1,2)
#> Q* = 17.1, df = 8, p-value = 0.02909
#> 
#> Model df: 4.   Total lags used: 12
# the model for the given top order has a p-value that implies 
# that there is still significant auto-correlation in our model

# lets try next iteration of model to see if we get a better fit while accepting
# some of the trade-offs between p-value and AIC score

model <- Arima(ts_data, order=c(3, 1, 0))
summary(model)
#> Series: ts_data 
#> ARIMA(3,1,0) 
#> 
#> Coefficients:
#>          ar1     ar2      ar3
#>       0.1030  0.0833  -0.2168
#> s.e.  0.1286  0.1279   0.1304
#> 
#> sigma^2 = 109.8:  log likelihood = -213.32
#> AIC=434.65   AICc=435.42   BIC=442.82
#> 
#> Training set error measures:
#>                      ME     RMSE      MAE      MPE     MAPE      MASE
#> Training set -0.4974367 10.10942 7.874796 2.063276 84.31892 0.2149286
#>                     ACF1
#> Training set -0.04552645
checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(3,1,0)
#> Q* = 25.523, df = 9, p-value = 0.002444
#> 
#> Model df: 3.   Total lags used: 12
# this model shows significant improvement in p-value while trading-off little
# to no difference in the AIC vlaues 

# predict the values
prediction <- forecast(model, h = 12)
prediction
#>       Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
#> Nov 5      -23.04473 -36.47174 -9.617719 -43.57957 -2.509890
#> Dec 5      -24.66909 -44.65972 -4.678463 -55.24212  5.903936
#> Jan 6      -23.65864 -49.30818  1.990888 -62.88622 15.568930
#> Feb 6      -22.74769 -51.68857  6.193186 -67.00895 21.513562
#> Mar 6      -22.21746 -53.91732  9.482399 -70.69821 26.263291
#> Apr 6      -22.30609 -56.34158 11.729403 -74.35888 29.746703
#> May 6      -22.46859 -58.85751 13.920326 -78.12065 33.183457
#> Jun 6      -22.60770 -61.24241 16.027018 -81.69439 36.478999
#> Jul 6      -22.61634 -63.42746 18.194780 -85.03156 39.798882
#> Aug 6      -22.59358 -65.44677 20.259616 -88.13188 42.944726
#> Sep 6      -22.56179 -67.35910 22.235525 -91.07336 45.949789
#> Oct 6      -22.55474 -69.20268 24.093198 -93.89661 48.787126
# the predicted data shows steady increase over time.

^{Created on 2024-02-13 with reprex v2.0.2}

Observations:

We found the best fitting model per the AIC but it still showed significant autocorrelation
We then found a model with better p-value whilst trading-off a little with the AIC score
The final predict values show a steady increase in the data over time.

library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(corrplot)
#> Warning: package 'corrplot' was built under R version 4.2.3
#> corrplot 0.92 loaded

fredr_set_key('48753a49458ab78ef7185cffd58175b6')

icnsa <- fredr(series_id = 'ICNSA')
icnsa$Date <- as.Date(icnsa$date)
ts_icnsa <- ts(icnsa$value, frequency = 52)
plot(stl(ts_icnsa, s.window="periodic"))

acf(ts_icnsa)

pacf(ts_icnsa)


ccnsa <- fredr(series_id = 'CCNSA')
ccnsa$Date <- as.Date(ccnsa$date)
ts_ccnsa <- ts(ccnsa$value, frequency = 52)
plot(stl(ts_ccnsa, s.window="periodic"))

acf(ts_ccnsa)

pacf(ts_ccnsa)



data <- merge(icnsa, ccnsa, by = "Date", suffixes = c("_icnsa", "_ccnsa"))
head(data, n =5)
#>         Date date_icnsa series_id_icnsa value_icnsa realtime_start_icnsa
#> 1 1967-01-07 1967-01-07           ICNSA      346000           2024-02-22
#> 2 1967-01-14 1967-01-14           ICNSA      334000           2024-02-22
#> 3 1967-01-21 1967-01-21           ICNSA      277000           2024-02-22
#> 4 1967-01-28 1967-01-28           ICNSA      252000           2024-02-22
#> 5 1967-02-04 1967-02-04           ICNSA      274000           2024-02-22
#>   realtime_end_icnsa date_ccnsa series_id_ccnsa value_ccnsa
#> 1         2024-02-22 1967-01-07           CCNSA     1594000
#> 2         2024-02-22 1967-01-14           CCNSA     1563000
#> 3         2024-02-22 1967-01-21           CCNSA     1551000
#> 4         2024-02-22 1967-01-28           CCNSA     1533000
#> 5         2024-02-22 1967-02-04           CCNSA     1534000
#>   realtime_start_ccnsa realtime_end_ccnsa
#> 1           2024-02-22         2024-02-22
#> 2           2024-02-22         2024-02-22
#> 3           2024-02-22         2024-02-22
#> 4           2024-02-22         2024-02-22
#> 5           2024-02-22         2024-02-22

corr <- cor(data$value_icnsa, data$value_ccnsa, use = "complete.obs")
corr
#> [1] 0.714954

ts_icnsa <- ts(data$value_icnsa, frequency = 52)
ts_ccnsa <- ts(data$value_ccnsa, frequency = 52)

model <- auto.arima(ts_icnsa, xreg = ts_ccnsa)
summary(model)
#> Series: ts_icnsa 
#> Regression with ARIMA(2,1,1)(1,0,2)[52] errors 
#> 
#> Coefficients:
#>          ar1      ar2      ma1    sar1     sma1    sma2     xreg
#>       0.5666  -0.2917  -0.0020  0.4014  -0.1051  0.0931  -0.0382
#> s.e.  0.0714   0.0350   0.0794  0.0523   0.0534  0.0232   0.0069
#> 
#> sigma^2 = 6.941e+09:  log likelihood = -37967.81
#> AIC=75951.63   AICc=75951.68   BIC=75999.62
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set -14.66077 83197.81 36955.39 -0.4938972 9.336523 0.3891743
#>                      ACF1
#> Training set 6.786416e-05

coeff <- mean(diff(ts_icnsa))
coeff
#> [1] -37.36434

forecast <- forecast(model, xreg = tail(data$value_ccnsa, 1), h = 1)
forecast_mean <- mean(forecast$mean)
cat("Forecasted Value:", forecast_mean, "\n")
#> Forecasted Value: 213860.3

<sup>Created on 2024-02-22 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>

Observations Data and Model:

Plots show that the data has decay in ACF and also the PACF is also consistent and within confidence limits.
The data is also highly correlated, 0.71.
The model is ARIMA (2, 1, 1) (1, 0, 2) [52].
The seasonality and non-seasonality is captured by the model.

Regression Diagnostics:

Model relies highly on the AR1, AR2 slightly less.
Inclusion of the regressor improves AIC, AICc, and BIC, indicating a better-fitting model compared to the one without the regressor.

# Load required packages
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
library(ggplot2)
library(zoo)
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
# Set up API key to read the unemployment rate dataset from FRED
fredr_set_key('48753a49458ab78ef7185cffd58175b6')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
data <- fredr(series_id = 'ICNSA')
data$Date <- as.Date(data$date)
head(data, n = 5)
#> # A tibble: 5 × 6
#>   date       series_id  value realtime_start realtime_end Date      
#>   <date>     <chr>      <dbl> <date>         <date>       <date>    
#> 1 1967-01-07 ICNSA     346000 2024-02-28     2024-02-28   1967-01-07
#> 2 1967-01-14 ICNSA     334000 2024-02-28     2024-02-28   1967-01-14
#> 3 1967-01-21 ICNSA     277000 2024-02-28     2024-02-28   1967-01-21
#> 4 1967-01-28 ICNSA     252000 2024-02-28     2024-02-28   1967-01-28
#> 5 1967-02-04 ICNSA     274000 2024-02-28     2024-02-28   1967-02-04

# creating time series object
ts <- ts(data$value, frequency = 52)

# plot ICNSA
plot(ts, main = "Date", xlab = "Date", ylab = "Uninsured Claims")

plot(stl(ts, s.window="periodic"), main = "Seasonal Decomposition")


# set the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2021-12-31")

# Plot highlighting the covid period in our dataset 
plot(data$date, data$value, type = "l", main = "Claims during Covid period", xlab = "Year", ylab = "Number")

abline(v = as.numeric(covid_start), col = "red", lty = 2)
abline(v = as.numeric(covid_end), col = "red", lty = 2)


# filter out the covid-19 period data
covid_period_data <- subset(data, Date >= covid_start | Date <= covid_end)
data_without_covid <- subset(data, Date < covid_start | Date > covid_end)

# Perform cubic spline interpolation for multiple values of spar to find the best fit
spar_values <- seq(0.1, 1.0, by = 0.1)

for (spar in spar_values) {
  # fit the spline on our data
  spline_fit  <- smooth.spline(x = as.numeric(data_without_covid$Date), y = data_without_covid$value, spar = spar, tol = 1e-6)

  # Impute the new values using the spline fit
  imputed_data <- predict(spline_fit, x = as.numeric(covid_period_data$Date))$y

  # Update Covid period data with imputed values
  covid_period_data$value <- imputed_data

  # Update the data with new predicted values
  updated_data <- rbind(data_without_covid, covid_period_data)
  updated_data <- updated_data %>% arrange(date)

  # Plot updated data
  plot(updated_data$date, updated_data$value, type = "l", col = "red", lwd = 2,
       main = paste("Smoothened Time Series (spar =", spar, ")"), xlab = "Year", ylab = "Number")

  lines(data$date, data$value, col = "black", lwd = 2)
  legend("topleft", legend = c("Updated", "Original"), col = c("red", "black"), lty = 1, lwd = 2)
}

# after visualizing all the splines we believe that the one the with spar=0.4 gives the best
# naturally fit of the data and we select it for predicting our future values.

# selecting the best fir spline
spline_fit_covid <- smooth.spline(x = as.numeric(data_without_covid$Date), y = data_without_covid$value, spar = 0.4, tol = 1e-6)
imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period_data$Date))$y
covid_period_data$value <- imputed_values
updated_data <- rbind(data_without_covid, covid_period_data)
updated_data <- updated_data %>% arrange(Date)

plot(updated_data$Date, updated_data$value, type = "l", col = "red", lwd = 2,
     main = "Updated TS Vs Original TS", xlab = "Year", ylab = "Number")
lines(data$Date, data$value, col = "black", lwd = 2)
legend("topright", legend = c("Updated", "Original"), col = c("red", "black"), lty = 1, lwd = 2)


# Plot both time series for change comparision
ts <- ts(data$value, frequency = 52)
new_ts <- ts(updated_data$value, frequency = 52)

plot(ts, col = "black", main = "Comparison of Time Series", ylab = "Value")
lines(new_ts, col = "red")
legend("topright", legend = c("Original", "Updated"), col = c("black", "red"), lty = 1, lwd = 2)


# Fit and forecast the multiplicative Holt-Winters model
hw_multiplicative <- HoltWinters(ts, seasonal = "multiplicative")
forecast_multiplicative <- forecast(hw_multiplicative, h = 1)

# Fit and forecast the additive Holt-Winters model
hw_additive <- HoltWinters(ts, seasonal = "additive")
forecast_additive <- forecast(hw_additive, h = 1)

# Print the forecasts
cat("Multiplicative Holt-Winters Forecast:", forecast_multiplicative$mean, "\n")
#> Multiplicative Holt-Winters Forecast: 197681.2
cat("Additive Holt-Winters Forecast:", forecast_additive$mean, "\n")
#> Additive Holt-Winters Forecast: 161954.6

^{Created on 2024-02-28 with reprex v2.0.2}

Multiplicative Holt-Winters Forecast: 197681.2
Additive Holt-Winters Forecast: 161954.6

# Load required packages
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
library(ggplot2)
library(zoo)
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(KFAS)
#> Warning: package 'KFAS' was built under R version 4.2.3
#> Please cite KFAS in publications by using: 
#> 
#>   Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
library(imputeTS)
#> Warning: package 'imputeTS' was built under R version 4.2.3
#> 
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:zoo':
#> 
#>     na.locf
#> The following object is masked from 'package:tseries':
#> 
#>     na.remove
library(MARSS)
#> Warning: package 'MARSS' was built under R version 4.2.3
library(dplyr)
library(stats)

# Set up API key to read the unemployment rate dataset from FRED
fredr_set_key('48753a49458ab78ef7185cffd58175b6')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
data <- fredr(series_id = 'ICNSA')
data$Date <- as.Date(data$date)
head(data, n = 5)
#> # A tibble: 5 × 6
#>   date       series_id  value realtime_start realtime_end Date      
#>   <date>     <chr>      <dbl> <date>         <date>       <date>    
#> 1 1967-01-07 ICNSA     346000 2024-03-07     2024-03-07   1967-01-07
#> 2 1967-01-14 ICNSA     334000 2024-03-07     2024-03-07   1967-01-14
#> 3 1967-01-21 ICNSA     277000 2024-03-07     2024-03-07   1967-01-21
#> 4 1967-01-28 ICNSA     252000 2024-03-07     2024-03-07   1967-01-28
#> 5 1967-02-04 ICNSA     274000 2024-03-07     2024-03-07   1967-02-04

# creating time series object
ts <- ts(data$value, frequency = 52)

# plot ICNSA
plot(ts, main = "Date", xlab = "Date", ylab = "Uninsured Claims")

plot(stl(ts, s.window="periodic"), main = "Seasonal Decomposition")


# Plot highlighting the covid period in our dataset 
plot(data$date, data$value, type = "l", main = "Claims during Covid period", xlab = "Year", ylab = "Number")

# set the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2021-12-31")

abline(v = as.numeric(covid_start), col = "red", lty = 2)
abline(v = as.numeric(covid_end), col = "red", lty = 2)


# filter out the covid-19 period data
covid_period_data <- subset(data, Date >= covid_start & Date <= covid_end)
data_without_covid <- subset(data, Date < covid_start | Date > covid_end)

# Set values between Covid start and end date to NA
data$value[data$Date >= covid_start & data$Date <= covid_end] <- NA

# Plot data with NA values
plot(data$Date, data$value, type = "l", 
   main = "Initial Claims - Covid Period - Assigned NULL",
   xlab = "Year",
   ylab = "Number")


# Impute values for the Covid period using sSSM
imputed_values <- na_kalman(data$value, smooth = TRUE)
data$value <- imputed_values

plot(data$Date, data$value, type = "l", 
   main = "Imputed Values - Kalman",
   xlab = "Year",
   ylab = "Number")


# New time series object
new_ts <- ts(data$value, frequency = 52)

# Lets fit our spline at spar = 0.4
spline_fit <- smooth.spline(x = as.numeric(data_without_covid$Date), y = data_without_covid$value, spar = 0.4)

# Imputing new values using spline fit
imputed_values_spline <- predict(spline_fit, x = as.numeric(covid_period_data$date))$y

# Updating Covid period data with imputed values
covid_period_data$value <- imputed_values_spline

# Combine imputed values
updated_data <- rbind(data_without_covid, covid_period_data)
updated_data <- updated_data %>% arrange(Date)

plot(updated_data$date, updated_data$value, type = "l", col = "red", lwd = 2,
     main = "Updated (penalty - 0.2) Vs Original", xlab = "Year", ylab = "Number")
lines(data$Date, data$value, col = "black", lwd = 2)
legend("topright", legend = c("Updated", "Original"), col = c("red", "black"), lty = 1, lwd = 2)


ts_spline <- ts(updated_data$value, frequency = 52)

# Comparison of original series with imputed series
plot(new_ts, col = "red", type = "l", lty = 1, ylab = "Value", main = "Time Series Comparison")
lines(ts, col = "black", type = "l", lty = 2)
lines(ts_spline, col = "blue", type = "l", lty = 3)
legend("topright", legend = c("New (Kalman)", "Original", "Spline"), col = c("red", "black", "blue"), lty = 1:3)


# Seasonal decomposition to check tre
decomposition <- stl(new_ts, s.window = "periodic")
plot(decomposition)


# Fitting a structural time series model with both trend and seasonal components
ssm_model <- StructTS(new_ts, type = "BSM")
ssm_model
#> 
#> Call:
#> StructTS(x = new_ts, type = "BSM")
#> 
#> Variances:
#>     level      slope       seas    epsilon  
#> 2.967e+08  9.625e+02  2.199e+08  0.000e+00
summary(ssm_model)
#>           Length Class  Mode     
#> coef         4   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    8946   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric

# Use tsSmooth to obtain the smoothed state estimates
smoothed_data <- tsSmooth(ssm_model)
residuals <- residuals(ssm_model)

# Plot residuals to visualize the error component
plot(residuals, main="Residuals", ylab="Residuals", xlab="Time")


# Load IURNSA data
iurnsa_data <- fredr(series_id = "IURNSA")
iurnsa_data$Date <- as.Date(iurnsa_data$date)

# Check the structure and values of iurnsa_data to ensure compatibility with ICNSA data
icnsa_data <- subset(data, Date >= as.Date("1971-01-01"))

# Remove last row of icnsa_data
icnsa_data <- icnsa_data[-nrow(icnsa_data), ]
merged_data <- merge(icnsa_data, iurnsa_data, by = "date", all.x = TRUE)
arima_model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y, seasonal = TRUE)
summary(arima_model)
#> Series: merged_data$value.x 
#> Regression with ARIMA(2,1,1) errors 
#> 
#> Coefficients:
#>          ar1     ar2      ma1       xreg
#>       0.6878  0.1547  -0.9862  23256.533
#> s.e.  0.0209  0.0204   0.0041   3959.789
#> 
#> sigma^2 = 2.443e+09:  log likelihood = -33892.12
#> AIC=67794.23   AICc=67794.26   BIC=67823.87
#> 
#> Training set error measures:
#>                    ME    RMSE      MAE      MPE     MAPE      MASE         ACF1
#> Training set -178.193 49377.9 33016.53 -1.38752 8.782257 0.9876402 -0.003897536

# Forecast the next week's value using the model with the covariate
forecasted_value <- forecast(arima_model, h = 1, xreg = tail(merged_data$value.y, 2))
forecasted_value
#>      Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
#> 2774       208369.2 145031.7 271706.6 111502.90 305235.5
#> 2775       210768.8 133396.8 288140.8  92438.51 329099.1

forecast_value <- mean(forecasted_value$mean, na.rm = TRUE)
forecast_value
#> [1] 209569

^{Created on 2024-03-07 with reprex v2.0.2}

# Load required packages
library(readxl)
#> Warning: package 'readxl' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.3.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
#> Warning: package 'readr' was built under R version 4.3.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.3.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.3
library(MARSS)
#> Warning: package 'MARSS' was built under R version 4.3.3
library(corrplot)
#> Warning: package 'corrplot' was built under R version 4.3.3
#> corrplot 0.92 loaded
library(vars)
#> Warning: package 'vars' was built under R version 4.2.3
#> Loading required package: MASS
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.3.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Warning: package 'urca' was built under R version 4.3.3
#> Loading required package: lmtest
#> Warning: package 'lmtest' was built under R version 4.3.3
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.2.3
#> Loading required package: lattice

# 25S   Chemical Products
# 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing
# 25B   Pharmaceutical and Medicine Manufacturing
# 25C   Paint, Coating, and Adhesive, Manufacturing

df_25s <- read_excel("C:/Study Material/R Studio/25STI.xlsx")
df_25a <- read_excel("C:/Study Material/R Studio/25ATI.xlsx")
df_25b <- read_excel("C:/Study Material/R Studio/25BTI.xlsx")
df_25c <- read_excel("C:/Study Material/R Studio/25CTI.xlsx")

df_25s$Period = as.Date(paste("01", df_25s$Period, sep = "-"), format = "%d-%b-%Y")
df_25a$Period = as.Date(paste("01", df_25a$Period, sep = "-"), format = "%d-%b-%Y")
df_25b$Period = as.Date(paste("01", df_25b$Period, sep = "-"), format = "%d-%b-%Y")
df_25c$Period = as.Date(paste("01", df_25c$Period, sep = "-"), format = "%d-%b-%Y")

df_25s <- subset(df_25s, Period <= as.Date("2024-02-01"))
df_25a <- subset(df_25a, Period <= as.Date("2024-02-01"))
df_25b <- subset(df_25b, Period <= as.Date("2024-02-01"))
df_25c <- subset(df_25c, Period <= as.Date("2024-02-01"))

df_25s$Value <- as.numeric(df_25s$Value)
df_25a$Value <- as.numeric(df_25a$Value)
df_25b$Value <- as.numeric(df_25b$Value)
df_25c$Value <- as.numeric(df_25c$Value)

plot(df_25s$Period, df_25s$Value, type = "l", 
     main = "25S Chemical Products",
     xlab = "Period",
     ylab = "Value")


plot(df_25a$Period, df_25a$Value, type = "l", 
     main = "25A Pesticide, Fertilizer and Other Agricultural Chemical Manufacturing",
     xlab = "Period",
     ylab = "Value")


plot(df_25b$Period, df_25b$Value, type = "l", 
     main = "25B Pharmaceutical and Medicine Manufacturing",
     xlab = "Period",
     ylab = "Value")


plot(df_25c$Period, df_25c$Value, type = "l", 
     main = "25C Paint, Coating, and Adhesive, Manufacturing",
     xlab = "Period",
     ylab = "Value")


# 4 in 1
ggplot(xlab = 'Period', ylab = 'Value') + 
  geom_line(data = df_25s, aes(x = Period, y = Value)) +
  geom_line(data = df_25a, aes(x = Period, y = Value), color = "red") +
  geom_line(data = df_25b, aes(x = Period, y = Value), color = "blue") +
  geom_line(data = df_25c, aes(x = Period, y = Value), color = "green")


cor(df_25s$Value, df_25a$Value)
#> [1] 0.9606849
cor(df_25s$Value, df_25b$Value)
#> [1] 0.9899559
cor(df_25s$Value, df_25c$Value)
#> [1] 0.9040823
cor(df_25a$Value, df_25b$Value)
#> [1] 0.9513506
cor(df_25a$Value, df_25c$Value)
#> [1] 0.8941125
cor(df_25b$Value, df_25c$Value)
#> [1] 0.8781852

ccf(df_25s$Value, df_25a$Value)

ccf(df_25s$Value, df_25b$Value)

ccf(df_25s$Value, df_25c$Value)

ccf(df_25a$Value, df_25b$Value)

ccf(df_25a$Value, df_25c$Value)

ccf(df_25b$Value, df_25c$Value)


adf.test(df_25s$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25s$Value
#> Dickey-Fuller = -3.2907, Lag order = 7, p-value = 0.07273
#> alternative hypothesis: stationary
df_25s$Value <- c(NA, diff(df_25s$Value))
df_25s <- subset(df_25s, !is.na(Value))
adf.test(df_25s$Value, alternative = "stationary")
#> Warning in adf.test(df_25s$Value, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25s$Value
#> Dickey-Fuller = -4.7374, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
acf(df_25s$Value)

pacf(df_25s$Value)


adf.test(df_25a$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25a$Value
#> Dickey-Fuller = -2.21, Lag order = 7, p-value = 0.4885
#> alternative hypothesis: stationary
df_25a$Value <- c(NA, diff(df_25a$Value))
df_25a <- subset(df_25a, !is.na(Value))
adf.test(df_25a$Value, alternative = "stationary")
#> Warning in adf.test(df_25a$Value, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25a$Value
#> Dickey-Fuller = -7.0763, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
acf(df_25a$Value)

pacf(df_25a$Value)


adf.test(df_25b$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25b$Value
#> Dickey-Fuller = -2.879, Lag order = 7, p-value = 0.206
#> alternative hypothesis: stationary
df_25b$Value <- c(NA, diff(df_25b$Value))
df_25b <- subset(df_25b, !is.na(Value))
adf.test(df_25b$Value, alternative = "stationary")
#> Warning in adf.test(df_25b$Value, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25b$Value
#> Dickey-Fuller = -6.2545, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
acf(df_25b$Value)

pacf(df_25b$Value)


adf.test(df_25c$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25c$Value
#> Dickey-Fuller = -3.7952, Lag order = 7, p-value = 0.01947
#> alternative hypothesis: stationary
df_25c$Value <- c(NA, diff(df_25c$Value))
df_25c <- subset(df_25c, !is.na(Value))
adf.test(df_25c$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25c$Value
#> Dickey-Fuller = -3.7229, Lag order = 7, p-value = 0.02309
#> alternative hypothesis: stationary
acf(df_25c$Value)

pacf(df_25c$Value)


ts_25s = ts(df_25s$Value, frequency = 12)
plot(stl(ts_25s, s.window="periodic"), main = "25A TI: Seasonal Decomposition")


ts_25a = ts(df_25a$Value, frequency = 12)
plot(stl(ts_25a, s.window="periodic"), main = "25A TI: Seasonal Decomposition")


ts_25b = ts(df_25b$Value, frequency = 12)
plot(stl(ts_25b, s.window="periodic"), main = "25B TI: Seasonal Decomposition")


ts_25c = ts(df_25c$Value, frequency = 12)
plot(stl(ts_25c, s.window="periodic"), main = "25C TI: Seasonal Decomposition")


# 4 in 1
ggplot(xlab = 'Period', ylab = 'Value') + 
  geom_line(data = df_25s, aes(x = Period, y = Value)) +
  geom_line(data = df_25a, aes(x = Period, y = Value), color = "red") +
  geom_line(data = df_25b, aes(x = Period, y = Value), color = "blue") +
  geom_line(data = df_25c, aes(x = Period, y = Value), color = "green")


# 25S   Chemical Products
# 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing
# 25B   Pharmaceutical and Medicine Manufacturing
# 25C   Paint, Coating, and Adhesive, Manufacturing

ts_df <- data.frame(Y1=ts_25s, Y2=ts_25a, Y3=ts_25b, Y4=ts_25c)
ts_obj <- ts(ts_df)

# VAR Model
var_model <- VAR(ts_obj, p = 1)
summary(var_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Y1, Y2, Y3, Y4 
#> Deterministic variables: const 
#> Sample size: 384 
#> Log Likelihood: -9709.25 
#> Roots of the characteristic polynomial:
#> 0.6195 0.1965 0.0824 0.02111
#> Call:
#> VAR(y = ts_obj, p = 1)
#> 
#> 
#> Estimation results for equation Y1: 
#> =================================== 
#> Y1 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1   0.57145    0.07175   7.964 1.96e-14 ***
#> Y2.l1  -0.80294    0.28004  -2.867  0.00437 ** 
#> Y3.l1  -0.52817    0.10312  -5.122 4.82e-07 ***
#> Y4.l1   0.30315    0.64276   0.472  0.63746    
#> const 147.61450   26.34659   5.603 4.06e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 478.1 on 379 degrees of freedom
#> Multiple R-Squared: 0.1801,  Adjusted R-squared: 0.1714 
#> F-statistic: 20.81 on 4 and 379 DF,  p-value: 1.597e-15 
#> 
#> 
#> Estimation results for equation Y2: 
#> =================================== 
#> Y2 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>       Estimate Std. Error t value Pr(>|t|)  
#> Y1.l1  0.02525    0.01366   1.849   0.0652 .
#> Y2.l1  0.06611    0.05331   1.240   0.2157  
#> Y3.l1 -0.03373    0.01963  -1.719   0.0865 .
#> Y4.l1  0.09959    0.12235   0.814   0.4162  
#> const  7.69050    5.01516   1.533   0.1260  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 91.01 on 379 degrees of freedom
#> Multiple R-Squared: 0.02855, Adjusted R-squared: 0.0183 
#> F-statistic: 2.785 on 4 and 379 DF,  p-value: 0.02649 
#> 
#> 
#> Estimation results for equation Y3: 
#> =================================== 
#> Y3 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1   0.03433    0.05064   0.678   0.4982    
#> Y2.l1  -0.16137    0.19763  -0.817   0.4147    
#> Y3.l1   0.01440    0.07277   0.198   0.8432    
#> Y4.l1  -0.82932    0.45361  -1.828   0.0683 .  
#> const 106.38511   18.59333   5.722 2.14e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 337.4 on 379 degrees of freedom
#> Multiple R-Squared: 0.01282, Adjusted R-squared: 0.002406 
#> F-statistic: 1.231 on 4 and 379 DF,  p-value: 0.2972 
#> 
#> 
#> Estimation results for equation Y4: 
#> =================================== 
#> Y4 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1  0.040659   0.005376   7.563 3.01e-13 ***
#> Y2.l1 -0.035630   0.020983  -1.698   0.0903 .  
#> Y3.l1 -0.045399   0.007726  -5.876 9.22e-09 ***
#> Y4.l1  0.267599   0.048161   5.556 5.20e-08 ***
#> const  2.899038   1.974086   1.469   0.1428    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 35.82 on 379 degrees of freedom
#> Multiple R-Squared: 0.2921,  Adjusted R-squared: 0.2847 
#> F-statistic:  39.1 on 4 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>        Y1      Y2        Y3      Y4
#> Y1 228573 8358.68 117792.82 2670.17
#> Y2   8359 8282.17    -38.41  -24.23
#> Y3 117793  -38.41 113838.64  321.61
#> Y4   2670  -24.23    321.61 1283.24
#> 
#> Correlation matrix of residuals:
#>        Y1        Y2        Y3        Y4
#> Y1 1.0000  0.192112  0.730233  0.155910
#> Y2 0.1921  1.000000 -0.001251 -0.007431
#> Y3 0.7302 -0.001251  1.000000  0.026609
#> Y4 0.1559 -0.007431  0.026609  1.000000

forecast_var <- forecast(var_model, h = 1, frequency = 12)
forecast_y1 <- mean(forecast_var$forecast$Y1$mean)
cat("Forecasted Value for 25S   Chemical Products:", forecast_y1, "\n")
#> Forecasted Value for 25S Chemical Products: 90.81823
forecast_y2 <-mean(forecast_var$forecast$Y2$mean)
cat("Forecasted Value for 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing:", forecast_y2, "\n")
#> Forecasted Value for 25A Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing: 7.383598
forecast_y3 <- mean(forecast_var$forecast$Y3$mean)
cat("Forecasted Value for 25B   Pharmaceutical and Medicine Manufacturing:", forecast_y3, "\n")
#> Forecasted Value for 25B Pharmaceutical and Medicine Manufacturing: 110.0477
forecast_y4 <- mean(forecast_var$forecast$Y4$mean)
cat("Forecasted Value for 25C   Paint, Coating, and Adhesive, Manufacturing:", forecast_y4, "\n")
#> Forecasted Value for 25C Paint, Coating, and Adhesive, Manufacturing: -3.597713

var_model_p <- VAR(ts_obj, p = 3)
summary(var_model_p)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Y1, Y2, Y3, Y4 
#> Deterministic variables: const 
#> Sample size: 382 
#> Log Likelihood: -9596.631 
#> Roots of the characteristic polynomial:
#> 0.8538 0.6481 0.5543 0.5486 0.5486 0.4684 0.4684 0.4649 0.4491 0.4491 0.4411 0.4411
#> Call:
#> VAR(y = ts_obj, p = 3)
#> 
#> 
#> Estimation results for equation Y1: 
#> =================================== 
#> Y1 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1   0.43830    0.08225   5.329 1.72e-07 ***
#> Y2.l1  -0.80206    0.27850  -2.880 0.004209 ** 
#> Y3.l1  -0.39972    0.11325  -3.530 0.000469 ***
#> Y4.l1  -0.69368    0.70540  -0.983 0.326063    
#> Y1.l2   0.16991    0.08622   1.971 0.049520 *  
#> Y2.l2  -0.30048    0.28379  -1.059 0.290379    
#> Y3.l2  -0.07193    0.11370  -0.633 0.527333    
#> Y4.l2   1.04045    0.70361   1.479 0.140063    
#> Y1.l3   0.13338    0.08751   1.524 0.128325    
#> Y2.l3  -0.05128    0.28489  -0.180 0.857241    
#> Y3.l3   0.03905    0.11574   0.337 0.735983    
#> Y4.l3  -0.42614    0.68232  -0.625 0.532651    
#> const 110.74442   28.29761   3.914 0.000108 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 469.9 on 369 degrees of freedom
#> Multiple R-Squared: 0.2281,  Adjusted R-squared: 0.203 
#> F-statistic: 9.088 on 12 and 369 DF,  p-value: 2.346e-15 
#> 
#> 
#> Estimation results for equation Y2: 
#> =================================== 
#> Y2 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>         Estimate Std. Error t value Pr(>|t|)   
#> Y1.l1  0.0200294  0.0156672   1.278  0.20190   
#> Y2.l1  0.0772226  0.0530504   1.456  0.14634   
#> Y3.l1 -0.0307709  0.0215723  -1.426  0.15460   
#> Y4.l1  0.0002908  0.1343688   0.002  0.99827   
#> Y1.l2  0.0328210  0.0164246   1.998  0.04642 * 
#> Y2.l2  0.0390587  0.0540587   0.723  0.47043   
#> Y3.l2 -0.0312264  0.0216580  -1.442  0.15021   
#> Y4.l2  0.3529468  0.1340279   2.633  0.00881 **
#> Y1.l3 -0.0135791  0.0166695  -0.815  0.41582   
#> Y2.l3 -0.0637017  0.0542672  -1.174  0.24121   
#> Y3.l3  0.0193455  0.0220467   0.877  0.38080   
#> Y4.l3 -0.3555227  0.1299723  -2.735  0.00653 **
#> const  7.0857831  5.3903220   1.315  0.18948   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 89.51 on 369 degrees of freedom
#> Multiple R-Squared: 0.08328, Adjusted R-squared: 0.05347 
#> F-statistic: 2.794 on 12 and 369 DF,  p-value: 0.001151 
#> 
#> 
#> Estimation results for equation Y3: 
#> =================================== 
#> Y3 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>         Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1  0.0595757  0.0588507   1.012   0.3120    
#> Y2.l1 -0.1623027  0.1992737  -0.814   0.4159    
#> Y3.l1 -0.0302021  0.0810323  -0.373   0.7096    
#> Y4.l1 -0.4771400  0.5047308  -0.945   0.3451    
#> Y1.l2 -0.0710367  0.0616960  -1.151   0.2503    
#> Y2.l2 -0.0142740  0.2030612  -0.070   0.9440    
#> Y3.l2  0.1938417  0.0813540   2.383   0.0177 *  
#> Y4.l2 -0.5769452  0.5034502  -1.146   0.2525    
#> Y1.l3 -0.0009571  0.0626160  -0.015   0.9878    
#> Y2.l3  0.0735605  0.2038443   0.361   0.7184    
#> Y3.l3  0.0865111  0.0828141   1.045   0.2969    
#> Y4.l3  0.4358522  0.4882163   0.893   0.3726    
#> const 88.3532020 20.2477201   4.364 1.66e-05 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 336.2 on 369 degrees of freedom
#> Multiple R-Squared: 0.04493, Adjusted R-squared: 0.01387 
#> F-statistic: 1.447 on 12 and 369 DF,  p-value: 0.1425 
#> 
#> 
#> Estimation results for equation Y4: 
#> =================================== 
#> Y4 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1  0.022011   0.005962   3.692 0.000256 ***
#> Y2.l1 -0.036180   0.020187  -1.792 0.073910 .  
#> Y3.l1 -0.021961   0.008209  -2.675 0.007800 ** 
#> Y4.l1  0.135365   0.051131   2.647 0.008458 ** 
#> Y1.l2  0.020041   0.006250   3.207 0.001461 ** 
#> Y2.l2 -0.042337   0.020571  -2.058 0.040282 *  
#> Y3.l2 -0.019101   0.008241  -2.318 0.021015 *  
#> Y4.l2  0.048952   0.051001   0.960 0.337769    
#> Y1.l3  0.008129   0.006343   1.282 0.200821    
#> Y2.l3 -0.026463   0.020650  -1.282 0.200819    
#> Y3.l3 -0.016983   0.008389  -2.024 0.043650 *  
#> Y4.l3  0.197127   0.049458   3.986 8.11e-05 ***
#> const  2.129868   2.051154   1.038 0.299775    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 34.06 on 369 degrees of freedom
#> Multiple R-Squared: 0.3743,  Adjusted R-squared: 0.354 
#> F-statistic:  18.4 on 12 and 369 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>        Y1      Y2       Y3      Y4
#> Y1 220828 7340.22 120144.4 2157.16
#> Y2   7340 8012.79    547.2  -35.25
#> Y3 120144  547.24 113059.3  550.13
#> Y4   2157  -35.25    550.1 1160.25
#> 
#> Correlation matrix of residuals:
#>        Y1       Y2      Y3       Y4
#> Y1 1.0000  0.17450 0.76037  0.13477
#> Y2 0.1745  1.00000 0.01818 -0.01156
#> Y3 0.7604  0.01818 1.00000  0.04803
#> Y4 0.1348 -0.01156 0.04803  1.00000

forecast_var_p <- forecast(var_model_p, h =1, frequency = 12)
forecast_p_y1 <- mean(forecast_var_p$forecast$Y1$mean)
cat("Forecasted Value for 25S   Chemical Products:", forecast_p_y1, "\n")
#> Forecasted Value for 25S Chemical Products: 5.004572
forecast_p_y2 <-mean(forecast_var_p$forecast$Y2$mean)
cat("Forecasted Value for 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing:", forecast_p_y2, "\n")
#> Forecasted Value for 25A Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing: -3.631642
forecast_p_y3 <- mean(forecast_var_p$forecast$Y3$mean)
cat("Forecasted Value for 25B   Pharmaceutical and Medicine Manufacturing:", forecast_p_y3, "\n")
#> Forecasted Value for 25B Pharmaceutical and Medicine Manufacturing: 189.0412
forecast_p_y4 <- mean(forecast_var_p$forecast$Y4$mean)
cat("Forecasted Value for 25C   Paint, Coating, and Adhesive, Manufacturing:", forecast_p_y4, "\n")
#> Forecasted Value for 25C Paint, Coating, and Adhesive, Manufacturing: -18.50666

cat("AIC for VAR (p=1)", AIC(var_model), "AIC for VAR (p=3)", AIC(var_model_p))
#> AIC for VAR (p=1) 19458.5 AIC for VAR (p=3) 19297.26
cat("BIC for VAR (p=1)", BIC(var_model), "BIC for VAR (p=3)", BIC(var_model_p))
#> BIC for VAR (p=1) 19537.51 BIC for VAR (p=3) 19502.42

# Granger causality test
granger_results <- causality(var_model_p)
#> Warning in causality(var_model_p): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Y1) as cause variable.
print(granger_results)
#> $Granger
#> 
#>  Granger causality H0: Y1 do not Granger-cause Y2 Y3 Y4
#> 
#> data:  VAR object var_model_p
#> F-Test = 6.8727, df1 = 9, df2 = 1476, p-value = 9.666e-10
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Y1 and Y2 Y3 Y4
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 145.33, df = 3, p-value < 2.2e-16

# BigVAR
data_matrix <- as.matrix(ts_df)
sparse_var_model <- BigVAR.fit(data_matrix, p = 1, lambda = 0.1, struct = "Lag")
summary(sparse_var_model)
#>      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#>  -0.82932  -0.03807   0.03749  13.17876   0.37022 147.61450

^{Created on 2024-04-11 with reprex v2.1.0}

jlivsey / UB-sping24-time-series

Milind Rajendra Kulkarni #20