jlivsey / UB-sping24-time-series

3 stars 1 forks source link

Milind Rajendra Kulkarni #20

Open MilindKulkarni12 opened 4 months ago

MilindKulkarni12 commented 4 months ago
# Load required packages

library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)

# set the api key
fredr_set_key('48753a49458ab78ef7185cffd58175b6')
data <- fredr(series_id = 'ICNSA')

head(data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-08     2024-02-08  
#> 2 1967-01-14 ICNSA     334000 2024-02-08     2024-02-08  
#> 3 1967-01-21 ICNSA     277000 2024-02-08     2024-02-08  
#> 4 1967-01-28 ICNSA     252000 2024-02-08     2024-02-08  
#> 5 1967-02-04 ICNSA     274000 2024-02-08     2024-02-08
data$DATE <- as.Date(data$date, format="%m/%d/%Y")

# Time series object that stores annual data Create a time series object.
ts_data <- ts(data$value, frequency = 52)

# Seasonal Decomposition Plot of data
plot(stl(ts_data, s.window="periodic"), main = "Seasonal Decomposition")


# Plot of uninsured claims data
plot(ts_data, main = "Date", xlab = "Date", ylab = "Uninsured Claims")

# Plot shows sudden spike during covid except that the uninsured claim increase gradually.

# arima model to build time series model
model <- auto.arima(ts_data)

# predicting unemployment
forecasted_values <- forecast(model, h = 1)
forecasted_values <- forecasted_values$mean

cat("Forecasted Unemployment  (including COVID year):", forecasted_values, "\n")
#> Forecasted Unemployment  (including COVID year): 276396.2

Created on 2024-02-08 with reprex v2.1.0

MilindKulkarni12 commented 4 months ago
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3

# set the api key
fredr_set_key('48753a49458ab78ef7185cffd58175b6')
data <- fredr(series_id = 'ICNSA')

head(data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-09     2024-02-09  
#> 2 1967-01-14 ICNSA     334000 2024-02-09     2024-02-09  
#> 3 1967-01-21 ICNSA     277000 2024-02-09     2024-02-09  
#> 4 1967-01-28 ICNSA     252000 2024-02-09     2024-02-09  
#> 5 1967-02-04 ICNSA     274000 2024-02-09     2024-02-09
data$DATE <- as.Date(data$date, format="%m/%d/%Y")

# set the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2022-12-31")

# filter out the covid-19 period data
data_without_covid <- subset(data, DATE < covid_start | DATE > covid_end)
tail(data_without_covid, n = 60)
#> # A tibble: 60 × 6
#>    date       series_id  value realtime_start realtime_end DATE      
#>    <date>     <chr>      <dbl> <date>         <date>       <date>    
#>  1 2019-12-14 ICNSA     270547 2024-02-09     2024-02-09   2019-12-14
#>  2 2019-12-21 ICNSA     287243 2024-02-09     2024-02-09   2019-12-21
#>  3 2019-12-28 ICNSA     312524 2024-02-09     2024-02-09   2019-12-28
#>  4 2023-01-07 ICNSA     339883 2024-02-09     2024-02-09   2023-01-07
#>  5 2023-01-14 ICNSA     288330 2024-02-09     2024-02-09   2023-01-14
#>  6 2023-01-21 ICNSA     225228 2024-02-09     2024-02-09   2023-01-21
#>  7 2023-01-28 ICNSA     225026 2024-02-09     2024-02-09   2023-01-28
#>  8 2023-02-04 ICNSA     234007 2024-02-09     2024-02-09   2023-02-04
#>  9 2023-02-11 ICNSA     225332 2024-02-09     2024-02-09   2023-02-11
#> 10 2023-02-18 ICNSA     211007 2024-02-09     2024-02-09   2023-02-18
#> # ℹ 50 more rows

ts_data <- ts(data_without_covid$value, frequency = 52)

plot(ts_data, main = "Uninsured Claims without Covid-19 period", xlab = "Date", ylab = "Uninsured Claims")


# acf to detect auto-correlation between the data
acf(ts_data, main = "Autocorrelation Function (ACF)")


# pacf to detect any random spikes
pacf(ts_data, main = "Partial Autocorrelation Function (PACF)")


model <- auto.arima(ts_data)

# Forecast unemployment value
forecasted_values <- forecast(model, h = 1)
forecasted_values <- forecasted_values$mean

cat("Forecasted Unemployment (excluding COVID period):", forecasted_values, "\n")
#> Forecasted Unemployment (excluding COVID period): 232214.8

Created on 2024-02-09 with reprex v2.0.2

MilindKulkarni12 commented 4 months ago
# Load required packages
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3

# set the api key
fredr_set_key('48753a49458ab78ef7185cffd58175b6')

# Monthly State Retail Sales: Gasoline Stations in California
data <- fredr(series_id = 'MSRSCA447')
head(data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id value realtime_start realtime_end
#>   <date>     <chr>     <dbl> <date>         <date>      
#> 1 2019-01-01 MSRSCA447   5.3 2024-02-12     2024-02-12  
#> 2 2019-02-01 MSRSCA447   9.7 2024-02-12     2024-02-12  
#> 3 2019-03-01 MSRSCA447   6.8 2024-02-12     2024-02-12  
#> 4 2019-04-01 MSRSCA447  14.1 2024-02-12     2024-02-12  
#> 5 2019-05-01 MSRSCA447   2.2 2024-02-12     2024-02-12

# create time series obj
ts_data <- ts(data$value, frequency = 12)

# plot for detecting trending and seasonality
plot(stl(ts_data, s.window="periodic"), main = "Seasonal Decomposition")

# we can observe seasonality in the data, there is also an hint of cyclic trend

# plot the time series
plot(ts_data, main = "Date", xlab = "Date", ylab = "Retail Sales: Gasoline Stations")


# plot ACF to detect models nature
acf(ts_data, main = "Autocorrelation Function (ACF)")

# ACF plot shows rapid decay indicating positive autocorrelation

# plot PACF
pacf(ts_data, main = "Partial Autocorrelation Function (PACF)")

# there aren't any significant spikes in the PACF plot, except a small one at LAG 1

# to find the best performing Arima model let us ttry and fetch the order with
# the best or lowest AIC score amongst the given range of p, d and q values
p_values <- 0:3
d_values <- 0:3
q_values <- 0:3

# initialize aic and order
top_aic <- Inf
top_order <- c(0, 0, 0)

# iterate
for (p in p_values) {
  for (d in d_values) {
    for (q in q_values) {
      curr_order <- c(p, d, q)
      curr_model <- Arima(ts_data, order=curr_order)
      curr_aic <- AIC(curr_model)
      if (curr_aic < top_aic) {
        # new top order found, overwrite
        top_aic <- curr_aic
        top_order <- curr_order
      }
    }
  }
}

# top performing order
top_order
#> [1] 2 1 2

# AIC score associated with the top order
top_aic
#> [1] 429.6667

# Arima model for the top order
model <- Arima(ts_data, order=top_order)
summary(model)
#> Series: ts_data 
#> ARIMA(2,1,2) 
#> 
#> Coefficients:
#>          ar1      ar2     ma1     ma2
#>       0.0182  -0.5750  0.1435  1.0000
#> s.e.  0.1759   0.1351  0.1035  0.0778
#> 
#> sigma^2 = 91.73:  log likelihood = -209.83
#> AIC=429.67   AICc=430.84   BIC=439.88
#> 
#> Training set error measures:
#>                      ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set -0.3980385 9.155217 7.393018 -0.3433542 85.88841 0.2017793
#>                     ACF1
#> Training set -0.04745976

# check the distribution of residuals
checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(2,1,2)
#> Q* = 17.1, df = 8, p-value = 0.02909
#> 
#> Model df: 4.   Total lags used: 12
# the model for the given top order has a p-value that implies 
# that there is still significant auto-correlation in our model

# lets try next iteration of model to see if we get a better fit while accepting
# some of the trade-offs between p-value and AIC score

model <- Arima(ts_data, order=c(3, 1, 0))
summary(model)
#> Series: ts_data 
#> ARIMA(3,1,0) 
#> 
#> Coefficients:
#>          ar1     ar2      ar3
#>       0.1030  0.0833  -0.2168
#> s.e.  0.1286  0.1279   0.1304
#> 
#> sigma^2 = 109.8:  log likelihood = -213.32
#> AIC=434.65   AICc=435.42   BIC=442.82
#> 
#> Training set error measures:
#>                      ME     RMSE      MAE      MPE     MAPE      MASE
#> Training set -0.4974367 10.10942 7.874796 2.063276 84.31892 0.2149286
#>                     ACF1
#> Training set -0.04552645
checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(3,1,0)
#> Q* = 25.523, df = 9, p-value = 0.002444
#> 
#> Model df: 3.   Total lags used: 12
# this model shows significant improvement in p-value while trading-off little
# to no difference in the AIC vlaues 

# predict the values
prediction <- forecast(model, h = 12)
prediction
#>       Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
#> Nov 5      -23.04473 -36.47174 -9.617719 -43.57957 -2.509890
#> Dec 5      -24.66909 -44.65972 -4.678463 -55.24212  5.903936
#> Jan 6      -23.65864 -49.30818  1.990888 -62.88622 15.568930
#> Feb 6      -22.74769 -51.68857  6.193186 -67.00895 21.513562
#> Mar 6      -22.21746 -53.91732  9.482399 -70.69821 26.263291
#> Apr 6      -22.30609 -56.34158 11.729403 -74.35888 29.746703
#> May 6      -22.46859 -58.85751 13.920326 -78.12065 33.183457
#> Jun 6      -22.60770 -61.24241 16.027018 -81.69439 36.478999
#> Jul 6      -22.61634 -63.42746 18.194780 -85.03156 39.798882
#> Aug 6      -22.59358 -65.44677 20.259616 -88.13188 42.944726
#> Sep 6      -22.56179 -67.35910 22.235525 -91.07336 45.949789
#> Oct 6      -22.55474 -69.20268 24.093198 -93.89661 48.787126
# the predicted data shows steady increase over time.

Created on 2024-02-13 with reprex v2.0.2

MilindKulkarni12 commented 4 months ago

Observations:

MilindKulkarni12 commented 4 months ago
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(corrplot)
#> Warning: package 'corrplot' was built under R version 4.2.3
#> corrplot 0.92 loaded

fredr_set_key('48753a49458ab78ef7185cffd58175b6')

icnsa <- fredr(series_id = 'ICNSA')
icnsa$Date <- as.Date(icnsa$date)
ts_icnsa <- ts(icnsa$value, frequency = 52)
plot(stl(ts_icnsa, s.window="periodic"))

acf(ts_icnsa)

pacf(ts_icnsa)


ccnsa <- fredr(series_id = 'CCNSA')
ccnsa$Date <- as.Date(ccnsa$date)
ts_ccnsa <- ts(ccnsa$value, frequency = 52)
plot(stl(ts_ccnsa, s.window="periodic"))

acf(ts_ccnsa)

pacf(ts_ccnsa)



data <- merge(icnsa, ccnsa, by = "Date", suffixes = c("_icnsa", "_ccnsa"))
head(data, n =5)
#>         Date date_icnsa series_id_icnsa value_icnsa realtime_start_icnsa
#> 1 1967-01-07 1967-01-07           ICNSA      346000           2024-02-22
#> 2 1967-01-14 1967-01-14           ICNSA      334000           2024-02-22
#> 3 1967-01-21 1967-01-21           ICNSA      277000           2024-02-22
#> 4 1967-01-28 1967-01-28           ICNSA      252000           2024-02-22
#> 5 1967-02-04 1967-02-04           ICNSA      274000           2024-02-22
#>   realtime_end_icnsa date_ccnsa series_id_ccnsa value_ccnsa
#> 1         2024-02-22 1967-01-07           CCNSA     1594000
#> 2         2024-02-22 1967-01-14           CCNSA     1563000
#> 3         2024-02-22 1967-01-21           CCNSA     1551000
#> 4         2024-02-22 1967-01-28           CCNSA     1533000
#> 5         2024-02-22 1967-02-04           CCNSA     1534000
#>   realtime_start_ccnsa realtime_end_ccnsa
#> 1           2024-02-22         2024-02-22
#> 2           2024-02-22         2024-02-22
#> 3           2024-02-22         2024-02-22
#> 4           2024-02-22         2024-02-22
#> 5           2024-02-22         2024-02-22

corr <- cor(data$value_icnsa, data$value_ccnsa, use = "complete.obs")
corr
#> [1] 0.714954

ts_icnsa <- ts(data$value_icnsa, frequency = 52)
ts_ccnsa <- ts(data$value_ccnsa, frequency = 52)

model <- auto.arima(ts_icnsa, xreg = ts_ccnsa)
summary(model)
#> Series: ts_icnsa 
#> Regression with ARIMA(2,1,1)(1,0,2)[52] errors 
#> 
#> Coefficients:
#>          ar1      ar2      ma1    sar1     sma1    sma2     xreg
#>       0.5666  -0.2917  -0.0020  0.4014  -0.1051  0.0931  -0.0382
#> s.e.  0.0714   0.0350   0.0794  0.0523   0.0534  0.0232   0.0069
#> 
#> sigma^2 = 6.941e+09:  log likelihood = -37967.81
#> AIC=75951.63   AICc=75951.68   BIC=75999.62
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set -14.66077 83197.81 36955.39 -0.4938972 9.336523 0.3891743
#>                      ACF1
#> Training set 6.786416e-05

coeff <- mean(diff(ts_icnsa))
coeff
#> [1] -37.36434

forecast <- forecast(model, xreg = tail(data$value_ccnsa, 1), h = 1)
forecast_mean <- mean(forecast$mean)
cat("Forecasted Value:", forecast_mean, "\n")
#> Forecasted Value: 213860.3

<sup>Created on 2024-02-22 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
MilindKulkarni12 commented 4 months ago

Observations Data and Model:

Regression Diagnostics:

MilindKulkarni12 commented 4 months ago
# Load required packages
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
library(ggplot2)
library(zoo)
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
# Set up API key to read the unemployment rate dataset from FRED
fredr_set_key('48753a49458ab78ef7185cffd58175b6')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
data <- fredr(series_id = 'ICNSA')
data$Date <- as.Date(data$date)
head(data, n = 5)
#> # A tibble: 5 × 6
#>   date       series_id  value realtime_start realtime_end Date      
#>   <date>     <chr>      <dbl> <date>         <date>       <date>    
#> 1 1967-01-07 ICNSA     346000 2024-02-28     2024-02-28   1967-01-07
#> 2 1967-01-14 ICNSA     334000 2024-02-28     2024-02-28   1967-01-14
#> 3 1967-01-21 ICNSA     277000 2024-02-28     2024-02-28   1967-01-21
#> 4 1967-01-28 ICNSA     252000 2024-02-28     2024-02-28   1967-01-28
#> 5 1967-02-04 ICNSA     274000 2024-02-28     2024-02-28   1967-02-04

# creating time series object
ts <- ts(data$value, frequency = 52)

# plot ICNSA
plot(ts, main = "Date", xlab = "Date", ylab = "Uninsured Claims")

plot(stl(ts, s.window="periodic"), main = "Seasonal Decomposition")


# set the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2021-12-31")

# Plot highlighting the covid period in our dataset 
plot(data$date, data$value, type = "l", main = "Claims during Covid period", xlab = "Year", ylab = "Number")

abline(v = as.numeric(covid_start), col = "red", lty = 2)
abline(v = as.numeric(covid_end), col = "red", lty = 2)


# filter out the covid-19 period data
covid_period_data <- subset(data, Date >= covid_start | Date <= covid_end)
data_without_covid <- subset(data, Date < covid_start | Date > covid_end)

# Perform cubic spline interpolation for multiple values of spar to find the best fit
spar_values <- seq(0.1, 1.0, by = 0.1)

for (spar in spar_values) {
  # fit the spline on our data
  spline_fit  <- smooth.spline(x = as.numeric(data_without_covid$Date), y = data_without_covid$value, spar = spar, tol = 1e-6)

  # Impute the new values using the spline fit
  imputed_data <- predict(spline_fit, x = as.numeric(covid_period_data$Date))$y

  # Update Covid period data with imputed values
  covid_period_data$value <- imputed_data

  # Update the data with new predicted values
  updated_data <- rbind(data_without_covid, covid_period_data)
  updated_data <- updated_data %>% arrange(date)

  # Plot updated data
  plot(updated_data$date, updated_data$value, type = "l", col = "red", lwd = 2,
       main = paste("Smoothened Time Series (spar =", spar, ")"), xlab = "Year", ylab = "Number")

  lines(data$date, data$value, col = "black", lwd = 2)
  legend("topleft", legend = c("Updated", "Original"), col = c("red", "black"), lty = 1, lwd = 2)
}

# after visualizing all the splines we believe that the one the with spar=0.4 gives the best
# naturally fit of the data and we select it for predicting our future values.

# selecting the best fir spline
spline_fit_covid <- smooth.spline(x = as.numeric(data_without_covid$Date), y = data_without_covid$value, spar = 0.4, tol = 1e-6)
imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period_data$Date))$y
covid_period_data$value <- imputed_values
updated_data <- rbind(data_without_covid, covid_period_data)
updated_data <- updated_data %>% arrange(Date)

plot(updated_data$Date, updated_data$value, type = "l", col = "red", lwd = 2,
     main = "Updated TS Vs Original TS", xlab = "Year", ylab = "Number")
lines(data$Date, data$value, col = "black", lwd = 2)
legend("topright", legend = c("Updated", "Original"), col = c("red", "black"), lty = 1, lwd = 2)


# Plot both time series for change comparision
ts <- ts(data$value, frequency = 52)
new_ts <- ts(updated_data$value, frequency = 52)

plot(ts, col = "black", main = "Comparison of Time Series", ylab = "Value")
lines(new_ts, col = "red")
legend("topright", legend = c("Original", "Updated"), col = c("black", "red"), lty = 1, lwd = 2)


# Fit and forecast the multiplicative Holt-Winters model
hw_multiplicative <- HoltWinters(ts, seasonal = "multiplicative")
forecast_multiplicative <- forecast(hw_multiplicative, h = 1)

# Fit and forecast the additive Holt-Winters model
hw_additive <- HoltWinters(ts, seasonal = "additive")
forecast_additive <- forecast(hw_additive, h = 1)

# Print the forecasts
cat("Multiplicative Holt-Winters Forecast:", forecast_multiplicative$mean, "\n")
#> Multiplicative Holt-Winters Forecast: 197681.2
cat("Additive Holt-Winters Forecast:", forecast_additive$mean, "\n")
#> Additive Holt-Winters Forecast: 161954.6

Created on 2024-02-28 with reprex v2.0.2

MilindKulkarni12 commented 4 months ago
MilindKulkarni12 commented 4 months ago
# Load required packages
library(reprex)
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
library(ggplot2)
library(zoo)
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(KFAS)
#> Warning: package 'KFAS' was built under R version 4.2.3
#> Please cite KFAS in publications by using: 
#> 
#>   Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
library(imputeTS)
#> Warning: package 'imputeTS' was built under R version 4.2.3
#> 
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:zoo':
#> 
#>     na.locf
#> The following object is masked from 'package:tseries':
#> 
#>     na.remove
library(MARSS)
#> Warning: package 'MARSS' was built under R version 4.2.3
library(dplyr)
library(stats)

# Set up API key to read the unemployment rate dataset from FRED
fredr_set_key('48753a49458ab78ef7185cffd58175b6')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
data <- fredr(series_id = 'ICNSA')
data$Date <- as.Date(data$date)
head(data, n = 5)
#> # A tibble: 5 × 6
#>   date       series_id  value realtime_start realtime_end Date      
#>   <date>     <chr>      <dbl> <date>         <date>       <date>    
#> 1 1967-01-07 ICNSA     346000 2024-03-07     2024-03-07   1967-01-07
#> 2 1967-01-14 ICNSA     334000 2024-03-07     2024-03-07   1967-01-14
#> 3 1967-01-21 ICNSA     277000 2024-03-07     2024-03-07   1967-01-21
#> 4 1967-01-28 ICNSA     252000 2024-03-07     2024-03-07   1967-01-28
#> 5 1967-02-04 ICNSA     274000 2024-03-07     2024-03-07   1967-02-04

# creating time series object
ts <- ts(data$value, frequency = 52)

# plot ICNSA
plot(ts, main = "Date", xlab = "Date", ylab = "Uninsured Claims")

plot(stl(ts, s.window="periodic"), main = "Seasonal Decomposition")


# Plot highlighting the covid period in our dataset 
plot(data$date, data$value, type = "l", main = "Claims during Covid period", xlab = "Year", ylab = "Number")

# set the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2021-12-31")

abline(v = as.numeric(covid_start), col = "red", lty = 2)
abline(v = as.numeric(covid_end), col = "red", lty = 2)


# filter out the covid-19 period data
covid_period_data <- subset(data, Date >= covid_start & Date <= covid_end)
data_without_covid <- subset(data, Date < covid_start | Date > covid_end)

# Set values between Covid start and end date to NA
data$value[data$Date >= covid_start & data$Date <= covid_end] <- NA

# Plot data with NA values
plot(data$Date, data$value, type = "l", 
   main = "Initial Claims - Covid Period - Assigned NULL",
   xlab = "Year",
   ylab = "Number")


# Impute values for the Covid period using sSSM
imputed_values <- na_kalman(data$value, smooth = TRUE)
data$value <- imputed_values

plot(data$Date, data$value, type = "l", 
   main = "Imputed Values - Kalman",
   xlab = "Year",
   ylab = "Number")


# New time series object
new_ts <- ts(data$value, frequency = 52)

# Lets fit our spline at spar = 0.4
spline_fit <- smooth.spline(x = as.numeric(data_without_covid$Date), y = data_without_covid$value, spar = 0.4)

# Imputing new values using spline fit
imputed_values_spline <- predict(spline_fit, x = as.numeric(covid_period_data$date))$y

# Updating Covid period data with imputed values
covid_period_data$value <- imputed_values_spline

# Combine imputed values
updated_data <- rbind(data_without_covid, covid_period_data)
updated_data <- updated_data %>% arrange(Date)

plot(updated_data$date, updated_data$value, type = "l", col = "red", lwd = 2,
     main = "Updated (penalty - 0.2) Vs Original", xlab = "Year", ylab = "Number")
lines(data$Date, data$value, col = "black", lwd = 2)
legend("topright", legend = c("Updated", "Original"), col = c("red", "black"), lty = 1, lwd = 2)


ts_spline <- ts(updated_data$value, frequency = 52)

# Comparison of original series with imputed series
plot(new_ts, col = "red", type = "l", lty = 1, ylab = "Value", main = "Time Series Comparison")
lines(ts, col = "black", type = "l", lty = 2)
lines(ts_spline, col = "blue", type = "l", lty = 3)
legend("topright", legend = c("New (Kalman)", "Original", "Spline"), col = c("red", "black", "blue"), lty = 1:3)


# Seasonal decomposition to check tre
decomposition <- stl(new_ts, s.window = "periodic")
plot(decomposition)


# Fitting a structural time series model with both trend and seasonal components
ssm_model <- StructTS(new_ts, type = "BSM")
ssm_model
#> 
#> Call:
#> StructTS(x = new_ts, type = "BSM")
#> 
#> Variances:
#>     level      slope       seas    epsilon  
#> 2.967e+08  9.625e+02  2.199e+08  0.000e+00
summary(ssm_model)
#>           Length Class  Mode     
#> coef         4   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    8946   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric

# Use tsSmooth to obtain the smoothed state estimates
smoothed_data <- tsSmooth(ssm_model)
residuals <- residuals(ssm_model)

# Plot residuals to visualize the error component
plot(residuals, main="Residuals", ylab="Residuals", xlab="Time")


# Load IURNSA data
iurnsa_data <- fredr(series_id = "IURNSA")
iurnsa_data$Date <- as.Date(iurnsa_data$date)

# Check the structure and values of iurnsa_data to ensure compatibility with ICNSA data
icnsa_data <- subset(data, Date >= as.Date("1971-01-01"))

# Remove last row of icnsa_data
icnsa_data <- icnsa_data[-nrow(icnsa_data), ]
merged_data <- merge(icnsa_data, iurnsa_data, by = "date", all.x = TRUE)
arima_model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y, seasonal = TRUE)
summary(arima_model)
#> Series: merged_data$value.x 
#> Regression with ARIMA(2,1,1) errors 
#> 
#> Coefficients:
#>          ar1     ar2      ma1       xreg
#>       0.6878  0.1547  -0.9862  23256.533
#> s.e.  0.0209  0.0204   0.0041   3959.789
#> 
#> sigma^2 = 2.443e+09:  log likelihood = -33892.12
#> AIC=67794.23   AICc=67794.26   BIC=67823.87
#> 
#> Training set error measures:
#>                    ME    RMSE      MAE      MPE     MAPE      MASE         ACF1
#> Training set -178.193 49377.9 33016.53 -1.38752 8.782257 0.9876402 -0.003897536

# Forecast the next week's value using the model with the covariate
forecasted_value <- forecast(arima_model, h = 1, xreg = tail(merged_data$value.y, 2))
forecasted_value
#>      Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
#> 2774       208369.2 145031.7 271706.6 111502.90 305235.5
#> 2775       210768.8 133396.8 288140.8  92438.51 329099.1

forecast_value <- mean(forecasted_value$mean, na.rm = TRUE)
forecast_value
#> [1] 209569

Created on 2024-03-07 with reprex v2.0.2

MilindKulkarni12 commented 2 months ago
# Load required packages
library(readxl)
#> Warning: package 'readxl' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.3.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
#> Warning: package 'readr' was built under R version 4.3.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.3.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.3
library(MARSS)
#> Warning: package 'MARSS' was built under R version 4.3.3
library(corrplot)
#> Warning: package 'corrplot' was built under R version 4.3.3
#> corrplot 0.92 loaded
library(vars)
#> Warning: package 'vars' was built under R version 4.2.3
#> Loading required package: MASS
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.3.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Warning: package 'urca' was built under R version 4.3.3
#> Loading required package: lmtest
#> Warning: package 'lmtest' was built under R version 4.3.3
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.2.3
#> Loading required package: lattice

# 25S   Chemical Products
# 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing
# 25B   Pharmaceutical and Medicine Manufacturing
# 25C   Paint, Coating, and Adhesive, Manufacturing

df_25s <- read_excel("C:/Study Material/R Studio/25STI.xlsx")
df_25a <- read_excel("C:/Study Material/R Studio/25ATI.xlsx")
df_25b <- read_excel("C:/Study Material/R Studio/25BTI.xlsx")
df_25c <- read_excel("C:/Study Material/R Studio/25CTI.xlsx")

df_25s$Period = as.Date(paste("01", df_25s$Period, sep = "-"), format = "%d-%b-%Y")
df_25a$Period = as.Date(paste("01", df_25a$Period, sep = "-"), format = "%d-%b-%Y")
df_25b$Period = as.Date(paste("01", df_25b$Period, sep = "-"), format = "%d-%b-%Y")
df_25c$Period = as.Date(paste("01", df_25c$Period, sep = "-"), format = "%d-%b-%Y")

df_25s <- subset(df_25s, Period <= as.Date("2024-02-01"))
df_25a <- subset(df_25a, Period <= as.Date("2024-02-01"))
df_25b <- subset(df_25b, Period <= as.Date("2024-02-01"))
df_25c <- subset(df_25c, Period <= as.Date("2024-02-01"))

df_25s$Value <- as.numeric(df_25s$Value)
df_25a$Value <- as.numeric(df_25a$Value)
df_25b$Value <- as.numeric(df_25b$Value)
df_25c$Value <- as.numeric(df_25c$Value)

plot(df_25s$Period, df_25s$Value, type = "l", 
     main = "25S Chemical Products",
     xlab = "Period",
     ylab = "Value")


plot(df_25a$Period, df_25a$Value, type = "l", 
     main = "25A Pesticide, Fertilizer and Other Agricultural Chemical Manufacturing",
     xlab = "Period",
     ylab = "Value")


plot(df_25b$Period, df_25b$Value, type = "l", 
     main = "25B Pharmaceutical and Medicine Manufacturing",
     xlab = "Period",
     ylab = "Value")


plot(df_25c$Period, df_25c$Value, type = "l", 
     main = "25C Paint, Coating, and Adhesive, Manufacturing",
     xlab = "Period",
     ylab = "Value")


# 4 in 1
ggplot(xlab = 'Period', ylab = 'Value') + 
  geom_line(data = df_25s, aes(x = Period, y = Value)) +
  geom_line(data = df_25a, aes(x = Period, y = Value), color = "red") +
  geom_line(data = df_25b, aes(x = Period, y = Value), color = "blue") +
  geom_line(data = df_25c, aes(x = Period, y = Value), color = "green")


cor(df_25s$Value, df_25a$Value)
#> [1] 0.9606849
cor(df_25s$Value, df_25b$Value)
#> [1] 0.9899559
cor(df_25s$Value, df_25c$Value)
#> [1] 0.9040823
cor(df_25a$Value, df_25b$Value)
#> [1] 0.9513506
cor(df_25a$Value, df_25c$Value)
#> [1] 0.8941125
cor(df_25b$Value, df_25c$Value)
#> [1] 0.8781852

ccf(df_25s$Value, df_25a$Value)

ccf(df_25s$Value, df_25b$Value)

ccf(df_25s$Value, df_25c$Value)

ccf(df_25a$Value, df_25b$Value)

ccf(df_25a$Value, df_25c$Value)

ccf(df_25b$Value, df_25c$Value)


adf.test(df_25s$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25s$Value
#> Dickey-Fuller = -3.2907, Lag order = 7, p-value = 0.07273
#> alternative hypothesis: stationary
df_25s$Value <- c(NA, diff(df_25s$Value))
df_25s <- subset(df_25s, !is.na(Value))
adf.test(df_25s$Value, alternative = "stationary")
#> Warning in adf.test(df_25s$Value, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25s$Value
#> Dickey-Fuller = -4.7374, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
acf(df_25s$Value)

pacf(df_25s$Value)


adf.test(df_25a$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25a$Value
#> Dickey-Fuller = -2.21, Lag order = 7, p-value = 0.4885
#> alternative hypothesis: stationary
df_25a$Value <- c(NA, diff(df_25a$Value))
df_25a <- subset(df_25a, !is.na(Value))
adf.test(df_25a$Value, alternative = "stationary")
#> Warning in adf.test(df_25a$Value, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25a$Value
#> Dickey-Fuller = -7.0763, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
acf(df_25a$Value)

pacf(df_25a$Value)


adf.test(df_25b$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25b$Value
#> Dickey-Fuller = -2.879, Lag order = 7, p-value = 0.206
#> alternative hypothesis: stationary
df_25b$Value <- c(NA, diff(df_25b$Value))
df_25b <- subset(df_25b, !is.na(Value))
adf.test(df_25b$Value, alternative = "stationary")
#> Warning in adf.test(df_25b$Value, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25b$Value
#> Dickey-Fuller = -6.2545, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
acf(df_25b$Value)

pacf(df_25b$Value)


adf.test(df_25c$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25c$Value
#> Dickey-Fuller = -3.7952, Lag order = 7, p-value = 0.01947
#> alternative hypothesis: stationary
df_25c$Value <- c(NA, diff(df_25c$Value))
df_25c <- subset(df_25c, !is.na(Value))
adf.test(df_25c$Value, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  df_25c$Value
#> Dickey-Fuller = -3.7229, Lag order = 7, p-value = 0.02309
#> alternative hypothesis: stationary
acf(df_25c$Value)

pacf(df_25c$Value)


ts_25s = ts(df_25s$Value, frequency = 12)
plot(stl(ts_25s, s.window="periodic"), main = "25A TI: Seasonal Decomposition")


ts_25a = ts(df_25a$Value, frequency = 12)
plot(stl(ts_25a, s.window="periodic"), main = "25A TI: Seasonal Decomposition")


ts_25b = ts(df_25b$Value, frequency = 12)
plot(stl(ts_25b, s.window="periodic"), main = "25B TI: Seasonal Decomposition")


ts_25c = ts(df_25c$Value, frequency = 12)
plot(stl(ts_25c, s.window="periodic"), main = "25C TI: Seasonal Decomposition")


# 4 in 1
ggplot(xlab = 'Period', ylab = 'Value') + 
  geom_line(data = df_25s, aes(x = Period, y = Value)) +
  geom_line(data = df_25a, aes(x = Period, y = Value), color = "red") +
  geom_line(data = df_25b, aes(x = Period, y = Value), color = "blue") +
  geom_line(data = df_25c, aes(x = Period, y = Value), color = "green")


# 25S   Chemical Products
# 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing
# 25B   Pharmaceutical and Medicine Manufacturing
# 25C   Paint, Coating, and Adhesive, Manufacturing

ts_df <- data.frame(Y1=ts_25s, Y2=ts_25a, Y3=ts_25b, Y4=ts_25c)
ts_obj <- ts(ts_df)

# VAR Model
var_model <- VAR(ts_obj, p = 1)
summary(var_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Y1, Y2, Y3, Y4 
#> Deterministic variables: const 
#> Sample size: 384 
#> Log Likelihood: -9709.25 
#> Roots of the characteristic polynomial:
#> 0.6195 0.1965 0.0824 0.02111
#> Call:
#> VAR(y = ts_obj, p = 1)
#> 
#> 
#> Estimation results for equation Y1: 
#> =================================== 
#> Y1 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1   0.57145    0.07175   7.964 1.96e-14 ***
#> Y2.l1  -0.80294    0.28004  -2.867  0.00437 ** 
#> Y3.l1  -0.52817    0.10312  -5.122 4.82e-07 ***
#> Y4.l1   0.30315    0.64276   0.472  0.63746    
#> const 147.61450   26.34659   5.603 4.06e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 478.1 on 379 degrees of freedom
#> Multiple R-Squared: 0.1801,  Adjusted R-squared: 0.1714 
#> F-statistic: 20.81 on 4 and 379 DF,  p-value: 1.597e-15 
#> 
#> 
#> Estimation results for equation Y2: 
#> =================================== 
#> Y2 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>       Estimate Std. Error t value Pr(>|t|)  
#> Y1.l1  0.02525    0.01366   1.849   0.0652 .
#> Y2.l1  0.06611    0.05331   1.240   0.2157  
#> Y3.l1 -0.03373    0.01963  -1.719   0.0865 .
#> Y4.l1  0.09959    0.12235   0.814   0.4162  
#> const  7.69050    5.01516   1.533   0.1260  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 91.01 on 379 degrees of freedom
#> Multiple R-Squared: 0.02855, Adjusted R-squared: 0.0183 
#> F-statistic: 2.785 on 4 and 379 DF,  p-value: 0.02649 
#> 
#> 
#> Estimation results for equation Y3: 
#> =================================== 
#> Y3 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1   0.03433    0.05064   0.678   0.4982    
#> Y2.l1  -0.16137    0.19763  -0.817   0.4147    
#> Y3.l1   0.01440    0.07277   0.198   0.8432    
#> Y4.l1  -0.82932    0.45361  -1.828   0.0683 .  
#> const 106.38511   18.59333   5.722 2.14e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 337.4 on 379 degrees of freedom
#> Multiple R-Squared: 0.01282, Adjusted R-squared: 0.002406 
#> F-statistic: 1.231 on 4 and 379 DF,  p-value: 0.2972 
#> 
#> 
#> Estimation results for equation Y4: 
#> =================================== 
#> Y4 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1  0.040659   0.005376   7.563 3.01e-13 ***
#> Y2.l1 -0.035630   0.020983  -1.698   0.0903 .  
#> Y3.l1 -0.045399   0.007726  -5.876 9.22e-09 ***
#> Y4.l1  0.267599   0.048161   5.556 5.20e-08 ***
#> const  2.899038   1.974086   1.469   0.1428    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 35.82 on 379 degrees of freedom
#> Multiple R-Squared: 0.2921,  Adjusted R-squared: 0.2847 
#> F-statistic:  39.1 on 4 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>        Y1      Y2        Y3      Y4
#> Y1 228573 8358.68 117792.82 2670.17
#> Y2   8359 8282.17    -38.41  -24.23
#> Y3 117793  -38.41 113838.64  321.61
#> Y4   2670  -24.23    321.61 1283.24
#> 
#> Correlation matrix of residuals:
#>        Y1        Y2        Y3        Y4
#> Y1 1.0000  0.192112  0.730233  0.155910
#> Y2 0.1921  1.000000 -0.001251 -0.007431
#> Y3 0.7302 -0.001251  1.000000  0.026609
#> Y4 0.1559 -0.007431  0.026609  1.000000

forecast_var <- forecast(var_model, h = 1, frequency = 12)
forecast_y1 <- mean(forecast_var$forecast$Y1$mean)
cat("Forecasted Value for 25S   Chemical Products:", forecast_y1, "\n")
#> Forecasted Value for 25S Chemical Products: 90.81823
forecast_y2 <-mean(forecast_var$forecast$Y2$mean)
cat("Forecasted Value for 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing:", forecast_y2, "\n")
#> Forecasted Value for 25A Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing: 7.383598
forecast_y3 <- mean(forecast_var$forecast$Y3$mean)
cat("Forecasted Value for 25B   Pharmaceutical and Medicine Manufacturing:", forecast_y3, "\n")
#> Forecasted Value for 25B Pharmaceutical and Medicine Manufacturing: 110.0477
forecast_y4 <- mean(forecast_var$forecast$Y4$mean)
cat("Forecasted Value for 25C   Paint, Coating, and Adhesive, Manufacturing:", forecast_y4, "\n")
#> Forecasted Value for 25C Paint, Coating, and Adhesive, Manufacturing: -3.597713

var_model_p <- VAR(ts_obj, p = 3)
summary(var_model_p)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Y1, Y2, Y3, Y4 
#> Deterministic variables: const 
#> Sample size: 382 
#> Log Likelihood: -9596.631 
#> Roots of the characteristic polynomial:
#> 0.8538 0.6481 0.5543 0.5486 0.5486 0.4684 0.4684 0.4649 0.4491 0.4491 0.4411 0.4411
#> Call:
#> VAR(y = ts_obj, p = 3)
#> 
#> 
#> Estimation results for equation Y1: 
#> =================================== 
#> Y1 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1   0.43830    0.08225   5.329 1.72e-07 ***
#> Y2.l1  -0.80206    0.27850  -2.880 0.004209 ** 
#> Y3.l1  -0.39972    0.11325  -3.530 0.000469 ***
#> Y4.l1  -0.69368    0.70540  -0.983 0.326063    
#> Y1.l2   0.16991    0.08622   1.971 0.049520 *  
#> Y2.l2  -0.30048    0.28379  -1.059 0.290379    
#> Y3.l2  -0.07193    0.11370  -0.633 0.527333    
#> Y4.l2   1.04045    0.70361   1.479 0.140063    
#> Y1.l3   0.13338    0.08751   1.524 0.128325    
#> Y2.l3  -0.05128    0.28489  -0.180 0.857241    
#> Y3.l3   0.03905    0.11574   0.337 0.735983    
#> Y4.l3  -0.42614    0.68232  -0.625 0.532651    
#> const 110.74442   28.29761   3.914 0.000108 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 469.9 on 369 degrees of freedom
#> Multiple R-Squared: 0.2281,  Adjusted R-squared: 0.203 
#> F-statistic: 9.088 on 12 and 369 DF,  p-value: 2.346e-15 
#> 
#> 
#> Estimation results for equation Y2: 
#> =================================== 
#> Y2 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>         Estimate Std. Error t value Pr(>|t|)   
#> Y1.l1  0.0200294  0.0156672   1.278  0.20190   
#> Y2.l1  0.0772226  0.0530504   1.456  0.14634   
#> Y3.l1 -0.0307709  0.0215723  -1.426  0.15460   
#> Y4.l1  0.0002908  0.1343688   0.002  0.99827   
#> Y1.l2  0.0328210  0.0164246   1.998  0.04642 * 
#> Y2.l2  0.0390587  0.0540587   0.723  0.47043   
#> Y3.l2 -0.0312264  0.0216580  -1.442  0.15021   
#> Y4.l2  0.3529468  0.1340279   2.633  0.00881 **
#> Y1.l3 -0.0135791  0.0166695  -0.815  0.41582   
#> Y2.l3 -0.0637017  0.0542672  -1.174  0.24121   
#> Y3.l3  0.0193455  0.0220467   0.877  0.38080   
#> Y4.l3 -0.3555227  0.1299723  -2.735  0.00653 **
#> const  7.0857831  5.3903220   1.315  0.18948   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 89.51 on 369 degrees of freedom
#> Multiple R-Squared: 0.08328, Adjusted R-squared: 0.05347 
#> F-statistic: 2.794 on 12 and 369 DF,  p-value: 0.001151 
#> 
#> 
#> Estimation results for equation Y3: 
#> =================================== 
#> Y3 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>         Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1  0.0595757  0.0588507   1.012   0.3120    
#> Y2.l1 -0.1623027  0.1992737  -0.814   0.4159    
#> Y3.l1 -0.0302021  0.0810323  -0.373   0.7096    
#> Y4.l1 -0.4771400  0.5047308  -0.945   0.3451    
#> Y1.l2 -0.0710367  0.0616960  -1.151   0.2503    
#> Y2.l2 -0.0142740  0.2030612  -0.070   0.9440    
#> Y3.l2  0.1938417  0.0813540   2.383   0.0177 *  
#> Y4.l2 -0.5769452  0.5034502  -1.146   0.2525    
#> Y1.l3 -0.0009571  0.0626160  -0.015   0.9878    
#> Y2.l3  0.0735605  0.2038443   0.361   0.7184    
#> Y3.l3  0.0865111  0.0828141   1.045   0.2969    
#> Y4.l3  0.4358522  0.4882163   0.893   0.3726    
#> const 88.3532020 20.2477201   4.364 1.66e-05 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 336.2 on 369 degrees of freedom
#> Multiple R-Squared: 0.04493, Adjusted R-squared: 0.01387 
#> F-statistic: 1.447 on 12 and 369 DF,  p-value: 0.1425 
#> 
#> 
#> Estimation results for equation Y4: 
#> =================================== 
#> Y4 = Y1.l1 + Y2.l1 + Y3.l1 + Y4.l1 + Y1.l2 + Y2.l2 + Y3.l2 + Y4.l2 + Y1.l3 + Y2.l3 + Y3.l3 + Y4.l3 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> Y1.l1  0.022011   0.005962   3.692 0.000256 ***
#> Y2.l1 -0.036180   0.020187  -1.792 0.073910 .  
#> Y3.l1 -0.021961   0.008209  -2.675 0.007800 ** 
#> Y4.l1  0.135365   0.051131   2.647 0.008458 ** 
#> Y1.l2  0.020041   0.006250   3.207 0.001461 ** 
#> Y2.l2 -0.042337   0.020571  -2.058 0.040282 *  
#> Y3.l2 -0.019101   0.008241  -2.318 0.021015 *  
#> Y4.l2  0.048952   0.051001   0.960 0.337769    
#> Y1.l3  0.008129   0.006343   1.282 0.200821    
#> Y2.l3 -0.026463   0.020650  -1.282 0.200819    
#> Y3.l3 -0.016983   0.008389  -2.024 0.043650 *  
#> Y4.l3  0.197127   0.049458   3.986 8.11e-05 ***
#> const  2.129868   2.051154   1.038 0.299775    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 34.06 on 369 degrees of freedom
#> Multiple R-Squared: 0.3743,  Adjusted R-squared: 0.354 
#> F-statistic:  18.4 on 12 and 369 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>        Y1      Y2       Y3      Y4
#> Y1 220828 7340.22 120144.4 2157.16
#> Y2   7340 8012.79    547.2  -35.25
#> Y3 120144  547.24 113059.3  550.13
#> Y4   2157  -35.25    550.1 1160.25
#> 
#> Correlation matrix of residuals:
#>        Y1       Y2      Y3       Y4
#> Y1 1.0000  0.17450 0.76037  0.13477
#> Y2 0.1745  1.00000 0.01818 -0.01156
#> Y3 0.7604  0.01818 1.00000  0.04803
#> Y4 0.1348 -0.01156 0.04803  1.00000

forecast_var_p <- forecast(var_model_p, h =1, frequency = 12)
forecast_p_y1 <- mean(forecast_var_p$forecast$Y1$mean)
cat("Forecasted Value for 25S   Chemical Products:", forecast_p_y1, "\n")
#> Forecasted Value for 25S Chemical Products: 5.004572
forecast_p_y2 <-mean(forecast_var_p$forecast$Y2$mean)
cat("Forecasted Value for 25A   Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing:", forecast_p_y2, "\n")
#> Forecasted Value for 25A Pesticide, Fertilizer, and Other Agricultural Chemical Manufacturing: -3.631642
forecast_p_y3 <- mean(forecast_var_p$forecast$Y3$mean)
cat("Forecasted Value for 25B   Pharmaceutical and Medicine Manufacturing:", forecast_p_y3, "\n")
#> Forecasted Value for 25B Pharmaceutical and Medicine Manufacturing: 189.0412
forecast_p_y4 <- mean(forecast_var_p$forecast$Y4$mean)
cat("Forecasted Value for 25C   Paint, Coating, and Adhesive, Manufacturing:", forecast_p_y4, "\n")
#> Forecasted Value for 25C Paint, Coating, and Adhesive, Manufacturing: -18.50666

cat("AIC for VAR (p=1)", AIC(var_model), "AIC for VAR (p=3)", AIC(var_model_p))
#> AIC for VAR (p=1) 19458.5 AIC for VAR (p=3) 19297.26
cat("BIC for VAR (p=1)", BIC(var_model), "BIC for VAR (p=3)", BIC(var_model_p))
#> BIC for VAR (p=1) 19537.51 BIC for VAR (p=3) 19502.42

# Granger causality test
granger_results <- causality(var_model_p)
#> Warning in causality(var_model_p): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Y1) as cause variable.
print(granger_results)
#> $Granger
#> 
#>  Granger causality H0: Y1 do not Granger-cause Y2 Y3 Y4
#> 
#> data:  VAR object var_model_p
#> F-Test = 6.8727, df1 = 9, df2 = 1476, p-value = 9.666e-10
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Y1 and Y2 Y3 Y4
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 145.33, df = 3, p-value < 2.2e-16

# BigVAR
data_matrix <- as.matrix(ts_df)
sparse_var_model <- BigVAR.fit(data_matrix, p = 1, lambda = 0.1, struct = "Lag")
summary(sparse_var_model)
#>      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#>  -0.82932  -0.03807   0.03749  13.17876   0.37022 147.61450

Created on 2024-04-11 with reprex v2.1.0