Open Sree-lekshmi99 opened 9 months ago
HOMEWORK#1
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.1
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
fredr_set_key("dee75be4acc1b9d6c94d9f59867a4910")
icnsa <- fredr(series_id = "ICNSA")
icnsa$date <- as.Date(icnsa$date)
# CCNSA shows how many people are still getting unemployment benefits in the U.S., helping to understand job market trends without adjusting for seasonal changes.
continued_claims <- fredr(series_id = "CCNSA")
continued_claims$date <- as.Date(continued_claims$date)
summary(icnsa$value)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 133000 265800 326919 365013 406725 6161268
summary(continued_claims$value)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 785000 1996902 2494495 2761854 3145702 23037921
ggplot(icnsa, aes(x = date, y = value)) +
geom_line(color = "red") +
ggtitle("ICNSA Claims") +
xlab("Year") +
ylab("Number of Claims")
ggplot(continued_claims, aes(x = date, y = value)) +
geom_line(color = "red") +
ggtitle("Continued Claims (Insured Unemployment)") +
xlab("Year") +
ylab("Number of Claims")
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
adf_icnsa <- adf.test(icnsa$value, alternative = "stationary")
#> Warning in adf.test(icnsa$value, alternative = "stationary"): p-value smaller
#> than printed p-value
# Stationarity checks using Augmented Dickey-Fuller Test for CCNSA
adf_ccnsa <- adf.test(continued_claims$value, alternative = "stationary")
#> Warning in adf.test(continued_claims$value, alternative = "stationary"):
#> p-value smaller than printed p-value
# Output the ADF test results
adf_icnsa$p.value
#> [1] 0.01
adf_ccnsa$p.value
#> [1] 0.01
acf(icnsa$value)
pacf(icnsa$value)
acf(continued_claims$value)
pacf(continued_claims$value)
# Data is stationary by ADF test but acf and pacf does give a some characterstics of non stationary data
merged_data <- merge(icnsa, continued_claims, by = "date")
correlation <- cor(merged_data$value.x, merged_data$value.y)
print(correlation)
#> [1] 0.714954
library(forecast)
#merged_data is my merged and cleaned dataset with 'value.x' as ICNSA and 'value.y' as CCNSA.
# Log transformation
merged_data$value.x <- log(merged_data$value.x)
merged_data$value.y <- log(merged_data$value.y)
# Fit the regARIMA model with CCNSA as an external regressor
model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y)
summary(model)
#> Series: merged_data$value.x
#> Regression with ARIMA(3,1,1) errors
#>
#> Coefficients:
#> ar1 ar2 ar3 ma1 xreg
#> -0.5099 -0.1054 -0.0655 0.1678 0.6474
#> s.e. 0.1751 0.0623 0.0186 0.1743 0.0523
#>
#> sigma^2 = 0.01606: log likelihood = 1928.11
#> AIC=-3844.21 AICc=-3844.19 BIC=-3808.22
#>
#> Training set error measures:
#> ME RMSE MAE MPE MAPE
#> Training set -0.0002598266 0.1266183 0.08957493 -0.006119179 0.6994997
#> MASE ACF1
#> Training set 0.9719684 0.0005756628
# Diagnostic checks
checkresiduals(model)
#>
#> Ljung-Box test
#>
#> data: Residuals from Regression with ARIMA(3,1,1) errors
#> Q* = 210.41, df = 6, p-value < 2.2e-16
#>
#> Model df: 4. Total lags used: 10
forecasted_value <- forecast(model, xreg = as.numeric(future_ccnsa_value), h = 1)
#> Error in forecast.forecast_ARIMA(model, xreg = as.numeric(future_ccnsa_value), : object 'future_ccnsa_value' not found
exp_forecasted_value <- exp(forecasted_value$mean)
#> Error in eval(expr, envir, enclos): object 'forecasted_value' not found
# Print the exponentiated forecasted value
print(exp_forecasted_value)
#> Error in print(exp_forecasted_value): object 'exp_forecasted_value' not found
# Plot the forecast
plot(forecasted_value)
#> Error in plot(forecasted_value): object 'forecasted_value' not found
Created on 2024-02-22 with reprex v2.1.0
HW1#updated
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.1
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
fredr_set_key("dee75be4acc1b9d6c94d9f59867a4910")
icnsa <- fredr(series_id = "ICNSA")
icnsa$date <- as.Date(icnsa$date)
# CCNSA shows how many people are still getting unemployment benefits in the U.S., helping to understand job market trends without adjusting for seasonal changes.
continued_claims <- fredr(series_id = "CCNSA")
continued_claims$date <- as.Date(continued_claims$date)
summary(icnsa$value)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 133000 265794 326900 364957 406633 6161268
summary(continued_claims$value)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 785000 1996951 2493752 2761628 3145554 23037921
ggplot(icnsa, aes(x = date, y = value)) +
geom_line(color = "red") +
ggtitle("ICNSA Claims") +
xlab("Year") +
ylab("Number of Claims")
ggplot(continued_claims, aes(x = date, y = value)) +
geom_line(color = "red") +
ggtitle("Continued Claims (Insured Unemployment)") +
xlab("Year") +
ylab("Number of Claims")
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
adf_icnsa <- adf.test(icnsa$value, alternative = "stationary")
#> Warning in adf.test(icnsa$value, alternative = "stationary"): p-value smaller
#> than printed p-value
# Stationarity checks using Augmented Dickey-Fuller Test for CCNSA
adf_ccnsa <- adf.test(continued_claims$value, alternative = "stationary")
#> Warning in adf.test(continued_claims$value, alternative = "stationary"):
#> p-value smaller than printed p-value
# Output the ADF test results
adf_icnsa$p.value
#> [1] 0.01
adf_ccnsa$p.value
#> [1] 0.01
acf(icnsa$value)
pacf(icnsa$value)
acf(continued_claims$value)
pacf(continued_claims$value)
# Data is stationary by ADF test but acf and pacf does give a some characterstics of non stationary data
merged_data <- merge(icnsa, continued_claims, by = "date")
correlation <- cor(merged_data$value.x, merged_data$value.y)
print(correlation)
#> [1] 0.7149735
library(forecast)
#merged_data is my merged and cleaned dataset with 'value.x' as ICNSA and 'value.y' as CCNSA.
# Log transformation
merged_data$value.x <- log(merged_data$value.x)
merged_data$value.y <- log(merged_data$value.y)
# Fitting the regARIMA model with CCNSA as an external regressor
model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y)
summary(model)
#> Series: merged_data$value.x
#> Regression with ARIMA(3,1,1) errors
#>
#> Coefficients:
#> ar1 ar2 ar3 ma1 xreg
#> -0.5112 -0.1059 -0.0653 0.1693 0.6472
#> s.e. 0.1758 0.0625 0.0186 0.1751 0.0523
#>
#> sigma^2 = 0.01606: log likelihood = 1929.06
#> AIC=-3846.13 AICc=-3846.1 BIC=-3810.13
#>
#> Training set error measures:
#> ME RMSE MAE MPE MAPE MASE
#> Training set -0.0002861374 0.1266051 0.08956942 -0.006331525 0.6994647 0.972069
#> ACF1
#> Training set 0.0005792167
checkresiduals(model)
#>
#> Ljung-Box test
#>
#> data: Residuals from Regression with ARIMA(3,1,1) errors
#> Q* = 210.94, df = 6, p-value < 2.2e-16
#>
#> Model df: 4. Total lags used: 10
future_ccnsa_value <- tail(merged_data$value.y)
forecasted_value <- forecast(model, xreg = as.numeric(future_ccnsa_value), h = 52)
exp_forecasted_value <- exp(forecasted_value$mean)
# Print the exponentiated forecasted value
print(exp_forecasted_value)
#> Time Series:
#> Start = 2981
#> End = 2986
#> Frequency = 1
#> [1] 228467.5 224855.6 233403.1 229741.5 230442.0 227342.1
# Plot the forecast
plot(forecasted_value)
Created on 2024-02-28 with reprex v2.1.0
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.1
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(urca)
#> Warning: package 'urca' was built under R version 4.2.3
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.3.1
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.3.1
#> Warning: package 'readr' was built under R version 4.3.1
#> Warning: package 'purrr' was built under R version 4.3.1
#> Warning: package 'stringr' was built under R version 4.3.1
#> Warning: package 'forcats' was built under R version 4.3.1
#> Warning: package 'lubridate' was built under R version 4.3.1
fredr_set_key("dee75be4acc1b9d6c94d9f59867a4910")
icnsa <- fredr(series_id = "ICNSA")
head(icnsa)
#> # A tibble: 6 × 5
#> date series_id value realtime_start realtime_end
#> <date> <chr> <dbl> <date> <date>
#> 1 1967-01-07 ICNSA 346000 2024-02-28 2024-02-28
#> 2 1967-01-14 ICNSA 334000 2024-02-28 2024-02-28
#> 3 1967-01-21 ICNSA 277000 2024-02-28 2024-02-28
#> 4 1967-01-28 ICNSA 252000 2024-02-28 2024-02-28
#> 5 1967-02-04 ICNSA 274000 2024-02-28 2024-02-28
#> 6 1967-02-11 ICNSA 276000 2024-02-28 2024-02-28
#The start of the spike is just before the last major tick mark on the x-axis, which is 2020.
#The end of the spike, or at least when it starts to drop, This could extend well into 2021 or even 2022,
#Will betaking start date as the February 2020 and end date as May 2022
covidStart <- as.Date("2020-02-01")
covidEnd <- as.Date("2021-05-1")
covidPlot <- ggplot(icnsa, aes(date, value)) +
geom_line() +
geom_vline(xintercept = as.numeric(covidStart), linetype = "dotdash", color = "darkred") +
geom_vline(xintercept = as.numeric(covidEnd), linetype = "dotdash", color = "green") +
# Define the labels for the plot
ggtitle("Claims During Covid Period") +
xlab("Year") +
ylab("Claims Count")
covidPlot
#Subset data without the covid
dataNotInCovid <- icnsa[!(icnsa$date >= covidStart & icnsa$date <= covidEnd), ]
fit <- smooth.spline(dataNotInCovid$date, dataNotInCovid$value)
# fit is the result of fitting a smooth spline to the data where icnsa$date is the independent variable and icnsa$value is the dependent variable. The optimal lambda (smoothing parameter) value chosen by smooth.spline is stored in fit$spar.
# To access the chosen lambda value
optimal_lambda <- fit$spar
optimal_lambda
#> [1] 0.1292562
#so our lambda value is 0.129
spline_fit <- smooth.spline(x = as.numeric(dataNotInCovid$date), y = dataNotInCovid$value,lambda = 0.129)
#plotting the spline against our original data
plot(as.numeric(dataNotInCovid$date), dataNotInCovid$value, type = "p", col = "red", main = "Original vs. Smoothed Data", xlab = "Date", ylab = "Value", xlim = range(as.numeric(dataNotInCovid$date)), ylim = range(dataNotInCovid$value))
lines(spline_fit, col = "black")
legend("topright", legend = c("Original Data", "Smoothed Spline"), col = c("red", "blue"), lty = 1)
#Collecting the covid years
dataInCovid <- icnsa[(icnsa$date >= covidStart & icnsa$date <= covidEnd), ]
head(dataInCovid)
#> # A tibble: 6 × 5
#> date series_id value realtime_start realtime_end
#> <date> <chr> <dbl> <date> <date>
#> 1 2020-02-01 ICNSA 224561 2024-02-28 2024-02-28
#> 2 2020-02-08 ICNSA 219459 2024-02-28 2024-02-28
#> 3 2020-02-15 ICNSA 209218 2024-02-28 2024-02-28
#> 4 2020-02-22 ICNSA 198845 2024-02-28 2024-02-28
#> 5 2020-02-29 ICNSA 216625 2024-02-28 2024-02-28
#> 6 2020-03-07 ICNSA 199914 2024-02-28 2024-02-28
# imputing values from our spline fit from the covid years
newImputedvalues <- predict(spline_fit, x = as.numeric(dataInCovid$date))
dataInCovid$value <- newImputedvalues$y
updatedIcnsaData <- bind_rows(dataNotInCovid, newImputedvalues)
updatedIcnsaData <- updatedIcnsaData %>%
filter(date < Sys.Date())
ggplot(updatedIcnsaData, aes(x = date, y = value)) +
geom_line() +
labs(title = "Claims",
x = "Year",
y = "Number")
weekly_ts <- ts(updatedIcnsaData$value, frequency = 52)
multiplicative_hw_model <- HoltWinters(weekly_ts, seasonal = "multiplicative")
multiplicative_forecast <- forecast(multiplicative_hw_model, h = 1)
cat("Multiplicative Model Forecast:\n")
#> Multiplicative Model Forecast:
print(multiplicative_forecast)
#> Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 57.05769 227059 185861 268257.1 164052 290066
additive_hw_model <- HoltWinters(weekly_ts, seasonal = "additive")
additive_forecast <- forecast(additive_hw_model, h = 1)
cat("Additive Model Forecast:\n")
#> Additive Model Forecast:
print(additive_forecast)
#> Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 57.05769 227225.7 185052.1 269399.3 162726.7 291724.6
# Plotting the forecast from the multiplicative model
plot(multiplicative_forecast, main = "Holt-Winters Forecast with Multiplicative Seasonality", xlab = "Time", ylab = "Values")
summary(multiplicative_forecast)
#>
#> Forecast method: HoltWinters
#>
#> Model Information:
#> Holt-Winters exponential smoothing with trend and multiplicative seasonal component.
#>
#> Call:
#> HoltWinters(x = weekly_ts, seasonal = "multiplicative")
#>
#> Smoothing parameters:
#> alpha: 0.363287
#> beta : 0
#> gamma: 1
#>
#> Coefficients:
#> [,1]
#> a 2.615162e+05
#> b -6.023969e+02
#> s1 8.702453e-01
#> s2 1.058463e+00
#> s3 1.033361e+00
#> s4 1.049705e+00
#> s5 1.163360e+00
#> s6 1.163575e+00
#> s7 1.413893e+00
#> s8 1.422607e+00
#> s9 1.421077e+00
#> s10 1.350095e+00
#> s11 1.291571e+00
#> s12 1.313315e+00
#> s13 1.288542e+00
#> s14 1.248284e+00
#> s15 1.204924e+00
#> s16 1.253161e+00
#> s17 1.157585e+00
#> s18 1.021321e+00
#> s19 1.058974e+00
#> s20 1.061529e+00
#> s21 1.035407e+00
#> s22 8.573020e-01
#> s23 8.363360e-01
#> s24 9.078450e-01
#> s25 8.528292e-01
#> s26 8.128506e-01
#> s27 8.111990e-01
#> s28 8.276516e-01
#> s29 7.821558e-01
#> s30 8.438507e-01
#> s31 8.753853e-01
#> s32 9.206168e-01
#> s33 1.096786e+00
#> s34 1.029678e+00
#> s35 1.093594e+00
#> s36 1.097072e+00
#> s37 1.148801e+00
#> s38 1.107082e+00
#> s39 1.187394e+00
#> s40 9.418815e-01
#> s41 1.298725e+00
#> s42 1.049057e+00
#> s43 9.846624e-01
#> s44 1.061426e+00
#> s45 1.026028e+00
#> s46 1.209388e+00
#> s47 1.062952e+00
#> s48 8.648177e-01
#> s49 8.745151e-01
#> s50 8.021370e-01
#> s51 8.002309e-01
#> s52 7.568633e-01
#>
#> Error measures:
#> ME RMSE MAE MPE MAPE MASE
#> Training set 849.4492 32152.63 20977.83 -0.02160217 5.886518 0.3844578
#> ACF1
#> Training set 0.06343894
#>
#> Forecasts:
#> Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 57.05769 227059 185861 268257.1 164052 290066
# Plotting the forecast from the additive model
plot(additive_forecast, main = "Holt-Winters Forecast with Additive Seasonality", xlab = "Time", ylab = "Values", col = "red")
summary(additive_forecast)
#>
#> Forecast method: HoltWinters
#>
#> Model Information:
#> Holt-Winters exponential smoothing with trend and additive seasonal component.
#>
#> Call:
#> HoltWinters(x = weekly_ts, seasonal = "additive")
#>
#> Smoothing parameters:
#> alpha: 0.3287789
#> beta : 0.0001124617
#> gamma: 1
#>
#> Coefficients:
#> [,1]
#> a 239112.5244
#> b -437.8078
#> s1 -11449.0607
#> s2 33732.1332
#> s3 32786.5257
#> s4 41215.7813
#> s5 67641.7108
#> s6 73068.0543
#> s7 122179.0641
#> s8 130824.3764
#> s9 137180.1524
#> s10 132911.9050
#> s11 136376.4436
#> s12 151045.7516
#> s13 155577.1443
#> s14 155240.7753
#> s15 150053.6403
#> s16 160442.2974
#> s17 140537.5172
#> s18 106087.7424
#> s19 110915.5912
#> s20 106510.7072
#> s21 95607.3828
#> s22 43957.1630
#> s23 30707.7583
#> s24 41615.2321
#> s25 21146.3871
#> s26 4993.1586
#> s27 -763.1147
#> s28 -1522.9617
#> s29 -15312.9822
#> s30 -4073.3996
#> s31 1281.6092
#> s32 8029.0751
#> s33 39089.1224
#> s34 25671.9992
#> s35 36567.0917
#> s36 37909.8039
#> s37 48972.5332
#> s38 43792.3493
#> s39 63229.6548
#> s40 14292.6538
#> s41 96536.9621
#> s42 42526.1296
#> s43 28387.6534
#> s44 50279.7644
#> s45 41980.0919
#> s46 93495.0491
#> s47 56546.2829
#> s48 1209.1054
#> s49 1948.2873
#> s50 -24106.1758
#> s51 -27845.8795
#> s52 -41180.5244
#>
#> Error measures:
#> ME RMSE MAE MPE MAPE MASE ACF1
#> Training set 1554.788 32939.2 21998.03 0.2016047 6.274324 0.4031549 0.1270346
#>
#> Forecasts:
#> Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 57.05769 227225.7 185052.1 269399.3 162726.7 291724.6
Created on 2024-02-28 with reprex v2.1.0
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.3.1
#> Warning: package 'ggplot2' was built under R version 4.3.1
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.3.1
#> Warning: package 'readr' was built under R version 4.3.1
#> Warning: package 'purrr' was built under R version 4.3.1
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.3.1
#> Warning: package 'forcats' was built under R version 4.3.1
#> Warning: package 'lubridate' was built under R version 4.3.1
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(imputeTS)
#> Warning: package 'imputeTS' was built under R version 4.2.3
fredr_set_key('dee75be4acc1b9d6c94d9f59867a4910')
icnsa_data <- fredr(series_id = "ICNSA")
# Backup original data
original_icnsa <- icnsa_data
ggplot(icnsa_data, aes(x = date, y = value)) +
geom_line() +
labs(title = "Time Series Plot of ICNSA Claims Data",
x = "Date (Year)",
y = "value")
start_date <- as.Date("2020-03-21")
end_date <- as.Date("2021-02-21")
# Visulaizing the covid data
covidPlot <- ggplot(icnsa_data, aes(date, value)) +
geom_line() +
geom_vline(xintercept = as.numeric(start_date), linetype = "dotdash", color = "darkred") +
geom_vline(xintercept = as.numeric(end_date), linetype = "dotdash", color = "green") +
# Define the labels for the plot
ggtitle("Claims During Covid Period") +
xlab("Year") +
ylab("Claims Count")
covidPlot
icnsa_data_without_covid <- subset(icnsa_data, !(date >= start_date & date <= end_date))
acf(icnsa_data_without_covid$value)
pacf(icnsa_data_without_covid$value)
# Setting the values to NA for the COVID period
icnsa_data$value[icnsa_data$date >= start_date & icnsa_data$date <= end_date] <- NA
# Plot data with NA values to visualize the missing period
plot(icnsa_data$date, icnsa_data$value, type = "l", main = "ICNSA Data with Simulated Missing Values", xlab = "Date", ylab = "ICNSA Value")
# Impute missing values using Kalman filter
icnsa_data$value <- na.kalman(icnsa_data$value)
#> Warning: na.kalman will be replaced by na_kalman.
#> Functionality stays the same.
#> The new function name better fits modern R code style guidelines.
#> Please adjust your code accordingly.
# Plotting imputed data to visualize the effect of imputation
plot(icnsa_data$date, icnsa_data$value, type = "l", col = "blue", main = "ICNSA Data After Kalman Filter Imputation", xlab = "Date", ylab = "ICNSA Value")
lines(original_icnsa$date, original_icnsa$value, col = "red") # Add original data for comparison
legend("topleft", legend=c("Imputed", "Original"), col=c("blue", "red"), lty=1, cex = 0.8)
icnsa_ts <- ts(icnsa_data$value, frequency = 52)
# Fit the structural time series model
fit <- StructTS(icnsa_ts, type = "BSM") # Basic Structural Model
print(names(fit))
#> [1] "coef" "loglik" "loglik0" "data" "residuals" "fitted"
#> [7] "call" "series" "code" "model" "model0" "xtsp"
#SPLINE
spline_fit <- smooth.spline(icnsa_data$date[!is.na(icnsa_data$value)], icnsa_data$value[!is.na(icnsa_data$value)])
spline_imputed <- predict(spline_fit, xout = as.numeric(icnsa_data$date))$y
smoothed <- tsSmooth(fit)
print(str(smoothed))
#> Time-Series [1:2983, 1:3] from 1 to 58.3: 285370 275800 270376 265487 261846 ...
#> - attr(*, "dimnames")=List of 2
#> ..$ : NULL
#> ..$ : chr [1:3] "level" "slope" "sea"
#> NULL
# Apply the tsSmooth function
smoothed <- tsSmooth(fit)
# Extract the 'level' and 'seasonal' components
trend <- smoothed[, "level"]
seasonal <- smoothed[, "sea"]
# Plot the original data
plot(icnsa_data$date, icnsa_ts, type = "l", col = "black", main = "Original Data and Extracted Components", ylab = "ICNSA Value")
# Plot the trend component
lines(icnsa_data$date, trend, col = "blue", lty = 2)
# Plot the original data with the trend component added to the seasonal component
lines(icnsa_data$date, trend + seasonal, col = "green", lty = 3)
# Add a legend to the plot
legend("topright", legend = c("Original Data", "Trend", "Trend + Seasonal"), col = c("black", "blue", "green"), lty = 1:3, cex = 0.8)
library(KFAS)
#> Warning: package 'KFAS' was built under R version 4.2.3
#> Please cite KFAS in publications by using:
#>
#> Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
irusa_data <- fredr(series_id = "IURSA")
irusa_data$date <- as.Date(irusa_data$date)
head(irusa_data)
#> # A tibble: 6 × 5
#> date series_id value realtime_start realtime_end
#> <date> <chr> <dbl> <date> <date>
#> 1 1971-01-02 IURSA 3.9 2024-03-07 2024-03-07
#> 2 1971-01-09 IURSA 4 2024-03-07 2024-03-07
#> 3 1971-01-16 IURSA 3.9 2024-03-07 2024-03-07
#> 4 1971-01-23 IURSA 3.9 2024-03-07 2024-03-07
#> 5 1971-01-30 IURSA 3.9 2024-03-07 2024-03-07
#> 6 1971-02-06 IURSA 4 2024-03-07 2024-03-07
modified_icnsa <- subset(icnsa_data, date >= as.Date("1967-01-07") & date <= as.Date("2024-02-17"))
merged_data <- merge(modified_icnsa, irusa_data, by = "date", all = FALSE)
head(merged_data)
#> date series_id.x value.x realtime_start.x realtime_end.x series_id.y
#> 1 1971-01-02 ICNSA 443000 2024-03-07 2024-03-07 IURSA
#> 2 1971-01-09 ICNSA 500000 2024-03-07 2024-03-07 IURSA
#> 3 1971-01-16 ICNSA 452000 2024-03-07 2024-03-07 IURSA
#> 4 1971-01-23 ICNSA 399000 2024-03-07 2024-03-07 IURSA
#> 5 1971-01-30 ICNSA 353000 2024-03-07 2024-03-07 IURSA
#> 6 1971-02-06 ICNSA 375000 2024-03-07 2024-03-07 IURSA
#> value.y realtime_start.y realtime_end.y
#> 1 3.9 2024-03-07 2024-03-07
#> 2 4.0 2024-03-07 2024-03-07
#> 3 3.9 2024-03-07 2024-03-07
#> 4 3.9 2024-03-07 2024-03-07
#> 5 3.9 2024-03-07 2024-03-07
#> 6 4.0 2024-03-07 2024-03-07
library(dlm)
#> Warning: package 'dlm' was built under R version 4.2.3
#>
#> Attaching package: 'dlm'
#> The following object is masked from 'package:ggplot2':
#>
#> %+%
start_period <- as.Date("1967-01-07")
end_period <- as.Date("2024-02-17")
adjusted_icnsa <- icnsa_data[icnsa_data$date >= start_period & icnsa_data$date <= end_period, ]
# Combine 'adjusted_icnsa' with 'irusa_data' based on the date field
combined_series <- merge(adjusted_icnsa, irusa_data, by = "date")
combined_series <- combined_series[order(combined_series$date), ]
observation_matrix <- cbind(intercept = 1, covariate = combined_series$value.y)
# Defining dimensions for model components
number_of_observations <- nrow(combined_series)
number_of_state_variables <- ncol(observation_matrix)
# State equation components
state_transition_matrix <- diag(rep(1, number_of_state_variables))
state_noise_covariance <- diag(rep(0.01, number_of_state_variables))
# Observation equation components
observation_noise_covariance <- diag(rep(1, number_of_observations))
initial_conditions_vector <- rep(0, number_of_state_variables)
initial_covariance_matrix <- diag(rep(1e5, number_of_state_variables))
# Model with 'dlm'
ss_model <- dlm(
FF = observation_matrix,
GG = state_transition_matrix,
V = observation_noise_covariance,
W = state_noise_covariance,
m0 = initial_conditions_vector,
C0 = initial_covariance_matrix
)
filtered_fit <- dlmFilter(merged_data$value.x, ss_model)
forecast_result <- dlmForecast(filtered_fit, nAhead = 1)
forecasted_next_value <- forecast_result$f[1]
cat("Forecasted value for the next period:", forecasted_next_value, "\n")
#> Forecasted value for the next period: 400725.4
summary(ss_model)
#> Length Class Mode
#> m0 2 -none- numeric
#> C0 4 -none- numeric
#> FF 5546 -none- numeric
#> V 7689529 -none- numeric
#> GG 4 -none- numeric
#> W 4 -none- numeric
Created on 2024-03-07 with reprex v2.1.0
Comparison of spline and statespace When we compare the original data with the model's fitted values, they look almost the same.. Normally, we expect the trend line to be smoother, and when combined with the seasonal component, it should still be less erratic compared to the original data.
Visualized ICNSA Data: Initial visualization of the ICNSA claims data to understand its trend over time. Identified COVID-19 Period: Marked the COVID-19 period in the data, from March 21, 2020, to February 21, 2021, to simulate the scenario of missing data. Simulated Missing Data: Set the values in the COVID-19 period to NA in the dataset to replicate missing data conditions. Imputed Missing Data Using Kalman Filter: Applied the Kalman filter (via na.kalman function in imputeTS package) to fill in the missing data Decomposed Data into Trend and Seasonal Components: Utilized a structural time series model (StructTS) to break down the data into trend and seasonal components. Incorporated IURSA as a Covariate: I included the Insured Unemployment Rate (IURSA) as a related factor and combined it with the ICNSA data. This was because I thought changes in unemployment rates might affect the number of unemployment claims. Constructed and Fitted State Space Model: I created a model using the dlm package that combined both the ICNSA series and the IURSA covariate. This model was meant to understand how unemployment claims change over time, considering the impact of unemployment rates. Forecasted Next Value: Finally, I used the fitted state space model to predict the next period's value (dlmForecast), providing an estimate based on the model's understanding of the data. **_
My forecasted value is 400725.5
_**
Defining Model Dimensions: I count how many data points I have (number_of_observations) and how many aspects of the data I'm tracking (number_of_state_variables). State Equation Components: I set up rules for how these aspects of the data change over time. Here, I assume they mostly stay the same (state_transition_matrix), with a little bit of random change (state_noise_covariance). Observation Equation Components: I specify how much noise or randomness I think is in my data (observation_noise_covariance) and where I think my data starts from (initial_conditions_vector and initial_covariance_matrix).
library(vars)
#> Loading required package: MASS
#> Loading required package: strucchange
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Loading required package: urca
#> Loading required package: lmtest
library(readr)
library(tseries)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(urca)
library(ggplot2)
library(reprex)
library(zoo)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:MASS':
#>
#> select
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyverse)
library(lubridate)
library(readxl)
library(vars)
library(BigVAR)
#> Loading required package: lattice
cdgvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/CDGVS.xlsx")
cdgvs_xl$Value <- as.numeric(cdgvs_xl$Value)
#> Warning: NAs introduced by coercion
cdgvs_xl <- cdgvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
cdgvs_xl <- cdgvs_xl %>% filter(format(Period, "%Y") != "2024")
head(cdgvs_xl)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <dbl>
#> 1 1992-01-01 18331
#> 2 1992-02-01 19558
#> 3 1992-03-01 20770
#> 4 1992-04-01 21024
#> 5 1992-05-01 21813
#> 6 1992-06-01 21069
sum(is.na(cdgvs_xl))
#> [1] 0
ggplot(cdgvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of CDGVS Value", x="Period", y="Value")
cgvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/CGVS.xlsx")
cgvs_xl$Value <- as.numeric(cgvs_xl$Value)
#> Warning: NAs introduced by coercion
cgvs_xl <- cgvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
head(cgvs_xl)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <dbl>
#> 1 1992-01-01 85583
#> 2 1992-02-01 86140
#> 3 1992-03-01 90380
#> 4 1992-04-01 89924
#> 5 1992-05-01 92558
#> 6 1992-06-01 93773
sum(is.na(cgvs_xl))
#> [1] 10
cgvs_xl <- cgvs_xl %>% filter(format(Period, "%Y") != "2024")
ggplot(cgvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of CGVS Value", x="Period", y="Value")
cngvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/CNGVS.xlsx")
cngvs_xl$Value <- as.numeric(cngvs_xl$Value)
#> Warning: NAs introduced by coercion
cngvs_xl <- cngvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
head(cngvs_xl)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <dbl>
#> 1 1992-01-01 19302
#> 2 1992-02-01 19547
#> 3 1992-03-01 20556
#> 4 1992-04-01 20528
#> 5 1992-05-01 20703
#> 6 1992-06-01 20945
cngvs_xl <- cngvs_xl %>% filter(format(Period, "%Y") != "2024")
sum(is.na(cngvs_xl))
#> [1] 0
ggplot(cngvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of CNGVS Value", x="Period", y="Value")
dgvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/DGVS.xlsx")
dgvs_xl$Value <- as.numeric(dgvs_xl$Value)
#> Warning: NAs introduced by coercion
dgvs_xl <- dgvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
head(dgvs_xl)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <dbl>
#> 1 1992-01-01 117958
#> 2 1992-02-01 119895
#> 3 1992-03-01 124897
#> 4 1992-04-01 126174
#> 5 1992-05-01 127638
#> 6 1992-06-01 127867
dgvs_xl <- dgvs_xl %>% filter(format(Period, "%Y") != "2024")
sum(is.na(dgvs_xl))
#> [1] 0
ggplot(dgvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of DGVS Value", x="Period", y="Value")
#DECOMPISTION
cdgvs_ts_data <- ts(cdgvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_cdgvs <- decompose(cdgvs_ts_data)
plot(decomposed_cdgvs)
cgvs_ts_data <- ts(cgvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_cgvs <- decompose(cgvs_ts_data)
plot(decomposed_cgvs)
cngvs_ts_data <- ts(cngvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_cngvs <- decompose(cngvs_ts_data)
plot(decomposed_cngvs)
dgvs_ts_data <- ts(dgvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_dgvs <- decompose(dgvs_ts_data)
plot(decomposed_dgvs)
#ADF TESTS
cdgvs_adf_result <- adf.test(cdgvs_ts_data)
cdgvs_adf_result
#>
#> Augmented Dickey-Fuller Test
#>
#> data: cdgvs_ts_data
#> Dickey-Fuller = -2.5158, Lag order = 7, p-value = 0.3593
#> alternative hypothesis: stationary
cgvs_adf_result <- adf.test(cgvs_ts_data)
cgvs_adf_result
#>
#> Augmented Dickey-Fuller Test
#>
#> data: cgvs_ts_data
#> Dickey-Fuller = -3.4213, Lag order = 7, p-value = 0.05057
#> alternative hypothesis: stationary
dgvs_adf_result <- adf.test(dgvs_ts_data)
dgvs_adf_result
#>
#> Augmented Dickey-Fuller Test
#>
#> data: dgvs_ts_data
#> Dickey-Fuller = -3.1486, Lag order = 7, p-value = 0.09684
#> alternative hypothesis: stationary
cngvs_adf_result <- adf.test(cngvs_ts_data)
cngvs_adf_result
#>
#> Augmented Dickey-Fuller Test
#>
#> data: cngvs_ts_data
#> Dickey-Fuller = -1.939, Lag order = 7, p-value = 0.6029
#> alternative hypothesis: stationary
#All the series turned to be non staionary now we will difference them and test adf again\
series1_diff <- diff(cgvs_ts_data, differences = 1)
series2_diff <- diff(cdgvs_ts_data, differences = 1)
series3_diff <- diff(dgvs_ts_data, differences =1 )
series4_diff <- diff(cngvs_ts_data, differences = 1)
adf_test_series1 <- adf.test(series1_diff)
#> Warning in adf.test(series1_diff): p-value smaller than printed p-value
adf_test_series2 <- adf.test(series2_diff)
#> Warning in adf.test(series2_diff): p-value smaller than printed p-value
adf_test_series3 <- adf.test(series3_diff)
#> Warning in adf.test(series3_diff): p-value smaller than printed p-value
adf_test_series4 <- adf.test(series4_diff)
#> Warning in adf.test(series4_diff): p-value smaller than printed p-value
#NOW combining all the series together for VAR model
combined_ts <- cbind(cngvs_ts_data, cgvs_ts_data, cdgvs_ts_data, dgvs_ts_data)
str(combined_ts)
#> Time-Series [1:384, 1:4] from 1992 to 2024: 19302 19547 20556 20528 20703 ...
#> - attr(*, "dimnames")=List of 2
#> ..$ : NULL
#> ..$ : chr [1:4] "cngvs_ts_data" "cgvs_ts_data" "cdgvs_ts_data" "dgvs_ts_data"
df_simplified <- combined_ts[, c('dgvs_ts_data', 'cgvs_ts_data', 'cdgvs_ts_data', 'cngvs_ts_data')]
# Fit the VAR(1) model
var1.model <- VAR(combined_ts, p = 1, type = "const")
summary(var1.model)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: cngvs_ts_data, cgvs_ts_data, cdgvs_ts_data, dgvs_ts_data
#> Deterministic variables: const
#> Sample size: 383
#> Log Likelihood: -13280.047
#> Roots of the characteristic polynomial:
#> 0.9938 0.9938 0.951 0.8017
#> Call:
#> VAR(y = combined_ts, p = 1, type = "const")
#>
#>
#> Estimation results for equation cngvs_ts_data:
#> ==============================================
#> cngvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 1.008637 0.011792 85.536 < 2e-16 ***
#> cgvs_ts_data.l1 0.003807 0.001800 2.115 0.03512 *
#> cdgvs_ts_data.l1 0.020447 0.010284 1.988 0.04750 *
#> dgvs_ts_data.l1 -0.009731 0.003417 -2.848 0.00464 **
#> const 467.609580 174.828829 2.675 0.00781 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 475.9 on 378 degrees of freedom
#> Multiple R-Squared: 0.9947, Adjusted R-squared: 0.9946
#> F-statistic: 1.771e+04 on 4 and 378 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation cgvs_ts_data:
#> =============================================
#> cgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 0.38147 0.09422 4.049 6.25e-05 ***
#> cgvs_ts_data.l1 0.99597 0.01438 69.242 < 2e-16 ***
#> cdgvs_ts_data.l1 -0.17593 0.08217 -2.141 0.0329 *
#> dgvs_ts_data.l1 -0.03884 0.02730 -1.423 0.1557
#> const 1803.95692 1396.86072 1.291 0.1973
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 3802 on 378 degrees of freedom
#> Multiple R-Squared: 0.9928, Adjusted R-squared: 0.9928
#> F-statistic: 1.308e+04 on 4 and 378 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation cdgvs_ts_data:
#> ==============================================
#> cdgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 1.655e-01 4.498e-02 3.679 0.000269 ***
#> cgvs_ts_data.l1 -6.417e-03 6.867e-03 -0.935 0.350637
#> cdgvs_ts_data.l1 8.247e-01 3.923e-02 21.023 < 2e-16 ***
#> dgvs_ts_data.l1 9.257e-04 1.303e-02 0.071 0.943415
#> const 1.083e+03 6.669e+02 1.624 0.105307
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 1815 on 378 degrees of freedom
#> Multiple R-Squared: 0.9131, Adjusted R-squared: 0.9122
#> F-statistic: 993 on 4 and 378 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation dgvs_ts_data:
#> =============================================
#> dgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 0.33772 0.10569 3.195 0.00151 **
#> cgvs_ts_data.l1 0.02932 0.01614 1.817 0.07003 .
#> cdgvs_ts_data.l1 -0.06188 0.09217 -0.671 0.50238
#> dgvs_ts_data.l1 0.91105 0.03063 29.748 < 2e-16 ***
#> const 4332.90201 1566.99267 2.765 0.00597 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 4265 on 378 degrees of freedom
#> Multiple R-Squared: 0.9865, Adjusted R-squared: 0.9863
#> F-statistic: 6893 on 4 and 378 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data 226472 880043 443968 1310690
#> cgvs_ts_data 880043 14457568 4658727 10910001
#> cdgvs_ts_data 443968 4658727 3295422 6548669
#> dgvs_ts_data 1310690 10910001 6548669 18193781
#>
#> Correlation matrix of residuals:
#> cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data 1.0000 0.4863 0.5139 0.6457
#> cgvs_ts_data 0.4863 1.0000 0.6749 0.6727
#> cdgvs_ts_data 0.5139 0.6749 1.0000 0.8457
#> dgvs_ts_data 0.6457 0.6727 0.8457 1.0000
#For optimal lag for VAR Model
var_result <- VARselect(combined_ts, lag.max = 10, type = "both")
optimal_lag <- var_result$selection
print(optimal_lag)
#> AIC(n) HQ(n) SC(n) FPE(n)
#> 4 4 2 4
# Fit the VAR(4) model based on the optimal lag length
var4.model <- VAR(combined_ts, p = 4, type = "const")
summary(var4.model)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: cngvs_ts_data, cgvs_ts_data, cdgvs_ts_data, dgvs_ts_data
#> Deterministic variables: const
#> Sample size: 380
#> Log Likelihood: -13054.148
#> Roots of the characteristic polynomial:
#> 0.9877 0.9877 0.971 0.8944 0.8141 0.7398 0.7398 0.6232 0.6232 0.501 0.501 0.4052 0.351 0.351 0.3106 0.04926
#> Call:
#> VAR(y = combined_ts, p = 4, type = "const")
#>
#>
#> Estimation results for equation cngvs_ts_data:
#> ==============================================
#> cngvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 9.698e-01 6.599e-02 14.695 < 2e-16 ***
#> cgvs_ts_data.l1 4.028e-02 9.091e-03 4.431 1.25e-05 ***
#> cdgvs_ts_data.l1 2.005e-02 2.683e-02 0.747 0.45542
#> dgvs_ts_data.l1 -2.521e-02 1.250e-02 -2.016 0.04454 *
#> cngvs_ts_data.l2 1.626e-01 8.769e-02 1.855 0.06445 .
#> cgvs_ts_data.l2 -3.912e-02 1.423e-02 -2.749 0.00627 **
#> cdgvs_ts_data.l2 -3.478e-02 3.578e-02 -0.972 0.33175
#> dgvs_ts_data.l2 1.117e-02 1.499e-02 0.745 0.45668
#> cngvs_ts_data.l3 6.702e-03 8.799e-02 0.076 0.93933
#> cgvs_ts_data.l3 2.172e-03 1.444e-02 0.150 0.88054
#> cdgvs_ts_data.l3 -2.362e-02 3.488e-02 -0.677 0.49868
#> dgvs_ts_data.l3 2.747e-02 1.506e-02 1.824 0.06893 .
#> cngvs_ts_data.l4 -1.433e-01 6.721e-02 -2.132 0.03371 *
#> cgvs_ts_data.l4 -6.053e-04 9.634e-03 -0.063 0.94994
#> cdgvs_ts_data.l4 5.705e-02 2.463e-02 2.317 0.02108 *
#> dgvs_ts_data.l4 -1.944e-02 1.225e-02 -1.588 0.11323
#> const 3.535e+02 1.741e+02 2.030 0.04305 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 454 on 363 degrees of freedom
#> Multiple R-Squared: 0.9952, Adjusted R-squared: 0.995
#> F-statistic: 4691 on 16 and 363 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation cgvs_ts_data:
#> =============================================
#> cgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 0.34996 0.50556 0.692 0.48925
#> cgvs_ts_data.l1 1.36455 0.06965 19.593 < 2e-16 ***
#> cdgvs_ts_data.l1 -0.27807 0.20557 -1.353 0.17700
#> dgvs_ts_data.l1 -0.05367 0.09579 -0.560 0.57560
#> cngvs_ts_data.l2 1.41584 0.67180 2.108 0.03576 *
#> cgvs_ts_data.l2 -0.44419 0.10902 -4.074 5.67e-05 ***
#> cdgvs_ts_data.l2 -0.34167 0.27413 -1.246 0.21344
#> dgvs_ts_data.l2 0.03328 0.11482 0.290 0.77211
#> cngvs_ts_data.l3 -1.17569 0.67409 -1.744 0.08198 .
#> cgvs_ts_data.l3 0.10899 0.11065 0.985 0.32526
#> cdgvs_ts_data.l3 -0.26208 0.26721 -0.981 0.32734
#> dgvs_ts_data.l3 0.26851 0.11536 2.328 0.02048 *
#> cngvs_ts_data.l4 -0.28667 0.51492 -0.557 0.57806
#> cgvs_ts_data.l4 -0.05064 0.07381 -0.686 0.49309
#> cdgvs_ts_data.l4 0.60871 0.18866 3.227 0.00137 **
#> dgvs_ts_data.l4 -0.24149 0.09383 -2.574 0.01045 *
#> const 917.57323 1333.82070 0.688 0.49194
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 3478 on 363 degrees of freedom
#> Multiple R-Squared: 0.9941, Adjusted R-squared: 0.9939
#> F-statistic: 3831 on 16 and 363 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation cdgvs_ts_data:
#> ==============================================
#> cdgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 0.426004 0.244967 1.739 0.082877 .
#> cgvs_ts_data.l1 0.112590 0.033746 3.336 0.000937 ***
#> cdgvs_ts_data.l1 0.951656 0.099608 9.554 < 2e-16 ***
#> dgvs_ts_data.l1 -0.080570 0.046414 -1.736 0.083428 .
#> cngvs_ts_data.l2 0.644588 0.325517 1.980 0.048436 *
#> cgvs_ts_data.l2 -0.145373 0.052826 -2.752 0.006222 **
#> cdgvs_ts_data.l2 -0.104645 0.132828 -0.788 0.431315
#> dgvs_ts_data.l2 -0.074377 0.055637 -1.337 0.182115
#> cngvs_ts_data.l3 -0.594116 0.326624 -1.819 0.069741 .
#> cgvs_ts_data.l3 0.021015 0.053613 0.392 0.695302
#> cdgvs_ts_data.l3 -0.158001 0.129476 -1.220 0.223136
#> dgvs_ts_data.l3 0.189997 0.055896 3.399 0.000751 ***
#> cngvs_ts_data.l4 -0.373582 0.249503 -1.497 0.135183
#> cgvs_ts_data.l4 0.005635 0.035764 0.158 0.874882
#> cdgvs_ts_data.l4 0.157391 0.091412 1.722 0.085964 .
#> dgvs_ts_data.l4 -0.025975 0.045463 -0.571 0.568124
#> const 728.323981 646.294010 1.127 0.260519
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 1685 on 363 degrees of freedom
#> Multiple R-Squared: 0.9256, Adjusted R-squared: 0.9223
#> F-statistic: 282.1 on 16 and 363 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation dgvs_ts_data:
#> =============================================
#> dgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> cngvs_ts_data.l1 1.35219 0.55341 2.443 0.015026 *
#> cgvs_ts_data.l1 0.38841 0.07624 5.095 5.63e-07 ***
#> cdgvs_ts_data.l1 0.50328 0.22503 2.237 0.025924 *
#> dgvs_ts_data.l1 0.47477 0.10485 4.528 8.09e-06 ***
#> cngvs_ts_data.l2 0.94609 0.73538 1.287 0.199077
#> cgvs_ts_data.l2 -0.31706 0.11934 -2.657 0.008237 **
#> cdgvs_ts_data.l2 -0.75683 0.30007 -2.522 0.012091 *
#> dgvs_ts_data.l2 0.17962 0.12569 1.429 0.153851
#> cngvs_ts_data.l3 -1.55298 0.73788 -2.105 0.036008 *
#> cgvs_ts_data.l3 0.02873 0.12112 0.237 0.812600
#> cdgvs_ts_data.l3 -0.73967 0.29250 -2.529 0.011869 *
#> dgvs_ts_data.l3 0.69092 0.12628 5.471 8.33e-08 ***
#> cngvs_ts_data.l4 -0.55131 0.56366 -0.978 0.328682
#> cgvs_ts_data.l4 -0.09118 0.08079 -1.129 0.259849
#> cdgvs_ts_data.l4 0.85035 0.20651 4.118 4.74e-05 ***
#> dgvs_ts_data.l4 -0.37438 0.10271 -3.645 0.000306 ***
#> const 2711.78203 1460.05415 1.857 0.064076 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 3807 on 363 degrees of freedom
#> Multiple R-Squared: 0.9892, Adjusted R-squared: 0.9888
#> F-statistic: 2087 on 16 and 363 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data 206133 651228 362505 1045325
#> cgvs_ts_data 651228 12098358 3792233 8209165
#> cdgvs_ts_data 362505 3792233 2840480 5483311
#> dgvs_ts_data 1045325 8209165 5483311 14496710
#>
#> Correlation matrix of residuals:
#> cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data 1.0000 0.4124 0.4737 0.6047
#> cgvs_ts_data 0.4124 1.0000 0.6469 0.6199
#> cdgvs_ts_data 0.4737 0.6469 1.0000 0.8545
#> dgvs_ts_data 0.6047 0.6199 0.8545 1.0000
#Forecasting the values
forecast <- predict(var4.model, n.ahead = 1, ci = 0.95) # ci is the confidence interval and I gave the threshhold of 0.95
print(forecast)
#> $cngvs_ts_data
#> fcst lower upper CI
#> cngvs_ts_data.fcst 47709.96 46820.1 48599.82 889.8605
#>
#> $cgvs_ts_data
#> fcst lower upper CI
#> cgvs_ts_data.fcst 244795.3 237978 251612.5 6817.283
#>
#> $cdgvs_ts_data
#> fcst lower upper CI
#> cdgvs_ts_data.fcst 44109.68 40806.41 47412.95 3303.269
#>
#> $dgvs_ts_data
#> fcst lower upper CI
#> dgvs_ts_data.fcst 280796.2 273333.7 288258.7 7462.474
#Granger Test
granger_test <- causality(var4.model)
#> Warning in causality(var4.model):
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (cngvs_ts_data) as cause variable.
print(granger_test)
#> $Granger
#>
#> Granger causality H0: cngvs_ts_data do not Granger-cause cgvs_ts_data
#> cdgvs_ts_data dgvs_ts_data
#>
#> data: VAR object var4.model
#> F-Test = 3.0066, df1 = 12, df2 = 1452, p-value = 0.0003539
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: cngvs_ts_data and cgvs_ts_data
#> cdgvs_ts_data dgvs_ts_data
#>
#> data: VAR object var4.model
#> Chi-squared = 104.26, df = 3, p-value < 2.2e-16
#Apply BigVAR with BASIC struct
combined_matrix <- cbind(cgvs_ts_data, cdgvs_ts_data, cngvs_ts_data, dgvs_ts_data)
sparse_var_fit <- BigVAR.fit(Y=combined_matrix,struct='Basic',p=2,lambda=1)[,,1]
print(sparse_var_fit)
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 840.3703 1.16271304 -0.14150564 0.2421101 0.07491145 -0.1917155
#> [2,] 344.6862 0.11765031 0.38418171 0.1352681 0.16302836 -0.1460612
#> [3,] 692.7337 0.05780321 0.08645465 0.4141812 0.01270520 -0.0436268
#> [4,] 4041.2572 0.33713395 0.11909525 0.1514570 0.76268669 -0.3106586
#> [,7] [,8] [,9]
#> [1,] -0.22813109 0.21367517 -0.07064288
#> [2,] 0.23767560 0.10750817 -0.12139601
#> [3,] 0.07910418 0.40037979 -0.02287711
#> [4,] -0.14835613 0.09192961 0.16221371
summary(sparse_var_fit)
#> V1 V2 V3 V4
#> Min. : 344.7 Min. :0.0578 Min. :-0.14151 Min. :0.1353
#> 1st Qu.: 605.7 1st Qu.:0.1027 1st Qu.: 0.02946 1st Qu.:0.1474
#> Median : 766.6 Median :0.2274 Median : 0.10278 Median :0.1968
#> Mean :1479.8 Mean :0.4188 Mean : 0.11206 Mean :0.2358
#> 3rd Qu.:1640.6 3rd Qu.:0.5435 3rd Qu.: 0.18537 3rd Qu.:0.2851
#> Max. :4041.3 Max. :1.1627 Max. : 0.38418 Max. :0.4142
#> V5 V6 V7 V8
#> Min. :0.01271 Min. :-0.31066 Min. :-0.22813 Min. :0.09193
#> 1st Qu.:0.05936 1st Qu.:-0.22145 1st Qu.:-0.16830 1st Qu.:0.10361
#> Median :0.11897 Median :-0.16889 Median :-0.03463 Median :0.16059
#> Mean :0.25333 Mean :-0.17302 Mean :-0.01493 Mean :0.20337
#> 3rd Qu.:0.31294 3rd Qu.:-0.12045 3rd Qu.: 0.11875 3rd Qu.:0.26035
#> Max. :0.76269 Max. :-0.04363 Max. : 0.23768 Max. :0.40038
#> V9
#> Min. :-0.12140
#> 1st Qu.:-0.08333
#> Median :-0.04676
#> Mean :-0.01318
#> 3rd Qu.: 0.02340
#> Max. : 0.16221
Created on 2024-04-11 with reprex v2.1.0
Consumer Durable Goods (CDGVS): Focuses on long-lasting goods like electronics and furniture, capturing their shipment values. It's an indicator of consumer confidence and economic health.
Consumer Goods (CGVS): Represents all consumer goods, both durable and nondurable. This dataset helps understand overall consumer spending, a key economic driver.
Consumer Nondurable Goods (CNGVS): Targets short-lived goods such as food and clothing, measuring their shipment values. Reflects essential, less volatile consumer spending.
Durable Goods (DGVS): Includes durable goods beyond just consumer items, covering business and government purchases. It's a sign of broader economic activity and investment trends.
Identified 10 null values at the end of all the 2024, reflecting the data collection period and removed those dates
ADF tests, revealing non-stationarity. Applied first-order differencing (d=1) to achieve stationarity for empirical analysis. After the first differing all the series is now stationary.
After that applied VAR Model start with var with p =1 and found the optimal lag and found that 4 will be suitable for our dataset. so did VAR4
VAR4 fit better and captured more data and did bigVAR with threshold of 0.95 confidence intervals
Created on 2024-02-08 with reprex v2.1.0