jlivsey / UB-sping24-time-series


Sree Lekshmi Prasannan #24

Open Sree-lekshmi99 opened 9 months ago

Sree-lekshmi99 commented 9 months ago
library(astsa)
#> Warning: package 'astsa' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
#> 
#> Attaching package: 'forecast'
#> The following object is masked from 'package:astsa':
#> 
#>     gas
library(fpp3)
#> Warning: package 'fpp3' was built under R version 4.2.3
#> ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
#> ✔ tibble      3.2.1     ✔ tsibble     1.1.3
#> ✔ dplyr       1.1.3     ✔ tsibbledata 0.4.1
#> ✔ tidyr       1.3.0     ✔ feasts      0.3.1
#> ✔ lubridate   1.9.3     ✔ fable       0.3.3
#> ✔ ggplot2     3.4.4     ✔ fabletools  0.3.4
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.3.1
#> Warning: package 'lubridate' was built under R version 4.3.1
#> Warning: package 'ggplot2' was built under R version 4.3.1
#> Warning: package 'tsibble' was built under R version 4.2.3
#> Warning: package 'tsibbledata' was built under R version 4.2.3
#> Warning: package 'feasts' was built under R version 4.2.3
#> Warning: package 'fabletools' was built under R version 4.2.3
#> Warning: package 'fable' was built under R version 4.2.3
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date()    masks base::date()
#> ✖ dplyr::filter()      masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval()  masks lubridate::interval()
#> ✖ dplyr::lag()         masks stats::lag()
#> ✖ tsibble::setdiff()   masks base::setdiff()
#> ✖ tsibble::union()     masks base::union()
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(lubridate)
library(seasonal)
#> Warning: package 'seasonal' was built under R version 4.2.3
#> 
#> Attaching package: 'seasonal'
#> The following object is masked from 'package:tibble':
#> 
#>     view
#> The following objects are masked from 'package:astsa':
#> 
#>     trend, unemp
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(zoo)
#> Warning: package 'zoo' was built under R version 4.3.1
#> 
#> Attaching package: 'zoo'
#> The following object is masked from 'package:tsibble':
#> 
#>     index
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(zoo)
library(tseries)
fredr_set_key('b6f6d4e821ce7228138fcd4382f659ae')
data_icnsa_fred <- fredr(series_id = 'ICNSA')
head(data_icnsa_fred)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-08     2024-02-08  
#> 2 1967-01-14 ICNSA     334000 2024-02-08     2024-02-08  
#> 3 1967-01-21 ICNSA     277000 2024-02-08     2024-02-08  
#> 4 1967-01-28 ICNSA     252000 2024-02-08     2024-02-08  
#> 5 1967-02-04 ICNSA     274000 2024-02-08     2024-02-08  
#> 6 1967-02-11 ICNSA     276000 2024-02-08     2024-02-08
sum(is.na(data_icnsa_fred))
#> [1] 0
data_ts <- zoo(data_icnsa_fred$value, order.by = data_icnsa_fred$date)

adf_result <- adf.test(data_ts)
#> Warning in adf.test(data_ts): p-value smaller than printed p-value

print(adf_result)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data_ts
#> Dickey-Fuller = -8.8641, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary

#The very small p-value (< 0.01) rejects the unit-root null, suggesting our time series is stationary

#Plotting our time series data

plot(data_ts, main = "ICNSA Time Series Plot", xlab = "Date", ylab = "Value")

summary(data_icnsa_fred$value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  133000  265852  326938  365060  406816 6161268

# Decompose the series to analyze it further (the COVID window is marked for reference)
covid_start <- as.Date("2020-03-14")
covid_end <- as.Date("2021-01-02")

covid_data <- subset(data_icnsa_fred, date >= covid_start & date <= covid_end)

data_ts_zoo <- zoo(data_icnsa_fred$value, order.by = data_icnsa_fred$date)
data_ts <- ts(data_icnsa_fred$value, start = c(1967, 1), frequency = 52) 

stl_decomp <- stl(data_ts, s.window = "periodic", robust = TRUE)
plot(stl_decomp)
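
# Sketch: the seasonally adjusted series can be pulled from the STL fit with
# forecast::seasadj() for a cleaner view of trend plus remainder.
plot(seasadj(stl_decomp), main = "Seasonally Adjusted ICNSA")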


#ACF and PACF 
acf(data_ts)

pacf(data_ts)


#The PACF cuts off after the first lag, suggesting p = 1; the first lag of the ACF is well above the significance bound, suggesting q = 1; and since the series is already stationary, d = 0
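# Sketch: as a cross-check on the hand-picked order, auto.arima() searches
# over (p,d,q) by AICc; its selection can be compared with the ARIMA(1,0,1)
# fit below.
auto_fit <- auto.arima(data_ts, seasonal = FALSE)
summary(auto_fit)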
library(forecast)
model_arima <- Arima(data_ts, order = c(1,0,1))
print(model_arima)
#> Series: data_ts 
#> ARIMA(1,0,1) with non-zero mean 
#> 
#> Coefficients:
#>          ar1     ma1       mean
#>       0.8792  0.4179  365051.32
#> s.e.  0.0091  0.0156   18827.52
#> 
#> sigma^2 = 7.632e+09:  log likelihood = -38121.17
#> AIC=76250.35   AICc=76250.36   BIC=76274.35

forecast_arima <- forecast(model_arima, h = 7)  

plot(forecast_arima)


rounded_forecast <- round(forecast_arima$mean)
print(rounded_forecast)
#> Time Series:
#> Start = c(2024, 16) 
#> End = c(2024, 22) 
#> Frequency = 52 
#> [1] 228495 244988 259489 272238 283448 293304 301969

# FORECASTED VALUE IS 244988

Created on 2024-02-08 with reprex v2.1.0

Sree-lekshmi99 commented 9 months ago

HOMEWORK#1

library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.1
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
fredr_set_key("dee75be4acc1b9d6c94d9f59867a4910")

icnsa <- fredr(series_id = "ICNSA") 

icnsa$date <- as.Date(icnsa$date)
# CCNSA shows how many people are still receiving unemployment benefits in the U.S., helping to gauge job-market trends without seasonal adjustment.

continued_claims <- fredr(series_id = "CCNSA")

continued_claims$date <- as.Date(continued_claims$date)

summary(icnsa$value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  133000  265800  326919  365013  406725 6161268

summary(continued_claims$value)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>   785000  1996902  2494495  2761854  3145702 23037921

ggplot(icnsa, aes(x = date, y = value)) +
  geom_line(color = "red") +  
  ggtitle("ICNSA Claims") + 
  xlab("Year") + 
  ylab("Number of Claims")


ggplot(continued_claims, aes(x = date, y = value)) +
  geom_line(color = "red") +  
  ggtitle("Continued Claims (Insured Unemployment)") + 
  xlab("Year") + 
  ylab("Number of Claims")


library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
adf_icnsa <- adf.test(icnsa$value, alternative = "stationary")
#> Warning in adf.test(icnsa$value, alternative = "stationary"): p-value smaller
#> than printed p-value

# Stationarity checks using Augmented Dickey-Fuller Test for CCNSA
adf_ccnsa <- adf.test(continued_claims$value, alternative = "stationary")
#> Warning in adf.test(continued_claims$value, alternative = "stationary"):
#> p-value smaller than printed p-value

# Output the ADF test results
adf_icnsa$p.value
#> [1] 0.01
adf_ccnsa$p.value
#> [1] 0.01

acf(icnsa$value)

pacf(icnsa$value)


acf(continued_claims$value)

pacf(continued_claims$value)

# The ADF test says the data are stationary, but the ACF and PACF still show some characteristics of non-stationary data

merged_data <- merge(icnsa, continued_claims, by = "date")

correlation <- cor(merged_data$value.x, merged_data$value.y)

print(correlation)
#> [1] 0.714954
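
# Sketch: beyond the level correlation, the cross-correlation of the
# differenced series shows how changes in initial and continued claims lead
# or lag one another.
ccf(diff(merged_data$value.x), diff(merged_data$value.y),
    main = "CCF of Differenced ICNSA and CCNSA")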

library(forecast)

#merged_data is my merged and cleaned dataset with 'value.x' as ICNSA and 'value.y' as CCNSA.

# Log transformation
merged_data$value.x <- log(merged_data$value.x)
merged_data$value.y <- log(merged_data$value.y)

# Fit the regARIMA model with CCNSA as an external regressor
model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y)

summary(model)
#> Series: merged_data$value.x 
#> Regression with ARIMA(3,1,1) errors 
#> 
#> Coefficients:
#>           ar1      ar2      ar3     ma1    xreg
#>       -0.5099  -0.1054  -0.0655  0.1678  0.6474
#> s.e.   0.1751   0.0623   0.0186  0.1743  0.0523
#> 
#> sigma^2 = 0.01606:  log likelihood = 1928.11
#> AIC=-3844.21   AICc=-3844.19   BIC=-3808.22
#> 
#> Training set error measures:
#>                         ME      RMSE        MAE          MPE      MAPE
#> Training set -0.0002598266 0.1266183 0.08957493 -0.006119179 0.6994997
#>                   MASE         ACF1
#> Training set 0.9719684 0.0005756628

# Diagnostic checks
checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(3,1,1) errors
#> Q* = 210.41, df = 6, p-value < 2.2e-16
#> 
#> Model df: 4.   Total lags used: 10

forecasted_value <- forecast(model, xreg = as.numeric(future_ccnsa_value), h = 1)
#> Error in forecast.forecast_ARIMA(model, xreg = as.numeric(future_ccnsa_value), : object 'future_ccnsa_value' not found

exp_forecasted_value <- exp(forecasted_value$mean)
#> Error in eval(expr, envir, enclos): object 'forecasted_value' not found

# Print the exponentiated forecasted value
print(exp_forecasted_value)
#> Error in print(exp_forecasted_value): object 'exp_forecasted_value' not found

# Plot the forecast
plot(forecasted_value)
#> Error in plot(forecasted_value): object 'forecasted_value' not found

Created on 2024-02-22 with reprex v2.1.0

Sree-lekshmi99 commented 9 months ago

HW1 (updated)

library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.1
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
fredr_set_key("dee75be4acc1b9d6c94d9f59867a4910")

icnsa <- fredr(series_id = "ICNSA") 

icnsa$date <- as.Date(icnsa$date)
# CCNSA shows how many people are still receiving unemployment benefits in the U.S., helping to gauge job-market trends without seasonal adjustment.

continued_claims <- fredr(series_id = "CCNSA")

continued_claims$date <- as.Date(continued_claims$date)

summary(icnsa$value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  133000  265794  326900  364957  406633 6161268

summary(continued_claims$value)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>   785000  1996951  2493752  2761628  3145554 23037921

ggplot(icnsa, aes(x = date, y = value)) +
  geom_line(color = "red") +  
  ggtitle("ICNSA Claims") + 
  xlab("Year") + 
  ylab("Number of Claims")


ggplot(continued_claims, aes(x = date, y = value)) +
  geom_line(color = "red") +  
  ggtitle("Continued Claims (Insured Unemployment)") + 
  xlab("Year") + 
  ylab("Number of Claims")


library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
adf_icnsa <- adf.test(icnsa$value, alternative = "stationary")
#> Warning in adf.test(icnsa$value, alternative = "stationary"): p-value smaller
#> than printed p-value

# Stationarity checks using Augmented Dickey-Fuller Test for CCNSA
adf_ccnsa <- adf.test(continued_claims$value, alternative = "stationary")
#> Warning in adf.test(continued_claims$value, alternative = "stationary"):
#> p-value smaller than printed p-value

# Output the ADF test results
adf_icnsa$p.value
#> [1] 0.01
adf_ccnsa$p.value
#> [1] 0.01

acf(icnsa$value)

pacf(icnsa$value)


acf(continued_claims$value)

pacf(continued_claims$value)

# The ADF test says the data are stationary, but the ACF and PACF still show some characteristics of non-stationary data

merged_data <- merge(icnsa, continued_claims, by = "date")

correlation <- cor(merged_data$value.x, merged_data$value.y)

print(correlation)
#> [1] 0.7149735

library(forecast)

#merged_data is my merged and cleaned dataset with 'value.x' as ICNSA and 'value.y' as CCNSA.

# Log transformation
merged_data$value.x <- log(merged_data$value.x)
merged_data$value.y <- log(merged_data$value.y)

# Fitting the regARIMA model with CCNSA as an external regressor
model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y)

summary(model)
#> Series: merged_data$value.x 
#> Regression with ARIMA(3,1,1) errors 
#> 
#> Coefficients:
#>           ar1      ar2      ar3     ma1    xreg
#>       -0.5112  -0.1059  -0.0653  0.1693  0.6472
#> s.e.   0.1758   0.0625   0.0186  0.1751  0.0523
#> 
#> sigma^2 = 0.01606:  log likelihood = 1929.06
#> AIC=-3846.13   AICc=-3846.1   BIC=-3810.13
#> 
#> Training set error measures:
#>                         ME      RMSE        MAE          MPE      MAPE     MASE
#> Training set -0.0002861374 0.1266051 0.08956942 -0.006331525 0.6994647 0.972069
#>                      ACF1
#> Training set 0.0005792167

checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(3,1,1) errors
#> Q* = 210.94, df = 6, p-value < 2.2e-16
#> 
#> Model df: 4.   Total lags used: 10
future_ccnsa_value <- tail(merged_data$value.y)  # last six observed CCNSA values as stand-in future covariates

# Note: when xreg is supplied, forecast() returns one forecast per row of
# xreg, so this yields six forecasts and h = 52 is effectively ignored.
forecasted_value <- forecast(model, xreg = as.numeric(future_ccnsa_value), h = 52)

exp_forecasted_value <- exp(forecasted_value$mean)

# Print the exponentiated forecasted value
print(exp_forecasted_value)
#> Time Series:
#> Start = 2981 
#> End = 2986 
#> Frequency = 1 
#> [1] 228467.5 224855.6 233403.1 229741.5 230442.0 227342.1

# Plot the forecast
plot(forecasted_value)

Created on 2024-02-28 with reprex v2.1.0

Sree-lekshmi99 commented 9 months ago

HOMEWORK2

library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.3.1
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(urca)
#> Warning: package 'urca' was built under R version 4.2.3
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.3.1
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.3.1
#> Warning: package 'readr' was built under R version 4.3.1
#> Warning: package 'purrr' was built under R version 4.3.1
#> Warning: package 'stringr' was built under R version 4.3.1
#> Warning: package 'forcats' was built under R version 4.3.1
#> Warning: package 'lubridate' was built under R version 4.3.1
fredr_set_key("dee75be4acc1b9d6c94d9f59867a4910")

icnsa <- fredr(series_id = "ICNSA") 

head(icnsa)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-28     2024-02-28  
#> 2 1967-01-14 ICNSA     334000 2024-02-28     2024-02-28  
#> 3 1967-01-21 ICNSA     277000 2024-02-28     2024-02-28  
#> 4 1967-01-28 ICNSA     252000 2024-02-28     2024-02-28  
#> 5 1967-02-04 ICNSA     274000 2024-02-28     2024-02-28  
#> 6 1967-02-11 ICNSA     276000 2024-02-28     2024-02-28

#The start of the spike is just before the last major tick mark on the x-axis, which is 2020.
#The end of the spike, or at least where it starts to drop, could extend well into 2021 or even 2022.

#Taking the start date as February 2020 and the end date as May 2021

covidStart <- as.Date("2020-02-01")
covidEnd <- as.Date("2021-05-01")

covidPlot <- ggplot(icnsa, aes(date, value)) +
  geom_line() +  
  geom_vline(xintercept = as.numeric(covidStart), linetype = "dotdash", color = "darkred") +
  geom_vline(xintercept = as.numeric(covidEnd), linetype = "dotdash", color = "green") +
  # Define the labels for the plot
  ggtitle("Claims During Covid Period") +
  xlab("Year") +
  ylab("Claims Count")

covidPlot


#Subset the data excluding the COVID period
dataNotInCovid <- icnsa[!(icnsa$date >= covidStart & icnsa$date <= covidEnd), ]

fit <- smooth.spline(dataNotInCovid$date, dataNotInCovid$value)

# fit is the result of fitting a smooth spline to the non-COVID data, with date as the independent variable and value as the dependent variable. The smoothing parameter chosen by smooth.spline() is stored in fit$spar (the standardized spar scale); the corresponding penalty weight is in fit$lambda.
# To access the chosen smoothing parameter
optimal_spar <- fit$spar
optimal_spar
#> [1] 0.1292562
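
# Sketch: smooth.spline() reports the smoothing parameter on two scales;
# spar is the standardized value and lambda is the actual penalty weight.
c(spar = fit$spar, lambda = fit$lambda)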

#So the chosen spar value is about 0.129; note the refit below passes it as lambda, which is a different scale (fit$lambda holds the matching penalty)
spline_fit <- smooth.spline(x = as.numeric(dataNotInCovid$date), y = dataNotInCovid$value,lambda = 0.129)

#plotting the spline against our original data
plot(as.numeric(dataNotInCovid$date), dataNotInCovid$value, type = "p", col = "red", main = "Original vs. Smoothed Data", xlab = "Date", ylab = "Value", xlim = range(as.numeric(dataNotInCovid$date)), ylim = range(dataNotInCovid$value))
lines(spline_fit, col = "black")

legend("topright", legend = c("Original Data", "Smoothed Spline"), col = c("red", "blue"), lty = 1)


#Collecting the COVID-period rows
dataInCovid <- icnsa[(icnsa$date >= covidStart & icnsa$date <= covidEnd), ]

head(dataInCovid)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 2020-02-01 ICNSA     224561 2024-02-28     2024-02-28  
#> 2 2020-02-08 ICNSA     219459 2024-02-28     2024-02-28  
#> 3 2020-02-15 ICNSA     209218 2024-02-28     2024-02-28  
#> 4 2020-02-22 ICNSA     198845 2024-02-28     2024-02-28  
#> 5 2020-02-29 ICNSA     216625 2024-02-28     2024-02-28  
#> 6 2020-03-07 ICNSA     199914 2024-02-28     2024-02-28

# Imputing values from our spline fit over the COVID years
newImputedvalues <- predict(spline_fit, x = as.numeric(dataInCovid$date))

dataInCovid$value <- newImputedvalues$y

# Bind the imputed COVID rows back onto the rest of the data
updatedIcnsaData <- bind_rows(dataNotInCovid, dataInCovid)

updatedIcnsaData <- updatedIcnsaData %>%
  filter(date < Sys.Date())

ggplot(updatedIcnsaData, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Claims",
       x = "Year",
       y = "Number")


weekly_ts <- ts(updatedIcnsaData$value, frequency = 52)

multiplicative_hw_model <- HoltWinters(weekly_ts, seasonal = "multiplicative")
multiplicative_forecast <- forecast(multiplicative_hw_model, h = 1)
cat("Multiplicative Model Forecast:\n")
#> Multiplicative Model Forecast:
print(multiplicative_forecast)
#>          Point Forecast  Lo 80    Hi 80  Lo 95  Hi 95
#> 57.05769         227059 185861 268257.1 164052 290066

additive_hw_model <- HoltWinters(weekly_ts, seasonal = "additive")
additive_forecast <- forecast(additive_hw_model, h = 1)
cat("Additive Model Forecast:\n")
#> Additive Model Forecast:
print(additive_forecast)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 57.05769       227225.7 185052.1 269399.3 162726.7 291724.6
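
# Sketch: the in-sample sum of squared errors gives a quick comparison of
# the two seasonal specifications (lower is better).
c(multiplicative = multiplicative_hw_model$SSE,
  additive = additive_hw_model$SSE)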

# Plotting the forecast from the multiplicative model
plot(multiplicative_forecast, main = "Holt-Winters Forecast with Multiplicative Seasonality", xlab = "Time", ylab = "Values")


summary(multiplicative_forecast)
#> 
#> Forecast method: HoltWinters
#> 
#> Model Information:
#> Holt-Winters exponential smoothing with trend and multiplicative seasonal component.
#> 
#> Call:
#> HoltWinters(x = weekly_ts, seasonal = "multiplicative")
#> 
#> Smoothing parameters:
#>  alpha: 0.363287
#>  beta : 0
#>  gamma: 1
#> 
#> Coefficients:
#>              [,1]
#> a    2.615162e+05
#> b   -6.023969e+02
#> s1   8.702453e-01
#> s2   1.058463e+00
#> s3   1.033361e+00
#> s4   1.049705e+00
#> s5   1.163360e+00
#> s6   1.163575e+00
#> s7   1.413893e+00
#> s8   1.422607e+00
#> s9   1.421077e+00
#> s10  1.350095e+00
#> s11  1.291571e+00
#> s12  1.313315e+00
#> s13  1.288542e+00
#> s14  1.248284e+00
#> s15  1.204924e+00
#> s16  1.253161e+00
#> s17  1.157585e+00
#> s18  1.021321e+00
#> s19  1.058974e+00
#> s20  1.061529e+00
#> s21  1.035407e+00
#> s22  8.573020e-01
#> s23  8.363360e-01
#> s24  9.078450e-01
#> s25  8.528292e-01
#> s26  8.128506e-01
#> s27  8.111990e-01
#> s28  8.276516e-01
#> s29  7.821558e-01
#> s30  8.438507e-01
#> s31  8.753853e-01
#> s32  9.206168e-01
#> s33  1.096786e+00
#> s34  1.029678e+00
#> s35  1.093594e+00
#> s36  1.097072e+00
#> s37  1.148801e+00
#> s38  1.107082e+00
#> s39  1.187394e+00
#> s40  9.418815e-01
#> s41  1.298725e+00
#> s42  1.049057e+00
#> s43  9.846624e-01
#> s44  1.061426e+00
#> s45  1.026028e+00
#> s46  1.209388e+00
#> s47  1.062952e+00
#> s48  8.648177e-01
#> s49  8.745151e-01
#> s50  8.021370e-01
#> s51  8.002309e-01
#> s52  7.568633e-01
#> 
#> Error measures:
#>                    ME     RMSE      MAE         MPE     MAPE      MASE
#> Training set 849.4492 32152.63 20977.83 -0.02160217 5.886518 0.3844578
#>                    ACF1
#> Training set 0.06343894
#> 
#> Forecasts:
#>          Point Forecast  Lo 80    Hi 80  Lo 95  Hi 95
#> 57.05769         227059 185861 268257.1 164052 290066

# Plotting the forecast from the additive model
plot(additive_forecast, main = "Holt-Winters Forecast with Additive Seasonality", xlab = "Time", ylab = "Values", col = "red")


summary(additive_forecast)
#> 
#> Forecast method: HoltWinters
#> 
#> Model Information:
#> Holt-Winters exponential smoothing with trend and additive seasonal component.
#> 
#> Call:
#> HoltWinters(x = weekly_ts, seasonal = "additive")
#> 
#> Smoothing parameters:
#>  alpha: 0.3287789
#>  beta : 0.0001124617
#>  gamma: 1
#> 
#> Coefficients:
#>            [,1]
#> a   239112.5244
#> b     -437.8078
#> s1  -11449.0607
#> s2   33732.1332
#> s3   32786.5257
#> s4   41215.7813
#> s5   67641.7108
#> s6   73068.0543
#> s7  122179.0641
#> s8  130824.3764
#> s9  137180.1524
#> s10 132911.9050
#> s11 136376.4436
#> s12 151045.7516
#> s13 155577.1443
#> s14 155240.7753
#> s15 150053.6403
#> s16 160442.2974
#> s17 140537.5172
#> s18 106087.7424
#> s19 110915.5912
#> s20 106510.7072
#> s21  95607.3828
#> s22  43957.1630
#> s23  30707.7583
#> s24  41615.2321
#> s25  21146.3871
#> s26   4993.1586
#> s27   -763.1147
#> s28  -1522.9617
#> s29 -15312.9822
#> s30  -4073.3996
#> s31   1281.6092
#> s32   8029.0751
#> s33  39089.1224
#> s34  25671.9992
#> s35  36567.0917
#> s36  37909.8039
#> s37  48972.5332
#> s38  43792.3493
#> s39  63229.6548
#> s40  14292.6538
#> s41  96536.9621
#> s42  42526.1296
#> s43  28387.6534
#> s44  50279.7644
#> s45  41980.0919
#> s46  93495.0491
#> s47  56546.2829
#> s48   1209.1054
#> s49   1948.2873
#> s50 -24106.1758
#> s51 -27845.8795
#> s52 -41180.5244
#> 
#> Error measures:
#>                    ME    RMSE      MAE       MPE     MAPE      MASE      ACF1
#> Training set 1554.788 32939.2 21998.03 0.2016047 6.274324 0.4031549 0.1270346
#> 
#> Forecasts:
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 57.05769       227225.7 185052.1 269399.3 162726.7 291724.6

Created on 2024-02-28 with reprex v2.1.0

Sree-lekshmi99 commented 8 months ago

HW3

library(tidyverse)    
#> Warning: package 'tidyverse' was built under R version 4.3.1
#> Warning: package 'ggplot2' was built under R version 4.3.1
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.3.1
#> Warning: package 'readr' was built under R version 4.3.1
#> Warning: package 'purrr' was built under R version 4.3.1
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.3.1
#> Warning: package 'forcats' was built under R version 4.3.1
#> Warning: package 'lubridate' was built under R version 4.3.1
library(fredr)        
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)     
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(imputeTS)     
#> Warning: package 'imputeTS' was built under R version 4.2.3

fredr_set_key('dee75be4acc1b9d6c94d9f59867a4910')

icnsa_data <- fredr(series_id = "ICNSA")

# Backup original data
original_icnsa <- icnsa_data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


start_date <- as.Date("2020-03-21") 
end_date <- as.Date("2021-02-21")

# Visualizing the COVID period
covidPlot <- ggplot(icnsa_data, aes(date, value)) +
  geom_line() +  
  geom_vline(xintercept = as.numeric(start_date), linetype = "dotdash", color = "darkred") +
  geom_vline(xintercept = as.numeric(end_date), linetype = "dotdash", color = "green") +
  # Define the labels for the plot
  ggtitle("Claims During Covid Period") +
  xlab("Year") +
  ylab("Claims Count")

covidPlot


icnsa_data_without_covid <- subset(icnsa_data, !(date >= start_date & date <= end_date))
acf(icnsa_data_without_covid$value)

pacf(icnsa_data_without_covid$value)


# Setting the values to NA for the COVID period
icnsa_data$value[icnsa_data$date >= start_date & icnsa_data$date <= end_date] <- NA

# Plot data with NA values to visualize the missing period
plot(icnsa_data$date, icnsa_data$value, type = "l", main = "ICNSA Data with Simulated Missing Values", xlab = "Date", ylab = "ICNSA Value")


# Impute missing values using a Kalman smoother (na_kalman is the current
# name; na.kalman is deprecated)
icnsa_data$value <- na_kalman(icnsa_data$value)

# Plotting imputed data to visualize the effect of imputation
plot(icnsa_data$date, icnsa_data$value, type = "l", col = "blue", main = "ICNSA Data After Kalman Filter Imputation", xlab = "Date", ylab = "ICNSA Value")
lines(original_icnsa$date, original_icnsa$value, col = "red") # Add original data for comparison

legend("topleft", legend=c("Imputed", "Original"), col=c("blue", "red"), lty=1, cex = 0.8)


icnsa_ts <- ts(icnsa_data$value, frequency = 52) 

# Fit the structural time series model
fit <- StructTS(icnsa_ts, type = "BSM")  #  Basic Structural Model

print(names(fit))
#>  [1] "coef"      "loglik"    "loglik0"   "data"      "residuals" "fitted"   
#>  [7] "call"      "series"    "code"      "model"     "model0"    "xtsp"
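
# Sketch: the fitted BSM variances show how much variation the level, slope,
# seasonal, and observation components each absorb.
print(fit$coef)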

#SPLINE
spline_fit <- smooth.spline(icnsa_data$date[!is.na(icnsa_data$value)], icnsa_data$value[!is.na(icnsa_data$value)])
spline_imputed <- predict(spline_fit, xout = as.numeric(icnsa_data$date))$y
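
# Sketch: overlaying the two imputations makes the spline vs. state-space
# comparison in the discussion below concrete.
plot(icnsa_data$date, icnsa_data$value, type = "l", col = "blue",
     main = "Kalman vs. Spline Imputation", xlab = "Date", ylab = "ICNSA Value")
lines(icnsa_data$date, spline_imputed, col = "orange")
legend("topleft", legend = c("Kalman", "Spline"), col = c("blue", "orange"), lty = 1, cex = 0.8)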

smoothed <- tsSmooth(fit)
print(str(smoothed))
#>  Time-Series [1:2983, 1:3] from 1 to 58.3: 285370 275800 270376 265487 261846 ...
#>  - attr(*, "dimnames")=List of 2
#>   ..$ : NULL
#>   ..$ : chr [1:3] "level" "slope" "sea"
#> NULL


# Extract the 'level' and 'seasonal' components
trend <- smoothed[, "level"]
seasonal <- smoothed[, "sea"]

# Plot the original data
plot(icnsa_data$date, icnsa_ts, type = "l", col = "black", main = "Original Data and Extracted Components", ylab = "ICNSA Value")

# Plot the trend component
lines(icnsa_data$date, trend, col = "blue", lty = 2)

# Plot the original data with the trend component added to the seasonal component
lines(icnsa_data$date, trend + seasonal, col = "green", lty = 3)

# Add a legend to the plot
legend("topright", legend = c("Original Data", "Trend", "Trend + Seasonal"), col = c("black", "blue", "green"), lty = 1:3, cex = 0.8)


library(KFAS)
#> Warning: package 'KFAS' was built under R version 4.2.3
#> Please cite KFAS in publications by using: 
#> 
#>   Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
irusa_data <- fredr(series_id = "IURSA")
irusa_data$date <- as.Date(irusa_data$date)
head(irusa_data)
#> # A tibble: 6 × 5
#>   date       series_id value realtime_start realtime_end
#>   <date>     <chr>     <dbl> <date>         <date>      
#> 1 1971-01-02 IURSA       3.9 2024-03-07     2024-03-07  
#> 2 1971-01-09 IURSA       4   2024-03-07     2024-03-07  
#> 3 1971-01-16 IURSA       3.9 2024-03-07     2024-03-07  
#> 4 1971-01-23 IURSA       3.9 2024-03-07     2024-03-07  
#> 5 1971-01-30 IURSA       3.9 2024-03-07     2024-03-07  
#> 6 1971-02-06 IURSA       4   2024-03-07     2024-03-07

modified_icnsa <- subset(icnsa_data, date >= as.Date("1967-01-07") & date <= as.Date("2024-02-17"))

merged_data <- merge(modified_icnsa, irusa_data, by = "date", all = FALSE)
head(merged_data)
#>         date series_id.x value.x realtime_start.x realtime_end.x series_id.y
#> 1 1971-01-02       ICNSA  443000       2024-03-07     2024-03-07       IURSA
#> 2 1971-01-09       ICNSA  500000       2024-03-07     2024-03-07       IURSA
#> 3 1971-01-16       ICNSA  452000       2024-03-07     2024-03-07       IURSA
#> 4 1971-01-23       ICNSA  399000       2024-03-07     2024-03-07       IURSA
#> 5 1971-01-30       ICNSA  353000       2024-03-07     2024-03-07       IURSA
#> 6 1971-02-06       ICNSA  375000       2024-03-07     2024-03-07       IURSA
#>   value.y realtime_start.y realtime_end.y
#> 1     3.9       2024-03-07     2024-03-07
#> 2     4.0       2024-03-07     2024-03-07
#> 3     3.9       2024-03-07     2024-03-07
#> 4     3.9       2024-03-07     2024-03-07
#> 5     3.9       2024-03-07     2024-03-07
#> 6     4.0       2024-03-07     2024-03-07

library(dlm)
#> Warning: package 'dlm' was built under R version 4.2.3
#> 
#> Attaching package: 'dlm'
#> The following object is masked from 'package:ggplot2':
#> 
#>     %+%

start_period <- as.Date("1967-01-07")
end_period <- as.Date("2024-02-17")

adjusted_icnsa <- icnsa_data[icnsa_data$date >= start_period & icnsa_data$date <= end_period, ]

# Combine 'adjusted_icnsa' with 'irusa_data' based on the date field
combined_series <- merge(adjusted_icnsa, irusa_data, by = "date")
combined_series <- combined_series[order(combined_series$date), ]

observation_matrix <- cbind(intercept = 1, covariate = combined_series$value.y)

# Defining dimensions for model components
number_of_observations <- nrow(combined_series)
number_of_state_variables <- ncol(observation_matrix)

# State equation components
state_transition_matrix <- diag(rep(1, number_of_state_variables))
state_noise_covariance <- diag(rep(0.01, number_of_state_variables))

# Observation equation components
observation_noise_covariance <- diag(rep(1, number_of_observations))
initial_conditions_vector <- rep(0, number_of_state_variables)
initial_covariance_matrix <- diag(rep(1e5, number_of_state_variables))

# Model with 'dlm'
ss_model <- dlm(
  FF = observation_matrix,  
  GG = state_transition_matrix,
  V = observation_noise_covariance,
  W = state_noise_covariance,
  m0 = initial_conditions_vector,
  C0 = initial_covariance_matrix
)

filtered_fit <- dlmFilter(merged_data$value.x, ss_model)

forecast_result <- dlmForecast(filtered_fit, nAhead = 1)

forecasted_next_value <- forecast_result$f[1]  

cat("Forecasted value for the next period:", forecasted_next_value, "\n")
#> Forecasted value for the next period: 400725.4

summary(ss_model)
#>    Length  Class  Mode   
#> m0       2 -none- numeric
#> C0       4 -none- numeric
#> FF    5546 -none- numeric
#> V  7689529 -none- numeric
#> GG       4 -none- numeric
#> W        4 -none- numeric

Created on 2024-03-07 with reprex v2.1.0

Sree-lekshmi99 commented 8 months ago

Comparison of spline and state space: when we compare the original data with the model's fitted values, they look almost the same. Normally we expect the trend line to be smoother, and even combined with the seasonal component it should still be less erratic than the original data.

- Visualized ICNSA Data: initial visualization of the ICNSA claims data to understand its trend over time.
- Identified COVID-19 Period: marked the COVID-19 period, March 21, 2020 to February 21, 2021, to simulate missing data.
- Simulated Missing Data: set the values in the COVID-19 period to NA to replicate missing-data conditions.
- Imputed Missing Data Using Kalman Filter: applied a Kalman smoother (via the na_kalman function in the imputeTS package) to fill in the missing data.
- Decomposed Data into Trend and Seasonal Components: used a structural time series model (StructTS) to break the data into trend and seasonal components.
- Incorporated IURSA as a Covariate: included the Insured Unemployment Rate (IURSA) and merged it with the ICNSA data, since changes in unemployment rates might affect the number of claims.
- Constructed and Fitted State Space Model: built a model with the dlm package combining the ICNSA series and the IURSA covariate, to capture how claims evolve over time given unemployment rates.
- Forecasted Next Value: used the fitted state space model (dlmForecast) to predict the next period's value.

My forecasted value is 400725.4.

Model setup:

- Defining Model Dimensions: I count how many data points I have (number_of_observations) and how many aspects of the data I'm tracking (number_of_state_variables).
- State Equation Components: I set up rules for how these aspects change over time. Here I assume they mostly stay the same (state_transition_matrix), with a little random change (state_noise_covariance).
- Observation Equation Components: I specify how much noise I think is in my data (observation_noise_covariance) and where I think my data starts from (initial_conditions_vector and initial_covariance_matrix).
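For comparison, a minimal sketch of the same regression idea built with dlm's stock constructor (reg_model and reg_filter are illustrative names, the dV/dW variances are placeholder assumptions, and combined_series is as built in the code above):

library(dlm)
# Dynamic regression of ICNSA on the IURSA covariate with a time-varying
# intercept and slope; dV is the observation variance, dW the state variances
reg_model <- dlmModReg(combined_series$value.y, addInt = TRUE,
                       dV = 1e9, dW = c(1e4, 1e-2))
reg_filter <- dlmFilter(combined_series$value.x, reg_model)
# One-step-ahead prediction from the last filtered state, assuming the
# latest covariate value persists
tail(reg_filter$m, 1) %*% c(1, tail(combined_series$value.y, 1))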

Sree-lekshmi99 commented 7 months ago

Homework 4

library(vars)
#> Loading required package: MASS
#> Loading required package: strucchange
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Loading required package: urca
#> Loading required package: lmtest
library(readr)
library(tseries)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(urca)
library(ggplot2)
library(reprex)
library(zoo)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:MASS':
#> 
#>     select
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyverse)
library(lubridate)
library(readxl)
library(vars)
library(BigVAR)
#> Loading required package: lattice

cdgvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/CDGVS.xlsx")
cdgvs_xl$Value <- as.numeric(cdgvs_xl$Value)
#> Warning: NAs introduced by coercion
cdgvs_xl <- cdgvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
cdgvs_xl <- cdgvs_xl %>% filter(format(Period, "%Y") != "2024")
head(cdgvs_xl)
#> # A tibble: 6 × 2
#>   Period     Value
#>   <date>     <dbl>
#> 1 1992-01-01 18331
#> 2 1992-02-01 19558
#> 3 1992-03-01 20770
#> 4 1992-04-01 21024
#> 5 1992-05-01 21813
#> 6 1992-06-01 21069
sum(is.na(cdgvs_xl))
#> [1] 0
ggplot(cdgvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of CDGVS Value", x="Period", y="Value")


cgvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/CGVS.xlsx")
cgvs_xl$Value <- as.numeric(cgvs_xl$Value)
#> Warning: NAs introduced by coercion
cgvs_xl <- cgvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
head(cgvs_xl)
#> # A tibble: 6 × 2
#>   Period     Value
#>   <date>     <dbl>
#> 1 1992-01-01 85583
#> 2 1992-02-01 86140
#> 3 1992-03-01 90380
#> 4 1992-04-01 89924
#> 5 1992-05-01 92558
#> 6 1992-06-01 93773

sum(is.na(cgvs_xl))
#> [1] 10
cgvs_xl <- cgvs_xl %>% filter(format(Period, "%Y") != "2024")
ggplot(cgvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of CGVS Value", x="Period", y="Value")


cngvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/CNGVS.xlsx")
cngvs_xl$Value <- as.numeric(cngvs_xl$Value)
#> Warning: NAs introduced by coercion
cngvs_xl <- cngvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
head(cngvs_xl)
#> # A tibble: 6 × 2
#>   Period     Value
#>   <date>     <dbl>
#> 1 1992-01-01 19302
#> 2 1992-02-01 19547
#> 3 1992-03-01 20556
#> 4 1992-04-01 20528
#> 5 1992-05-01 20703
#> 6 1992-06-01 20945
cngvs_xl <- cngvs_xl %>% filter(format(Period, "%Y") != "2024")
sum(is.na(cngvs_xl))
#> [1] 0
ggplot(cngvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of CNGVS Value", x="Period", y="Value")


dgvs_xl <- read_excel("/Users/jaskirat/Downloads/xlx/DGVS.xlsx")
dgvs_xl$Value <- as.numeric(dgvs_xl$Value)
#> Warning: NAs introduced by coercion
dgvs_xl <- dgvs_xl %>% mutate(Period = as.Date(paste0(Period, "-01"), format="%b-%Y-%d"))
head(dgvs_xl)
#> # A tibble: 6 × 2
#>   Period      Value
#>   <date>      <dbl>
#> 1 1992-01-01 117958
#> 2 1992-02-01 119895
#> 3 1992-03-01 124897
#> 4 1992-04-01 126174
#> 5 1992-05-01 127638
#> 6 1992-06-01 127867
dgvs_xl <- dgvs_xl %>% filter(format(Period, "%Y") != "2024")
sum(is.na(dgvs_xl))
#> [1] 0
ggplot(dgvs_xl, aes(x=Period, y=Value)) + geom_line() + labs(title="Time Series of DGVS Value", x="Period", y="Value")


#DECOMPOSITION

cdgvs_ts_data <- ts(cdgvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_cdgvs <- decompose(cdgvs_ts_data)
plot(decomposed_cdgvs)


cgvs_ts_data <- ts(cgvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_cgvs <- decompose(cgvs_ts_data)
plot(decomposed_cgvs)


cngvs_ts_data <- ts(cngvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_cngvs <- decompose(cngvs_ts_data)
plot(decomposed_cngvs)


dgvs_ts_data <- ts(dgvs_xl$Value, start=c(1992, 1), frequency=12)
decomposed_dgvs <- decompose(dgvs_ts_data)
plot(decomposed_dgvs)


#ADF TESTS

cdgvs_adf_result <- adf.test(cdgvs_ts_data)
cdgvs_adf_result
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  cdgvs_ts_data
#> Dickey-Fuller = -2.5158, Lag order = 7, p-value = 0.3593
#> alternative hypothesis: stationary

cgvs_adf_result <- adf.test(cgvs_ts_data)
cgvs_adf_result
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  cgvs_ts_data
#> Dickey-Fuller = -3.4213, Lag order = 7, p-value = 0.05057
#> alternative hypothesis: stationary

dgvs_adf_result <- adf.test(dgvs_ts_data)
dgvs_adf_result
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  dgvs_ts_data
#> Dickey-Fuller = -3.1486, Lag order = 7, p-value = 0.09684
#> alternative hypothesis: stationary

cngvs_adf_result <- adf.test(cngvs_ts_data)
cngvs_adf_result
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  cngvs_ts_data
#> Dickey-Fuller = -1.939, Lag order = 7, p-value = 0.6029
#> alternative hypothesis: stationary

#All the series turned out to be non-stationary; now we difference them once and run the ADF test again

series1_diff <- diff(cgvs_ts_data, differences = 1)
series2_diff <- diff(cdgvs_ts_data, differences = 1)
series3_diff <- diff(dgvs_ts_data, differences =1 )
series4_diff <- diff(cngvs_ts_data, differences = 1)

adf_test_series1 <- adf.test(series1_diff)
#> Warning in adf.test(series1_diff): p-value smaller than printed p-value
adf_test_series2 <- adf.test(series2_diff)
#> Warning in adf.test(series2_diff): p-value smaller than printed p-value
adf_test_series3 <- adf.test(series3_diff)
#> Warning in adf.test(series3_diff): p-value smaller than printed p-value
adf_test_series4 <- adf.test(series4_diff)
#> Warning in adf.test(series4_diff): p-value smaller than printed p-value

#Now combining all the series for the VAR model (note: the VAR below is fit on the levels, not the differenced series)

combined_ts <- cbind(cngvs_ts_data, cgvs_ts_data, cdgvs_ts_data, dgvs_ts_data)

str(combined_ts)
#>  Time-Series [1:384, 1:4] from 1992 to 2024: 19302 19547 20556 20528 20703 ...
#>  - attr(*, "dimnames")=List of 2
#>   ..$ : NULL
#>   ..$ : chr [1:4] "cngvs_ts_data" "cgvs_ts_data" "cdgvs_ts_data" "dgvs_ts_data"

df_simplified <- combined_ts[, c('dgvs_ts_data', 'cgvs_ts_data', 'cdgvs_ts_data', 'cngvs_ts_data')]

# Fit the VAR(1) model
var1.model <- VAR(combined_ts, p = 1, type = "const")
summary(var1.model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: cngvs_ts_data, cgvs_ts_data, cdgvs_ts_data, dgvs_ts_data 
#> Deterministic variables: const 
#> Sample size: 383 
#> Log Likelihood: -13280.047 
#> Roots of the characteristic polynomial:
#> 0.9938 0.9938 0.951 0.8017
#> Call:
#> VAR(y = combined_ts, p = 1, type = "const")
#> 
#> 
#> Estimation results for equation cngvs_ts_data: 
#> ============================================== 
#> cngvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1   1.008637   0.011792  85.536  < 2e-16 ***
#> cgvs_ts_data.l1    0.003807   0.001800   2.115  0.03512 *  
#> cdgvs_ts_data.l1   0.020447   0.010284   1.988  0.04750 *  
#> dgvs_ts_data.l1   -0.009731   0.003417  -2.848  0.00464 ** 
#> const            467.609580 174.828829   2.675  0.00781 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 475.9 on 378 degrees of freedom
#> Multiple R-Squared: 0.9947,  Adjusted R-squared: 0.9946 
#> F-statistic: 1.771e+04 on 4 and 378 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation cgvs_ts_data: 
#> ============================================= 
#> cgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1    0.38147    0.09422   4.049 6.25e-05 ***
#> cgvs_ts_data.l1     0.99597    0.01438  69.242  < 2e-16 ***
#> cdgvs_ts_data.l1   -0.17593    0.08217  -2.141   0.0329 *  
#> dgvs_ts_data.l1    -0.03884    0.02730  -1.423   0.1557    
#> const            1803.95692 1396.86072   1.291   0.1973    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 3802 on 378 degrees of freedom
#> Multiple R-Squared: 0.9928,  Adjusted R-squared: 0.9928 
#> F-statistic: 1.308e+04 on 4 and 378 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation cdgvs_ts_data: 
#> ============================================== 
#> cdgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1  1.655e-01  4.498e-02   3.679 0.000269 ***
#> cgvs_ts_data.l1  -6.417e-03  6.867e-03  -0.935 0.350637    
#> cdgvs_ts_data.l1  8.247e-01  3.923e-02  21.023  < 2e-16 ***
#> dgvs_ts_data.l1   9.257e-04  1.303e-02   0.071 0.943415    
#> const             1.083e+03  6.669e+02   1.624 0.105307    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 1815 on 378 degrees of freedom
#> Multiple R-Squared: 0.9131,  Adjusted R-squared: 0.9122 
#> F-statistic:   993 on 4 and 378 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation dgvs_ts_data: 
#> ============================================= 
#> dgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1    0.33772    0.10569   3.195  0.00151 ** 
#> cgvs_ts_data.l1     0.02932    0.01614   1.817  0.07003 .  
#> cdgvs_ts_data.l1   -0.06188    0.09217  -0.671  0.50238    
#> dgvs_ts_data.l1     0.91105    0.03063  29.748  < 2e-16 ***
#> const            4332.90201 1566.99267   2.765  0.00597 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 4265 on 378 degrees of freedom
#> Multiple R-Squared: 0.9865,  Adjusted R-squared: 0.9863 
#> F-statistic:  6893 on 4 and 378 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>               cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data        226472       880043        443968      1310690
#> cgvs_ts_data         880043     14457568       4658727     10910001
#> cdgvs_ts_data        443968      4658727       3295422      6548669
#> dgvs_ts_data        1310690     10910001       6548669     18193781
#> 
#> Correlation matrix of residuals:
#>               cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data        1.0000       0.4863        0.5139       0.6457
#> cgvs_ts_data         0.4863       1.0000        0.6749       0.6727
#> cdgvs_ts_data        0.5139       0.6749        1.0000       0.8457
#> dgvs_ts_data         0.6457       0.6727        0.8457       1.0000

#Selecting the optimal lag for the VAR model
var_result <- VARselect(combined_ts, lag.max = 10, type = "both")
optimal_lag <- var_result$selection
print(optimal_lag)
#> AIC(n)  HQ(n)  SC(n) FPE(n) 
#>      4      4      2      4

# Fit the VAR(4) model based on the optimal lag length 
var4.model <- VAR(combined_ts, p = 4, type = "const")
summary(var4.model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: cngvs_ts_data, cgvs_ts_data, cdgvs_ts_data, dgvs_ts_data 
#> Deterministic variables: const 
#> Sample size: 380 
#> Log Likelihood: -13054.148 
#> Roots of the characteristic polynomial:
#> 0.9877 0.9877 0.971 0.8944 0.8141 0.7398 0.7398 0.6232 0.6232 0.501 0.501 0.4052 0.351 0.351 0.3106 0.04926
#> Call:
#> VAR(y = combined_ts, p = 4, type = "const")
#> 
#> 
#> Estimation results for equation cngvs_ts_data: 
#> ============================================== 
#> cngvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1  9.698e-01  6.599e-02  14.695  < 2e-16 ***
#> cgvs_ts_data.l1   4.028e-02  9.091e-03   4.431 1.25e-05 ***
#> cdgvs_ts_data.l1  2.005e-02  2.683e-02   0.747  0.45542    
#> dgvs_ts_data.l1  -2.521e-02  1.250e-02  -2.016  0.04454 *  
#> cngvs_ts_data.l2  1.626e-01  8.769e-02   1.855  0.06445 .  
#> cgvs_ts_data.l2  -3.912e-02  1.423e-02  -2.749  0.00627 ** 
#> cdgvs_ts_data.l2 -3.478e-02  3.578e-02  -0.972  0.33175    
#> dgvs_ts_data.l2   1.117e-02  1.499e-02   0.745  0.45668    
#> cngvs_ts_data.l3  6.702e-03  8.799e-02   0.076  0.93933    
#> cgvs_ts_data.l3   2.172e-03  1.444e-02   0.150  0.88054    
#> cdgvs_ts_data.l3 -2.362e-02  3.488e-02  -0.677  0.49868    
#> dgvs_ts_data.l3   2.747e-02  1.506e-02   1.824  0.06893 .  
#> cngvs_ts_data.l4 -1.433e-01  6.721e-02  -2.132  0.03371 *  
#> cgvs_ts_data.l4  -6.053e-04  9.634e-03  -0.063  0.94994    
#> cdgvs_ts_data.l4  5.705e-02  2.463e-02   2.317  0.02108 *  
#> dgvs_ts_data.l4  -1.944e-02  1.225e-02  -1.588  0.11323    
#> const             3.535e+02  1.741e+02   2.030  0.04305 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 454 on 363 degrees of freedom
#> Multiple R-Squared: 0.9952,  Adjusted R-squared: 0.995 
#> F-statistic:  4691 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation cgvs_ts_data: 
#> ============================================= 
#> cgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1    0.34996    0.50556   0.692  0.48925    
#> cgvs_ts_data.l1     1.36455    0.06965  19.593  < 2e-16 ***
#> cdgvs_ts_data.l1   -0.27807    0.20557  -1.353  0.17700    
#> dgvs_ts_data.l1    -0.05367    0.09579  -0.560  0.57560    
#> cngvs_ts_data.l2    1.41584    0.67180   2.108  0.03576 *  
#> cgvs_ts_data.l2    -0.44419    0.10902  -4.074 5.67e-05 ***
#> cdgvs_ts_data.l2   -0.34167    0.27413  -1.246  0.21344    
#> dgvs_ts_data.l2     0.03328    0.11482   0.290  0.77211    
#> cngvs_ts_data.l3   -1.17569    0.67409  -1.744  0.08198 .  
#> cgvs_ts_data.l3     0.10899    0.11065   0.985  0.32526    
#> cdgvs_ts_data.l3   -0.26208    0.26721  -0.981  0.32734    
#> dgvs_ts_data.l3     0.26851    0.11536   2.328  0.02048 *  
#> cngvs_ts_data.l4   -0.28667    0.51492  -0.557  0.57806    
#> cgvs_ts_data.l4    -0.05064    0.07381  -0.686  0.49309    
#> cdgvs_ts_data.l4    0.60871    0.18866   3.227  0.00137 ** 
#> dgvs_ts_data.l4    -0.24149    0.09383  -2.574  0.01045 *  
#> const             917.57323 1333.82070   0.688  0.49194    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 3478 on 363 degrees of freedom
#> Multiple R-Squared: 0.9941,  Adjusted R-squared: 0.9939 
#> F-statistic:  3831 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation cdgvs_ts_data: 
#> ============================================== 
#> cdgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1   0.426004   0.244967   1.739 0.082877 .  
#> cgvs_ts_data.l1    0.112590   0.033746   3.336 0.000937 ***
#> cdgvs_ts_data.l1   0.951656   0.099608   9.554  < 2e-16 ***
#> dgvs_ts_data.l1   -0.080570   0.046414  -1.736 0.083428 .  
#> cngvs_ts_data.l2   0.644588   0.325517   1.980 0.048436 *  
#> cgvs_ts_data.l2   -0.145373   0.052826  -2.752 0.006222 ** 
#> cdgvs_ts_data.l2  -0.104645   0.132828  -0.788 0.431315    
#> dgvs_ts_data.l2   -0.074377   0.055637  -1.337 0.182115    
#> cngvs_ts_data.l3  -0.594116   0.326624  -1.819 0.069741 .  
#> cgvs_ts_data.l3    0.021015   0.053613   0.392 0.695302    
#> cdgvs_ts_data.l3  -0.158001   0.129476  -1.220 0.223136    
#> dgvs_ts_data.l3    0.189997   0.055896   3.399 0.000751 ***
#> cngvs_ts_data.l4  -0.373582   0.249503  -1.497 0.135183    
#> cgvs_ts_data.l4    0.005635   0.035764   0.158 0.874882    
#> cdgvs_ts_data.l4   0.157391   0.091412   1.722 0.085964 .  
#> dgvs_ts_data.l4   -0.025975   0.045463  -0.571 0.568124    
#> const            728.323981 646.294010   1.127 0.260519    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 1685 on 363 degrees of freedom
#> Multiple R-Squared: 0.9256,  Adjusted R-squared: 0.9223 
#> F-statistic: 282.1 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation dgvs_ts_data: 
#> ============================================= 
#> dgvs_ts_data = cngvs_ts_data.l1 + cgvs_ts_data.l1 + cdgvs_ts_data.l1 + dgvs_ts_data.l1 + cngvs_ts_data.l2 + cgvs_ts_data.l2 + cdgvs_ts_data.l2 + dgvs_ts_data.l2 + cngvs_ts_data.l3 + cgvs_ts_data.l3 + cdgvs_ts_data.l3 + dgvs_ts_data.l3 + cngvs_ts_data.l4 + cgvs_ts_data.l4 + cdgvs_ts_data.l4 + dgvs_ts_data.l4 + const 
#> 
#>                    Estimate Std. Error t value Pr(>|t|)    
#> cngvs_ts_data.l1    1.35219    0.55341   2.443 0.015026 *  
#> cgvs_ts_data.l1     0.38841    0.07624   5.095 5.63e-07 ***
#> cdgvs_ts_data.l1    0.50328    0.22503   2.237 0.025924 *  
#> dgvs_ts_data.l1     0.47477    0.10485   4.528 8.09e-06 ***
#> cngvs_ts_data.l2    0.94609    0.73538   1.287 0.199077    
#> cgvs_ts_data.l2    -0.31706    0.11934  -2.657 0.008237 ** 
#> cdgvs_ts_data.l2   -0.75683    0.30007  -2.522 0.012091 *  
#> dgvs_ts_data.l2     0.17962    0.12569   1.429 0.153851    
#> cngvs_ts_data.l3   -1.55298    0.73788  -2.105 0.036008 *  
#> cgvs_ts_data.l3     0.02873    0.12112   0.237 0.812600    
#> cdgvs_ts_data.l3   -0.73967    0.29250  -2.529 0.011869 *  
#> dgvs_ts_data.l3     0.69092    0.12628   5.471 8.33e-08 ***
#> cngvs_ts_data.l4   -0.55131    0.56366  -0.978 0.328682    
#> cgvs_ts_data.l4    -0.09118    0.08079  -1.129 0.259849    
#> cdgvs_ts_data.l4    0.85035    0.20651   4.118 4.74e-05 ***
#> dgvs_ts_data.l4    -0.37438    0.10271  -3.645 0.000306 ***
#> const            2711.78203 1460.05415   1.857 0.064076 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 3807 on 363 degrees of freedom
#> Multiple R-Squared: 0.9892,  Adjusted R-squared: 0.9888 
#> F-statistic:  2087 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>               cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data        206133       651228        362505      1045325
#> cgvs_ts_data         651228     12098358       3792233      8209165
#> cdgvs_ts_data        362505      3792233       2840480      5483311
#> dgvs_ts_data        1045325      8209165       5483311     14496710
#> 
#> Correlation matrix of residuals:
#>               cngvs_ts_data cgvs_ts_data cdgvs_ts_data dgvs_ts_data
#> cngvs_ts_data        1.0000       0.4124        0.4737       0.6047
#> cgvs_ts_data         0.4124       1.0000        0.6469       0.6199
#> cdgvs_ts_data        0.4737       0.6469        1.0000       0.8545
#> dgvs_ts_data         0.6047       0.6199        0.8545       1.0000
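
# Sketch: a Portmanteau test on the VAR(4) residuals checks whether lag 4 is
# enough to remove serial correlation.
serial.test(var4.model, lags.pt = 16, type = "PT.asymptotic")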

#Forecasting the values

forecast <- predict(var4.model, n.ahead = 1, ci = 0.95) # ci sets the confidence level at 0.95
print(forecast)
#> $cngvs_ts_data
#>                        fcst   lower    upper       CI
#> cngvs_ts_data.fcst 47709.96 46820.1 48599.82 889.8605
#> 
#> $cgvs_ts_data
#>                       fcst  lower    upper       CI
#> cgvs_ts_data.fcst 244795.3 237978 251612.5 6817.283
#> 
#> $cdgvs_ts_data
#>                        fcst    lower    upper       CI
#> cdgvs_ts_data.fcst 44109.68 40806.41 47412.95 3303.269
#> 
#> $dgvs_ts_data
#>                       fcst    lower    upper       CI
#> dgvs_ts_data.fcst 280796.2 273333.7 288258.7 7462.474

#Granger Test

granger_test <- causality(var4.model)
#> Warning in causality(var4.model): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (cngvs_ts_data) as cause variable.
print(granger_test)
#> $Granger
#> 
#>  Granger causality H0: cngvs_ts_data do not Granger-cause cgvs_ts_data
#>  cdgvs_ts_data dgvs_ts_data
#> 
#> data:  VAR object var4.model
#> F-Test = 3.0066, df1 = 12, df2 = 1452, p-value = 0.0003539
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: cngvs_ts_data and cgvs_ts_data
#>  cdgvs_ts_data dgvs_ts_data
#> 
#> data:  VAR object var4.model
#> Chi-squared = 104.26, df = 3, p-value < 2.2e-16
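
# Sketch: the warning above notes that 'cause' defaulted to the first
# variable; a specific direction can be tested explicitly, for example:
causality(var4.model, cause = "cgvs_ts_data")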

#Apply BigVAR with the Basic structure

combined_matrix <- cbind(cgvs_ts_data, cdgvs_ts_data, cngvs_ts_data, dgvs_ts_data)

sparse_var_fit <- BigVAR.fit(Y = combined_matrix, struct = "Basic", p = 2, lambda = 1)[, , 1]

print(sparse_var_fit)
#>           [,1]       [,2]        [,3]      [,4]       [,5]       [,6]
#> [1,]  840.3703 1.16271304 -0.14150564 0.2421101 0.07491145 -0.1917155
#> [2,]  344.6862 0.11765031  0.38418171 0.1352681 0.16302836 -0.1460612
#> [3,]  692.7337 0.05780321  0.08645465 0.4141812 0.01270520 -0.0436268
#> [4,] 4041.2572 0.33713395  0.11909525 0.1514570 0.76268669 -0.3106586
#>             [,7]       [,8]        [,9]
#> [1,] -0.22813109 0.21367517 -0.07064288
#> [2,]  0.23767560 0.10750817 -0.12139601
#> [3,]  0.07910418 0.40037979 -0.02287711
#> [4,] -0.14835613 0.09192961  0.16221371

summary(sparse_var_fit)
#>        V1               V2               V3                 V4        
#>  Min.   : 344.7   Min.   :0.0578   Min.   :-0.14151   Min.   :0.1353  
#>  1st Qu.: 605.7   1st Qu.:0.1027   1st Qu.: 0.02946   1st Qu.:0.1474  
#>  Median : 766.6   Median :0.2274   Median : 0.10278   Median :0.1968  
#>  Mean   :1479.8   Mean   :0.4188   Mean   : 0.11206   Mean   :0.2358  
#>  3rd Qu.:1640.6   3rd Qu.:0.5435   3rd Qu.: 0.18537   3rd Qu.:0.2851  
#>  Max.   :4041.3   Max.   :1.1627   Max.   : 0.38418   Max.   :0.4142  
#>        V5                V6                 V7                 V8         
#>  Min.   :0.01271   Min.   :-0.31066   Min.   :-0.22813   Min.   :0.09193  
#>  1st Qu.:0.05936   1st Qu.:-0.22145   1st Qu.:-0.16830   1st Qu.:0.10361  
#>  Median :0.11897   Median :-0.16889   Median :-0.03463   Median :0.16059  
#>  Mean   :0.25333   Mean   :-0.17302   Mean   :-0.01493   Mean   :0.20337  
#>  3rd Qu.:0.31294   3rd Qu.:-0.12045   3rd Qu.: 0.11875   3rd Qu.:0.26035  
#>  Max.   :0.76269   Max.   :-0.04363   Max.   : 0.23768   Max.   :0.40038  
#>        V9          
#>  Min.   :-0.12140  
#>  1st Qu.:-0.08333  
#>  Median :-0.04676  
#>  Mean   :-0.01318  
#>  3rd Qu.: 0.02340  
#>  Max.   : 0.16221
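
# Sketch: instead of fixing lambda = 1, BigVAR can select the penalty by
# rolling cross-validation via constructModel()/cv.BigVAR() (the gran values
# here are illustrative grid settings).
big_spec <- constructModel(as.matrix(combined_matrix), p = 2, struct = "Basic",
                           gran = c(50, 10))
big_cv <- cv.BigVAR(big_spec)
plot(big_cv)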

Created on 2024-04-11 with reprex v2.1.0

Sree-lekshmi99 commented 7 months ago

Consumer Durable Goods (CDGVS): Focuses on long-lasting goods like electronics and furniture, capturing their shipment values. It's an indicator of consumer confidence and economic health.

Consumer Goods (CGVS): Represents all consumer goods, both durable and nondurable. This dataset helps understand overall consumer spending, a key economic driver.

Consumer Nondurable Goods (CNGVS): Targets short-lived goods such as food and clothing, measuring their shipment values. Reflects essential, less volatile consumer spending.

Durable Goods (DGVS): Includes durable goods beyond just consumer items, covering business and government purchases. It's a sign of broader economic activity and investment trends.

Identified 10 null values at the end of the series in 2024, reflecting the data-collection period, and removed those dates.

ADF tests revealed non-stationarity, so first-order differencing (d = 1) was applied; after the first difference, all the series are stationary.

Then applied a VAR model, starting with p = 1; lag selection indicated an optimal lag of 4, so a VAR(4) was fit.

The VAR(4) fit better and captured more of the dynamics. Forecasts were produced with 95% confidence intervals, and a sparse VAR was also fit with BigVAR (Basic structure).
VAR4 fit better and captured more data and did bigVAR with threshold of 0.95 confidence intervals