jlivsey / UB-sping24-time-series


Disha Maru #28

Open dmarub opened 7 months ago

dmarub commented 7 months ago
# Loading the libraries
library(fredr)    
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast) 
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.3
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.3
#> Warning: package 'readr' was built under R version 4.2.3
#> Warning: package 'purrr' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.3
#> Warning: package 'forcats' was built under R version 4.2.3
#> Warning: package 'lubridate' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
# Setting FRED API key and fetching the data
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1') # Replace with your actual FRED API key
claims_data <- fredr(series_id = 'ICNSA')
# Transforming DATE column to Date format
claims_data <- claims_data %>%
  mutate(date = as.Date(date)) 

# Selecting relevant columns
claims_data <- claims_data %>%
  select(date, value)

# Converting the data to a ts object assuming weekly frequency
claims_ts <- ts(claims_data$value, frequency = 52)
# Fitting an Auto ARIMA model
auto_arima_model <- auto.arima(claims_ts)
# Summary of the fitted model
cat("Summary of the fitted Auto ARIMA model:\n")
#> Summary of the fitted Auto ARIMA model:
summary(auto_arima_model)
#> Series: claims_ts 
#> ARIMA(1,0,4)(1,0,2)[52] with non-zero mean 
#> 
#> Coefficients:
#>          ar1     ma1     ma2      ma3      ma4    sar1     sma1    sma2
#>       0.8661  0.5856  0.1167  -0.0737  -0.0110  0.4014  -0.0976  0.0890
#> s.e.  0.0156  0.0250  0.0327   0.0322   0.0229  0.0522   0.0532  0.0233
#>            mean
#>       365177.64
#> s.e.   29265.14
#> 
#> sigma^2 = 6.626e+09:  log likelihood = -37911.62
#> AIC=75843.24   AICc=75843.32   BIC=75903.24
#> 
#> Training set error measures:
#>                   ME    RMSE      MAE       MPE     MAPE    MASE         ACF1
#> Training set -122.68 81276.2 35971.29 -1.514057 9.208137 0.37881 0.0005084039
cat("\n")
# Forecasting the next 12 weeks
forecast_arima <- forecast(auto_arima_model, h = 12)
cat("Forecast Summary:\n")
#> Forecast Summary:
print(summary(forecast_arima))
#> 
#> Forecast method: ARIMA(1,0,4)(1,0,2)[52] with non-zero mean
#> 
#> Model Information:
#> Series: claims_ts 
#> ARIMA(1,0,4)(1,0,2)[52] with non-zero mean 
#> 
#> Coefficients:
#>          ar1     ma1     ma2      ma3      ma4    sar1     sma1    sma2
#>       0.8661  0.5856  0.1167  -0.0737  -0.0110  0.4014  -0.0976  0.0890
#> s.e.  0.0156  0.0250  0.0327   0.0322   0.0229  0.0522   0.0532  0.0233
#>            mean
#>       365177.64
#> s.e.   29265.14
#> 
#> sigma^2 = 6.626e+09:  log likelihood = -37911.62
#> AIC=75843.24   AICc=75843.32   BIC=75903.24
#> 
#> Error measures:
#>                   ME    RMSE      MAE       MPE     MAPE    MASE         ACF1
#> Training set -122.68 81276.2 35971.29 -1.514057 9.208137 0.37881 0.0005084039
#> 
#> Forecasts:
#>          Point Forecast      Lo 80    Hi 80      Lo 95    Hi 95
#> 58.28846       216867.3  112550.00 321184.7   57327.74 376406.9
#> 58.30769       216113.2   32224.65 400001.7  -65120.04 497346.4
#> 58.32692       224383.2   -8765.06 457531.4 -132186.28 580952.6
#> 58.34615       248570.4  -12041.02 509181.9 -150000.39 647141.3
#> 58.36538       246868.3  -32166.61 525903.2 -179878.78 673615.4
#> 58.38462       219080.3  -73012.11 511172.7 -227636.48 665797.1
#> 58.40385       191843.2 -109672.55 493358.9 -269285.33 652971.7
#> 58.42308       189735.3 -118659.81 498130.4 -281914.31 661384.9
#> 58.44231       222048.5  -91407.62 535504.6 -257341.28 701438.3
#> 58.46154       229566.8  -87632.59 546766.1 -255547.78 714681.3
#> 58.48077       240890.5  -79087.89 560868.8 -248474.20 730255.1
#> 58.50000       251128.2  -70918.90 573175.3 -241400.34 743656.8
cat("\n")
# ACF and PACF plots 
acf(claims_ts, main = "Autocorrelation Function")

pacf(claims_ts, main = "Partial Autocorrelation Function")

# Combining actual and forecasted values for comparison
# Extracting actual values up to the last date in the training dataset
actual_data <- claims_data %>%
  filter(date <= max(claims_data$date))

# Creating a dataframe for forecasted values
forecast_data <- data.frame(
  date = seq(from = max(claims_data$date) + 1, by = "week", length.out = 12),
  value = forecast_arima$mean
)
# Combining actual and forecasted data
combined_data <- bind_rows(actual_data, forecast_data) %>%
  mutate(type = if_else(date <= max(actual_data$date), "Actual", "Forecast"))
# Printing the combined data
cat("Combined Actual and Forecasted Values:\n")
#> Combined Actual and Forecasted Values:
print(combined_data)
#> # A tibble: 2,991 × 3
#>    date        value type  
#>    <date>      <dbl> <chr> 
#>  1 1967-01-07 346000 Actual
#>  2 1967-01-14 334000 Actual
#>  3 1967-01-21 277000 Actual
#>  4 1967-01-28 252000 Actual
#>  5 1967-02-04 274000 Actual
#>  6 1967-02-11 276000 Actual
#>  7 1967-02-18 247000 Actual
#>  8 1967-02-25 248000 Actual
#>  9 1967-03-04 326000 Actual
#> 10 1967-03-11 240000 Actual
#> # ℹ 2,981 more rows
cat("\n")
# Plotting actual vs forecasted values
ggplot(combined_data, aes(x = date, y = value, color = type)) +
  geom_line() +
  labs(title = "Actual vs Forecasted Weekly Unemployment Claims", x = "Date", y = "Claims") +
  scale_color_manual(values = c("Actual" = "blue", "Forecast" = "red"))

# Calculating and printing the mean of the forecasted values
forecast_mean <- mean(forecast_arima$mean, na.rm = TRUE)
cat("Mean Forecasted Value:\n")
#> Mean Forecasted Value:
print(forecast_mean)
#> [1] 224757.9
# Forecasting for next week
next_week_forecast <- predict(auto_arima_model, n.ahead = 1)
cat("Next week's forecasted value:\n")
#> Next week's forecasted value:
print(next_week_forecast$pred)
#> Time Series:
#> Start = c(58, 16) 
#> End = c(58, 16) 
#> Frequency = 52 
#> [1] 216867.3
## Excluding the COVID-19 outlier year - 2020

# Transforming DATE column to Date format
claims_data <- claims_data %>%
  mutate(date = as.Date(date)) # Adjust the date format if needed

# Filtering out the COVID-19 year 
claims_data_filtered <- claims_data %>%
  filter(year(date) != 2020)

# Printing the years to check if 2020 has been removed
cat("Unique years in the filtered dataset:\n")
#> Unique years in the filtered dataset:
print(unique(year(claims_data_filtered$date)))
#>  [1] 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
#> [16] 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
#> [31] 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
#> [46] 2012 2013 2014 2015 2016 2017 2018 2019 2021 2022 2023 2024
# Selecting relevant columns
claims_data_filtered <- claims_data_filtered %>%
  select(date, value)

# Converting the filtered data to a ts object assuming weekly frequency
claims_ts_filtered <- ts(claims_data_filtered$value, frequency = 52)

# Fitting an Auto ARIMA model on the filtered data
auto_arima_model_filtered <- auto.arima(claims_ts_filtered)

# Summary of the fitted model on filtered data
cat("Summary of the fitted Auto ARIMA model on filtered data:\n")
#> Summary of the fitted Auto ARIMA model on filtered data:
summary(auto_arima_model_filtered)
#> Series: claims_ts_filtered 
#> ARIMA(1,0,1)(0,1,1)[52] 
#> 
#> Coefficients:
#>          ar1      ma1     sma1
#>       0.9754  -0.4395  -0.2591
#> s.e.  0.0044   0.0175   0.0194
#> 
#> sigma^2 = 1.262e+09:  log likelihood = -34205.17
#> AIC=68418.34   AICc=68418.36   BIC=68442.2
#> 
#> Training set error measures:
#>                     ME     RMSE     MAE        MPE     MAPE      MASE
#> Training set -6.692661 35192.16 20838.5 -0.4117156 5.736247 0.3460124
#>                      ACF1
#> Training set -0.007921028
cat("\n")

# Forecasting the next week using the filtered model
next_week_forecast_filtered <- forecast(auto_arima_model_filtered, h = 1)
cat("Next week's forecasted value (after removing COVID-19 year):\n")
#> Next week's forecasted value (after removing COVID-19 year):
print(next_week_forecast_filtered$mean)
#> Time Series:
#> Start = c(57, 16) 
#> End = c(57, 16) 
#> Frequency = 52 
#> [1] 236030.2
## The next-week prediction is 216867.3 before removing the COVID year and 236030.2 after removing it.
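# Optional quick check of how much the next-week forecast shifts once the COVID
# year is dropped (difference of the two point forecasts produced above):
forecast_shift <- next_week_forecast_filtered$mean[1] - next_week_forecast$pred[1]
cat("Shift in next-week forecast after excluding 2020:", round(forecast_shift, 1), "\n")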

Created on 2024-02-09 with reprex v2.0.2

vgunturi commented 7 months ago

@dmarub - The autocorrelation plot is not decaying, and it is significant for many lags. I ran into the same issue in my ICNSA analysis. Please let me know if you found a way to resolve it.

dmarub commented 7 months ago

Homework 1 - UBLearns

# Loading the libraries
library(fredr)    
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast) 
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.3
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.3
#> Warning: package 'readr' was built under R version 4.2.3
#> Warning: package 'purrr' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.3
#> Warning: package 'forcats' was built under R version 4.2.3
#> Warning: package 'lubridate' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(ggplot2)
library(scales)
#> 
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#> 
#>     discard
#> The following object is masked from 'package:readr':
#> 
#>     col_factor
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(dplyr)
library(readr)
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(forecast)
# Setting FRED API key and fetching the data
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1') 
claims1_data <- fredr(series_id = 'ICNSA')
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1') 
claims2_data <- fredr(series_id = 'IURNSA')
colnames(claims1_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
colnames(claims2_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
# Converting the date columns to Date type
claims1_data$date <- as.Date(claims1_data$date)
claims2_data$date <- as.Date(claims2_data$date)
# Merging the datasets by date to align them
data_merged <- left_join(claims1_data,claims2_data , by = "date")
str(data_merged)
#> tibble [2,980 × 9] (S3: tbl_df/tbl/data.frame)
#>  $ date            : Date[1:2980], format: "1967-01-07" "1967-01-14" ...
#>  $ series_id.x     : chr [1:2980] "ICNSA" "ICNSA" "ICNSA" "ICNSA" ...
#>  $ value.x         : num [1:2980] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#>  $ realtime_start.x: Date[1:2980], format: "2024-02-22" "2024-02-22" ...
#>  $ realtime_end.x  : Date[1:2980], format: "2024-02-22" "2024-02-22" ...
#>  $ series_id.y     : chr [1:2980] NA NA NA NA ...
#>  $ value.y         : num [1:2980] NA NA NA NA NA NA NA NA NA NA ...
#>  $ realtime_start.y: Date[1:2980], format: NA NA ...
#>  $ realtime_end.y  : Date[1:2980], format: NA NA ...
summary(data_merged)
#>       date            series_id.x           value.x        realtime_start.x    
#>  Min.   :1967-01-07   Length:2980        Min.   : 133000   Min.   :2024-02-22  
#>  1st Qu.:1981-04-16   Class :character   1st Qu.: 265800   1st Qu.:2024-02-22  
#>  Median :1995-07-25   Mode  :character   Median : 326919   Median :2024-02-22  
#>  Mean   :1995-07-25                      Mean   : 365013   Mean   :2024-02-22  
#>  3rd Qu.:2009-11-01                      3rd Qu.: 406725   3rd Qu.:2024-02-22  
#>  Max.   :2024-02-10                      Max.   :6161268   Max.   :2024-02-22  
#>                                                                                
#>  realtime_end.x       series_id.y           value.y       realtime_start.y    
#>  Min.   :2024-02-22   Length:2980        Min.   : 0.800   Min.   :2024-02-22  
#>  1st Qu.:2024-02-22   Class :character   1st Qu.: 1.900   1st Qu.:2024-02-22  
#>  Median :2024-02-22   Mode  :character   Median : 2.500   Median :2024-02-22  
#>  Mean   :2024-02-22                      Mean   : 2.759   Mean   :2024-02-22  
#>  3rd Qu.:2024-02-22                      3rd Qu.: 3.300   3rd Qu.:2024-02-22  
#>  Max.   :2024-02-22                      Max.   :15.800   Max.   :2024-02-22  
#>                                          NA's   :209      NA's   :209         
#>  realtime_end.y      
#>  Min.   :2024-02-22  
#>  1st Qu.:2024-02-22  
#>  Median :2024-02-22  
#>  Mean   :2024-02-22  
#>  3rd Qu.:2024-02-22  
#>  Max.   :2024-02-22  
#>  NA's   :209
#plotting ICNSA and IURNSA over time to identify trend
gg <- ggplot(data_merged, aes(x = date)) +
  geom_line(aes(y = value.x, colour = "ICNSA"), size = 1) +
  labs(title = "ICNSA and IURNSA Over Time", x = "Year", y = "ICNSA Value")
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.

# Add the IURNSA dataset with a secondary axis
# The scale factor is used to align the secondary y-axis with the primary y-axis
max_value_x <- max(data_merged$value.x, na.rm = TRUE)
max_value_y <- max(data_merged$value.y, na.rm = TRUE)
scale_factor <- max_value_x / max_value_y

# Add IURNSA values to the plot, scaled and with a new y-axis on the right
# Also, specify the color as part of the aesthetics for the legend
gg <- gg + geom_line(aes(y = value.y * scale_factor, colour = "IURNSA"), linetype = "dashed", size = 1) +
  scale_y_continuous(
    name = "ICNSA Value",
    sec.axis = sec_axis(~./scale_factor, name = "IURNSA Value", breaks = pretty_breaks(n = 5))
  ) +
  scale_colour_manual("", 
                      values = c("ICNSA" = "blue", "IURNSA" = "red"),
                      breaks = c("ICNSA", "IURNSA"),
                      labels = c("ICNSA", "IURNSA")) +
  theme(legend.position = "bottom")

# Output the plot
print(gg)
#> Warning: Removed 209 rows containing missing values (`geom_line()`).

## Both ICNSA and IURNSA show economic cycles of boom and recession. Periods of economic downturn appear as spikes in the data, particularly the sharp increases in ICNSA, which is expected as more individuals file for unemployment claims. The most significant spike in ICNSA occurs around 2020, which likely corresponds to the COVID-19 pandemic's impact on the job market. IURNSA appears to mirror these trends but at a different scale, as indicated by the secondary y-axis on the right. Visually there appears to be a correlation between the two series, as they seem to move together over time, especially during significant economic events.
# The frequency of both the ICNSA and IURNSA datasets is weekly, as the common difference between dates is 7 days, so I can directly compute the correlation between them as long as it covers the same time period. Hence, finding the common date range:
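# Sanity check on the weekly-frequency claim above: the spacing between
# consecutive observation dates should be 7 days in both series.
table(as.numeric(diff(claims1_data$date)))
table(as.numeric(diff(claims2_data$date)))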

common_start_date <- max(min(claims1_data$date), min(claims2_data$date))
common_end_date <- min(max(claims1_data$date), max(claims2_data$date))

# Filtering both datasets to the common date range using the column names
icnsa_common <- filter(claims1_data, date >= common_start_date & date <= common_end_date)
iurnsa_common <- filter(claims2_data, date >= common_start_date & date <= common_end_date)

# Merging the datasets on the 'date' column
merged_data <- inner_join(icnsa_common, iurnsa_common, by = "date")

# Calculating the correlation between the 'value' columns which represent ICNSA and IURNSA values
correlation <- cor(merged_data$value.x, merged_data$value.y, use = "complete.obs")

# Printing the correlation
print(correlation)
#> [1] 0.6052214
## The correlation coefficient between the ICNSA and IURNSA values over the common date range is 0.6052214, indicating a moderate positive relationship between the two series.
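# The single correlation above is contemporaneous; a cross-correlation plot is
# one way to check whether IURNSA leads or lags ICNSA over the common range.
ccf(merged_data$value.y, merged_data$value.x, lag.max = 26,
    na.action = na.pass, main = "Cross-correlation: IURNSA vs ICNSA")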

# Plotting the correlation  between ICNSA and IURNSA values
ggplot(merged_data, aes(x = value.x, y = value.y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Correlation between ICNSA and IURNSA",
       x = "ICNSA Value",
       y = "IURNSA Value") +
  theme_minimal() +
  theme(legend.position = "none") 
#> `geom_smooth()` using formula = 'y ~ x'


print(gg)
#> Warning: Removed 209 rows containing missing values (`geom_line()`).

# Empirical analysis: 
#Using the Augmented Dickey-Fuller (ADF) test to check the stationarity
# ICNSA Stationarity Test
adf.test(merged_data$value.x)
#> Warning in adf.test(merged_data$value.x): p-value smaller than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  merged_data$value.x
#> Dickey-Fuller = -8.6593, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary

# IURNSA Stationarity Test
adf.test(merged_data$value.y)
#> Warning in adf.test(merged_data$value.y): p-value smaller than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  merged_data$value.y
#> Dickey-Fuller = -7.1617, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary
# Since both time series are stationary, I can now look at the ACF and PACF plots for each series to identify the order of the AR (p) and MA (q) parts of the ARIMA model.
# ACF and PACF for ICNSA
Acf(merged_data$value.x, main="ACF for ICNSA", xlab="Lags")

Pacf(merged_data$value.x, main="PACF for ICNSA", xlab="Lags")


# ACF and PACF plots for IURNSA
Acf(merged_data$value.y, main="ACF for IURNSA", xlab="Lags")

Pacf(merged_data$value.y, main="PACF for IURNSA", xlab="Lags")

## The ACF plot for ICNSA shows a gradual decline in correlation as the lags increase, indicating that past values have a diminishing but prolonged impact on future values. The PACF plot shows a significant spike at the first lag and then cuts off, which suggests an AR(1) process might be a good starting point for the AR component of the ARIMA model for ICNSA. Similar to ICNSA, the ACF plot for IURNSA displays a slow decay in the autocorrelation as the lags increase; this strong autocorrelation at multiple lags suggests that an MA component may be needed in the ARIMA model for IURNSA. The PACF plot for IURNSA shows that after the initial few lags the partial autocorrelations are not significant.
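# Following the reasoning above, a hand-specified AR(1) fit gives a simple
# baseline whose AIC can be compared against the automatic selection below.
ar1_fit <- Arima(merged_data$value.x, order = c(1, 0, 0))
AIC(ar1_fit)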
# ICNSA as the primary series and IURNSA as a covariate
auto_model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y)

# Summary of the model
summary(auto_model)
#> Series: merged_data$value.x 
#> Regression with ARIMA(1,1,3) errors 
#> 
#> Coefficients:
#>          ar1      ma1      ma2      ma3       xreg
#>       0.8355  -0.3875  -0.4043  -0.1674  -49576.08
#> s.e.  0.0268   0.0339   0.0207   0.0236   10502.92
#> 
#> sigma^2 = 7.969e+09:  log likelihood = -35504.89
#> AIC=71021.77   AICc=71021.8   BIC=71057.33
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE       MPE     MAPE     MASE
#> Training set -407.9169 89172.87 43038.19 -1.527199 10.70149 1.108449
#>                      ACF1
#> Training set -0.008027662
## The ARIMA(1,1,3) regression shows that the AR coefficient (ar1) is significant with a small standard error, indicating a strong and reliable relationship. The MA coefficients also have small standard errors, suggesting they are significantly different from zero. The coefficient for the external regressor (xreg), representing the IURNSA covariate, is significant relative to its standard error, implying a meaningful relationship between the covariate and the ICNSA series. The AIC and BIC values are relatively high. The Mean Error is small, suggesting no large bias in the forecasts. The RMSE and MAE measure the average magnitude of the errors, and the MAPE provides a useful relative measure of error. The ACF1 value close to zero suggests no significant residual autocorrelation at lag 1, a good sign that the model is capturing the data's structure.
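# Rough check of the significance claims above: ratio of each estimate to its
# standard error (approximate z statistics) taken from the fitted model.
round(coef(auto_model) / sqrt(diag(auto_model$var.coef)), 2)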

# Checking residuals
checkresiduals(auto_model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(1,1,3) errors
#> Q* = 18.878, df = 6, p-value = 0.004375
#> 
#> Model df: 4.   Total lags used: 10
## The Ljung-Box test is significant (p-value = 0.004375), which shows that some autocorrelation remains in the residuals at the lags tested. The top plot of residuals over time does not show any obvious patterns or trends, a good indication that the model has captured the major components of the time series; however, there is a large spike at the end, which could be an outlier or a change in the process. The ACF plot shows a few lags outside the confidence bounds, which corroborates the Ljung-Box result and suggests the residuals retain some autocorrelation. The histogram of residuals overlaid with a density plot suggests that the residuals are not perfectly normally distributed.
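# As a follow-up to the Ljung-Box result above, re-testing the residuals over a
# longer horizon shows whether the leftover autocorrelation persists
# (fitdf = 4 matches the model df reported by checkresiduals()).
Box.test(residuals(auto_model), lag = 20, type = "Ljung-Box", fitdf = 4)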
# Producing a point forecast from the final model

forecast_values <- forecast(auto_model, xreg = tail(merged_data$value.y, 1), h = 1)

# Extracting the single point forecast value
single_forecast <- forecast_values$mean[1]

# Outputting the next predicted value
cat("Next predicted value will be: ", single_forecast, "\n")
#> Next predicted value will be:  217485.5

Created on 2024-02-22 with reprex v2.0.2

dmarub commented 7 months ago

Homework - 2

# Loading the libraries
library(fredr)    
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast) 
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.3
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.3
#> Warning: package 'readr' was built under R version 4.2.3
#> Warning: package 'purrr' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.3
#> Warning: package 'forcats' was built under R version 4.2.3
#> Warning: package 'lubridate' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(ggplot2)
library(scales)
#> 
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#> 
#>     discard
#> The following object is masked from 'package:readr':
#> 
#>     col_factor
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(dplyr)
library(readr)
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(forecast)
library(fable)
#> Warning: package 'fable' was built under R version 4.2.3
#> Loading required package: fabletools
#> Warning: package 'fabletools' was built under R version 4.2.3
library(tsibble)
#> Warning: package 'tsibble' was built under R version 4.2.3
#> 
#> Attaching package: 'tsibble'
#> The following object is masked from 'package:zoo':
#> 
#>     index
#> The following object is masked from 'package:lubridate':
#> 
#>     interval
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(fabletools)
library(lubridate)
# Setting FRED API key and fetching the data
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1') 
claims_data <- fredr(series_id = 'ICNSA')
colnames(claims_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
# Original data before filtering the dates
claims_data$date <- as.Date(claims_data$date)
ggplot(claims_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA Claims ",
       x = "Year",
       y = "Value")

# Aggregating the data by month to analyze the trend
monthly_claims_data <- claims_data %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(total_claims = sum(value))

# Finding the month with the highest claims
peak <- monthly_claims_data[which.max(monthly_claims_data$total_claims), ]

# Plotting the monthly claims data with a point highlighting the peak
ggplot(monthly_claims_data, aes(x = month, y = total_claims)) +
  geom_line() +
  geom_point(data = peak, aes(x = month, y = total_claims), color = "red", size = 3) +
  labs(title = "Monthly ICNSA Claims During COVID-19",
       x = "Month",
       y = "Total Claims") +
  theme_minimal()


print(paste("Peak month: ", peak$month))
#> [1] "Peak month:  2020-04-01"
print(paste("Peak claims: ", peak$total_claims))
#> [1] "Peak claims:  18747756"
# Since the peak is in April 2020, determining the start date and end date:

# Calculating the average claims before the peak
avg_claims_before_peak <- mean(claims_data$value[claims_data$date < "2020-04-01"])

# Calculating the standard deviation before the peak
std_dev_before_peak <- sd(claims_data$value[claims_data$date < "2020-04-01"])

# Defining start and end dates based on deviation from average
start_date <- min(claims_data$date[claims_data$value > (avg_claims_before_peak + 2 * std_dev_before_peak)])
end_date <- max(claims_data$date[claims_data$value > (avg_claims_before_peak + 2 * std_dev_before_peak)])

print(paste(" start date: ", start_date))
#> [1] " start date:  1974-12-07"
print(paste(" end date: ", end_date))
#> [1] " end date:  2021-03-27"
# The start date computed above (1974-12-07) is far too early to reflect COVID: it comes from an earlier spike exceeding the threshold, whereas the pandemic began around late 2019 / early 2020. Examining and filtering the data from that period instead.

# Filtering the dataset to include only data from January 2020 onwards
filtered_claims_data <- claims_data %>%
  filter(date >= as.Date("2020-01-01"))

# Aggregating this filtered data by month and calculating total claims per month
monthly_filtered_claims_data <- filtered_claims_data %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(total_claims = sum(value))

monthly_filtered_claims_data <- monthly_filtered_claims_data %>%
  mutate(perc_change = (total_claims - lag(total_claims)) / lag(total_claims) * 100)

ggplot(monthly_filtered_claims_data, aes(x = month, y = perc_change)) +
  geom_line() +
  geom_hline(yintercept = 50, linetype = "dashed", color = "red") + # Example threshold line
  labs(title = "Monthly Percentage Change in ICNSA Claims (2020 Onwards)",
       x = "Month",
       y = "Percentage Change")
#> Warning: Removed 1 row containing missing values (`geom_line()`).


significant_increase <- filter(monthly_filtered_claims_data, perc_change > 50)
if (nrow(significant_increase) > 0) {
  start_date_corrected <- min(significant_increase$month)
  print(paste("Corrected identified start date: ", start_date_corrected))
} else {
  print("No significant increase found based on the threshold.")
}
#> [1] "Corrected identified start date:  2020-03-01"
# Now that the start and end dates have been determined, using a cubic spline to impute new values for the COVID period:

end_date <- as.Date("2021-03-27")

# Filtering out COVID period data for non-COVID analysis
non_covid_data <- claims_data %>%
  filter(date < as.Date("2020-03-01") | date > as.Date("2021-03-27"))

# Fitting cubic spline to the non-COVID data
cubic_spline_fit <- smooth.spline(x = as.numeric(non_covid_data$date), y = non_covid_data$value, spar = 0.7)

# Filtering for the COVID period data
covid_period_data <- claims_data %>%
  filter(date >= as.Date("2020-03-01") & date <= as.Date("2021-03-27"))

# Imputing new values for the COVID period using the cubic spline model
imputed_values <- predict(cubic_spline_fit, x = as.numeric(covid_period_data$date))

# Replacing COVID period values with imputed values in the dataset
covid_period_data$value <- imputed_values$y

# Combining the non-COVID data with the imputed COVID period data
updated_claims_data <- bind_rows(non_covid_data, covid_period_data) %>%
  arrange(date)

# Plotting the updated data to see the effect of the imputation
ggplot(updated_claims_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA Claims with Imputed COVID-19 Period",
       x = "Year",
       y = "Value")

# Comparing the original and updated imputed data :

claims_data$date <- as.Date(claims_data$date)
updated_claims_data$date <- as.Date(updated_claims_data$date)

comparison_data <- left_join(claims_data, updated_claims_data, by = "date", suffix = c("_original", "_imputed"))

long_comparison_data <- comparison_data %>%
  select(date, value_original, value_imputed) %>%
  gather(key = "type", value = "value", -date)

ggplot(long_comparison_data, aes(x = date, y = value, color = type)) +
  geom_line() +
  labs(title = "ICNSA Claims with Original vs. Imputed COVID-19 Period Data",
       x = "Year",
       y = "Claims",
       color = "Data Type") +
  scale_color_manual(values = c("value_original" = "black", "value_imputed" = "red")) +
  theme_minimal()

# Using both a multiplicative and an additive Holt-Winters model to forecast the next ICNSA value from the newly imputed COVID-period values.

ts_data <- ts(updated_claims_data$value, frequency = 12)

# Multiplicative Holt-Winters model and forecast using the hw() function
hw_multiplicative_model <- hw(ts_data, seasonal = "multiplicative", h = 1)

print(hw_multiplicative_model)
#>         Point Forecast    Lo 80    Hi 80    Lo 95  Hi 95
#> Jun 249       206476.8 172502.5 240451.1 154517.6 258436

plot(hw_multiplicative_model)

ts_data <- ts(updated_claims_data$value, frequency = 12)

# Additive Holt-Winters model and forecast using the hw() function
hw_additive_model <- hw(ts_data, seasonal = "additive", h = 1)

print(hw_additive_model)
#>         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> Jun 249       201457.8 136868.5 266047.2 102676.9 300238.7

plot(hw_additive_model)
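
# One hedged way to choose between the two Holt-Winters variants fitted above
# is to compare the information criteria of their underlying ETS fits.
c(multiplicative_AICc = hw_multiplicative_model$model$aicc,
  additive_AICc      = hw_additive_model$model$aicc)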

Created on 2024-02-28 with reprex v2.0.2

dmarub commented 6 months ago

Homework - 3

# Loading the libraries
library(fredr)    
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast) 
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.3
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.3
#> Warning: package 'readr' was built under R version 4.2.3
#> Warning: package 'purrr' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.3
#> Warning: package 'forcats' was built under R version 4.2.3
#> Warning: package 'lubridate' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(ggplot2)
library(scales)
#> 
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#> 
#>     discard
#> The following object is masked from 'package:readr':
#> 
#>     col_factor
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(dplyr)
library(readr)
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(forecast)
library(fable)
#> Warning: package 'fable' was built under R version 4.2.3
#> Loading required package: fabletools
#> Warning: package 'fabletools' was built under R version 4.2.3
library(tsibble)
#> Warning: package 'tsibble' was built under R version 4.2.3
#> 
#> Attaching package: 'tsibble'
#> The following object is masked from 'package:zoo':
#> 
#>     index
#> The following object is masked from 'package:lubridate':
#> 
#>     interval
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(fabletools)
library(lubridate)
library(imputeTS)
#> Warning: package 'imputeTS' was built under R version 4.2.3
#> 
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:tseries':
#> 
#>     na.remove
#> The following object is masked from 'package:zoo':
#> 
#>     na.locf
# Setting FRED API key and fetching the data
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1') 
claims1_data <- fredr(series_id = 'ICNSA')
colnames(claims1_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1') 
claims2_data <- fredr(series_id = 'CACCLAIMS')
colnames(claims2_data)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
claims1_data$date <- as.Date(claims1_data$date) 

# Plotting to visualize the claims data
ggplot(claims1_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Initial Claims for Unemployment Benefits",
       x = "Date", y = "Initial Claims") +
  theme_minimal()

# Identifying the potential start and end dates through empirical analysis:

# Convert 'date' to Date type
claims1_data$date <- as.Date(claims1_data$date)
claims2_data$date <- as.Date(claims2_data$date)

# Calculate the difference between consecutive values
claims1_data$diff <- c(NA, diff(claims1_data$value))
claims2_data$diff <- c(NA, diff(claims2_data$value))

# Find the index of the maximum difference
start_index <- which.max(claims1_data$diff)

# Check if the start_index is valid
if (!is.na(start_index) && start_index > 0) {
  potential_start_date <- claims1_data$date[start_index]
} else {
  potential_start_date <- NA 
}

# Note: post_spike_data is not defined in this reprex, so the block below errors; the end date is set manually further down
if (nrow(post_spike_data) > 0) {
  potential_end_date <- as.Date(min(post_spike_data$date)) 
} else {
  potential_end_date <- NA 
}
#> Error in nrow(post_spike_data): object 'post_spike_data' not found

# Print the potential start and end dates
if (!is.na(potential_start_date)) {
  cat("Potential start of the pandemic impact:", format(potential_start_date, "%Y-%m-%d"), "\n")
} else {
  cat("Potential start of the pandemic impact: Not Found\n")
}
#> Potential start of the pandemic impact: 2020-03-28

if (!is.na(potential_end_date)) {
  cat("Potential end of the pandemic impact:", format(potential_end_date, "%Y-%m-%d"), "\n")
} else {
  cat("Potential end of the pandemic impact: Not Found\n")
}
#> Error in eval(expr, envir, enclos): object 'potential_end_date' not found
#Plotting the dates identified
ggplot(claims1_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(potential_start_date), color = "red") +
  geom_vline(xintercept = as.numeric(potential_end_date), color = "blue") +
  labs(title = "Initial Claims with Pandemic Start and End Identified",
       x = "Date", y = "Initial Claims") +
  theme_minimal()
#> Error in list2(...): object 'potential_end_date' not found
potential_start_date <- as.Date("2020-03-28")
potential_end_date <- as.Date("2021-05-15")

# Splitting the data into pandemic and non-pandemic periods
pandemic_claims <- claims1_data %>%
  filter(date >= potential_start_date & date <= potential_end_date)

outside_pandemic_claims <- anti_join(claims1_data, pandemic_claims, by = "date")

# Ensure pandemic_claims has the same columns as outside_pandemic_claims filled with NA for missing ones
missing_cols <- setdiff(colnames(outside_pandemic_claims), colnames(pandemic_claims))
for (col in missing_cols) {
  pandemic_claims[[col]] <- NA
}

pandemic_claims <- pandemic_claims[colnames(outside_pandemic_claims)]

pandemic_claims$value <- NA

# Performing smooth spline imputation for missing values during the pandemic
spline_model_fit <- smooth.spline(x = as.numeric(outside_pandemic_claims$date), y = outside_pandemic_claims$value, lambda = 0.6)
spline_predicted_values <- predict(spline_model_fit, x = as.numeric(pandemic_claims$date))
pandemic_claims$value <- spline_predicted_values$y

# Combining the datasets after spline imputation
complete_claims_spline <- rbind(outside_pandemic_claims, pandemic_claims) %>% arrange(date)

# Performing Kalman filter imputation

pandemic_claims$value <- NA
complete_claims_kalman <- rbind(outside_pandemic_claims, pandemic_claims) %>% arrange(date)
complete_claims_kalman$value <- na_kalman(complete_claims_kalman$value, model = "StructTS", smooth = TRUE)

# Plotting the results for comparison between smooth spline and Kalman filter imputations
plot(complete_claims_spline$date, complete_claims_spline$value, type = "l", col = "blue", main = "Imputation Comparison: Smooth Spline vs Kalman Filter", xlab = "Date", ylab = "Claims")
lines(complete_claims_kalman$date, complete_claims_kalman$value, col = "red")
legend("topleft", legend = c("Smooth Spline", "Kalman Filter"), col = c("blue", "red"), lty = 1)

library(ggplot2)

# Plotting for Smooth Spline Imputation Results
ggplot(complete_claims_spline, aes(x = date, y = value)) +
  geom_line(color = "blue") +
  ggtitle("Smooth Spline Imputation Results") +
  xlab("Date") +
  ylab("Claims") +
  theme_minimal()

# Plotting for Kalman Filter Imputation Results
ggplot(complete_claims_kalman, aes(x = date, y = value)) +
  geom_line(color = "red") +
  ggtitle("Kalman Filter Imputation Results") +
  xlab("Date") +
  ylab("Claims") +
  theme_minimal()
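
# Numeric side-by-side of the two imputations over the pandemic window, to
# complement the plots above (both data frames are sorted by date, so the rows
# line up one-to-one).
spline_window <- complete_claims_spline$value[complete_claims_spline$date >= potential_start_date &
                                              complete_claims_spline$date <= potential_end_date]
kalman_window <- complete_claims_kalman$value[complete_claims_kalman$date >= potential_start_date &
                                              complete_claims_kalman$date <= potential_end_date]
summary(spline_window - kalman_window)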


# Converting the imputed data to a time series object as it is weekly data (frequency = 52)
icnsa_ts <- ts(complete_claims_kalman$value, frequency = 52)

# Fitting a structural time series model using Basic Structural Model 
model <- StructTS(icnsa_ts, type = "BSM")

pred <- forecast(model, h=52)  

# Plotting the forecast along with the original time series data
autoplot(pred) +
  autolayer(icnsa_ts, series="ICNSA TS", alpha=0.5) +
  labs(title="Forecast vs Original Time Series", x="Time", y="Claims") +
  guides(colour=guide_legend(title="Series"))

# The covariate series used is Continued Claims (Insured Unemployment) in California (CACCLAIMS)
claims2_data$date <- as.Date(claims2_data$date)

# Filter both datasets for the COVID period
claims1_data_covid_period <- claims1_data %>%
  filter(date >= potential_start_date & date <= potential_end_date)

claims2_data_covid_period <- claims2_data %>%
  filter(date >= potential_start_date & date <= potential_end_date)

# Merge the datasets based on the date for the COVID period
merged_data_covid_period <- merge(claims1_data_covid_period, claims2_data_covid_period, by = "date", suffixes = c(".icnsa", ".cacclaims"))

# Prepare the data for modeling
# Using 'value.icnsa' from claims1_data as the target series and 'value.cacclaims' from claims2_data as the covariate
target_vector_covid_period <- merged_data_covid_period$value.icnsa
covariate_matrix_covid_period <- as.matrix(merged_data_covid_period$value.cacclaims)

# Fit the ARIMA model with CACCLAIMS as a covariate for the COVID period
auto_model_covid_period <- auto.arima(target_vector_covid_period, xreg = covariate_matrix_covid_period)

# Summary of the model to review its parameters and fit
summary(auto_model_covid_period)
#> Series: target_vector_covid_period 
#> Regression with ARIMA(2,2,0) errors 
#> 
#> Coefficients:
#>           ar1      ar2     xreg
#>       -1.0407  -0.6632  -0.1220
#> s.e.   0.1638   0.1539   0.0405
#> 
#> sigma^2 = 3.547e+10:  log likelihood = -786.05
#> AIC=1580.1   AICc=1580.85   BIC=1588.34
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
#> Training set 8558.055 180308.5 121528.9 0.8563479 10.38045 0.8731565 0.3796071

# Check residuals to evaluate model performance
checkresiduals(auto_model_covid_period)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(2,2,0) errors
#> Q* = 23.54, df = 8, p-value = 0.002736
#> 
#> Model df: 2.   Total lags used: 10

# Placeholder value for next week's covariate; a real forecast would use the
# latest observed CACCLAIMS value instead
next_week_covariate_value <- 1000

# Building a one-row covariate matrix from that value
next_week_covariate <- matrix(c(next_week_covariate_value), ncol = 1)

# Forecasting for the next week using the ARIMA model and the next covariate value
next_week_forecast <- forecast(auto_model_covid_period, xreg = next_week_covariate, h = 1)

# Print the point forecast for the next week
print(next_week_forecast$mean)
#> Time Series:
#> Start = 61 
#> End = 61 
#> Frequency = 1 
#> [1] 450380
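
# A hedged alternative to the placeholder covariate above: use the most recent
# observed CACCLAIMS value as next week's regressor instead of 1000.
latest_cacclaims <- tail(na.omit(claims2_data$value), 1)
forecast(auto_model_covid_period, xreg = matrix(latest_cacclaims, ncol = 1), h = 1)$mean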

Created on 2024-03-07 with reprex v2.0.2

dmarub commented 5 months ago

Homework - 4

library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.3
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.3
#> Warning: package 'readr' was built under R version 4.2.3
#> Warning: package 'purrr' was built under R version 4.2.3
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.3
#> Warning: package 'forcats' was built under R version 4.2.3
#> Warning: package 'lubridate' was built under R version 4.2.3
library(cluster)
library(magrittr)
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#> 
#>     set_names
#> The following object is masked from 'package:tidyr':
#> 
#>     extract
library(plotly)
#> Warning: package 'plotly' was built under R version 4.2.3
#> 
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following object is masked from 'package:graphics':
#> 
#>     layout
library(cowplot)
#> Warning: package 'cowplot' was built under R version 4.2.3
#> 
#> Attaching package: 'cowplot'
#> The following object is masked from 'package:lubridate':
#> 
#>     stamp
library(ggdendro)
#> Warning: package 'ggdendro' was built under R version 4.2.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
library(vars)
#> Warning: package 'vars' was built under R version 4.2.3
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:plotly':
#> 
#>     select
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.2.3
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.2.3
#> 
#> Attaching package: 'strucchange'
#> The following object is masked from 'package:stringr':
#> 
#>     boundary
#> Loading required package: urca
#> Warning: package 'urca' was built under R version 4.2.3
#> Loading required package: lmtest
#> Warning: package 'lmtest' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
# Reading the four datasets, where S.csv = 35SNA, A.csv = 35ANA, B.csv = 35BNA, and C.csv = 35CNA
s_data <- read.csv('C:/Users/admin/Desktop/TSHW4/S.csv', skip = 7)
a_data <- read.csv('C:/Users/admin/Desktop/TSHW4/A.csv', skip = 7)
b_data <- read.csv('C:/Users/admin/Desktop/TSHW4/B.csv', skip = 7)
c_data <- read.csv('C:/Users/admin/Desktop/TSHW4/C.csv', skip = 7)

# Displaying the header of each dataset
cat("Header for S dataset:\n")
#> Header for S dataset:
print(names(s_data))
#> [1] "Period" "Value"
cat("\nHeader for A dataset:\n")
#> 
#> Header for A dataset:
print(names(a_data))
#> [1] "Period" "Value"
cat("\nHeader for B dataset:\n")
#> 
#> Header for B dataset:
print(names(b_data))
#> [1] "Period" "Value"
cat("\nHeader for C dataset:\n")
#> 
#> Header for C dataset:
print(names(c_data))
#> [1] "Period" "Value"

# Printing the first few rows to check if the files have been read
cat("\nFirst few rows of S dataset:\n")
#> 
#> First few rows of S dataset:
print(head(s_data))
#>     Period Value
#> 1 Jan-1992    NA
#> 2 Feb-1992  6199
#> 3 Mar-1992  6614
#> 4 Apr-1992  6858
#> 5 May-1992  6825
#> 6 Jun-1992  6606
cat("\nFirst few rows of A dataset:\n")
#> 
#> First few rows of A dataset:
print(head(a_data))
#>     Period Value
#> 1 Jan-1992    NA
#> 2 Feb-1992   794
#> 3 Mar-1992   804
#> 4 Apr-1992   825
#> 5 May-1992   811
#> 6 Jun-1992   799
cat("\nFirst few rows of B dataset:\n")
#> 
#> First few rows of B dataset:
print(head(b_data))
#>     Period Value
#> 1 Jan-1992    NA
#> 2 Feb-1992  1373
#> 3 Mar-1992  1490
#> 4 Apr-1992  1452
#> 5 May-1992  1467
#> 6 Jun-1992  1402
cat("\nFirst few rows of C dataset:\n")
#> 
#> First few rows of C dataset:
print(head(c_data))
#>     Period Value
#> 1 Jan-1992    NA
#> 2 Feb-1992  1865
#> 3 Mar-1992  1998
#> 4 Apr-1992  2047
#> 5 May-1992  2162
#> 6 Jun-1992  1963
# Cleaning the data

clean_data <- function(df) {
  # Splitting the 'Period' into month and year
  date_parts <- strsplit(as.character(df$Period), "-")

  # Extracting the month names and year
  months <- sapply(date_parts, function(x) x[1])
  years <- sapply(date_parts, function(x) x[2])

  # Converting month names to month numbers
  month_nums <- match(months, month.abb)  

  # Handling any NAs that arise from this conversion
  if(any(is.na(month_nums))) {
    warning("There was an error in converting month names to numbers.")
  }

  df$Period <- as.Date(paste(years, month_nums, "01", sep = "-"))

  if(any(is.na(df$Period))) {
    warning("There are still NA values in the 'Period' column after conversion.")
  }

  df$Value <- as.numeric(as.character(df$Value))

  df <- df[!is.na(df$Value), ]

  return(df)
}

s_data_clean <- clean_data(s_data)
a_data_clean <- clean_data(a_data)
b_data_clean <- clean_data(b_data)
c_data_clean <- clean_data(c_data)

str(s_data_clean)
#> 'data.frame':    385 obs. of  2 variables:
#>  $ Period: Date, format: "1992-02-01" "1992-03-01" ...
#>  $ Value : num  6199 6614 6858 6825 6606 ...
summary(s_data_clean)
#>      Period               Value      
#>  Min.   :1992-02-01   Min.   : 6199  
#>  1st Qu.:2000-02-01   1st Qu.: 8808  
#>  Median :2008-02-01   Median :10069  
#>  Mean   :2008-01-31   Mean   : 9999  
#>  3rd Qu.:2016-02-01   3rd Qu.:10771  
#>  Max.   :2024-02-01   Max.   :14585
str(a_data_clean)
#> 'data.frame':    385 obs. of  2 variables:
#>  $ Period: Date, format: "1992-02-01" "1992-03-01" ...
#>  $ Value : num  794 804 825 811 799 783 807 829 802 810 ...
summary(a_data_clean)
#>      Period               Value     
#>  Min.   :1992-02-01   Min.   : 772  
#>  1st Qu.:2000-02-01   1st Qu.: 971  
#>  Median :2008-02-01   Median :1063  
#>  Mean   :2008-01-31   Mean   :1062  
#>  3rd Qu.:2016-02-01   3rd Qu.:1152  
#>  Max.   :2024-02-01   Max.   :1391
str(b_data_clean)
#> 'data.frame':    385 obs. of  2 variables:
#>  $ Period: Date, format: "1992-02-01" "1992-03-01" ...
#>  $ Value : num  1373 1490 1452 1467 1402 ...
summary(b_data_clean)
#>      Period               Value     
#>  Min.   :1992-02-01   Min.   :1373  
#>  1st Qu.:2000-02-01   1st Qu.:1685  
#>  Median :2008-02-01   Median :1839  
#>  Mean   :2008-01-31   Mean   :1868  
#>  3rd Qu.:2016-02-01   3rd Qu.:1992  
#>  Max.   :2024-02-01   Max.   :2851
str(c_data_clean)
#> 'data.frame':    385 obs. of  2 variables:
#>  $ Period: Date, format: "1992-02-01" "1992-03-01" ...
#>  $ Value : num  1865 1998 2047 2162 1963 ...
summary(c_data_clean)
#>      Period               Value     
#>  Min.   :1992-02-01   Min.   :1865  
#>  1st Qu.:2000-02-01   1st Qu.:2763  
#>  Median :2008-02-01   Median :3104  
#>  Mean   :2008-01-31   Mean   :3154  
#>  3rd Qu.:2016-02-01   3rd Qu.:3408  
#>  Max.   :2024-02-01   Max.   :4861
# Empirical Analysis

# Checking for stationarity:

# Performing the Augmented Dickey-Fuller test on each time series

adf_results <- lapply(list(s_data_clean$Value, a_data_clean$Value, b_data_clean$Value, c_data_clean$Value), function(ts) {
  adf_test_result <- tseries::adf.test(ts, alternative = "stationary")
  return(adf_test_result)
})

print(adf_results)
#> [[1]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.4116, Lag order = 7, p-value = 0.4033
#> alternative hypothesis: stationary
#> 
#> 
#> [[2]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.7467, Lag order = 7, p-value = 0.2618
#> alternative hypothesis: stationary
#> 
#> 
#> [[3]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -1.4555, Lag order = 7, p-value = 0.807
#> alternative hypothesis: stationary
#> 
#> 
#> [[4]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.0146, Lag order = 7, p-value = 0.571
#> alternative hypothesis: stationary
# The output from the Augmented Dickey-Fuller (ADF) tests shows that the p-values for all four time series are above common significance levels, which means each series is non-stationary according to the ADF test.

# Differencing the data and then re-running the ADF test to check for stationarity

# Applying first differencing

s_data_diff <- diff(s_data_clean$Value)
a_data_diff <- diff(a_data_clean$Value)
b_data_diff <- diff(b_data_clean$Value)
c_data_diff <- diff(c_data_clean$Value)

# Performing ADF test again on the differenced data
adf_results_diff <- lapply(list(s_data_diff, a_data_diff, b_data_diff, c_data_diff), function(ts) {
  adf.test(ts, alternative = "stationary")
})
#> Warning in adf.test(ts, alternative = "stationary"): p-value smaller than
#> printed p-value
#> Warning in adf.test(ts, alternative = "stationary"): p-value smaller than
#> printed p-value

#> Warning in adf.test(ts, alternative = "stationary"): p-value smaller than
#> printed p-value

#> Warning in adf.test(ts, alternative = "stationary"): p-value smaller than
#> printed p-value

# Printing the results of the differenced data
print(adf_results_diff)
#> [[1]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -6.287, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> 
#> 
#> [[2]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -8.4507, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> 
#> 
#> [[3]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -8.7966, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> 
#> 
#> [[4]]
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -6.9735, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
par(mfrow=c(2,2))
plot(s_data_diff, main="Differenced S Data")
plot(a_data_diff, main="Differenced A Data")
plot(b_data_diff, main="Differenced B Data")
plot(c_data_diff, main="Differenced C Data")

par(mfrow=c(1,1))

# The differenced data and the ADF test results show that each time series is now stationary after differencing, as indicated by the low p-values (0.01).

# Plotting AIC and BIC
mts <- cbind(s_data_diff, a_data_diff, b_data_diff, c_data_diff)

max_lag <- 10

# Initializing vectors to store AIC and BIC values
aic_values <- numeric(max_lag)
bic_values <- numeric(max_lag)

# Looping over lag orders from 1 to max_lag to calculate AIC and BIC
for (p in 1:max_lag) {
  model <- VAR(mts, p = p, type = "const")
  aic_values[p] <- AIC(model)
  bic_values[p] <- BIC(model)
}

plot(1:max_lag, aic_values, type = "o", pch = 20, col = "blue", xlab = "Lag Order", ylab = "Information Criterion", main = "AIC and BIC for VAR models")
points(1:max_lag, bic_values, type = "o", pch = 20, col = "red")
legend("topright", legend = c("AIC", "BIC"), col = c("blue", "red"), pch = 20)

par(mar=c(4, 4, 2, 2))  
par(mfrow = c(1, 1))    

# Plotting ACF and PACF for each series separately

# S Data
Acf(s_data_diff, main = "ACF for Differenced S Data")

Pacf(s_data_diff, main = "PACF for Differenced S Data")


# A Data
Acf(a_data_diff, main = "ACF for Differenced A Data")

Pacf(a_data_diff, main = "PACF for Differenced A Data")


# B Data
Acf(b_data_diff, main = "ACF for Differenced B Data")

Pacf(b_data_diff, main = "PACF for Differenced B Data")


# C Data
Acf(c_data_diff, main = "ACF for Differenced C Data")

Pacf(c_data_diff, main = "PACF for Differenced C Data") 

# Fitting  VAR(1) model and a VAR(p) model with p > 1
if (!require(vars)) install.packages("vars")
library(vars)

# Combining the differenced series into a multivariate time series matrix
mts_diff <- cbind(s_data_diff, a_data_diff, b_data_diff, c_data_diff)

# Fit a VAR(1) model
var1_model <- VAR(mts_diff, p=1, type="const")
cat("Summary of VAR(1) model:\n")
#> Summary of VAR(1) model:
summary(var1_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: s_data_diff, a_data_diff, b_data_diff, c_data_diff 
#> Deterministic variables: const 
#> Sample size: 383 
#> Log Likelihood: -9649.792 
#> Roots of the characteristic polynomial:
#> 0.4734 0.4342 0.3595 0.3595
#> Call:
#> VAR(y = mts_diff, p = 1, type = "const")
#> 
#> 
#> Estimation results for equation s_data_diff: 
#> ============================================ 
#> s_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)   
#> s_data_diff.l1 -0.24402    0.08514  -2.866  0.00439 **
#> a_data_diff.l1 -0.51004    0.31411  -1.624  0.10525   
#> b_data_diff.l1 -0.32062    0.16755  -1.914  0.05643 . 
#> c_data_diff.l1 -0.15808    0.12297  -1.286  0.19939   
#> const          28.30956   18.46996   1.533  0.12618   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 360.9 on 378 degrees of freedom
#> Multiple R-Squared: 0.163,   Adjusted R-squared: 0.1541 
#> F-statistic:  18.4 on 4 and 378 DF,  p-value: 7.985e-14 
#> 
#> 
#> Estimation results for equation a_data_diff: 
#> ============================================ 
#> a_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1  0.01941    0.01433   1.354    0.176    
#> a_data_diff.l1 -0.37278    0.05288  -7.049 8.61e-12 ***
#> b_data_diff.l1 -0.04707    0.02821  -1.669    0.096 .  
#> c_data_diff.l1 -0.01931    0.02070  -0.933    0.352    
#> const           1.82853    3.10961   0.588    0.557    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 60.76 on 378 degrees of freedom
#> Multiple R-Squared: 0.1321,  Adjusted R-squared: 0.1229 
#> F-statistic: 14.38 on 4 and 378 DF,  p-value: 6.112e-11 
#> 
#> 
#> Estimation results for equation b_data_diff: 
#> ============================================ 
#> b_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1  0.03351    0.02869   1.168    0.244    
#> a_data_diff.l1 -0.15146    0.10586  -1.431    0.153    
#> b_data_diff.l1 -0.51743    0.05646  -9.164   <2e-16 ***
#> c_data_diff.l1 -0.02632    0.04144  -0.635    0.526    
#> const           4.11170    6.22446   0.661    0.509    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 121.6 on 378 degrees of freedom
#> Multiple R-Squared: 0.2423,  Adjusted R-squared: 0.2342 
#> F-statistic: 30.21 on 4 and 378 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation c_data_diff: 
#> ============================================ 
#> c_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1  0.06003    0.04949   1.213    0.226    
#> a_data_diff.l1 -0.07154    0.18260  -0.392    0.695    
#> b_data_diff.l1 -0.05449    0.09740  -0.559    0.576    
#> c_data_diff.l1 -0.48630    0.07148  -6.803 4.02e-11 ***
#> const           9.52562   10.73700   0.887    0.376    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 209.8 on 378 degrees of freedom
#> Multiple R-Squared: 0.1795,  Adjusted R-squared: 0.1709 
#> F-statistic: 20.68 on 4 and 378 DF,  p-value: 1.999e-15 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>             s_data_diff a_data_diff b_data_diff c_data_diff
#> s_data_diff      130240        7416     18772.9     51364.1
#> a_data_diff        7416        3692       907.0      1127.9
#> b_data_diff       18773         907     14791.6       273.8
#> c_data_diff       51364        1128       273.8     44012.6
#> 
#> Correlation matrix of residuals:
#>             s_data_diff a_data_diff b_data_diff c_data_diff
#> s_data_diff      1.0000     0.33820     0.42771     0.67842
#> a_data_diff      0.3382     1.00000     0.12274     0.08848
#> b_data_diff      0.4277     0.12274     1.00000     0.01073
#> c_data_diff      0.6784     0.08848     0.01073     1.00000
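
# Quick stability check for the fitted VAR(1) (a sketch, not part of the original
# workflow): all moduli of the companion-matrix roots should be below 1, which is
# consistent with the roots reported in the summary above.
roots(var1_model, modulus = TRUE)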

# Determining the optimal number of lags for the VAR(p) model using AIC
lag_selection <- VARselect(mts_diff, lag.max = 10, type = "const")
# Note: the selection vector is named "AIC(n)", "HQ(n)", "SC(n)" and "FPE(n)"
optimal_lag <- unname(lag_selection$selection["AIC(n)"])

# Fitting a VAR(p) model with p > 1 when the AIC-selected order calls for it
if (is.na(optimal_lag) || optimal_lag <= 1) {
  cat("\nThe AIC-selected lag order is NA or not greater than 1; no VAR(p) with p > 1 is fitted.\n")
} else {
  varp_model <- VAR(mts_diff, p = optimal_lag, type = "const")
  cat("\nFitted a VAR(", optimal_lag, ") model; its full summary appears with the VAR(4) fit below.\n")
}
# Full VARselect output across lags 1-10 for all information criteria
print(lag_selection)
#> $selection
#> AIC(n)  HQ(n)  SC(n) FPE(n) 
#>      4      2      2      4 
#> 
#> $criteria
#>                   1            2            3            4            5
#> AIC(n) 3.922027e+01 3.882554e+01 3.877775e+01 3.874366e+01 3.874793e+01
#> HQ(n)  3.930359e+01 3.897551e+01 3.899438e+01 3.902695e+01 3.909788e+01
#> SC(n)  3.943013e+01 3.920327e+01 3.932337e+01 3.945716e+01 3.962932e+01
#> FPE(n) 1.079322e+17 7.273310e+16 6.934437e+16 6.702928e+16 6.733160e+16
#>                   6            7            8            9           10
#> AIC(n) 3.877009e+01 3.878591e+01 3.876261e+01 3.881866e+01 3.883276e+01
#> HQ(n)  3.918670e+01 3.926917e+01 3.931253e+01 3.943524e+01 3.951599e+01
#> SC(n)  3.981936e+01 4.000306e+01 4.014764e+01 4.037157e+01 4.055356e+01
#> FPE(n) 6.886256e+16 6.999187e+16 6.842059e+16 7.241988e+16 7.351738e+16
# Sanity check: the differenced matrix contains no missing or infinite values
sum(is.na(mts_diff))
#> [1] 0
sum(is.infinite(mts_diff))
#> [1] 0
# Cross-check: refit VAR(p) for p = 1..10 and record raw AIC/BIC values
criteria <- sapply(1:10, function(p) {
  model <- VAR(mts_diff, p=p, type="const")
  c(AIC=AIC(model), BIC=BIC(model))
})
criteria 
#>         [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
#> AIC 19339.58 19140.51 19075.49 19015.80 18971.33 18933.31 18891.65 18836.68
#> BIC 19418.54 19282.54 19280.52 19283.73 19302.08 19326.80 19347.79 19355.38
#>         [,9]    [,10]
#> AIC 18811.32 18768.92
#> BIC 19392.51 19412.49
# VARselect picks p = 4 under AIC and FPE, while HQ and SC prefer p = 2. Following the AIC choice, a VAR(4) model is taken as the best-fitting higher-order candidate and fitted below.
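
# Sketch: picking the lag order programmatically from the manual grid above.
# Note that the raw AIC from this loop keeps decreasing with p, whereas
# VARselect's per-observation AIC settles on p = 4, so the two need not agree.
which.min(criteria["AIC", ])  # lag with the smallest raw AIC
which.min(criteria["BIC", ])  # lag with the smallest BIC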


# Fitting the VAR(4) model
var4_model <- VAR(mts_diff, p=4, type="const")
cat("\nSummary of VAR(4) model:\n")
#> 
#> Summary of VAR(4) model:
summary(var4_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: s_data_diff, a_data_diff, b_data_diff, c_data_diff 
#> Deterministic variables: const 
#> Sample size: 380 
#> Log Likelihood: -9439.899 
#> Roots of the characteristic polynomial:
#> 0.7146 0.7146 0.713 0.713 0.6698 0.6698 0.6458 0.6458 0.6299 0.6299 0.5812 0.5812 0.5305 0.5305 0.4945 0.3414
#> Call:
#> VAR(y = mts_diff, p = 4, type = "const")
#> 
#> 
#> Estimation results for equation s_data_diff: 
#> ============================================ 
#> s_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + s_data_diff.l2 + a_data_diff.l2 + b_data_diff.l2 + c_data_diff.l2 + s_data_diff.l3 + a_data_diff.l3 + b_data_diff.l3 + c_data_diff.l3 + s_data_diff.l4 + a_data_diff.l4 + b_data_diff.l4 + c_data_diff.l4 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1 -0.19399    0.09379  -2.068 0.039309 *  
#> a_data_diff.l1 -0.48945    0.33984  -1.440 0.150661    
#> b_data_diff.l1 -0.54906    0.19968  -2.750 0.006264 ** 
#> c_data_diff.l1 -0.50180    0.14576  -3.443 0.000643 ***
#> s_data_diff.l2  0.06498    0.10631   0.611 0.541423    
#> a_data_diff.l2  0.17086    0.37538   0.455 0.649264    
#> b_data_diff.l2 -0.39444    0.23872  -1.652 0.099328 .  
#> c_data_diff.l2 -0.74968    0.17497  -4.285 2.35e-05 ***
#> s_data_diff.l3  0.20925    0.10841   1.930 0.054374 .  
#> a_data_diff.l3 -0.09967    0.37307  -0.267 0.789495    
#> b_data_diff.l3 -0.20072    0.23961  -0.838 0.402752    
#> c_data_diff.l3 -0.50618    0.17637  -2.870 0.004345 ** 
#> s_data_diff.l4  0.06139    0.09565   0.642 0.521391    
#> a_data_diff.l4  0.62664    0.33891   1.849 0.065269 .  
#> b_data_diff.l4 -0.12326    0.19922  -0.619 0.536480    
#> c_data_diff.l4 -0.29614    0.14838  -1.996 0.046693 *  
#> const          35.20783   17.97243   1.959 0.050879 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 345.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.2617,  Adjusted R-squared: 0.2292 
#> F-statistic: 8.042 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation a_data_diff: 
#> ============================================ 
#> a_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + s_data_diff.l2 + a_data_diff.l2 + b_data_diff.l2 + c_data_diff.l2 + s_data_diff.l3 + a_data_diff.l3 + b_data_diff.l3 + c_data_diff.l3 + s_data_diff.l4 + a_data_diff.l4 + b_data_diff.l4 + c_data_diff.l4 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1  0.04126    0.01577   2.616 0.009276 ** 
#> a_data_diff.l1 -0.52722    0.05715  -9.225  < 2e-16 ***
#> b_data_diff.l1 -0.09537    0.03358  -2.840 0.004764 ** 
#> c_data_diff.l1 -0.04139    0.02451  -1.689 0.092131 .  
#> s_data_diff.l2  0.06573    0.01788   3.676 0.000272 ***
#> a_data_diff.l2 -0.40232    0.06313  -6.373 5.63e-10 ***
#> b_data_diff.l2 -0.10089    0.04015  -2.513 0.012400 *  
#> c_data_diff.l2 -0.07198    0.02943  -2.446 0.014914 *  
#> s_data_diff.l3  0.03987    0.01823   2.187 0.029401 *  
#> a_data_diff.l3 -0.22090    0.06274  -3.521 0.000485 ***
#> b_data_diff.l3 -0.06953    0.04030  -1.725 0.085306 .  
#> c_data_diff.l3 -0.02282    0.02966  -0.769 0.442136    
#> s_data_diff.l4  0.03252    0.01609   2.021 0.043964 *  
#> a_data_diff.l4 -0.17450    0.05699  -3.062 0.002365 ** 
#> b_data_diff.l4 -0.04608    0.03350  -1.375 0.169900    
#> c_data_diff.l4 -0.04756    0.02495  -1.906 0.057463 .  
#> const           1.76746    3.02246   0.585 0.559063    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 58.11 on 363 degrees of freedom
#> Multiple R-Squared: 0.2372,  Adjusted R-squared: 0.2036 
#> F-statistic: 7.056 on 16 and 363 DF,  p-value: 3.247e-14 
#> 
#> 
#> Estimation results for equation b_data_diff: 
#> ============================================ 
#> b_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + s_data_diff.l2 + a_data_diff.l2 + b_data_diff.l2 + c_data_diff.l2 + s_data_diff.l3 + a_data_diff.l3 + b_data_diff.l3 + c_data_diff.l3 + s_data_diff.l4 + a_data_diff.l4 + b_data_diff.l4 + c_data_diff.l4 + const 
#> 
#>                 Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1  0.052549   0.030123   1.744  0.08192 .  
#> a_data_diff.l1 -0.198193   0.109151  -1.816  0.07023 .  
#> b_data_diff.l1 -0.806183   0.064134 -12.570  < 2e-16 ***
#> c_data_diff.l1 -0.082504   0.046815  -1.762  0.07885 .  
#> s_data_diff.l2  0.030739   0.034144   0.900  0.36857    
#> a_data_diff.l2 -0.065624   0.120567  -0.544  0.58657    
#> b_data_diff.l2 -0.572247   0.076672  -7.464 6.29e-13 ***
#> c_data_diff.l2 -0.119509   0.056199  -2.127  0.03413 *  
#> s_data_diff.l3  0.041181   0.034820   1.183  0.23771    
#> a_data_diff.l3 -0.007054   0.119823  -0.059  0.95309    
#> b_data_diff.l3 -0.317568   0.076958  -4.126 4.57e-05 ***
#> c_data_diff.l3 -0.158564   0.056647  -2.799  0.00540 ** 
#> s_data_diff.l4  0.001143   0.030723   0.037  0.97034    
#> a_data_diff.l4  0.188398   0.108851   1.731  0.08434 .  
#> b_data_diff.l4 -0.171689   0.063986  -2.683  0.00763 ** 
#> c_data_diff.l4 -0.066062   0.047656  -1.386  0.16653    
#> const           8.703650   5.772443   1.508  0.13248    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 111 on 363 degrees of freedom
#> Multiple R-Squared: 0.3935,  Adjusted R-squared: 0.3668 
#> F-statistic: 14.72 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation c_data_diff: 
#> ============================================ 
#> c_data_diff = s_data_diff.l1 + a_data_diff.l1 + b_data_diff.l1 + c_data_diff.l1 + s_data_diff.l2 + a_data_diff.l2 + b_data_diff.l2 + c_data_diff.l2 + s_data_diff.l3 + a_data_diff.l3 + b_data_diff.l3 + c_data_diff.l3 + s_data_diff.l4 + a_data_diff.l4 + b_data_diff.l4 + c_data_diff.l4 + const 
#> 
#>                Estimate Std. Error t value Pr(>|t|)    
#> s_data_diff.l1  0.16556    0.05139   3.222 0.001390 ** 
#> a_data_diff.l1 -0.01309    0.18621  -0.070 0.943989    
#> b_data_diff.l1 -0.14162    0.10941  -1.294 0.196361    
#> c_data_diff.l1 -0.86066    0.07986 -10.777  < 2e-16 ***
#> s_data_diff.l2  0.22236    0.05825   3.817 0.000159 ***
#> a_data_diff.l2  0.36181    0.20568   1.759 0.079408 .  
#> b_data_diff.l2 -0.16957    0.13080  -1.296 0.195664    
#> c_data_diff.l2 -0.77216    0.09587  -8.054 1.16e-14 ***
#> s_data_diff.l3  0.13732    0.05940   2.312 0.021353 *  
#> a_data_diff.l3  0.13901    0.20441   0.680 0.496904    
#> b_data_diff.l3  0.02174    0.13129   0.166 0.868574    
#> c_data_diff.l3 -0.37146    0.09664  -3.844 0.000143 ***
#> s_data_diff.l4  0.01605    0.05241   0.306 0.759639    
#> a_data_diff.l4  0.22577    0.18570   1.216 0.224851    
#> b_data_diff.l4  0.11501    0.10916   1.054 0.292781    
#> c_data_diff.l4 -0.13135    0.08130  -1.616 0.107042    
#> const          10.84861    9.84753   1.102 0.271340    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 189.3 on 363 degrees of freedom
#> Multiple R-Squared: 0.3565,  Adjusted R-squared: 0.3282 
#> F-statistic: 12.57 on 16 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>             s_data_diff a_data_diff b_data_diff c_data_diff
#> s_data_diff      119398      7587.5     16357.3     43548.8
#> a_data_diff        7587      3376.8       864.7      1207.5
#> b_data_diff       16357       864.7     12316.9      -117.3
#> c_data_diff       43549      1207.5      -117.3     35845.8
#> 
#> Correlation matrix of residuals:
#>             s_data_diff a_data_diff b_data_diff c_data_diff
#> s_data_diff      1.0000      0.3779     0.42654     0.66567
#> a_data_diff      0.3779      1.0000     0.13408     0.10975
#> b_data_diff      0.4265      0.1341     1.00000    -0.00558
#> c_data_diff      0.6657      0.1097    -0.00558     1.00000

# Performing diagnostic checking for VAR(1) model
cat("\nDiagnostic checking for VAR(1) model:\n")
#> 
#> Diagnostic checking for VAR(1) model:
serial_test_var1 <- serial.test(var1_model, lags.pt=16, type="PT.asymptotic")
print(serial_test_var1)
#> 
#>  Portmanteau Test (asymptotic)
#> 
#> data:  Residuals of VAR object var1_model
#> Chi-squared = 519.49, df = 240, p-value < 2.2e-16
#> $serial
#> 
#>  Portmanteau Test (asymptotic)
#> 
#> data:  Residuals of VAR object var1_model
#> Chi-squared = 519.49, df = 240, p-value < 2.2e-16

# Performing diagnostic checking for VAR(4) model
cat("\nDiagnostic checking for VAR(4) model:\n")
#> 
#> Diagnostic checking for VAR(4) model:
serial_test_var4 <- serial.test(var4_model, lags.pt=16, type="PT.asymptotic")
print(serial_test_var4)
#> 
#>  Portmanteau Test (asymptotic)
#> 
#> data:  Residuals of VAR object var4_model
#> Chi-squared = 280.23, df = 192, p-value = 3.351e-05
#> $serial
#> 
#>  Portmanteau Test (asymptotic)
#> 
#> data:  Residuals of VAR object var4_model
#> Chi-squared = 280.23, df = 192, p-value = 3.351e-05

# Plotting the diagnostic checking results
plot(serial_test_var1)

plot(serial_test_var4)
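
# A compact side-by-side check of the two candidates (sketch): the VAR(4)
# residuals show less remaining autocorrelation (larger Portmanteau p-value)
# and the model has the lower AIC, which motivates preferring it below.
data.frame(
  model         = c("VAR(1)", "VAR(4)"),
  portmanteau_p = c(serial_test_var1$serial$p.value,
                    serial_test_var4$serial$p.value),
  AIC           = c(AIC(var1_model), AIC(var4_model))
)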

# Considering VAR(4) as the best fit; the justification is given in the model summary and diagnostics above.
# Producing a one-month-ahead forecast of the series using the VAR(4) model
forecast_results <- predict(var4_model, n.ahead = 1)

# Printing the forecast results 
cat("\nForecasts for One Month Ahead:\n")
#> 
#> Forecasts for One Month Ahead:
if (!is.null(forecast_results$fcst)) {
  # invisible() suppresses the list of NULLs that lapply would otherwise print
  invisible(lapply(names(forecast_results$fcst), function(var_name) {
    cat("\nForecast for", var_name, ":\n")
    cat("Forecast: ", forecast_results$fcst[[var_name]][1, "fcst"], "\n")
    cat("Lower 95% CI: ", forecast_results$fcst[[var_name]][1, "lower"], "\n")
    cat("Upper 95% CI: ", forecast_results$fcst[[var_name]][1, "upper"], "\n")
  }))
} else {
  cat("No forecast data available.\n")
}
#> 
#> Forecast for s_data_diff :
#> Forecast:  73.97064 
#> Lower 95% CI:  -603.2749 
#> Upper 95% CI:  751.2162 
#> 
#> Forecast for a_data_diff :
#> Forecast:  -16.38179 
#> Lower 95% CI:  -130.2757 
#> Upper 95% CI:  97.5121 
#> 
#> Forecast for b_data_diff :
#> Forecast:  -117.6317 
#> Lower 95% CI:  -335.1517 
#> Upper 95% CI:  99.8882 
#> 
#> Forecast for c_data_diff :
#> Forecast:  160.391 
#> Lower 95% CI:  -210.6885 
#> Upper 95% CI:  531.4705
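
# The VAR was estimated on differenced series, so the point forecasts above are
# one-period *changes*. A minimal sketch of converting the first series back to
# a level forecast, assuming the undifferenced object from the earlier steps is
# named s_data (a hypothetical name; adjust it to match your workspace):
last_level <- tail(as.numeric(s_data), 1)
last_level + forecast_results$fcst$s_data_diff[1, "fcst"]
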
# Plotting the historical data
plot(mts_diff, main="Historical Data and One Month Ahead Forecast", col="blue", pch=20)

# Adding the forecasted points
num_series <- ncol(mts_diff)
forecast_points <- lapply(forecast_results$fcst, function(x) x[1, "fcst"])
for (i in 1:num_series) {
  points(nrow(mts_diff) + 1, forecast_points[[i]], col="red", pch=19, cex=1.5)
}
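
# Note: with a multivariate ts, points() above draws only on the panel plotted
# last, so a single forecast marker appears. A per-series overlay (sketch):
op <- par(mfrow = c(ncol(mts_diff), 1), mar = c(2, 4, 2, 1))
mts_ts <- ts(mts_diff)        # plain time index 1..n for easy placement
next_t <- nrow(mts_ts) + 1    # position of the one-step-ahead forecast
for (nm in colnames(mts_ts)) {
  plot(mts_ts[, nm], main = nm, ylab = "diff", col = "blue",
       xlim = c(1, next_t))
  points(next_t, forecast_results$fcst[[nm]][1, "fcst"], col = "red", pch = 19)
}
par(op)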

# Granger causality between the series using the VAR(4) model

# Perform Granger causality tests
causality_s <- causality(var4_model, cause = "s_data_diff")
print(causality_s)
#> $Granger
#> 
#>  Granger causality H0: s_data_diff do not Granger-cause a_data_diff
#>  b_data_diff c_data_diff
#> 
#> data:  VAR object var4_model
#> F-Test = 2.856, df1 = 12, df2 = 1452, p-value = 0.000677
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: s_data_diff and a_data_diff
#>  b_data_diff c_data_diff
#> 
#> data:  VAR object var4_model
#> Chi-squared = 155.29, df = 3, p-value < 2.2e-16

causality_a <- causality(var4_model, cause = "a_data_diff")
print(causality_a)
#> $Granger
#> 
#>  Granger causality H0: a_data_diff do not Granger-cause s_data_diff
#>  b_data_diff c_data_diff
#> 
#> data:  VAR object var4_model
#> F-Test = 1.2162, df1 = 12, df2 = 1452, p-value = 0.2657
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: a_data_diff and s_data_diff
#>  b_data_diff c_data_diff
#> 
#> data:  VAR object var4_model
#> Chi-squared = 61.773, df = 3, p-value = 2.456e-13

causality_b <- causality(var4_model, cause = "b_data_diff")
print(causality_b)
#> $Granger
#> 
#>  Granger causality H0: b_data_diff do not Granger-cause s_data_diff
#>  a_data_diff c_data_diff
#> 
#> data:  VAR object var4_model
#> F-Test = 1.6378, df1 = 12, df2 = 1452, p-value = 0.07526
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: b_data_diff and s_data_diff
#>  a_data_diff c_data_diff
#> 
#> data:  VAR object var4_model
#> Chi-squared = 97.437, df = 3, p-value < 2.2e-16

causality_c <- causality(var4_model, cause = "c_data_diff")
print(causality_c)
#> $Granger
#> 
#>  Granger causality H0: c_data_diff do not Granger-cause s_data_diff
#>  a_data_diff b_data_diff
#> 
#> data:  VAR object var4_model
#> F-Test = 2.4322, df1 = 12, df2 = 1452, p-value = 0.003945
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: c_data_diff and s_data_diff
#>  a_data_diff b_data_diff
#> 
#> data:  VAR object var4_model
#> Chi-squared = 138.32, df = 3, p-value < 2.2e-16
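
# Collecting the Granger-causality p-values into one small table (sketch), to
# make the comparison across the four candidate "cause" series easier to read.
granger_p <- sapply(
  list(s_data_diff = causality_s, a_data_diff = causality_a,
       b_data_diff = causality_b, c_data_diff = causality_c),
  function(x) x$Granger$p.value
)
round(granger_p, 4)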

# The two sparsity structures considered are SparseOO and SparseLag
# Using the BigVAR package to fit a sparse VAR model (only the SparseLag
# specification is cross-validated below)
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.2.3
#> Loading required package: lattice

p_optimal <- 4
model_spec <- constructModel(mts_diff, p = p_optimal, struct = "SparseLag", gran = c(50, 10))
ms <- constructModel(mts_diff, p = p_optimal, struct = "SparseOO", gran = c(50, 10))

results <- cv.BigVAR(model_spec)
#> Validation Stage: SparseLag (progress bar output omitted)
#> [1] "Evaluation Stage"
plot(results)

print(results)
#> *** BIGVAR MODEL Results *** 
#> Structure
#> [1] "SparseLag"
#> Loss 
#> [1] "L2"
#> Forecast Horizon 
#> [1] 1
#> Minnesota VAR
#> [1] FALSE
#> Maximum Lag Order 
#> [1] 4
#> Optimal Lambda 
#> [1] 106700
#> Grid Depth 
#> [1] 50
#> Index of Optimal Lambda 
#> [1] 10
#> Fraction of active coefficients 
#> [1] 0.7987
#> In-Sample Loss
#> [1] 245000
#> BigVAR Out of Sample Loss
#> [1] 143000
#> *** Benchmark Results *** 
#> Conditional Mean Out of Sample Loss
#> [1] 135000
#> AIC Out of Sample Loss
#> [1] 156000
#> BIC Out of Sample Loss
#> [1] 144000
#> RW Out of Sample Loss
#> [1] 337000
# I chose the SparseLag sparsity structure for the VAR model in the BigVAR package,
# which encourages only a small number of lagged coefficient blocks to be active,
# i.e. the series are assumed to depend on just a few of their past values.
# The optimal regularization parameter lambda was 106700, balancing model
# complexity against predictive accuracy. The in-sample loss was 245000 (fit to
# the training data) and the out-of-sample loss was 143000 (performance on
# held-out data). Compared with the benchmarks, the BigVAR model outperformed all
# except the conditional-mean model, suggesting that while the penalized VAR is
# robust, a simpler model can perform comparably or better. The plot of mean
# squared forecast error (MSFE) against the lambda grid confirms that the chosen
# lambda minimizes the forecast error at an effective level of sparsity.
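
# The SparseOO specification constructed above (ms) was not cross-validated. A
# minimal sketch of running the same procedure on it, so that the reported
# "BigVAR Out of Sample Loss" of the two structures can be compared directly:
results_oo <- cv.BigVAR(ms)
print(results_oo)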

Created on 2024-04-11 with reprex v2.0.2