jlivsey / UB-sping24-time-series


Ayeshee Patra #19

Open ayesheepatra opened 4 months ago

ayesheepatra commented 4 months ago
# Load necessary libraries
library(reprex)
library(fredr)
library(dygraphs)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(xts)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric

# Set up API key to read the initial claims (ICNSA) dataset from FRED
fredr_set_key("1d2fb74509b6e2b2a9ba83de71fedf09")

# Print all the columns of the initial dataset for visualization
icnsa = fredr(series_id = "ICNSA")
columns <- colnames(icnsa)
print(columns)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
#[1] "date"           "series_id"      "value"          "realtime_start" "realtime_end" 

# Create a backup of the initial dataset
icnsa_bk <- icnsa
icnsa_bk <- ts(icnsa_bk$value, frequency = 52)  # ICNSA is weekly, so 52 observations per year

# Convert date column to Date type
icnsa$date <- as.Date(icnsa$date)

# Create a new column 'covid_time' based on a condition
icnsa$covid_time <- ifelse(icnsa$date >= as.Date("2020-01-01") & icnsa$date <= as.Date("2022-12-31"), 1, 0)

# Generate a time series object using xts
generated_time_series <- xts(cbind(value = icnsa$value, covid_time = icnsa$covid_time), order.by = icnsa$date)

# Print the generated time series
print(generated_time_series)
#>             value covid_time
#> 1967-01-07 346000          0
#> 1967-01-14 334000          0
#> 1967-01-21 277000          0
#> 1967-01-28 252000          0
#> 1967-02-04 274000          0
#> 1967-02-11 276000          0
#> 1967-02-18 247000          0
#> 1967-02-25 248000          0
#> 1967-03-04 326000          0
#> 1967-03-11 240000          0
#>        ...                  
#> 2023-11-25 199750          0
#> 2023-12-02 294615          0
#> 2023-12-09 249090          0
#> 2023-12-16 241040          0
#> 2023-12-23 274840          0
#> 2023-12-30 269409          0
#> 2024-01-06 318906          0
#> 2024-01-13 291330          0
#> 2024-01-20 249947          0
#> 2024-01-27 261029          0

# Plot the generated time series
plot(generated_time_series, main = "Initial Claims (ICNSA) Time Series")


# Decomposition of the time series
icnsa_decomposition <- decompose(icnsa_bk)
plot(icnsa_decomposition)


# Print the structure of the generated time series
print(str(generated_time_series))
#> An xts object on 1967-01-07 / 2024-01-27 containing: 
#>   Data:    double [2978, 2]
#>   Columns: value, covid_time
#>   Index:   Date [2978] (TZ: "UTC")
#> NULL
# Fit an ARIMA time series model with COVID time variable as a regressor
arima_timeseries_model <- auto.arima(generated_time_series[,"value"], xreg = generated_time_series[,"covid_time"])

# Display a summary of the ARIMA model
summary(arima_timeseries_model)
#> Series: generated_time_series[, "value"] 
#> Regression with ARIMA(5,0,2) errors 
#> 
#> Coefficients:
#>          ar1      ar2     ar3      ar4     ar5    ma1     ma2  intercept
#>       0.3159  -0.1505  0.8228  -0.3928  0.1234  1.026  0.9756  359539.38
#> s.e.  0.0223   0.0228  0.0176   0.0200  0.0191  0.013  0.0101   17008.22
#>       covid_time
#>        128563.38
#> s.e.    48351.73
#> 
#> sigma^2 = 7.465e+09:  log likelihood = -38072.69
#> AIC=76165.37   AICc=76165.45   BIC=76225.36
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE     MPE     MAPE      MASE         ACF1
#> Training set -130.095 86272.33 41770.79 -2.0614 10.91033 0.1144081 0.0005112262

# Forecast values using the ARIMA model, predicting 12 periods into the future
forecasted_values <- forecast(arima_timeseries_model, h = 12, xreg = matrix(1, ncol = 1, nrow = 12))
#> Warning in forecast.forecast_ARIMA(arima_timeseries_model, h = 12, xreg =
#> matrix(1, : xreg contains different column names from the xreg used in
#> training. Please check that the regressors are in the same order.
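# Sketch (not part of the original run): the warning above appears because the
# future xreg matrix is unnamed while the training regressor was named
# "covid_time"; giving the column the same name avoids it.
future_xreg <- matrix(1, nrow = 12, ncol = 1,
                      dimnames = list(NULL, "covid_time"))
forecasted_values <- forecast(arima_timeseries_model, h = 12, xreg = future_xreg)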

# Plot the forecasted values with the COVID time variable
plot(forecasted_values, main = "Unemployment Forecast with COVID Time Variable")


print(forecasted_values)
#>       Point Forecast    Lo 80    Hi 80       Lo 95    Hi 95
#> 20847       402868.5 292138.6 513598.4 233521.7403 572215.2
#> 20854       416116.4 230802.8 601430.1 132703.6485 699529.2
#> 20861       431765.1 200528.5 663001.7  78119.2401 785411.0
#> 20868       436176.8 179054.0 693299.7  42941.3711 829412.3
#> 20875       442268.6 168683.8 715853.4  23856.8106 860680.4
#> 20882       452839.5 167315.0 738364.0  16167.4691 889511.6
#> 20889       454380.7 160808.0 747953.3   5400.0508 903361.3
#> 20896       458487.4 159028.5 757946.4    504.4737 916470.4
#> 20903       466402.2 162327.6 770476.8   1360.2145 931444.2
#> 20910       466152.2 158996.6 773307.9  -3601.8180 935906.3
#> 20917       468960.5 159506.3 778414.8  -4308.9461 942230.0
#> 20924       474974.5 163657.2 786291.8  -1144.2061 951093.3

mean_forecast <- mean(forecasted_values$mean, na.rm = TRUE)
print(mean_forecast)
#> [1] 447616

Created on 2024-02-08 with reprex v2.1.0

ayesheepatra commented 4 months ago

Smoothed the COVID-period peak in claims using a moving-average model with a one-year (52-week) window

# Load necessary libraries
library(reprex)
library(fredr)
library(dygraphs)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(xts)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)

# Set up API key to read the initial claims (ICNSA) dataset from FRED
fredr_set_key("1d2fb74509b6e2b2a9ba83de71fedf09")

# Print all the columns of the initial dataset for visualization
icnsa = fredr(series_id = "ICNSA")
columns <- colnames(icnsa)
print(columns)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
#[1] "date"           "series_id"      "value"          "realtime_start" "realtime_end" 

# Create a backup of the initial dataset
icnsa_bk <- icnsa
icnsa_bk <- ts(icnsa_bk$value, frequency = 52)  # ICNSA is weekly, so 52 observations per year

# Convert date column to Date type
icnsa$date <- as.Date(icnsa$date)

# Create a new column 'covid_time' based on a condition
icnsa$covid_time <- ifelse(icnsa$date >= as.Date("2020-01-01") & icnsa$date <= as.Date("2022-12-31"), 1, 0)
generated_time_series <- xts(cbind(value = icnsa$value, covid_time = icnsa$covid_time), order.by = icnsa$date)

# Print the generated time series
print(generated_time_series)
#>             value covid_time
#> 1967-01-07 346000          0
#> 1967-01-14 334000          0
#> 1967-01-21 277000          0
#> 1967-01-28 252000          0
#> 1967-02-04 274000          0
#> 1967-02-11 276000          0
#> 1967-02-18 247000          0
#> 1967-02-25 248000          0
#> 1967-03-04 326000          0
#> 1967-03-11 240000          0
#>        ...                  
#> 2023-12-02 294615          0
#> 2023-12-09 249090          0
#> 2023-12-16 241040          0
#> 2023-12-23 274840          0
#> 2023-12-30 269409          0
#> 2024-01-06 318906          0
#> 2024-01-13 291330          0
#> 2024-01-20 249947          0
#> 2024-01-27 263919          0
#> 2024-02-03 232727          0
plot(generated_time_series, main = "Initial Claims (ICNSA) Time Series")


# Decomposition of Time series
icnsa_decomposition <- decompose(icnsa_bk)
plot(icnsa_decomposition)

print(str(generated_time_series))
#> An xts object on 1967-01-07 / 2024-02-03 containing: 
#>   Data:    double [2979, 2]
#>   Columns: value, covid_time
#>   Index:   Date [2979] (TZ: "UTC")
#> NULL
arima_timeseries_model <- auto.arima(generated_time_series[,"value"], xreg = generated_time_series[,"covid_time"])

summary(arima_timeseries_model)
#> Series: generated_time_series[, "value"] 
#> Regression with ARIMA(5,0,2) errors 
#> 
#> Coefficients:
#>          ar1      ar2     ar3      ar4     ar5     ma1     ma2  intercept
#>       0.3153  -0.1500  0.8233  -0.3932  0.1237  1.0267  0.9761  358753.45
#> s.e.  0.0221   0.0226  0.0176   0.0200  0.0191  0.0126  0.0099   17059.72
#>       covid_time
#>        126260.06
#> s.e.    48367.48
#> 
#> sigma^2 = 7.464e+09:  log likelihood = -38085.11
#> AIC=76190.22   AICc=76190.3   BIC=76250.22
#> 
#> Training set error measures:
#>                     ME     RMSE     MAE       MPE     MAPE      MASE
#> Training set -59.20931 86261.83 41755.3 -2.038431 10.90387 0.1143793
#>                     ACF1
#> Training set 0.000529841
window_size <- 52
# Smooth the COVID-period peak in claims using a moving average
icnsa$smoothed_value <- zoo::rollmean(icnsa$value, k = window_size, fill = NA)

# Plot the smoothed data
ggplot(icnsa, aes(x = date, y = smoothed_value)) +
  geom_line() +
  labs(title = "Smoothed Unemployment Numbers Over Time", x = "Date", y = "Smoothed Unemployment Numbers") +
  theme_minimal()
#> Warning: Removed 51 rows containing missing values (`geom_line()`).
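# Note: rollmean() is centered by default, which is why 51 (= k - 1) values at
# the ends are NA above. Sketch of a trailing (past-only) alternative;
# `smoothed_trailing` is an illustrative column name.
icnsa$smoothed_trailing <- zoo::rollmean(icnsa$value, k = window_size,
                                         fill = NA, align = "right")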


forecasted_values <- forecast(arima_timeseries_model, h = 12, xreg = matrix(1, ncol = 1, nrow = 12))
#> Warning in forecast.forecast_ARIMA(arima_timeseries_model, h = 12, xreg =
#> matrix(1, : xreg contains different column names from the xreg used in
#> training. Please check that the regressors are in the same order.

# Plot the forecasted values with the COVID time variable
plot(forecasted_values, main = "Unemployment Forecast with COVID Time Variable")

print(forecasted_values)
#>       Point Forecast    Lo 80    Hi 80       Lo 95    Hi 95
#> 20854       356464.6 245748.3 467181.0 187138.5643 525790.7
#> 20861       375736.3 190442.2 561030.3  92353.4301 659119.1
#> 20868       389917.4 158698.7 621136.1  36298.8774 743535.9
#> 20875       403413.5 146290.3 660536.8  10177.4320 796649.7
#> 20882       418542.9 144943.9 692141.9    109.3981 836976.4
#> 20889       425074.6 139523.9 710625.4 -11637.4830 861786.8
#> 20896       432783.2 139168.6 726397.7 -16261.5239 881827.9
#> 20903       443136.8 143623.6 742649.9 -14929.0409 901202.6
#> 20910       446342.6 142203.7 750481.4 -18797.6776 911482.8
#> 20917       451449.4 144219.3 758679.4 -18418.4072 921317.1
#> 20924       458879.4 149343.1 768415.8 -14515.5311 932274.4
#> 20931       459977.6 148572.0 771383.2 -16276.1798 936231.4
# Print the structure of forecasted_values
str(forecasted_values)
#> List of 10
#>  $ method   : chr "Regression with ARIMA(5,0,2) errors"
#>  $ model    :List of 19
#>   ..$ coef     : Named num [1:9] 0.315 -0.15 0.823 -0.393 0.124 ...
#>   .. ..- attr(*, "names")= chr [1:9] "ar1" "ar2" "ar3" "ar4" ...
#>   ..$ sigma2   : num 7.46e+09
#>   ..$ var.coef : num [1:9, 1:9] 4.89e-04 -2.24e-04 -1.31e-05 -1.82e-04 5.50e-05 ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : chr [1:9] "ar1" "ar2" "ar3" "ar4" ...
#>   .. .. ..$ : chr [1:9] "ar1" "ar2" "ar3" "ar4" ...
#>   ..$ mask     : logi [1:9] TRUE TRUE TRUE TRUE TRUE TRUE ...
#>   ..$ loglik   : num -38085
#>   ..$ aic      : num 76190
#>   ..$ arma     : int [1:7] 5 2 0 0 0 0 0
#>   ..$ residuals: Time-Series [1:2979] from 1 to 20847: -4464 -11769 -54216 -9478 15598 ...
#>   ..$ call     : language auto.arima(y = generated_time_series[, "value"], xreg = c(0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | __truncated__ ...
#>   ..$ series   : chr "generated_time_series[, \"value\"]"
#>   ..$ code     : int 0
#>   ..$ n.cond   : int 0
#>   ..$ nobs     : int 2979
#>   ..$ model    :List of 10
#>   .. ..$ phi  : num [1:5] 0.315 -0.15 0.823 -0.393 0.124
#>   .. ..$ theta: num [1:4] 1.027 0.976 0 0
#>   .. ..$ Delta: num(0) 
#>   .. ..$ Z    : num [1:5] 1 0 0 0 0
#>   .. ..$ a    : num [1:5] -126026 -88814 -87650 23834 -11726
#>   .. ..$ P    : num [1:5, 1:5] 0 0 0 0 0 ...
#>   .. ..$ T    : num [1:5, 1:5] 0.315 -0.15 0.823 -0.393 0.124 ...
#>   .. ..$ V    : num [1:5, 1:5] 1 1.027 0.976 0 0 ...
#>   .. ..$ h    : num 0
#>   .. ..$ Pn   : num [1:5, 1:5] 1 1.027 0.976 0 0 ...
#>   ..$ bic      : num 76250
#>   ..$ aicc     : num 76190
#>   ..$ xreg     : num [1:2979, 1] 0 0 0 0 0 0 0 0 0 0 ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : chr [1:2979] "1967-01-07" "1967-01-14" "1967-01-21" "1967-01-28" ...
#>   .. .. ..$ : chr "covid_time"
#>   ..$ x        : Time-Series [1:2979] from 1 to 20847: 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#>   ..$ fitted   : Time-Series [1:2979] from 1 to 20847: 350464 345769 331216 261478 258402 ...
#>   ..- attr(*, "class")= chr [1:3] "forecast_ARIMA" "ARIMA" "Arima"
#>  $ level    : num [1:2] 80 95
#>  $ mean     : Time-Series [1:12] from 20854 to 20931: 356465 375736 389917 403414 418543 ...
#>  $ lower    : Time-Series [1:12, 1:2] from 20854 to 20931: 245748 190442 158699 146290 144944 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr [1:2] "80%" "95%"
#>  $ upper    : Time-Series [1:12, 1:2] from 20854 to 20931: 467181 561030 621136 660537 692142 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr [1:2] "80%" "95%"
#>  $ x        : Time-Series [1:2979] from 1 to 20847: 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#>  $ series   : chr "generated_time_series[, \"value\"]"
#>  $ fitted   : Time-Series [1:2979] from 1 to 20847: 350464 345769 331216 261478 258402 ...
#>  $ residuals: Time-Series [1:2979] from 1 to 20847: -4464 -11769 -54216 -9478 15598 ...
#>  - attr(*, "class")= chr "forecast"

Created on 2024-02-09 with reprex v2.1.0

ayesheepatra commented 4 months ago

Monthly data analysis to determine p, d, q for an ARIMA model

library(reprex)
library(fredr)
library(dygraphs)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(xts)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
library(tseries)

# Set up API key to read a monthly dataset from FRED
fredr_set_key("1d2fb74509b6e2b2a9ba83de71fedf09")

# Print all the columns of the initial dataset for visualization
monthly = fredr(series_id = "M0417BDE00BERM370NNBR")
columns <- colnames(monthly)
print(columns)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
plot(monthly)

head(monthly)
#> # A tibble: 6 × 5
#>   date       series_id             value realtime_start realtime_end
#>   <date>     <chr>                 <dbl> <date>         <date>      
#> 1 1897-07-01 M0417BDE00BERM370NNBR  114. 2024-02-13     2024-02-13  
#> 2 1897-08-01 M0417BDE00BERM370NNBR  116. 2024-02-13     2024-02-13  
#> 3 1897-09-01 M0417BDE00BERM370NNBR  115. 2024-02-13     2024-02-13  
#> 4 1897-10-01 M0417BDE00BERM370NNBR  116. 2024-02-13     2024-02-13  
#> 5 1897-11-01 M0417BDE00BERM370NNBR  114. 2024-02-13     2024-02-13  
#> 6 1897-12-01 M0417BDE00BERM370NNBR  113. 2024-02-13     2024-02-13
generated_time_series <- xts(cbind(value = monthly$value, date = monthly$date), order.by = monthly$date)
plot(generated_time_series)

ts_data <- ts(monthly$value, frequency = 12)
# Seasonal Decomposition Plot of data
plot(stl(ts_data, s.window="periodic"), main = "Seasonal Decomposition")

# The data shows both trend and seasonality
# Plot the ACF to understand the autocorrelation pattern
acf(ts_data, main = "Autocorrelation Function (ACF)")

# The ACF decays quickly: observations are correlated at short lags, but distant observations are not
pacf(ts_data, main = "Partial Autocorrelation Function (PACF)")

# The PACF stays well within the confidence interval, so a lower-order AR model can be considered

adf_result <- adf.test(monthly$value)
#> Warning in adf.test(monthly$value): p-value greater than printed p-value

#results
cat("ADF Statistic:", adf_result$statistic, "\n")
#> ADF Statistic: 5.602348
cat("p-value:", adf_result$p.value, "\n")
#> p-value: 0.99
cat("Critical Values:", adf_result$critical, "\n")
#> Critical Values:
cat("Test Type:", adf_result$test, "\n")
#> Test Type:
# The p-value is high, indicating the series is not stationary and needs differencing
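# Sketch: instead of judging d by eye, forecast::ndiffs() estimates the
# regular differencing order (KPSS test by default) and nsdiffs() the
# seasonal one.
d_hat <- ndiffs(ts_data)
D_hat <- nsdiffs(ts_data)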

arima_model1 <- Arima(ts_data, order = c(1, 2, 1))
summary(arima_model1)
#> Series: ts_data 
#> ARIMA(1,2,1) 
#> 
#> Coefficients:
#>           ar1      ma1
#>       -0.2487  -0.2797
#> s.e.   0.2051   0.1925
#> 
#> sigma^2 = 32.89:  log likelihood = -701.89
#> AIC=1409.78   AICc=1409.89   BIC=1419.99
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE       MPE     MAPE      MASE
#> Training set 0.3400288 5.683833 3.335583 0.1229903 2.108703 0.2990036
#>                      ACF1
#> Training set -0.001010589

arima_model2 <- Arima(ts_data, order = c(1, 3, 1))
summary(arima_model2)
#> Series: ts_data 
#> ARIMA(1,3,1) 
#> 
#> Coefficients:
#>           ar1      ma1
#>       -0.4927  -0.9857
#> s.e.   0.0861   0.0213
#> 
#> sigma^2 = 33.47:  log likelihood = -702.83
#> AIC=1411.66   AICc=1411.77   BIC=1421.86
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE       MPE    MAPE      MASE        ACF1
#> Training set 0.387566 5.720577 3.435684 0.1831659 2.18323 0.3079767 -0.03727901

arima_model3 <- Arima(ts_data, order = c(2, 3, 2))
summary(arima_model3)
#> Series: ts_data 
#> ARIMA(2,3,2) 
#> 
#> Coefficients:
#>         ar1     ar2      ma1     ma2
#>       0.281  0.2052  -1.9667  0.9999
#> s.e.  0.093  0.0953   0.0187  0.0187
#> 
#> sigma^2 = 30.01:  log likelihood = -692.75
#> AIC=1395.49   AICc=1395.77   BIC=1412.49
#> 
#> Training set error measures:
#>                     ME    RMSE      MAE       MPE    MAPE      MASE        ACF1
#> Training set 0.3707702 5.39157 3.156113 0.1656218 2.00119 0.2829158 0.007535675
# AICc is reduced relative to the two previous fits
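# Cross-check (sketch, not part of the original run): let auto.arima()
# search over p, d, q and compare its AICc against the manual fits above.
auto_fit <- auto.arima(ts_data)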

# Forecast values 
predicted_values <- forecast(arima_model3, h = 12)
print(predicted_values)
#>        Point Forecast    Lo 80     Hi 80    Lo 95     Hi 95
#> Sep 19       391.7538 384.7030  398.8046 380.9705  402.5371
#> Oct 19       431.7470 420.0547  443.4394 413.8651  449.6289
#> Nov 19       468.1901 451.4773  484.9029 442.6301  493.7501
#> Dec 19       507.1262 485.3855  528.8670 473.8766  540.3758
#> Jan 20       548.4500 521.4638  575.4362 507.1782  589.7218
#> Feb 20       593.3721 560.8204  625.9237 543.5886  643.1555
#> Mar 20       642.2109 603.6272  680.7946 583.2023  701.2195
#> Apr 20       695.3045 650.0914  740.5177 626.1570  764.4520
#> May 20       752.8133 700.2462  805.3804 672.4189  833.2077
#> Jun 20       814.8516 754.0895  875.6138 721.9239  907.7793
#> Jul 20       881.4846 811.5792  951.3900 774.5735  988.3957
#> Aug 20       952.7539 872.6598 1032.8481 830.2605 1075.2473

Created on 2024-02-13 with reprex v2.1.0

ayesheepatra commented 4 months ago

Homework 1: regARIMA model

# Load required packages
library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
library(tseries)
library(tidyverse)
library(ggplot2)
library(corrplot)
#> corrplot 0.92 loaded
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
# Set up API key to read the initial claims (ICNSA) dataset from FRED
fredr_set_key('1d2fb74509b6e2b2a9ba83de71fedf09')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
icnsa_data <- fredr(series_id = 'ICNSA')
icnsa_data$Date <- as.Date(icnsa_data$date)
head(icnsa_data, n = 5)
#> # A tibble: 5 × 6
#>   date       series_id  value realtime_start realtime_end Date      
#>   <date>     <chr>      <dbl> <date>         <date>       <date>    
#> 1 1967-01-07 ICNSA     346000 2024-02-22     2024-02-22   1967-01-07
#> 2 1967-01-14 ICNSA     334000 2024-02-22     2024-02-22   1967-01-14
#> 3 1967-01-21 ICNSA     277000 2024-02-22     2024-02-22   1967-01-21
#> 4 1967-01-28 ICNSA     252000 2024-02-22     2024-02-22   1967-01-28
#> 5 1967-02-04 ICNSA     274000 2024-02-22     2024-02-22   1967-02-04

# Display the CCNSA data
ccnsa_data <- fredr(series_id = 'CCNSA')
ccnsa_data$Date <- as.Date(ccnsa_data$date)
head(ccnsa_data, n = 5)
#> # A tibble: 5 × 6
#>   date       series_id   value realtime_start realtime_end Date      
#>   <date>     <chr>       <dbl> <date>         <date>       <date>    
#> 1 1967-01-07 CCNSA     1594000 2024-02-22     2024-02-22   1967-01-07
#> 2 1967-01-14 CCNSA     1563000 2024-02-22     2024-02-22   1967-01-14
#> 3 1967-01-21 CCNSA     1551000 2024-02-22     2024-02-22   1967-01-21
#> 4 1967-01-28 CCNSA     1533000 2024-02-22     2024-02-22   1967-01-28
#> 5 1967-02-04 CCNSA     1534000 2024-02-22     2024-02-22   1967-02-04

# creating time series object
icnsa_ts <- ts(icnsa_data$value, frequency = 52)
ccnsa_ts <- ts(ccnsa_data$value, frequency = 52)

# plot ICNSA
plot(icnsa_ts, main = "ICNSA", xlab = "Date", ylab = "Initial Claims")

plot(stl(icnsa_ts, s.window="periodic"), main = "Seasonal Decomposition")

acf(icnsa_ts, main = "ACF for ICNSA")

pacf(icnsa_ts, main = "PACF for ICNSA")


# plot CCNSA
plot(ccnsa_ts, main = "CCNSA", xlab = "Date", ylab = "Continued Claims")

plot(stl(ccnsa_ts, s.window="periodic"), main = "Seasonal Decomposition")

acf(ccnsa_ts, main = "ACF for CCNSA")

pacf(ccnsa_ts, main = "PACF for CCNSA")


# Run the ADF test to check for stationarity
adf.test(icnsa_ts, alternative="stationary")
#> Warning in adf.test(icnsa_ts, alternative = "stationary"): p-value smaller than
#> printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  icnsa_ts
#> Dickey-Fuller = -8.8664, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary
adf.test(ccnsa_ts, alternative="stationary")
#> Warning in adf.test(ccnsa_ts, alternative = "stationary"): p-value smaller than
#> printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ccnsa_ts
#> Dickey-Fuller = -7.5226, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary

merged_data <- merge(icnsa_data, ccnsa_data, by = "Date", suffixes = c("_icn", "_ccn"))
head(merged_data, n =5)
#>         Date   date_icn series_id_icn value_icn realtime_start_icn
#> 1 1967-01-07 1967-01-07         ICNSA    346000         2024-02-22
#> 2 1967-01-14 1967-01-14         ICNSA    334000         2024-02-22
#> 3 1967-01-21 1967-01-21         ICNSA    277000         2024-02-22
#> 4 1967-01-28 1967-01-28         ICNSA    252000         2024-02-22
#> 5 1967-02-04 1967-02-04         ICNSA    274000         2024-02-22
#>   realtime_end_icn   date_ccn series_id_ccn value_ccn realtime_start_ccn
#> 1       2024-02-22 1967-01-07         CCNSA   1594000         2024-02-22
#> 2       2024-02-22 1967-01-14         CCNSA   1563000         2024-02-22
#> 3       2024-02-22 1967-01-21         CCNSA   1551000         2024-02-22
#> 4       2024-02-22 1967-01-28         CCNSA   1533000         2024-02-22
#> 5       2024-02-22 1967-02-04         CCNSA   1534000         2024-02-22
#>   realtime_end_ccn
#> 1       2024-02-22
#> 2       2024-02-22
#> 3       2024-02-22
#> 4       2024-02-22
#> 5       2024-02-22

#checking for correlation
corr <- cor(merged_data$value_icn, merged_data$value_ccn, use = "complete.obs")
corr
#> [1] 0.714954
# A correlation of 0.71 indicates the two series are strongly related

# cross-correlation function
ccf(merged_data$value_icn, merged_data$value_ccn, lag.max = 20, main = "CCF: ICNSA and CCNSA")


# update the ts object for the merged data
icnsa_ts <- ts(merged_data$value_icn, frequency = 52)
ccnsa_ts <- ts(merged_data$value_ccn, frequency = 52)

# regARIMA model
model <- auto.arima(icnsa_ts, xreg = ccnsa_ts)
summary(model)
#> Series: icnsa_ts 
#> Regression with ARIMA(0,1,1)(0,0,1)[52] errors 
#> 
#> Coefficients:
#>          ma1    sma1      drift     xreg
#>       0.4973  0.2557    55.5232  -0.0383
#> s.e.  0.0161  0.0161  2953.2570   0.0066
#> 
#> sigma^2 = 7.406e+09:  log likelihood = -38063.85
#> AIC=76137.71   AICc=76137.73   BIC=76167.7
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set -52.27764 85985.98 38993.81 -0.5353816 9.804793 0.4106407
#>                   ACF1
#> Training set 0.0394591

# Forecasted value (named fcst to avoid masking forecast::forecast)
fcst <- forecast(model, xreg = tail(merged_data$value_ccn, 1), h = 1)
forecast_mean <- mean(fcst$mean, na.rm = TRUE)
cat("Forecasted Value:", forecast_mean, "\n")
#> Forecasted Value: 211577.3

mean_diff <- mean(diff(icnsa_ts))
mean_diff
#> [1] -37.36434

Created on 2024-02-22 with reprex v2.1.0

ayesheepatra commented 4 months ago

Homework 2: Using smoothing and spline methods to impute values during the peak COVID period, then predicting the value for the current period

# Load required packages
library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
library(tseries)
library(tidyverse)
library(ggplot2)
library(corrplot)
#> corrplot 0.92 loaded
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
# Set up API key to read the initial claims (ICNSA) dataset from FRED
fredr_set_key('1d2fb74509b6e2b2a9ba83de71fedf09')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
icnsa_data <- fredr(series_id = 'ICNSA')
icnsa_data$date <- as.Date(icnsa_data$date)
column_names_icnsa <- colnames(icnsa_data)
print(column_names_icnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
head(icnsa_data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-28     2024-02-28  
#> 2 1967-01-14 ICNSA     334000 2024-02-28     2024-02-28  
#> 3 1967-01-21 ICNSA     277000 2024-02-28     2024-02-28  
#> 4 1967-01-28 ICNSA     252000 2024-02-28     2024-02-28  
#> 5 1967-02-04 ICNSA     274000 2024-02-28     2024-02-28
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA over Time", x = "Date", y = "ICNSA Value") +
  theme_minimal()


#setting the covid period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2022-01-31")

ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(covid_start), linetype = "dashed", color = "red") +
  geom_vline(xintercept = as.numeric(covid_end), linetype = "dashed", color = "red") +
  labs(title = "ICNSA Marking Covid Period ",
       x = "Year",
       y = "Number")


#setting the values during the COVID date range to NA
icnsa_data$value[icnsa_data$date >= covid_start & icnsa_data$date <= covid_end] <- NA

covid_period_data <- icnsa_data[icnsa_data$date >= covid_start & icnsa_data$date <= covid_end, ]

# Print and plot the COVID-period subset of the dataframe to verify the values are NA
print(covid_period_data)
#> # A tibble: 109 × 5
#>    date       series_id value realtime_start realtime_end
#>    <date>     <chr>     <dbl> <date>         <date>      
#>  1 2020-01-04 ICNSA        NA 2024-02-28     2024-02-28  
#>  2 2020-01-11 ICNSA        NA 2024-02-28     2024-02-28  
#>  3 2020-01-18 ICNSA        NA 2024-02-28     2024-02-28  
#>  4 2020-01-25 ICNSA        NA 2024-02-28     2024-02-28  
#>  5 2020-02-01 ICNSA        NA 2024-02-28     2024-02-28  
#>  6 2020-02-08 ICNSA        NA 2024-02-28     2024-02-28  
#>  7 2020-02-15 ICNSA        NA 2024-02-28     2024-02-28  
#>  8 2020-02-22 ICNSA        NA 2024-02-28     2024-02-28  
#>  9 2020-02-29 ICNSA        NA 2024-02-28     2024-02-28  
#> 10 2020-03-07 ICNSA        NA 2024-02-28     2024-02-28  
#> # ℹ 99 more rows
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(covid_start), linetype = "dashed", color = "red") +
  geom_vline(xintercept = as.numeric(covid_end), linetype = "dashed", color = "red") +
  labs(title = "ICNSA Marking Covid Period ",
       x = "Year",
       y = "Number")


# Apply cubic spline imputation for the COVID period
icnsa_data$value <- na.spline(icnsa_data$value)

# Plot the data with imputed values
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA over Time (Imputed Values for COVID Period)", x = "Date", y = "ICNSA Value") +
  theme_minimal()


#filtering covid data out 
icnsa_non_covid <- icnsa_data %>%
  filter(date < covid_start | date > covid_end)

spar_values <- c(0.1, 0.4, 1)
# Prepare the plot window
par(mfrow=c(3, 1)) 

for(spar in spar_values) {
  # Fit smooth spline with current spar value
  fit_spline <- smooth.spline(as.numeric(icnsa_non_covid$date), icnsa_non_covid$value, spar=spar)

  # Predict using the fitted spline for the entire period
  predicted_values <- predict(fit_spline, as.numeric(icnsa_data$date))

  # Plot
  plot(icnsa_data$date, icnsa_data$value, type='l', main=paste("Spar =", spar),
       xlab="Date", ylab="ICNSA Values", col="blue")
  lines(icnsa_data$date, predicted_values$y, col="red")
  legend("topleft", legend=c("Imputed", "Smoothed"), col=c("blue", "red"), lty=1)
}

# Choosing spar = 0.4 based on the visual comparison above

fit_spline <- smooth.spline(as.numeric(icnsa_non_covid$date), icnsa_non_covid$value, spar=0.4)

predicted_values <- predict(fit_spline, as.numeric(icnsa_data$date))

plot(icnsa_data$date, icnsa_data$value, type='l', main=paste("Spar =", "0.4"),
     xlab="Date", ylab="ICNSA Values", col="blue")
lines(icnsa_data$date, predicted_values$y, col="red")
legend("topleft", legend=c("Imputed", "Smoothed"), col=c("blue", "red"), lty=1)

# Use both a multiplicative and an additive Holt-Winters model to forecast the next value
ts_data <- ts(icnsa_data$value, frequency = 52)

hwm_multi <- HoltWinters(ts_data, seasonal = "multiplicative")
forecast_hwm_multi <- forecast(hwm_multi, h = 1)
print(forecast_hwm_multi)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 58.32692       190326.5 149762.9 230890.1 128289.8 252363.2
forecast_hwm_multi_mean <- mean(forecast_hwm_multi$mean, na.rm = TRUE)
print(forecast_hwm_multi_mean)
#> [1] 190326.5

hwm_additive <- HoltWinters(ts_data, seasonal = "additive")
forecast_hwm_additive <- forecast(hwm_additive, h = 1)
print(forecast_hwm_additive)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 58.32692       217107.6 176164.8 258050.4 154490.9 279724.3
forecast_hwm_additive_mean <- mean(forecast_hwm_additive$mean, na.rm = TRUE)
print(forecast_hwm_additive_mean)
#> [1] 217107.6
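# A simple in-sample check (sketch) for choosing between the two fits:
# each HoltWinters object stores its final sum of squared errors.
hw_sse <- c(multiplicative = hwm_multi$SSE, additive = hwm_additive$SSE)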

Created on 2024-02-28 with reprex v2.1.0

ayesheepatra commented 4 months ago

HW3: Imputing COVID-period data using a structural time series model, then fitting a model with trend and seasonal components to predict next week's value

# Load required packages
library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(fredr)
library(tseries)
library(tidyverse)
library(ggplot2)
library(corrplot)
#> corrplot 0.92 loaded
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(xts)
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(stats)
# Set up API key to read the initial claims (ICNSA) dataset from FRED
fredr_set_key('1d2fb74509b6e2b2a9ba83de71fedf09')

# Get the ICNSA data
# Print all the columns of the initial dataset for visualization
icnsa_data <- fredr(series_id = 'ICNSA')
icnsa_data$date <- as.Date(icnsa_data$date)
column_names_icnsa <- colnames(icnsa_data)
print(column_names_icnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
head(icnsa_data, n = 5)
#> # A tibble: 5 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-03-07     2024-03-07  
#> 2 1967-01-14 ICNSA     334000 2024-03-07     2024-03-07  
#> 3 1967-01-21 ICNSA     277000 2024-03-07     2024-03-07  
#> 4 1967-01-28 ICNSA     252000 2024-03-07     2024-03-07  
#> 5 1967-02-04 ICNSA     274000 2024-03-07     2024-03-07

# Change point detection
library(changepoint)
#> Successfully loaded changepoint package version 2.2.4
#>  See NEWS for details of changes.

ts_data <- icnsa_data$value
cpt_result <- cpt.mean(ts_data)
plot(cpt_result, ts_data)

# Get the indices of change points
change_point_indices <- cpts(cpt_result)
change_point_date <- icnsa_data$date[change_point_indices-1]
print(change_point_date)
#> [1] "2020-03-07"

covid_start <- as.Date(change_point_date)
covid_end <- as.Date("2021-01-02")

# Identify the rows that fall within the COVID period
covid_start_idx <- which(icnsa_data$date == covid_start)
covid_end_idx <- which(icnsa_data$date == covid_end)
covid_idx <- rep(0, nrow(icnsa_data))
covid_idx[covid_start_idx:covid_end_idx] <- 1
covid_idx <- as.logical(covid_idx)

#marking the covid period
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(covid_start), linetype = "dashed", color = "red") +
  geom_vline(xintercept = as.numeric(covid_end), linetype = "dashed", color = "red") +
  labs(title = "ICNSA Marking Covid Period ",
       x = "Year",
       y = "Number")


#setting the values during the COVID date range to NA
icnsa_data$value[icnsa_data$date >= covid_start & icnsa_data$date <= covid_end] <- NA

covid_period_data <- icnsa_data[icnsa_data$date >= covid_start & icnsa_data$date <= covid_end, ]

# Print and plot the COVID-period subset of the dataframe to verify the values are NA
print(covid_period_data)
#> # A tibble: 44 × 5
#>    date       series_id value realtime_start realtime_end
#>    <date>     <chr>     <dbl> <date>         <date>      
#>  1 2020-03-07 ICNSA        NA 2024-03-07     2024-03-07  
#>  2 2020-03-14 ICNSA        NA 2024-03-07     2024-03-07  
#>  3 2020-03-21 ICNSA        NA 2024-03-07     2024-03-07  
#>  4 2020-03-28 ICNSA        NA 2024-03-07     2024-03-07  
#>  5 2020-04-04 ICNSA        NA 2024-03-07     2024-03-07  
#>  6 2020-04-11 ICNSA        NA 2024-03-07     2024-03-07  
#>  7 2020-04-18 ICNSA        NA 2024-03-07     2024-03-07  
#>  8 2020-04-25 ICNSA        NA 2024-03-07     2024-03-07  
#>  9 2020-05-02 ICNSA        NA 2024-03-07     2024-03-07  
#> 10 2020-05-09 ICNSA        NA 2024-03-07     2024-03-07  
#> # ℹ 34 more rows
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(covid_start), linetype = "dashed", color = "red") +
  geom_vline(xintercept = as.numeric(covid_end), linetype = "dashed", color = "red") +
  labs(title = "ICNSA Marking Covid Period ",
       x = "Year",
       y = "Number")

# Convert data to a time series object
# (start() on a plain Date vector just returns 1; start = c(1967, 1) would anchor the true start year)
ts_data <- ts(icnsa_data$value, start = start(icnsa_data$date), frequency = 52)

# Extend the missing window beyond the detected change-point range, through January 2022
covid_start <- as.Date("2021-01-02")
covid_end <- as.Date("2022-01-31")
ts_data[icnsa_data$date >= covid_start & icnsa_data$date <= covid_end] <- NA

# Fit a local linear trend model
model_fit <- StructTS(ts_data, type = "trend")

print(model_fit)
#> 
#> Call:
#> StructTS(x = ts_data, type = "trend")
#> 
#> Variances:
#>     level      slope    epsilon  
#> 1.564e+09  0.000e+00  5.446e+08
# Use tsSmooth() (Kalman smoothing) to obtain the smoothed state estimates
smoothed_model <- tsSmooth(model_fit)

# Updating NA values
icnsa_data$value[covid_idx] <- ifelse(is.na(icnsa_data$value), smoothed_model[covid_idx], icnsa_data$value)
#> Warning in icnsa_data$value[covid_idx] <- ifelse(is.na(icnsa_data$value), :
#> number of items to replace is not a multiple of replacement length
#icnsa_data$value[is.na(icnsa_data$value)] <- smoothed_model[is.na(icnsa_data$value)]
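# Sketch of a direct assignment that avoids the recycling warning above: for a
# type = "trend" fit, tsSmooth() returns columns "level" and "slope", and the
# smoothed level can be written straight into the masked rows.
icnsa_data$value[covid_idx] <- smoothed_model[covid_idx, "level"]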
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line(color = "black") +
  labs(title = "ICNSA over Time with Imputed Values", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# Fit a structural time series model to the entire ICNSA dataset, including the COVID-imputed values
# Converting the imputed data to a 'ts' object
icnsa_imputed_ts <- ts(icnsa_data$value, start = c(start(icnsa_data$date)), frequency = 52)

# Fit a basic structural model (BSM) with trend, seasonal, and error components
ss_imputed_model <- StructTS(icnsa_imputed_ts, type = "BSM")

summary(ss_imputed_model)
#>           Length Class  Mode     
#> coef         4   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    8946   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric

# Getting the smoothed state estimates
smoothed_imputed <- tsSmooth(ss_imputed_model)

residuals_model <- residuals(ss_imputed_model)

# Plot residuals to visualize the error component
plot(residuals_model, main="Residuals from Structural Model", ylab="Residuals", xlab="Time")


# reading the CCNSA data
ccnsa_data = fredr(series_id = "CCNSA")
column_names_ccnsa <- colnames(ccnsa_data)
print(column_names_ccnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"
head(ccnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id   value realtime_start realtime_end
#>   <date>     <chr>       <dbl> <date>         <date>      
#> 1 1967-01-07 CCNSA     1594000 2024-03-07     2024-03-07  
#> 2 1967-01-14 CCNSA     1563000 2024-03-07     2024-03-07  
#> 3 1967-01-21 CCNSA     1551000 2024-03-07     2024-03-07  
#> 4 1967-01-28 CCNSA     1533000 2024-03-07     2024-03-07  
#> 5 1967-02-04 CCNSA     1534000 2024-03-07     2024-03-07  
#> 6 1967-02-11 CCNSA     1557000 2024-03-07     2024-03-07

# Merging ICNSA and CCNSA data
merged_data <- merge(icnsa_data, ccnsa_data, by = "date", all = TRUE)
print(colnames(merged_data))
#> [1] "date"             "series_id.x"      "value.x"          "realtime_start.x"
#> [5] "realtime_end.x"   "series_id.y"      "value.y"          "realtime_start.y"
#> [9] "realtime_end.y"

# Convert to a 'ts' object (note: the final CCNSA value is NA after the merge)
merged_ts <- ts(merged_data$value.y, start = c(1967, 1), frequency = 52)

# Fit an ARIMA model with a covariate
arima_with_covariate <- auto.arima(icnsa_imputed_ts, xreg = merged_ts, seasonal = TRUE)

# Using the most recent CCNSA values, as we don't have a future value
past_value_ccnsa <- tail(merged_data$value.y, 2)
print(past_value_ccnsa)
#> [1] 2098851      NA

# Forecast the next week's value using the model with the covariate
future_week_forecast <- forecast(arima_with_covariate, h = 1, xreg = past_value_ccnsa)
#> Warning in forecast.forecast_ARIMA(arima_with_covariate, h = 1, xreg =
#> past_value_ccnsa): Upper prediction intervals are not finite.
print(future_week_forecast)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 58.34615       231104.5 176050.1 286158.8 146906.1 315302.8
#> 58.36538             NA       NA       NA       NA       NA
forecast_mean <- mean(future_week_forecast$mean, na.rm = TRUE)
print(forecast_mean)
#> [1] 231104.5

Created on 2024-03-07 with reprex v2.1.0

ayesheepatra commented 2 months ago

Fitting VAR and BigVAR models on four time series

library(reprex)
library(fredr)
library(dygraphs)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(xts)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
library(tseries)
library(vars)
#> Loading required package: MASS
#> Loading required package: strucchange
#> Loading required package: sandwich
#> Loading required package: urca
#> Loading required package: lmtest
library(BigVAR) 
#> Loading required package: lattice
library(readxl)

data_13STI <- read_excel("/Users/umakantapatra/Downloads/SeriesReport-13S-V.xlsx")
data_14STI <- read_excel("/Users/umakantapatra/Downloads/SeriesReport-14S-V.xlsx")
data_15STI <- read_excel("/Users/umakantapatra/Downloads/SeriesReport-15S-V.xlsx")
data_16STI <- read_excel("/Users/umakantapatra/Downloads/SeriesReport-16S-V.xlsx")

# Step 3: Empirical Analysis

#View(data_13STI)
data_13STI$Period = as.Date(paste("01", data_13STI$Period, sep = "-"), format = "%d-%b-%Y")
#View(data_13STI)
plot(data_13STI, type="l")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


data_14STI$Period = as.Date(paste("01", data_14STI$Period, sep = "-"), format = "%d-%b-%Y")
#View(data_14STI)
plot(data_14STI, type="l")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


data_15STI$Period = as.Date(paste("01", data_15STI$Period, sep = "-"), format = "%d-%b-%Y")
#View(data_15STI)
plot(data_15STI, type="l")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


data_16STI$Period = as.Date(paste("01", data_16STI$Period, sep = "-"), format = "%d-%b-%Y")
#View(data_16STI)
plot(data_16STI, type="l")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


data_13STI <- subset(data_13STI, Period <= as.Date("2024-02-01"))
data_15STI <- subset(data_15STI, Period <= as.Date("2024-02-01"))
data_14STI <- subset(data_14STI, Period <= as.Date("2024-02-01"))
data_16STI <- subset(data_16STI, Period <= as.Date("2024-02-01"))

par(mfrow = c(2, 2))  # 2 rows and 2 columns
plot(data_13STI, main = "Plot 13S - Textile Mills", type = "l")
plot(data_14STI, main = "Plot 14S - Textile Products", type = "l")
plot(data_15STI, main = "Plot 15S - Apparel", type = "l")
plot(data_16STI, main = "Plot 16S - Leather and Allied Products", type = "l")


# ADF test for stationarity for each time series

data_13STI$Value <- as.numeric(data_13STI$Value)
ts_data_13STI = ts(data_13STI$Value, frequency = 12)
adf.test(ts_data_13STI, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_13STI
#> Dickey-Fuller = -0.93277, Lag order = 7, p-value = 0.9488
#> alternative hypothesis: stationary
ts_data_13STI = diff(ts_data_13STI)
adf.test(ts_data_13STI, alternative = "stationary")
#> Warning in adf.test(ts_data_13STI, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_13STI
#> Dickey-Fuller = -5.1367, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary

data_14STI$Value <- as.numeric(data_14STI$Value)
ts_data_14STI = ts(data_14STI$Value, frequency = 12)
adf.test(ts_data_14STI, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_14STI
#> Dickey-Fuller = -2.9436, Lag order = 7, p-value = 0.1787
#> alternative hypothesis: stationary
ts_data_14STI = diff(ts_data_14STI)
adf.test(ts_data_14STI, alternative = "stationary")
#> Warning in adf.test(ts_data_14STI, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_14STI
#> Dickey-Fuller = -4.4991, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary

data_15STI$Value <- as.numeric(data_15STI$Value)
ts_data_15STI = ts(data_15STI$Value, frequency = 12)
adf.test(ts_data_15STI, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_15STI
#> Dickey-Fuller = -1.2818, Lag order = 7, p-value = 0.8804
#> alternative hypothesis: stationary
ts_data_15STI = diff(ts_data_15STI)
adf.test(ts_data_15STI, alternative = "stationary")
#> Warning in adf.test(ts_data_15STI, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_15STI
#> Dickey-Fuller = -5.3159, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary

data_16STI$Value <- as.numeric(data_16STI$Value)
ts_data_16STI = ts(data_16STI$Value, frequency = 12)
adf.test(ts_data_16STI, alternative = "stationary")
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_16STI
#> Dickey-Fuller = -1.4755, Lag order = 7, p-value = 0.7986
#> alternative hypothesis: stationary
ts_data_16STI = diff(ts_data_16STI)
adf.test(ts_data_16STI, alternative = "stationary")
#> Warning in adf.test(ts_data_16STI, alternative = "stationary"): p-value smaller
#> than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts_data_16STI
#> Dickey-Fuller = -5.5215, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary

# Performed first-order differencing to make each series stationary
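# Sketch: the VAR models fitted below use the level series; given the ADF
# results, they could instead be fit on the first differences (illustrative
# column names, assuming all four series cover the same date range).
diff_series <- data.frame(d13 = diff(data_13STI$Value),
                          d14 = diff(data_14STI$Value),
                          d15 = diff(data_15STI$Value),
                          d16 = diff(data_16STI$Value))
var_diff <- VAR(ts(diff_series, frequency = 12), p = 1)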

#ACF and PACF
acf(data_13STI$Value)
pacf(data_13STI$Value)

acf(data_14STI$Value)
pacf(data_14STI$Value)


acf(data_15STI$Value)
pacf(data_15STI$Value)

acf(data_16STI$Value)
pacf(data_16STI$Value)


#Correlation Analysis

# Combine all series into one dataframe
all_series <- data.frame(data_13STI$Value, data_14STI$Value, data_15STI$Value, data_16STI$Value)

# Compute correlation matrix
correlation_matrix <- cor(all_series)

# Print correlation matrix
print("Correlation Matrix between Series:")
#> [1] "Correlation Matrix between Series:"
print(correlation_matrix)
#>                  data_13STI.Value data_14STI.Value data_15STI.Value
#> data_13STI.Value        1.0000000        0.6216084        0.9926421
#> data_14STI.Value        0.6216084        1.0000000        0.6254968
#> data_15STI.Value        0.9926421        0.6254968        1.0000000
#> data_16STI.Value        0.9047105        0.6654211        0.8926975
#>                  data_16STI.Value
#> data_13STI.Value        0.9047105
#> data_14STI.Value        0.6654211
#> data_15STI.Value        0.8926975
#> data_16STI.Value        1.0000000

#Cross-Correlation Function Analysis

names <- c("13STI", "14STI", "15STI", "16STI")

# Initialize an empty list to store the cross-correlation results
ccf_results <- list()

# Loop through each pair of time series
for (i in 1:(length(names)-1)) {
  for (j in (i+1):length(names)) {
    # Extract the time series data
    series1 <- get(paste0("data_", names[i]))
    series2 <- get(paste0("data_", names[j]))

    # Compute cross-correlation
    ccf_result <- ccf(series1$Value, series2$Value)

    # Store the result in the list
    ccf_results[[paste(names[i], names[j], sep = "_vs_")]] <- ccf_result
  }
}


# Plot the cross-correlation functions
par(mfrow = c(2, 3))  # Adjust the layout as needed

for (k in 1:length(ccf_results)) {
  plot(ccf_results[[k]], main = names(ccf_results)[k], mar = c(3, 3, 3, 3))  # Adjust margin settings
}


# Fit VAR(1) and VAR(p) model
# Combine all series into one dataframe
all_series <- data.frame(data_13STI$Value, data_14STI$Value, data_15STI$Value, data_16STI$Value)

# Convert the dataframe to a time series object
ts_all_series <- ts(all_series, frequency = 12)

plot(ts_all_series[, 1], main = "Time Series Plot for Series 13S")
plot(ts_all_series[, 2], main = "Time Series Plot for Series 14S")
plot(ts_all_series[, 3], main = "Time Series Plot for Series 15S")
plot(ts_all_series[, 4], main = "Time Series Plot for Series 16S")

# Fit VAR(1) model
var_model_1 <- VAR(ts_all_series, p = 1)

# Summary of VAR(1) model
summary(var_model_1)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: data_13STI.Value, data_14STI.Value, data_15STI.Value, data_16STI.Value 
#> Deterministic variables: const 
#> Sample size: 385 
#> Log Likelihood: -7900.344 
#> Roots of the characteristic polynomial:
#> 0.9974 0.9924 0.9789 0.9789
#> Call:
#> VAR(y = ts_all_series, p = 1)
#> 
#> 
#> Estimation results for equation data_13STI.Value: 
#> ================================================= 
#> data_13STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1   0.924142   0.012971  71.249  < 2e-16 ***
#> data_14STI.Value.l1  -0.029038   0.006077  -4.778 2.53e-06 ***
#> data_15STI.Value.l1   0.029501   0.005937   4.969 1.02e-06 ***
#> data_16STI.Value.l1   0.091492   0.015245   6.002 4.56e-09 ***
#> const               203.485325  34.441965   5.908 7.70e-09 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 46.48 on 380 degrees of freedom
#> Multiple R-Squared: 0.9992,  Adjusted R-squared: 0.9992 
#> F-statistic: 1.156e+05 on 4 and 380 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation data_14STI.Value: 
#> ================================================= 
#> data_14STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1  -0.051328   0.012045  -4.261 2.57e-05 ***
#> data_14STI.Value.l1   0.979558   0.005644 173.568  < 2e-16 ***
#> data_15STI.Value.l1   0.021675   0.005513   3.931    1e-04 ***
#> data_16STI.Value.l1   0.057005   0.014157   4.027 6.83e-05 ***
#> const               143.457187  31.984643   4.485 9.66e-06 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 43.16 on 380 degrees of freedom
#> Multiple R-Squared: 0.9934,  Adjusted R-squared: 0.9933 
#> F-statistic: 1.427e+04 on 4 and 380 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation data_15STI.Value: 
#> ================================================= 
#> data_15STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + const 
#> 
#>                      Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1  -0.09902    0.02199  -4.503 8.92e-06 ***
#> data_14STI.Value.l1  -0.07908    0.01030  -7.675 1.41e-13 ***
#> data_15STI.Value.l1   1.03245    0.01007 102.575  < 2e-16 ***
#> data_16STI.Value.l1   0.21406    0.02585   8.283 2.08e-15 ***
#> const               331.65249   58.39173   5.680 2.68e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 78.8 on 380 degrees of freedom
#> Multiple R-Squared: 0.9995,  Adjusted R-squared: 0.9994 
#> F-statistic: 1.728e+05 on 4 and 380 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation data_16STI.Value: 
#> ================================================= 
#> data_16STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1  0.0005947  0.0062391   0.095    0.924    
#> data_14STI.Value.l1 -0.0053299  0.0029233  -1.823    0.069 .  
#> data_15STI.Value.l1 -0.0016955  0.0028558  -0.594    0.553    
#> data_16STI.Value.l1  1.0109152  0.0073330 137.858   <2e-16 ***
#> const               11.0417398 16.5673887   0.666    0.506    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 22.36 on 380 degrees of freedom
#> Multiple R-Squared: 0.9967,  Adjusted R-squared: 0.9967 
#> F-statistic: 2.887e+04 on 4 and 380 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>                  data_13STI.Value data_14STI.Value data_15STI.Value
#> data_13STI.Value           2160.3            430.0           1278.6
#> data_14STI.Value            430.0           1863.0            571.3
#> data_15STI.Value           1278.6            571.3           6209.2
#> data_16STI.Value            336.3            223.2            664.4
#>                  data_16STI.Value
#> data_13STI.Value            336.3
#> data_14STI.Value            223.2
#> data_15STI.Value            664.4
#> data_16STI.Value            499.9
#> 
#> Correlation matrix of residuals:
#>                  data_13STI.Value data_14STI.Value data_15STI.Value
#> data_13STI.Value           1.0000           0.2143           0.3491
#> data_14STI.Value           0.2143           1.0000           0.1680
#> data_15STI.Value           0.3491           0.1680           1.0000
#> data_16STI.Value           0.3236           0.2313           0.3771
#>                  data_16STI.Value
#> data_13STI.Value           0.3236
#> data_14STI.Value           0.2313
#> data_15STI.Value           0.3771
#> data_16STI.Value           1.0000
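
# Before fixing a larger lag order, an information criterion can suggest
# one. A minimal sketch (not run above); VARselect() comes from the same
# 'vars' package, and lag.max = 12 is an illustrative choice.
lag_selection <- VARselect(ts_all_series, lag.max = 12, type = "const")
print(lag_selection$selection)  # lag orders picked by AIC, HQ, SC, FPE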

# Fit a higher-order VAR(p) model (here p = 6)
var_model_p <- VAR(ts_all_series, p = 6)

# Summary of VAR(p) model
summary(var_model_p)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: data_13STI.Value, data_14STI.Value, data_15STI.Value, data_16STI.Value 
#> Deterministic variables: const 
#> Sample size: 380 
#> Log Likelihood: -7509.59 
#> Roots of the characteristic polynomial:
#> 0.995 0.9682 0.9682 0.8979 0.8979 0.8565 0.8565 0.7569 0.7152 0.7152 0.7084 0.7084 0.6971 0.6971 0.6404 0.6404 0.6342 0.6342 0.5779 0.5411 0.5411 0.4792 0.4792 0.419
#> Call:
#> VAR(y = ts_all_series, p = 6)
#> 
#> 
#> Estimation results for equation data_13STI.Value: 
#> ================================================= 
#> data_13STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + data_13STI.Value.l2 + data_14STI.Value.l2 + data_15STI.Value.l2 + data_16STI.Value.l2 + data_13STI.Value.l3 + data_14STI.Value.l3 + data_15STI.Value.l3 + data_16STI.Value.l3 + data_13STI.Value.l4 + data_14STI.Value.l4 + data_15STI.Value.l4 + data_16STI.Value.l4 + data_13STI.Value.l5 + data_14STI.Value.l5 + data_15STI.Value.l5 + data_16STI.Value.l5 + data_13STI.Value.l6 + data_14STI.Value.l6 + data_15STI.Value.l6 + data_16STI.Value.l6 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1   1.043988   0.053020  19.690  < 2e-16 ***
#> data_14STI.Value.l1   0.098502   0.060789   1.620  0.10604    
#> data_15STI.Value.l1   0.062769   0.036362   1.726  0.08518 .  
#> data_16STI.Value.l1   0.255755   0.118188   2.164  0.03113 *  
#> data_13STI.Value.l2   0.032838   0.076462   0.429  0.66785    
#> data_14STI.Value.l2  -0.025540   0.095720  -0.267  0.78976    
#> data_15STI.Value.l2  -0.057613   0.058964  -0.977  0.32920    
#> data_16STI.Value.l2  -0.004918   0.183263  -0.027  0.97861    
#> data_13STI.Value.l3  -0.012372   0.076115  -0.163  0.87097    
#> data_14STI.Value.l3   0.010943   0.092014   0.119  0.90540    
#> data_15STI.Value.l3   0.095023   0.057977   1.639  0.10210    
#> data_16STI.Value.l3  -0.393937   0.179389  -2.196  0.02874 *  
#> data_13STI.Value.l4  -0.067336   0.075713  -0.889  0.37441    
#> data_14STI.Value.l4  -0.138506   0.092321  -1.500  0.13443    
#> data_15STI.Value.l4  -0.027788   0.058203  -0.477  0.63335    
#> data_16STI.Value.l4   0.144842   0.179422   0.807  0.42005    
#> data_13STI.Value.l5  -0.021986   0.076589  -0.287  0.77423    
#> data_14STI.Value.l5  -0.062572   0.092046  -0.680  0.49708    
#> data_15STI.Value.l5  -0.142906   0.059908  -2.385  0.01758 *  
#> data_16STI.Value.l5   0.060575   0.178990   0.338  0.73524    
#> data_13STI.Value.l6  -0.018330   0.050281  -0.365  0.71566    
#> data_14STI.Value.l6   0.107421   0.058172   1.847  0.06563 .  
#> data_15STI.Value.l6   0.089110   0.038706   2.302  0.02190 *  
#> data_16STI.Value.l6  -0.034897   0.113877  -0.306  0.75944    
#> const               111.524950  38.418820   2.903  0.00393 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 40.48 on 355 degrees of freedom
#> Multiple R-Squared: 0.9994,  Adjusted R-squared: 0.9994 
#> F-statistic: 2.509e+04 on 24 and 355 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation data_14STI.Value: 
#> ================================================= 
#> data_14STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + data_13STI.Value.l2 + data_14STI.Value.l2 + data_15STI.Value.l2 + data_16STI.Value.l2 + data_13STI.Value.l3 + data_14STI.Value.l3 + data_15STI.Value.l3 + data_16STI.Value.l3 + data_13STI.Value.l4 + data_14STI.Value.l4 + data_15STI.Value.l4 + data_16STI.Value.l4 + data_13STI.Value.l5 + data_14STI.Value.l5 + data_15STI.Value.l5 + data_16STI.Value.l5 + data_13STI.Value.l6 + data_14STI.Value.l6 + data_15STI.Value.l6 + data_16STI.Value.l6 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1  -0.033813   0.046493  -0.727 0.467540    
#> data_14STI.Value.l1   1.283687   0.053306  24.082  < 2e-16 ***
#> data_15STI.Value.l1   0.007999   0.031886   0.251 0.802064    
#> data_16STI.Value.l1   0.219926   0.103638   2.122 0.034525 *  
#> data_13STI.Value.l2  -0.014401   0.067050  -0.215 0.830056    
#> data_14STI.Value.l2  -0.128809   0.083937  -1.535 0.125774    
#> data_15STI.Value.l2   0.038333   0.051705   0.741 0.458955    
#> data_16STI.Value.l2  -0.082675   0.160703  -0.514 0.607252    
#> data_13STI.Value.l3   0.053207   0.066745   0.797 0.425882    
#> data_14STI.Value.l3  -0.085521   0.080687  -1.060 0.289905    
#> data_15STI.Value.l3  -0.114217   0.050840  -2.247 0.025279 *  
#> data_16STI.Value.l3  -0.181017   0.157306  -1.151 0.250616    
#> data_13STI.Value.l4  -0.105645   0.066393  -1.591 0.112451    
#> data_14STI.Value.l4   0.060648   0.080956   0.749 0.454262    
#> data_15STI.Value.l4   0.125179   0.051038   2.453 0.014661 *  
#> data_16STI.Value.l4   0.017360   0.157334   0.110 0.912203    
#> data_13STI.Value.l5   0.152838   0.067161   2.276 0.023459 *  
#> data_14STI.Value.l5  -0.210779   0.080715  -2.611 0.009400 ** 
#> data_15STI.Value.l5  -0.082923   0.052533  -1.578 0.115344    
#> data_16STI.Value.l5   0.042797   0.156956   0.273 0.785265    
#> data_13STI.Value.l6  -0.087702   0.044091  -1.989 0.047454 *  
#> data_14STI.Value.l6   0.062114   0.051010   1.218 0.224160    
#> data_15STI.Value.l6   0.041545   0.033941   1.224 0.221752    
#> data_16STI.Value.l6   0.015222   0.099858   0.152 0.878929    
#> const               119.317885  33.689353   3.542 0.000451 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 35.5 on 355 degrees of freedom
#> Multiple R-Squared: 0.9958,  Adjusted R-squared: 0.9955 
#> F-statistic:  3510 on 24 and 355 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation data_15STI.Value: 
#> ================================================= 
#> data_15STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + data_13STI.Value.l2 + data_14STI.Value.l2 + data_15STI.Value.l2 + data_16STI.Value.l2 + data_13STI.Value.l3 + data_14STI.Value.l3 + data_15STI.Value.l3 + data_16STI.Value.l3 + data_13STI.Value.l4 + data_14STI.Value.l4 + data_15STI.Value.l4 + data_16STI.Value.l4 + data_13STI.Value.l5 + data_14STI.Value.l5 + data_15STI.Value.l5 + data_16STI.Value.l5 + data_13STI.Value.l6 + data_14STI.Value.l6 + data_15STI.Value.l6 + data_16STI.Value.l6 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1  -0.003032   0.078040  -0.039 0.969035    
#> data_14STI.Value.l1   0.033352   0.089475   0.373 0.709556    
#> data_15STI.Value.l1   1.281285   0.053521  23.940  < 2e-16 ***
#> data_16STI.Value.l1   0.288078   0.173959   1.656 0.098604 .  
#> data_13STI.Value.l2  -0.019449   0.112544  -0.173 0.862897    
#> data_14STI.Value.l2  -0.180493   0.140890  -1.281 0.200996    
#> data_15STI.Value.l2  -0.071343   0.086789  -0.822 0.411609    
#> data_16STI.Value.l2   0.079626   0.269743   0.295 0.768022    
#> data_13STI.Value.l3  -0.004346   0.112032  -0.039 0.969076    
#> data_14STI.Value.l3   0.268212   0.135435   1.980 0.048433 *  
#> data_15STI.Value.l3   0.084757   0.085336   0.993 0.321282    
#> data_16STI.Value.l3  -0.312962   0.264041  -1.185 0.236700    
#> data_13STI.Value.l4   0.084344   0.111442   0.757 0.449645    
#> data_14STI.Value.l4  -0.100597   0.135886  -0.740 0.459608    
#> data_15STI.Value.l4  -0.302176   0.085669  -3.527 0.000475 ***
#> data_16STI.Value.l4  -0.089145   0.264089  -0.338 0.735897    
#> data_13STI.Value.l5   0.060070   0.112731   0.533 0.594461    
#> data_14STI.Value.l5  -0.062304   0.135482  -0.460 0.645891    
#> data_15STI.Value.l5  -0.051144   0.088178  -0.580 0.562274    
#> data_16STI.Value.l5  -0.401537   0.263454  -1.524 0.128368    
#> data_13STI.Value.l6  -0.088132   0.074008  -1.191 0.234507    
#> data_14STI.Value.l6   0.028643   0.085622   0.335 0.738183    
#> data_15STI.Value.l6   0.043306   0.056971   0.760 0.447676    
#> data_16STI.Value.l6   0.450306   0.167614   2.687 0.007559 ** 
#> const               -34.272653  56.548336  -0.606 0.544851    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 59.58 on 355 degrees of freedom
#> Multiple R-Squared: 0.9997,  Adjusted R-squared: 0.9997 
#> F-statistic: 4.974e+04 on 24 and 355 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation data_16STI.Value: 
#> ================================================= 
#> data_16STI.Value = data_13STI.Value.l1 + data_14STI.Value.l1 + data_15STI.Value.l1 + data_16STI.Value.l1 + data_13STI.Value.l2 + data_14STI.Value.l2 + data_15STI.Value.l2 + data_16STI.Value.l2 + data_13STI.Value.l3 + data_14STI.Value.l3 + data_15STI.Value.l3 + data_16STI.Value.l3 + data_13STI.Value.l4 + data_14STI.Value.l4 + data_15STI.Value.l4 + data_16STI.Value.l4 + data_13STI.Value.l5 + data_14STI.Value.l5 + data_15STI.Value.l5 + data_16STI.Value.l5 + data_13STI.Value.l6 + data_14STI.Value.l6 + data_15STI.Value.l6 + data_16STI.Value.l6 + const 
#> 
#>                       Estimate Std. Error t value Pr(>|t|)    
#> data_13STI.Value.l1  -0.003563   0.023883  -0.149  0.88151    
#> data_14STI.Value.l1   0.017717   0.027383   0.647  0.51806    
#> data_15STI.Value.l1   0.020262   0.016380   1.237  0.21690    
#> data_16STI.Value.l1   1.203170   0.053239  22.600  < 2e-16 ***
#> data_13STI.Value.l2   0.029492   0.034443   0.856  0.39243    
#> data_14STI.Value.l2  -0.022466   0.043118  -0.521  0.60267    
#> data_15STI.Value.l2   0.016643   0.026561   0.627  0.53132    
#> data_16STI.Value.l2  -0.120588   0.082553  -1.461  0.14497    
#> data_13STI.Value.l3   0.050559   0.034287   1.475  0.14121    
#> data_14STI.Value.l3   0.057987   0.041449   1.399  0.16268    
#> data_15STI.Value.l3  -0.006818   0.026116  -0.261  0.79418    
#> data_16STI.Value.l3  -0.021617   0.080807  -0.268  0.78923    
#> data_13STI.Value.l4  -0.091732   0.034106  -2.690  0.00749 ** 
#> data_14STI.Value.l4  -0.045034   0.041587  -1.083  0.27959    
#> data_15STI.Value.l4  -0.050007   0.026218  -1.907  0.05728 .  
#> data_16STI.Value.l4  -0.090067   0.080822  -1.114  0.26587    
#> data_13STI.Value.l5   0.035333   0.034500   1.024  0.30647    
#> data_14STI.Value.l5   0.001145   0.041463   0.028  0.97798    
#> data_15STI.Value.l5  -0.013021   0.026986  -0.483  0.62973    
#> data_16STI.Value.l5   0.103259   0.080628   1.281  0.20114    
#> data_13STI.Value.l6  -0.003718   0.022649  -0.164  0.86969    
#> data_14STI.Value.l6  -0.006254   0.026204  -0.239  0.81151    
#> data_15STI.Value.l6   0.026853   0.017436   1.540  0.12442    
#> data_16STI.Value.l6  -0.098949   0.051297  -1.929  0.05453 .  
#> const               -28.927881  17.306106  -1.672  0.09550 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 18.24 on 355 degrees of freedom
#> Multiple R-Squared: 0.9979,  Adjusted R-squared: 0.9978 
#> F-statistic:  7052 on 24 and 355 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>                  data_13STI.Value data_14STI.Value data_15STI.Value
#> data_13STI.Value          1638.72            73.95            434.7
#> data_14STI.Value            73.95          1260.09            272.1
#> data_15STI.Value           434.68           272.06           3550.2
#> data_16STI.Value            88.35            54.90            114.3
#>                  data_16STI.Value
#> data_13STI.Value            88.35
#> data_14STI.Value            54.90
#> data_15STI.Value           114.30
#> data_16STI.Value           332.52
#> 
#> Correlation matrix of residuals:
#>                  data_13STI.Value data_14STI.Value data_15STI.Value
#> data_13STI.Value          1.00000          0.05146           0.1802
#> data_14STI.Value          0.05146          1.00000           0.1286
#> data_15STI.Value          0.18022          0.12863           1.0000
#> data_16STI.Value          0.11968          0.08482           0.1052
#>                  data_16STI.Value
#> data_13STI.Value          0.11968
#> data_14STI.Value          0.08482
#> data_15STI.Value          0.10520
#> data_16STI.Value          1.00000

# Produce a one-month-ahead forecast
forecast_var_p <- predict(var_model_p, n.ahead = 1)

# Print the forecasted values
print(forecast_var_p)
#> $data_13STI.Value
#>                           fcst    lower    upper       CI
#> data_13STI.Value.fcst 3515.507 3436.166 3594.849 79.34144
#> 
#> $data_14STI.Value
#>                           fcst    lower    upper       CI
#> data_14STI.Value.fcst 3613.969 3544.395 3683.543 69.57428
#> 
#> $data_15STI.Value
#>                           fcst   lower    upper      CI
#> data_15STI.Value.fcst 2194.372 2077.59 2311.154 116.782
#> 
#> $data_16STI.Value
#>                           fcst    lower    upper       CI
#> data_16STI.Value.fcst 1235.018 1199.278 1270.758 35.74007
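
# The interval forecasts can be visualized with fanchart() from 'vars'.
# A one-step forecast gives a degenerate fan, so this sketch (not executed
# above) forecasts 12 months ahead first; the horizon is illustrative.
forecast_var_12 <- predict(var_model_p, n.ahead = 12)
fanchart(forecast_var_12, names = "data_13STI.Value")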

# Granger causality test; 'cause' is not specified, so causality()
# defaults to the first variable (data_13STI.Value) as the cause
granger_results <- causality(var_model_p)
#> Warning in causality(var_model_p): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (data_13STI.Value) as cause variable.
print(granger_results)
#> $Granger
#> 
#>  Granger causality H0: data_13STI.Value do not Granger-cause
#>  data_14STI.Value data_15STI.Value data_16STI.Value
#> 
#> data:  VAR object var_model_p
#> F-Test = 2.0166, df1 = 18, df2 = 1420, p-value = 0.006966
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: data_13STI.Value and
#>  data_14STI.Value data_15STI.Value data_16STI.Value
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 15.732, df = 3, p-value = 0.001287
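
# causality() tests one cause variable at a time, which is why the warning
# above defaulted to data_13STI.Value. A minimal sketch (not run above)
# that cycles through every series as the candidate cause:
for (v in colnames(ts_all_series)) {
  res <- causality(var_model_p, cause = v)
  cat(v, "Granger F-test p-value:", res$Granger$p.value, "\n")
}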

# Fit a sparse VAR with the BigVAR package
# BigVAR.fit() expects a numeric matrix rather than a dataframe
all_series_matrix <- as.matrix(all_series)

# Fit the sparse VAR with lasso-type regularization (lambda = 0.1); the
# "OwnOther" structure penalizes a series' own lags and the other series'
# lags as separate groups
sparse_var_model <- BigVAR.fit(Y = all_series_matrix, p = 6, lambda = 0.1, struct = "OwnOther")

# BigVAR.fit() returns the estimated coefficient array; summarize its entries
print(summary(sparse_var_model))
#>      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#> -61.94693  -0.05029  -0.00913   1.42869   0.03857 116.98556
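
# Rather than fixing lambda = 0.1, BigVAR can pick the penalty by rolling
# cross-validation. A hedged sketch (not run above): constructModel() and
# cv.BigVAR() are from the BigVAR package; the gran = c(50, 10) grid
# (grid depth and number of lambda values) is an illustrative choice.
cv_spec <- constructModel(all_series_matrix, p = 6, struct = "OwnOther",
                          gran = c(50, 10))
cv_fit <- cv.BigVAR(cv_spec)
plot(cv_fit)  # out-of-sample MSFE across the lambda grid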

Created on 2024-04-10 with reprex v2.1.0