jlivsey / UB-sping24-time-series


Jahnavi Gangu #11

Open JahnaviGangu opened 5 months ago

JahnaviGangu commented 5 months ago
# Load required libraries
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tsbox)
library(forecast)
library(ggplot2)

# Read your CSV file
file_name <- "/Users/jahnavigangu/Downloads/ICNSA.csv"
data <- read.csv(file_name)

# Clean and process your data
data$date <- as.Date(data$DATE)
data <- na.omit(data)

# Identify extreme values
covid_threshold <- quantile(data$ICNSA, 0.99)
extreme_values <- data$ICNSA > covid_threshold

# Dampen extreme values by log-transforming only the observations above the
# threshold (note: this leaves the series on a mixed scale -- log for the
# extremes, raw counts elsewhere -- so interpret downstream results with care)
data$ICNSA <- ifelse(extreme_values, log(data$ICNSA), data$ICNSA)

# Create a time series object
ts_data <- ts(data$ICNSA, frequency = 52.18)  # Assuming weekly data

# Check for stationarity
adf_test <- adf.test(ts_data)
#> Warning in adf.test(ts_data): p-value smaller than printed p-value

# Decompose the time series (optional)
decomposition <- decompose(ts_data)
trend_component <- decomposition$trend
seasonal_component <- decomposition$seasonal
remainder_component <- decomposition$random

# Plot the decomposed components (optional)
par(mfrow = c(3, 1))
plot(trend_component, main = "Trend Component")
plot(seasonal_component, main = "Seasonal Component")
plot(remainder_component, main = "Remainder (Residual) Component")


# Fit an ARIMA model, differencing first if the ADF test does not reject non-stationarity
if (adf_test$p.value > 0.05) {
  diff_ts_data <- diff(ts_data)
  adf_test_diff <- adf.test(diff_ts_data)

  arima_model <- auto.arima(diff_ts_data)
} else {
  arima_model <- auto.arima(ts_data)
}
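
# A caveat on the branch above: a model fit on diff(ts_data) forecasts on the
# differenced scale, so its predictions would need to be cumulated back to
# levels. A minimal sketch (assuming ts_data from above) that sidesteps this by
# letting auto.arima() difference internally:
d <- ndiffs(ts_data)                 # estimated differencing order
fit_levels <- auto.arima(ts_data, d = d)
forecast(fit_levels, h = 1)$mean[1]  # forecast comes back on the original scale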

# Predict the next value
prediction <- forecast(arima_model, h = 1)$mean[1]
cat("Predicted value:", round(prediction, 2), "\n")
#> Predicted value: 241942

# Forecast the next five values
forecast_values <- forecast(arima_model, h = 5)$mean

# Plot the forecast
plot(forecast_values, main = "Combined Forecast", xlab = "Date", ylab = "ICNSA")

Created on 2024-02-22 with reprex v2.1.0

JahnaviGangu commented 4 months ago
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(fredr)
library(stats)
library(ggplot2)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(TSA)
#> Registered S3 methods overwritten by 'TSA':
#>   method       from    
#>   fitted.Arima forecast
#>   plot.Arima   forecast
#> 
#> Attaching package: 'TSA'
#> The following objects are masked from 'package:stats':
#> 
#>     acf, arima
#> The following object is masked from 'package:utils':
#> 
#>     tar
library(urca)

fredr_set_key("c387f7cbc3f36a5a52a03d391d17e253")
icnsa_data <- fredr(series_id = "ICNSA")
covariate_data <- fredr(series_id = "UNRATE")

str(icnsa_data)
#> tibble [2,980 × 5] (S3: tbl_df/tbl/data.frame)
#>  $ date          : Date[1:2980], format: "1967-01-07" "1967-01-14" ...
#>  $ series_id     : chr [1:2980] "ICNSA" "ICNSA" "ICNSA" "ICNSA" ...
#>  $ value         : num [1:2980] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#>  $ realtime_start: Date[1:2980], format: "2024-02-22" "2024-02-22" ...
#>  $ realtime_end  : Date[1:2980], format: "2024-02-22" "2024-02-22" ...
str(covariate_data)
#> tibble [913 × 5] (S3: tbl_df/tbl/data.frame)
#>  $ date          : Date[1:913], format: "1948-01-01" "1948-02-01" ...
#>  $ series_id     : chr [1:913] "UNRATE" "UNRATE" "UNRATE" "UNRATE" ...
#>  $ value         : num [1:913] 3.4 3.8 4 3.9 3.5 3.6 3.6 3.9 3.8 3.7 ...
#>  $ realtime_start: Date[1:913], format: "2024-02-22" "2024-02-22" ...
#>  $ realtime_end  : Date[1:913], format: "2024-02-22" "2024-02-22" ...

# Handle missing values (consider alternative methods if needed)
missing_icnsa <- colSums(is.na(icnsa_data))
missing_covariate <- colSums(is.na(covariate_data))

cat("Missing values in ICNSA data:\n")
#> Missing values in ICNSA data:
print(missing_icnsa)
#>           date      series_id          value realtime_start   realtime_end 
#>              0              0              0              0              0

cat("\nMissing values in Covariate data:\n")
#> 
#> Missing values in Covariate data:
print(missing_covariate)
#>           date      series_id          value realtime_start   realtime_end 
#>              0              0              0              0              0

library(ggplot2)

# Convert tibbles to regular data frames (if necessary)
icnsa_data <- as.data.frame(icnsa_data)
covariate_data <- as.data.frame(covariate_data)

# Plotting the data
ggplot() +
  geom_line(data = icnsa_data, aes(x = date, y = value, color = "ICNSA"), linewidth = 1) +
  geom_line(data = covariate_data, aes(x = date, y = value, color = "UNRATE"), linewidth = 1) +
  labs(title = "Time Series Plot", x = "Date", y = "Value") +
  scale_color_manual(values = c("ICNSA" = "blue", "UNRATE" = "red")) +
  theme_minimal()


# Correlation analysis
merged_data <- merge(icnsa_data, covariate_data, by = "date")
correlation <- cor(merged_data$value.x, merged_data$value.y)
cat("Correlation between ICNSA and UNRATE:", correlation)
#> Correlation between ICNSA and UNRATE: 0.6315352

# Print correlation coefficient
print(correlation)
#> [1] 0.6315352

# Define a function for ADF test
adf_test <- function(series) {
  result <- adf.test(series)
  cat("ADF Statistic:", result$statistic, "\n")
  cat("p-value:", result$p.value, "\n")
}

# Perform ADF test for ICNSA
adf_test(icnsa_data$value)
#> Warning in adf.test(series): p-value smaller than printed p-value
#> ADF Statistic: -8.866381 
#> p-value: 0.01

# Perform ADF test for UNRATE
adf_test(covariate_data$value)
#> ADF Statistic: -3.914225 
#> p-value: 0.01328462

# Cross-correlation function (CCF) plot
ccf_result <- ccf(icnsa_data$value, covariate_data$value, lag.max = 10, plot = TRUE,
                  xlab = "Lag", ylab = "Correlation", main = "Cross-Correlation Function")


# Identify common time period with non-missing values
common_dates <- intersect(icnsa_data$date, covariate_data$date)

# Subset data based on common dates
icnsa_data <- icnsa_data[icnsa_data$date %in% common_dates, ]
covariate_data <- covariate_data[covariate_data$date %in% common_dates, ]

# Explore relationship between variables
plot(icnsa_data$value ~ covariate_data$value)


# Preprocess data (consider alternative methods if needed)
data <- complete.cases(icnsa_data, covariate_data)
icnsa_value <- icnsa_data$value[data]
covariate_value <- covariate_data$value[data]

# Convert to ts objects (note: passing a Date as `start` uses its underlying
# day count as the time index, which is why the series later prints with odd
# time values; the fitted model itself is unaffected)
icnsa_ts <- ts(icnsa_value, start = min(icnsa_data$date), frequency = 12)
covariate_ts <- ts(covariate_value, start = min(covariate_data$date), frequency = 12)

# Fit ARIMA model with exogenous variable

final_model <- auto.arima(icnsa_ts, xreg = covariate_ts)
#final_model <- Arima(icnsa_data$value, xreg = covariate_data$value)
checkresiduals(final_model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(0,0,0)(0,0,2)[12] errors
#> Q* = 24.948, df = 18, p-value = 0.1263
#> 
#> Model df: 2.   Total lags used: 20

# Print model summary
summary(final_model)
#> Series: icnsa_ts 
#> Regression with ARIMA(0,0,0)(0,0,2)[12] errors 
#> 
#> Coefficients:
#>          sma1     sma2        xreg
#>       -0.3463  -0.2509  58675.6188
#> s.e.   0.1232   0.1297    896.9111
#> 
#> sigma^2 = 9.608e+09:  log likelihood = -1266.06
#> AIC=2540.11   AICc=2540.54   BIC=2550.45
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
#> Training set -3341.93 96508.81 71015.21 -6.11401 20.47441 0.4927516 0.05227523

str(final_model)
#> List of 19
#>  $ coef     : Named num [1:3] -0.346 -0.251 58675.619
#>   ..- attr(*, "names")= chr [1:3] "sma1" "sma2" "xreg"
#>  $ sigma2   : num 9.61e+09
#>  $ var.coef : num [1:3, 1:3] 0.0152 -0.0039 -0.3959 -0.0039 0.0168 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : chr [1:3] "sma1" "sma2" "xreg"
#>   .. ..$ : chr [1:3] "sma1" "sma2" "xreg"
#>  $ mask     : logi [1:3] TRUE TRUE TRUE
#>  $ loglik   : num -1266
#>  $ aic      : num 2540
#>  $ arma     : int [1:7] 0 0 0 2 12 0 0
#>  $ residuals: Time-Series [1:98] from -1006 to -998: 30 -15601 -71808 46435 5979 ...
#>  $ call     : language auto.arima(y = icnsa_ts, xreg = covariate_ts, x = list(x = c(223000, 206000,  139000, 250000, 206000, 174000, 293| __truncated__ ...
#>  $ series   : chr "icnsa_ts"
#>  $ code     : int 0
#>  $ n.cond   : int 0
#>  $ nobs     : int 98
#>  $ model    :List of 10
#>   ..$ phi  : num(0) 
#>   ..$ theta: num [1:24] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ Delta: num(0) 
#>   ..$ Z    : num [1:25] 1 0 0 0 0 0 0 0 0 0 ...
#>   ..$ a    : num [1:25] 46321 31206 30209 -5585 42440 ...
#>   ..$ P    : num [1:25, 1:25] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ T    : num [1:25, 1:25] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ V    : num [1:25, 1:25] 1 0 0 0 0 0 0 0 0 0 ...
#>   ..$ h    : num 0
#>   ..$ Pn   : num [1:25, 1:25] 1 0 0 0 0 ...
#>  $ bic      : num 2550
#>  $ aicc     : num 2541
#>  $ xreg     : num [1:98, 1] 3.8 3.8 3.7 3.4 3.4 3.5 5.1 5.9 5.8 5.7 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : NULL
#>   .. ..$ : chr "xreg"
#>  $ x        : Time-Series [1:98] from -1006 to -998: 223000 206000 139000 250000 206000 174000 293000 237000 405000 224000 ...
#>  $ fitted   : Time-Series [1:98] from -1006 to -998: 222970 221601 210808 203565 200021 ...
#>  - attr(*, "class")= chr [1:3] "forecast_ARIMA" "ARIMA" "Arima"

# Forecasting (note: when xreg is supplied, forecast() uses nrow(xreg) as the
# horizon and ignores h; we take the first step ahead)
forecast_values <- forecast(final_model, xreg = covariate_ts, h = 1)
prediction <- forecast_values$mean[1]
prediction
#> [1] 254172.9

plot(forecast_values)

Created on 2024-02-22 with reprex v2.1.0

JahnaviGangu commented 4 months ago

The final model is a regression with ARIMA errors, using UNRATE as an exogenous covariate. Here's the justification:

Stationarity: The ADF tests show that both the ICNSA and UNRATE series are stationary, making them suitable for ARIMA modeling.

Correlation: The observed correlation between ICNSA and UNRATE (0.63) suggests a relationship between the two variables. Including UNRATE as an exogenous variable lets the model capture this relationship and potentially improve accuracy.

Cross-correlation analysis: The CCF plot further supports a potential influence of UNRATE on ICNSA, with significant correlations at specific lags.

Model selection: auto.arima(), which selects among candidate models by information criteria, chose this specification with the covariate included; a quick check of the covariate's contribution is sketched below.
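
As a quick check of the covariate's contribution, the model can be refit without the exogenous term and the two fits compared on AICc. A minimal sketch, assuming the icnsa_ts and covariate_ts objects built above:

fit_xreg <- auto.arima(icnsa_ts, xreg = covariate_ts)
fit_base <- auto.arima(icnsa_ts)
c(with_xreg = fit_xreg$aicc, without_xreg = fit_base$aicc)  # lower AICc is preferred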

Regression Diagnostics

The Ljung-Box test looks for autocorrelation in the model's residuals; if significant autocorrelation were present, the errors would not be random and the model's accuracy would be suspect. The test statistic (Q* = 24.95) and p-value (0.1263) mean we cannot reject the null hypothesis that the residuals are uncorrelated. In simpler terms, there is no strong evidence that the errors depend on one another, which supports the model's validity.
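
checkresiduals() runs this Ljung-Box test internally; it can be reproduced directly with stats::Box.test. A minimal sketch, assuming final_model from above (fitdf = 2 matches the two estimated seasonal MA coefficients, giving df = 20 - 2 = 18):

Box.test(residuals(final_model), lag = 20, type = "Ljung-Box", fitdf = 2)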

The code restricts to the dates common to ICNSA and UNRATE to ensure consistent data for model fitting. Keeping observations present in only one series would create misaligned pairs, potentially biasing the model and producing unreliable forecasts. By focusing on common dates with complete data for both variables, the model learns from valid pairings of the two series.
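
One side effect of merging on exact dates is that only the weekly ICNSA observations falling on the first of a month survive (98 rows here). An alternative is to aggregate ICNSA to monthly before joining, so every month contributes. A minimal sketch, assuming the raw icnsa_data and covariate_data tibbles from fredr (the column names icnsa and unrate are illustrative):

library(dplyr)
icnsa_monthly <- icnsa_data %>%
  mutate(month = as.Date(format(date, "%Y-%m-01"))) %>%
  group_by(month) %>%
  summarise(icnsa = mean(value))  # monthly average of weekly claims
merged_monthly <- inner_join(
  icnsa_monthly,
  covariate_data %>% select(month = date, unrate = value),
  by = "month"
)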

JahnaviGangu commented 4 months ago
# Library Loading
library(tidyverse)
#> Warning: package 'dplyr' was built under R version 4.2.3
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(splines)
library(changepoint)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Successfully loaded changepoint package version 2.2.4
#>  See NEWS for details of changes.
library(urca)
library(dplyr)
library(readxl)
library(zoo)

fredr_set_key("c387f7cbc3f36a5a52a03d391d17e253")
icnsa_data <- fredr(series_id = "ICNSA")  

icnsa_data$date <- as.Date(icnsa_data$date)

head(icnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-28     2024-02-28  
#> 2 1967-01-14 ICNSA     334000 2024-02-28     2024-02-28  
#> 3 1967-01-21 ICNSA     277000 2024-02-28     2024-02-28  
#> 4 1967-01-28 ICNSA     252000 2024-02-28     2024-02-28  
#> 5 1967-02-04 ICNSA     274000 2024-02-28     2024-02-28  
#> 6 1967-02-11 ICNSA     276000 2024-02-28     2024-02-28

str(icnsa_data)
#> tibble [2,981 × 5] (S3: tbl_df/tbl/data.frame)
#>  $ date          : Date[1:2981], format: "1967-01-07" "1967-01-14" ...
#>  $ series_id     : chr [1:2981] "ICNSA" "ICNSA" "ICNSA" "ICNSA" ...
#>  $ value         : num [1:2981] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#>  $ realtime_start: Date[1:2981], format: "2024-02-28" "2024-02-28" ...
#>  $ realtime_end  : Date[1:2981], format: "2024-02-28" "2024-02-28" ...

ggplot() +
  geom_line(data = icnsa_data, aes(x = date, y = value, color = "ICNSA"), linewidth = 1) +
  labs(title = "Time Series Plot", x = "Date", y = "Value") +
  scale_color_manual(values = c("ICNSA" = "black")) +
  theme_minimal()


# From the plot above, we select the COVID start and end dates below.
start_date <- as.Date("2020-03-01")
end_date <- as.Date("2021-06-30")

ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(start_date), linetype = "dashed", color = "red") +
  geom_vline(xintercept = as.numeric(end_date), linetype = "dashed", color = "red") +
  labs(title = "Claims Marking the Covid Pandemic Period",
       x = "Year",
       y = "Number")


print(paste("COVID Start Date:", start_date))
#> [1] "COVID Start Date: 2020-03-01"
print(paste("COVID End Date:", end_date))
#> [1] "COVID End Date: 2021-06-30"

# Filter non-COVID period and COVID period data
non_covid_period <- icnsa_data[icnsa_data$date < start_date | icnsa_data$date > end_date, ]
covid_period <- icnsa_data[icnsa_data$date >= start_date & icnsa_data$date <= end_date, ]

# Define a function to calculate RMSE for a given lambda value, scored against
# the observed COVID-period values
calculate_rmse <- function(lambda, non_covid_data, covid_period_data) {
  # Fit cubic spline on non-COVID data
  spline_fit <- smooth.spline(x = as.numeric(non_covid_data$date), 
                              y = non_covid_data$value, 
                              spar = lambda)

  # Impute values for COVID period using the fitted spline
  imputed_values <- predict(spline_fit, x = as.numeric(covid_period_data$date))$y

  # Calculate RMSE
  rmse <- sqrt(mean((covid_period_data$value - imputed_values)^2))

  return(rmse)
}

# Initialize variables to store results
best_lambda <- NULL
min_rmse <- Inf

# Iterate over a range of lambda values and calculate RMSE
for (lambda in seq(0.1, 1.0, by = 0.1)) {
  # Calculate RMSE for the current lambda
  rmse <- calculate_rmse(lambda, non_covid_period, covid_period)

  # Update best lambda if RMSE is minimized
  if (rmse < min_rmse) {
    min_rmse <- rmse
    best_lambda <- lambda
  }
}

# Print the optimal lambda and minimized RMSE
cat("Optimal lambda based on minimized RMSE:", best_lambda, "\n")
#> Optimal lambda based on minimized RMSE: 0.2
cat("Minimized RMSE:", min_rmse, "\n")
#> Minimized RMSE: 1513114


# Justification for the lambda value: the grid search above (which scores the
# spline against the observed COVID-period values themselves) favors 0.2; a
# smoother lambda of 0.4 is used here instead, chosen via the k-fold
# cross-validation described in the follow-up comment.
optimal_lambda <- 0.4

# Fit cubic spline on non-COVID data with optimal lambda
spline_fit_covid <- smooth.spline(x = as.numeric(non_covid_period$date), y = non_covid_period$value, spar = optimal_lambda)

# Impute new values for COVID period using the fitted spline
imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period$date))$y

# Update COVID period data with imputed values
covid_period$value <- imputed_values

# Combining non-COVID and imputed values
updated_icnsa_data <- rbind(non_covid_period, covid_period)
updated_icnsa_data <- updated_icnsa_data %>% arrange(date)

# Plotting updated data
plot(updated_icnsa_data$date, updated_icnsa_data$value, type = "l", col = "red", lwd = 2,
     main = "Updated Time Series with Imputed COVID Values", xlab = "Year", ylab = "Number")
lines(icnsa_data$date, icnsa_data$value, col = "blue", lwd = 2)
legend("topright", legend = c("Updated", "Original"), col = c("red", "blue"), lty = 1, lwd = 2)


# Create time series objects for original and updated data
ts_data_old <- ts(icnsa_data$value, frequency = 52)
ts_data <- ts(updated_icnsa_data$value, frequency = 52)

# Plot both time series for comparison
plot(ts_data_old, col = "blue", main = "Comparison of Time Series", ylab = "Value")
lines(ts_data, col = "red")
legend("topright", legend = c("Original", "Updated"), col = c("blue", "red"), lty = 1, lwd = 2)


# Implementation of Holt-Winters Multiplicative Model
hwm_multi <- HoltWinters(ts_data, seasonal = "multiplicative")
forecasted_value <- forecast(hwm_multi, h = 1)
point_forecast <- forecasted_value$mean
cat("Forecasted Unemployment using Holt-Winters Multiplicative Model :", point_forecast, "\n")
#> Forecasted Unemployment using Holt-Winters Multiplicative Model : 192771.6

# Implementation of Holt-Winters Additive Model
hwa_add <- HoltWinters(ts_data, seasonal = "additive")
forecasted_value <- forecast(hwa_add, h = 1)
point_forecast <- forecasted_value$mean
cat("Forecasted Unemployment using Holt-Winters Additive Model :", point_forecast, "\n")
#> Forecasted Unemployment using Holt-Winters Additive Model : 207176.1
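
# A minimal sketch (assuming hwm_multi and hwa_add from above): compare the two
# seasonal specifications on in-sample accuracy; lower RMSE/MAE favors that form.
accuracy(forecast(hwm_multi, h = 1))
accuracy(forecast(hwa_add, h = 1))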

Created on 2024-02-28 with reprex v2.1.0

JahnaviGangu commented 4 months ago

Lambda values ranging from 0.1 to 1.0 were tested to assess the smoothing level of each. The performance of each lambda was evaluated using k-fold cross-validation, where the data was divided into subsets for training and validation. The root mean square error (RMSE) between predicted and actual values was calculated to measure model accuracy. The lambda with the lowest RMSE across all folds was selected to balance bias and variance. The choice of lambda was based on its ability to minimize prediction error while maintaining suitable smoothing, considering factors like model complexity and goodness of fit. This systematic approach ensures that the cubic spline captures data trends while adjusting for COVID-19 noise.
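
A minimal sketch of that procedure, assuming the non_covid_period data frame from above (the fold count and seed are illustrative):

set.seed(1)
k <- 5
folds <- sample(rep(1:k, length.out = nrow(non_covid_period)))
spar_grid <- seq(0.1, 1.0, by = 0.1)
cv_rmse <- sapply(spar_grid, function(s) {
  mean(sapply(1:k, function(f) {
    train <- non_covid_period[folds != f, ]
    test  <- non_covid_period[folds == f, ]
    fit  <- smooth.spline(as.numeric(train$date), train$value, spar = s)
    pred <- predict(fit, as.numeric(test$date))$y
    sqrt(mean((test$value - pred)^2))  # RMSE on the held-out fold
  }))
})
spar_grid[which.min(cv_rmse)]  # CV-selected smoothing parameter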

JahnaviGangu commented 4 months ago
# Load necessary libraries
library(tidyverse)
#> Warning: package 'dplyr' was built under R version 4.2.3
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(imputeTS)

fredr_set_key("c387f7cbc3f36a5a52a03d391d17e253")
icnsa_data <- fredr(series_id = "ICNSA")  

# Convert date to Date format
icnsa_data$date <- as.Date(icnsa_data$date)

# Initial plot
ggplot() +
  geom_line(data = icnsa_data, aes(x = date, y = value, color = "ICNSA"), linewidth = 1) +
  labs(title = "Time Series Plot", x = "Date", y = "Value") +
  scale_color_manual(values = c("ICNSA" = "black")) +
  theme_minimal()


# Based on the plot, choose the COVID period
start_date <- as.Date("2020-03-01")
end_date <- as.Date("2021-06-30")

# Plot with marked COVID period
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(start_date), linetype = "dashed", color = "red") +
  geom_vline(xintercept = as.numeric(end_date), linetype = "dashed", color = "red") +
  labs(title = "Claims Marking the Covid Pandemic Period",
       x = "Year",
       y = "Number")


print(paste("COVID Start Date:", start_date))
#> [1] "COVID Start Date: 2020-03-01"
print(paste("COVID End Date:", end_date))
#> [1] "COVID End Date: 2021-06-30"

# Introduce artificial missing values
icnsa_data$value[icnsa_data$date >= start_date & icnsa_data$date <= end_date] <- NA

# 1. Imputation using smooth spline
cleaned_data <- icnsa_data[!(icnsa_data$date >= start_date & icnsa_data$date <= end_date), ]
spline_fit <- smooth.spline(x = as.numeric(cleaned_data$date), y = cleaned_data$value, spar = 0.6)
covid_idx <- icnsa_data$date >= start_date & icnsa_data$date <= end_date
# Predict at the COVID-period dates so the imputed values align with those rows
# (the original indexing into the spline's fitted values was misaligned)
icnsa_data$value[covid_idx] <- round(predict(spline_fit, x = as.numeric(icnsa_data$date[covid_idx]))$y)

# 2. Basic structural model (BSM)

# Create time series object
icnsa_ts <- ts(icnsa_data$value, frequency = 52)

# Define and fit the BSM model
base_model <- StructTS(icnsa_ts, type = "BSM")
base_prediction <- forecast(base_model, h = 1)

# Print point prediction for BSM
cat("Point prediction (BSM):", base_prediction$mean[1], "\n")
#> Point prediction (BSM): 236583.7

# Additional plot: ACF and PACF plots for model diagnostics
acf(base_model$residuals)

pacf(base_model$residuals)
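
# A minimal sketch (assuming base_model from above): the fitted BSM exposes its
# estimated variance components, showing how much variation is attributed to
# the level, slope, seasonal, and observation-noise terms.
base_model$coef
tsdiag(base_model)  # standard residual diagnostics for a StructTS fit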


# 3. Covariate model using the Continued Claims 4-Week Moving Average (CC4WSA)

# Download CC4WSA data (continued claims, 4-week moving average, seasonally adjusted)
cc4wsa_data <- fredr(series_id = "CC4WSA")

# Create covariate time series object
cc4wsa_ts <- ts(cc4wsa_data$value, frequency = 52)

# Align ICNSA and CC4WSA time series (both are indexed from period 1 here, so
# window() aligns them by position over the common span)
aligned_icnsa <- window(icnsa_ts, start = start(cc4wsa_ts), end = end(cc4wsa_ts))

# Fit a regression with ARIMA errors, using CC4WSA as the covariate
covar_model <- auto.arima(aligned_icnsa, xreg = cc4wsa_ts)

# Get the last value of the covariate for prediction
cc4wsa_last_value <- tail(cc4wsa_ts, n = 1)

# **Covariate model prediction**
covar_prediction <- forecast(covar_model, h = 1, xreg = cc4wsa_last_value)

# Print point prediction for covariate model
cat("Point prediction (Covariate Model):", covar_prediction$mean[1], "\n")
#> Point prediction (Covariate Model): 245003.9

Created on 2024-03-07 with reprex v2.1.0

JahnaviGangu commented 2 months ago
library(readxl)
library(zoo)  
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(vars)  
#> Warning: package 'vars' was built under R version 4.2.3
#> Loading required package: MASS
#> Warning: package 'MASS' was built under R version 4.2.3
#> Loading required package: strucchange
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.2.3
#> Loading required package: urca
#> Loading required package: lmtest
library(tidyr)
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:MASS':
#> 
#>     select
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
#> 
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#> 
#>     combine
library(BigVAR)
#> Loading required package: lattice
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo

#Below are the 4 seasonally adjusted time series from the US Census Bureau's M3 survey.
#1. **Transportation Equipment (TSANO)**: Tracks demand for U.S. transportation equipment like vehicles and aircraft, indicating consumer and business confidence in transportation.

#2. **Total Manufacturing (TMISR)**: Shows the inventory management efficiency in manufacturing, with lower ratios indicating stronger demand and higher ratios suggesting potential overstocking or slowing demand.

#3. **Primary Metals (PMISR)**: Reflects the inventory versus shipments in the metals sector, important for assessing the health of construction, manufacturing, and technology industries.

#4. **Computer and Electronic Products (CESVS)**: Measures output and demand in the tech sector, with high values signaling strong demand for technology products.

base_path <- "/Users/jahnavigangu/Downloads"

# List of Excel file names
file_names <- paste0("SeriesReport", 1:4, ".xlsx")  

# Looping over the file names to read each file into its own data frame
for (i in 1:length(file_names)) {
  # Construct the full file path
  file_path <- file.path(base_path, file_names[i])

  # Dynamically create a variable name for each data frame
  assign(paste0("df", i), read_excel(file_path, skip = 6))
}

head(df1)
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 NA   
#> 2 Feb-1992 31604
#> 3 Mar-1992 34300
#> 4 Apr-1992 37866
#> 5 May-1992 37482
#> 6 Jun-1992 34615
head(df2)
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 1.66 
#> 2 Feb-1992 1.65 
#> 3 Mar-1992 1.58 
#> 4 Apr-1992 1.57 
#> 5 May-1992 1.55 
#> 6 Jun-1992 1.53
head(df3)
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 1.71 
#> 2 Feb-1992 1.69 
#> 3 Mar-1992 1.79 
#> 4 Apr-1992 1.77 
#> 5 May-1992 1.71 
#> 6 Jun-1992 1.75
head(df4)
#> # A tibble: 6 × 2
#>   Period   Value
#>   <chr>    <chr>
#> 1 Jan-1992 22394
#> 2 Feb-1992 21884
#> 3 Mar-1992 22437
#> 4 Apr-1992 23033
#> 5 May-1992 23018
#> 6 Jun-1992 22721

dfs <- list(df1, df2, df3, df4)
series_names <- c("TSANO", "TMISR", "PMISR", "CESVS")  # labels for plots and summaries

for(i in seq_along(dfs)) {
  # Finding non-numeric values
  non_numeric <- grep("[^0-9.-]", dfs[[i]]$Value)

  if(length(non_numeric) > 0) {
    cat("Non-numeric values found in df", i, ": ", dfs[[i]]$Value[non_numeric], "\n\n")
  }
}
#> Non-numeric values found in df 1 :  NA NA NA NA NA NA NA NA NA NA NA 
#> 
#> Non-numeric values found in df 2 :  NA NA NA NA NA NA NA NA NA NA 
#> 
#> Non-numeric values found in df 3 :  NA NA NA NA NA NA NA NA NA NA 
#> 
#> Non-numeric values found in df 4 :  NA NA NA NA NA NA NA NA NA NA

for(i in seq_along(dfs)) {
  # Removing non-numeric characters except decimal points and minus signs
  dfs[[i]]$Value <- gsub("[^0-9.-]", "", dfs[[i]]$Value)

  # Converting to numeric first (empty strings become NA) so the zoo
  # interpolation functions below operate on numeric data
  dfs[[i]]$Value <- as.numeric(dfs[[i]]$Value)

  # Linearly interpolating interior NAs
  dfs[[i]]$Value <- na.approx(dfs[[i]]$Value, na.rm = FALSE)

  # Then using na.fill to handle any remaining leading/trailing NAs by
  # extending the first/last observed values
  dfs[[i]]$Value <- na.fill(dfs[[i]]$Value, fill = "extend")
}

for (i in 1:length(dfs)) {

  # Time Series Conversion
  ts_data <- ts(dfs[[i]]$Value, start = c(1992, 1), frequency = 12) 

  # Visualization
  plot(ts_data, main = paste0("Time Series Plot - ", names(dfs)[i]))

  # Summary Statistics
  print(paste0("Summary Statistics - ", names(dfs)[i])) 
  print(summary(ts_data))

  # Decomposition (optional)
  decomp_data <- decompose(ts_data)
  plot(decomp_data)
}

#> [1] "Summary Statistics - "
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   29144   49204   59534   62925   76455  144061

#> [1] "Summary Statistics - "
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   1.140   1.290   1.375   1.370   1.460   1.890

#> [1] "Summary Statistics - "
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   1.340   1.590   1.670   1.681   1.750   2.470

#> [1] "Summary Statistics - "
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   21884   25957   28511   29529   32172   44534


# dfs is a list of data frames, each with a "Value" column

# Applying first differencing to each dataset in the list
dfs_diff <- lapply(dfs, function(df) {
  # Apply first differencing and prepend NA to keep the length the same
  df$Value <- diff(c(NA, df$Value), differences = 1)
  return(df)
})

# Removing the first NA value introduced by differencing
dfs_diff <- lapply(dfs_diff, function(df) {
  # Remove the first row which contains NA due to differencing
  df <- df[-1, ]
  return(df)
})

# Plotting each first differenced series
for (i in 1:length(dfs_diff)) {
  plot(ts(dfs_diff[[i]]$Value, start = c(1992, 2), frequency = 12), type = 'l',
       main = paste0("First Differenced Series - ", series_names[i]),
       xlab = "Time", ylab = "Differenced Value")
}


# Applying the Augmented Dickey-Fuller test to the first differenced series
for (i in 1:length(dfs_diff)) {
  # Convert to numeric as adf.test requires numeric or univariate time series
  numeric_values <- as.numeric(dfs_diff[[i]]$Value)
  adf_result_diff <- adf.test(numeric_values, alternative = "stationary")
  print(paste("ADF Test Result for Dataset", i))
  print(adf_result_diff)
}
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 1"
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  numeric_values
#> Dickey-Fuller = -8.5316, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 2"
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  numeric_values
#> Dickey-Fuller = -7.1401, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 3"
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  numeric_values
#> Dickey-Fuller = -8.018, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 4"
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  numeric_values
#> Dickey-Fuller = -5.6091, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary

par(mfrow=c(2,2)) # Setting up the plotting area to display 4 plots (for 4 datasets)

for (i in 1:length(dfs)) {
  # Plot original series
  ts_original <- ts(dfs[[i]]$Value, start=c(1992, 1), frequency=12)
  plot(ts_original, type='l', main=paste("Original Series - Dataset", i), xlab="Time", ylab="Value")

  # Plot differenced series
  ts_differenced <- ts(as.numeric(dfs_diff[[i]]$Value), start=c(1992, 2), frequency=12) # Starting from 1992, 2 because the first value is removed
  plot(ts_differenced, type='l', main=paste("Differenced Series - Dataset", i), xlab="Time", ylab="Differenced Value")
}


par(mfrow=c(1,1)) # Resetting plotting area to default

# Combine the four cleaned level series into one multivariate matrix
# (each df in dfs has the same number of rows and time alignment)
combined_ts <- do.call(cbind, lapply(dfs, function(x) as.numeric(x$Value)))
rownames(combined_ts) <- dfs[[1]]$Period

# Converting the combined data frame to a ts object with correct frequency
combined_ts_obj <- ts(combined_ts, start=c(1992, 1), frequency=12)

#EDA and feature observations
#The code performs data cleaning, transformation, and analysis on four economic
#time series: transportation equipment, total manufacturing, primary metals,
#and computer and electronic products. Key steps include handling missing
#values, converting data to numeric format, and creating time series objects.
#Summary statistics reveal variability across sectors -- fluctuations in orders,
#inventory ratios, and shipments reflecting economic cycles and industry-specific
#trends. The level series show the trend and persistence typical of
#non-stationary data, so first differencing was applied; ADF tests on the
#differenced series all reject the unit root (p = 0.01), confirming stationarity.
#(Note that the VAR below is nonetheless fit on the level series; the
#differenced series were used for the stationarity checks above.)
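
# A minimal sketch (assuming combined_ts_obj from above): since the VAR below
# is fit on the level series, it is worth testing the levels themselves, e.g.
# with urca::ur.df (urca is already loaded).
for (j in 1:ncol(combined_ts_obj)) {
  adf_levels <- urca::ur.df(combined_ts_obj[, j], type = "drift", selectlags = "AIC")
  cat("Series", j, "ADF tau statistic:", adf_levels@teststat[1], "\n")
}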

# Now fit the VAR model to the combined time series object

var1 <- VAR(combined_ts_obj, p = 1) 
#> Warning in VAR(combined_ts_obj, p = 1): No column names supplied in y, using: y1, y2, y3, y4 , instead.
summary(var1)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: y1, y2, y3, y4 
#> Deterministic variables: const 
#> Sample size: 395 
#> Log Likelihood: -5581.497 
#> Roots of the characteristic polynomial:
#> 0.9814 0.9714 0.9514 0.8879
#> Call:
#> VAR(y = combined_ts_obj, p = 1)
#> 
#> 
#> Estimation results for equation y1: 
#> =================================== 
#> y1 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const 
#> 
#>         Estimate Std. Error t value Pr(>|t|)    
#> y1.l1    0.89077    0.02292  38.857   <2e-16 ***
#> y2.l1  261.77777 4871.30721   0.054    0.957    
#> y3.l1  334.47468 3381.93701   0.099    0.921    
#> y4.l1   -0.10795    0.09304  -1.160    0.247    
#> const 9281.23087 7239.20054   1.282    0.201    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 8050 on 390 degrees of freedom
#> Multiple R-Squared: 0.8041,  Adjusted R-squared: 0.8021 
#> F-statistic: 400.3 on 4 and 390 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation y2: 
#> =================================== 
#> y2 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> y1.l1 1.604e-07  7.127e-08   2.251    0.025 *  
#> y2.l1 9.666e-01  1.514e-02  63.825   <2e-16 ***
#> y3.l1 4.677e-03  1.051e-02   0.445    0.657    
#> y4.l1 1.629e-07  2.893e-07   0.563    0.574    
#> const 2.252e-02  2.251e-02   1.001    0.318    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 0.02503 on 390 degrees of freedom
#> Multiple R-Squared: 0.9503,  Adjusted R-squared: 0.9498 
#> F-statistic:  1865 on 4 and 390 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation y3: 
#> =================================== 
#> y3 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const 
#> 
#>        Estimate Std. Error t value Pr(>|t|)    
#> y1.l1 2.553e-07  1.351e-07   1.890   0.0595 .  
#> y2.l1 9.086e-03  2.870e-02   0.317   0.7517    
#> y3.l1 9.534e-01  1.992e-02  47.850   <2e-16 ***
#> y4.l1 3.849e-07  5.481e-07   0.702   0.4830    
#> const 3.863e-02  4.265e-02   0.906   0.3656    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 0.04743 on 390 degrees of freedom
#> Multiple R-Squared: 0.912,   Adjusted R-squared: 0.9111 
#> F-statistic:  1011 on 4 and 390 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation y4: 
#> =================================== 
#> y4 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const 
#> 
#>         Estimate Std. Error t value Pr(>|t|)    
#> y1.l1 -2.520e-03  2.102e-03  -1.199  0.23141    
#> y2.l1  8.420e+02  4.467e+02   1.885  0.06017 .  
#> y3.l1 -8.291e+02  3.101e+02  -2.674  0.00782 ** 
#> y4.l1  9.814e-01  8.532e-03 115.031  < 2e-16 ***
#> const  9.669e+02  6.638e+02   1.457  0.14602    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 738.2 on 390 degrees of freedom
#> Multiple R-Squared: 0.9753,  Adjusted R-squared: 0.975 
#> F-statistic:  3846 on 4 and 390 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>            y1         y2         y3         y4
#> y1  6.481e+07 -5.696e+01 -9.367e+01 898240.507
#> y2 -5.696e+01  6.264e-04  8.005e-04     -4.301
#> y3 -9.367e+01  8.005e-04  2.249e-03     -4.472
#> y4  8.982e+05 -4.301e+00 -4.472e+00 544889.467
#> 
#> Correlation matrix of residuals:
#>         y1      y2      y3      y4
#> y1  1.0000 -0.2827 -0.2453  0.1512
#> y2 -0.2827  1.0000  0.6744 -0.2328
#> y3 -0.2453  0.6744  1.0000 -0.1277
#> y4  0.1512 -0.2328 -0.1277  1.0000

# AIC for VAR(1)
aic_var1 <- AIC(var1)

# Finding the optimal lag order p for VAR model based on AIC
aic_values <- numeric(10)
for(p in 1:10) {
  model <- VAR(combined_ts_obj, p = p, type = "both")
  aic_values[p] <- AIC(model)
}
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
optimal_p <- which.min(aic_values)

# Fitting the VAR model using the optimal p
var_optimal <- VAR(combined_ts_obj, p = optimal_p, type = "both")
#> Warning in VAR(combined_ts_obj, p = optimal_p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.

# AIC for VAR(optimal_p)
aic_var_opt <- AIC(var_optimal)

# summary of the optimal model
summary(var_optimal)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: y1, y2, y3, y4 
#> Deterministic variables: both 
#> Sample size: 386 
#> Log Likelihood: -5170.622 
#> Roots of the characteristic polynomial:
#> 0.9775 0.9775 0.921 0.921 0.9023 0.9023 0.881 0.881 0.8431 0.8431 0.8295 0.8295 0.8145 0.8145 0.8064 0.8064 0.7972 0.7972 0.7922 0.7922 0.7748 0.7748 0.7692 0.7692 0.7565 0.7565 0.7561 0.7561 0.7364 0.7364 0.7295 0.7295 0.7162 0.7162 0.6817 0.6817 0.6536 0.6536 0.5645 0.5645
#> Call:
#> VAR(y = combined_ts_obj, p = optimal_p, type = "both")
#> 
#> 
#> Estimation results for equation y1: 
#> =================================== 
#> y1 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend 
#> 
#>          Estimate Std. Error t value Pr(>|t|)    
#> y1.l1   2.483e-01  5.762e-02   4.310 2.14e-05 ***
#> y2.l1  -3.569e+04  2.338e+04  -1.526  0.12784    
#> y3.l1  -1.790e+04  1.240e+04  -1.444  0.14976    
#> y4.l1   9.545e-02  5.958e-01   0.160  0.87282    
#> y1.l2   4.318e-02  5.935e-02   0.728  0.46738    
#> y2.l2   7.321e+03  2.920e+04   0.251  0.80220    
#> y3.l2   1.175e+04  1.665e+04   0.706  0.48077    
#> y4.l2   3.677e-01  7.203e-01   0.510  0.61005    
#> y1.l3   1.729e-01  5.996e-02   2.884  0.00417 ** 
#> y2.l3  -8.748e+03  2.918e+04  -0.300  0.76452    
#> y3.l3   4.762e+02  1.672e+04   0.028  0.97729    
#> y4.l3   6.483e-01  7.312e-01   0.887  0.37584    
#> y1.l4   8.164e-02  6.087e-02   1.341  0.18073    
#> y2.l4   2.861e+04  2.952e+04   0.969  0.33312    
#> y3.l4   7.060e+03  1.679e+04   0.420  0.67445    
#> y4.l4  -6.261e-01  7.694e-01  -0.814  0.41635    
#> y1.l5   1.431e-01  6.066e-02   2.359  0.01890 *  
#> y2.l5   6.692e+03  2.918e+04   0.229  0.81874    
#> y3.l5  -2.102e+04  1.654e+04  -1.271  0.20460    
#> y4.l5  -1.195e+00  7.797e-01  -1.532  0.12637    
#> y1.l6   1.585e-01  6.100e-02   2.599  0.00976 ** 
#> y2.l6   1.442e+04  2.898e+04   0.498  0.61903    
#> y3.l6   1.576e+04  1.651e+04   0.954  0.34052    
#> y4.l6   4.766e-01  7.816e-01   0.610  0.54244    
#> y1.l7  -1.607e-02  6.191e-02  -0.260  0.79536    
#> y2.l7  -1.343e+04  2.861e+04  -0.469  0.63913    
#> y3.l7   3.947e+03  1.653e+04   0.239  0.81146    
#> y4.l7   4.667e-01  7.706e-01   0.606  0.54519    
#> y1.l8   1.527e-03  6.081e-02   0.025  0.97998    
#> y2.l8  -1.159e+04  2.796e+04  -0.415  0.67873    
#> y3.l8  -2.038e+03  1.616e+04  -0.126  0.89967    
#> y4.l8   6.502e-01  7.355e-01   0.884  0.37732    
#> y1.l9   1.904e-02  6.064e-02   0.314  0.75370    
#> y2.l9   3.580e+04  2.699e+04   1.327  0.18551    
#> y3.l9  -1.160e+04  1.590e+04  -0.729  0.46630    
#> y4.l9  -8.694e-01  7.136e-01  -1.218  0.22391    
#> y1.l10  1.827e-03  5.828e-02   0.031  0.97501    
#> y2.l10 -1.985e+04  2.036e+04  -0.975  0.33022    
#> y3.l10  5.209e+03  1.117e+04   0.467  0.64112    
#> y4.l10  3.918e-03  6.010e-01   0.007  0.99480    
#> const   1.340e+04  7.453e+03   1.798  0.07299 .  
#> trend   2.405e+01  8.537e+00   2.818  0.00512 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 6889 on 344 degrees of freedom
#> Multiple R-Squared: 0.866,   Adjusted R-squared:  0.85 
#> F-statistic: 54.21 on 41 and 344 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation y2: 
#> =================================== 
#> y2 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend 
#> 
#>          Estimate Std. Error t value Pr(>|t|)    
#> y1.l1  -6.222e-08  2.011e-07  -0.309  0.75721    
#> y2.l1   8.128e-01  8.161e-02   9.959  < 2e-16 ***
#> y3.l1   1.727e-01  4.328e-02   3.990 8.07e-05 ***
#> y4.l1   1.536e-06  2.079e-06   0.739  0.46047    
#> y1.l2   3.969e-07  2.072e-07   1.916  0.05617 .  
#> y2.l2  -9.768e-02  1.019e-01  -0.958  0.33852    
#> y3.l2  -1.128e-01  5.812e-02  -1.941  0.05303 .  
#> y4.l2  -4.037e-06  2.514e-06  -1.606  0.10922    
#> y1.l3   2.706e-08  2.092e-07   0.129  0.89717    
#> y2.l3   2.981e-01  1.018e-01   2.927  0.00365 ** 
#> y3.l3  -7.017e-02  5.835e-02  -1.203  0.22996    
#> y4.l3  -3.239e-07  2.552e-06  -0.127  0.89908    
#> y1.l4  -1.060e-08  2.124e-07  -0.050  0.96022    
#> y2.l4  -1.173e-01  1.030e-01  -1.138  0.25594    
#> y3.l4   8.504e-03  5.861e-02   0.145  0.88471    
#> y4.l4   4.504e-06  2.685e-06   1.677  0.09443 .  
#> y1.l5  -4.756e-07  2.117e-07  -2.247  0.02530 *  
#> y2.l5   1.782e-02  1.018e-01   0.175  0.86117    
#> y3.l5   5.986e-02  5.773e-02   1.037  0.30055    
#> y4.l5  -6.450e-08  2.721e-06  -0.024  0.98110    
#> y1.l6   1.527e-07  2.129e-07   0.717  0.47368    
#> y2.l6   1.017e-01  1.011e-01   1.005  0.31547    
#> y3.l6  -8.048e-02  5.764e-02  -1.396  0.16353    
#> y4.l6  -1.914e-06  2.728e-06  -0.702  0.48336    
#> y1.l7   3.229e-08  2.161e-07   0.149  0.88129    
#> y2.l7  -4.790e-02  9.986e-02  -0.480  0.63177    
#> y3.l7   9.943e-04  5.770e-02   0.017  0.98626    
#> y4.l7   2.061e-06  2.690e-06   0.766  0.44407    
#> y1.l8   7.279e-09  2.122e-07   0.034  0.97266    
#> y2.l8   6.195e-02  9.759e-02   0.635  0.52600    
#> y3.l8   1.901e-02  5.639e-02   0.337  0.73623    
#> y4.l8  -6.535e-07  2.567e-06  -0.255  0.79919    
#> y1.l9   1.924e-08  2.116e-07   0.091  0.92761    
#> y2.l9  -8.484e-02  9.419e-02  -0.901  0.36836    
#> y3.l9   2.127e-02  5.549e-02   0.383  0.70170    
#> y4.l9   2.617e-06  2.491e-06   1.051  0.29402    
#> y1.l10  5.527e-08  2.034e-07   0.272  0.78601    
#> y2.l10  2.139e-02  7.106e-02   0.301  0.76361    
#> y3.l10 -7.210e-03  3.897e-02  -0.185  0.85332    
#> y4.l10 -3.736e-06  2.098e-06  -1.781  0.07575 .  
#> const   1.754e-02  2.601e-02   0.674  0.50066    
#> trend   7.637e-07  2.979e-05   0.026  0.97956    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 0.02404 on 344 degrees of freedom
#> Multiple R-Squared: 0.9563,  Adjusted R-squared: 0.9511 
#> F-statistic: 183.6 on 41 and 344 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation y3: 
#> =================================== 
#> y3 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend 
#> 
#>          Estimate Std. Error t value Pr(>|t|)    
#> y1.l1   2.253e-07  3.670e-07   0.614   0.5396    
#> y2.l1   1.344e-01  1.489e-01   0.903   0.3674    
#> y3.l1   1.115e+00  7.897e-02  14.123   <2e-16 ***
#> y4.l1  -1.744e-06  3.794e-06  -0.460   0.6460    
#> y1.l2   1.611e-07  3.780e-07   0.426   0.6702    
#> y2.l2  -3.088e-01  1.860e-01  -1.661   0.0977 .  
#> y3.l2  -9.373e-02  1.061e-01  -0.884   0.3774    
#> y4.l2  -5.884e-06  4.587e-06  -1.283   0.2005    
#> y1.l3  -6.682e-09  3.818e-07  -0.018   0.9860    
#> y2.l3   3.670e-01  1.858e-01   1.975   0.0490 *  
#> y3.l3  -4.057e-03  1.065e-01  -0.038   0.9696    
#> y4.l3   4.302e-06  4.656e-06   0.924   0.3561    
#> y1.l4  -5.050e-07  3.876e-07  -1.303   0.1936    
#> y2.l4  -2.093e-01  1.880e-01  -1.113   0.2664    
#> y3.l4  -1.181e-01  1.069e-01  -1.104   0.2703    
#> y4.l4   4.638e-06  4.900e-06   0.946   0.3446    
#> y1.l5  -3.331e-07  3.863e-07  -0.862   0.3891    
#> y2.l5   1.088e-02  1.858e-01   0.059   0.9533    
#> y3.l5   1.848e-01  1.053e-01   1.754   0.0803 .  
#> y4.l5  -3.273e-06  4.965e-06  -0.659   0.5102    
#> y1.l6  -1.875e-07  3.885e-07  -0.483   0.6296    
#> y2.l6   9.891e-02  1.845e-01   0.536   0.5923    
#> y3.l6  -3.491e-01  1.052e-01  -3.319   0.0010 ***
#> y4.l6  -1.558e-06  4.978e-06  -0.313   0.7544    
#> y1.l7   2.698e-07  3.943e-07   0.684   0.4943    
#> y2.l7  -1.341e-01  1.822e-01  -0.736   0.4621    
#> y3.l7   1.176e-01  1.053e-01   1.117   0.2648    
#> y4.l7   1.468e-06  4.908e-06   0.299   0.7651    
#> y1.l8   2.052e-07  3.873e-07   0.530   0.5966    
#> y2.l8  -2.978e-02  1.781e-01  -0.167   0.8673    
#> y3.l8   5.314e-02  1.029e-01   0.516   0.6058    
#> y4.l8  -5.990e-07  4.684e-06  -0.128   0.8983    
#> y1.l9   1.715e-07  3.862e-07   0.444   0.6573    
#> y2.l9  -1.169e-01  1.719e-01  -0.680   0.4968    
#> y3.l9   9.204e-02  1.013e-01   0.909   0.3640    
#> y4.l9   8.736e-06  4.544e-06   1.922   0.0554 .  
#> y1.l10  2.191e-07  3.712e-07   0.590   0.5554    
#> y2.l10  2.396e-01  1.297e-01   1.848   0.0655 .  
#> y3.l10 -1.192e-01  7.111e-02  -1.676   0.0947 .  
#> y4.l10 -5.790e-06  3.827e-06  -1.513   0.1312    
#> const   1.047e-01  4.746e-02   2.206   0.0280 *  
#> trend   2.833e-05  5.436e-05   0.521   0.6027    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 0.04387 on 344 degrees of freedom
#> Multiple R-Squared: 0.9332,  Adjusted R-squared: 0.9253 
#> F-statistic: 117.2 on 41 and 344 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation y4: 
#> =================================== 
#> y4 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend 
#> 
#>          Estimate Std. Error t value Pr(>|t|)    
#> y1.l1  -4.999e-03  5.276e-03  -0.947 0.344124    
#> y2.l1   9.123e+02  2.141e+03   0.426 0.670340    
#> y3.l1  -2.490e+03  1.136e+03  -2.193 0.029011 *  
#> y4.l1   6.889e-01  5.456e-02  12.627  < 2e-16 ***
#> y1.l2  -1.649e-03  5.435e-03  -0.303 0.761703    
#> y2.l2  -9.081e+02  2.674e+03  -0.340 0.734391    
#> y3.l2   2.354e+03  1.525e+03   1.543 0.123667    
#> y4.l2   2.579e-01  6.596e-02   3.910 0.000111 ***
#> y1.l3   9.151e-03  5.490e-03   1.667 0.096454 .  
#> y2.l3   1.076e+03  2.672e+03   0.403 0.687319    
#> y3.l3  -1.037e+03  1.531e+03  -0.678 0.498433    
#> y4.l3   4.091e-01  6.695e-02   6.110 2.69e-09 ***
#> y1.l4  -5.150e-03  5.574e-03  -0.924 0.356150    
#> y2.l4  -1.020e+03  2.704e+03  -0.377 0.706157    
#> y3.l4   2.405e+03  1.538e+03   1.564 0.118771    
#> y4.l4  -2.131e-01  7.046e-02  -3.024 0.002680 ** 
#> y1.l5   4.086e-03  5.554e-03   0.736 0.462437    
#> y2.l5  -8.392e+02  2.672e+03  -0.314 0.753646    
#> y3.l5  -1.178e+03  1.515e+03  -0.777 0.437466    
#> y4.l5  -1.339e-01  7.139e-02  -1.876 0.061555 .  
#> y1.l6  -9.508e-04  5.586e-03  -0.170 0.864952    
#> y2.l6   1.433e+03  2.653e+03   0.540 0.589577    
#> y3.l6  -5.079e+02  1.512e+03  -0.336 0.737182    
#> y4.l6   7.456e-02  7.158e-02   1.042 0.298303    
#> y1.l7   5.649e-03  5.669e-03   0.996 0.319776    
#> y2.l7  -5.875e+02  2.620e+03  -0.224 0.822722    
#> y3.l7   1.664e+03  1.514e+03   1.099 0.272532    
#> y4.l7  -1.380e-01  7.057e-02  -1.956 0.051325 .  
#> y1.l8  -1.511e-03  5.568e-03  -0.271 0.786239    
#> y2.l8   1.456e+02  2.561e+03   0.057 0.954684    
#> y3.l8  -3.035e+03  1.480e+03  -2.051 0.041011 *  
#> y4.l8  -6.932e-02  6.735e-02  -1.029 0.304105    
#> y1.l9   4.274e-03  5.553e-03   0.770 0.442003    
#> y2.l9   1.517e+03  2.471e+03   0.614 0.539871    
#> y3.l9   1.722e+03  1.456e+03   1.182 0.237845    
#> y4.l9   1.103e-01  6.535e-02   1.687 0.092449 .  
#> y1.l10 -9.220e-03  5.337e-03  -1.727 0.084991 .  
#> y2.l10 -8.679e+02  1.864e+03  -0.466 0.641857    
#> y3.l10 -6.004e+02  1.022e+03  -0.587 0.557451    
#> y4.l10 -7.685e-03  5.503e-02  -0.140 0.889021    
#> const   7.554e+02  6.824e+02   1.107 0.269130    
#> trend  -4.181e-01  7.817e-01  -0.535 0.593096    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 630.9 on 344 degrees of freedom
#> Multiple R-Squared: 0.9832,  Adjusted R-squared: 0.9812 
#> F-statistic:   492 on 41 and 344 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>            y1         y2         y3         y4
#> y1  4.746e+07 -5.879e+01 -8.810e+01 459674.754
#> y2 -5.879e+01  5.781e-04  7.782e-04     -2.869
#> y3 -8.810e+01  7.782e-04  1.925e-03     -3.925
#> y4  4.597e+05 -2.869e+00 -3.925e+00 397978.167
#> 
#> Correlation matrix of residuals:
#>         y1      y2      y3      y4
#> y1  1.0000 -0.3549 -0.2915  0.1058
#> y2 -0.3549  1.0000  0.7377 -0.1892
#> y3 -0.2915  0.7377  1.0000 -0.1418
#> y4  0.1058 -0.1892 -0.1418  1.0000

# Comparing the results above
print(paste("AIC for VAR(1):", aic_var1))
#> [1] "AIC for VAR(1): 11202.9930568861"
print(paste("AIC for VAR(", optimal_p, "):", aic_var_opt, sep=""))
#> [1] "AIC for VAR(10):10677.2445825659"

# Deciding which is better based on AIC (lower is better)
if(aic_var1 < aic_var_opt) {
  print("VAR(1) is better.")
} else {
  print(paste("VAR(", optimal_p, ") is better.", sep=""))
}
#> [1] "VAR(10) is better."
#The code fits a VAR(1) model and searches for the optimal lag order of a VAR(p)
#model, finding p greater than 1. Comparing both models by AIC, VAR(10) scores
#lower, making it the more suitable choice for these time series.
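
# A minimal sketch: vars::VARselect() computes AIC, HQ, SC, and FPE across lag
# orders in one call, a compact alternative to the manual AIC loop above
# (assuming combined_ts_obj from above).
VARselect(combined_ts_obj, lag.max = 10, type = "both")$selection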

# AIC for VAR models across different p values

plot(1:10, aic_values, type='b', xlab="Lag Order", ylab="AIC Value", main="AIC Values for Different Lag Orders in VAR Model")
points(optimal_p, aic_values[optimal_p], col="red", cex=1.5, pch=20)


# Displaying the forecast results for VAR(1)
forecast_results <- predict(var1, n.ahead = 1)  
print(forecast_results$fcst)
#> $y1
#>             fcst    lower    upper       CI
#> y1.fcst 87565.07 71786.84 103343.3 15778.23
#> 
#> $y2
#>             fcst    lower    upper         CI
#> y2.fcst 1.470849 1.421797 1.519902 0.04905267
#> 
#> $y3
#>             fcst    lower    upper         CI
#> y3.fcst 1.735819 1.642867 1.828772 0.09295293
#> 
#> $y4
#>             fcst    lower    upper       CI
#> y4.fcst 29756.05 28309.27 31202.83 1446.779

# Displaying the forecast results for VAR(optimal p)
forecast_results <- predict(var_optimal, n.ahead = 1)  
print(forecast_results$fcst)
#> $y1
#>             fcst    lower    upper      CI
#> y1.fcst 91465.82 77963.22 104968.4 13502.6
#> 
#> $y2
#>            fcst    lower    upper         CI
#> y2.fcst 1.47061 1.423485 1.517735 0.04712519
#> 
#> $y3
#>             fcst    lower    upper         CI
#> y3.fcst 1.741211 1.655222 1.827199 0.08598828
#> 
#> $y4
#>            fcst    lower    upper       CI
#> y4.fcst 29743.2 28506.75 30979.66 1236.453

#These forecasts provide an estimate of the future values for each of these series, indicating the expected demand, efficiency, and output in the respective sectors of the economy.

# For the VAR(1) model: testing Granger and instantaneous causality of each variable
granger_test_var1_y1 <- causality(var1, cause = "y1")
granger_test_var1_y2 <- causality(var1, cause = "y2")
granger_test_var1_y3 <- causality(var1, cause = "y3")
granger_test_var1_y4 <- causality(var1, cause = "y4")

# Printing results for y1,y2,y3,y4 causing others in VAR(1)
print(granger_test_var1_y1)
#> $Granger
#> 
#>  Granger causality H0: y1 do not Granger-cause y2 y3 y4
#> 
#> data:  VAR object var1
#> F-Test = 1.9435, df1 = 3, df2 = 1560, p-value = 0.1206
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y1 and y2 y3 y4
#> 
#> data:  VAR object var1
#> Chi-squared = 33.827, df = 3, p-value = 2.155e-07
print(granger_test_var1_y2)
#> $Granger
#> 
#>  Granger causality H0: y2 do not Granger-cause y1 y3 y4
#> 
#> data:  VAR object var1
#> F-Test = 1.2935, df1 = 3, df2 = 1560, p-value = 0.2751
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y2 and y1 y3 y4
#> 
#> data:  VAR object var1
#> Chi-squared = 129.42, df = 3, p-value < 2.2e-16
print(granger_test_var1_y3)
#> $Granger
#> 
#>  Granger causality H0: y3 do not Granger-cause y1 y2 y4
#> 
#> data:  VAR object var1
#> F-Test = 2.47, df1 = 3, df2 = 1560, p-value = 0.06032
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y3 and y1 y2 y4
#> 
#> data:  VAR object var1
#> Chi-squared = 124.32, df = 3, p-value < 2.2e-16
print(granger_test_var1_y4)
#> $Granger
#> 
#>  Granger causality H0: y4 do not Granger-cause y1 y2 y3
#> 
#> data:  VAR object var1
#> F-Test = 0.51122, df1 = 3, df2 = 1560, p-value = 0.6746
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y4 and y1 y2 y3
#> 
#> data:  VAR object var1
#> Chi-squared = 23.859, df = 3, p-value = 2.673e-05

# Repeating the causality test for VAR(p) model
granger_test_varp_y1 <- causality(var_optimal, cause = "y1")
granger_test_varp_y2 <- causality(var_optimal, cause = "y2")
granger_test_varp_y3 <- causality(var_optimal, cause = "y3")
granger_test_varp_y4 <- causality(var_optimal, cause = "y4")

# Print results for y1 causing others in VAR(p)
print(granger_test_varp_y1)
#> $Granger
#> 
#>  Granger causality H0: y1 do not Granger-cause y2 y3 y4
#> 
#> data:  VAR object var_optimal
#> F-Test = 0.92861, df1 = 30, df2 = 1376, p-value = 0.5779
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y1 and y2 y3 y4
#> 
#> data:  VAR object var_optimal
#> Chi-squared = 44.232, df = 3, p-value = 1.347e-09
print(granger_test_varp_y2)
#> $Granger
#> 
#>  Granger causality H0: y2 do not Granger-cause y1 y3 y4
#> 
#> data:  VAR object var_optimal
#> F-Test = 0.88361, df1 = 30, df2 = 1376, p-value = 0.6482
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y2 and y1 y3 y4
#> 
#> data:  VAR object var_optimal
#> Chi-squared = 140.35, df = 3, p-value < 2.2e-16
print(granger_test_varp_y3)
#> $Granger
#> 
#>  Granger causality H0: y3 do not Granger-cause y1 y2 y4
#> 
#> data:  VAR object var_optimal
#> F-Test = 1.3245, df1 = 30, df2 = 1376, p-value = 0.1135
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y3 and y1 y2 y4
#> 
#> data:  VAR object var_optimal
#> Chi-squared = 136.2, df = 3, p-value < 2.2e-16
print(granger_test_varp_y4)
#> $Granger
#> 
#>  Granger causality H0: y4 do not Granger-cause y1 y2 y3
#> 
#> data:  VAR object var_optimal
#> F-Test = 1.3443, df1 = 30, df2 = 1376, p-value = 0.1019
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: y4 and y1 y2 y3
#> 
#> data:  VAR object var_optimal
#> Chi-squared = 13.949, df = 3, p-value = 0.002976


# Fitting a sparse VAR model on combined_ts_obj

# Choosing the "Basic" (elementwise lasso) sparsity structure
sparsity_structure <- "Basic"

# Defining the model specification; with ownlambdas = TRUE, `gran` supplies the
# penalty value(s) directly rather than a search grid
modelSpec <- constructModel(combined_ts_obj, p = 2, struct = sparsity_structure, gran = 50, ownlambdas = TRUE)

# Estimating the BigVAR model
bigvarResult = BigVAR.est(modelSpec)

# Examining the sparsity pattern: this coefficient matrix covers each lag and
# variable, and entries at (or near) zero mark lags/variables the penalty
# judged unimportant
sparsity_pattern <- bigvarResult$B
print(sparsity_pattern)
#> , , 1
#> 
#>             [,1]          [,2]          [,3]          [,4]          [,5]
#> [1,] 7649.837191  5.841149e-01  3.979501e-06 -4.703268e-06 -1.186764e-02
#> [2,]    1.389215 -8.554756e-08  0.000000e+00  0.000000e+00 -2.403261e-07
#> [3,]    1.672897  1.348993e-07  0.000000e+00  0.000000e+00 -2.956989e-07
#> [4,]  670.597726  2.338252e-04 -2.662708e-06 -1.326624e-05  5.248960e-01
#>               [,6]          [,7]          [,8]          [,9]
#> [1,]  3.415819e-01  1.195205e-05  1.312881e-05 -8.204678e-02
#> [2,] -1.879295e-08  0.000000e+00  0.000000e+00 -2.429795e-07
#> [3,]  2.691337e-07  0.000000e+00  0.000000e+00 -2.760821e-07
#> [4,] -2.977075e-03 -1.377133e-06 -1.035675e-05  4.592039e-01


#The BigVAR model with a "Basic" sparsity structure was used, focusing on significant relationships while reducing 
#less important ones to zero. Significant coefficients indicate strong dependencies, especially within variables, 
#highlighting key economic relationships. Near-zero coefficients suggest minimal influence, emphasizing the model's efficiency
#in capturing only crucial interactions. This approach is useful for understanding and forecasting complex economic dynamics by focusing on the most impactful factors.
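
# A minimal sketch (assuming combined_ts_obj from above): instead of supplying
# a single penalty via ownlambdas, BigVAR can cross-validate over a penalty
# grid -- gran = c(depth, gridpoints) -- and select lambda itself.
cvSpec <- constructModel(combined_ts_obj, p = 2, struct = "Basic",
                         gran = c(50, 10), ownlambdas = FALSE)
cvResult <- cv.BigVAR(cvSpec)  # rolling-validation choice of the penalty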

Created on 2024-04-10 with reprex v2.1.0