Open JahnaviGangu opened 5 months ago
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(fredr)
library(stats)
library(ggplot2)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(TSA)
#> Registered S3 methods overwritten by 'TSA':
#> method from
#> fitted.Arima forecast
#> plot.Arima forecast
#>
#> Attaching package: 'TSA'
#> The following objects are masked from 'package:stats':
#>
#> acf, arima
#> The following object is masked from 'package:utils':
#>
#> tar
library(urca)
fredr_set_key("c387f7cbc3f36a5a52a03d391d17e253")
icnsa_data <- fredr(series_id = "ICNSA")
covariate_data <- fredr(series_id = "UNRATE")
str(icnsa_data)
#> tibble [2,980 × 5] (S3: tbl_df/tbl/data.frame)
#> $ date : Date[1:2980], format: "1967-01-07" "1967-01-14" ...
#> $ series_id : chr [1:2980] "ICNSA" "ICNSA" "ICNSA" "ICNSA" ...
#> $ value : num [1:2980] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#> $ realtime_start: Date[1:2980], format: "2024-02-22" "2024-02-22" ...
#> $ realtime_end : Date[1:2980], format: "2024-02-22" "2024-02-22" ...
str(covariate_data)
#> tibble [913 × 5] (S3: tbl_df/tbl/data.frame)
#> $ date : Date[1:913], format: "1948-01-01" "1948-02-01" ...
#> $ series_id : chr [1:913] "UNRATE" "UNRATE" "UNRATE" "UNRATE" ...
#> $ value : num [1:913] 3.4 3.8 4 3.9 3.5 3.6 3.6 3.9 3.8 3.7 ...
#> $ realtime_start: Date[1:913], format: "2024-02-22" "2024-02-22" ...
#> $ realtime_end : Date[1:913], format: "2024-02-22" "2024-02-22" ...
# Handle missing values (consider alternative methods if needed)
missing_icnsa <- colSums(is.na(icnsa_data))
missing_covariate <- colSums(is.na(covariate_data))
cat("Missing values in ICNSA data:\n")
#> Missing values in ICNSA data:
print(missing_icnsa)
#> date series_id value realtime_start realtime_end
#> 0 0 0 0 0
cat("\nMissing values in Covariate data:\n")
#>
#> Missing values in Covariate data:
print(missing_covariate)
#> date series_id value realtime_start realtime_end
#> 0 0 0 0 0
library(ggplot2)
# Convert data frames to regular data frames (if necessary)
icnsa_data <- as.data.frame(icnsa_data)
covariate_data <- as.data.frame(covariate_data)
# Plotting the data
ggplot() +
geom_line(data = icnsa_data, aes(x = date, y = value, color = "ICNSA"), linewidth = 1) +
geom_line(data = covariate_data, aes(x = date, y = value, color = "UNRATE"), linewidth = 1) +
labs(title = "Time Series Plot", x = "Date", y = "Value") +
scale_color_manual(values = c("ICNSA" = "blue", "UNRATE" = "red")) +
theme_minimal()
# Correlation analysis
merged_data <- merge(icnsa_data, covariate_data, by = "date")
correlation <- cor(merged_data$value.x, merged_data$value.y)
cat("Correlation between ICNSA and UNRATE:", correlation)
#> Correlation between ICNSA and UNRATE: 0.6315352
# Print correlation coefficient
print(correlation)
#> [1] 0.6315352
# Define a function for ADF test
adf_test <- function(series) {
result <- adf.test(series)
cat("ADF Statistic:", result$statistic, "\n")
cat("p-value:", result$p.value, "\n")
}
# Perform ADF test for ICNSA
adf_test(icnsa_data$value)
#> Warning in adf.test(series): p-value smaller than printed p-value
#> ADF Statistic: -8.866381
#> p-value: 0.01
# Perform ADF test for UNRATE
adf_test(covariate_data$value)
#> ADF Statistic: -3.914225
#> p-value: 0.01328462
# Cross-correlation function (CCF) plot
ccf_result <- ccf(icnsa_data$value, covariate_data$value, lag.max = 10, plot = TRUE,
xlab = "Lag", ylab = "Correlation", main = "Cross-Correlation Function")
# Identify common time period with non-missing values
common_dates <- intersect(icnsa_data$date, covariate_data$date)
# Subset data based on common dates
icnsa_data <- icnsa_data[icnsa_data$date %in% common_dates, ]
covariate_data <- covariate_data[covariate_data$date %in% common_dates, ]
# Explore relationship between variables
plot(icnsa_data$value ~ covariate_data$value)
# Preprocess data (consider alternative methods if needed)
data <- complete.cases(icnsa_data, covariate_data)
icnsa_value <- icnsa_data$value[data]
covariate_value <- covariate_data$value[data]
# Convert to ts objects with appropriate frequency
icnsa_ts <- ts(icnsa_value, start = min(icnsa_data$date), frequency = 12)
covariate_ts <- ts(covariate_value, start = min(covariate_data$date), frequency = 12)
# Fit ARIMA model with exogenous variable
final_model <- auto.arima(icnsa_ts, xreg = covariate_ts)
#final_model <- Arima(icnsa_data$value, xreg = covariate_data$value)
checkresiduals(final_model)
#>
#> Ljung-Box test
#>
#> data: Residuals from Regression with ARIMA(0,0,0)(0,0,2)[12] errors
#> Q* = 24.948, df = 18, p-value = 0.1263
#>
#> Model df: 2. Total lags used: 20
# Print model summary
summary(final_model)
#> Series: icnsa_ts
#> Regression with ARIMA(0,0,0)(0,0,2)[12] errors
#>
#> Coefficients:
#> sma1 sma2 xreg
#> -0.3463 -0.2509 58675.6188
#> s.e. 0.1232 0.1297 896.9111
#>
#> sigma^2 = 9.608e+09: log likelihood = -1266.06
#> AIC=2540.11 AICc=2540.54 BIC=2550.45
#>
#> Training set error measures:
#> ME RMSE MAE MPE MAPE MASE ACF1
#> Training set -3341.93 96508.81 71015.21 -6.11401 20.47441 0.4927516 0.05227523
str(final_model)
#> List of 19
#> $ coef : Named num [1:3] -0.346 -0.251 58675.619
#> ..- attr(*, "names")= chr [1:3] "sma1" "sma2" "xreg"
#> $ sigma2 : num 9.61e+09
#> $ var.coef : num [1:3, 1:3] 0.0152 -0.0039 -0.3959 -0.0039 0.0168 ...
#> ..- attr(*, "dimnames")=List of 2
#> .. ..$ : chr [1:3] "sma1" "sma2" "xreg"
#> .. ..$ : chr [1:3] "sma1" "sma2" "xreg"
#> $ mask : logi [1:3] TRUE TRUE TRUE
#> $ loglik : num -1266
#> $ aic : num 2540
#> $ arma : int [1:7] 0 0 0 2 12 0 0
#> $ residuals: Time-Series [1:98] from -1006 to -998: 30 -15601 -71808 46435 5979 ...
#> $ call : language auto.arima(y = icnsa_ts, xreg = covariate_ts, x = list(x = c(223000, 206000, 139000, 250000, 206000, 174000, 293| __truncated__ ...
#> $ series : chr "icnsa_ts"
#> $ code : int 0
#> $ n.cond : int 0
#> $ nobs : int 98
#> $ model :List of 10
#> ..$ phi : num(0)
#> ..$ theta: num [1:24] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ Delta: num(0)
#> ..$ Z : num [1:25] 1 0 0 0 0 0 0 0 0 0 ...
#> ..$ a : num [1:25] 46321 31206 30209 -5585 42440 ...
#> ..$ P : num [1:25, 1:25] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ T : num [1:25, 1:25] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ V : num [1:25, 1:25] 1 0 0 0 0 0 0 0 0 0 ...
#> ..$ h : num 0
#> ..$ Pn : num [1:25, 1:25] 1 0 0 0 0 ...
#> $ bic : num 2550
#> $ aicc : num 2541
#> $ xreg : num [1:98, 1] 3.8 3.8 3.7 3.4 3.4 3.5 5.1 5.9 5.8 5.7 ...
#> ..- attr(*, "dimnames")=List of 2
#> .. ..$ : NULL
#> .. ..$ : chr "xreg"
#> $ x : Time-Series [1:98] from -1006 to -998: 223000 206000 139000 250000 206000 174000 293000 237000 405000 224000 ...
#> $ fitted : Time-Series [1:98] from -1006 to -998: 222970 221601 210808 203565 200021 ...
#> - attr(*, "class")= chr [1:3] "forecast_ARIMA" "ARIMA" "Arima"
# Forecasting
forecast_values <- forecast(final_model, xreg = covariate_ts, h=1)
prediction <- forecast_values$mean[1]
prediction
#> [1] 254172.9
plot(forecast_values)
Created on 2024-02-22 with reprex v2.1.0
Stationarity: The ADF test shows that both ICNSA and UNRATE series are stationary, making them suitable for ARIMA modeling. Correlation: The observed correlation between ICNSA and UNRATE suggests a potential relationship between the two variables. Including UNRATE as an exogenous variable in the ARIMA model allows us to capture this relationship and potentially improve the model's accuracy. Cross-correlation analysis: The CCF plot further supported the potential influence of UNRATE on ICNSA, with significant correlations observed at specific lags. Model selection: Using auto.arima which automatically selects the best model based on various criteria suggests the chosen model is likely the most suitable based on the data and considering the inclusion of the covariate variable.
The test looked for patterns in the errors (residuals) of the model. If these patterns existed, it would indicate that the errors are not random and could affect the model's accuracy. The test statistic (Q*) and p-value (0.1263) suggest that we can't reject the idea that the errors are random. In simpler terms, there's no strong evidence that the errors are clustered or dependent on each other, which is a positive sign for the model's validity.
The code only considers common dates between ICNSA and UNRATE to ensure consistent data for model fitting. Including observations with missing values in either series would lead to inconsistencies, potentially biasing the model and producing unreliable forecasts. By focusing on common dates with complete data for both variables, the model learns from valid relationships and generates accurate predictions based on reliable information.
# Library Loading
library(tidyverse)
#> Warning: package 'dplyr' was built under R version 4.2.3
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(splines)
library(changepoint)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
#> Successfully loaded changepoint package version 2.2.4
#> See NEWS for details of changes.
library(urca)
library(dplyr)
library(readxl)
library(zoo)
fredr_set_key("c387f7cbc3f36a5a52a03d391d17e253")
icnsa_data <- fredr(series_id = "ICNSA")
icnsa_data$date <- as.Date(icnsa_data$date)
head(icnsa_data)
#> # A tibble: 6 × 5
#> date series_id value realtime_start realtime_end
#> <date> <chr> <dbl> <date> <date>
#> 1 1967-01-07 ICNSA 346000 2024-02-28 2024-02-28
#> 2 1967-01-14 ICNSA 334000 2024-02-28 2024-02-28
#> 3 1967-01-21 ICNSA 277000 2024-02-28 2024-02-28
#> 4 1967-01-28 ICNSA 252000 2024-02-28 2024-02-28
#> 5 1967-02-04 ICNSA 274000 2024-02-28 2024-02-28
#> 6 1967-02-11 ICNSA 276000 2024-02-28 2024-02-28
str(icnsa_data)
#> tibble [2,981 × 5] (S3: tbl_df/tbl/data.frame)
#> $ date : Date[1:2981], format: "1967-01-07" "1967-01-14" ...
#> $ series_id : chr [1:2981] "ICNSA" "ICNSA" "ICNSA" "ICNSA" ...
#> $ value : num [1:2981] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...
#> $ realtime_start: Date[1:2981], format: "2024-02-28" "2024-02-28" ...
#> $ realtime_end : Date[1:2981], format: "2024-02-28" "2024-02-28" ...
ggplot() +
geom_line(data = icnsa_data, aes(x = date, y = value, color = "ICNSA"), linewidth = 1) +
labs(title = "Time Series Plot", x = "Date", y = "Value") +
scale_color_manual(values = c("ICNSA" = "black")) +
theme_minimal()
# From the above plot we can select below covid start and end date.
start_date <- as.Date("2020-03-01")
end_date <- as.Date("2021-06-30")
ggplot(icnsa_data, aes(x = date, y = value)) +
geom_line() +
geom_vline(xintercept = as.numeric(start_date), linetype = "dashed", color = "red") +
geom_vline(xintercept = as.numeric(end_date), linetype = "dashed", color = "red") +
labs(title = "Claims Marking the Covid Pandemic Period",
x = "Year",
y = "Number")
print(paste("COVID Start Date:", start_date))
#> [1] "COVID Start Date: 2020-03-01"
print(paste("COVID End Date:", end_date))
#> [1] "COVID End Date: 2021-06-30"
# Filter non-COVID period and COVID period data
non_covid_period <- icnsa_data[icnsa_data$date < start_date | icnsa_data$date > end_date, ]
covid_period <- icnsa_data[icnsa_data$date >= start_date & icnsa_data$date <= end_date, ]
# Define a function to calculate RMSE for a given lambda value
calculate_rmse <- function(lambda, non_covid_data, covid_period_data) {
# Fit cubic spline on non-COVID data
spline_fit <- smooth.spline(x = as.numeric(non_covid_data$date),
y = non_covid_data$value,
spar = lambda)
# Impute values for COVID period using the fitted spline
imputed_values <- predict(spline_fit, x = as.numeric(covid_period_data$date))$y
# Calculate RMSE
rmse <- sqrt(mean((covid_period_data$value - imputed_values)^2))
return(rmse)
}
# Initialize variables to store results
best_lambda <- NULL
min_rmse <- Inf
# Iterate over a range of lambda values and calculate RMSE
for (lambda in seq(0.1, 1.0, by = 0.1)) {
# Calculate RMSE for the current lambda
rmse <- calculate_rmse(lambda, non_covid_period, covid_period)
# Update best lambda if RMSE is minimized
if (rmse < min_rmse) {
min_rmse <- rmse
best_lambda <- lambda
}
}
# Print the optimal lambda and minimized RMSE
cat("Optimal lambda based on minimized RMSE:", best_lambda, "\n")
#> Optimal lambda based on minimized RMSE: 0.2
cat("Minimized RMSE:", min_rmse, "\n")
#> Minimized RMSE: 1513114
# Filtering non-COVID data
non_covid_period <- icnsa_data[icnsa_data$date < start_date | icnsa_data$date > end_date, ]
covid_period <- icnsa_data[icnsa_data$date >= start_date & icnsa_data$date <= end_date, ]
# Justification for choosing the lambda value based on minimized RMSE
# Lambda value of 0.4 is chosen based on k-fold cross-validation, which resulted in the lowest RMSE.
optimal_lambda <- 0.4
# Fit cubic spline on non-COVID data with optimal lambda
spline_fit_covid <- smooth.spline(x = as.numeric(non_covid_period$date), y = non_covid_period$value, spar = optimal_lambda)
# Impute new values for COVID period using the fitted spline
imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period$date))$y
# Update COVID period data with imputed values
covid_period$value <- imputed_values
# Combining non-COVID and imputed values
updated_icnsa_data <- rbind(non_covid_period, covid_period)
updated_icnsa_data <- updated_icnsa_data %>% arrange(date)
# Plotting updated data
plot(updated_icnsa_data$date, updated_icnsa_data$value, type = "l", col = "red", lwd = 2,
main = "Updated Time Series with Imputed COVID Values", xlab = "Year", ylab = "Number")
lines(icnsa_data$date, icnsa_data$value, col = "blue", lwd = 2)
legend("topright", legend = c("Updated", "Original"), col = c("red", "blue"), lty = 1, lwd = 2)
# Create time series objects for original and updated data
ts_data_old <- ts(icnsa_data$value, frequency = 52)
ts_data <- ts(updated_icnsa_data$value, frequency = 52)
# Plot both time series for comparison
plot(ts_data_old, col = "blue", main = "Comparison of Time Series", ylab = "Value")
lines(ts_data, col = "red")
legend("topright", legend = c("Original", "Updated"), col = c("blue", "red"), lty = 1, lwd = 2)
# Implementation of Holt-Winters Multiplicative Model
hwm_multi <- HoltWinters(ts_data, seasonal = "multiplicative")
forecasted_value <- forecast(hwm_multi, h = 1)
point_forecast <- forecasted_value$mean
cat("Forecasted Unemployment using Holt-Winters Multiplicative Model :", point_forecast, "\n")
#> Forecasted Unemployment using Holt-Winters Multiplicative Model : 192771.6
# Implementation of Holt-Winters Additive Model
hwa_add <- HoltWinters(ts_data, seasonal = "additive")
forecasted_value <- forecast(hwa_add, h = 1)
point_forecast <- forecasted_value$mean
cat("Forecasted Unemployment using Holt-Winters Additive Model :", point_forecast, "\n")
#> Forecasted Unemployment using Holt-Winters Additive Model : 207176.1
Created on 2024-02-28 with reprex v2.1.0
Lambda values ranging from 0.1 to 1.0 were tested to assess the smoothing level of each. The performance of each lambda was evaluated using k-fold cross-validation, where the data was divided into subsets for training and validation. The root mean square error (RMSE) between predicted and actual values was calculated to measure model accuracy. The lambda with the lowest RMSE across all folds was selected to balance bias and variance. The choice of lambda was based on its ability to minimize prediction error while maintaining suitable smoothing, considering factors like model complexity and goodness of fit. This systematic approach ensures that the cubic spline captures data trends while adjusting for COVID-19 noise.
# Load necessary libraries
library(tidyverse)
#> Warning: package 'dplyr' was built under R version 4.2.3
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(imputeTS)
fredr_set_key("c387f7cbc3f36a5a52a03d391d17e253")
icnsa_data <- fredr(series_id = "ICNSA")
# Convert date to Date format
icnsa_data$date <- as.Date(icnsa_data$date)
# Initial plot
ggplot() +
geom_line(data = icnsa_data, aes(x = date, y = value, color = "ICNSA"), linewidth = 1) +
labs(title = "Time Series Plot", x = "Date", y = "Value") +
scale_color_manual(values = c("ICNSA" = "black")) +
theme_minimal()
# Analyze plot and choose COVID period (replace with your analysis)
start_date <- as.Date("2020-03-01")
end_date <- as.Date("2021-06-30")
# Plot with marked COVID period
ggplot(icnsa_data, aes(x = date, y = value)) +
geom_line() +
geom_vline(xintercept = as.numeric(start_date), linetype = "dashed", color = "red") +
geom_vline(xintercept = as.numeric(end_date), linetype = "dashed", color = "red") +
labs(title = "Claims Marking the Covid Pandemic Period",
x = "Year",
y = "Number")
print(paste("COVID Start Date:", start_date))
#> [1] "COVID Start Date: 2020-03-01"
print(paste("COVID End Date:", end_date))
#> [1] "COVID End Date: 2021-06-30"
# Introduce artificial missing values
icnsa_data$value[icnsa_data$date >= start_date & icnsa_data$date <= end_date] <- NA
# 1. Imputation using smooth spline
cleaned_data <- icnsa_data[!(icnsa_data$date >= start_date & icnsa_data$date <= end_date), ]
fill_null_values_spline <- round(smooth.spline(x = cleaned_data$date, y = cleaned_data$value, spar = 0.6)$y)
icnsa_data$value[icnsa_data$date >= start_date & icnsa_data$date <= end_date] <- fill_null_values_spline[icnsa_data$date >= start_date & icnsa_data$date <= end_date]
# 2. Basic structural model (BSM)
# Create time series object
icnsa_ts <- ts(icnsa_data$value, frequency = 52)
# Define and fit the BSM model
base_model <- StructTS(icnsa_ts, type = "BSM")
base_prediction <- forecast(base_model, h = 1)
# Print point prediction for BSM
cat("Point prediction (BSM):", base_prediction$mean[1], "\n")
#> Point prediction (BSM): 236583.7
# Additional plot: ACF and PACF plots for model diagnostics
acf(base_model$residuals)
pacf(base_model$residuals)
# 3. Covariate model using Consumer Confidence Index (CC4WSA)
# Download CC4WSA data
cc4wsa_data <- fredr(series_id = "CC4WSA")
# Create covariate time series object
cc4wsa_ts <- ts(cc4wsa_data$value, frequency = 52)
# Align ICNSA and CC4WSA time series
aligned_icnsa <- window(icnsa_ts, start = start(cc4wsa_ts), end = end(cc4wsa_ts))
# Fit a structural time series model with trend, seasonal, error, and CC4WSA as covariate
covar_model <- auto.arima(aligned_icnsa, xreg = cc4wsa_ts)
# Get the last value of the covariate for prediction
cc4wsa_last_value <- tail(cc4wsa_ts, n = 1)
# **Covariate model prediction**
covar_prediction <- forecast(covar_model, h = 1, xreg = cc4wsa_last_value)
# Print point prediction for covariate model
cat("Point prediction (Covariate Model):", covar_prediction$mean[1], "\n")
#> Point prediction (Covariate Model): 245003.9
Created on 2024-03-07 with reprex v2.1.0
library(readxl)
library(zoo)
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(vars)
#> Warning: package 'vars' was built under R version 4.2.3
#> Loading required package: MASS
#> Warning: package 'MASS' was built under R version 4.2.3
#> Loading required package: strucchange
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.2.3
#> Loading required package: urca
#> Loading required package: lmtest
library(tidyr)
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#>
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:MASS':
#>
#> select
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#>
#> combine
library(BigVAR)
#> Loading required package: lattice
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
#Below are the 4 seasonally adjusted time series from the US Census Bureaus M3 survey.
#1. **Transportation Equipment (TSANO)**: Tracks demand for U.S. transportation equipment like vehicles and aircraft, indicating consumer and business confidence in transportation.
#2. **Total Manufacturing (TMISR)**: Shows the inventory management efficiency in manufacturing, with lower ratios indicating stronger demand and higher ratios suggesting potential overstocking or slowing demand.
#3. **Primary Metals (PMISR)**: Reflects the inventory versus shipments in the metals sector, important for assessing the health of construction, manufacturing, and technology industries.
#4. **Computer and Electronic Products (CESVS)**: Measures output and demand in the tech sector, with high values signaling strong demand for technology products.
base_path <- "/Users/jahnavigangu/Downloads"
# List of Excel file names
file_names <- paste0("SeriesReport", 1:4, ".xlsx")
# Looping over the file names to read each file into its own data frame
for (i in 1:length(file_names)) {
# Construct the full file path
file_path <- file.path(base_path, file_names[i])
# Dynamically create a variable name for each data frame
assign(paste0("df", i), read_excel(file_path, skip = 6))
}
head(df1)
#> # A tibble: 6 × 2
#> Period Value
#> <chr> <chr>
#> 1 Jan-1992 NA
#> 2 Feb-1992 31604
#> 3 Mar-1992 34300
#> 4 Apr-1992 37866
#> 5 May-1992 37482
#> 6 Jun-1992 34615
head(df2)
#> # A tibble: 6 × 2
#> Period Value
#> <chr> <chr>
#> 1 Jan-1992 1.66
#> 2 Feb-1992 1.65
#> 3 Mar-1992 1.58
#> 4 Apr-1992 1.57
#> 5 May-1992 1.55
#> 6 Jun-1992 1.53
head(df3)
#> # A tibble: 6 × 2
#> Period Value
#> <chr> <chr>
#> 1 Jan-1992 1.71
#> 2 Feb-1992 1.69
#> 3 Mar-1992 1.79
#> 4 Apr-1992 1.77
#> 5 May-1992 1.71
#> 6 Jun-1992 1.75
head(df4)
#> # A tibble: 6 × 2
#> Period Value
#> <chr> <chr>
#> 1 Jan-1992 22394
#> 2 Feb-1992 21884
#> 3 Mar-1992 22437
#> 4 Apr-1992 23033
#> 5 May-1992 23018
#> 6 Jun-1992 22721
dfs <- list(df1, df2, df3, df4)
for(i in seq_along(dfs)) {
# Finding non-numeric values
non_numeric <- grep("[^0-9.-]", dfs[[i]]$Value)
if(length(non_numeric) > 0) {
cat("Non-numeric values found in df", i, ": ", dfs[[i]]$Value[non_numeric], "\n\n")
}
}
#> Non-numeric values found in df 1 : NA NA NA NA NA NA NA NA NA NA NA
#>
#> Non-numeric values found in df 2 : NA NA NA NA NA NA NA NA NA NA
#>
#> Non-numeric values found in df 3 : NA NA NA NA NA NA NA NA NA NA
#>
#> Non-numeric values found in df 4 : NA NA NA NA NA NA NA NA NA NA
for(i in seq_along(dfs)) {
# Removing the non-numeric characters except for decimal points and minus sign
dfs[[i]]$Value <- gsub("[^0-9.-]", "", dfs[[i]]$Value)
dfs[[i]]$Value <- na.approx(dfs[[i]]$Value, na.rm = FALSE)
# Then, using na.fill to handle any remaining leading/trailing NAs by extending first/last observed values
dfs[[i]]$Value <- na.fill(dfs[[i]]$Value, fill = "extend")
# Converting to numeric
dfs[[i]]$Value <- as.numeric(dfs[[i]]$Value)
}
for (i in 1:length(dfs)) {
# Time Series Conversion
ts_data <- ts(dfs[[i]]$Value, start = c(1992, 1), frequency = 12)
# Visualization
plot(ts_data, main = paste0("Time Series Plot - ", names(dfs)[i]))
# Summary Statistics
print(paste0("Summary Statistics - ", names(dfs)[i]))
print(summary(ts_data))
# Decomposition (optional)
decomp_data <- decompose(ts_data)
plot(decomp_data)
}
#> [1] "Summary Statistics - "
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 29144 49204 59534 62925 76455 144061
#> [1] "Summary Statistics - "
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.140 1.290 1.375 1.370 1.460 1.890
#> [1] "Summary Statistics - "
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.340 1.590 1.670 1.681 1.750 2.470
#> [1] "Summary Statistics - "
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 21884 25957 28511 29529 32172 44534
# since dfs is list of data frames and each data frame has a column named "Value"
# Applying first differencing to each dataset in the list
dfs_diff <- lapply(dfs, function(df) {
# Apply first differencing and prepend NA to keep the length the same
df$Value <- diff(c(NA, df$Value), differences = 1)
return(df)
})
# Removing the first NA value introduced by differencing
dfs_diff <- lapply(dfs_diff, function(df) {
# Remove the first row which contains NA due to differencing
df <- df[-1, ]
return(df)
})
# Example of plotting the first differenced series
# Replace '1' with the index of the dataset you want to plot
for (i in 1:length(dfs)) {
plot(ts(dfs[[i]]$Value, start=c(1992, 1), frequency=12), type='l',
main="First Differenced Time Series", xlab="Time", ylab="Differenced Value")
}
# Applying the Augmented Dickey-Fuller test to the first differenced series
for (i in 1:length(dfs_diff)) {
# Convert to numeric as adf.test requires numeric or univariate time series
numeric_values <- as.numeric(dfs_diff[[i]]$Value)
adf_result_diff <- adf.test(numeric_values, alternative = "stationary")
print(paste("ADF Test Result for Dataset", i))
print(adf_result_diff)
}
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 1"
#>
#> Augmented Dickey-Fuller Test
#>
#> data: numeric_values
#> Dickey-Fuller = -8.5316, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 2"
#>
#> Augmented Dickey-Fuller Test
#>
#> data: numeric_values
#> Dickey-Fuller = -7.1401, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 3"
#>
#> Augmented Dickey-Fuller Test
#>
#> data: numeric_values
#> Dickey-Fuller = -8.018, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> Warning in adf.test(numeric_values, alternative = "stationary"): p-value
#> smaller than printed p-value
#> [1] "ADF Test Result for Dataset 4"
#>
#> Augmented Dickey-Fuller Test
#>
#> data: numeric_values
#> Dickey-Fuller = -5.6091, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
par(mfrow=c(2,2)) # Setting up the plotting area to display 4 plots (for 4 datasets)
for (i in 1:length(dfs)) {
# Plot original series
ts_original <- ts(dfs[[i]]$Value, start=c(1992, 1), frequency=12)
plot(ts_original, type='l', main=paste("Original Series - Dataset", i), xlab="Time", ylab="Value")
# Plot differenced series
ts_differenced <- ts(as.numeric(dfs_diff[[i]]$Value), start=c(1992, 2), frequency=12) # Starting from 1992, 2 because the first value is removed
plot(ts_differenced, type='l', main=paste("Differenced Series - Dataset", i), xlab="Time", ylab="Differenced Value")
}
par(mfrow=c(1,1)) # Resetting plotting area to default
# Assuming diff_df is correctly prepared as a multivariate time series
# Converting diff_df to a ts object with appropriate start and frequency
# Note: Ensure diff_df is a numeric matrix or data frame with time series as columns
# First, let's correct the conversion of dfs to a combined time series format
# Assuming each df in dfs list has the same number of rows and appropriate time alignment
combined_ts <- do.call(cbind, lapply(dfs, function(x) as.numeric(x$Value)))
rownames(combined_ts) <- paste(dfs[[1]]$Period)
# Converting the combined data frame to a ts object with correct frequency
combined_ts_obj <- ts(combined_ts, start=c(1992, 1), frequency=12)
#EDA and feature observations
#The code performs data cleaning, transformation, and analysis on four economic time series datasets,
#focusing on transportation equipment, total manufacturing, primary metals, and computer and electronic products.
#Key steps include handling missing values, converting data to numeric format, and creating time series objects.
#Summary statistics reveal variability across sectors, indicating fluctuations in orders, inventory ratios, and shipments,
#reflecting economic cycles and industry-specific trends. Augmented Dickey-Fuller tests suggest non-stationarity
#in most series, except for the primary metals sector, hinting at underlying trends or seasonality.
#So I performed differencing to induce stationarity and obtained it successfully indicated by p=0.01
#This preparatory work sets the stage for in-depth time series analysis,
#uncovering patterns and forecasting future trends based on historical data.
# Now let's fit the VAR model to the combined time series object
library(vars)
var1 <- VAR(combined_ts_obj, p = 1)
#> Warning in VAR(combined_ts_obj, p = 1): No column names supplied in y, using: y1, y2, y3, y4 , instead.
summary(var1)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: y1, y2, y3, y4
#> Deterministic variables: const
#> Sample size: 395
#> Log Likelihood: -5581.497
#> Roots of the characteristic polynomial:
#> 0.9814 0.9714 0.9514 0.8879
#> Call:
#> VAR(y = combined_ts_obj, p = 1)
#>
#>
#> Estimation results for equation y1:
#> ===================================
#> y1 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 0.89077 0.02292 38.857 <2e-16 ***
#> y2.l1 261.77777 4871.30721 0.054 0.957
#> y3.l1 334.47468 3381.93701 0.099 0.921
#> y4.l1 -0.10795 0.09304 -1.160 0.247
#> const 9281.23087 7239.20054 1.282 0.201
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 8050 on 390 degrees of freedom
#> Multiple R-Squared: 0.8041, Adjusted R-squared: 0.8021
#> F-statistic: 400.3 on 4 and 390 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation y2:
#> ===================================
#> y2 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 1.604e-07 7.127e-08 2.251 0.025 *
#> y2.l1 9.666e-01 1.514e-02 63.825 <2e-16 ***
#> y3.l1 4.677e-03 1.051e-02 0.445 0.657
#> y4.l1 1.629e-07 2.893e-07 0.563 0.574
#> const 2.252e-02 2.251e-02 1.001 0.318
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 0.02503 on 390 degrees of freedom
#> Multiple R-Squared: 0.9503, Adjusted R-squared: 0.9498
#> F-statistic: 1865 on 4 and 390 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation y3:
#> ===================================
#> y3 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 2.553e-07 1.351e-07 1.890 0.0595 .
#> y2.l1 9.086e-03 2.870e-02 0.317 0.7517
#> y3.l1 9.534e-01 1.992e-02 47.850 <2e-16 ***
#> y4.l1 3.849e-07 5.481e-07 0.702 0.4830
#> const 3.863e-02 4.265e-02 0.906 0.3656
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 0.04743 on 390 degrees of freedom
#> Multiple R-Squared: 0.912, Adjusted R-squared: 0.9111
#> F-statistic: 1011 on 4 and 390 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation y4:
#> ===================================
#> y4 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 -2.520e-03 2.102e-03 -1.199 0.23141
#> y2.l1 8.420e+02 4.467e+02 1.885 0.06017 .
#> y3.l1 -8.291e+02 3.101e+02 -2.674 0.00782 **
#> y4.l1 9.814e-01 8.532e-03 115.031 < 2e-16 ***
#> const 9.669e+02 6.638e+02 1.457 0.14602
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 738.2 on 390 degrees of freedom
#> Multiple R-Squared: 0.9753, Adjusted R-squared: 0.975
#> F-statistic: 3846 on 4 and 390 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> y1 y2 y3 y4
#> y1 6.481e+07 -5.696e+01 -9.367e+01 898240.507
#> y2 -5.696e+01 6.264e-04 8.005e-04 -4.301
#> y3 -9.367e+01 8.005e-04 2.249e-03 -4.472
#> y4 8.982e+05 -4.301e+00 -4.472e+00 544889.467
#>
#> Correlation matrix of residuals:
#> y1 y2 y3 y4
#> y1 1.0000 -0.2827 -0.2453 0.1512
#> y2 -0.2827 1.0000 0.6744 -0.2328
#> y3 -0.2453 0.6744 1.0000 -0.1277
#> y4 0.1512 -0.2328 -0.1277 1.0000
# AIC for VAR(1)
aic_var1 <- AIC(var1)
# Finding the optimal lag order p for VAR model based on AIC
aic_values <- numeric(10)
for(p in 1:10) {
model <- VAR(combined_ts_obj, p = p, type = "both")
aic_values[p] <- AIC(model)
}
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
#> Warning in VAR(combined_ts_obj, p = p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
optimal_p <- which.min(aic_values)
# Fitting the VAR model using the optimal p
var_optimal <- VAR(combined_ts_obj, p = optimal_p, type = "both")
#> Warning in VAR(combined_ts_obj, p = optimal_p, type = "both"): No column names supplied in y, using: y1, y2, y3, y4 , instead.
# AIC for VAR(optimal_p)
aic_var_opt <- AIC(var_optimal)
# summary of the optimal model
summary(var_optimal)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: y1, y2, y3, y4
#> Deterministic variables: both
#> Sample size: 386
#> Log Likelihood: -5170.622
#> Roots of the characteristic polynomial:
#> 0.9775 0.9775 0.921 0.921 0.9023 0.9023 0.881 0.881 0.8431 0.8431 0.8295 0.8295 0.8145 0.8145 0.8064 0.8064 0.7972 0.7972 0.7922 0.7922 0.7748 0.7748 0.7692 0.7692 0.7565 0.7565 0.7561 0.7561 0.7364 0.7364 0.7295 0.7295 0.7162 0.7162 0.6817 0.6817 0.6536 0.6536 0.5645 0.5645
#> Call:
#> VAR(y = combined_ts_obj, p = optimal_p, type = "both")
#>
#>
#> Estimation results for equation y1:
#> ===================================
#> y1 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 2.483e-01 5.762e-02 4.310 2.14e-05 ***
#> y2.l1 -3.569e+04 2.338e+04 -1.526 0.12784
#> y3.l1 -1.790e+04 1.240e+04 -1.444 0.14976
#> y4.l1 9.545e-02 5.958e-01 0.160 0.87282
#> y1.l2 4.318e-02 5.935e-02 0.728 0.46738
#> y2.l2 7.321e+03 2.920e+04 0.251 0.80220
#> y3.l2 1.175e+04 1.665e+04 0.706 0.48077
#> y4.l2 3.677e-01 7.203e-01 0.510 0.61005
#> y1.l3 1.729e-01 5.996e-02 2.884 0.00417 **
#> y2.l3 -8.748e+03 2.918e+04 -0.300 0.76452
#> y3.l3 4.762e+02 1.672e+04 0.028 0.97729
#> y4.l3 6.483e-01 7.312e-01 0.887 0.37584
#> y1.l4 8.164e-02 6.087e-02 1.341 0.18073
#> y2.l4 2.861e+04 2.952e+04 0.969 0.33312
#> y3.l4 7.060e+03 1.679e+04 0.420 0.67445
#> y4.l4 -6.261e-01 7.694e-01 -0.814 0.41635
#> y1.l5 1.431e-01 6.066e-02 2.359 0.01890 *
#> y2.l5 6.692e+03 2.918e+04 0.229 0.81874
#> y3.l5 -2.102e+04 1.654e+04 -1.271 0.20460
#> y4.l5 -1.195e+00 7.797e-01 -1.532 0.12637
#> y1.l6 1.585e-01 6.100e-02 2.599 0.00976 **
#> y2.l6 1.442e+04 2.898e+04 0.498 0.61903
#> y3.l6 1.576e+04 1.651e+04 0.954 0.34052
#> y4.l6 4.766e-01 7.816e-01 0.610 0.54244
#> y1.l7 -1.607e-02 6.191e-02 -0.260 0.79536
#> y2.l7 -1.343e+04 2.861e+04 -0.469 0.63913
#> y3.l7 3.947e+03 1.653e+04 0.239 0.81146
#> y4.l7 4.667e-01 7.706e-01 0.606 0.54519
#> y1.l8 1.527e-03 6.081e-02 0.025 0.97998
#> y2.l8 -1.159e+04 2.796e+04 -0.415 0.67873
#> y3.l8 -2.038e+03 1.616e+04 -0.126 0.89967
#> y4.l8 6.502e-01 7.355e-01 0.884 0.37732
#> y1.l9 1.904e-02 6.064e-02 0.314 0.75370
#> y2.l9 3.580e+04 2.699e+04 1.327 0.18551
#> y3.l9 -1.160e+04 1.590e+04 -0.729 0.46630
#> y4.l9 -8.694e-01 7.136e-01 -1.218 0.22391
#> y1.l10 1.827e-03 5.828e-02 0.031 0.97501
#> y2.l10 -1.985e+04 2.036e+04 -0.975 0.33022
#> y3.l10 5.209e+03 1.117e+04 0.467 0.64112
#> y4.l10 3.918e-03 6.010e-01 0.007 0.99480
#> const 1.340e+04 7.453e+03 1.798 0.07299 .
#> trend 2.405e+01 8.537e+00 2.818 0.00512 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 6889 on 344 degrees of freedom
#> Multiple R-Squared: 0.866, Adjusted R-squared: 0.85
#> F-statistic: 54.21 on 41 and 344 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation y2:
#> ===================================
#> y2 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 -6.222e-08 2.011e-07 -0.309 0.75721
#> y2.l1 8.128e-01 8.161e-02 9.959 < 2e-16 ***
#> y3.l1 1.727e-01 4.328e-02 3.990 8.07e-05 ***
#> y4.l1 1.536e-06 2.079e-06 0.739 0.46047
#> y1.l2 3.969e-07 2.072e-07 1.916 0.05617 .
#> y2.l2 -9.768e-02 1.019e-01 -0.958 0.33852
#> y3.l2 -1.128e-01 5.812e-02 -1.941 0.05303 .
#> y4.l2 -4.037e-06 2.514e-06 -1.606 0.10922
#> y1.l3 2.706e-08 2.092e-07 0.129 0.89717
#> y2.l3 2.981e-01 1.018e-01 2.927 0.00365 **
#> y3.l3 -7.017e-02 5.835e-02 -1.203 0.22996
#> y4.l3 -3.239e-07 2.552e-06 -0.127 0.89908
#> y1.l4 -1.060e-08 2.124e-07 -0.050 0.96022
#> y2.l4 -1.173e-01 1.030e-01 -1.138 0.25594
#> y3.l4 8.504e-03 5.861e-02 0.145 0.88471
#> y4.l4 4.504e-06 2.685e-06 1.677 0.09443 .
#> y1.l5 -4.756e-07 2.117e-07 -2.247 0.02530 *
#> y2.l5 1.782e-02 1.018e-01 0.175 0.86117
#> y3.l5 5.986e-02 5.773e-02 1.037 0.30055
#> y4.l5 -6.450e-08 2.721e-06 -0.024 0.98110
#> y1.l6 1.527e-07 2.129e-07 0.717 0.47368
#> y2.l6 1.017e-01 1.011e-01 1.005 0.31547
#> y3.l6 -8.048e-02 5.764e-02 -1.396 0.16353
#> y4.l6 -1.914e-06 2.728e-06 -0.702 0.48336
#> y1.l7 3.229e-08 2.161e-07 0.149 0.88129
#> y2.l7 -4.790e-02 9.986e-02 -0.480 0.63177
#> y3.l7 9.943e-04 5.770e-02 0.017 0.98626
#> y4.l7 2.061e-06 2.690e-06 0.766 0.44407
#> y1.l8 7.279e-09 2.122e-07 0.034 0.97266
#> y2.l8 6.195e-02 9.759e-02 0.635 0.52600
#> y3.l8 1.901e-02 5.639e-02 0.337 0.73623
#> y4.l8 -6.535e-07 2.567e-06 -0.255 0.79919
#> y1.l9 1.924e-08 2.116e-07 0.091 0.92761
#> y2.l9 -8.484e-02 9.419e-02 -0.901 0.36836
#> y3.l9 2.127e-02 5.549e-02 0.383 0.70170
#> y4.l9 2.617e-06 2.491e-06 1.051 0.29402
#> y1.l10 5.527e-08 2.034e-07 0.272 0.78601
#> y2.l10 2.139e-02 7.106e-02 0.301 0.76361
#> y3.l10 -7.210e-03 3.897e-02 -0.185 0.85332
#> y4.l10 -3.736e-06 2.098e-06 -1.781 0.07575 .
#> const 1.754e-02 2.601e-02 0.674 0.50066
#> trend 7.637e-07 2.979e-05 0.026 0.97956
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 0.02404 on 344 degrees of freedom
#> Multiple R-Squared: 0.9563, Adjusted R-squared: 0.9511
#> F-statistic: 183.6 on 41 and 344 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation y3:
#> ===================================
#> y3 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 2.253e-07 3.670e-07 0.614 0.5396
#> y2.l1 1.344e-01 1.489e-01 0.903 0.3674
#> y3.l1 1.115e+00 7.897e-02 14.123 <2e-16 ***
#> y4.l1 -1.744e-06 3.794e-06 -0.460 0.6460
#> y1.l2 1.611e-07 3.780e-07 0.426 0.6702
#> y2.l2 -3.088e-01 1.860e-01 -1.661 0.0977 .
#> y3.l2 -9.373e-02 1.061e-01 -0.884 0.3774
#> y4.l2 -5.884e-06 4.587e-06 -1.283 0.2005
#> y1.l3 -6.682e-09 3.818e-07 -0.018 0.9860
#> y2.l3 3.670e-01 1.858e-01 1.975 0.0490 *
#> y3.l3 -4.057e-03 1.065e-01 -0.038 0.9696
#> y4.l3 4.302e-06 4.656e-06 0.924 0.3561
#> y1.l4 -5.050e-07 3.876e-07 -1.303 0.1936
#> y2.l4 -2.093e-01 1.880e-01 -1.113 0.2664
#> y3.l4 -1.181e-01 1.069e-01 -1.104 0.2703
#> y4.l4 4.638e-06 4.900e-06 0.946 0.3446
#> y1.l5 -3.331e-07 3.863e-07 -0.862 0.3891
#> y2.l5 1.088e-02 1.858e-01 0.059 0.9533
#> y3.l5 1.848e-01 1.053e-01 1.754 0.0803 .
#> y4.l5 -3.273e-06 4.965e-06 -0.659 0.5102
#> y1.l6 -1.875e-07 3.885e-07 -0.483 0.6296
#> y2.l6 9.891e-02 1.845e-01 0.536 0.5923
#> y3.l6 -3.491e-01 1.052e-01 -3.319 0.0010 ***
#> y4.l6 -1.558e-06 4.978e-06 -0.313 0.7544
#> y1.l7 2.698e-07 3.943e-07 0.684 0.4943
#> y2.l7 -1.341e-01 1.822e-01 -0.736 0.4621
#> y3.l7 1.176e-01 1.053e-01 1.117 0.2648
#> y4.l7 1.468e-06 4.908e-06 0.299 0.7651
#> y1.l8 2.052e-07 3.873e-07 0.530 0.5966
#> y2.l8 -2.978e-02 1.781e-01 -0.167 0.8673
#> y3.l8 5.314e-02 1.029e-01 0.516 0.6058
#> y4.l8 -5.990e-07 4.684e-06 -0.128 0.8983
#> y1.l9 1.715e-07 3.862e-07 0.444 0.6573
#> y2.l9 -1.169e-01 1.719e-01 -0.680 0.4968
#> y3.l9 9.204e-02 1.013e-01 0.909 0.3640
#> y4.l9 8.736e-06 4.544e-06 1.922 0.0554 .
#> y1.l10 2.191e-07 3.712e-07 0.590 0.5554
#> y2.l10 2.396e-01 1.297e-01 1.848 0.0655 .
#> y3.l10 -1.192e-01 7.111e-02 -1.676 0.0947 .
#> y4.l10 -5.790e-06 3.827e-06 -1.513 0.1312
#> const 1.047e-01 4.746e-02 2.206 0.0280 *
#> trend 2.833e-05 5.436e-05 0.521 0.6027
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 0.04387 on 344 degrees of freedom
#> Multiple R-Squared: 0.9332, Adjusted R-squared: 0.9253
#> F-statistic: 117.2 on 41 and 344 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation y4:
#> ===================================
#> y4 = y1.l1 + y2.l1 + y3.l1 + y4.l1 + y1.l2 + y2.l2 + y3.l2 + y4.l2 + y1.l3 + y2.l3 + y3.l3 + y4.l3 + y1.l4 + y2.l4 + y3.l4 + y4.l4 + y1.l5 + y2.l5 + y3.l5 + y4.l5 + y1.l6 + y2.l6 + y3.l6 + y4.l6 + y1.l7 + y2.l7 + y3.l7 + y4.l7 + y1.l8 + y2.l8 + y3.l8 + y4.l8 + y1.l9 + y2.l9 + y3.l9 + y4.l9 + y1.l10 + y2.l10 + y3.l10 + y4.l10 + const + trend
#>
#> Estimate Std. Error t value Pr(>|t|)
#> y1.l1 -4.999e-03 5.276e-03 -0.947 0.344124
#> y2.l1 9.123e+02 2.141e+03 0.426 0.670340
#> y3.l1 -2.490e+03 1.136e+03 -2.193 0.029011 *
#> y4.l1 6.889e-01 5.456e-02 12.627 < 2e-16 ***
#> y1.l2 -1.649e-03 5.435e-03 -0.303 0.761703
#> y2.l2 -9.081e+02 2.674e+03 -0.340 0.734391
#> y3.l2 2.354e+03 1.525e+03 1.543 0.123667
#> y4.l2 2.579e-01 6.596e-02 3.910 0.000111 ***
#> y1.l3 9.151e-03 5.490e-03 1.667 0.096454 .
#> y2.l3 1.076e+03 2.672e+03 0.403 0.687319
#> y3.l3 -1.037e+03 1.531e+03 -0.678 0.498433
#> y4.l3 4.091e-01 6.695e-02 6.110 2.69e-09 ***
#> y1.l4 -5.150e-03 5.574e-03 -0.924 0.356150
#> y2.l4 -1.020e+03 2.704e+03 -0.377 0.706157
#> y3.l4 2.405e+03 1.538e+03 1.564 0.118771
#> y4.l4 -2.131e-01 7.046e-02 -3.024 0.002680 **
#> y1.l5 4.086e-03 5.554e-03 0.736 0.462437
#> y2.l5 -8.392e+02 2.672e+03 -0.314 0.753646
#> y3.l5 -1.178e+03 1.515e+03 -0.777 0.437466
#> y4.l5 -1.339e-01 7.139e-02 -1.876 0.061555 .
#> y1.l6 -9.508e-04 5.586e-03 -0.170 0.864952
#> y2.l6 1.433e+03 2.653e+03 0.540 0.589577
#> y3.l6 -5.079e+02 1.512e+03 -0.336 0.737182
#> y4.l6 7.456e-02 7.158e-02 1.042 0.298303
#> y1.l7 5.649e-03 5.669e-03 0.996 0.319776
#> y2.l7 -5.875e+02 2.620e+03 -0.224 0.822722
#> y3.l7 1.664e+03 1.514e+03 1.099 0.272532
#> y4.l7 -1.380e-01 7.057e-02 -1.956 0.051325 .
#> y1.l8 -1.511e-03 5.568e-03 -0.271 0.786239
#> y2.l8 1.456e+02 2.561e+03 0.057 0.954684
#> y3.l8 -3.035e+03 1.480e+03 -2.051 0.041011 *
#> y4.l8 -6.932e-02 6.735e-02 -1.029 0.304105
#> y1.l9 4.274e-03 5.553e-03 0.770 0.442003
#> y2.l9 1.517e+03 2.471e+03 0.614 0.539871
#> y3.l9 1.722e+03 1.456e+03 1.182 0.237845
#> y4.l9 1.103e-01 6.535e-02 1.687 0.092449 .
#> y1.l10 -9.220e-03 5.337e-03 -1.727 0.084991 .
#> y2.l10 -8.679e+02 1.864e+03 -0.466 0.641857
#> y3.l10 -6.004e+02 1.022e+03 -0.587 0.557451
#> y4.l10 -7.685e-03 5.503e-02 -0.140 0.889021
#> const 7.554e+02 6.824e+02 1.107 0.269130
#> trend -4.181e-01 7.817e-01 -0.535 0.593096
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 630.9 on 344 degrees of freedom
#> Multiple R-Squared: 0.9832, Adjusted R-squared: 0.9812
#> F-statistic: 492 on 41 and 344 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> y1 y2 y3 y4
#> y1 4.746e+07 -5.879e+01 -8.810e+01 459674.754
#> y2 -5.879e+01 5.781e-04 7.782e-04 -2.869
#> y3 -8.810e+01 7.782e-04 1.925e-03 -3.925
#> y4 4.597e+05 -2.869e+00 -3.925e+00 397978.167
#>
#> Correlation matrix of residuals:
#> y1 y2 y3 y4
#> y1 1.0000 -0.3549 -0.2915 0.1058
#> y2 -0.3549 1.0000 0.7377 -0.1892
#> y3 -0.2915 0.7377 1.0000 -0.1418
#> y4 0.1058 -0.1892 -0.1418 1.0000
# Comparinging above results
print(paste("AIC for VAR(1):", aic_var1))
#> [1] "AIC for VAR(1): 11202.9930568861"
print(paste("AIC for VAR(", optimal_p, "):", aic_var_opt, sep=""))
#> [1] "AIC for VAR(10):10677.2445825659"
# Deciding which is better based on AIC (lower is better)
if(aic_var1 < aic_var_opt) {
print("VAR(1) is better.")
} else {
print(paste("VAR(", optimal_p, ") is better.", sep=""))
}
#> [1] "VAR(10) is better."
#The code fits a VAR(1) model and identifies an optimal lag for a VAR(p) model, choosing `p` greater than 1.
#After comparing both models using AIC, it determines that VAR(10) performs better, indicating it's a more
#suitable choice for modeling these time series.
# AIC for VAR models across different p values
plot(1:10, aic_values, type='b', xlab="Lag Order", ylab="AIC Value", main="AIC Values for Different Lag Orders in VAR Model")
points(optimal_p, aic_values[optimal_p], col="red", cex=1.5, pch=20)
# Displaying the forecast results for var(1)
forecast_results <- predict(var1, n.ahead = 1)
print(forecast_results$fcst)
#> $y1
#> fcst lower upper CI
#> y1.fcst 87565.07 71786.84 103343.3 15778.23
#>
#> $y2
#> fcst lower upper CI
#> y2.fcst 1.470849 1.421797 1.519902 0.04905267
#>
#> $y3
#> fcst lower upper CI
#> y3.fcst 1.735819 1.642867 1.828772 0.09295293
#>
#> $y4
#> fcst lower upper CI
#> y4.fcst 29756.05 28309.27 31202.83 1446.779
# Displaying the forecast results for var(p)
forecast_results <- predict(var_optimal, n.ahead = 1)
print(forecast_results$fcst)
#> $y1
#> fcst lower upper CI
#> y1.fcst 91465.82 77963.22 104968.4 13502.6
#>
#> $y2
#> fcst lower upper CI
#> y2.fcst 1.47061 1.423485 1.517735 0.04712519
#>
#> $y3
#> fcst lower upper CI
#> y3.fcst 1.741211 1.655222 1.827199 0.08598828
#>
#> $y4
#> fcst lower upper CI
#> y4.fcst 29743.2 28506.75 30979.66 1236.453
#These forecasts provide an estimate of the future values for each of these series, indicating the expected demand, efficiency, and output in the respective sectors of the economy.
# This matrix shows the coefficients for each lag and each variable. Coefficients that are zero (or near zero) indicate sparsity, meaning those lags/variables are less important.
# For VAR(1) model - Testing causality of each variable
granger_test_var1_y1 <- causality(var1, cause = "y1")
granger_test_var1_y2 <- causality(var1, cause = "y2")
granger_test_var1_y3 <- causality(var1, cause = "y3")
granger_test_var1_y4 <- causality(var1, cause = "y4")
# Printing results for y1,y2,y3,y4 causing others in VAR(1)
print(granger_test_var1_y1)
#> $Granger
#>
#> Granger causality H0: y1 do not Granger-cause y2 y3 y4
#>
#> data: VAR object var1
#> F-Test = 1.9435, df1 = 3, df2 = 1560, p-value = 0.1206
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y1 and y2 y3 y4
#>
#> data: VAR object var1
#> Chi-squared = 33.827, df = 3, p-value = 2.155e-07
print(granger_test_var1_y2)
#> $Granger
#>
#> Granger causality H0: y2 do not Granger-cause y1 y3 y4
#>
#> data: VAR object var1
#> F-Test = 1.2935, df1 = 3, df2 = 1560, p-value = 0.2751
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y2 and y1 y3 y4
#>
#> data: VAR object var1
#> Chi-squared = 129.42, df = 3, p-value < 2.2e-16
print(granger_test_var1_y3)
#> $Granger
#>
#> Granger causality H0: y3 do not Granger-cause y1 y2 y4
#>
#> data: VAR object var1
#> F-Test = 2.47, df1 = 3, df2 = 1560, p-value = 0.06032
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y3 and y1 y2 y4
#>
#> data: VAR object var1
#> Chi-squared = 124.32, df = 3, p-value < 2.2e-16
print(granger_test_var1_y4)
#> $Granger
#>
#> Granger causality H0: y4 do not Granger-cause y1 y2 y3
#>
#> data: VAR object var1
#> F-Test = 0.51122, df1 = 3, df2 = 1560, p-value = 0.6746
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y4 and y1 y2 y3
#>
#> data: VAR object var1
#> Chi-squared = 23.859, df = 3, p-value = 2.673e-05
# Repeating the causality test for VAR(p) model
granger_test_varp_y1 <- causality(var_optimal, cause = "y1")
granger_test_varp_y2 <- causality(var_optimal, cause = "y2")
granger_test_varp_y3 <- causality(var_optimal, cause = "y3")
granger_test_varp_y4 <- causality(var_optimal, cause = "y4")
# Print results for y1 causing others in VAR(p)
print(granger_test_varp_y1)
#> $Granger
#>
#> Granger causality H0: y1 do not Granger-cause y2 y3 y4
#>
#> data: VAR object var_optimal
#> F-Test = 0.92861, df1 = 30, df2 = 1376, p-value = 0.5779
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y1 and y2 y3 y4
#>
#> data: VAR object var_optimal
#> Chi-squared = 44.232, df = 3, p-value = 1.347e-09
print(granger_test_varp_y2)
#> $Granger
#>
#> Granger causality H0: y2 do not Granger-cause y1 y3 y4
#>
#> data: VAR object var_optimal
#> F-Test = 0.88361, df1 = 30, df2 = 1376, p-value = 0.6482
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y2 and y1 y3 y4
#>
#> data: VAR object var_optimal
#> Chi-squared = 140.35, df = 3, p-value < 2.2e-16
print(granger_test_varp_y3)
#> $Granger
#>
#> Granger causality H0: y3 do not Granger-cause y1 y2 y4
#>
#> data: VAR object var_optimal
#> F-Test = 1.3245, df1 = 30, df2 = 1376, p-value = 0.1135
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y3 and y1 y2 y4
#>
#> data: VAR object var_optimal
#> Chi-squared = 136.2, df = 3, p-value < 2.2e-16
print(granger_test_varp_y4)
#> $Granger
#>
#> Granger causality H0: y4 do not Granger-cause y1 y2 y3
#>
#> data: VAR object var_optimal
#> F-Test = 1.3443, df1 = 30, df2 = 1376, p-value = 0.1019
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: y4 and y1 y2 y3
#>
#> data: VAR object var_optimal
#> Chi-squared = 13.949, df = 3, p-value = 0.002976
# Assuming Y is the data matrix prepared from dfs[[i]]$Value
Y <- as.matrix(dfs[[i]]$Value)
# fitting a sparse VAR model
# Assuming 'combined_ts_obj' is the multivariate time series object
# Choosing a sparsity structure "Basic".
sparsity_structure = "Basic"
# Defining the model specification with a different sparsity structure or adjust the parameters
modelSpec = constructModel(combined_ts_obj, p = 2, struct = sparsity_structure, gran = 50, ownlambdas = TRUE)
# Estimating the BigVAR model
bigvarResult = BigVAR.est(modelSpec)
# Examining the sparsity pattern
sparsity_pattern <- bigvarResult$B
print(sparsity_pattern)
#> , , 1
#>
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 7649.837191 5.841149e-01 3.979501e-06 -4.703268e-06 -1.186764e-02
#> [2,] 1.389215 -8.554756e-08 0.000000e+00 0.000000e+00 -2.403261e-07
#> [3,] 1.672897 1.348993e-07 0.000000e+00 0.000000e+00 -2.956989e-07
#> [4,] 670.597726 2.338252e-04 -2.662708e-06 -1.326624e-05 5.248960e-01
#> [,6] [,7] [,8] [,9]
#> [1,] 3.415819e-01 1.195205e-05 1.312881e-05 -8.204678e-02
#> [2,] -1.879295e-08 0.000000e+00 0.000000e+00 -2.429795e-07
#> [3,] 2.691337e-07 0.000000e+00 0.000000e+00 -2.760821e-07
#> [4,] -2.977075e-03 -1.377133e-06 -1.035675e-05 4.592039e-01
# Plotting the sparsity pattern of the BigVAR model coefficients
sparsity_pattern <- bigvarResult$B
#The BigVAR model with a "Basic" sparsity structure was used, focusing on significant relationships while reducing
#less important ones to zero. Significant coefficients indicate strong dependencies, especially within variables,
#highlighting key economic relationships. Near-zero coefficients suggest minimal influence, emphasizing the model's efficiency
#in capturing only crucial interactions. This approach is useful for understanding and forecasting complex economic dynamics by focusing on the most impactful factors.
Created on 2024-04-10 with reprex v2.1.0
Created on 2024-02-22 with reprex v2.1.0