Open jaskirat-singh-pahwa opened 4 months ago
Homework 1 - regARIMA on ICNSA data and IURNSA as a covariate
library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(reprex)
# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")
# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]
# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)
# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2980
head(icnsa_data)
#> # A tibble: 6 × 2
#> date value
#> <date> <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000
# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#> date value
#> Min. :1967-01-07 Min. : 133000
#> 1st Qu.:1981-04-16 1st Qu.: 265800
#> Median :1995-07-25 Median : 326919
#> Mean :1995-07-25 Mean : 365013
#> 3rd Qu.:2009-11-01 3rd Qu.: 406725
#> Max. :2024-02-10 Max. :6161268
cat("\n\n")
########### Another series ##############
# Reading related data to ICNSA to be used as a covariate - Insured Unemployment Rate
iurnsa_data <- fredr(series_id = "IURNSA")
# Taking date and value column from the dataset
iurnsa_data <- iurnsa_data[c("date", "value")]
# Converting `date` column from string to date
iurnsa_data$date <- as.Date(iurnsa_data$date)
# Checking count of rows in the dataset
cat("Total number of rows in the IURNSA dataset:", nrow(iurnsa_data), "\n")
#> Total number of rows in the IURNSA dataset: 2771
head(iurnsa_data)
#> # A tibble: 6 × 2
#> date value
#> <date> <dbl>
#> 1 1971-01-02 5.2
#> 2 1971-01-09 5.3
#> 3 1971-01-16 5.3
#> 4 1971-01-23 5.2
#> 5 1971-01-30 5.2
#> 6 1971-02-06 5.2
# Basic summary statistics
cat("Summary of value column in IURNSA dataset:", "\n")
#> Summary of value column in IURNSA dataset:
summary(iurnsa_data)
#> date value
#> Min. :1971-01-02 Min. : 0.800
#> 1st Qu.:1984-04-10 1st Qu.: 1.900
#> Median :1997-07-19 Median : 2.500
#> Mean :1997-07-19 Mean : 2.759
#> 3rd Qu.:2010-10-26 3rd Qu.: 3.300
#> Max. :2024-02-03 Max. :15.800
# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
geom_line() +
labs(title = "Time Series Plot of ICNSA Claims Data",
x = "Date (Year)",
y = "value")
# Plotting IURNSA Time-Series data
ggplot(iurnsa_data, aes(x = date, y = value)) +
geom_line() +
labs(title = "Time Series Plot of IURNSA Claims Data",
x = "Date (Year)",
y = "value")
# Calculating the correlation coefficient between ICNSA and IURNSA
# To find the correlation, we need to match the dimesions of both the series i.e., the number of rows should be same
# Since, both the series are weekly and IURSA is from 1971-01-02 to 2024-02-03, we need to filter ICNSA from the same date
filtered_icnsa_data <- subset(icnsa_data, date >= "1971-01-02" & date <= "2024-02-03")
# Also, removing the data for COVID period
filtered_icnsa_data <- subset(filtered_icnsa_data, !(year(date) == 2020 | (year(date) == 2021 & month(date) < 7)))
filtered_iurnsa_data <- subset(iurnsa_data, !(year(date) == 2020 | (year(date) == 2021 & month(date) < 7)))
cat("Number of rows in ICNSA after filteration: ", nrow(filtered_icnsa_data), "\n")
#> Number of rows in ICNSA after filteration: 2693
cat("Number of rows in IURNSA after filteration: ", nrow(filtered_iurnsa_data), "\n")
#> Number of rows in IURNSA after filteration: 2693
icnsa_iursa_corr <- cor(filtered_icnsa_data$value, filtered_iurnsa_data$value)
cat("Correlation between ICNSA and IURNSA data: ", icnsa_iursa_corr, "\n")
#> Correlation between ICNSA and IURNSA data: 0.6490541
# Performing time series decomposition for ICNSA data
filtered_icnsa_ts <- ts(filtered_icnsa_data$value, frequency=52)
filtered_icnsa_decomp <- decompose(filtered_icnsa_ts, "multiplicative")
plot(filtered_icnsa_decomp)
# Performing time series decomposition for IURNSA data
fiiltered_iurnsa_ts <- ts(filtered_iurnsa_data$value, frequency=52)
filtered_iurnsa_decomp <- decompose(fiiltered_iurnsa_ts, "multiplicative")
plot(filtered_iurnsa_decomp)
# Plotting ACF and PACF for filtered ICNSA and IURNSA data
acf(filtered_icnsa_data$value)
pacf(filtered_icnsa_data$value)
acf(filtered_iurnsa_data$value)
pacf(filtered_iurnsa_data$value)
# Cross-correlation analysis
cross_correlation <- ccf(filtered_icnsa_data$value, filtered_iurnsa_data$value)
print("Cross-correlation between filtered ICNSA and IURNSA data:")
#> [1] "Cross-correlation between filtered ICNSA and IURNSA data:"
print(cross_correlation)
#>
#> Autocorrelations of series 'X', by lag
#>
#> -31 -30 -29 -28 -27 -26 -25 -24 -23 -22 -21 -20 -19
#> 0.397 0.406 0.417 0.424 0.432 0.435 0.433 0.422 0.420 0.424 0.427 0.434 0.441
#> -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6
#> 0.456 0.472 0.492 0.515 0.539 0.564 0.586 0.606 0.630 0.651 0.669 0.683 0.690
#> -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
#> 0.698 0.699 0.693 0.681 0.674 0.649 0.627 0.573 0.530 0.501 0.470 0.440 0.405
#> 8 9 10 11 12 13 14 15 16 17 18 19 20
#> 0.381 0.362 0.346 0.332 0.326 0.321 0.317 0.316 0.318 0.322 0.327 0.332 0.337
#> 21 22 23 24 25 26 27 28 29 30 31
#> 0.343 0.344 0.346 0.345 0.344 0.339 0.330 0.310 0.297 0.291 0.285
# Fit an ARIMA model to ICNSA data
arima_model <- auto.arima(filtered_icnsa_data$value)
print(arima_model)
#> Series: filtered_icnsa_data$value
#> ARIMA(2,1,2)
#>
#> Coefficients:
#> ar1 ar2 ma1 ma2
#> 0.9676 -0.0723 -1.2112 0.2228
#> s.e. 0.1181 0.0996 0.1159 0.1135
#>
#> sigma^2 = 2.547e+09: log likelihood = -32970.14
#> AIC=65950.27 AICc=65950.29 BIC=65979.76
# Fit a regARIMA model using ICNSA and IURNSA data
reg_arima_model <- auto.arima(filtered_icnsa_data$value, xreg = filtered_iurnsa_data$value)
print(summary(reg_arima_model))
#> Series: filtered_icnsa_data$value
#> Regression with ARIMA(2,1,1) errors
#>
#> Coefficients:
#> ar1 ar2 ma1 xreg
#> 0.5630 0.2223 -0.9883 71614.85
#> s.e. 0.0222 0.0207 0.0028 5478.41
#>
#> sigma^2 = 2.452e+09: log likelihood = -32919.15
#> AIC=65848.3 AICc=65848.32 BIC=65877.79
#>
#> Training set error measures:
#> ME RMSE MAE MPE MAPE MASE
#> Training set 840.8758 49468.03 33157.92 -0.9803505 8.795835 0.957651
#> ACF1
#> Training set -0.007676926
tsdisplay(residuals(reg_arima_model), lag.max=30)
# Perform Ljung-Box test for residuals autocorrelation
ljung_box_test <- Box.test(reg_arima_model$residuals, lag = 12, type = "Ljung-Box")
print(ljung_box_test)
#>
#> Box-Ljung test
#>
#> data: reg_arima_model$residuals
#> X-squared = 188.71, df = 12, p-value < 2.2e-16
# Forecasting for the next week
forecast_icnsa <- forecast(reg_arima_model, xreg = tail(filtered_iurnsa_data$value, 1))
forecast_value <- forecast_icnsa$mean[1]
cat("\nForecasted value for the next week:", forecast_value)
#>
#> Forecasted value for the next week: 242908.1
Data Analysis:
Model Diagnostics
Forecasted value for the next week: 242908.1
Created on 2024-02-22 with reprex v2.1.0
Homework 2: ICNSA data forecast using smoothing technique - Holt-Winters model
library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(reprex)
# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")
# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]
# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)
# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2981
head(icnsa_data)
#> # A tibble: 6 × 2
#> date value
#> <date> <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000
# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#> date value
#> Min. :1967-01-07 Min. : 133000
#> 1st Qu.:1981-04-18 1st Qu.: 265794
#> Median :1995-07-29 Median : 326900
#> Mean :1995-07-29 Mean : 364957
#> 3rd Qu.:2009-11-07 3rd Qu.: 406633
#> Max. :2024-02-17 Max. :6161268
cat("\n\n")
# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
geom_line() +
labs(title = "Time Series Plot of ICNSA Claims Data",
x = "Date (Year)",
y = "value")
# Lets visually see the ICNSA trend during pandemic
# According to the web sources, the pandemic was declared in March 2020 till June/July 2021
ggplot(icnsa_data, aes(x=date, y=value)) +
geom_line() +
labs(title="Initial Claims (ICNSA) over Time", x="Date", y="Initial Claims (ICNSA)") +
theme_minimal() +
geom_rect(aes(xmin=as.Date("2020-03-01"), xmax=as.Date("2021-07-31"), ymin=-Inf, ymax=Inf),
fill="white", alpha=0.004) +
geom_vline(xintercept=as.Date("2020-03-01"), color="orange", linetype="dashed") +
geom_vline(xintercept=as.Date("2021-07-31"), color="orange", linetype="dashed")
# Calculating the average number of claims before and during the pandemic
pre_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date < as.Date("2020-01-01")) ])
subset_data <- subset(icnsa_data, date >= as.Date("2020-03-01") & date <= as.Date("2021-07-31"))
pandemic_avg <- mean(subset_data$value)
post_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date > as.Date("2021-07-31")) ])
cat("Average number of claims before pandemic:", pre_pandemic_avg, "\n")
#> Average number of claims before pandemic: 350231.6
cat("Average number of claims during pandemic:", pandemic_avg, "\n")
#> Average number of claims during pandemic: 1172247
cat("Average number of claims after pandemic:", post_pandemic_avg, "\n")
#> Average number of claims after pandemic: 229688.1
# Create a dataframe for the bar chart
bar_data <- data.frame(period = c("Pre-pandemic", "Pandemic", "Post-pandemic"),
avg_value = c(pre_pandemic_avg, pandemic_avg, post_pandemic_avg))
# Create a bar chart
ggplot(bar_data, aes(x=period, y=avg_value, fill=period)) +
geom_bar(stat="identity") +
labs(title="Average Initial Claims (ICNSA) by Period", x="Period", y="Average Initial Claims (ICNSA)") +
theme_minimal()
pre_pandemic_sd <- sd(icnsa_data$value[which(icnsa_data$date < as.Date("2020-03-01"))])
# Calculate the Z-score of the pandemic
z_score <- (pandemic_avg - pre_pandemic_avg) / pre_pandemic_sd
# Output the Z-score
cat("Z-score of the pandemic: ", z_score, "\n")
#> Z-score of the pandemic: 7.03949
# A positive Z-score indicates that the pandemic average is higher than the pre-pandemic average
# Removing the data for COVID period
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-07-31")
icnsa_data_without_covid <- subset(icnsa_data, !(date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA without Covid period: ", nrow(icnsa_data_without_covid), "\n")
#> Number of rows in ICNSA without Covid period: 2907
# Plotting ACF and PACF for filtered ICNSA data
acf(icnsa_data_without_covid$value)
pacf(icnsa_data_without_covid$value)
# Getting ICNSA data during covid period
icnsa_data_during_covid <- subset(icnsa_data, (date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA during Covid period: ", nrow(icnsa_data_during_covid), "\n")
#> Number of rows in ICNSA during Covid period: 74
# Getting data for covid period and using cubic spline to impute new values
# To choose value of lambda - I would be using smoothing parameter is called `spar`
# Choose a range of spar values
spar_values <- seq(0.1, 0.6, by = 0.2)
best_spar <- NULL
best_mse <- Inf
par(mfcol=c(length(spar_values), 1), mar=c(2, 2, 1, 1))
for(spar in spar_values) {
# Fit smooth spline for each spar value
spline_fit <- smooth.spline(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, spar=spar)
predicted_values <- predict(spline_fit, as.numeric(icnsa_data$date))
# Calculate mean squared error
mse <- mean((predicted_values$y - icnsa_data$value)^2)
# If the current spar value has a lower MSE than the previous best spar value, update the best spar value and its corresponding MSE
if(mse < best_mse) {
best_spar <- spar
best_mse <- mse
}
plot(icnsa_data$date, icnsa_data$value, type='l', main=paste("Spar =", spar),
xlab="Date", ylab="ICNSA Values", col="purple")
lines(icnsa_data$date, predicted_values$y, col="black")
legend("topleft", legend=c("Original", "Smoothed"), col=c("purple", "black"), lty=1)
}
cat("Best value for lambda:", best_spar, "\n")
#> Best value for lambda: 0.1
cat("Best value for MSE:", best_mse, "\n")
#> Best value for MSE: 57230081019
par(mfcol=c(1, 1))
# Impute missing values using cubic spline
spline_fit <- smooth.spline(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, spar = 0.1)
#plotting the spline against our original data
plot(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, type = "p", col = "purple", main = "Original ICNSA vs. Smoothed ICNSA", xlab = "Date", ylab = "Value", xlim = range(as.numeric(icnsa_data_without_covid$date)), ylim = range(icnsa_data_without_covid$value))
lines(spline_fit, col = "black")
legend("topright", legend = c("Original ICNSA series", "Smoothed spline series "), col = c("purple", "black"), lty = 1)
imputed_values <- predict(spline_fit, as.numeric(icnsa_data_during_covid$date))$y
cat("Head for imputed values:", head(imputed_values), "\n")
#> Head for imputed values: 269471.7 271585.5 273635.9 275623.4 277548.6 279412.1
# Combine the imputed values with the original dataframe
new_covid_data <- data.frame(date = icnsa_data_during_covid$date, value = imputed_values)
combined_ICNSA <- rbind(icnsa_data_without_covid, new_covid_data)
cat("Total number of rows in the updated ICNSA dataset:", nrow(combined_ICNSA), "\n")
#> Total number of rows in the updated ICNSA dataset: 2981
# Converting it into time-series object
combined_ICNSA_ts <- ts(combined_ICNSA$value, frequency = 52)
# Fit a multiplicative Holt-Winters model
multiplicative_hw_model <- HoltWinters(combined_ICNSA_ts, seasonal="multiplicative")
multiplicative_hw_forecast <- forecast(multiplicative_hw_model, h=1)
# Fit an additive Holt-Winters model
additive_hw_model <- HoltWinters(combined_ICNSA_ts, seasonal="additive")
additive_hw_forecast <- forecast(additive_hw_model, h=1)
cat("Multiplicative Forecast:", multiplicative_hw_forecast$mean, "\n")
#> Multiplicative Forecast: 247172
cat("Additive Forecast:", additive_hw_forecast$mean, "\n")
#> Additive Forecast: 271732.7
Forecast value for the next week using multiplicative Holt-Winters model: 247172
Created on 2024-02-28 with reprex v2.1.0
Steps Performed
Loaded the most current Initial claims (ICNSA) data, not seasonally adjusted.
Empirical Data Analysis for the Covid period:
First the time-series plot for an entire ICNSA data to see the trend and check which year it is changing.
Chose a time window for the pandemic: 1st March 2020 - 31st July 2021.
To verify I calculated the average number of claims before the pandemic, during the above-chosen dates, and after the pandemic.
From the above stats we can notice that during the chosen time period for Covid, the average number of claims is more than 3 times.
To get more clarity I calculated the Z score of the above-chosen dates: Z-score of the pandemic: 7.03949 A positive Z-score indicates that the pandemic average is higher than the pre-pandemic average
Used a cubic spline methodology to impute new values for the Covid period: After careful observation, the best values of Lambda came out to be 0.1 spar_values <- seq(0.1, 0.6, by = 0.2) The value of lambda was tested from 0.1 to 0.6 with a step size of 0.2 As we can see in the image, lambda=0.1 matches the best with the overall trend of the ICNSA series and the MSE value is the least for lambda = 0.1.
Used both a multiplicative and an additive Holt-Winters model to forecast the next ICNSA value using the newly imputed Covid values.
The forecasted value using the multiplicative Holt-Winters model: 247172 The forecasted value using the additive Holt-Winters model: 271732.7
Homework3 - Structural Time Series model using State Space methodology
library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(reprex)
library(KFAS)
#> Please cite KFAS in publications by using:
#>
#> Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
library(zoo)
library(dlm)
#>
#> Attaching package: 'dlm'
#> The following object is masked from 'package:ggplot2':
#>
#> %+%
library(dplyr)
library(zoo)
library(imputeTS)
#>
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:zoo':
#>
#> na.locf
# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")
# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]
# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)
# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2982
head(icnsa_data)
#> # A tibble: 6 × 2
#> date value
#> <date> <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000
# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#> date value
#> Min. :1967-01-07 Min. : 133000
#> 1st Qu.:1981-04-19 1st Qu.: 265755
#> Median :1995-08-01 Median : 326806
#> Mean :1995-08-01 Mean : 364900
#> 3rd Qu.:2009-11-12 3rd Qu.: 406549
#> Max. :2024-02-24 Max. :6161268
cat("\n\n")
# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
geom_line() +
labs(title = "Time Series Plot of ICNSA Claims Data",
x = "Date (Year)",
y = "value")
from_date <- as.Date("2020-01-01")
to_date <- as.Date("2021-12-31")
filtered_df <- icnsa_data %>%
filter(date >= from_date & date <= to_date)
ggplot(filtered_df, aes(x = date, y = value)) +
geom_line() +
labs(title = "Data from Date Range",
x = "Date",
y = "Value")
# From the above graph we can see that the covid start date is somewhere around 1st March 2020 till 30th June 2021
# Lets visually see the ICNSA trend during pandemic
# According to the web sources, the pandemic was declared in March 2020 till June/July 2021
ggplot(icnsa_data, aes(x=date, y=value)) +
geom_line() +
labs(title="Initial Claims (ICNSA) over Time", x="Date", y="Initial Claims (ICNSA)") +
theme_minimal() +
geom_rect(aes(xmin=as.Date("2020-03-01"), xmax=as.Date("2021-06-30"), ymin=-Inf, ymax=Inf),
fill="white", alpha=0.004) +
geom_vline(xintercept=as.Date("2020-03-01"), color="orange", linetype="dashed") +
geom_vline(xintercept=as.Date("2021-07-31"), color="orange", linetype="dashed")
# Calculating the average number of claims before and during the pandemic
pre_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date < as.Date("2020-03-01")) ])
subset_data <- subset(icnsa_data, date >= as.Date("2020-03-01") & date <= as.Date("2021-06-30"))
pandemic_avg <- mean(subset_data$value)
post_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date > as.Date("2021-06-30")) ])
cat("Average number of claims before pandemic:", pre_pandemic_avg, "\n")
#> Average number of claims before pandemic: 349907.1
cat("Average number of claims during pandemic:", pandemic_avg, "\n")
#> Average number of claims during pandemic: 1230384
cat("Average number of claims after pandemic:", post_pandemic_avg, "\n")
#> Average number of claims after pandemic: 234487.1
# Create a dataframe for the bar chart
bar_data <- data.frame(period = c("Pre-pandemic avg", "Pandemic avg", "Post-pandemic avg"),
avg_value = c(pre_pandemic_avg, pandemic_avg, post_pandemic_avg))
# Create a bar chart
ggplot(bar_data, aes(x=period, y=avg_value, fill=period)) +
geom_bar(stat="identity") +
labs(title="Average Initial Claims (ICNSA) by Period", x="Period", y="Average Initial Claims (ICNSA)") +
theme_minimal()
# Removing the data for COVID period
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-06-30")
non_covid_data <- subset(icnsa_data, !(date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA including Covid period:", nrow(icnsa_data), "\n")
#> Number of rows in ICNSA including Covid period: 2982
cat("Number of rows in ICNSA without Covid period:", nrow(non_covid_data), "\n")
#> Number of rows in ICNSA without Covid period: 2913
# Plotting ACF and PACF for filtered ICNSA data
acf(non_covid_data$value)
pacf(non_covid_data$value)
# Getting ICNSA data during covid period
covid_data_rows = nrow(subset(icnsa_data, (date >= covid_start_date & date <= covid_end_date)))
cat("Number of rows in ICNSA during Covid period: ", covid_data_rows, "\n")
#> Number of rows in ICNSA during Covid period: 69
# Setting covid period data to NA
modified_icnsa <- icnsa_data
modified_icnsa$value[modified_icnsa$date >= as.Date("2020-03-01") & modified_icnsa$date <= as.Date("2021-06-30")] <- NA
nrow(modified_icnsa)
#> [1] 2982
# Getting the index of the covid period
covid_period_index <- which(modified_icnsa$date >= as.Date("2020-03-01") & modified_icnsa$date <= as.Date("2021-06-30"))
# Imputing missing values using na_kalman() within the COVID period
imputed_values <- na_kalman(modified_icnsa$value, smooth = TRUE)
# Updating the ICNSA dataframe with the imputed values within the COVID period
modified_icnsa$value[covid_period_index] <- imputed_values[covid_period_index]
# Plot the original and imputed data
plot(modified_icnsa$date, icnsa_data$value, type = "l", col = "blue", lwd = 2,
xlab = "Date", ylab = "Initial Claims (ICNSA)", main = "Comparison of Original and Imputed Values")
lines(modified_icnsa$date, imputed_values, col = "red", lwd = 2)
legend("topright", legend = c("Original", "Imputed"), col = c("blue", "red"), lwd = 2, bty = "n")
plot(modified_icnsa$date[covid_period_index], icnsa_data$value[covid_period_index], type = "l", col = "blue", lwd = 2,
xlab = "Date", ylab = "Initial Claims (ICNSA)", main = "Comparison of Original and Imputed Values")
lines(modified_icnsa$date[covid_period_index], imputed_values[covid_period_index], col = "red", lwd = 2)
legend("topright", legend = c("Original", "Imputed"), col = c("blue", "red"), lwd = 2, bty = "n")
# Fit a structural time series model with trend, seasonal, and error components
state_space_model <- StructTS(ts(modified_icnsa$value, frequency = 52), type = "BSM")
summary(state_space_model)
#> Length Class Mode
#> coef 4 -none- numeric
#> loglik 1 -none- numeric
#> loglik0 1 -none- numeric
#> data 2982 ts numeric
#> residuals 2982 ts numeric
#> fitted 8946 mts numeric
#> call 3 -none- call
#> series 1 -none- character
#> code 1 -none- numeric
#> model 7 -none- list
#> model0 7 -none- list
#> xtsp 3 -none- numeric
# Taking another series as covariate series
# Reading CCNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
ccnsa_data <- fredr(series_id = "CCNSA")
# Taking date and value column from the dataset
ccnsa_data <- ccnsa_data[c("date", "value")]
# Converting `date` column from string to date
ccnsa_data$date <- as.Date(ccnsa_data$date)
# Checking count of rows in the dataset
cat("Total number of rows in the CCNSA dataset:", nrow(ccnsa_data), "\n")
#> Total number of rows in the CCNSA dataset: 2981
head(ccnsa_data)
#> # A tibble: 6 × 2
#> date value
#> <date> <dbl>
#> 1 1967-01-07 1594000
#> 2 1967-01-14 1563000
#> 3 1967-01-21 1551000
#> 4 1967-01-28 1533000
#> 5 1967-02-04 1534000
#> 6 1967-02-11 1557000
# Basic summary statistics
cat("Summary of value column in CCNSA dataset:", "\n")
#> Summary of value column in CCNSA dataset:
summary(ccnsa_data)
#> date value
#> Min. :1967-01-07 Min. : 785000
#> 1st Qu.:1981-04-18 1st Qu.: 1997000
#> Median :1995-07-29 Median : 2493008
#> Mean :1995-07-29 Mean : 2761405
#> 3rd Qu.:2009-11-07 3rd Qu.: 3145405
#> Max. :2024-02-17 Max. :23037921
cat("\n\n")
# Making ICNSA data compatible with CCNSA data
modified_icnsa <- subset(modified_icnsa, date >= as.Date("1967-01-07") & date <= as.Date("2024-02-17"))
# Merge ICNSA and CCNSA dataframes based on their common date column
merged_data <- merge(modified_icnsa, ccnsa_data, by = "date", all = FALSE)
head(merged_data)
#> date value.x value.y
#> 1 1967-01-07 346000 1594000
#> 2 1967-01-14 334000 1563000
#> 3 1967-01-21 277000 1551000
#> 4 1967-01-28 252000 1533000
#> 5 1967-02-04 274000 1534000
#> 6 1967-02-11 276000 1557000
# 'merged_data' is your data frame containing 'value.x' and 'value.y'
# Create design matrix Z with covariate
Z <- cbind(1, merged_data$value.y) # Trend and covariate
# Check dimensions of Z
dim(Z) # Should be (n, p)
#> [1] 2981 2
# Create state equation matrices
p <- ncol(Z) # Number of state variables
T <- diag(1, p) # Transition matrix for state variables
Q <- diag(0.01, p) # State covariance matrix
# Create observation equation matrices
n <- nrow(merged_data) # Number of observations
H <- diag(1, n) # Observation covariance matrix
# Check dimensions of H
dim(H) # Should be (n, n)
#> [1] 2981 2981
# Create initial state and initial state covariance
initial_state <- rep(0, p) # Initial state vector
initial_state_cov <- diag(1e5, p) # Initial state covariance matrix
# Check dimensions of initial_state_cov
dim(initial_state_cov) # Should be (p, p)
#> [1] 2 2
# Construct state space model
model <- dlm(
FF = Z, # Design matrix
V = H, # Observation covariance matrix
GG = T, # Transition matrix
W = Q, # State covariance matrix
m0 = initial_state, # Initial state vector
C0 = initial_state_cov # Initial state covariance matrix
)
# Fit the model
fit <- dlmFilter(merged_data$value.x, model)
# Print summary of the fitted model
print(summary(fit))
#> Length Class Mode
#> y 2981 -none- numeric
#> mod 6 dlm list
#> m 4 -none- numeric
#> U.C 2 -none- list
#> D.C 4 -none- numeric
#> a 2 -none- numeric
#> U.R 1 -none- list
#> D.R 2 -none- numeric
#> f 2981 -none- numeric
str(model)
#> List of 6
#> $ m0: num [1:2] 0 0
#> $ C0: num [1:2, 1:2] 1e+05 0e+00 0e+00 1e+05
#> $ FF: num [1:2981, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
#> $ V : num [1:2981, 1:2981] 1 0 0 0 0 0 0 0 0 0 ...
#> $ GG: num [1:2, 1:2] 1 0 0 1
#> $ W : num [1:2, 1:2] 0.01 0 0 0.01
#> - attr(*, "class")= chr "dlm"
# Forecasting horizon
forecast_horizon <- 1 # Forecast for the next week
# Forecast using the fitted model
forecast <- tryCatch(
{
dlmForecast(fit, nAhead = forecast_horizon)
},
error = function(e) {
message("Error occurred during forecasting: ", conditionMessage(e))
return(NULL)
}
)
forecasted_value <- forecast$f[1, 1]
print(paste("Forecasted value for the next week:", forecasted_value))
#> [1] "Forecasted value for the next week: 312076.351407013"
Loaded the most current Initial claims (ICNSA) data, not seasonally adjusted.
Empirical Data Analysis for the Covid period:
Used Kalman filtering technique to impute the Pendamic values.
Then I fitted a structural time series model with trend, seasonal, and error components using the StructTS() function.
Then I loaded another series i.e., CCNSA (Continued Claims (Insured Unemployment)) - This is similar to the ICNSA series as it is also based on the unemployment claims.
Then I made both series compatible to merge them.
Then I defined and fitted a structural time series model with trend, seasonal, and error using state space methodology and including covariate series.
Forecasted value - 312076.35
Created on 2024-03-07 with reprex v2.1.0
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(fpp3)
#> ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
#> ✔ tibble 3.2.1 ✔ tsibble 1.1.4
#> ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
#> ✔ tidyr 1.3.1 ✔ feasts 0.3.1
#> ✔ lubridate 1.9.3 ✔ fable 0.3.3
#> ✔ ggplot2 3.4.4 ✔ fabletools 0.4.0
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date() masks base::date()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval() masks lubridate::interval()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ tsibble::setdiff() masks base::setdiff()
#> ✖ tsibble::union() masks base::union()
library(tidyverse)
library(tsbox)
library(zoo)
#>
#> Attaching package: 'zoo'
#> The following object is masked from 'package:tsibble':
#>
#> index
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(seasonal)
#>
#> Attaching package: 'seasonal'
#> The following object is masked from 'package:tibble':
#>
#> view
library(astsa)
#>
#> Attaching package: 'astsa'
#> The following objects are masked from 'package:seasonal':
#>
#> trend, unemp
#> The following object is masked from 'package:forecast':
#>
#> gas
library(patchwork)
library(tseries)
library(fredr)
library(quarto)
library(xts)
#>
#> ######################### Warning from 'xts' package ##########################
#> # #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
#> # source() into this session won't work correctly. #
#> # #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
#> # dplyr from breaking base R's lag() function. #
#> # #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
#> # #
#> ###############################################################################
#>
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#>
#> first, last
library(dplyr)
library(ggplot2)
library(vars)
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:patchwork':
#>
#> area
#> The following object is masked from 'package:dplyr':
#>
#> select
#> Loading required package: strucchange
#> Loading required package: sandwich
#>
#> Attaching package: 'strucchange'
#> The following object is masked from 'package:stringr':
#>
#> boundary
#> Loading required package: urca
#> Loading required package: lmtest
#>
#> Attaching package: 'vars'
#> The following object is masked from 'package:fable':
#>
#> VAR
library(readxl)
library(lubridate)
library(tseries)
library(BigVAR)
#> Loading required package: lattice
library(reprex)
# Reading all time series from excel file
crp_no <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/CRP_NO.xlsx")
crp_vs <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/CRP_VS.xlsx")
iti_ti <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/ITI_TI.xlsx")
iti_uo <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/ITI_UO.xlsx")
# Converting Period column to date type for all 4 time series
crp_no$Period <- as.Date(crp_no$Period)
crp_vs$Period <- as.Date(crp_vs$Period)
iti_ti$Period <- as.Date(iti_ti$Period)
iti_uo$Period <- as.Date(iti_uo$Period)
# Show the dataframe
head(crp_no)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <chr>
#> 1 1993-01-01 5245.0
#> 2 1993-02-01 5313.0
#> 3 1993-03-01 4723.0
#> 4 1993-04-01 5068.0
#> 5 1993-05-01 5290.0
#> 6 1993-06-01 4969.0
head(crp_vs)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <chr>
#> 1 1993-01-01 5441.0
#> 2 1993-02-01 5100.0
#> 3 1993-03-01 5157.0
#> 4 1993-04-01 5016.0
#> 5 1993-05-01 4936.0
#> 6 1993-06-01 5167.0
head(iti_ti)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <chr>
#> 1 1993-01-01 39506.0
#> 2 1993-02-01 39528.0
#> 3 1993-03-01 39371.0
#> 4 1993-04-01 39231.0
#> 5 1993-05-01 39398.0
#> 6 1993-06-01 39309.0
head(iti_uo)
#> # A tibble: 6 × 2
#> Period Value
#> <date> <chr>
#> 1 1993-01-01 80506.0
#> 2 1993-02-01 80035.0
#> 3 1993-03-01 79572.0
#> 4 1993-04-01 80068.0
#> 5 1993-05-01 80174.0
#> 6 1993-06-01 79100.0
# Function to clean data and convert 'Value' column to double
clean_and_convert_data <- function(data) {
# Remove rows where 'Value' column is "NA"
data <- data[!is.na(data$Value) & data$Value != "NA", ]
# Remove commas from 'Value' column
data$Value <- gsub(",", "", data$Value)
# Convert 'Value' column to numeric
data$Value <- as.numeric(data$Value)
return(data)
}
crp_no <- clean_and_convert_data(crp_no)
crp_vs <- clean_and_convert_data(crp_vs)
iti_ti <- clean_and_convert_data(iti_ti)
iti_uo <- clean_and_convert_data(iti_uo)
# Checking total number of rows in all the time series
cat("- Total number of rows in crp_no dataset:", nrow(crp_no), "\n")
#> - Total number of rows in crp_no dataset: 374
cat("- Total number of rows in crp_vs dataset:", nrow(crp_vs), "\n")
#> - Total number of rows in crp_vs dataset: 374
cat("- Total number of rows in iti_ti dataset:", nrow(iti_ti), "\n")
#> - Total number of rows in iti_ti dataset: 374
cat("- Total number of rows in iti_uo dataset:", nrow(iti_uo), "\n\n")
#> - Total number of rows in iti_uo dataset: 374
# Basic summary statistics for all 4 series
cat("- Summary of crp_no dataset:", "\n")
#> - Summary of crp_no dataset:
summary(crp_no)
#> Period Value
#> Min. :1993-01-01 Min. : 1393
#> 1st Qu.:2000-10-08 1st Qu.: 2110
#> Median :2008-07-16 Median : 4868
#> Mean :2008-07-16 Mean : 4628
#> 3rd Qu.:2016-04-23 3rd Qu.: 6202
#> Max. :2024-02-01 Max. :10535
cat("- Summary of crp_vs dataset:", "\n")
#> - Summary of crp_vs dataset:
summary(crp_vs)
#> Period Value
#> Min. :1993-01-01 Min. : 1384
#> 1st Qu.:2000-10-08 1st Qu.: 2139
#> Median :2008-07-16 Median : 5028
#> Mean :2008-07-16 Mean : 4642
#> 3rd Qu.:2016-04-23 3rd Qu.: 6164
#> Max. :2024-02-01 Max. :10203
cat("- Summary of iti_ti dataset:", "\n")
#> - Summary of iti_ti dataset:
summary(iti_ti)
#> Period Value
#> Min. :1993-01-01 Min. :33132
#> 1st Qu.:2000-10-08 1st Qu.:37582
#> Median :2008-07-16 Median :39319
#> Mean :2008-07-16 Mean :40669
#> 3rd Qu.:2016-04-23 3rd Qu.:44426
#> Max. :2024-02-01 Max. :53721
cat("- Summary of iti_uo dataset:", "\n")
#> - Summary of iti_uo dataset:
summary(iti_uo)
#> Period Value
#> Min. :1993-01-01 Min. : 76417
#> 1st Qu.:2000-10-08 1st Qu.: 93123
#> Median :2008-07-16 Median :111062
#> Mean :2008-07-16 Mean :108078
#> 3rd Qu.:2016-04-23 3rd Qu.:119714
#> Max. :2024-02-01 Max. :134990
cat("\n\n")
# Data Analysis
## Visualizing the datasets -
# Plotting crp_no time-series data
ggplot(crp_no, aes(x = Period, y = Value)) +
geom_line() +
labs(title = "Time Series Plot of CRP_NO series",
x = "Date (Month-Year)",
y = "Value")
# Plotting crp_vs time-series data
ggplot(crp_vs, aes(x = Period, y = Value)) +
geom_line() +
labs(title = "Time Series Plot of CRP_VS series",
x = "Date (Month-Year)",
y = "Value")
# Plotting iti_ti time-series data
ggplot(iti_ti, aes(x = Period, y = Value)) +
geom_line() +
labs(title = "Time Series Plot of ITI_TI series",
x = "Date (Month-Year)",
y = "Value")
# Plotting iti_uo time-series data
ggplot(iti_uo, aes(x = Period, y = Value)) +
geom_line() +
labs(title = "Time Series Plot of ITI_UO series",
x = "Date (Month-Year)",
y = "Value")
## Correlation coefficient -
crpNo_crpVs_corr <- cor(crp_no$Value, crp_vs$Value)
cat("Correlation between Crp_no and Crp_vs series: ", crpNo_crpVs_corr, "\n")
#> Correlation between Crp_no and Crp_vs series: 0.9954609
crpNo_itiTi_corr <- cor(crp_no$Value, iti_ti$Value)
cat("Correlation between Crp_no and Iti_ti series: ", crpNo_itiTi_corr, "\n")
#> Correlation between Crp_no and Iti_ti series: 0.5531556
crpNo_itiUo_corr <- cor(crp_no$Value, iti_uo$Value)
cat("Correlation between Crp_no and Iti_uo series: ", crpNo_itiUo_corr, "\n")
#> Correlation between Crp_no and Iti_uo series: -0.6597469
## Checking for stationarity -
# Function to check stationarity using ADF and KPSS tests
check_stationarity <- function(data) {
adf_test <- adf.test(data)
kpss_test <- kpss.test(data)
cat("ADF Test:\n")
print(adf_test)
#cat("\nKPSS Test:\n")
#print(kpss_test)
}
# Assuming you have two dataframes: real_estate_data and mortgage_interest_data
# Check stationarity for both series
cat("Checking stationarity for crp_no: ")
#> Checking stationarity for crp_no:
check_stationarity(crp_no$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#>
#> Augmented Dickey-Fuller Test
#>
#> data: data
#> Dickey-Fuller = -2.7454, Lag order = 7, p-value = 0.2624
#> alternative hypothesis: stationary
cat("Checking stationarity for crp_vs: ")
#> Checking stationarity for crp_vs:
check_stationarity(crp_vs$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#>
#> Augmented Dickey-Fuller Test
#>
#> data: data
#> Dickey-Fuller = -2.703, Lag order = 7, p-value = 0.2802
#> alternative hypothesis: stationary
cat("Checking stationarity for iti_ti: ")
#> Checking stationarity for iti_ti:
check_stationarity(iti_ti$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#>
#> Augmented Dickey-Fuller Test
#>
#> data: data
#> Dickey-Fuller = -2.4416, Lag order = 7, p-value = 0.3906
#> alternative hypothesis: stationary
cat("Checking stationarity for iti_uo: ")
#> Checking stationarity for iti_uo:
check_stationarity(iti_uo$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#>
#> Augmented Dickey-Fuller Test
#>
#> data: data
#> Dickey-Fuller = -3.5722, Lag order = 7, p-value = 0.03585
#> alternative hypothesis: stationary
## ACF and PACF of all 4 time series
# Plotting ACF and PACF for crp_no series
acf(crp_no$Value)
pacf(crp_no$Value)
# Plotting ACF and PACF for crp_vs series
acf(crp_vs$Value)
pacf(crp_vs$Value)
# Plotting ACF and PACF for iti_ti series
acf(iti_ti$Value)
pacf(iti_ti$Value)
# Plotting ACF and PACF for iti_uo series
acf(iti_uo$Value)
pacf(iti_uo$Value)
## Decomposition for crp_no -
crp_no_ts <- ts(crp_no$Value, frequency=12)
crp_no_decomp <- decompose(crp_no_ts, "multiplicative")
plot(crp_no_decomp)
## Decomposition for crp_vs -
crp_vs_ts <- ts(crp_vs$Value, frequency=12)
crp_vs_decomp <- decompose(crp_vs_ts, "multiplicative")
plot(crp_vs_decomp)
## Decomposition for iti_ti -
iti_ti_ts <- ts(iti_ti$Value, frequency=12)
iti_ti_decomp <- decompose(iti_ti_ts, "multiplicative")
plot(iti_ti_decomp)
## Decomposition for iti_uo -
iti_uo_ts <- ts(iti_uo$Value, frequency=12)
iti_uo_decomp <- decompose(iti_uo_ts, "multiplicative")
plot(iti_uo_decomp)
# Modelling
# Merging all 4 time series -
combined_data <- merge(crp_no, crp_vs, by = "Period", all = TRUE, suffixes = c("_crp_no", "_crp_vs"))
combined_data <- merge(combined_data, iti_ti, by = "Period", all = TRUE, suffixes = c("_crp_no", "_iti_ti"))
combined_data <- merge(combined_data, iti_uo, by = "Period", all = TRUE, suffixes = c("_crp_no", "_iti_uo"))
#> Warning in merge.data.frame(combined_data, iti_uo, by = "Period", all = TRUE, :
#> column name 'Value_crp_no' is duplicated in the result
colnames(combined_data)[4] <- "Value_iti_ti"
nrow(combined_data)
#> [1] 374
head(combined_data)
#> Period Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> 1 1993-01-01 5245 5441 39506 80506
#> 2 1993-02-01 5313 5100 39528 80035
#> 3 1993-03-01 4723 5157 39371 79572
#> 4 1993-04-01 5068 5016 39231 80068
#> 5 1993-05-01 5290 4936 39398 80174
#> 6 1993-06-01 4969 5167 39309 79100
# Create VAR model object for VAR(1)
combined_data$Period <- as.Date(combined_data$Period)
combined_data <- na.omit(combined_data)
# Convert the combined data to a time series object
ts_combined_data <- ts(combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], start = as.Date("1993-01-01"), frequency = 12)
var1_model <- VAR(ts_combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], p = 1)
# Print summary of VAR(1) model
summary(var1_model)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: Value_crp_no, Value_crp_vs, Value_iti_ti, Value_iti_uo
#> Deterministic variables: const
#> Sample size: 373
#> Log Likelihood: -11038.371
#> Roots of the characteristic polynomial:
#> 0.9938 0.9938 0.9852 0.1941
#> Call:
#> VAR(y = ts_combined_data[, c("Value_crp_no", "Value_crp_vs",
#> "Value_iti_ti", "Value_iti_uo")], p = 1)
#>
#>
#> Estimation results for equation Value_crp_no:
#> =============================================
#> Value_crp_no = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 -0.013019 0.076306 -0.171 0.864620
#> Value_crp_vs.l1 0.969321 0.076112 12.735 < 2e-16 ***
#> Value_iti_ti.l1 0.011246 0.005921 1.899 0.058310 .
#> Value_iti_uo.l1 -0.006073 0.001709 -3.553 0.000431 ***
#> const 379.330929 238.470679 1.591 0.112540
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 367.9 on 368 degrees of freedom
#> Multiple R-Squared: 0.9808, Adjusted R-squared: 0.9806
#> F-statistic: 4693 on 4 and 368 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_crp_vs:
#> =============================================
#> Value_crp_vs = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 0.184887 0.059757 3.094 0.00213 **
#> Value_crp_vs.l1 0.790988 0.059605 13.271 < 2e-16 ***
#> Value_iti_ti.l1 0.007557 0.004637 1.630 0.10400
#> Value_iti_uo.l1 -0.003186 0.001339 -2.380 0.01783 *
#> const 143.137783 186.751102 0.766 0.44389
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 288.1 on 368 degrees of freedom
#> Multiple R-Squared: 0.9882, Adjusted R-squared: 0.9881
#> F-statistic: 7708 on 4 and 368 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_iti_ti:
#> =============================================
#> Value_iti_ti = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 -0.294046 0.085233 -3.450 0.000626 ***
#> Value_crp_vs.l1 0.295386 0.085016 3.474 0.000573 ***
#> Value_iti_ti.l1 0.996967 0.006614 150.745 < 2e-16 ***
#> Value_iti_uo.l1 0.001352 0.001909 0.708 0.479210
#> const -15.845201 266.367194 -0.059 0.952597
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 410.9 on 368 degrees of freedom
#> Multiple R-Squared: 0.9908, Adjusted R-squared: 0.9907
#> F-statistic: 9941 on 4 and 368 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_iti_uo:
#> =============================================
#> Value_iti_uo = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 -0.068184 0.193488 -0.352 0.7247
#> Value_crp_vs.l1 0.130156 0.192996 0.674 0.5005
#> Value_iti_ti.l1 -0.030413 0.015014 -2.026 0.0435 *
#> Value_iti_uo.l1 1.003758 0.004334 231.589 <2e-16 ***
#> const 686.995408 604.683739 1.136 0.2566
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 932.8 on 368 degrees of freedom
#> Multiple R-Squared: 0.9967, Adjusted R-squared: 0.9967
#> F-statistic: 2.812e+04 on 4 and 368 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no 135317 78920 7607 68864
#> Value_crp_vs 78920 82987 5282 5598
#> Value_iti_ti 7607 5282 168827 100727
#> Value_iti_uo 68864 5598 100727 870038
#>
#> Correlation matrix of residuals:
#> Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no 1.00000 0.74474 0.05033 0.20070
#> Value_crp_vs 0.74474 1.00000 0.04462 0.02083
#> Value_iti_ti 0.05033 0.04462 1.00000 0.26282
#> Value_iti_uo 0.20070 0.02083 0.26282 1.00000
# Fit VAR(p) model
p <- 2
varp_model <- VAR(ts_combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], p = p)
# Print summary of VAR(1) model
summary(varp_model)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: Value_crp_no, Value_crp_vs, Value_iti_ti, Value_iti_uo
#> Deterministic variables: const
#> Sample size: 372
#> Log Likelihood: -10886.418
#> Roots of the characteristic polynomial:
#> 0.9921 0.9884 0.9884 0.5737 0.5162 0.5162 0.3349 0.225
#> Call:
#> VAR(y = ts_combined_data[, c("Value_crp_no", "Value_crp_vs",
#> "Value_iti_ti", "Value_iti_uo")], p = p)
#>
#>
#> Estimation results for equation Value_crp_no:
#> =============================================
#> Value_crp_no = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 -0.10827 0.07774 -1.393 0.1646
#> Value_crp_vs.l1 0.76596 0.09734 7.869 4.15e-14 ***
#> Value_iti_ti.l1 0.04488 0.04628 0.970 0.3328
#> Value_iti_uo.l1 0.01265 0.02120 0.597 0.5510
#> Value_crp_no.l2 -0.23947 0.07654 -3.129 0.0019 **
#> Value_crp_vs.l2 0.53579 0.09066 5.910 7.90e-09 ***
#> Value_iti_ti.l2 -0.03400 0.04666 -0.729 0.4667
#> Value_iti_uo.l2 -0.01961 0.02139 -0.917 0.3599
#> const 490.55230 229.86271 2.134 0.0335 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 351.2 on 363 degrees of freedom
#> Multiple R-Squared: 0.9827, Adjusted R-squared: 0.9823
#> F-statistic: 2579 on 8 and 363 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_crp_vs:
#> =============================================
#> Value_crp_vs = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 0.13401 0.06007 2.231 0.0263 *
#> Value_crp_vs.l1 0.52272 0.07522 6.949 1.71e-11 ***
#> Value_iti_ti.l1 0.07795 0.03576 2.180 0.0299 *
#> Value_iti_uo.l1 0.01880 0.01638 1.148 0.2519
#> Value_crp_no.l2 0.03833 0.05915 0.648 0.5174
#> Value_crp_vs.l2 0.28184 0.07006 4.023 7.00e-05 ***
#> Value_iti_ti.l2 -0.07127 0.03606 -1.977 0.0488 *
#> Value_iti_uo.l2 -0.02248 0.01653 -1.360 0.1747
#> const 221.67170 177.63499 1.248 0.2129
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 271.4 on 363 degrees of freedom
#> Multiple R-Squared: 0.9897, Adjusted R-squared: 0.9894
#> F-statistic: 4348 on 8 and 363 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_iti_ti:
#> =============================================
#> Value_iti_ti = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 -0.37535 0.08045 -4.665 4.34e-06 ***
#> Value_crp_vs.l1 0.49486 0.10074 4.912 1.36e-06 ***
#> Value_iti_ti.l1 1.39798 0.04789 29.189 < 2e-16 ***
#> Value_iti_uo.l1 0.06986 0.02194 3.184 0.00158 **
#> Value_crp_no.l2 0.09328 0.07921 1.178 0.23973
#> Value_crp_vs.l2 -0.21455 0.09383 -2.287 0.02280 *
#> Value_iti_ti.l2 -0.40276 0.04829 -8.341 1.55e-15 ***
#> Value_iti_uo.l2 -0.06962 0.02214 -3.144 0.00180 **
#> const 174.33160 237.89019 0.733 0.46414
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 363.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.9929, Adjusted R-squared: 0.9928
#> F-statistic: 6364 on 8 and 363 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_iti_uo:
#> =============================================
#> Value_iti_uo = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Value_crp_no.l1 -0.59956 0.18495 -3.242 0.00130 **
#> Value_crp_vs.l1 0.45086 0.23158 1.947 0.05232 .
#> Value_iti_ti.l1 0.33136 0.11010 3.010 0.00280 **
#> Value_iti_uo.l1 1.37959 0.05043 27.355 < 2e-16 ***
#> Value_crp_no.l2 -0.41512 0.18210 -2.280 0.02321 *
#> Value_crp_vs.l2 0.58504 0.21570 2.712 0.00700 **
#> Value_iti_ti.l2 -0.35270 0.11101 -3.177 0.00161 **
#> Value_iti_uo.l2 -0.38204 0.05090 -7.506 4.77e-13 ***
#> const 1102.69133 546.87635 2.016 0.04450 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 835.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.9974, Adjusted R-squared: 0.9973
#> F-statistic: 1.739e+04 on 8 and 363 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no 123339 70167 7224 46686
#> Value_crp_vs 70167 73658 1860 -14500
#> Value_iti_ti 7224 1860 132104 40401
#> Value_iti_uo 46686 -14500 40401 698140
#>
#> Correlation matrix of residuals:
#> Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no 1.00000 0.73616 0.05659 0.15910
#> Value_crp_vs 0.73616 1.00000 0.01886 -0.06394
#> Value_iti_ti 0.05659 0.01886 1.00000 0.13303
#> Value_iti_uo 0.15910 -0.06394 0.13303 1.00000
# Comparison between two fits using AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion)
# Lower values indicates better fit.
# Calculate AIC and BIC for VAR(1) model
aic_var1 <- AIC(var1_model)
bic_var1 <- BIC(var1_model)
# Calculate AIC and BIC for VAR(p) model
aic_varp <- AIC(varp_model)
bic_varp <- BIC(varp_model)
# Print AIC and BIC values
cat("AIC for VAR(1):", aic_var1, "\n")
#> AIC for VAR(1): 22116.74
cat("BIC for VAR(1):", bic_var1, "\n")
#> BIC for VAR(1): 22195.17
cat("AIC for VAR(p):", aic_varp, "\n")
#> AIC for VAR(p): 21844.84
cat("BIC for VAR(p):", bic_varp, "\n")
#> BIC for VAR(p): 21985.92
## We can observe that both AIC and BIC are lower for the VAR(p) model compared to the VAR(1) model. This suggests that the VAR(p) model provides a better balance between goodness of fit and model complexity. Lower AIC and BIC values indicate a better fit while taking into account the complexity of the model.
## Forecasting
# Produce a one-month-ahead forecast using VAR(1) model
one_month_forecast_var1 <- predict(var1_model, n.ahead = 1)
# Extract the forecasted values
forecasted_values_var1 <- one_month_forecast_var1$fcst[1]$Value_crp_no[1]
# Print the forecasted value for the next month
cat("Forecasted value for the next month using VAR 1 model:", forecasted_values_var1, "\n")
#> Forecasted value for the next month using VAR 1 model: 2241.609
# Produce a one-month-ahead forecast using VAR(p) model
one_month_forecast_varp <- predict(varp_model, n.ahead = 1)
# Extract the forecasted values
forecasted_values_varp <- one_month_forecast_varp$fcst[1]$Value_crp_no[1]
# Print the forecasted value for the next month
cat("Forecasted value for the next month using VAR p model:", forecasted_values_varp, "\n")
#> Forecasted value for the next month using VAR p model: 2190.083
# Granger Causality
causality_test_var1 <- causality(var1_model)
#> Warning in causality(var1_model):
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Value_crp_no) as cause variable.
print(causality_test_var1)
#> $Granger
#>
#> Granger causality H0: Value_crp_no do not Granger-cause Value_crp_vs
#> Value_iti_ti Value_iti_uo
#>
#> data: VAR object var1_model
#> F-Test = 7.5892, df1 = 3, df2 = 1472, p-value = 4.89e-05
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: Value_crp_no and Value_crp_vs
#> Value_iti_ti Value_iti_uo
#>
#> data: VAR object var1_model
#> Chi-squared = 138.41, df = 3, p-value < 2.2e-16
causality_test_varp <- causality(varp_model)
#> Warning in causality(varp_model):
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Value_crp_no) as cause variable.
print(causality_test_varp)
#> $Granger
#>
#> Granger causality H0: Value_crp_no do not Granger-cause Value_crp_vs
#> Value_iti_ti Value_iti_uo
#>
#> data: VAR object varp_model
#> F-Test = 6.9702, df1 = 6, df2 = 1452, p-value = 2.542e-07
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: Value_crp_no and Value_crp_vs
#> Value_iti_ti Value_iti_uo
#>
#> data: VAR object varp_model
#> Chi-squared = 137.28, df = 3, p-value < 2.2e-16
# Load the BigVAR package
library(BigVAR)
bigvar_matrix <- cbind(crp_no$Value,
crp_vs$Value,
iti_ti$Value,
iti_uo$Value)
# Fit a sparse VAR model using Lasso regularization
sparse_var_fit <- BigVAR.fit(Y=bigvar_matrix,struct='Basic',p=2,lambda=1)[,,1]
# Describe the sparsity structure
print(sparse_var_fit)
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 417.59572 0.24196716 0.249637724 0.003185001 0.007121556 0.23873930
#> [2,] 222.58455 0.24856635 0.252579633 0.001598170 0.011755905 0.24551514
#> [3,] -34.94354 -0.03225582 0.054272649 0.792747721 0.133064078 -0.01802307
#> [4,] 908.21645 0.02025621 0.007536235 0.154881877 1.234473633 -0.03508725
#> [,7] [,8] [,9]
#> [1,] 0.24796739 -0.0003493715 -0.01131492
#> [2,] 0.25061997 -0.0027416290 -0.01333212
#> [3,] -0.00726279 0.2051545935 -0.13180078
#> [4,] 0.05198889 -0.1821780945 -0.23351711
#sparsity_structure <- sparse_var_fit$selected
#print(sparsity_structure)
# Summarize the results
summary(sparse_var_fit)
#> V1 V2 V3 V4
#> Min. :-34.94 Min. :-0.032256 Min. :0.007536 Min. :0.001598
#> 1st Qu.:158.20 1st Qu.: 0.007128 1st Qu.:0.042589 1st Qu.:0.002788
#> Median :320.09 Median : 0.131112 Median :0.151955 Median :0.079033
#> Mean :378.36 Mean : 0.119633 Mean :0.141007 Mean :0.238103
#> 3rd Qu.:540.25 3rd Qu.: 0.243617 3rd Qu.:0.250373 3rd Qu.:0.314348
#> Max. :908.22 Max. : 0.248566 Max. :0.252580 Max. :0.792748
#> V5 V6 V7 V8
#> Min. :0.007122 Min. :-0.03509 Min. :-0.007263 Min. :-0.182178
#> 1st Qu.:0.010597 1st Qu.:-0.02229 1st Qu.: 0.037176 1st Qu.:-0.047601
#> Median :0.072410 Median : 0.11036 Median : 0.149978 Median :-0.001545
#> Mean :0.346604 Mean : 0.10779 Mean : 0.135828 Mean : 0.004971
#> 3rd Qu.:0.408417 3rd Qu.: 0.24043 3rd Qu.: 0.248631 3rd Qu.: 0.051027
#> Max. :1.234474 Max. : 0.24552 Max. : 0.250620 Max. : 0.205155
#> V9
#> Min. :-0.23352
#> 1st Qu.:-0.15723
#> Median :-0.07257
#> Mean :-0.09749
#> 3rd Qu.:-0.01283
#> Max. :-0.01131
Forecasted value for the next month using VAR 1 model: 2241.609 Forecasted value for the next month using VAR p model: 2190.083
Created on 2024-04-10 with reprex v2.1.0
Forecasted value for next week - 218288.1 Created on 2024-02-22 with reprex v2.1.0