library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)

# Reading data
#claims_data <- read.csv("ICNSA.csv")
#head(claims_data)
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1')
claims_data <- fredr(series_id = 'ICNSA')

# Converting DATE column type from string to date
claims_data$date <- as.Date(claims_data$date)
head(claims_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-22     2024-02-22  
#> 2 1967-01-14 ICNSA     334000 2024-02-22     2024-02-22  
#> 3 1967-01-21 ICNSA     277000 2024-02-22     2024-02-22  
#> 4 1967-01-28 ICNSA     252000 2024-02-22     2024-02-22  
#> 5 1967-02-04 ICNSA     274000 2024-02-22     2024-02-22  
#> 6 1967-02-11 ICNSA     276000 2024-02-22     2024-02-22

# Basic summary statistics
summary(claims_data$value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  133000  265800  326919  365013  406725 6161268

# Plotting Time-Series data
ggplot(claims_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of Claims Data",
       x = "Date",
       y = "ICNSA")


# Autocorrelation Plot
acf(claims_data$value)


# Partial Autocorrelation Plot
pacf(claims_data$value)


# Forecasting claims value for the next week
# We will be fitting an ARIMA model for out time series data

# Creating a time series object, since this is weekly data so choosing frequency as 7
ts_data <- ts(claims_data$value, frequency = 7)

# Fitting an ARIMA model
arima_model <- arima(ts_data, order = c(2,1,1))

forecast_week <- forecast(arima_model, h = 1)
print(forecast_week)
#>          Point Forecast    Lo 80    Hi 80    Lo 95  Hi 95
#> 426.7143       218288.1 106262.1 330314.1 46959.08 389617

# Forecasting for a targeted date
target_date <- as.Date("2024-02-15")
forecast_target <- forecast(arima_model, h = 1, newdata = data.frame(date = target_date))
#> Warning in forecast.Arima(arima_model, h = 1, newdata = data.frame(date =
#> target_date)): The non-existent newdata arguments will be ignored.
print(forecast_target)
#>          Point Forecast    Lo 80    Hi 80    Lo 95  Hi 95
#> 426.7143       218288.1 106262.1 330314.1 46959.08 389617

Forecasted value for next week - 218288.1 ^{Created on 2024-02-22 with reprex v2.1.0}

Homework 1 - regARIMA on ICNSA data and IURNSA as a covariate

library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)

# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")

# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]

# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2980

head(icnsa_data)
#> # A tibble: 6 × 2
#>   date        value
#>   <date>      <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000

# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#>       date                value        
#>  Min.   :1967-01-07   Min.   : 133000  
#>  1st Qu.:1981-04-16   1st Qu.: 265800  
#>  Median :1995-07-25   Median : 326919  
#>  Mean   :1995-07-25   Mean   : 365013  
#>  3rd Qu.:2009-11-01   3rd Qu.: 406725  
#>  Max.   :2024-02-10   Max.   :6161268
cat("\n\n")

########### Another series ##############

# Reading related data to ICNSA to be used as a covariate - Insured Unemployment Rate 
iurnsa_data <- fredr(series_id = "IURNSA")

# Taking date and value column from the dataset
iurnsa_data <- iurnsa_data[c("date", "value")]

# Converting `date` column from string to date
iurnsa_data$date <- as.Date(iurnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the IURNSA dataset:", nrow(iurnsa_data), "\n")
#> Total number of rows in the IURNSA dataset: 2771

head(iurnsa_data)
#> # A tibble: 6 × 2
#>   date       value
#>   <date>     <dbl>
#> 1 1971-01-02   5.2
#> 2 1971-01-09   5.3
#> 3 1971-01-16   5.3
#> 4 1971-01-23   5.2
#> 5 1971-01-30   5.2
#> 6 1971-02-06   5.2

# Basic summary statistics
cat("Summary of value column in IURNSA dataset:", "\n")
#> Summary of value column in IURNSA dataset:
summary(iurnsa_data)
#>       date                value       
#>  Min.   :1971-01-02   Min.   : 0.800  
#>  1st Qu.:1984-04-10   1st Qu.: 1.900  
#>  Median :1997-07-19   Median : 2.500  
#>  Mean   :1997-07-19   Mean   : 2.759  
#>  3rd Qu.:2010-10-26   3rd Qu.: 3.300  
#>  Max.   :2024-02-03   Max.   :15.800

# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


# Plotting IURNSA Time-Series data
ggplot(iurnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of IURNSA Claims Data",
       x = "Date (Year)",
       y = "value")


# Calculating the correlation coefficient between ICNSA and IURNSA 
# To find the correlation, we need to match the dimesions of both the series i.e., the number of rows should be same
# Since, both the series are weekly and IURSA is from 1971-01-02 to 2024-02-03, we need to filter ICNSA from the same date
filtered_icnsa_data <- subset(icnsa_data, date >= "1971-01-02" & date <= "2024-02-03")

# Also, removing the data for COVID period
filtered_icnsa_data <- subset(filtered_icnsa_data, !(year(date) == 2020 | (year(date) == 2021 & month(date) < 7)))
filtered_iurnsa_data <- subset(iurnsa_data, !(year(date) == 2020 | (year(date) == 2021 & month(date) < 7)))

cat("Number of rows in ICNSA after filteration: ", nrow(filtered_icnsa_data), "\n")
#> Number of rows in ICNSA after filteration:  2693
cat("Number of rows in IURNSA after filteration: ", nrow(filtered_iurnsa_data), "\n")
#> Number of rows in IURNSA after filteration:  2693

icnsa_iursa_corr <- cor(filtered_icnsa_data$value, filtered_iurnsa_data$value)
cat("Correlation between ICNSA and IURNSA data: ", icnsa_iursa_corr, "\n")
#> Correlation between ICNSA and IURNSA data:  0.6490541

# Performing time series decomposition for ICNSA data
filtered_icnsa_ts <- ts(filtered_icnsa_data$value, frequency=52)
filtered_icnsa_decomp <- decompose(filtered_icnsa_ts, "multiplicative")
plot(filtered_icnsa_decomp)


# Performing time series decomposition for IURNSA data
fiiltered_iurnsa_ts <- ts(filtered_iurnsa_data$value, frequency=52)
filtered_iurnsa_decomp <- decompose(fiiltered_iurnsa_ts, "multiplicative")
plot(filtered_iurnsa_decomp)


# Plotting ACF and PACF for filtered ICNSA and IURNSA data
acf(filtered_icnsa_data$value)

pacf(filtered_icnsa_data$value)


acf(filtered_iurnsa_data$value)

pacf(filtered_iurnsa_data$value)


# Cross-correlation analysis
cross_correlation <- ccf(filtered_icnsa_data$value, filtered_iurnsa_data$value)

print("Cross-correlation between filtered ICNSA and IURNSA data:")
#> [1] "Cross-correlation between filtered ICNSA and IURNSA data:"
print(cross_correlation)
#> 
#> Autocorrelations of series 'X', by lag
#> 
#>   -31   -30   -29   -28   -27   -26   -25   -24   -23   -22   -21   -20   -19 
#> 0.397 0.406 0.417 0.424 0.432 0.435 0.433 0.422 0.420 0.424 0.427 0.434 0.441 
#>   -18   -17   -16   -15   -14   -13   -12   -11   -10    -9    -8    -7    -6 
#> 0.456 0.472 0.492 0.515 0.539 0.564 0.586 0.606 0.630 0.651 0.669 0.683 0.690 
#>    -5    -4    -3    -2    -1     0     1     2     3     4     5     6     7 
#> 0.698 0.699 0.693 0.681 0.674 0.649 0.627 0.573 0.530 0.501 0.470 0.440 0.405 
#>     8     9    10    11    12    13    14    15    16    17    18    19    20 
#> 0.381 0.362 0.346 0.332 0.326 0.321 0.317 0.316 0.318 0.322 0.327 0.332 0.337 
#>    21    22    23    24    25    26    27    28    29    30    31 
#> 0.343 0.344 0.346 0.345 0.344 0.339 0.330 0.310 0.297 0.291 0.285

# Fit an ARIMA model to ICNSA data
arima_model <- auto.arima(filtered_icnsa_data$value)
print(arima_model)
#> Series: filtered_icnsa_data$value 
#> ARIMA(2,1,2) 
#> 
#> Coefficients:
#>          ar1      ar2      ma1     ma2
#>       0.9676  -0.0723  -1.2112  0.2228
#> s.e.  0.1181   0.0996   0.1159  0.1135
#> 
#> sigma^2 = 2.547e+09:  log likelihood = -32970.14
#> AIC=65950.27   AICc=65950.29   BIC=65979.76

# Fit a regARIMA model using ICNSA and IURNSA data
reg_arima_model <- auto.arima(filtered_icnsa_data$value, xreg = filtered_iurnsa_data$value)
print(summary(reg_arima_model))
#> Series: filtered_icnsa_data$value 
#> Regression with ARIMA(2,1,1) errors 
#> 
#> Coefficients:
#>          ar1     ar2      ma1      xreg
#>       0.5630  0.2223  -0.9883  71614.85
#> s.e.  0.0222  0.0207   0.0028   5478.41
#> 
#> sigma^2 = 2.452e+09:  log likelihood = -32919.15
#> AIC=65848.3   AICc=65848.32   BIC=65877.79
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE        MPE     MAPE     MASE
#> Training set 840.8758 49468.03 33157.92 -0.9803505 8.795835 0.957651
#>                      ACF1
#> Training set -0.007676926

tsdisplay(residuals(reg_arima_model), lag.max=30)


# Perform Ljung-Box test for residuals autocorrelation
ljung_box_test <- Box.test(reg_arima_model$residuals, lag = 12, type = "Ljung-Box")
print(ljung_box_test)
#> 
#>  Box-Ljung test
#> 
#> data:  reg_arima_model$residuals
#> X-squared = 188.71, df = 12, p-value < 2.2e-16

# Forecasting for the next week
forecast_icnsa <- forecast(reg_arima_model, xreg = tail(filtered_iurnsa_data$value, 1))
forecast_value <- forecast_icnsa$mean[1]

cat("\nForecasted value for the next week:", forecast_value)
#> 
#> Forecasted value for the next week: 242908.1

Data Analysis:

Both datasets are stationary - except for the pandemic period.
Visualized ACF and PACF plots for both the series. The ACF plot for ICNSA shows that the correlation between the observation decreases as the lag value increases significantly as compared to the IURNSA series.

Model Diagnostics

ARIMA(2,1,1): This represents the ARIMA component of the model. It means that the model includes an autoregressive term of order 2 (AR(2)), a differencing term of order 1 (I(1)), and a moving average term of order 1 (MA(1)).
The negative value of the MA term indicates an inverse relationship between the previous errors and the current value of the dependent variable.
The MAPE value is 8.79% which means that the forecasted values are off by 8.8% from the actual value which is not bad.
The log-likelihood is -32919.15, which is a high value, indicating a good fit of the model to the data.
The P-value is significantly low for this regARIMA model which indicates that the coefficient for the MA term (ma1) is statistically significant. This means that the previous errors have a statistically significant impact on the current value of the dependent variable.

Forecasted value for the next week: 242908.1

^{Created on 2024-02-22 with reprex v2.1.0}

Homework 2: ICNSA data forecast using smoothing technique - Holt-Winters model

library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)

# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")

# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]

# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2981

head(icnsa_data)
#> # A tibble: 6 × 2
#>   date        value
#>   <date>      <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000

# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#>       date                value        
#>  Min.   :1967-01-07   Min.   : 133000  
#>  1st Qu.:1981-04-18   1st Qu.: 265794  
#>  Median :1995-07-29   Median : 326900  
#>  Mean   :1995-07-29   Mean   : 364957  
#>  3rd Qu.:2009-11-07   3rd Qu.: 406633  
#>  Max.   :2024-02-17   Max.   :6161268
cat("\n\n")


# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


# Lets visually see the ICNSA trend during pandemic
# According to the web sources, the pandemic was declared in March 2020 till June/July 2021
ggplot(icnsa_data, aes(x=date, y=value)) +
  geom_line() +
  labs(title="Initial Claims (ICNSA) over Time", x="Date", y="Initial Claims (ICNSA)") +
  theme_minimal() +
  geom_rect(aes(xmin=as.Date("2020-03-01"), xmax=as.Date("2021-07-31"), ymin=-Inf, ymax=Inf), 
            fill="white", alpha=0.004) +
  geom_vline(xintercept=as.Date("2020-03-01"), color="orange", linetype="dashed") +
  geom_vline(xintercept=as.Date("2021-07-31"), color="orange", linetype="dashed")


# Calculating the average number of claims before and during the pandemic
pre_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date < as.Date("2020-01-01")) ])
subset_data <- subset(icnsa_data, date >= as.Date("2020-03-01") & date <= as.Date("2021-07-31"))
pandemic_avg <- mean(subset_data$value)
post_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date > as.Date("2021-07-31")) ])
cat("Average number of claims before pandemic:", pre_pandemic_avg, "\n")
#> Average number of claims before pandemic: 350231.6
cat("Average number of claims during pandemic:", pandemic_avg, "\n")
#> Average number of claims during pandemic: 1172247
cat("Average number of claims after pandemic:", post_pandemic_avg, "\n")
#> Average number of claims after pandemic: 229688.1

# Create a dataframe for the bar chart
bar_data <- data.frame(period = c("Pre-pandemic", "Pandemic", "Post-pandemic"),
                       avg_value = c(pre_pandemic_avg, pandemic_avg, post_pandemic_avg))

# Create a bar chart
ggplot(bar_data, aes(x=period, y=avg_value, fill=period)) +
  geom_bar(stat="identity") +
  labs(title="Average Initial Claims (ICNSA) by Period", x="Period", y="Average Initial Claims (ICNSA)") +
  theme_minimal()


pre_pandemic_sd <- sd(icnsa_data$value[which(icnsa_data$date < as.Date("2020-03-01"))])

# Calculate the Z-score of the pandemic
z_score <- (pandemic_avg - pre_pandemic_avg) / pre_pandemic_sd

# Output the Z-score
cat("Z-score of the pandemic: ", z_score, "\n")
#> Z-score of the pandemic:  7.03949

# A positive Z-score indicates that the pandemic average is higher than the pre-pandemic average

# Removing the data for COVID period
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-07-31")

icnsa_data_without_covid <- subset(icnsa_data, !(date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA without Covid period: ", nrow(icnsa_data_without_covid), "\n")
#> Number of rows in ICNSA without Covid period:  2907

# Plotting ACF and PACF for filtered ICNSA data
acf(icnsa_data_without_covid$value)

pacf(icnsa_data_without_covid$value)


# Getting ICNSA data during covid period
icnsa_data_during_covid <- subset(icnsa_data, (date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA during Covid period: ", nrow(icnsa_data_during_covid), "\n")
#> Number of rows in ICNSA during Covid period:  74

# Getting data for covid period and using cubic spline to impute new values
# To choose value of lambda - I would be using smoothing parameter is called `spar`

# Choose a range of spar values
spar_values <- seq(0.1, 0.6, by = 0.2)

best_spar <- NULL
best_mse <- Inf

par(mfcol=c(length(spar_values), 1), mar=c(2, 2, 1, 1))

for(spar in spar_values) {
  # Fit smooth spline for each spar value
  spline_fit <- smooth.spline(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, spar=spar)

  predicted_values <- predict(spline_fit, as.numeric(icnsa_data$date))

  # Calculate mean squared error
  mse <- mean((predicted_values$y - icnsa_data$value)^2)

  # If the current spar value has a lower MSE than the previous best spar value, update the best spar value and its corresponding MSE
  if(mse < best_mse) {
    best_spar <- spar
    best_mse <- mse
  }

  plot(icnsa_data$date, icnsa_data$value, type='l', main=paste("Spar =", spar),
       xlab="Date", ylab="ICNSA Values", col="purple")
  lines(icnsa_data$date, predicted_values$y, col="black")
  legend("topleft", legend=c("Original", "Smoothed"), col=c("purple", "black"), lty=1)

}


cat("Best value for lambda:", best_spar, "\n")
#> Best value for lambda: 0.1
cat("Best value for MSE:", best_mse, "\n")
#> Best value for MSE: 57230081019

par(mfcol=c(1, 1))

# Impute missing values using cubic spline
spline_fit <- smooth.spline(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, spar = 0.1)

#plotting the spline against our original data
plot(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, type = "p", col = "purple", main = "Original ICNSA vs. Smoothed ICNSA", xlab = "Date", ylab = "Value", xlim = range(as.numeric(icnsa_data_without_covid$date)), ylim = range(icnsa_data_without_covid$value))
lines(spline_fit, col = "black")
legend("topright", legend = c("Original ICNSA series", "Smoothed spline series "), col = c("purple", "black"), lty = 1)


imputed_values <- predict(spline_fit, as.numeric(icnsa_data_during_covid$date))$y
cat("Head for imputed values:", head(imputed_values), "\n")
#> Head for imputed values: 269471.7 271585.5 273635.9 275623.4 277548.6 279412.1

# Combine the imputed values with the original dataframe
new_covid_data <- data.frame(date = icnsa_data_during_covid$date, value = imputed_values)
combined_ICNSA <- rbind(icnsa_data_without_covid, new_covid_data)
cat("Total number of rows in the updated ICNSA dataset:", nrow(combined_ICNSA), "\n")
#> Total number of rows in the updated ICNSA dataset: 2981

# Converting it into time-series object
combined_ICNSA_ts <- ts(combined_ICNSA$value, frequency = 52)

# Fit a multiplicative Holt-Winters model
multiplicative_hw_model <- HoltWinters(combined_ICNSA_ts, seasonal="multiplicative")
multiplicative_hw_forecast <- forecast(multiplicative_hw_model, h=1)

# Fit an additive Holt-Winters model
additive_hw_model <- HoltWinters(combined_ICNSA_ts, seasonal="additive")
additive_hw_forecast <- forecast(additive_hw_model, h=1)

cat("Multiplicative Forecast:", multiplicative_hw_forecast$mean, "\n")
#> Multiplicative Forecast: 247172
cat("Additive Forecast:", additive_hw_forecast$mean, "\n")
#> Additive Forecast: 271732.7

Forecast value for the next week using multiplicative Holt-Winters model: 247172

^{Created on 2024-02-28 with reprex v2.1.0}

Steps Performed

Loaded the most current Initial claims (ICNSA) data, not seasonally adjusted.
Empirical Data Analysis for the Covid period:
- First the time-series plot for an entire ICNSA data to see the trend and check which year it is changing.
- Chose a time window for the pandemic: 1st March 2020 - 31st July 2021.
- To verify I calculated the average number of claims before the pandemic, during the above-chosen dates, and after the pandemic.
  - The average number of claims before the pandemic: 350231.6 -> 350k (approx)
  - The average number of claims during the pandemic: 1172247 -> 1.17M (approx)
  - The average number of claims after the pandemic: 229688.1 -> 229.6k (approx)
  From the above stats we can notice that during the chosen time period for Covid, the average number of claims is more than 3 times.
- To get more clarity I calculated the Z score of the above-chosen dates: Z-score of the pandemic: 7.03949 A positive Z-score indicates that the pandemic average is higher than the pre-pandemic average
Used a cubic spline methodology to impute new values for the Covid period: After careful observation, the best values of Lambda came out to be 0.1 spar_values <- seq(0.1, 0.6, by = 0.2) The value of lambda was tested from 0.1 to 0.6 with a step size of 0.2 As we can see in the image, lambda=0.1 matches the best with the overall trend of the ICNSA series and the MSE value is the least for lambda = 0.1.
Used both a multiplicative and an additive Holt-Winters model to forecast the next ICNSA value using the newly imputed Covid values.

The forecasted value using the multiplicative Holt-Winters model: 247172 The forecasted value using the additive Holt-Winters model: 271732.7

Homework3 - Structural Time Series model using State Space methodology

library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)
library(KFAS)
#> Please cite KFAS in publications by using: 
#> 
#>   Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
library(zoo)
library(dlm)
#> 
#> Attaching package: 'dlm'
#> The following object is masked from 'package:ggplot2':
#> 
#>     %+%
library(dplyr)
library(zoo)
library(imputeTS)
#> 
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:zoo':
#> 
#>     na.locf

# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")

# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]

# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2982

head(icnsa_data)
#> # A tibble: 6 × 2
#>   date        value
#>   <date>      <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000

# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#>       date                value        
#>  Min.   :1967-01-07   Min.   : 133000  
#>  1st Qu.:1981-04-19   1st Qu.: 265755  
#>  Median :1995-08-01   Median : 326806  
#>  Mean   :1995-08-01   Mean   : 364900  
#>  3rd Qu.:2009-11-12   3rd Qu.: 406549  
#>  Max.   :2024-02-24   Max.   :6161268
cat("\n\n")


# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


from_date <- as.Date("2020-01-01")
to_date <- as.Date("2021-12-31")

filtered_df <- icnsa_data %>%
  filter(date >= from_date & date <= to_date)

ggplot(filtered_df, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Data from Date Range",
       x = "Date",
       y = "Value")

# From the above graph we can see that the covid start date is somewhere around 1st March 2020 till 30th June 2021

# Lets visually see the ICNSA trend during pandemic
# According to the web sources, the pandemic was declared in March 2020 till June/July 2021
ggplot(icnsa_data, aes(x=date, y=value)) +
  geom_line() +
  labs(title="Initial Claims (ICNSA) over Time", x="Date", y="Initial Claims (ICNSA)") +
  theme_minimal() +
  geom_rect(aes(xmin=as.Date("2020-03-01"), xmax=as.Date("2021-06-30"), ymin=-Inf, ymax=Inf), 
            fill="white", alpha=0.004) +
  geom_vline(xintercept=as.Date("2020-03-01"), color="orange", linetype="dashed") +
  geom_vline(xintercept=as.Date("2021-07-31"), color="orange", linetype="dashed")


# Calculating the average number of claims before and during the pandemic
pre_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date < as.Date("2020-03-01")) ])
subset_data <- subset(icnsa_data, date >= as.Date("2020-03-01") & date <= as.Date("2021-06-30"))
pandemic_avg <- mean(subset_data$value)
post_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date > as.Date("2021-06-30")) ])
cat("Average number of claims before pandemic:", pre_pandemic_avg, "\n")
#> Average number of claims before pandemic: 349907.1
cat("Average number of claims during pandemic:", pandemic_avg, "\n")
#> Average number of claims during pandemic: 1230384
cat("Average number of claims after pandemic:", post_pandemic_avg, "\n")
#> Average number of claims after pandemic: 234487.1

# Create a dataframe for the bar chart
bar_data <- data.frame(period = c("Pre-pandemic avg", "Pandemic avg", "Post-pandemic avg"),
                       avg_value = c(pre_pandemic_avg, pandemic_avg, post_pandemic_avg))

# Create a bar chart
ggplot(bar_data, aes(x=period, y=avg_value, fill=period)) +
  geom_bar(stat="identity") +
  labs(title="Average Initial Claims (ICNSA) by Period", x="Period", y="Average Initial Claims (ICNSA)") +
  theme_minimal()


# Removing the data for COVID period
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-06-30")

non_covid_data <- subset(icnsa_data, !(date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA including Covid period:", nrow(icnsa_data), "\n")
#> Number of rows in ICNSA including Covid period: 2982
cat("Number of rows in ICNSA without Covid period:", nrow(non_covid_data), "\n")
#> Number of rows in ICNSA without Covid period: 2913

# Plotting ACF and PACF for filtered ICNSA data
acf(non_covid_data$value)

pacf(non_covid_data$value)


# Getting ICNSA data during covid period
covid_data_rows = nrow(subset(icnsa_data, (date >= covid_start_date & date <= covid_end_date)))
cat("Number of rows in ICNSA during Covid period: ", covid_data_rows, "\n")
#> Number of rows in ICNSA during Covid period:  69

# Setting covid period data to NA
modified_icnsa <- icnsa_data

modified_icnsa$value[modified_icnsa$date >= as.Date("2020-03-01") & modified_icnsa$date <= as.Date("2021-06-30")] <- NA
nrow(modified_icnsa)
#> [1] 2982

# Getting the index of the covid period
covid_period_index <- which(modified_icnsa$date >= as.Date("2020-03-01") & modified_icnsa$date <= as.Date("2021-06-30"))

# Imputing missing values using na_kalman() within the COVID period
imputed_values <- na_kalman(modified_icnsa$value, smooth = TRUE)

# Updating the ICNSA dataframe with the imputed values within the COVID period
modified_icnsa$value[covid_period_index] <- imputed_values[covid_period_index]

# Plot the original and imputed data
plot(modified_icnsa$date, icnsa_data$value, type = "l", col = "blue", lwd = 2,
     xlab = "Date", ylab = "Initial Claims (ICNSA)", main = "Comparison of Original and Imputed Values")
lines(modified_icnsa$date, imputed_values, col = "red", lwd = 2)
legend("topright", legend = c("Original", "Imputed"), col = c("blue", "red"), lwd = 2, bty = "n")


plot(modified_icnsa$date[covid_period_index], icnsa_data$value[covid_period_index], type = "l", col = "blue", lwd = 2,
     xlab = "Date", ylab = "Initial Claims (ICNSA)", main = "Comparison of Original and Imputed Values")
lines(modified_icnsa$date[covid_period_index], imputed_values[covid_period_index], col = "red", lwd = 2)
legend("topright", legend = c("Original", "Imputed"), col = c("blue", "red"), lwd = 2, bty = "n")


# Fit a structural time series model with trend, seasonal, and error components
state_space_model <- StructTS(ts(modified_icnsa$value, frequency = 52), type = "BSM")
summary(state_space_model)
#>           Length Class  Mode     
#> coef         4   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    8946   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric

# Taking another series as covariate series
# Reading CCNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
ccnsa_data <- fredr(series_id = "CCNSA")

# Taking date and value column from the dataset
ccnsa_data <- ccnsa_data[c("date", "value")]

# Converting `date` column from string to date
ccnsa_data$date <- as.Date(ccnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the CCNSA dataset:", nrow(ccnsa_data), "\n")
#> Total number of rows in the CCNSA dataset: 2981

head(ccnsa_data)
#> # A tibble: 6 × 2
#>   date         value
#>   <date>       <dbl>
#> 1 1967-01-07 1594000
#> 2 1967-01-14 1563000
#> 3 1967-01-21 1551000
#> 4 1967-01-28 1533000
#> 5 1967-02-04 1534000
#> 6 1967-02-11 1557000

# Basic summary statistics
cat("Summary of value column in CCNSA dataset:", "\n")
#> Summary of value column in CCNSA dataset:
summary(ccnsa_data)
#>       date                value         
#>  Min.   :1967-01-07   Min.   :  785000  
#>  1st Qu.:1981-04-18   1st Qu.: 1997000  
#>  Median :1995-07-29   Median : 2493008  
#>  Mean   :1995-07-29   Mean   : 2761405  
#>  3rd Qu.:2009-11-07   3rd Qu.: 3145405  
#>  Max.   :2024-02-17   Max.   :23037921
cat("\n\n")

# Making ICNSA data compatible with CCNSA data
modified_icnsa <- subset(modified_icnsa, date >= as.Date("1967-01-07") & date <= as.Date("2024-02-17"))

# Merge ICNSA and CCNSA dataframes based on their common date column
merged_data <- merge(modified_icnsa, ccnsa_data, by = "date", all = FALSE)
head(merged_data)
#>         date value.x value.y
#> 1 1967-01-07  346000 1594000
#> 2 1967-01-14  334000 1563000
#> 3 1967-01-21  277000 1551000
#> 4 1967-01-28  252000 1533000
#> 5 1967-02-04  274000 1534000
#> 6 1967-02-11  276000 1557000

# 'merged_data' is your data frame containing 'value.x' and 'value.y'

# Create design matrix Z with covariate
Z <- cbind(1, merged_data$value.y)  # Trend and covariate

# Check dimensions of Z
dim(Z)  # Should be (n, p)
#> [1] 2981    2

# Create state equation matrices
p <- ncol(Z)  # Number of state variables
T <- diag(1, p)  # Transition matrix for state variables
Q <- diag(0.01, p)  # State covariance matrix

# Create observation equation matrices
n <- nrow(merged_data)  # Number of observations
H <- diag(1, n)  # Observation covariance matrix

# Check dimensions of H
dim(H)  # Should be (n, n)
#> [1] 2981 2981

# Create initial state and initial state covariance
initial_state <- rep(0, p)  # Initial state vector
initial_state_cov <- diag(1e5, p)  # Initial state covariance matrix

# Check dimensions of initial_state_cov
dim(initial_state_cov)  # Should be (p, p)
#> [1] 2 2

# Construct state space model
model <- dlm(
  FF = Z,  # Design matrix
  V = H,  # Observation covariance matrix
  GG = T,  # Transition matrix
  W = Q,  # State covariance matrix
  m0 = initial_state,  # Initial state vector
  C0 = initial_state_cov  # Initial state covariance matrix
)

# Fit the model
fit <- dlmFilter(merged_data$value.x, model)

# Print summary of the fitted model
print(summary(fit))
#>     Length Class  Mode   
#> y   2981   -none- numeric
#> mod    6   dlm    list   
#> m      4   -none- numeric
#> U.C    2   -none- list   
#> D.C    4   -none- numeric
#> a      2   -none- numeric
#> U.R    1   -none- list   
#> D.R    2   -none- numeric
#> f   2981   -none- numeric

str(model)
#> List of 6
#>  $ m0: num [1:2] 0 0
#>  $ C0: num [1:2, 1:2] 1e+05 0e+00 0e+00 1e+05
#>  $ FF: num [1:2981, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ V : num [1:2981, 1:2981] 1 0 0 0 0 0 0 0 0 0 ...
#>  $ GG: num [1:2, 1:2] 1 0 0 1
#>  $ W : num [1:2, 1:2] 0.01 0 0 0.01
#>  - attr(*, "class")= chr "dlm"

# Forecasting horizon
forecast_horizon <- 1  # Forecast for the next week

# Forecast using the fitted model
forecast <- tryCatch(
  {
    dlmForecast(fit, nAhead = forecast_horizon)
  },
  error = function(e) {
    message("Error occurred during forecasting: ", conditionMessage(e))
    return(NULL)
  }
)

forecasted_value <- forecast$f[1, 1]
print(paste("Forecasted value for the next week:", forecasted_value))
#> [1] "Forecasted value for the next week: 312076.351407013"

Steps Performed

Loaded the most current Initial claims (ICNSA) data, not seasonally adjusted.
Empirical Data Analysis for the Covid period:
- First the time-series plot for an entire ICNSA data to see the trend and check which year it is changing.
- Chose a time window for the pandemic: 1st March 2020 - 30th June 2021.
- To verify I calculated the average number of claims before the pandemic, during the above-chosen dates, and after the pandemic.
  - The average number of claims before the pandemic: 349907.1 -> 350k (approx)
  - The average number of claims during the pandemic: 1230384 -> 1.23M (approx)
  - The average number of claims after the pandemic: 234487.1 -> 234.4k (approx) From the above stats we can notice that during the chosen time period for Covid, the average number of claims is more than 3 times.
Used Kalman filtering technique to impute the Pendamic values.
Then I fitted a structural time series model with trend, seasonal, and error components using the StructTS() function.
- Here I used frequency as 52 (its weekly data)
- Type as “BSM", a basic structural model that includes trend, seasonal, and error components.
Then I loaded another series i.e., CCNSA (Continued Claims (Insured Unemployment)) - This is similar to the ICNSA series as it is also based on the unemployment claims.
Then I made both series compatible to merge them.
Then I defined and fitted a structural time series model with trend, seasonal, and error using state space methodology and including covariate series.
Forecasted value - 312076.35

^{Created on 2024-03-07 with reprex v2.1.0}

Homework 4 - VAR model on US Census Bureaus M3 survey data

library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(fpp3)
#> ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
#> ✔ tibble      3.2.1     ✔ tsibble     1.1.4
#> ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
#> ✔ tidyr       1.3.1     ✔ feasts      0.3.1
#> ✔ lubridate   1.9.3     ✔ fable       0.3.3
#> ✔ ggplot2     3.4.4     ✔ fabletools  0.4.0
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date()    masks base::date()
#> ✖ dplyr::filter()      masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval()  masks lubridate::interval()
#> ✖ dplyr::lag()         masks stats::lag()
#> ✖ tsibble::setdiff()   masks base::setdiff()
#> ✖ tsibble::union()     masks base::union()
library(tidyverse)
library(tsbox)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following object is masked from 'package:tsibble':
#> 
#>     index
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(seasonal)
#> 
#> Attaching package: 'seasonal'
#> The following object is masked from 'package:tibble':
#> 
#>     view
library(astsa)
#> 
#> Attaching package: 'astsa'
#> The following objects are masked from 'package:seasonal':
#> 
#>     trend, unemp
#> The following object is masked from 'package:forecast':
#> 
#>     gas
library(patchwork)
library(tseries)
library(fredr)
library(quarto)
library(xts)
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(dplyr)
library(ggplot2)
library(vars)
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:patchwork':
#> 
#>     area
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> Loading required package: strucchange
#> Loading required package: sandwich
#> 
#> Attaching package: 'strucchange'
#> The following object is masked from 'package:stringr':
#> 
#>     boundary
#> Loading required package: urca
#> Loading required package: lmtest
#> 
#> Attaching package: 'vars'
#> The following object is masked from 'package:fable':
#> 
#>     VAR
library(readxl)
library(lubridate)
library(tseries)
library(BigVAR)
#> Loading required package: lattice
library(reprex)

# Reading all time series from excel file
crp_no <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/CRP_NO.xlsx")
crp_vs <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/CRP_VS.xlsx")
iti_ti <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/ITI_TI.xlsx")
iti_uo <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/ITI_UO.xlsx")

# Converting Period column to date type for all 4 time series
crp_no$Period <- as.Date(crp_no$Period)
crp_vs$Period <- as.Date(crp_vs$Period)
iti_ti$Period <- as.Date(iti_ti$Period)
iti_uo$Period <- as.Date(iti_uo$Period)

# Show the dataframe
head(crp_no)
#> # A tibble: 6 × 2
#>   Period     Value 
#>   <date>     <chr> 
#> 1 1993-01-01 5245.0
#> 2 1993-02-01 5313.0
#> 3 1993-03-01 4723.0
#> 4 1993-04-01 5068.0
#> 5 1993-05-01 5290.0
#> 6 1993-06-01 4969.0
head(crp_vs)
#> # A tibble: 6 × 2
#>   Period     Value 
#>   <date>     <chr> 
#> 1 1993-01-01 5441.0
#> 2 1993-02-01 5100.0
#> 3 1993-03-01 5157.0
#> 4 1993-04-01 5016.0
#> 5 1993-05-01 4936.0
#> 6 1993-06-01 5167.0
head(iti_ti)
#> # A tibble: 6 × 2
#>   Period     Value  
#>   <date>     <chr>  
#> 1 1993-01-01 39506.0
#> 2 1993-02-01 39528.0
#> 3 1993-03-01 39371.0
#> 4 1993-04-01 39231.0
#> 5 1993-05-01 39398.0
#> 6 1993-06-01 39309.0
head(iti_uo)
#> # A tibble: 6 × 2
#>   Period     Value  
#>   <date>     <chr>  
#> 1 1993-01-01 80506.0
#> 2 1993-02-01 80035.0
#> 3 1993-03-01 79572.0
#> 4 1993-04-01 80068.0
#> 5 1993-05-01 80174.0
#> 6 1993-06-01 79100.0

# Function to clean data and convert 'Value' column to double
clean_and_convert_data <- function(data) {
  # Remove rows where 'Value' column is "NA"
  data <- data[!is.na(data$Value) & data$Value != "NA", ]

  # Remove commas from 'Value' column
  data$Value <- gsub(",", "", data$Value)

  # Convert 'Value' column to numeric
  data$Value <- as.numeric(data$Value)
  return(data)
}

crp_no <- clean_and_convert_data(crp_no)
crp_vs <- clean_and_convert_data(crp_vs)
iti_ti <- clean_and_convert_data(iti_ti)
iti_uo <- clean_and_convert_data(iti_uo)

# Checking total number of rows in all the time series
cat("- Total number of rows in crp_no dataset:", nrow(crp_no), "\n")
#> - Total number of rows in crp_no dataset: 374
cat("- Total number of rows in crp_vs dataset:", nrow(crp_vs), "\n")
#> - Total number of rows in crp_vs dataset: 374
cat("- Total number of rows in iti_ti dataset:", nrow(iti_ti), "\n")
#> - Total number of rows in iti_ti dataset: 374
cat("- Total number of rows in iti_uo dataset:", nrow(iti_uo), "\n\n")
#> - Total number of rows in iti_uo dataset: 374

# Basic summary statistics for all 4 series
cat("- Summary of crp_no dataset:", "\n")
#> - Summary of crp_no dataset:
summary(crp_no)
#>      Period               Value      
#>  Min.   :1993-01-01   Min.   : 1393  
#>  1st Qu.:2000-10-08   1st Qu.: 2110  
#>  Median :2008-07-16   Median : 4868  
#>  Mean   :2008-07-16   Mean   : 4628  
#>  3rd Qu.:2016-04-23   3rd Qu.: 6202  
#>  Max.   :2024-02-01   Max.   :10535

cat("- Summary of crp_vs dataset:", "\n")
#> - Summary of crp_vs dataset:
summary(crp_vs)
#>      Period               Value      
#>  Min.   :1993-01-01   Min.   : 1384  
#>  1st Qu.:2000-10-08   1st Qu.: 2139  
#>  Median :2008-07-16   Median : 5028  
#>  Mean   :2008-07-16   Mean   : 4642  
#>  3rd Qu.:2016-04-23   3rd Qu.: 6164  
#>  Max.   :2024-02-01   Max.   :10203

cat("- Summary of iti_ti dataset:", "\n")
#> - Summary of iti_ti dataset:
summary(iti_ti)
#>      Period               Value      
#>  Min.   :1993-01-01   Min.   :33132  
#>  1st Qu.:2000-10-08   1st Qu.:37582  
#>  Median :2008-07-16   Median :39319  
#>  Mean   :2008-07-16   Mean   :40669  
#>  3rd Qu.:2016-04-23   3rd Qu.:44426  
#>  Max.   :2024-02-01   Max.   :53721

cat("- Summary of iti_uo dataset:", "\n")
#> - Summary of iti_uo dataset:
summary(iti_uo)
#>      Period               Value       
#>  Min.   :1993-01-01   Min.   : 76417  
#>  1st Qu.:2000-10-08   1st Qu.: 93123  
#>  Median :2008-07-16   Median :111062  
#>  Mean   :2008-07-16   Mean   :108078  
#>  3rd Qu.:2016-04-23   3rd Qu.:119714  
#>  Max.   :2024-02-01   Max.   :134990
cat("\n\n")

# Data Analysis
## Visualizing the datasets - 

# Plotting crp_no time-series data
ggplot(crp_no, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of CRP_NO series",
       x = "Date (Month-Year)",
       y = "Value")


# Plotting crp_vs time-series data
ggplot(crp_vs, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of CRP_VS series",
       x = "Date (Month-Year)",
       y = "Value")


# Plotting iti_ti time-series data
ggplot(iti_ti, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of ITI_TI series",
       x = "Date (Month-Year)",
       y = "Value")


# Plotting iti_uo time-series data
ggplot(iti_uo, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of ITI_UO series",
       x = "Date (Month-Year)",
       y = "Value")


## Correlation coefficient - 
crpNo_crpVs_corr <- cor(crp_no$Value, crp_vs$Value)
cat("Correlation between Crp_no and Crp_vs series: ", crpNo_crpVs_corr, "\n")
#> Correlation between Crp_no and Crp_vs series:  0.9954609

crpNo_itiTi_corr <- cor(crp_no$Value, iti_ti$Value)
cat("Correlation between Crp_no and Iti_ti series: ", crpNo_itiTi_corr, "\n")
#> Correlation between Crp_no and Iti_ti series:  0.5531556

crpNo_itiUo_corr <- cor(crp_no$Value, iti_uo$Value)
cat("Correlation between Crp_no and Iti_uo series: ", crpNo_itiUo_corr, "\n")
#> Correlation between Crp_no and Iti_uo series:  -0.6597469

## Checking for stationarity - 

# Function to check stationarity using ADF and KPSS tests
check_stationarity <- function(data) {
  adf_test <- adf.test(data)
  kpss_test <- kpss.test(data)

  cat("ADF Test:\n")
  print(adf_test)
  #cat("\nKPSS Test:\n")
  #print(kpss_test)
}

# Assuming you have two dataframes: real_estate_data and mortgage_interest_data
# Check stationarity for both series
cat("Checking stationarity for crp_no: ")
#> Checking stationarity for crp_no:
check_stationarity(crp_no$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -2.7454, Lag order = 7, p-value = 0.2624
#> alternative hypothesis: stationary

cat("Checking stationarity for crp_vs: ")
#> Checking stationarity for crp_vs:
check_stationarity(crp_vs$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -2.703, Lag order = 7, p-value = 0.2802
#> alternative hypothesis: stationary

cat("Checking stationarity for iti_ti: ")
#> Checking stationarity for iti_ti:
check_stationarity(iti_ti$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -2.4416, Lag order = 7, p-value = 0.3906
#> alternative hypothesis: stationary

cat("Checking stationarity for iti_uo: ")
#> Checking stationarity for iti_uo:
check_stationarity(iti_uo$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -3.5722, Lag order = 7, p-value = 0.03585
#> alternative hypothesis: stationary

## ACF and PACF of all 4 time series 

# Plotting ACF and PACF for crp_no series
acf(crp_no$Value)

pacf(crp_no$Value)


# Plotting ACF and PACF for crp_vs series
acf(crp_vs$Value)

pacf(crp_vs$Value)


# Plotting ACF and PACF for iti_ti series
acf(iti_ti$Value)

pacf(iti_ti$Value)


# Plotting ACF and PACF for iti_uo series
acf(iti_uo$Value)

pacf(iti_uo$Value)


## Decomposition for crp_no -
crp_no_ts <- ts(crp_no$Value, frequency=12)
crp_no_decomp <- decompose(crp_no_ts, "multiplicative")
plot(crp_no_decomp)


## Decomposition for crp_vs -
crp_vs_ts <- ts(crp_vs$Value, frequency=12)
crp_vs_decomp <- decompose(crp_vs_ts, "multiplicative")
plot(crp_vs_decomp)


## Decomposition for iti_ti -
iti_ti_ts <- ts(iti_ti$Value, frequency=12)
iti_ti_decomp <- decompose(iti_ti_ts, "multiplicative")
plot(iti_ti_decomp)


## Decomposition for iti_uo -
iti_uo_ts <- ts(iti_uo$Value, frequency=12)
iti_uo_decomp <- decompose(iti_uo_ts, "multiplicative")
plot(iti_uo_decomp)


# Modelling

# Merging all 4 time series - 
combined_data <- merge(crp_no, crp_vs, by = "Period", all = TRUE, suffixes = c("_crp_no", "_crp_vs"))
combined_data <- merge(combined_data, iti_ti, by = "Period", all = TRUE, suffixes = c("_crp_no", "_iti_ti"))
combined_data <- merge(combined_data, iti_uo, by = "Period", all = TRUE, suffixes = c("_crp_no", "_iti_uo"))
#> Warning in merge.data.frame(combined_data, iti_uo, by = "Period", all = TRUE, :
#> column name 'Value_crp_no' is duplicated in the result

colnames(combined_data)[4] <- "Value_iti_ti"

nrow(combined_data)
#> [1] 374
head(combined_data)
#>       Period Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> 1 1993-01-01         5245         5441        39506        80506
#> 2 1993-02-01         5313         5100        39528        80035
#> 3 1993-03-01         4723         5157        39371        79572
#> 4 1993-04-01         5068         5016        39231        80068
#> 5 1993-05-01         5290         4936        39398        80174
#> 6 1993-06-01         4969         5167        39309        79100

# Create VAR model object for VAR(1)
combined_data$Period <- as.Date(combined_data$Period)
combined_data <- na.omit(combined_data)

# Convert the combined data to a time series object
ts_combined_data <- ts(combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], start = as.Date("1993-01-01"), frequency = 12)

var1_model <- VAR(ts_combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], p = 1)

# Print summary of VAR(1) model
summary(var1_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Value_crp_no, Value_crp_vs, Value_iti_ti, Value_iti_uo 
#> Deterministic variables: const 
#> Sample size: 373 
#> Log Likelihood: -11038.371 
#> Roots of the characteristic polynomial:
#> 0.9938 0.9938 0.9852 0.1941
#> Call:
#> VAR(y = ts_combined_data[, c("Value_crp_no", "Value_crp_vs", 
#>     "Value_iti_ti", "Value_iti_uo")], p = 1)
#> 
#> 
#> Estimation results for equation Value_crp_no: 
#> ============================================= 
#> Value_crp_no = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.013019   0.076306  -0.171 0.864620    
#> Value_crp_vs.l1   0.969321   0.076112  12.735  < 2e-16 ***
#> Value_iti_ti.l1   0.011246   0.005921   1.899 0.058310 .  
#> Value_iti_uo.l1  -0.006073   0.001709  -3.553 0.000431 ***
#> const           379.330929 238.470679   1.591 0.112540    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 367.9 on 368 degrees of freedom
#> Multiple R-Squared: 0.9808,  Adjusted R-squared: 0.9806 
#> F-statistic:  4693 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_crp_vs: 
#> ============================================= 
#> Value_crp_vs = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1   0.184887   0.059757   3.094  0.00213 ** 
#> Value_crp_vs.l1   0.790988   0.059605  13.271  < 2e-16 ***
#> Value_iti_ti.l1   0.007557   0.004637   1.630  0.10400    
#> Value_iti_uo.l1  -0.003186   0.001339  -2.380  0.01783 *  
#> const           143.137783 186.751102   0.766  0.44389    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 288.1 on 368 degrees of freedom
#> Multiple R-Squared: 0.9882,  Adjusted R-squared: 0.9881 
#> F-statistic:  7708 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_ti: 
#> ============================================= 
#> Value_iti_ti = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.294046   0.085233  -3.450 0.000626 ***
#> Value_crp_vs.l1   0.295386   0.085016   3.474 0.000573 ***
#> Value_iti_ti.l1   0.996967   0.006614 150.745  < 2e-16 ***
#> Value_iti_uo.l1   0.001352   0.001909   0.708 0.479210    
#> const           -15.845201 266.367194  -0.059 0.952597    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 410.9 on 368 degrees of freedom
#> Multiple R-Squared: 0.9908,  Adjusted R-squared: 0.9907 
#> F-statistic:  9941 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_uo: 
#> ============================================= 
#> Value_iti_uo = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.068184   0.193488  -0.352   0.7247    
#> Value_crp_vs.l1   0.130156   0.192996   0.674   0.5005    
#> Value_iti_ti.l1  -0.030413   0.015014  -2.026   0.0435 *  
#> Value_iti_uo.l1   1.003758   0.004334 231.589   <2e-16 ***
#> const           686.995408 604.683739   1.136   0.2566    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 932.8 on 368 degrees of freedom
#> Multiple R-Squared: 0.9967,  Adjusted R-squared: 0.9967 
#> F-statistic: 2.812e+04 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no       135317        78920         7607        68864
#> Value_crp_vs        78920        82987         5282         5598
#> Value_iti_ti         7607         5282       168827       100727
#> Value_iti_uo        68864         5598       100727       870038
#> 
#> Correlation matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no      1.00000      0.74474      0.05033      0.20070
#> Value_crp_vs      0.74474      1.00000      0.04462      0.02083
#> Value_iti_ti      0.05033      0.04462      1.00000      0.26282
#> Value_iti_uo      0.20070      0.02083      0.26282      1.00000

# Fit VAR(p) model
p <- 2
varp_model <- VAR(ts_combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], p = p)

# Print summary of VAR(1) model
summary(varp_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Value_crp_no, Value_crp_vs, Value_iti_ti, Value_iti_uo 
#> Deterministic variables: const 
#> Sample size: 372 
#> Log Likelihood: -10886.418 
#> Roots of the characteristic polynomial:
#> 0.9921 0.9884 0.9884 0.5737 0.5162 0.5162 0.3349 0.225
#> Call:
#> VAR(y = ts_combined_data[, c("Value_crp_no", "Value_crp_vs", 
#>     "Value_iti_ti", "Value_iti_uo")], p = p)
#> 
#> 
#> Estimation results for equation Value_crp_no: 
#> ============================================= 
#> Value_crp_no = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                  Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.10827    0.07774  -1.393   0.1646    
#> Value_crp_vs.l1   0.76596    0.09734   7.869 4.15e-14 ***
#> Value_iti_ti.l1   0.04488    0.04628   0.970   0.3328    
#> Value_iti_uo.l1   0.01265    0.02120   0.597   0.5510    
#> Value_crp_no.l2  -0.23947    0.07654  -3.129   0.0019 ** 
#> Value_crp_vs.l2   0.53579    0.09066   5.910 7.90e-09 ***
#> Value_iti_ti.l2  -0.03400    0.04666  -0.729   0.4667    
#> Value_iti_uo.l2  -0.01961    0.02139  -0.917   0.3599    
#> const           490.55230  229.86271   2.134   0.0335 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 351.2 on 363 degrees of freedom
#> Multiple R-Squared: 0.9827,  Adjusted R-squared: 0.9823 
#> F-statistic:  2579 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_crp_vs: 
#> ============================================= 
#> Value_crp_vs = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                  Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1   0.13401    0.06007   2.231   0.0263 *  
#> Value_crp_vs.l1   0.52272    0.07522   6.949 1.71e-11 ***
#> Value_iti_ti.l1   0.07795    0.03576   2.180   0.0299 *  
#> Value_iti_uo.l1   0.01880    0.01638   1.148   0.2519    
#> Value_crp_no.l2   0.03833    0.05915   0.648   0.5174    
#> Value_crp_vs.l2   0.28184    0.07006   4.023 7.00e-05 ***
#> Value_iti_ti.l2  -0.07127    0.03606  -1.977   0.0488 *  
#> Value_iti_uo.l2  -0.02248    0.01653  -1.360   0.1747    
#> const           221.67170  177.63499   1.248   0.2129    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 271.4 on 363 degrees of freedom
#> Multiple R-Squared: 0.9897,  Adjusted R-squared: 0.9894 
#> F-statistic:  4348 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_ti: 
#> ============================================= 
#> Value_iti_ti = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                  Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.37535    0.08045  -4.665 4.34e-06 ***
#> Value_crp_vs.l1   0.49486    0.10074   4.912 1.36e-06 ***
#> Value_iti_ti.l1   1.39798    0.04789  29.189  < 2e-16 ***
#> Value_iti_uo.l1   0.06986    0.02194   3.184  0.00158 ** 
#> Value_crp_no.l2   0.09328    0.07921   1.178  0.23973    
#> Value_crp_vs.l2  -0.21455    0.09383  -2.287  0.02280 *  
#> Value_iti_ti.l2  -0.40276    0.04829  -8.341 1.55e-15 ***
#> Value_iti_uo.l2  -0.06962    0.02214  -3.144  0.00180 ** 
#> const           174.33160  237.89019   0.733  0.46414    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 363.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.9929,  Adjusted R-squared: 0.9928 
#> F-statistic:  6364 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_uo: 
#> ============================================= 
#> Value_iti_uo = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1   -0.59956    0.18495  -3.242  0.00130 ** 
#> Value_crp_vs.l1    0.45086    0.23158   1.947  0.05232 .  
#> Value_iti_ti.l1    0.33136    0.11010   3.010  0.00280 ** 
#> Value_iti_uo.l1    1.37959    0.05043  27.355  < 2e-16 ***
#> Value_crp_no.l2   -0.41512    0.18210  -2.280  0.02321 *  
#> Value_crp_vs.l2    0.58504    0.21570   2.712  0.00700 ** 
#> Value_iti_ti.l2   -0.35270    0.11101  -3.177  0.00161 ** 
#> Value_iti_uo.l2   -0.38204    0.05090  -7.506 4.77e-13 ***
#> const           1102.69133  546.87635   2.016  0.04450 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 835.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.9974,  Adjusted R-squared: 0.9973 
#> F-statistic: 1.739e+04 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no       123339        70167         7224        46686
#> Value_crp_vs        70167        73658         1860       -14500
#> Value_iti_ti         7224         1860       132104        40401
#> Value_iti_uo        46686       -14500        40401       698140
#> 
#> Correlation matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no      1.00000      0.73616      0.05659      0.15910
#> Value_crp_vs      0.73616      1.00000      0.01886     -0.06394
#> Value_iti_ti      0.05659      0.01886      1.00000      0.13303
#> Value_iti_uo      0.15910     -0.06394      0.13303      1.00000

# Comparison between two fits using AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion)
# Lower values indicates better fit.

# Calculate AIC and BIC for VAR(1) model
aic_var1 <- AIC(var1_model)
bic_var1 <- BIC(var1_model)

# Calculate AIC and BIC for VAR(p) model
aic_varp <- AIC(varp_model)
bic_varp <- BIC(varp_model)

# Print AIC and BIC values
cat("AIC for VAR(1):", aic_var1, "\n")
#> AIC for VAR(1): 22116.74
cat("BIC for VAR(1):", bic_var1, "\n")
#> BIC for VAR(1): 22195.17
cat("AIC for VAR(p):", aic_varp, "\n")
#> AIC for VAR(p): 21844.84
cat("BIC for VAR(p):", bic_varp, "\n")
#> BIC for VAR(p): 21985.92

## We can observe that both AIC and BIC are lower for the VAR(p) model compared to the VAR(1) model. This suggests that the VAR(p) model provides a better balance between goodness of fit and model complexity. Lower AIC and BIC values indicate a better fit while taking into account the complexity of the model.

## Forecasting

# Produce a one-month-ahead forecast using VAR(1) model
one_month_forecast_var1 <- predict(var1_model, n.ahead = 1)

# Extract the forecasted values
forecasted_values_var1 <- one_month_forecast_var1$fcst[1]$Value_crp_no[1]

# Print the forecasted value for the next month
cat("Forecasted value for the next month using VAR 1 model:", forecasted_values_var1, "\n")
#> Forecasted value for the next month using VAR 1 model: 2241.609

# Produce a one-month-ahead forecast using VAR(p) model
one_month_forecast_varp <- predict(varp_model, n.ahead = 1)

# Extract the forecasted values
forecasted_values_varp <- one_month_forecast_varp$fcst[1]$Value_crp_no[1]

# Print the forecasted value for the next month
cat("Forecasted value for the next month using VAR p model:", forecasted_values_varp, "\n")
#> Forecasted value for the next month using VAR p model: 2190.083

# Granger Causality
causality_test_var1 <- causality(var1_model)
#> Warning in causality(var1_model): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Value_crp_no) as cause variable.
print(causality_test_var1)
#> $Granger
#> 
#>  Granger causality H0: Value_crp_no do not Granger-cause Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object var1_model
#> F-Test = 7.5892, df1 = 3, df2 = 1472, p-value = 4.89e-05
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_crp_no and Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object var1_model
#> Chi-squared = 138.41, df = 3, p-value < 2.2e-16

causality_test_varp <- causality(varp_model)
#> Warning in causality(varp_model): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Value_crp_no) as cause variable.
print(causality_test_varp)
#> $Granger
#> 
#>  Granger causality H0: Value_crp_no do not Granger-cause Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object varp_model
#> F-Test = 6.9702, df1 = 6, df2 = 1452, p-value = 2.542e-07
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_crp_no and Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object varp_model
#> Chi-squared = 137.28, df = 3, p-value < 2.2e-16

# Load the BigVAR package
library(BigVAR)

bigvar_matrix <- cbind(crp_no$Value,
                       crp_vs$Value,
                       iti_ti$Value,
                       iti_uo$Value)

# Fit a sparse VAR model using Lasso regularization
sparse_var_fit <- BigVAR.fit(Y=bigvar_matrix,struct='Basic',p=2,lambda=1)[,,1]

# Describe the sparsity structure
print(sparse_var_fit)
#>           [,1]        [,2]        [,3]        [,4]        [,5]        [,6]
#> [1,] 417.59572  0.24196716 0.249637724 0.003185001 0.007121556  0.23873930
#> [2,] 222.58455  0.24856635 0.252579633 0.001598170 0.011755905  0.24551514
#> [3,] -34.94354 -0.03225582 0.054272649 0.792747721 0.133064078 -0.01802307
#> [4,] 908.21645  0.02025621 0.007536235 0.154881877 1.234473633 -0.03508725
#>             [,7]          [,8]        [,9]
#> [1,]  0.24796739 -0.0003493715 -0.01131492
#> [2,]  0.25061997 -0.0027416290 -0.01333212
#> [3,] -0.00726279  0.2051545935 -0.13180078
#> [4,]  0.05198889 -0.1821780945 -0.23351711

#sparsity_structure <- sparse_var_fit$selected
#print(sparsity_structure)

# Summarize the results
summary(sparse_var_fit)
#>        V1               V2                  V3                 V4          
#>  Min.   :-34.94   Min.   :-0.032256   Min.   :0.007536   Min.   :0.001598  
#>  1st Qu.:158.20   1st Qu.: 0.007128   1st Qu.:0.042589   1st Qu.:0.002788  
#>  Median :320.09   Median : 0.131112   Median :0.151955   Median :0.079033  
#>  Mean   :378.36   Mean   : 0.119633   Mean   :0.141007   Mean   :0.238103  
#>  3rd Qu.:540.25   3rd Qu.: 0.243617   3rd Qu.:0.250373   3rd Qu.:0.314348  
#>  Max.   :908.22   Max.   : 0.248566   Max.   :0.252580   Max.   :0.792748  
#>        V5                 V6                 V7                  V8           
#>  Min.   :0.007122   Min.   :-0.03509   Min.   :-0.007263   Min.   :-0.182178  
#>  1st Qu.:0.010597   1st Qu.:-0.02229   1st Qu.: 0.037176   1st Qu.:-0.047601  
#>  Median :0.072410   Median : 0.11036   Median : 0.149978   Median :-0.001545  
#>  Mean   :0.346604   Mean   : 0.10779   Mean   : 0.135828   Mean   : 0.004971  
#>  3rd Qu.:0.408417   3rd Qu.: 0.24043   3rd Qu.: 0.248631   3rd Qu.: 0.051027  
#>  Max.   :1.234474   Max.   : 0.24552   Max.   : 0.250620   Max.   : 0.205155  
#>        V9          
#>  Min.   :-0.23352  
#>  1st Qu.:-0.15723  
#>  Median :-0.07257  
#>  Mean   :-0.09749  
#>  3rd Qu.:-0.01283  
#>  Max.   :-0.01131

Forecasted value for the next month using VAR 1 model: 2241.609 Forecasted value for the next month using VAR p model: 2190.083

^{Created on 2024-04-10 with reprex v2.1.0}

jlivsey / UB-sping24-time-series

Jaskirat Singh #15

Steps Performed

Homework 4 - VAR model on US Census Bureaus M3 survey data