jlivsey / UB-sping24-time-series


Jaskirat Singh #15

jaskirat-singh-pahwa opened 4 months ago

jaskirat-singh-pahwa commented 4 months ago
library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)

# Reading data
#claims_data <- read.csv("ICNSA.csv")
#head(claims_data)
fredr_set_key('4e2b405e7ea0d612c659d24c185134a1')
claims_data <- fredr(series_id = 'ICNSA')

# Converting DATE column type from string to date
claims_data$date <- as.Date(claims_data$date)
head(claims_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-22     2024-02-22  
#> 2 1967-01-14 ICNSA     334000 2024-02-22     2024-02-22  
#> 3 1967-01-21 ICNSA     277000 2024-02-22     2024-02-22  
#> 4 1967-01-28 ICNSA     252000 2024-02-22     2024-02-22  
#> 5 1967-02-04 ICNSA     274000 2024-02-22     2024-02-22  
#> 6 1967-02-11 ICNSA     276000 2024-02-22     2024-02-22

# Basic summary statistics
summary(claims_data$value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  133000  265800  326919  365013  406725 6161268

# Plotting Time-Series data
ggplot(claims_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of Claims Data",
       x = "Date",
       y = "ICNSA")


# Autocorrelation Plot
acf(claims_data$value)


# Partial Autocorrelation Plot
pacf(claims_data$value)


# Forecasting claims value for the next week
# We will be fitting an ARIMA model to our time series data

# Creating a time series object; frequency = 7 is used here, although weekly data is conventionally given frequency = 52
ts_data <- ts(claims_data$value, frequency = 7)

# Fitting an ARIMA model
arima_model <- arima(ts_data, order = c(2,1,1))

forecast_week <- forecast(arima_model, h = 1)
print(forecast_week)
#>          Point Forecast    Lo 80    Hi 80    Lo 95  Hi 95
#> 426.7143       218288.1 106262.1 330314.1 46959.08 389617
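# Weekly data is conventionally given frequency = 52 (weeks per year) rather than 7;
# an alternative sketch using auto.arima() on a frequency-52 series (output not shown):
ts_weekly <- ts(claims_data$value, frequency = 52)
auto_fit <- auto.arima(ts_weekly)
forecast(auto_fit, h = 1)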

# Forecasting for a targeted date
target_date <- as.Date("2024-02-15")
forecast_target <- forecast(arima_model, h = 1, newdata = data.frame(date = target_date))
#> Warning in forecast.Arima(arima_model, h = 1, newdata = data.frame(date =
#> target_date)): The non-existent newdata arguments will be ignored.
print(forecast_target)
#>          Point Forecast    Lo 80    Hi 80    Lo 95  Hi 95
#> 426.7143       218288.1 106262.1 330314.1 46959.08 389617
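# forecast.Arima() ignores newdata; it can only forecast h steps past the end of
# the series. To target a specific date, derive h from the date difference instead
# (a sketch; ceiling() because the series is weekly):
h_target <- ceiling(as.numeric(difftime(target_date, max(claims_data$date), units = "weeks")))
if (h_target >= 1) print(forecast(arima_model, h = h_target))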

Forecasted value for next week: 218288.1

Created on 2024-02-22 with reprex v2.1.0

jaskirat-singh-pahwa commented 4 months ago

Homework 1 - regARIMA on ICNSA data with IURNSA as a covariate

library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)

# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")

# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]

# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2980

head(icnsa_data)
#> # A tibble: 6 × 2
#>   date        value
#>   <date>      <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000

# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#>       date                value        
#>  Min.   :1967-01-07   Min.   : 133000  
#>  1st Qu.:1981-04-16   1st Qu.: 265800  
#>  Median :1995-07-25   Median : 326919  
#>  Mean   :1995-07-25   Mean   : 365013  
#>  3rd Qu.:2009-11-01   3rd Qu.: 406725  
#>  Max.   :2024-02-10   Max.   :6161268
cat("\n\n")

########### Another series ##############

# Reading a series related to ICNSA to use as a covariate - Insured Unemployment Rate (IURNSA)
iurnsa_data <- fredr(series_id = "IURNSA")

# Taking date and value column from the dataset
iurnsa_data <- iurnsa_data[c("date", "value")]

# Converting `date` column from string to date
iurnsa_data$date <- as.Date(iurnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the IURNSA dataset:", nrow(iurnsa_data), "\n")
#> Total number of rows in the IURNSA dataset: 2771

head(iurnsa_data)
#> # A tibble: 6 × 2
#>   date       value
#>   <date>     <dbl>
#> 1 1971-01-02   5.2
#> 2 1971-01-09   5.3
#> 3 1971-01-16   5.3
#> 4 1971-01-23   5.2
#> 5 1971-01-30   5.2
#> 6 1971-02-06   5.2

# Basic summary statistics
cat("Summary of value column in IURNSA dataset:", "\n")
#> Summary of value column in IURNSA dataset:
summary(iurnsa_data)
#>       date                value       
#>  Min.   :1971-01-02   Min.   : 0.800  
#>  1st Qu.:1984-04-10   1st Qu.: 1.900  
#>  Median :1997-07-19   Median : 2.500  
#>  Mean   :1997-07-19   Mean   : 2.759  
#>  3rd Qu.:2010-10-26   3rd Qu.: 3.300  
#>  Max.   :2024-02-03   Max.   :15.800

# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


# Plotting IURNSA Time-Series data
ggplot(iurnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of IURNSA Claims Data",
       x = "Date (Year)",
       y = "value")


# Calculating the correlation coefficient between ICNSA and IURNSA
# To compute the correlation, the two series must have matching dimensions, i.e., the same number of rows
# Since both series are weekly and IURNSA runs from 1971-01-02 to 2024-02-03, we filter ICNSA to the same date range
filtered_icnsa_data <- subset(icnsa_data, date >= "1971-01-02" & date <= "2024-02-03")

# Also, removing the data for COVID period
filtered_icnsa_data <- subset(filtered_icnsa_data, !(year(date) == 2020 | (year(date) == 2021 & month(date) < 7)))
filtered_iurnsa_data <- subset(iurnsa_data, !(year(date) == 2020 | (year(date) == 2021 & month(date) < 7)))

cat("Number of rows in ICNSA after filteration: ", nrow(filtered_icnsa_data), "\n")
#> Number of rows in ICNSA after filteration:  2693
cat("Number of rows in IURNSA after filteration: ", nrow(filtered_iurnsa_data), "\n")
#> Number of rows in IURNSA after filteration:  2693

icnsa_iursa_corr <- cor(filtered_icnsa_data$value, filtered_iurnsa_data$value)
cat("Correlation between ICNSA and IURNSA data: ", icnsa_iursa_corr, "\n")
#> Correlation between ICNSA and IURNSA data:  0.6490541
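# A more robust way to align the two weekly series is to merge on the date column
# rather than relying on equal row counts (a sketch; suffixes chosen for illustration):
aligned <- merge(filtered_icnsa_data, filtered_iurnsa_data,
                 by = "date", suffixes = c(".icnsa", ".iurnsa"))
cor(aligned$value.icnsa, aligned$value.iurnsa)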

# Performing time series decomposition for ICNSA data
filtered_icnsa_ts <- ts(filtered_icnsa_data$value, frequency=52)
filtered_icnsa_decomp <- decompose(filtered_icnsa_ts, "multiplicative")
plot(filtered_icnsa_decomp)


# Performing time series decomposition for IURNSA data
filtered_iurnsa_ts <- ts(filtered_iurnsa_data$value, frequency=52)
filtered_iurnsa_decomp <- decompose(filtered_iurnsa_ts, "multiplicative")
plot(filtered_iurnsa_decomp)


# Plotting ACF and PACF for filtered ICNSA and IURNSA data
acf(filtered_icnsa_data$value)

pacf(filtered_icnsa_data$value)


acf(filtered_iurnsa_data$value)

pacf(filtered_iurnsa_data$value)


# Cross-correlation analysis
cross_correlation <- ccf(filtered_icnsa_data$value, filtered_iurnsa_data$value)

print("Cross-correlation between filtered ICNSA and IURNSA data:")
#> [1] "Cross-correlation between filtered ICNSA and IURNSA data:"
print(cross_correlation)
#> 
#> Autocorrelations of series 'X', by lag
#> 
#>   -31   -30   -29   -28   -27   -26   -25   -24   -23   -22   -21   -20   -19 
#> 0.397 0.406 0.417 0.424 0.432 0.435 0.433 0.422 0.420 0.424 0.427 0.434 0.441 
#>   -18   -17   -16   -15   -14   -13   -12   -11   -10    -9    -8    -7    -6 
#> 0.456 0.472 0.492 0.515 0.539 0.564 0.586 0.606 0.630 0.651 0.669 0.683 0.690 
#>    -5    -4    -3    -2    -1     0     1     2     3     4     5     6     7 
#> 0.698 0.699 0.693 0.681 0.674 0.649 0.627 0.573 0.530 0.501 0.470 0.440 0.405 
#>     8     9    10    11    12    13    14    15    16    17    18    19    20 
#> 0.381 0.362 0.346 0.332 0.326 0.321 0.317 0.316 0.318 0.322 0.327 0.332 0.337 
#>    21    22    23    24    25    26    27    28    29    30    31 
#> 0.343 0.344 0.346 0.345 0.344 0.339 0.330 0.310 0.297 0.291 0.285

# Fit an ARIMA model to ICNSA data
arima_model <- auto.arima(filtered_icnsa_data$value)
print(arima_model)
#> Series: filtered_icnsa_data$value 
#> ARIMA(2,1,2) 
#> 
#> Coefficients:
#>          ar1      ar2      ma1     ma2
#>       0.9676  -0.0723  -1.2112  0.2228
#> s.e.  0.1181   0.0996   0.1159  0.1135
#> 
#> sigma^2 = 2.547e+09:  log likelihood = -32970.14
#> AIC=65950.27   AICc=65950.29   BIC=65979.76

# Fit a regARIMA model using ICNSA and IURNSA data
reg_arima_model <- auto.arima(filtered_icnsa_data$value, xreg = filtered_iurnsa_data$value)
print(summary(reg_arima_model))
#> Series: filtered_icnsa_data$value 
#> Regression with ARIMA(2,1,1) errors 
#> 
#> Coefficients:
#>          ar1     ar2      ma1      xreg
#>       0.5630  0.2223  -0.9883  71614.85
#> s.e.  0.0222  0.0207   0.0028   5478.41
#> 
#> sigma^2 = 2.452e+09:  log likelihood = -32919.15
#> AIC=65848.3   AICc=65848.32   BIC=65877.79
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE        MPE     MAPE     MASE
#> Training set 840.8758 49468.03 33157.92 -0.9803505 8.795835 0.957651
#>                      ACF1
#> Training set -0.007676926

tsdisplay(residuals(reg_arima_model), lag.max=30)


# Perform Ljung-Box test for residuals autocorrelation
ljung_box_test <- Box.test(reg_arima_model$residuals, lag = 12, type = "Ljung-Box")
print(ljung_box_test)
#> 
#>  Box-Ljung test
#> 
#> data:  reg_arima_model$residuals
#> X-squared = 188.71, df = 12, p-value < 2.2e-16
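# The tiny p-value indicates significant residual autocorrelation remains, so the
# model's prediction intervals are likely too narrow. forecast::checkresiduals()
# bundles the same Ljung-Box test with diagnostic plots (a sketch):
checkresiduals(reg_arima_model)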

# Forecasting for the next week
forecast_icnsa <- forecast(reg_arima_model, xreg = tail(filtered_iurnsa_data$value, 1))
forecast_value <- forecast_icnsa$mean[1]

cat("\nForecasted value for the next week:", forecast_value)
#> 
#> Forecasted value for the next week: 242908.1
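# Forecasting further ahead with a regARIMA model requires future covariate values;
# a naive sketch that holds IURNSA at its last observed value for four weeks:
future_xreg <- rep(tail(filtered_iurnsa_data$value, 1), 4)
forecast(reg_arima_model, xreg = future_xreg)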

Forecasted value for the next week: 242908.1

Created on 2024-02-22 with reprex v2.1.0

jaskirat-singh-pahwa commented 4 months ago

Homework 2: ICNSA data forecast using a smoothing technique - the Holt-Winters model

library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)

# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")

# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]

# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2981

head(icnsa_data)
#> # A tibble: 6 × 2
#>   date        value
#>   <date>      <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000

# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#>       date                value        
#>  Min.   :1967-01-07   Min.   : 133000  
#>  1st Qu.:1981-04-18   1st Qu.: 265794  
#>  Median :1995-07-29   Median : 326900  
#>  Mean   :1995-07-29   Mean   : 364957  
#>  3rd Qu.:2009-11-07   3rd Qu.: 406633  
#>  Max.   :2024-02-17   Max.   :6161268
cat("\n\n")

# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


# Let's visually inspect the ICNSA trend during the pandemic
# According to web sources, the pandemic period ran from March 2020 to June/July 2021
ggplot(icnsa_data, aes(x=date, y=value)) +
  geom_line() +
  labs(title="Initial Claims (ICNSA) over Time", x="Date", y="Initial Claims (ICNSA)") +
  theme_minimal() +
  geom_rect(aes(xmin=as.Date("2020-03-01"), xmax=as.Date("2021-07-31"), ymin=-Inf, ymax=Inf), 
            fill="white", alpha=0.004) +
  geom_vline(xintercept=as.Date("2020-03-01"), color="orange", linetype="dashed") +
  geom_vline(xintercept=as.Date("2021-07-31"), color="orange", linetype="dashed")


# Calculating the average number of claims before and during the pandemic
pre_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date < as.Date("2020-01-01")) ])
subset_data <- subset(icnsa_data, date >= as.Date("2020-03-01") & date <= as.Date("2021-07-31"))
pandemic_avg <- mean(subset_data$value)
post_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date > as.Date("2021-07-31")) ])
cat("Average number of claims before pandemic:", pre_pandemic_avg, "\n")
#> Average number of claims before pandemic: 350231.6
cat("Average number of claims during pandemic:", pandemic_avg, "\n")
#> Average number of claims during pandemic: 1172247
cat("Average number of claims after pandemic:", post_pandemic_avg, "\n")
#> Average number of claims after pandemic: 229688.1

# Create a dataframe for the bar chart
bar_data <- data.frame(period = c("Pre-pandemic", "Pandemic", "Post-pandemic"),
                       avg_value = c(pre_pandemic_avg, pandemic_avg, post_pandemic_avg))

# Create a bar chart
ggplot(bar_data, aes(x=period, y=avg_value, fill=period)) +
  geom_bar(stat="identity") +
  labs(title="Average Initial Claims (ICNSA) by Period", x="Period", y="Average Initial Claims (ICNSA)") +
  theme_minimal()


pre_pandemic_sd <- sd(icnsa_data$value[which(icnsa_data$date < as.Date("2020-03-01"))])

# Calculate the Z-score of the pandemic
z_score <- (pandemic_avg - pre_pandemic_avg) / pre_pandemic_sd

# Output the Z-score
cat("Z-score of the pandemic: ", z_score, "\n")
#> Z-score of the pandemic:  7.03949

# A Z-score of ~7 means the pandemic average sits about seven pre-pandemic standard deviations above the pre-pandemic mean

# Removing the data for COVID period
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-07-31")

icnsa_data_without_covid <- subset(icnsa_data, !(date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA without Covid period: ", nrow(icnsa_data_without_covid), "\n")
#> Number of rows in ICNSA without Covid period:  2907

# Plotting ACF and PACF for filtered ICNSA data
acf(icnsa_data_without_covid$value)

pacf(icnsa_data_without_covid$value)


# Getting ICNSA data during covid period
icnsa_data_during_covid <- subset(icnsa_data, (date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA during Covid period: ", nrow(icnsa_data_during_covid), "\n")
#> Number of rows in ICNSA during Covid period:  74

# Getting data for the covid period and using a cubic smoothing spline to impute new values
# To choose the amount of smoothing (the role of lambda), I tune smooth.spline()'s `spar` parameter

# Choose a range of spar values
spar_values <- seq(0.1, 0.6, by = 0.2)

best_spar <- NULL
best_mse <- Inf

par(mfcol=c(length(spar_values), 1), mar=c(2, 2, 1, 1))

for(spar in spar_values) {
  # Fit smooth spline for each spar value
  spline_fit <- smooth.spline(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, spar=spar)

  predicted_values <- predict(spline_fit, as.numeric(icnsa_data$date))

  # Calculate mean squared error
  mse <- mean((predicted_values$y - icnsa_data$value)^2)

  # If the current spar value has a lower MSE than the previous best spar value, update the best spar value and its corresponding MSE
  if(mse < best_mse) {
    best_spar <- spar
    best_mse <- mse
  }

  plot(icnsa_data$date, icnsa_data$value, type='l', main=paste("Spar =", spar),
       xlab="Date", ylab="ICNSA Values", col="purple")
  lines(icnsa_data$date, predicted_values$y, col="black")
  legend("topleft", legend=c("Original", "Smoothed"), col=c("purple", "black"), lty=1)

}


cat("Best value for lambda:", best_spar, "\n")
#> Best value for lambda: 0.1
cat("Best value for MSE:", best_mse, "\n")
#> Best value for MSE: 57230081019

par(mfcol=c(1, 1))

# Impute missing values using cubic spline
spline_fit <- smooth.spline(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value, spar = 0.1)

# Plotting the spline against the original data
plot(as.numeric(icnsa_data_without_covid$date), icnsa_data_without_covid$value,
     type = "p", col = "purple", main = "Original ICNSA vs. Smoothed ICNSA",
     xlab = "Date", ylab = "Value",
     xlim = range(as.numeric(icnsa_data_without_covid$date)),
     ylim = range(icnsa_data_without_covid$value))
lines(spline_fit, col = "black")
legend("topright", legend = c("Original ICNSA series", "Smoothed spline series"),
       col = c("purple", "black"), lty = 1)


imputed_values <- predict(spline_fit, as.numeric(icnsa_data_during_covid$date))$y
cat("Head for imputed values:", head(imputed_values), "\n")
#> Head for imputed values: 269471.7 271585.5 273635.9 275623.4 277548.6 279412.1

# Combine the imputed values with the original dataframe
new_covid_data <- data.frame(date = icnsa_data_during_covid$date, value = imputed_values)
combined_ICNSA <- rbind(icnsa_data_without_covid, new_covid_data)
cat("Total number of rows in the updated ICNSA dataset:", nrow(combined_ICNSA), "\n")
#> Total number of rows in the updated ICNSA dataset: 2981

# Converting it into time-series object
combined_ICNSA_ts <- ts(combined_ICNSA$value, frequency = 52)

# Fit a multiplicative Holt-Winters model
multiplicative_hw_model <- HoltWinters(combined_ICNSA_ts, seasonal="multiplicative")
multiplicative_hw_forecast <- forecast(multiplicative_hw_model, h=1)

# Fit an additive Holt-Winters model
additive_hw_model <- HoltWinters(combined_ICNSA_ts, seasonal="additive")
additive_hw_forecast <- forecast(additive_hw_model, h=1)

cat("Multiplicative Forecast:", multiplicative_hw_forecast$mean, "\n")
#> Multiplicative Forecast: 247172
cat("Additive Forecast:", additive_hw_forecast$mean, "\n")
#> Additive Forecast: 271732.7
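# A simple in-sample way to compare the two fits is the sum of squared errors
# stored on each HoltWinters object (a sketch):
cat("Multiplicative SSE:", multiplicative_hw_model$SSE, "\n")
cat("Additive SSE:", additive_hw_model$SSE, "\n")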

Forecasted value for the next week using the multiplicative Holt-Winters model: 247172

Created on 2024-02-28 with reprex v2.1.0

Steps Performed

The forecasted value using the multiplicative Holt-Winters model: 247172
The forecasted value using the additive Holt-Winters model: 271732.7

jaskirat-singh-pahwa commented 4 months ago

Homework 3 - Structural Time Series model using State Space methodology

library(tidyverse)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)
library(KFAS)
#> Please cite KFAS in publications by using: 
#> 
#>   Jouni Helske (2017). KFAS: Exponential Family State Space Models in R. Journal of Statistical Software, 78(10), 1-39. doi:10.18637/jss.v078.i10.
library(zoo)
library(dlm)
#> 
#> Attaching package: 'dlm'
#> The following object is masked from 'package:ggplot2':
#> 
#>     %+%
library(dplyr)
library(zoo)
library(imputeTS)
#> 
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:zoo':
#> 
#>     na.locf

# Reading ICNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
icnsa_data <- fredr(series_id = "ICNSA")

# Taking date and value column from the dataset
icnsa_data <- icnsa_data[c("date", "value")]

# Converting `date` column from string to date
icnsa_data$date <- as.Date(icnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the ICNSA dataset:", nrow(icnsa_data), "\n")
#> Total number of rows in the ICNSA dataset: 2982

head(icnsa_data)
#> # A tibble: 6 × 2
#>   date        value
#>   <date>      <dbl>
#> 1 1967-01-07 346000
#> 2 1967-01-14 334000
#> 3 1967-01-21 277000
#> 4 1967-01-28 252000
#> 5 1967-02-04 274000
#> 6 1967-02-11 276000

# Basic summary statistics
cat("Summary of value column in ICNSA dataset:", "\n")
#> Summary of value column in ICNSA dataset:
summary(icnsa_data)
#>       date                value        
#>  Min.   :1967-01-07   Min.   : 133000  
#>  1st Qu.:1981-04-19   1st Qu.: 265755  
#>  Median :1995-08-01   Median : 326806  
#>  Mean   :1995-08-01   Mean   : 364900  
#>  3rd Qu.:2009-11-12   3rd Qu.: 406549  
#>  Max.   :2024-02-24   Max.   :6161268
cat("\n\n")

# Plotting ICNSA Time-Series data
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Time Series Plot of ICNSA Claims Data",
       x = "Date (Year)",
       y = "value")


from_date <- as.Date("2020-01-01")
to_date <- as.Date("2021-12-31")

filtered_df <- icnsa_data %>%
  filter(date >= from_date & date <= to_date)

ggplot(filtered_df, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Data from Date Range",
       x = "Date",
       y = "Value")

# From the graph above, the COVID disruption runs from roughly 1 March 2020 to 30 June 2021

# Let's visually inspect the ICNSA trend during the pandemic
# According to web sources, the pandemic period ran from March 2020 to June/July 2021
ggplot(icnsa_data, aes(x=date, y=value)) +
  geom_line() +
  labs(title="Initial Claims (ICNSA) over Time", x="Date", y="Initial Claims (ICNSA)") +
  theme_minimal() +
  geom_rect(aes(xmin=as.Date("2020-03-01"), xmax=as.Date("2021-06-30"), ymin=-Inf, ymax=Inf), 
            fill="white", alpha=0.004) +
  geom_vline(xintercept=as.Date("2020-03-01"), color="orange", linetype="dashed") +
  geom_vline(xintercept=as.Date("2021-07-31"), color="orange", linetype="dashed")


# Calculating the average number of claims before and during the pandemic
pre_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date < as.Date("2020-03-01")) ])
subset_data <- subset(icnsa_data, date >= as.Date("2020-03-01") & date <= as.Date("2021-06-30"))
pandemic_avg <- mean(subset_data$value)
post_pandemic_avg <- mean(icnsa_data$value[ which(icnsa_data$date > as.Date("2021-06-30")) ])
cat("Average number of claims before pandemic:", pre_pandemic_avg, "\n")
#> Average number of claims before pandemic: 349907.1
cat("Average number of claims during pandemic:", pandemic_avg, "\n")
#> Average number of claims during pandemic: 1230384
cat("Average number of claims after pandemic:", post_pandemic_avg, "\n")
#> Average number of claims after pandemic: 234487.1

# Create a dataframe for the bar chart
bar_data <- data.frame(period = c("Pre-pandemic avg", "Pandemic avg", "Post-pandemic avg"),
                       avg_value = c(pre_pandemic_avg, pandemic_avg, post_pandemic_avg))

# Create a bar chart
ggplot(bar_data, aes(x=period, y=avg_value, fill=period)) +
  geom_bar(stat="identity") +
  labs(title="Average Initial Claims (ICNSA) by Period", x="Period", y="Average Initial Claims (ICNSA)") +
  theme_minimal()


# Removing the data for COVID period
covid_start_date <- as.Date("2020-03-01")
covid_end_date <- as.Date("2021-06-30")

non_covid_data <- subset(icnsa_data, !(date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA including Covid period:", nrow(icnsa_data), "\n")
#> Number of rows in ICNSA including Covid period: 2982
cat("Number of rows in ICNSA without Covid period:", nrow(non_covid_data), "\n")
#> Number of rows in ICNSA without Covid period: 2913

# Plotting ACF and PACF for filtered ICNSA data
acf(non_covid_data$value)

pacf(non_covid_data$value)


# Getting ICNSA data during covid period
covid_data_rows <- nrow(subset(icnsa_data, date >= covid_start_date & date <= covid_end_date))
cat("Number of rows in ICNSA during Covid period: ", covid_data_rows, "\n")
#> Number of rows in ICNSA during Covid period:  69

# Setting covid period data to NA
modified_icnsa <- icnsa_data

modified_icnsa$value[modified_icnsa$date >= as.Date("2020-03-01") & modified_icnsa$date <= as.Date("2021-06-30")] <- NA
nrow(modified_icnsa)
#> [1] 2982

# Getting the index of the covid period
covid_period_index <- which(modified_icnsa$date >= as.Date("2020-03-01") & modified_icnsa$date <= as.Date("2021-06-30"))

# Imputing missing values using na_kalman() within the COVID period
imputed_values <- na_kalman(modified_icnsa$value, smooth = TRUE)

# Updating the ICNSA dataframe with the imputed values within the COVID period
modified_icnsa$value[covid_period_index] <- imputed_values[covid_period_index]

# Plot the original and imputed data
plot(modified_icnsa$date, icnsa_data$value, type = "l", col = "blue", lwd = 2,
     xlab = "Date", ylab = "Initial Claims (ICNSA)", main = "Comparison of Original and Imputed Values")
lines(modified_icnsa$date, imputed_values, col = "red", lwd = 2)
legend("topright", legend = c("Original", "Imputed"), col = c("blue", "red"), lwd = 2, bty = "n")


plot(modified_icnsa$date[covid_period_index], icnsa_data$value[covid_period_index], type = "l", col = "blue", lwd = 2,
     xlab = "Date", ylab = "Initial Claims (ICNSA)", main = "Comparison of Original and Imputed Values")
lines(modified_icnsa$date[covid_period_index], imputed_values[covid_period_index], col = "red", lwd = 2)
legend("topright", legend = c("Original", "Imputed"), col = c("blue", "red"), lwd = 2, bty = "n")


# Fit a structural time series model with trend, seasonal, and error components
state_space_model <- StructTS(ts(modified_icnsa$value, frequency = 52), type = "BSM")
summary(state_space_model)
#>           Length Class  Mode     
#> coef         4   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    8946   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric
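# summary() only lists the object's structure; the estimated variances are in
# state_space_model$coef, and the smoothed level/slope/seasonal components can
# be inspected with stats::tsSmooth() (a sketch):
print(state_space_model$coef)
plot(tsSmooth(state_space_model), main = "Smoothed BSM components")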

# Taking another series as a covariate - Continued Claims (CCNSA)
# Reading CCNSA data
fredr_set_key("4e2b405e7ea0d612c659d24c185134a1")
ccnsa_data <- fredr(series_id = "CCNSA")

# Taking date and value column from the dataset
ccnsa_data <- ccnsa_data[c("date", "value")]

# Converting `date` column from string to date
ccnsa_data$date <- as.Date(ccnsa_data$date)

# Checking count of rows in the dataset
cat("Total number of rows in the CCNSA dataset:", nrow(ccnsa_data), "\n")
#> Total number of rows in the CCNSA dataset: 2981

head(ccnsa_data)
#> # A tibble: 6 × 2
#>   date         value
#>   <date>       <dbl>
#> 1 1967-01-07 1594000
#> 2 1967-01-14 1563000
#> 3 1967-01-21 1551000
#> 4 1967-01-28 1533000
#> 5 1967-02-04 1534000
#> 6 1967-02-11 1557000

# Basic summary statistics
cat("Summary of value column in CCNSA dataset:", "\n")
#> Summary of value column in CCNSA dataset:
summary(ccnsa_data)
#>       date                value         
#>  Min.   :1967-01-07   Min.   :  785000  
#>  1st Qu.:1981-04-18   1st Qu.: 1997000  
#>  Median :1995-07-29   Median : 2493008  
#>  Mean   :1995-07-29   Mean   : 2761405  
#>  3rd Qu.:2009-11-07   3rd Qu.: 3145405  
#>  Max.   :2024-02-17   Max.   :23037921
cat("\n\n")

# Making ICNSA data compatible with CCNSA data
modified_icnsa <- subset(modified_icnsa, date >= as.Date("1967-01-07") & date <= as.Date("2024-02-17"))

# Merge ICNSA and CCNSA dataframes based on their common date column
merged_data <- merge(modified_icnsa, ccnsa_data, by = "date", all = FALSE)
head(merged_data)
#>         date value.x value.y
#> 1 1967-01-07  346000 1594000
#> 2 1967-01-14  334000 1563000
#> 3 1967-01-21  277000 1551000
#> 4 1967-01-28  252000 1533000
#> 5 1967-02-04  274000 1534000
#> 6 1967-02-11  276000 1557000

# merged_data contains ICNSA (value.x) and CCNSA (value.y)

# Create design matrix Z with covariate
Z <- cbind(1, merged_data$value.y)  # Trend and covariate

# Check dimensions of Z
dim(Z)  # Should be (n, p)
#> [1] 2981    2

# Create state equation matrices
p <- ncol(Z)  # Number of state variables
Tmat <- diag(1, p)  # Transition matrix (named Tmat so it does not mask base R's T/TRUE)
Q <- diag(0.01, p)  # State covariance matrix

# Create observation equation matrices
n <- nrow(merged_data)  # Number of observations
H <- diag(1, n)  # Observation covariance matrix

# Check dimensions of H
dim(H)  # Should be (n, n)
#> [1] 2981 2981

# Create initial state and initial state covariance
initial_state <- rep(0, p)  # Initial state vector
initial_state_cov <- diag(1e5, p)  # Initial state covariance matrix

# Check dimensions of initial_state_cov
dim(initial_state_cov)  # Should be (p, p)
#> [1] 2 2

# Construct state space model
model <- dlm(
  FF = Z,  # Design matrix
  V = H,  # Observation covariance matrix
  GG = T,  # Transition matrix
  W = Q,  # State covariance matrix
  m0 = initial_state,  # Initial state vector
  C0 = initial_state_cov  # Initial state covariance matrix
)

# Fit the model
fit <- dlmFilter(merged_data$value.x, model)

# Print summary of the fitted model
print(summary(fit))
#>     Length Class  Mode   
#> y   2981   -none- numeric
#> mod    6   dlm    list   
#> m      4   -none- numeric
#> U.C    2   -none- list   
#> D.C    4   -none- numeric
#> a      2   -none- numeric
#> U.R    1   -none- list   
#> D.R    2   -none- numeric
#> f   2981   -none- numeric

str(model)
#> List of 6
#>  $ m0: num [1:2] 0 0
#>  $ C0: num [1:2, 1:2] 1e+05 0e+00 0e+00 1e+05
#>  $ FF: num [1:2981, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ V : num [1:2981, 1:2981] 1 0 0 0 0 0 0 0 0 0 ...
#>  $ GG: num [1:2, 1:2] 1 0 0 1
#>  $ W : num [1:2, 1:2] 0.01 0 0 0.01
#>  - attr(*, "class")= chr "dlm"

# Forecasting horizon
forecast_horizon <- 1  # Forecast for the next week

# Forecast using the fitted model; the result is named fc rather than `forecast`
# so it does not mask forecast::forecast()
fc <- tryCatch(
  {
    dlmForecast(fit, nAhead = forecast_horizon)
  },
  error = function(e) {
    message("Error occurred during forecasting: ", conditionMessage(e))
    return(NULL)
  }
)

forecasted_value <- fc$f[1, 1]
print(paste("Forecasted value for the next week:", forecasted_value))
#> [1] "Forecasted value for the next week: 312076.351407013"


Created on 2024-03-07 with reprex v2.1.0

jaskirat-singh-pahwa commented 2 months ago

Homework 4 - VAR model on US Census Bureau's M3 survey data

library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(fpp3)
#> ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
#> ✔ tibble      3.2.1     ✔ tsibble     1.1.4
#> ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
#> ✔ tidyr       1.3.1     ✔ feasts      0.3.1
#> ✔ lubridate   1.9.3     ✔ fable       0.3.3
#> ✔ ggplot2     3.4.4     ✔ fabletools  0.4.0
#> ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
#> ✖ lubridate::date()    masks base::date()
#> ✖ dplyr::filter()      masks stats::filter()
#> ✖ tsibble::intersect() masks base::intersect()
#> ✖ tsibble::interval()  masks lubridate::interval()
#> ✖ dplyr::lag()         masks stats::lag()
#> ✖ tsibble::setdiff()   masks base::setdiff()
#> ✖ tsibble::union()     masks base::union()
library(tidyverse)
library(tsbox)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following object is masked from 'package:tsibble':
#> 
#>     index
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(seasonal)
#> 
#> Attaching package: 'seasonal'
#> The following object is masked from 'package:tibble':
#> 
#>     view
library(astsa)
#> 
#> Attaching package: 'astsa'
#> The following objects are masked from 'package:seasonal':
#> 
#>     trend, unemp
#> The following object is masked from 'package:forecast':
#> 
#>     gas
library(patchwork)
library(tseries)
library(fredr)
library(quarto)
library(xts)
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(dplyr)
library(ggplot2)
library(vars)
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:patchwork':
#> 
#>     area
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> Loading required package: strucchange
#> Loading required package: sandwich
#> 
#> Attaching package: 'strucchange'
#> The following object is masked from 'package:stringr':
#> 
#>     boundary
#> Loading required package: urca
#> Loading required package: lmtest
#> 
#> Attaching package: 'vars'
#> The following object is masked from 'package:fable':
#> 
#>     VAR
library(readxl)
library(lubridate)
library(tseries)
library(BigVAR)
#> Loading required package: lattice
library(reprex)

# Reading all time series from excel file
crp_no <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/CRP_NO.xlsx")
crp_vs <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/CRP_VS.xlsx")
iti_ti <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/ITI_TI.xlsx")
iti_uo <- read_excel("/Users/jaskirat/University_at_Buffalo/courses/time_series/assignments/hw4/ITI_UO.xlsx")

# Converting Period column to date type for all 4 time series
crp_no$Period <- as.Date(crp_no$Period)
crp_vs$Period <- as.Date(crp_vs$Period)
iti_ti$Period <- as.Date(iti_ti$Period)
iti_uo$Period <- as.Date(iti_uo$Period)

# Show the dataframe
head(crp_no)
#> # A tibble: 6 × 2
#>   Period     Value 
#>   <date>     <chr> 
#> 1 1993-01-01 5245.0
#> 2 1993-02-01 5313.0
#> 3 1993-03-01 4723.0
#> 4 1993-04-01 5068.0
#> 5 1993-05-01 5290.0
#> 6 1993-06-01 4969.0
head(crp_vs)
#> # A tibble: 6 × 2
#>   Period     Value 
#>   <date>     <chr> 
#> 1 1993-01-01 5441.0
#> 2 1993-02-01 5100.0
#> 3 1993-03-01 5157.0
#> 4 1993-04-01 5016.0
#> 5 1993-05-01 4936.0
#> 6 1993-06-01 5167.0
head(iti_ti)
#> # A tibble: 6 × 2
#>   Period     Value  
#>   <date>     <chr>  
#> 1 1993-01-01 39506.0
#> 2 1993-02-01 39528.0
#> 3 1993-03-01 39371.0
#> 4 1993-04-01 39231.0
#> 5 1993-05-01 39398.0
#> 6 1993-06-01 39309.0
head(iti_uo)
#> # A tibble: 6 × 2
#>   Period     Value  
#>   <date>     <chr>  
#> 1 1993-01-01 80506.0
#> 2 1993-02-01 80035.0
#> 3 1993-03-01 79572.0
#> 4 1993-04-01 80068.0
#> 5 1993-05-01 80174.0
#> 6 1993-06-01 79100.0

# Function to clean data and convert 'Value' column to double
clean_and_convert_data <- function(data) {
  # Remove rows where 'Value' is missing or the literal string "NA"
  data <- data[!is.na(data$Value) & data$Value != "NA", ]

  # Remove commas from 'Value' column
  data$Value <- gsub(",", "", data$Value)

  # Convert 'Value' column to numeric
  data$Value <- as.numeric(data$Value)
  return(data)
}

crp_no <- clean_and_convert_data(crp_no)
crp_vs <- clean_and_convert_data(crp_vs)
iti_ti <- clean_and_convert_data(iti_ti)
iti_uo <- clean_and_convert_data(iti_uo)

# Checking total number of rows in all the time series
cat("- Total number of rows in crp_no dataset:", nrow(crp_no), "\n")
#> - Total number of rows in crp_no dataset: 374
cat("- Total number of rows in crp_vs dataset:", nrow(crp_vs), "\n")
#> - Total number of rows in crp_vs dataset: 374
cat("- Total number of rows in iti_ti dataset:", nrow(iti_ti), "\n")
#> - Total number of rows in iti_ti dataset: 374
cat("- Total number of rows in iti_uo dataset:", nrow(iti_uo), "\n\n")
#> - Total number of rows in iti_uo dataset: 374

# Basic summary statistics for all 4 series
cat("- Summary of crp_no dataset:", "\n")
#> - Summary of crp_no dataset:
summary(crp_no)
#>      Period               Value      
#>  Min.   :1993-01-01   Min.   : 1393  
#>  1st Qu.:2000-10-08   1st Qu.: 2110  
#>  Median :2008-07-16   Median : 4868  
#>  Mean   :2008-07-16   Mean   : 4628  
#>  3rd Qu.:2016-04-23   3rd Qu.: 6202  
#>  Max.   :2024-02-01   Max.   :10535

cat("- Summary of crp_vs dataset:", "\n")
#> - Summary of crp_vs dataset:
summary(crp_vs)
#>      Period               Value      
#>  Min.   :1993-01-01   Min.   : 1384  
#>  1st Qu.:2000-10-08   1st Qu.: 2139  
#>  Median :2008-07-16   Median : 5028  
#>  Mean   :2008-07-16   Mean   : 4642  
#>  3rd Qu.:2016-04-23   3rd Qu.: 6164  
#>  Max.   :2024-02-01   Max.   :10203

cat("- Summary of iti_ti dataset:", "\n")
#> - Summary of iti_ti dataset:
summary(iti_ti)
#>      Period               Value      
#>  Min.   :1993-01-01   Min.   :33132  
#>  1st Qu.:2000-10-08   1st Qu.:37582  
#>  Median :2008-07-16   Median :39319  
#>  Mean   :2008-07-16   Mean   :40669  
#>  3rd Qu.:2016-04-23   3rd Qu.:44426  
#>  Max.   :2024-02-01   Max.   :53721

cat("- Summary of iti_uo dataset:", "\n")
#> - Summary of iti_uo dataset:
summary(iti_uo)
#>      Period               Value       
#>  Min.   :1993-01-01   Min.   : 76417  
#>  1st Qu.:2000-10-08   1st Qu.: 93123  
#>  Median :2008-07-16   Median :111062  
#>  Mean   :2008-07-16   Mean   :108078  
#>  3rd Qu.:2016-04-23   3rd Qu.:119714  
#>  Max.   :2024-02-01   Max.   :134990
cat("\n\n")

# Data Analysis
## Visualizing the datasets - 

# Plotting crp_no time-series data
ggplot(crp_no, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of CRP_NO series",
       x = "Date (Month-Year)",
       y = "Value")


# Plotting crp_vs time-series data
ggplot(crp_vs, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of CRP_VS series",
       x = "Date (Month-Year)",
       y = "Value")


# Plotting iti_ti time-series data
ggplot(iti_ti, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of ITI_TI series",
       x = "Date (Month-Year)",
       y = "Value")


# Plotting iti_uo time-series data
ggplot(iti_uo, aes(x = Period, y = Value)) +
  geom_line() +
  labs(title = "Time Series Plot of ITI_UO series",
       x = "Date (Month-Year)",
       y = "Value")


## Correlation coefficient - 
crpNo_crpVs_corr <- cor(crp_no$Value, crp_vs$Value)
cat("Correlation between Crp_no and Crp_vs series: ", crpNo_crpVs_corr, "\n")
#> Correlation between Crp_no and Crp_vs series:  0.9954609

crpNo_itiTi_corr <- cor(crp_no$Value, iti_ti$Value)
cat("Correlation between Crp_no and Iti_ti series: ", crpNo_itiTi_corr, "\n")
#> Correlation between Crp_no and Iti_ti series:  0.5531556

crpNo_itiUo_corr <- cor(crp_no$Value, iti_uo$Value)
cat("Correlation between Crp_no and Iti_uo series: ", crpNo_itiUo_corr, "\n")
#> Correlation between Crp_no and Iti_uo series:  -0.6597469

## Checking for stationarity - 

# Function to check stationarity using ADF and KPSS tests
check_stationarity <- function(data) {
  adf_test <- adf.test(data)
  kpss_test <- kpss.test(data)

  cat("ADF Test:\n")
  print(adf_test)
  #cat("\nKPSS Test:\n")
  #print(kpss_test)
}

# Check stationarity for each of the four series
cat("Checking stationarity for crp_no: ")
#> Checking stationarity for crp_no:
check_stationarity(crp_no$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -2.7454, Lag order = 7, p-value = 0.2624
#> alternative hypothesis: stationary

cat("Checking stationarity for crp_vs: ")
#> Checking stationarity for crp_vs:
check_stationarity(crp_vs$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -2.703, Lag order = 7, p-value = 0.2802
#> alternative hypothesis: stationary

cat("Checking stationarity for iti_ti: ")
#> Checking stationarity for iti_ti:
check_stationarity(iti_ti$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -2.4416, Lag order = 7, p-value = 0.3906
#> alternative hypothesis: stationary

cat("Checking stationarity for iti_uo: ")
#> Checking stationarity for iti_uo:
check_stationarity(iti_uo$Value)
#> Warning in kpss.test(data): p-value smaller than printed p-value
#> ADF Test:
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  data
#> Dickey-Fuller = -3.5722, Lag order = 7, p-value = 0.03585
#> alternative hypothesis: stationary
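# The ADF tests fail to reject a unit root for crp_no, crp_vs and iti_ti, so a
# common remedy is first differencing; a sketch re-running the test on differences:
adf.test(diff(crp_no$Value))
adf.test(diff(crp_vs$Value))
adf.test(diff(iti_ti$Value))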

## ACF and PACF of all 4 time series 

# Plotting ACF and PACF for crp_no series
acf(crp_no$Value)

pacf(crp_no$Value)


# Plotting ACF and PACF for crp_vs series
acf(crp_vs$Value)

pacf(crp_vs$Value)


# Plotting ACF and PACF for iti_ti series
acf(iti_ti$Value)

pacf(iti_ti$Value)


# Plotting ACF and PACF for iti_uo series
acf(iti_uo$Value)

pacf(iti_uo$Value)


## Decomposition for crp_no -
crp_no_ts <- ts(crp_no$Value, frequency=12)
crp_no_decomp <- decompose(crp_no_ts, "multiplicative")
plot(crp_no_decomp)


## Decomposition for crp_vs -
crp_vs_ts <- ts(crp_vs$Value, frequency=12)
crp_vs_decomp <- decompose(crp_vs_ts, "multiplicative")
plot(crp_vs_decomp)


## Decomposition for iti_ti -
iti_ti_ts <- ts(iti_ti$Value, frequency=12)
iti_ti_decomp <- decompose(iti_ti_ts, "multiplicative")
plot(iti_ti_decomp)


## Decomposition for iti_uo -
iti_uo_ts <- ts(iti_uo$Value, frequency=12)
iti_uo_decomp <- decompose(iti_uo_ts, "multiplicative")
plot(iti_uo_decomp)


# Modelling

# Merging all 4 time series - 
combined_data <- merge(crp_no, crp_vs, by = "Period", all = TRUE, suffixes = c("_crp_no", "_crp_vs"))
combined_data <- merge(combined_data, iti_ti, by = "Period", all = TRUE, suffixes = c("_crp_no", "_iti_ti"))
combined_data <- merge(combined_data, iti_uo, by = "Period", all = TRUE, suffixes = c("_crp_no", "_iti_uo"))
#> Warning in merge.data.frame(combined_data, iti_uo, by = "Period", all = TRUE, :
#> column name 'Value_crp_no' is duplicated in the result

colnames(combined_data)[4] <- "Value_iti_ti"

nrow(combined_data)
#> [1] 374
head(combined_data)
#>       Period Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> 1 1993-01-01         5245         5441        39506        80506
#> 2 1993-02-01         5313         5100        39528        80035
#> 3 1993-03-01         4723         5157        39371        79572
#> 4 1993-04-01         5068         5016        39231        80068
#> 5 1993-05-01         5290         4936        39398        80174
#> 6 1993-06-01         4969         5167        39309        79100

# Prepare the combined data for VAR modelling
combined_data$Period <- as.Date(combined_data$Period)
combined_data <- na.omit(combined_data)

# Convert the combined data to a time series object
ts_combined_data <- ts(combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")],
                       start = c(1993, 1), frequency = 12)

var1_model <- VAR(ts_combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], p = 1)

# Print summary of VAR(1) model
summary(var1_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Value_crp_no, Value_crp_vs, Value_iti_ti, Value_iti_uo 
#> Deterministic variables: const 
#> Sample size: 373 
#> Log Likelihood: -11038.371 
#> Roots of the characteristic polynomial:
#> 0.9938 0.9938 0.9852 0.1941
#> Call:
#> VAR(y = ts_combined_data[, c("Value_crp_no", "Value_crp_vs", 
#>     "Value_iti_ti", "Value_iti_uo")], p = 1)
#> 
#> 
#> Estimation results for equation Value_crp_no: 
#> ============================================= 
#> Value_crp_no = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.013019   0.076306  -0.171 0.864620    
#> Value_crp_vs.l1   0.969321   0.076112  12.735  < 2e-16 ***
#> Value_iti_ti.l1   0.011246   0.005921   1.899 0.058310 .  
#> Value_iti_uo.l1  -0.006073   0.001709  -3.553 0.000431 ***
#> const           379.330929 238.470679   1.591 0.112540    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 367.9 on 368 degrees of freedom
#> Multiple R-Squared: 0.9808,  Adjusted R-squared: 0.9806 
#> F-statistic:  4693 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_crp_vs: 
#> ============================================= 
#> Value_crp_vs = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1   0.184887   0.059757   3.094  0.00213 ** 
#> Value_crp_vs.l1   0.790988   0.059605  13.271  < 2e-16 ***
#> Value_iti_ti.l1   0.007557   0.004637   1.630  0.10400    
#> Value_iti_uo.l1  -0.003186   0.001339  -2.380  0.01783 *  
#> const           143.137783 186.751102   0.766  0.44389    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 288.1 on 368 degrees of freedom
#> Multiple R-Squared: 0.9882,  Adjusted R-squared: 0.9881 
#> F-statistic:  7708 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_ti: 
#> ============================================= 
#> Value_iti_ti = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.294046   0.085233  -3.450 0.000626 ***
#> Value_crp_vs.l1   0.295386   0.085016   3.474 0.000573 ***
#> Value_iti_ti.l1   0.996967   0.006614 150.745  < 2e-16 ***
#> Value_iti_uo.l1   0.001352   0.001909   0.708 0.479210    
#> const           -15.845201 266.367194  -0.059 0.952597    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 410.9 on 368 degrees of freedom
#> Multiple R-Squared: 0.9908,  Adjusted R-squared: 0.9907 
#> F-statistic:  9941 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_uo: 
#> ============================================= 
#> Value_iti_uo = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.068184   0.193488  -0.352   0.7247    
#> Value_crp_vs.l1   0.130156   0.192996   0.674   0.5005    
#> Value_iti_ti.l1  -0.030413   0.015014  -2.026   0.0435 *  
#> Value_iti_uo.l1   1.003758   0.004334 231.589   <2e-16 ***
#> const           686.995408 604.683739   1.136   0.2566    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 932.8 on 368 degrees of freedom
#> Multiple R-Squared: 0.9967,  Adjusted R-squared: 0.9967 
#> F-statistic: 2.812e+04 on 4 and 368 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no       135317        78920         7607        68864
#> Value_crp_vs        78920        82987         5282         5598
#> Value_iti_ti         7607         5282       168827       100727
#> Value_iti_uo        68864         5598       100727       870038
#> 
#> Correlation matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no      1.00000      0.74474      0.05033      0.20070
#> Value_crp_vs      0.74474      1.00000      0.04462      0.02083
#> Value_iti_ti      0.05033      0.04462      1.00000      0.26282
#> Value_iti_uo      0.20070      0.02083      0.26282      1.00000

# Fit VAR(p) model
p <- 2
varp_model <- VAR(ts_combined_data[, c("Value_crp_no", "Value_crp_vs", "Value_iti_ti", "Value_iti_uo")], p = p)

# Print summary of VAR(2) model
summary(varp_model)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Value_crp_no, Value_crp_vs, Value_iti_ti, Value_iti_uo 
#> Deterministic variables: const 
#> Sample size: 372 
#> Log Likelihood: -10886.418 
#> Roots of the characteristic polynomial:
#> 0.9921 0.9884 0.9884 0.5737 0.5162 0.5162 0.3349 0.225
#> Call:
#> VAR(y = ts_combined_data[, c("Value_crp_no", "Value_crp_vs", 
#>     "Value_iti_ti", "Value_iti_uo")], p = p)
#> 
#> 
#> Estimation results for equation Value_crp_no: 
#> ============================================= 
#> Value_crp_no = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                  Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.10827    0.07774  -1.393   0.1646    
#> Value_crp_vs.l1   0.76596    0.09734   7.869 4.15e-14 ***
#> Value_iti_ti.l1   0.04488    0.04628   0.970   0.3328    
#> Value_iti_uo.l1   0.01265    0.02120   0.597   0.5510    
#> Value_crp_no.l2  -0.23947    0.07654  -3.129   0.0019 ** 
#> Value_crp_vs.l2   0.53579    0.09066   5.910 7.90e-09 ***
#> Value_iti_ti.l2  -0.03400    0.04666  -0.729   0.4667    
#> Value_iti_uo.l2  -0.01961    0.02139  -0.917   0.3599    
#> const           490.55230  229.86271   2.134   0.0335 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 351.2 on 363 degrees of freedom
#> Multiple R-Squared: 0.9827,  Adjusted R-squared: 0.9823 
#> F-statistic:  2579 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_crp_vs: 
#> ============================================= 
#> Value_crp_vs = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                  Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1   0.13401    0.06007   2.231   0.0263 *  
#> Value_crp_vs.l1   0.52272    0.07522   6.949 1.71e-11 ***
#> Value_iti_ti.l1   0.07795    0.03576   2.180   0.0299 *  
#> Value_iti_uo.l1   0.01880    0.01638   1.148   0.2519    
#> Value_crp_no.l2   0.03833    0.05915   0.648   0.5174    
#> Value_crp_vs.l2   0.28184    0.07006   4.023 7.00e-05 ***
#> Value_iti_ti.l2  -0.07127    0.03606  -1.977   0.0488 *  
#> Value_iti_uo.l2  -0.02248    0.01653  -1.360   0.1747    
#> const           221.67170  177.63499   1.248   0.2129    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 271.4 on 363 degrees of freedom
#> Multiple R-Squared: 0.9897,  Adjusted R-squared: 0.9894 
#> F-statistic:  4348 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_ti: 
#> ============================================= 
#> Value_iti_ti = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                  Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1  -0.37535    0.08045  -4.665 4.34e-06 ***
#> Value_crp_vs.l1   0.49486    0.10074   4.912 1.36e-06 ***
#> Value_iti_ti.l1   1.39798    0.04789  29.189  < 2e-16 ***
#> Value_iti_uo.l1   0.06986    0.02194   3.184  0.00158 ** 
#> Value_crp_no.l2   0.09328    0.07921   1.178  0.23973    
#> Value_crp_vs.l2  -0.21455    0.09383  -2.287  0.02280 *  
#> Value_iti_ti.l2  -0.40276    0.04829  -8.341 1.55e-15 ***
#> Value_iti_uo.l2  -0.06962    0.02214  -3.144  0.00180 ** 
#> const           174.33160  237.89019   0.733  0.46414    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 363.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.9929,  Adjusted R-squared: 0.9928 
#> F-statistic:  6364 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_iti_uo: 
#> ============================================= 
#> Value_iti_uo = Value_crp_no.l1 + Value_crp_vs.l1 + Value_iti_ti.l1 + Value_iti_uo.l1 + Value_crp_no.l2 + Value_crp_vs.l2 + Value_iti_ti.l2 + Value_iti_uo.l2 + const 
#> 
#>                   Estimate Std. Error t value Pr(>|t|)    
#> Value_crp_no.l1   -0.59956    0.18495  -3.242  0.00130 ** 
#> Value_crp_vs.l1    0.45086    0.23158   1.947  0.05232 .  
#> Value_iti_ti.l1    0.33136    0.11010   3.010  0.00280 ** 
#> Value_iti_uo.l1    1.37959    0.05043  27.355  < 2e-16 ***
#> Value_crp_no.l2   -0.41512    0.18210  -2.280  0.02321 *  
#> Value_crp_vs.l2    0.58504    0.21570   2.712  0.00700 ** 
#> Value_iti_ti.l2   -0.35270    0.11101  -3.177  0.00161 ** 
#> Value_iti_uo.l2   -0.38204    0.05090  -7.506 4.77e-13 ***
#> const           1102.69133  546.87635   2.016  0.04450 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 835.5 on 363 degrees of freedom
#> Multiple R-Squared: 0.9974,  Adjusted R-squared: 0.9973 
#> F-statistic: 1.739e+04 on 8 and 363 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no       123339        70167         7224        46686
#> Value_crp_vs        70167        73658         1860       -14500
#> Value_iti_ti         7224         1860       132104        40401
#> Value_iti_uo        46686       -14500        40401       698140
#> 
#> Correlation matrix of residuals:
#>              Value_crp_no Value_crp_vs Value_iti_ti Value_iti_uo
#> Value_crp_no      1.00000      0.73616      0.05659      0.15910
#> Value_crp_vs      0.73616      1.00000      0.01886     -0.06394
#> Value_iti_ti      0.05659      0.01886      1.00000      0.13303
#> Value_iti_uo      0.15910     -0.06394      0.13303      1.00000

# Comparing the two fits using AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion)
# Lower values indicate a better fit.

# Calculate AIC and BIC for VAR(1) model
aic_var1 <- AIC(var1_model)
bic_var1 <- BIC(var1_model)

# Calculate AIC and BIC for VAR(p) model
aic_varp <- AIC(varp_model)
bic_varp <- BIC(varp_model)

# Print AIC and BIC values
cat("AIC for VAR(1):", aic_var1, "\n")
#> AIC for VAR(1): 22116.74
cat("BIC for VAR(1):", bic_var1, "\n")
#> BIC for VAR(1): 22195.17
cat("AIC for VAR(p):", aic_varp, "\n")
#> AIC for VAR(p): 21844.84
cat("BIC for VAR(p):", bic_varp, "\n")
#> BIC for VAR(p): 21985.92

## Both AIC and BIC are lower for the VAR(2) model than for the VAR(1) model, suggesting that VAR(2) strikes a better balance between goodness of fit and model complexity.
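# Rather than comparing two fixed lag orders by hand, vars::VARselect() reports
# AIC/HQ/SC/FPE across a range of lags (a sketch; lag.max = 8 is an arbitrary cap):
lag_selection <- VARselect(ts_combined_data, lag.max = 8, type = "const")
print(lag_selection$selection)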

## Forecasting

# Produce a one-month-ahead forecast using VAR(1) model
one_month_forecast_var1 <- predict(var1_model, n.ahead = 1)

# Extract the forecasted values
forecasted_values_var1 <- one_month_forecast_var1$fcst[1]$Value_crp_no[1]

# Print the forecasted value for the next month
cat("Forecasted value for the next month using VAR 1 model:", forecasted_values_var1, "\n")
#> Forecasted value for the next month using VAR 1 model: 2241.609

# Produce a one-month-ahead forecast using VAR(p) model
one_month_forecast_varp <- predict(varp_model, n.ahead = 1)

# Extract the forecasted values
forecasted_values_varp <- one_month_forecast_varp$fcst[1]$Value_crp_no[1]

# Print the forecasted value for the next month
cat("Forecasted value for the next month using VAR p model:", forecasted_values_varp, "\n")
#> Forecasted value for the next month using VAR p model: 2190.083

# Granger Causality
causality_test_var1 <- causality(var1_model)
#> Warning in causality(var1_model): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Value_crp_no) as cause variable.
print(causality_test_var1)
#> $Granger
#> 
#>  Granger causality H0: Value_crp_no do not Granger-cause Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object var1_model
#> F-Test = 7.5892, df1 = 3, df2 = 1472, p-value = 4.89e-05
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_crp_no and Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object var1_model
#> Chi-squared = 138.41, df = 3, p-value < 2.2e-16

causality_test_varp <- causality(varp_model)
#> Warning in causality(varp_model): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Value_crp_no) as cause variable.
print(causality_test_varp)
#> $Granger
#> 
#>  Granger causality H0: Value_crp_no do not Granger-cause Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object varp_model
#> F-Test = 6.9702, df1 = 6, df2 = 1452, p-value = 2.542e-07
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_crp_no and Value_crp_vs
#>  Value_iti_ti Value_iti_uo
#> 
#> data:  VAR object varp_model
#> Chi-squared = 137.28, df = 3, p-value < 2.2e-16


bigvar_matrix <- cbind(crp_no$Value,
                       crp_vs$Value,
                       iti_ti$Value,
                       iti_uo$Value)

# Fit a sparse VAR model using Lasso regularization
sparse_var_fit <- BigVAR.fit(Y=bigvar_matrix,struct='Basic',p=2,lambda=1)[,,1]

# Describe the sparsity structure
print(sparse_var_fit)
#>           [,1]        [,2]        [,3]        [,4]        [,5]        [,6]
#> [1,] 417.59572  0.24196716 0.249637724 0.003185001 0.007121556  0.23873930
#> [2,] 222.58455  0.24856635 0.252579633 0.001598170 0.011755905  0.24551514
#> [3,] -34.94354 -0.03225582 0.054272649 0.792747721 0.133064078 -0.01802307
#> [4,] 908.21645  0.02025621 0.007536235 0.154881877 1.234473633 -0.03508725
#>             [,7]          [,8]        [,9]
#> [1,]  0.24796739 -0.0003493715 -0.01131492
#> [2,]  0.25061997 -0.0027416290 -0.01333212
#> [3,] -0.00726279  0.2051545935 -0.13180078
#> [4,]  0.05198889 -0.1821780945 -0.23351711


# Summarize the results
summary(sparse_var_fit)
#>        V1               V2                  V3                 V4          
#>  Min.   :-34.94   Min.   :-0.032256   Min.   :0.007536   Min.   :0.001598  
#>  1st Qu.:158.20   1st Qu.: 0.007128   1st Qu.:0.042589   1st Qu.:0.002788  
#>  Median :320.09   Median : 0.131112   Median :0.151955   Median :0.079033  
#>  Mean   :378.36   Mean   : 0.119633   Mean   :0.141007   Mean   :0.238103  
#>  3rd Qu.:540.25   3rd Qu.: 0.243617   3rd Qu.:0.250373   3rd Qu.:0.314348  
#>  Max.   :908.22   Max.   : 0.248566   Max.   :0.252580   Max.   :0.792748  
#>        V5                 V6                 V7                  V8           
#>  Min.   :0.007122   Min.   :-0.03509   Min.   :-0.007263   Min.   :-0.182178  
#>  1st Qu.:0.010597   1st Qu.:-0.02229   1st Qu.: 0.037176   1st Qu.:-0.047601  
#>  Median :0.072410   Median : 0.11036   Median : 0.149978   Median :-0.001545  
#>  Mean   :0.346604   Mean   : 0.10779   Mean   : 0.135828   Mean   : 0.004971  
#>  3rd Qu.:0.408417   3rd Qu.: 0.24043   3rd Qu.: 0.248631   3rd Qu.: 0.051027  
#>  Max.   :1.234474   Max.   : 0.24552   Max.   : 0.250620   Max.   : 0.205155  
#>        V9          
#>  Min.   :-0.23352  
#>  1st Qu.:-0.15723  
#>  Median :-0.07257  
#>  Mean   :-0.09749  
#>  3rd Qu.:-0.01283  
#>  Max.   :-0.01131
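# Rather than fixing lambda = 1, BigVAR can pick the penalty by rolling
# cross-validation via constructModel() + cv.BigVAR() (a sketch; the gran
# grid-depth values are an arbitrary choice):
big_model <- constructModel(bigvar_matrix, p = 2, struct = "Basic", gran = c(50, 10))
cv_fit <- cv.BigVAR(big_model)
plot(cv_fit)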

Forecasted value for the next month using the VAR(1) model: 2241.609
Forecasted value for the next month using the VAR(2) model: 2190.083

Created on 2024-04-10 with reprex v2.1.0