jlivsey / UB-sping24-time-series

3 stars 1 forks source link

Siddharth Haveliwala #9

Open siddharthhaveliwala opened 5 months ago

siddharthhaveliwala commented 5 months ago

> install.packages("tsbox")

> install.packages("xts")

> install.packages("tsibble")

> install.packages("fredr")

> install.packages("prophet")

> install.packages("signal")

library(tidyverse) library(fredr) library(dplyr) library(tidyr) library(tsbox) library(xts) library(tsibble) library(ggplot2) library(signal)

df_icnsa <- fredr(series_id = "ICNSA")

df <- read.csv("/Users/siddharthhaveliwala/Downloads/ICNSA.csv")

which(is.na(df)) sum(is.na(df))

head(df) str(df) summary(df)

df$date <- as.Date(df$DATE) df$value <- df$ICNSA

df <- select(df, -DATE, -ICNSA)

ts_plot(df)

df$month <- format(df$date, "%Y-%m") df_summary <- df %>% group_by(month) %>% summarise(mean_value = mean(value))

summary(df_summary)

df_summary$month <- as.Date(paste0(df_summary$month, "-01"), format = "%Y-%m-%d")

ts_plot(df_summary)

df$year <- format(df$date, "%Y") df_ann_summary <- df %>% group_by(year) %>% summarise(mean_value = mean(value))

df_ann_summary$year <- as.Date(paste0(df_ann_summary$year, "-01-01"), format = "%Y-%m-%d")

ts_plot(df_ann_summary)

ts_data <- ts(df$value, frequency = 52)

decomposed <- decompose(ts_data) detrended_data <- ts_data - decomposed$trend

detrended_df <- data.frame(date = time(ts_data), value = detrended_data)

View(detrended_df)

class(detrended_df$date)

sum(is.na(detrended_df))

detrended_df <- na.omit(detrended_df) detrended_df$date <- as.Date(detrended_df$date)

library(ggplot2)

plot(detrended_df, type = 'line')

have plotted the time series data but trying to implement different models to get the predictions

siddharthhaveliwala commented 4 months ago

data <- read.csv("ICNSA_new.csv")

data$DATE <- as.Date(data$DATE) data <- na.omit(data)

Created lagged variables (lagged values for 1, 2, and 3 weeks)

data$lag1 <- c(NA, head(data$ICNSA, -1)) data$lag2 <- c(NA, NA, head(data$ICNSA, -2)) data$lag3 <- c(NA, NA, NA, head(data$ICNSA, -3))

Created covid_year variable for using covid years as external regressors

data$covid_year <- ifelse(format(data$DATE, "%Y") %in% c("2020", "2021"), 1, 0)

Created time series object with weekly frequency

ts_data <- ts(data$ICNSA, frequency = 52)

autoplot(ts_data, xlab = "Date", ylab = "Value", main = "Original Time Series Data")

Fitting ARIMA model including covid years as external regressors

fit_covid <- auto.arima(ts_data, xreg = data$covid_year)

forecast_covid <- forecast(fit_covid, xreg = rep(1, 5))

autoplot(forecast_covid, xlab = "Date", ylab = "Value", main = "Forecast")

lag_matrix <- as.matrix(data[, c("lag1", "lag2", "lag3")])

Fitting ARIMA model including lagged variables

fit_lag <- auto.arima(ts_data, xreg = lag_matrix)

future_lag <- tail(ts_data, 3)

forecast_lag <- forecast(fit_lag, xreg = matrix(rep(future_lag, 5), ncol = 3, byrow = TRUE, dimnames = list(NULL, colnames(lag_matrix))), h = 5)

autoplot(forecast_lag, xlab = "Date", ylab = "Value", main = "Forecast")

print('Forecast including covid data as xreg:') print(forecast_covid) print('Forecast including lagged data as xreg:') print(forecast_lag)

siddharthhaveliwala commented 4 months ago

print('Forecast including covid data as xreg:') [1] "Forecast including covid data as xreg:" print(forecast_covid) Point Forecast Lo 80 Hi 80 Lo 95 Hi 95 58.26923 364976.2 258644.88 471307.4 202356.508 527595.8 58.28846 361668.6 181381.31 541955.9 85942.996 637394.2 58.30769 351002.6 118641.72 583363.5 -4362.695 706367.9 58.32692 341331.6 75422.16 607241.1 -65341.813 748005.1 58.34615 350291.1 63962.67 636619.6 -87610.468 788192.7 print('Forecast including lagged data as xreg:') [1] "Forecast including lagged data as xreg:" print(forecast_lag) Point Forecast Lo 80 Hi 80 Lo 95 Hi 95 58.26923 316328.9 211308.2 421349.6 155713.5 476944.2 58.28846 313163.5 208142.8 418184.2 152548.2 473778.9 58.30769 314792.9 209772.1 419813.6 154177.5 475408.2 58.32692 316079.5 211058.8 421100.2 155464.1 476694.9 58.34615 329931.3 224910.6 434952.0 169316.0 490546.7

Forecast for tomorrow including covid data as xreg is 364976.2 and using lagged data of 1,2 and 3 weeks is 316328.9 using the ARIMA model.

quicksid commented 4 months ago

Author: Siddharth Miteshkumar Haveliwala

CDA 500LEC: Special Topics: Intro. to Modern Time Series Analysis

Predicting Initial Claims with Conditional Claims as external regressor

Forecast for 22-Feb-24: 214095.3

Forecast Lo 80 Hi 80 Lo 95 Hi 95

214095.3 77059.97 351130.7 4517.815 423672.9

Begin Script

Load necessary libraries

library(ggplot2) library(plotly) library(dplyr) library(forecast) library(tseries) library(reprex) library(cowplot)

Load the datasets for Initial Claims & Continued Claims (Seasonally nonadjusted) from FRED data library

df_cc <- read.csv("/Users/siddharthhaveliwala/Documents/Spring 2024/Courses/CDA 500 - Time Series Analysis/HW/CACCLAIMS.csv") df_ic <- read.csv("/Users/siddharthhaveliwala/Documents/Spring 2024/Courses/CDA 500 - Time Series Analysis/HW/ICSA.csv")

Check the first 5 rows of data frames

head(df_cc) head(df_ic)

View the data frames

View(df_ic)

View(df_cc)

Check for missing values

sum(is.na(df_cc)) sum(is.na(df_ic))

Data pre-processing

df_cc <- subset(df_cc, CACCLAIMS != '.') df_cc$DATE <- as.Date(df_cc$DATE) df_ic$DATE <- as.Date(df_ic$DATE) df_cc$CACCLAIMS <- as.numeric(df_cc$CACCLAIMS)

Summary of data frames

summary(df_cc) summary(df_ic) str(df_ic) str(df_cc)

Plot the Continued Claims vs Initial Claims

plot(df_cc$DATE, df_cc$CACCLAIMS, type = "l", xlab = "Date", ylab = "Continued Claims", main = "Time Series Plot", col = "blue") lines(df_ic$DATE, df_ic$ICSA, type = "l", col = "red") legend("topright", legend = c("CACCLAIMS", "ICSA"), col = c("blue", "red"), lty = 1) title(main = "Time Series Plot", xlab = "Date", ylab = "Continued Claims") image

Check for stationarity using Augmented Dickey-Fuller (ADF) test

adf.test(df_cc$CACCLAIMS) adf.test(df_ic$ICSA)

Check ACF and PACF plots to determine the order of ARIMA terms

acf(df_cc$CACCLAIMS) pacf(df_cc$CACCLAIMS) acf(df_ic$ICSA) pacf(df_ic$ICSA)

Find cross-correlation between df_ic$ICSA and df_cc$CACCLAIMS

cross_correlation <- ccf(df_ic$ICSA, df_cc$CACCLAIMS, lag.max = 10, plot = FALSE)

Plot cross-correlation values

plot(cross_correlation, main = "Cross-Correlation between ICSA and CACCLAIMS") image

Merge the two datasets based on the DATE column

merged_data <- merge(df_cc, df_ic, by = "DATE", all.x = TRUE)

Check if there are any missing values and length of dataset after the merge

sum(is.na(merged_data)) length(merged_data)

Define the COVID period

covid_start_date <- as.Date("2020-03-01") covid_end_date <- as.Date("2021-06-30")

Filter out the COVID period from the merged data

merged_data_filtered <- merged_data[merged_data$DATE < covid_start_date | merged_data$DATE > covid_end_date, ] plot(merged_data_filtered$DATE, merged_data_filtered$CACCLAIMS, type = "l")

Check ACF and PACF plots to determine the order of ARIMA terms

acf(merged_data_filtered$CACCLAIMS) pacf(merged_data_filtered$CACCLAIMS) acf(merged_data_filtered$ICSA) pacf(merged_data_filtered$ICSA) image image

Find cross-correlation between merged_data$ICSA and merged_data$CACCLAIMS

cross_corr_filt <- ccf(merged_data_filtered$ICSA, merged_data_filtered$CACCLAIMS, lag.max = 10, plot = FALSE)

Plot cross-correlation values

plot(cross_corr_filt, main = "Cross-Correlation between ICSA and CACCLAIMS (filtered data)")

Apply lag 1 to the specified columns because of the ACF plot analysis

merged_data_filtered <- merged_data_filtered %>% mutate(ICSA_lag1 = lag(ICSA, 1), CACCLAIMS_lag1 = lag(CACCLAIMS, 1))

Remove NAs

merged_data_filtered <- na.omit(merged_data_filtered)

Convert the merged_data_filtered to tibble format

merged_data_filtered <- as_tibble(merged_data_filtered)

Fit a regARIMA model with filtered data

reg_arima_filtered <- auto.arima(merged_data_filtered$ICSA_lag1, xreg = merged_data_filtered$CACCLAIMS_lag1)

Generate forecasts for tomorrow's date using filtered data

forecast_filtered <- forecast(reg_arima_filtered, xreg = merged_data_filtered$CACCLAIMS, h = 1)

Print the forecast for tomorrow's date with/without filtered data

print(forecast_filtered)

Plot the forecast for tomorrow's date with filtered data

autoplot(forecast_filtered, main = "Forecast for Tomorrow's Date (Filtered Data)") image

End Script

quicksid commented 4 months ago

The provided script predicts Initial Claims (ICNSA) using Conditional Claims in California as an external regressor through an ARIMA(0,1,2) model. Loading and pre-processing the datasets included handling missing values and convering date columns. Exploratory Data Analysis involved checking summary statistics, conducting ADF tests for stationarity, and analyzing ACF and PACF plots to determine ARIMA terms. Model building encompasses cross-correlation analysis, merging datasets, filtering out the COVID-19 period, applying lag 1, and fitting a regARIMA model with filtered data. Finally, forecasts for February 22, 2024, are generated and visualized, providing forecasted values along with confidence intervals, with a value 212738.3.

quicksid commented 4 months ago

Homework 2 -

# Load necessary libraries
library(fredr)
library(ggplot2)
library(plotly)
library(dplyr)
library(hrbrthemes)
library(tidyverse)
library(splines)
library(forecast)

# FRED API Key
fredr_set_key("d40a0fc0b1555df0c6568749cc1737e6")

# Retrieve the ICNSA data from FRED
D0 <- fredr(series_id = "ICNSA")

# Analyze the loaded dataset
summary(D0)
str(D0)
head(D0)

# Load the D0 data to D1 dataframe
D1 <- D0
D1$DATE <- as.Date(D1$date)
D1$ICNSA <- D1$value

# Drop unnecessary features
D1 <- subset(D1, select = -c(date, series_id, value, realtime_start, realtime_end))

# Analyze the D1 data frame
str(D1)
head(D1)
sum(is.na(D1))

## Empirical Analysis for Covid data

# Plot the ICNSA data
plot1 <- D1 %>%
  ggplot(aes(DATE, ICNSA)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("ICNSA Count") +
  geom_rect(data=D1, inherit.aes=FALSE, aes(xmin=DATE[DATE == "2019-08-17"], xmax=DATE[DATE == "2021-03-13"], ymin=min(ICNSA),
                                               ymax=max(ICNSA)), color="transparent", fill="red", alpha=0.3) +
  theme_ipsum()

plot1 <- ggplotly(plot1)
plot1
Screenshot 2024-02-28 at 9 23 46 PM
# Statistical Analysis
summary(D1$ICNSA)

# Plot the boxplot for ICNSA data
boxplot1 <- plot_ly(D1, y = D1$ICNSA, type = "box")
boxplot1

# Plot the histogram for ICNSA data
ggplot(D1) +
  aes(x = ICNSA) +
  geom_histogram(bins = as.integer(sqrt(nrow(D1))), fill = "navyblue") +
  theme_minimal()
Screenshot 2024-02-28 at 9 26 29 PM
# Identify the outliers by quantiles (out of bound by interval from 2.5% and 97.5%)
lwr_bound <- quantile(D1$ICNSA, 0.025)
upr_bound <- quantile(D1$ICNSA, 0.975)

outliers <- which(D1$ICNSA < lwr_bound | D1$ICNSA > upr_bound)
outliers

# View the outlier's dates
D1[outliers, "DATE"]

# Define covid period
covid_period <- subset(D1, DATE >= as.Date("2019-08-17") & DATE <= as.Date("2021-03-13"))

## Implement a multiplicative and an additive Holt-Winters model

# Fit a cubic spline to the covid period data
spline_fit <- smooth.spline(x = as.numeric(covid_period$DATE), y = covid_period$ICNSA, lambda = 0.7)

# Predict the values using fitted spline
imputed_values <- predict(spline_fit, x = as.numeric(covid_period$DATE))

# Replace the values in the original dataset with the predicted values
D1$ICNSA[which(D1$DATE >= as.Date("2019-08-17") & D1$DATE <= as.Date("2021-03-13"))] <- imputed_values$y

# Visualize the original and imputed data
spline_plot <- plot_ly()
spline_plot <- add_lines(spline_plot, x = ~D1$DATE, y = ~D1$ICNSA, name = "Original", line = list(color = "blue"))
spline_plot <- add_lines(spline_plot, x = ~covid_period$DATE, y = ~imputed_values$y, name = "Imputed", line = list(color = "red"))
spline_plot <- layout(spline_plot, xaxis = list(title = "Date"), yaxis = list(title = "ICNSA"))
spline_plot
Screenshot 2024-02-28 at 9 27 13 PM
# Convert the data to time series object
ts_data <- ts(D1$ICNSA, frequency = 52)

# Fit the multiplicative Holt-Winters model
hw_multiplicative <- HoltWinters(ts_data, seasonal = "mult")

# Fit the additive Holt-Winters model
hw_additive <- HoltWinters(ts_data, seasonal = "additive")

# Forecast the next ICNSA value using hw_multiplicative and hw_additive
forecast_multiplicative <- forecast(hw_multiplicative, h = 1)
forecast_additive <- forecast(hw_additive, h = 1)

# Extract the forecasted values
next_ICNSA_multiplicative <- forecast_multiplicative$mean[length(forecast_multiplicative$mean)]
next_ICNSA_additive <- forecast_additive$mean[length(forecast_additive$mean)]

# Print the forecasted values
print(paste("Forecasted ICNSA value (Multiplicative Holt-Winters):", round(next_ICNSA_multiplicative, 2)))
print(paste("Forecasted ICNSA value (Additive Holt-Winters):", round(next_ICNSA_additive, 2)))

Forecasted ICNSA value (Multiplicative Holt-Winters): 193561.54 Forecasted ICNSA value (Additive Holt-Winters): 201703.88

quicksid commented 3 months ago

Homework 3:

# Load necessary libraries
library(fredr)
library(ggplot2)
library(plotly)
library(dplyr)
library(hrbrthemes)
library(tidyverse)
library(splines)
library(forecast)
library(tseries)
library(imputeTS)
library(legion)

# FRED API Key
fredr_set_key("d40a0fc0b1555df0c6568749cc1737e6")

# Retrieve the ICNSA data from FRED
D0 <- fredr(series_id = "ICNSA", observation_start = as.Date("1971-01-02"), observation_end = as.Date("2024-02-17"))

# Analyze the loaded dataset
summary(D0)
str(D0)
head(D0)

# Load the D0 data to D1 dataframe
D1 <- D0
D1$DATE <- as.Date(D1$date)
D1$ICNSA <- D1$value

# Drop unnecessary features
D1 <- subset(D1, select = -c(date, series_id, value, realtime_start, realtime_end))

# Analyze the D1 data frame
str(D1)
head(D1)
sum(is.na(D1))

## Empirical Analysis for Covid data

# Plot the ICNSA data
plot1 <- D1 %>%
  ggplot(aes(DATE, ICNSA)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("ICNSA Count") +
  geom_rect(data=D1, inherit.aes=FALSE, aes(xmin=DATE[DATE == "2020-03-14"], xmax=DATE[DATE == "2020-11-28"], ymin=min(ICNSA),
                                            ymax=max(ICNSA)), color="transparent", fill="red", alpha=0.3) +
  theme_ipsum()

plot1 <- ggplotly(plot1)
plot1
image
# Identify the outliers by quantiles (out of bound by interval from 2.5% and 97.5%)
lwr_bound <- quantile(D1$ICNSA, 0.025)
upr_bound <- quantile(D1$ICNSA, 0.975)

outliers <- which(D1$ICNSA < lwr_bound | D1$ICNSA > upr_bound)
outliers

# Define covid period
covid_period <- subset(D1, DATE >= as.Date("2020-03-14") & DATE <= as.Date("2020-11-28"))

# Replace values with NA
covid_period$ICNSA <- NA

# Replace covid period data in D1 data frame
D1$ICNSA[which(D1$DATE >= as.Date("2020-03-14") & D1$DATE <= as.Date("2020-11-28"))] <- covid_period$ICNSA

# Convert the D1 data frame to time series object
ts_data <- ts(D1$ICNSA, frequency = 52)
sum(is.na(ts_data))

# Plot the ts_data NA distributions
ggplot_na_distribution(ts_data)
statsNA(ts_data)

# Apply Kalman filter on the time series
D1_imp <- na_kalman(ts_data, smooth = TRUE)

# Plot the imputation plot
ggplot_na_imputations(ts_data, D1_imp)
image
# Build structural time series models (level, trend, BSM)
str_model_level <- StructTS(D1_imp, type = "level")
str_model_trend <- StructTS(D1_imp, type = "trend")
str_model_bsm <- StructTS(D1_imp, type = "BSM")

# Forecast the values for built models
str_forecast_level <- forecast(str_model_level, h = 1)
str_forecast_trend <- forecast(str_model_trend, h = 1)
str_forecast_bsm <- forecast(str_model_bsm, h = 1)

# Print the values for forecasts of structural models
print(str_forecast_level)
print(str_forecast_trend)
print(str_forecast_bsm)

# Retrieve the IURNSA (Initial Unemployment Claims Non-Seasonally Adjusted) data from FRED
D2 <- fredr(series_id = "IURNSA")

# Statistical Analysis of IURNSA data
summary(D2)
str(D2)
D2$DATE <- as.Date(D2$date)
D2$IURNSA <- D2$value
D2 <- subset(D2, select = -c(date, series_id, value, realtime_start, realtime_end))
sum(is.na(D2))

# Plot the ICNSA data
plot2 <- D2 %>%
  ggplot(aes(DATE, IURNSA)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("IURNSA Count") +
  geom_rect(data=D2, inherit.aes=FALSE, aes(xmin=DATE[DATE == "2020-03-14"], xmax=DATE[DATE == "2020-11-28"], ymin=min(IURNSA),
                                            ymax=max(IURNSA)), color="transparent", fill="red", alpha=0.3) +
  theme_ipsum()
plot2 <- ggplotly(plot2)
plot2
image
# Define covid period for IURNSA data
covid_period_2 <- subset(D2, DATE >= as.Date("2020-03-14") & DATE <= as.Date("2020-11-28"))

# Replace values with NA in covid_period_2
covid_period_2$IURNSA <- NA

# Replace covid period data in D2 data frame
D2$IURNSA[which(D2$DATE >= as.Date("2020-03-14") & D2$DATE <= as.Date("2020-11-28"))] <- covid_period_2$IURNSA

# Convert the D2 data frame to time series object
ts_data_2 <- ts(D2$IURNSA, frequency = 52)

# Plot the ts_data_2 NA distributions
ggplot_na_distribution(ts_data_2)
ggplot_na_distribution2(ts_data_2)
ggplot_na_gapsize(ts_data_2)
statsNA(ts_data_2)

# Apply Kalman filter on the time series
D2_imp <- na_kalman(ts_data_2, smooth = TRUE)

# Plot the imputation plot
ggplot_na_imputations(ts_data_2, D2_imp)
image
# Convert the D2_imp data frame to time series object
ts_data_2 <- ts(D2_imp, frequency = 52)
ts_data_2

# Convert the D1_imp data frame to time series object
ts_data_1 <- ts(D1_imp, frequency = 52)
ts_data_1

# Union of two time series into a multivariate time series
multivariate_ts <- ts.union(ts_data_1, ts_data_2)

# Check for NA values
sum(is.na(multivariate_ts))

# Fit the model based on covariate series using ETS (Error, Trend and Seasonal) based multivariate forecasting model
covariate_model <- auto.vets(multivariate_ts)

# Summary of the model
summary(covariate_model)

# Forecast ICNSA using covariate series IURNSA
forecast <- forecast(covariate_model, h = 1)

# Print the forecasted value
print(forecast)

# > print(forecast)
# Time Series:
# Start = c(54, 18) 
# End = c(54, 18) 
# Frequency = 52 
#       ts_data_1  ts_data_2
# 54.32692 188821.4  1.476452

Forecasted ICNSA value : 188821.4 Forecasted IURNSA value : 1.476452

quicksid commented 2 months ago

Homework 4 (VAR(1), VAR(p) & BigVAR) :

# Load necessary libraries
library(lubridate)
library(stringr)
library(ggplot2)
library(plotly)
library(dplyr)
library(tseries)
library(imputeTS)
library(vars)
library(forecast)
library(corrplot)
library(BigVAR)
library(reprex)

# Set working directory
setwd("/Users/siddharthhaveliwala/Documents/Spring 2024/Courses/CDA 500 - Time Series Analysis/HW4/Data/input/Tabacco")

# Get 12BVS time series data
tobacco_vs <- read.csv("12BVS.csv")

# Data preprocessing
tobacco_vs <- tobacco_vs[-(1:7),]
tobacco_vs <- subset(tobacco_vs, select = c('X','X.1'))

names(tobacco_vs)[names(tobacco_vs) == "X"] <- "Period"
names(tobacco_vs)[names(tobacco_vs) == "X.1"] <- "Value"

#View(tobacco_vs)
sum(is.na(tobacco_vs))
tobacco_vs <- na.omit(tobacco_vs)
tobacco_vs$Value <- gsub('[,]', '', tobacco_vs$Value)
tobacco_vs$Period <- mdy(tobacco_vs$Period)
tobacco_vs$Value <- as.numeric(tobacco_vs$Value)

# Analyze the dataset
summary(tobacco_vs)
str(tobacco_vs)

# Get 12BTI time series data

tobacco_ti <- read.csv("12BTI.csv")

# Data preprocessing
tobacco_ti <- tobacco_ti[-(1:7),]
tobacco_ti <- subset(tobacco_ti, select = c('X','X.1'))
names(tobacco_ti)[names(tobacco_ti) == "X"] <- "Period"
names(tobacco_ti)[names(tobacco_ti) == "X.1"] <- "Value"
tobacco_ti$Value <- gsub('[,]', '', tobacco_ti$Value)

#View(tobacco_ti)
sum(is.na(tobacco_ti))
tobacco_ti <- na.omit(tobacco_ti)
tobacco_ti$Period <- mdy(tobacco_ti$Period)
tobacco_ti$Value <- as.numeric(tobacco_ti$Value)

# Analyze the dataset
summary(tobacco_ti)
str(tobacco_ti)

# Get 12STI time series data

bev_tab_ti <- read.csv("12STI.csv")

# Data preprocessing
bev_tab_ti <- bev_tab_ti[-(1:7),]
bev_tab_ti <- subset(bev_tab_ti, select = c('X','X.1'))
names(bev_tab_ti)[names(bev_tab_ti) == "X"] <- "Period"
names(bev_tab_ti)[names(bev_tab_ti) == "X.1"] <- "Value"
bev_tab_ti$Value <- gsub('[,]', '', bev_tab_ti$Value)

#View(bev_tab_ti)
sum(is.na(bev_tab_ti))
bev_tab_ti <- na.omit(bev_tab_ti)
bev_tab_ti$Period <- mdy(bev_tab_ti$Period)
bev_tab_ti$Value <- as.numeric(bev_tab_ti$Value)

# Analyze the dataset
summary(bev_tab_ti)
str(bev_tab_ti)

# Get 12SVS time series data

bev_tab_vs <- read.csv("12SVS.csv")

# Data Preprocessing
bev_tab_vs <- bev_tab_vs[-(1:7),]
bev_tab_vs <- subset(bev_tab_vs, select = c('X','X.1'))
names(bev_tab_vs)[names(bev_tab_vs) == "X"] <- "Period"
names(bev_tab_vs)[names(bev_tab_vs) == "X.1"] <- "Value"
bev_tab_vs$Value <- gsub('[,]', '', bev_tab_vs$Value)

#View(bev_tab_vs)
sum(is.na(bev_tab_vs))
bev_tab_vs <- na.omit(bev_tab_vs)
bev_tab_vs$Period <- mdy(bev_tab_vs$Period)
bev_tab_vs$Value <- as.numeric(bev_tab_vs$Value)

# Analyze the dataset
summary(bev_tab_vs)
str(bev_tab_vs)

# Plot the tobacco_vs data
plot1 <- tobacco_vs %>%
  ggplot(aes(Period, Value)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("Tobacco Manufacturing Value of Shipments Count") +
  theme_minimal()

plot1 <- ggplotly(plot1)
plot1

# Plot the tobacco_ti data
plot2 <- tobacco_ti %>%
  ggplot(aes(Period, Value)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("Tobacco Manufacturing Total Inventories Count") +
  theme_minimal()

plot2 <- ggplotly(plot2)
plot2

# Plot the bev_tab_vs data
plot3 <- bev_tab_vs %>%
  ggplot(aes(Period, Value)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("Beverages and Tobacco Products Value of Shipments Count") +
  theme_minimal()

plot3 <- ggplotly(plot3)
plot3

# Plot the bev_tab_ti data
plot4 <- bev_tab_ti %>%
  ggplot(aes(Period, Value)) +
  geom_area(fill="blue", alpha=0.5) +
  geom_line(color="blue") +
  ylab("Beverages and Tobacco Products Total Inventories Count") +
  theme_minimal()

plot4 <- ggplotly(plot4)
plot4

# Convert to Time Series
ts_tobacco_vs <- ts(tobacco_vs$Value, frequency = 12)
ts_tobacco_ti <- ts(tobacco_ti$Value, frequency = 12)
ts_bev_tab_ti <- ts(bev_tab_ti$Value, frequency = 12)
ts_bev_tab_vs <- ts(bev_tab_vs$Value, frequency = 12)

# Union the data into multivariate time series object
multivariate_ts <- ts.union(ts_bev_tab_vs, ts_tobacco_vs, ts_tobacco_ti, ts_bev_tab_ti)

#View(multivariate_ts)

# Plot the multivariate object
autoplot(multivariate_ts)
Screenshot 2024-04-10 at 9 29 41 PM
# Plot the correlations of the multivariate object
corr_multivariate_ts <- cor(multivariate_ts)
corrplot(corr_multivariate_ts, method = "number")
Screenshot 2024-04-10 at 9 31 17 PM
# Determine the persistence of the Model by ACF and PACF plots
par(mfrow = c(2,2))
acf(ts_bev_tab_vs, main = "ACF for Beverage & Tobacco - Value of Shipments")
acf(ts_tobacco_vs, main = "ACF for Tobacco Manufacturing - Value of Shipments")
acf(ts_tobacco_ti, main = "ACF for Tobacco Manufacturing - Total Inventories")
acf(ts_bev_tab_ti, main = "ACF for Beverage & Tobacco Products - Total Inventories")

pacf(ts_bev_tab_vs, main = "PACF for Beverage & Tobacco - Value of Shipments")
pacf(ts_tobacco_vs, main = "PACF for Tobacco Manufacturing - Value of Shipments")
pacf(ts_tobacco_ti, main = "PACF for Tobacco Manufacturing - Total Inventories")
pacf(ts_bev_tab_ti, main = "PACF for Beverage & Tobacco Products - Total Inventories")
Screenshot 2024-04-10 at 9 33 38 PM Screenshot 2024-04-10 at 10 09 26 PM
# Finding the optimal lags
lagselect <- VARselect(multivariate_ts, lag.max = 10, type = "const")
lagselect$selection
#Lag = 3

# Build the VAR(1) model
var_model_1 <- VAR(multivariate_ts, p = 1, type = "const", season = NULL, exog = NULL)
summary(var_model_1)

# Diagnose VAR(1) model
serial_var_1 <- serial.test(var_model_1, lags.pt = 12, type = "PT.asymptotic")
serial_var_1

# Heteroskedasticy check (arch effects)
arch_var_1 <- arch.test(var_model_1, lags.multi = 12, multivariate.only = TRUE)
arch_var_1

# Normal Distribution of Residuals
norm_var_1 <- normality.test(var_model_1, multivariate.only = TRUE)
norm_var_1

# Testing for structural breaks in Residuals (model stability test)
stability_var_1 <- stability(var_model_1, type = "OLS-CUSUM")
par(mar=c(1,1,1,1))
plot(stability_var_1)
Screenshot 2024-04-10 at 9 34 52 PM
# Granger's causality test
granger_tobacco_ti <- causality(var_model_1, cause = 'ts_tobacco_ti')
granger_tobacco_ti
granger_bev_tab_vs <- causality(var_model_1, cause = 'ts_bev_tab_vs')
granger_bev_tab_vs
granger_bev_tab_ti <- causality(var_model_1, cause = 'ts_bev_tab_ti')
granger_bev_tab_ti
granger_tobacco_vs <- causality(var_model_1, cause = 'ts_tobacco_vs')
granger_tobacco_vs

# Impluse Response Functions
inf_var_model_1 <- irf(var_model_1, impulse = "ts_tobacco_ti", response = c("ts_tobacco_vs", "ts_bev_tab_vs", "ts_bev_tab_ti"), boot = TRUE, n.ahead = 10)
plot(inf_var_model_1, main = "Shock from Tobacco Products - Total Inventories")
Screenshot 2024-04-10 at 9 50 45 PM
# Variance Decomposition
fevd_var_1 <- fevd(var_model_1, n.ahead = 10)
plot(fevd_var_1)
Screenshot 2024-04-10 at 9 39 12 PM
# VAR(1) Forecasting
forecast_var_1 <- predict(var_model_1, n.ahead = 12, ci = 0.95)
fanchart(forecast_var_1, names = "ts_bev_tab_vs")
fanchart(forecast_var_1, names = "ts_tobacco_ti")
fanchart(forecast_var_1, names = "ts_tobacco_vs")
fanchart(forecast_var_1, names = "ts_bev_tab_ti")

# Print the forecast using VAR(1)
print(forecast_var_1)

# Forecast with VAR 1 model
# ts_bev_tab_vs : 18018.41
# ts_tobacco_vs : 6588.687
# ts_tobacco_ti : 3594.108
# ts_bev_tab_ti : 30838.32

# Build the VAR(p) model
var_model_p <- VAR(multivariate_ts, p = 3, type = "const", season = NULL, exog = NULL)
summary(var_model_p)

# Diagnose VAR(1) model
serial_var_p <- serial.test(var_model_p, lags.pt = 12, type = "PT.asymptotic")
serial_var_p

# Heteroskedasticy (arch effects)
arch_var_p <- arch.test(var_model_p, lags.multi = 12, multivariate.only = TRUE)
arch_var_p

# Normal Distribution of Residuals
norm_var_p <- normality.test(var_model_p, multivariate.only = TRUE)
norm_var_p

# Testing for structural breaks in Residuals (model stability test)
stability_var_p <- stability(var_model_p, type = "OLS-CUSUM")
par(mar=c(1,1,1,1))
plot(stability_var_p)
Screenshot 2024-04-10 at 9 47 56 PM
# Granger's causality test
granger_tobacco_ti_p <- causality(var_model_p, cause = 'ts_tobacco_ti')
granger_tobacco_ti_p
granger_bev_tab_vs_p <- causality(var_model_p, cause = 'ts_bev_tab_vs')
granger_bev_tab_vs_p
granger_bev_tab_ti_p <- causality(var_model_p, cause = 'ts_bev_tab_ti')
granger_bev_tab_ti_p
granger_tobacco_vs_p <- causality(var_model_p, cause = 'ts_tobacco_vs')
granger_tobacco_vs_p

# Impluse Response Functions
inf_var_model_p <- irf(var_model_p, impulse = "ts_tobacco_ti", response = c("ts_tobacco_vs", "ts_bev_tab_vs", "ts_bev_tab_ti"), boot = TRUE, n.ahead = 10)
plot(inf_var_model_p, main = "Shock from Tobacco Products - Total Inventories")
Screenshot 2024-04-10 at 9 51 57 PM
# Variance Decomposition
fevd_var_p <- fevd(var_model_p, n.ahead = 10)
plot(fevd_var_p)
Screenshot 2024-04-10 at 9 52 38 PM
# VAR Forecasting
forecast_var_p <- predict(var_model_p, n.ahead = 1, ci = 0.95)
fanchart(forecast_var_p, names = "ts_bev_tab_vs")
fanchart(forecast_var_p, names = "ts_tobacco_ti")
fanchart(forecast_var_p, names = "ts_tobacco_vs")
fanchart(forecast_var_p, names = "ts_bev_tab_ti")

# Print the forecasts for VAR(p) model
print(forecast_var_p)

# Forecast with VAR(p) model
# ts_bev_tab_vs : 18479.15
# ts_tobacco_vs : 7033.018
# ts_tobacco_ti : 3580.238
# ts_bev_tab_ti : 30819.38

# BigVAR
# T = 386

# Fit the BIGVAR(3) model on the multivariate time series
bigvar_model_p <- constructModel(as.matrix(multivariate_ts), p = 3, struct = "SparseLag", gran = c(25,10), verbose = FALSE, h = 5, IC = TRUE)

# Save the BIGVAR model results
results_bigvar <- cv.BigVAR(bigvar_model_p)

# Print the results
results_bigvar

# Plot the BIGVAR results
plot(results_bigvar)
Screenshot 2024-04-10 at 9 54 56 PM

SparsityPlot.BigVAR.results(results_bigvar)

Screenshot 2024-04-10 at 9 55 33 PM
# Predict the forecast using BIGVAR(3) model
forecast_bigvar <- predict(results_bigvar, n.ahead = 1)
forecast_bigvar

# Forecast with BigVAR SparseOO model

# ts_bev_tab_vs : 18199.354
# ts_tobacco_vs : 6645.381
# ts_tobacco_ti : 3371.634
# ts_bev_tab_ti : 30844.491

#####################################
# FORECASTS - VAR(1), VAR(p) & BIGVAR
#####################################

# Forecast with VAR 1 model
# ts_bev_tab_vs : 18018.41
# ts_tobacco_vs : 6588.687
# ts_tobacco_ti : 3594.108
# ts_bev_tab_ti : 30838.32

# Forecast with VAR(p) model
# ts_bev_tab_vs : 18479.15
# ts_tobacco_vs : 7033.018
# ts_tobacco_ti : 3580.238
# ts_bev_tab_ti : 30819.38

# Forecast with BigVAR SparseLag model
# ts_bev_tab_vs : 18199.354
# ts_tobacco_vs : 6645.381
# ts_tobacco_ti : 3371.634
# ts_bev_tab_ti : 30844.491

#####################################
Tobacco Production - Value of Shipments - 6588.687, 7033.018, 6645.381
Tobacco Production - Total Inventories - 3594.108, 3580.238, 3371.634
Beverages & Tobacco Products - Value of Shipments - 18018.41, 18479.15, 18199.354
Beverages & Tobacco Products - Total Inventories - 30838.32, 30819.38, 30844.491