# HW1
# All packages
library(fredr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(dygraphs)
library(xts)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)

fredr_set_key("052142bc981666b4ebcb1c8df98d006b")
# Reading Data

icnsa_data = fredr(series_id = "ICNSA")

# Date column conversion to date data type
icnsa_data$date <- as.Date(icnsa_data$date)

# Updataing dataframe to remove other columns and keep only date and value
updated_icnsa_df <- icnsa_data %>%
  select(date, value) %>%
  na.omit()

# Autocorrelation Plot for dataframe
acf(updated_icnsa_df$value)


# Partial Autocorrelation Plot for dataframe
pacf(updated_icnsa_df$value)


str(updated_icnsa_df)
#> tibble [2,979 × 2] (S3: tbl_df/tbl/data.frame)
#>  $ date : Date[1:2979], format: "1967-01-07" "1967-01-14" ...
#>  $ value: num [1:2979] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...

# Ensuring value column as a numeric datatype
updated_icnsa_df$value <- as.numeric(updated_icnsa_df$value)
# Creating a timeseries object with frequency 7 as weekly data
ts_updated_icnsa_df <- ts(updated_icnsa_df$value, frequency = 7)

# Fitting an ARIMA model using auto.arima function. We can also specify p,q,d values using above acf and pcf plots.
arima_model <- auto.arima(ts_updated_icnsa_df)

summary(arima_model)
#> Series: ts_updated_icnsa_df 
#> ARIMA(2,0,3) with non-zero mean 
#> 
#> Coefficients:
#>          ar1      ar2      ma1      ma2      ma3       mean
#>       1.7297  -0.7371  -0.3852  -0.3309  -0.1667  367051.35
#> s.e.  0.0403   0.0370   0.0433   0.0228   0.0218   25043.92
#> 
#> sigma^2 = 7.487e+09:  log likelihood = -38091.08
#> AIC=76196.15   AICc=76196.19   BIC=76238.15
#> 
#> Training set error measures:
#>                     ME    RMSE      MAE       MPE     MAPE      MASE
#> Training set -180.7655 86438.5 41253.75 -1.831697 10.61079 0.4306737
#>                      ACF1
#> Training set -0.003548767

# Forcasting the value
forecast_values <- forecast(arima_model, h = 1)
plot(forecast_values, main = "ARIMA Forecast",
     xlab = "Date", ylab = "Number of Cases")

forecast_mean <- mean(forecast_values$mean, na.rm = TRUE)
print(forecast_mean)
#> [1] 225616

^{Created on 2024-02-09 with reprex v2.1.0}

Forecasted value for current week is 225616

# HW2
# All packages
library(fredr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(dygraphs)
library(xts)
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(reprex)

fredr_set_key("052142bc981666b4ebcb1c8df98d006b")
# Reading Data

icnsa_data = fredr(series_id = "ICNSA")

# Date column conversion to date data type
icnsa_data$date <- as.Date(icnsa_data$date)

# Updataing dataframe to remove other columns and keep only date and value
updated_icnsa_df <- icnsa_data %>%
  select(date, value) %>%
  na.omit()

# Define the COVID-19 period
covid_start <- as.Date("2020-01-01")
covid_end <- as.Date("2022-12-31")

icnsa_no_covid <- subset(updated_icnsa_df, date < covid_start | date > covid_end)

# Autocorrelation Plot for dataframe
acf(icnsa_no_covid$value)


# Partial Autocorrelation Plot for dataframe
pacf(icnsa_no_covid$value)


str(icnsa_no_covid)
#> tibble [2,822 × 2] (S3: tbl_df/tbl/data.frame)
#>  $ date : Date[1:2822], format: "1967-01-07" "1967-01-14" ...
#>  $ value: num [1:2822] 346000 334000 277000 252000 274000 276000 247000 248000 326000 240000 ...

# Ensuring value column as a numeric datatype
icnsa_no_covid$value <- as.numeric(icnsa_no_covid$value)
# Creating a timeseries object with frequency 7 as weekly data
ts_updated_icnsa_df_no_covid <- ts(icnsa_no_covid$value, frequency = 7)

# Fitting an ARIMA model using auto.arima function. We can also specify p,q,d values using above acf and pcf plots.
arima_model_no_covid <- auto.arima(ts_updated_icnsa_df_no_covid)

summary(arima_model_no_covid)
#> Series: ts_updated_icnsa_df_no_covid 
#> ARIMA(2,1,2) 
#> 
#> Coefficients:
#>          ar1      ar2      ma1     ma2
#>       0.9555  -0.0637  -1.1942  0.2068
#> s.e.  0.1222   0.1030   0.1204  0.1179
#> 
#> sigma^2 = 2.477e+09:  log likelihood = -34510.67
#> AIC=69031.34   AICc=69031.36   BIC=69061.07
#> 
#> Training set error measures:
#>                     ME     RMSE     MAE       MPE     MAPE      MASE
#> Training set -146.1736 49722.01 33579.3 -1.452661 9.090663 0.4276818
#>                      ACF1
#> Training set 0.0005209598

# Forcasting the value
forecast_values <- forecast(arima_model_no_covid, h = 1)
plot(forecast_values, main = "ARIMA Forecast",
     xlab = "Date", ylab = "Number of Cases")

forecast_mean <- mean(forecast_values$mean, na.rm = TRUE)
print(forecast_mean)
#> [1] 235751.1

^{Created on 2024-02-09 with reprex v2.1.0}

Forecasted value for current week after removing covid data is 235751.1

# Install the fredr package 
library(reprex)
if (!requireNamespace("fredr", quietly = TRUE)) install.packages("fredr")
library(ggplot2)
# Load the fredr library
library(fredr)

# Set your FRED API key
fredr_set_key("f4e804771fd1a4241706e9c4498f0839") # Your FRED API key

# Fetch the most current Initial Claims (ICNSA) data
initial_claims <- fredr(series_id = "ICNSA", observation_start = as.Date("2022-01-01"))

# Date column conversion to date data type
icnsa_data$date <- as.Date(icnsa_data$date)
#> Error in eval(expr, envir, enclos): object 'icnsa_data' not found

# Viewing the data
head(initial_claims)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 2022-01-01 ICNSA     314796 2024-02-22     2024-02-22  
#> 2 2022-01-08 ICNSA     415162 2024-02-22     2024-02-22  
#> 3 2022-01-15 ICNSA     337075 2024-02-22     2024-02-22  
#> 4 2022-01-22 ICNSA     267637 2024-02-22     2024-02-22  
#> 5 2022-01-29 ICNSA     258288 2024-02-22     2024-02-22  
#> 6 2022-02-05 ICNSA     231452 2024-02-22     2024-02-22

#plot it!
ggplot(initial_claims, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Initial Claims Over Time", x = "Date", y = "Initial Claims (ICNSA)") +
  theme_minimal()


# Fetch the most current Continued Claims (Not Seasonally Adjusted) data
continued_claims_nsadj <- fredr(series_id = "CCNSA", observation_start = as.Date("2020-01-01"))

# View the data
head(continued_claims_nsadj)
#> # A tibble: 6 × 5
#>   date       series_id   value realtime_start realtime_end
#>   <date>     <chr>       <dbl> <date>         <date>      
#> 1 2020-01-04 CCNSA     2251286 2024-02-22     2024-02-22  
#> 2 2020-01-11 CCNSA     2142711 2024-02-22     2024-02-22  
#> 3 2020-01-18 CCNSA     2082416 2024-02-22     2024-02-22  
#> 4 2020-01-25 CCNSA     2154866 2024-02-22     2024-02-22  
#> 5 2020-02-01 CCNSA     2088693 2024-02-22     2024-02-22  
#> 6 2020-02-08 CCNSA     2100473 2024-02-22     2024-02-22
# Assuming your dataframe is named continued_claims_nsadj and the date column is named "date"
continued_claims_nsadj$date <- as.Date(continued_claims_nsadj$date)

#plot it!
ggplot(continued_claims_nsadj, aes(x = date, y = value)) +
  geom_line(color = "red") +
  labs(title = "Continued Claims (Not Seasonally Adjusted) Over Time", x = "Date", y = "Continued Claims (CCNSA)") +
  theme_minimal()


#empirical data analysis 

# a) ADF test
# Install the tseries package if you haven't already
if (!requireNamespace("tseries", quietly = TRUE)) install.packages("tseries")
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo

# Load the tseries library
library(tseries)

# ADF test for Initial Claims (ICNSA)
adf_test_icnsa <- adf.test(initial_claims$value, alternative = "stationary")
print(adf_test_icnsa)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  initial_claims$value
#> Dickey-Fuller = -3.2143, Lag order = 4, p-value = 0.08907
#> alternative hypothesis: stationary

# ADF test for Continued Claims - Not Seasonally Adjusted (CCNSA)
adf_test_ccnsa <- adf.test(continued_claims_nsadj$value, alternative = "stationary")
print(adf_test_ccnsa)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  continued_claims_nsadj$value
#> Dickey-Fuller = -3.3689, Lag order = 5, p-value = 0.061
#> alternative hypothesis: stationary

#b) decomposition

# Ensure your time series data is in a ts object with an appropriate frequency
# For weekly data, you might approximate the frequency as 52
icnsa_ts <- ts(initial_claims$value, frequency = 52)
ccnsa_ts <- ts(continued_claims_nsadj$value, frequency = 52)

# Decompose using STL for ICNSA
library(stats)
decomp_icnsa <- stl(icnsa_ts, s.window = "periodic", robust = TRUE)
plot(decomp_icnsa)


# Decompose using STL for CCNSA
decomp_ccnsa <- stl(ccnsa_ts, s.window = "periodic", robust = TRUE)
plot(decomp_ccnsa)


#c) ACF & PCF PLOTTING
# Load necessary library for ACF and PACF plotting
library(stats)

# ACF and PACF for ICNSA
# Autocorrelation Function (ACF)
acf(icnsa_ts, main = "ACF for ICNSA")


# Partial Autocorrelation Function (PACF)
pacf(icnsa_ts, main = "PACF for ICNSA")


# ACF and PACF for CCNSA
# Autocorrelation Function (ACF)
acf(ccnsa_ts, main = "ACF for CCNSA")


# Partial Autocorrelation Function (PACF)
pacf(ccnsa_ts, main = "PACF for CCNSA")


#####STEP2 MERGER OF THE two data sets choosen

# Ensure all date columns are in the correct date format
initial_claims$date <- as.Date(initial_claims$date)
continued_claims_nsadj$date <- as.Date(continued_claims_nsadj$date)

# Then merge the resulting data frame with continued_claims_nsadj
final_merged_data <- merge(initial_claims, continued_claims_nsadj, by = "date", all = TRUE)

# View the first few rows of the final merged dataset
head(final_merged_data)
#>         date series_id.x value.x realtime_start.x realtime_end.x series_id.y
#> 1 2020-01-04        <NA>      NA             <NA>           <NA>       CCNSA
#> 2 2020-01-11        <NA>      NA             <NA>           <NA>       CCNSA
#> 3 2020-01-18        <NA>      NA             <NA>           <NA>       CCNSA
#> 4 2020-01-25        <NA>      NA             <NA>           <NA>       CCNSA
#> 5 2020-02-01        <NA>      NA             <NA>           <NA>       CCNSA
#> 6 2020-02-08        <NA>      NA             <NA>           <NA>       CCNSA
#>   value.y realtime_start.y realtime_end.y
#> 1 2251286       2024-02-22     2024-02-22
#> 2 2142711       2024-02-22     2024-02-22
#> 3 2082416       2024-02-22     2024-02-22
#> 4 2154866       2024-02-22     2024-02-22
#> 5 2088693       2024-02-22     2024-02-22
#> 6 2100473       2024-02-22     2024-02-22

cleandata <- na.omit(final_merged_data)
cleandata
#>           date series_id.x value.x realtime_start.x realtime_end.x series_id.y
#> 105 2022-01-01       ICNSA  314796       2024-02-22     2024-02-22       CCNSA
#> 106 2022-01-08       ICNSA  415162       2024-02-22     2024-02-22       CCNSA
#> 107 2022-01-15       ICNSA  337075       2024-02-22     2024-02-22       CCNSA
#> 108 2022-01-22       ICNSA  267637       2024-02-22     2024-02-22       CCNSA
#> 109 2022-01-29       ICNSA  258288       2024-02-22     2024-02-22       CCNSA
#> 110 2022-02-05       ICNSA  231452       2024-02-22     2024-02-22       CCNSA
#> 111 2022-02-12       ICNSA  240502       2024-02-22     2024-02-22       CCNSA
#> 112 2022-02-19       ICNSA  216574       2024-02-22     2024-02-22       CCNSA
#> 113 2022-02-26       ICNSA  196537       2024-02-22     2024-02-22       CCNSA
#> 114 2022-03-05       ICNSA  220486       2024-02-22     2024-02-22       CCNSA
#> 115 2022-03-12       ICNSA  204494       2024-02-22     2024-02-22       CCNSA
#> 116 2022-03-19       ICNSA  182581       2024-02-22     2024-02-22       CCNSA
#> 117 2022-03-26       ICNSA  197128       2024-02-22     2024-02-22       CCNSA
#> 118 2022-04-02       ICNSA  194671       2024-02-22     2024-02-22       CCNSA
#> 119 2022-04-09       ICNSA  223751       2024-02-22     2024-02-22       CCNSA
#> 120 2022-04-16       ICNSA  197219       2024-02-22     2024-02-22       CCNSA
#> 121 2022-04-23       ICNSA  202952       2024-02-22     2024-02-22       CCNSA
#> 122 2022-04-30       ICNSA  196422       2024-02-22     2024-02-22       CCNSA
#> 123 2022-05-07       ICNSA  185815       2024-02-22     2024-02-22       CCNSA
#> 124 2022-05-14       ICNSA  199631       2024-02-22     2024-02-22       CCNSA
#> 125 2022-05-21       ICNSA  187695       2024-02-22     2024-02-22       CCNSA
#> 126 2022-05-28       ICNSA  183446       2024-02-22     2024-02-22       CCNSA
#> 127 2022-06-04       ICNSA  186568       2024-02-22     2024-02-22       CCNSA
#> 128 2022-06-11       ICNSA  205890       2024-02-22     2024-02-22       CCNSA
#> 129 2022-06-18       ICNSA  206272       2024-02-22     2024-02-22       CCNSA
#> 130 2022-06-25       ICNSA  207061       2024-02-22     2024-02-22       CCNSA
#> 131 2022-07-02       ICNSA  218886       2024-02-22     2024-02-22       CCNSA
#> 132 2022-07-09       ICNSA  243485       2024-02-22     2024-02-22       CCNSA
#> 133 2022-07-16       ICNSA  237092       2024-02-22     2024-02-22       CCNSA
#> 134 2022-07-23       ICNSA  200929       2024-02-22     2024-02-22       CCNSA
#> 135 2022-07-30       ICNSA  196116       2024-02-22     2024-02-22       CCNSA
#> 136 2022-08-06       ICNSA  196308       2024-02-22     2024-02-22       CCNSA
#> 137 2022-08-13       ICNSA  187391       2024-02-22     2024-02-22       CCNSA
#> 138 2022-08-20       ICNSA  179220       2024-02-22     2024-02-22       CCNSA
#> 139 2022-08-27       ICNSA  173867       2024-02-22     2024-02-22       CCNSA
#> 140 2022-09-03       ICNSA  172835       2024-02-22     2024-02-22       CCNSA
#> 141 2022-09-10       ICNSA  152144       2024-02-22     2024-02-22       CCNSA
#> 142 2022-09-17       ICNSA  168701       2024-02-22     2024-02-22       CCNSA
#> 143 2022-09-24       ICNSA  153821       2024-02-22     2024-02-22       CCNSA
#> 144 2022-10-01       ICNSA  167378       2024-02-22     2024-02-22       CCNSA
#> 145 2022-10-08       ICNSA  198340       2024-02-22     2024-02-22       CCNSA
#> 146 2022-10-15       ICNSA  178773       2024-02-22     2024-02-22       CCNSA
#> 147 2022-10-22       ICNSA  184420       2024-02-22     2024-02-22       CCNSA
#> 148 2022-10-29       ICNSA  185973       2024-02-22     2024-02-22       CCNSA
#> 149 2022-11-05       ICNSA  206079       2024-02-22     2024-02-22       CCNSA
#> 150 2022-11-12       ICNSA  200237       2024-02-22     2024-02-22       CCNSA
#> 151 2022-11-19       ICNSA  248916       2024-02-22     2024-02-22       CCNSA
#> 152 2022-11-26       ICNSA  199323       2024-02-22     2024-02-22       CCNSA
#> 153 2022-12-03       ICNSA  287976       2024-02-22     2024-02-22       CCNSA
#> 154 2022-12-10       ICNSA  250038       2024-02-22     2024-02-22       CCNSA
#> 155 2022-12-17       ICNSA  248444       2024-02-22     2024-02-22       CCNSA
#> 156 2022-12-24       ICNSA  269877       2024-02-22     2024-02-22       CCNSA
#> 157 2022-12-31       ICNSA  278487       2024-02-22     2024-02-22       CCNSA
#> 158 2023-01-07       ICNSA  339883       2024-02-22     2024-02-22       CCNSA
#> 159 2023-01-14       ICNSA  288330       2024-02-22     2024-02-22       CCNSA
#> 160 2023-01-21       ICNSA  225228       2024-02-22     2024-02-22       CCNSA
#> 161 2023-01-28       ICNSA  225026       2024-02-22     2024-02-22       CCNSA
#> 162 2023-02-04       ICNSA  234007       2024-02-22     2024-02-22       CCNSA
#> 163 2023-02-11       ICNSA  225332       2024-02-22     2024-02-22       CCNSA
#> 164 2023-02-18       ICNSA  211007       2024-02-22     2024-02-22       CCNSA
#> 165 2023-02-25       ICNSA  202156       2024-02-22     2024-02-22       CCNSA
#> 166 2023-03-04       ICNSA  238840       2024-02-22     2024-02-22       CCNSA
#> 167 2023-03-11       ICNSA  218084       2024-02-22     2024-02-22       CCNSA
#> 168 2023-03-18       ICNSA  213003       2024-02-22     2024-02-22       CCNSA
#> 169 2023-03-25       ICNSA  224193       2024-02-22     2024-02-22       CCNSA
#> 170 2023-04-01       ICNSA  207120       2024-02-22     2024-02-22       CCNSA
#> 171 2023-04-08       ICNSA  235237       2024-02-22     2024-02-22       CCNSA
#> 172 2023-04-15       ICNSA  229319       2024-02-22     2024-02-22       CCNSA
#> 173 2023-04-22       ICNSA  225137       2024-02-22     2024-02-22       CCNSA
#> 174 2023-04-29       ICNSA  220115       2024-02-22     2024-02-22       CCNSA
#> 175 2023-05-06       ICNSA  204962       2024-02-22     2024-02-22       CCNSA
#> 176 2023-05-13       ICNSA  200738       2024-02-22     2024-02-22       CCNSA
#> 177 2023-05-20       ICNSA  202645       2024-02-22     2024-02-22       CCNSA
#> 178 2023-05-27       ICNSA  208772       2024-02-22     2024-02-22       CCNSA
#> 179 2023-06-03       ICNSA  220449       2024-02-22     2024-02-22       CCNSA
#> 180 2023-06-10       ICNSA  251384       2024-02-22     2024-02-22       CCNSA
#> 181 2023-06-17       ICNSA  250891       2024-02-22     2024-02-22       CCNSA
#> 182 2023-06-24       ICNSA  229718       2024-02-22     2024-02-22       CCNSA
#> 183 2023-07-01       ICNSA  251686       2024-02-22     2024-02-22       CCNSA
#> 184 2023-07-08       ICNSA  258252       2024-02-22     2024-02-22       CCNSA
#> 185 2023-07-15       ICNSA  258169       2024-02-22     2024-02-22       CCNSA
#> 186 2023-07-22       ICNSA  213497       2024-02-22     2024-02-22       CCNSA
#> 187 2023-07-29       ICNSA  205550       2024-02-22     2024-02-22       CCNSA
#> 188 2023-08-05       ICNSA  227917       2024-02-22     2024-02-22       CCNSA
#> 189 2023-08-12       ICNSA  213803       2024-02-22     2024-02-22       CCNSA
#> 190 2023-08-19       ICNSA  199437       2024-02-22     2024-02-22       CCNSA
#> 191 2023-08-26       ICNSA  193441       2024-02-22     2024-02-22       CCNSA
#> 192 2023-09-02       ICNSA  191353       2024-02-22     2024-02-22       CCNSA
#> 193 2023-09-09       ICNSA  175594       2024-02-22     2024-02-22       CCNSA
#> 194 2023-09-16       ICNSA  176586       2024-02-22     2024-02-22       CCNSA
#> 195 2023-09-23       ICNSA  175650       2024-02-22     2024-02-22       CCNSA
#> 196 2023-09-30       ICNSA  174701       2024-02-22     2024-02-22       CCNSA
#> 197 2023-10-07       ICNSA  199743       2024-02-22     2024-02-22       CCNSA
#> 198 2023-10-14       ICNSA  182394       2024-02-22     2024-02-22       CCNSA
#> 199 2023-10-21       ICNSA  193999       2024-02-22     2024-02-22       CCNSA
#> 200 2023-10-28       ICNSA  199306       2024-02-22     2024-02-22       CCNSA
#> 201 2023-11-04       ICNSA  214161       2024-02-22     2024-02-22       CCNSA
#> 202 2023-11-11       ICNSA  217438       2024-02-22     2024-02-22       CCNSA
#> 203 2023-11-18       ICNSA  240979       2024-02-22     2024-02-22       CCNSA
#> 204 2023-11-25       ICNSA  199750       2024-02-22     2024-02-22       CCNSA
#> 205 2023-12-02       ICNSA  294615       2024-02-22     2024-02-22       CCNSA
#> 206 2023-12-09       ICNSA  249090       2024-02-22     2024-02-22       CCNSA
#> 207 2023-12-16       ICNSA  241040       2024-02-22     2024-02-22       CCNSA
#> 208 2023-12-23       ICNSA  274840       2024-02-22     2024-02-22       CCNSA
#> 209 2023-12-30       ICNSA  269409       2024-02-22     2024-02-22       CCNSA
#> 210 2024-01-06       ICNSA  318906       2024-02-22     2024-02-22       CCNSA
#> 211 2024-01-13       ICNSA  291330       2024-02-22     2024-02-22       CCNSA
#> 212 2024-01-20       ICNSA  249947       2024-02-22     2024-02-22       CCNSA
#> 213 2024-01-27       ICNSA  263919       2024-02-22     2024-02-22       CCNSA
#> 214 2024-02-03       ICNSA  234729       2024-02-22     2024-02-22       CCNSA
#>     value.y realtime_start.y realtime_end.y
#> 105 2061856       2024-02-22     2024-02-22
#> 106 2072681       2024-02-22     2024-02-22
#> 107 2009022       2024-02-22     2024-02-22
#> 108 2043402       2024-02-22     2024-02-22
#> 109 2005387       2024-02-22     2024-02-22
#> 110 1980368       2024-02-22     2024-02-22
#> 111 1918036       2024-02-22     2024-02-22
#> 112 1859328       2024-02-22     2024-02-22
#> 113 1917459       2024-02-22     2024-02-22
#> 114 1810381       2024-02-22     2024-02-22
#> 115 1727708       2024-02-22     2024-02-22
#> 116 1676374       2024-02-22     2024-02-22
#> 117 1654359       2024-02-22     2024-02-22
#> 118 1569797       2024-02-22     2024-02-22
#> 119 1467346       2024-02-22     2024-02-22
#> 120 1441824       2024-02-22     2024-02-22
#> 121 1403640       2024-02-22     2024-02-22
#> 122 1332072       2024-02-22     2024-02-22
#> 123 1283158       2024-02-22     2024-02-22
#> 124 1290047       2024-02-22     2024-02-22
#> 125 1256209       2024-02-22     2024-02-22
#> 126 1256450       2024-02-22     2024-02-22
#> 127 1270456       2024-02-22     2024-02-22
#> 128 1288323       2024-02-22     2024-02-22
#> 129 1302015       2024-02-22     2024-02-22
#> 130 1372008       2024-02-22     2024-02-22
#> 131 1327199       2024-02-22     2024-02-22
#> 132 1450142       2024-02-22     2024-02-22
#> 133 1443848       2024-02-22     2024-02-22
#> 134 1451592       2024-02-22     2024-02-22
#> 135 1457158       2024-02-22     2024-02-22
#> 136 1423854       2024-02-22     2024-02-22
#> 137 1414317       2024-02-22     2024-02-22
#> 138 1390837       2024-02-22     2024-02-22
#> 139 1365751       2024-02-22     2024-02-22
#> 140 1273057       2024-02-22     2024-02-22
#> 141 1278853       2024-02-22     2024-02-22
#> 142 1224496       2024-02-22     2024-02-22
#> 143 1233270       2024-02-22     2024-02-22
#> 144 1201432       2024-02-22     2024-02-22
#> 145 1199269       2024-02-22     2024-02-22
#> 146 1226861       2024-02-22     2024-02-22
#> 147 1239975       2024-02-22     2024-02-22
#> 148 1267368       2024-02-22     2024-02-22
#> 149 1230153       2024-02-22     2024-02-22
#> 150 1341329       2024-02-22     2024-02-22
#> 151 1261674       2024-02-22     2024-02-22
#> 152 1561449       2024-02-22     2024-02-22
#> 153 1503416       2024-02-22     2024-02-22
#> 154 1594527       2024-02-22     2024-02-22
#> 155 1577004       2024-02-22     2024-02-22
#> 156 1710626       2024-02-22     2024-02-22
#> 157 1870048       2024-02-22     2024-02-22
#> 158 1908639       2024-02-22     2024-02-22
#> 159 1863956       2024-02-22     2024-02-22
#> 160 1911203       2024-02-22     2024-02-22
#> 161 1925707       2024-02-22     2024-02-22
#> 162 1948437       2024-02-22     2024-02-22
#> 163 1928688       2024-02-22     2024-02-22
#> 164 1891796       2024-02-22     2024-02-22
#> 165 1970023       2024-02-22     2024-02-22
#> 166 1910046       2024-02-22     2024-02-22
#> 167 1878281       2024-02-22     2024-02-22
#> 168 1877314       2024-02-22     2024-02-22
#> 169 1844456       2024-02-22     2024-02-22
#> 170 1795471       2024-02-22     2024-02-22
#> 171 1788631       2024-02-22     2024-02-22
#> 172 1752764       2024-02-22     2024-02-22
#> 173 1689410       2024-02-22     2024-02-22
#> 174 1660410       2024-02-22     2024-02-22
#> 175 1612137       2024-02-22     2024-02-22
#> 176 1612163       2024-02-22     2024-02-22
#> 177 1609655       2024-02-22     2024-02-22
#> 178 1595088       2024-02-22     2024-02-22
#> 179 1649707       2024-02-22     2024-02-22
#> 180 1672995       2024-02-22     2024-02-22
#> 181 1674551       2024-02-22     2024-02-22
#> 182 1737349       2024-02-22     2024-02-22
#> 183 1724290       2024-02-22     2024-02-22
#> 184 1887580       2024-02-22     2024-02-22
#> 185 1835613       2024-02-22     2024-02-22
#> 186 1827375       2024-02-22     2024-02-22
#> 187 1809787       2024-02-22     2024-02-22
#> 188 1814134       2024-02-22     2024-02-22
#> 189 1801993       2024-02-22     2024-02-22
#> 190 1789881       2024-02-22     2024-02-22
#> 191 1747733       2024-02-22     2024-02-22
#> 192 1655858       2024-02-22     2024-02-22
#> 193 1645933       2024-02-22     2024-02-22
#> 194 1588281       2024-02-22     2024-02-22
#> 195 1585286       2024-02-22     2024-02-22
#> 196 1555186       2024-02-22     2024-02-22
#> 197 1542397       2024-02-22     2024-02-22
#> 198 1572658       2024-02-22     2024-02-22
#> 199 1574484       2024-02-22     2024-02-22
#> 200 1607728       2024-02-22     2024-02-22
#> 201 1578949       2024-02-22     2024-02-22
#> 202 1654728       2024-02-22     2024-02-22
#> 203 1555596       2024-02-22     2024-02-22
#> 204 1845080       2024-02-22     2024-02-22
#> 205 1767271       2024-02-22     2024-02-22
#> 206 1835426       2024-02-22     2024-02-22
#> 207 1836547       2024-02-22     2024-02-22
#> 208 1902098       2024-02-22     2024-02-22
#> 209 2104271       2024-02-22     2024-02-22
#> 210 2122548       2024-02-22     2024-02-22
#> 211 2053298       2024-02-22     2024-02-22
#> 212 2181429       2024-02-22     2024-02-22
#> 213 2130008       2024-02-22     2024-02-22
#> 214 2146550       2024-02-22     2024-02-22

# Calculate correlation between ICNSA and CCNSA values
correlation_coefficient <- cor(cleandata$value.x, cleandata$value.y)

# Print the correlation coefficient
print(correlation_coefficient)
#> [1] 0.6254788

#STEP 3 TRAIN MODEL using arima model 

column_names_cleaned <- colnames(cleandata)
print(column_names_cleaned)
#> [1] "date"             "series_id.x"      "value.x"          "realtime_start.x"
#> [5] "realtime_end.x"   "series_id.y"      "value.y"          "realtime_start.y"
#> [9] "realtime_end.y"

# Convert ICNSA values to a time series object
icnsa_ts <- ts(cleandata$value.x, frequency = 52)

# Prepare CCNSA values for use as a regressor
ccnsa_regressor <- cleandata$value.y
# Ensure the forecast package is installed and loaded
if (!requireNamespace("forecast", quietly = TRUE)) install.packages("forecast")
library(forecast)

# Train the ARIMA model with CCNSA as a regressor
fit_arima <- auto.arima(icnsa_ts, xreg = ccnsa_regressor)

# Print the model summary
summary(fit_arima)
#> Series: icnsa_ts 
#> Regression with ARIMA(2,0,0)(0,1,0)[52] errors 
#> 
#> Coefficients:
#>          ar1     ar2    xreg
#>       0.5485  0.1551  0.0606
#> s.e.  0.1402  0.1721  0.0201
#> 
#> sigma^2 = 189329888:  log likelihood = -633.76
#> AIC=1275.53   AICc=1276.28   BIC=1283.77
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE        MPE     MAPE     MASE
#> Training set -905.732 9729.591 5475.743 -0.3606824 2.322205 0.286164
#>                     ACF1
#> Training set 0.009711292

# Placeholder for the next week's CCNSA value
# You need to replace this with the actual or estimated next week's CCNSA value

next_week_ccnsa <- tail(cleandata$value.y, 1)
# Forecast the next week's ICNSA value using the ARIMA model
# Note: Ensure the xreg parameter is provided with a new data frame matching the expected format
next_week_forecast <- forecast(fit_arima, xreg = matrix(next_week_ccnsa, ncol = 1), h = 1)

# Print the forecast results
print(next_week_forecast)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 3.115385         236458 218824.2 254091.8 209489.4 263426.6

^{Created on 2024-02-22 with reprex v2.1.0}

theroy sumarrised!

"ARIMA(2,0,0): This part means we are using the last two observations to predict the next one. Its like saying, Based on how things went the last two weeks, heres our guess for next week. We didnt need to adjust or smooth out the data series (thats the d=0 part), because it's already stable or made stable. (0,1,0)[52]: This shows were also considering a yearly pattern that repeats every 52 weeks, but we are not using any fancy methods to predict this pattern just a basic step to acknowledge it exists.) xreg with coefficient 0.0606: Besides the main data, were also looking at another related factor (CCNSA), thinking it can affect our predictions. The positive number here means if the CCNSA goes up, our main data tends to go up a bit too.

Justification for the Model

'Statistical Significanc The ARIMA(2,0,0)(0,1,0)[52] model with CCNSA as an external factor is our choice for making future predictions. Heres why it stands out:

Statistical Significance: The models parameters are reliably linked to the series past values and the CCNSA data, ensuring a solid foundation for predictions.
AIC, AICc, and BIC: These criteria suggest our model strikes a good balance between being detailed enough to capture the datas complexity and not overly so.
Error Measures: Indicators like ME, RMSE, MAE, and others show the model is quite accurate, outperforming simple guesses.
Seasonality: By accounting for yearly patterns with a 52-week cycle, the model aligns with the datas seasonal trends.
External Regressor (CCNSA): The significant link between ICNSA and CCNSA validates including CCNSA for better predictions.

In short, this model is finely tuned to forecast ICNSA values, leveraging both historical data and external influences effectively.

Final model equation:

yt=βzt + (1−ϕ1B−ϕ2B2)xt = (1−B52)wt β = 0.0606 ϕ1 = 0.5485 ϕ2 = 0.1551 yt=βzt + (1−ϕ1B−ϕ2B2)xt = (1−B52)wt yt=0.0606zt + (1−(0.5485)( 0.0606)−ϕ2(0.0606)2)xt = (1−(0.0606)52)wt

ASSIGNMENT 2 - FEB 28

library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
library(lubridate)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric

# Set FRED API key and read the ICNSA data
fredr_set_key("f4e804771fd1a4241706e9c4498f0839") # Replace with your actual API key
icnsa_data <- fredr(series_id = "ICNSA") %>%
  mutate(date = as.Date(date)) %>%
  arrange(date)

# Plotting the time series ICNSA
icnsa_data %>%
  ggplot(aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA over Time", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# Define the COVID period and set those values to NA
covid_start <- as.Date("2020-03-14")
covid_end <- as.Date("2021-01-02")

# Create a vector that indicates whether each date is within the COVID period
is_covid_period <- icnsa_data$date >= covid_start & icnsa_data$date <= covid_end

# Replace 'value' with NA for the COVID period
icnsa_data$value[is_covid_period] <- NA

# Ensure data is ordered by date
icnsa_data <- icnsa_data %>% arrange(date)

# Subset data without NA values (excluding the COVID period)
icnsa_without_covid <- icnsa_data[!is.na(icnsa_data$value), ]

# Subset data with NA values (only the COVID period)
icnsa_with_covid <- icnsa_data[is.na(icnsa_data$value), ]

# Fitting spline on non-covid data with lambda=1
spline_fit <- smooth.spline(x = as.numeric(icnsa_without_covid$date), y = icnsa_without_covid$value,spar = 0.1)

#impute new values 
imputed_values <- predict(spline_fit, x = as.numeric(icnsa_with_covid$date))

icnsa_with_covid$value <- imputed_values$y

#new values replacing covid values with imputed values
updated_icnsa_data <- bind_rows(icnsa_without_covid, icnsa_with_covid)
updated_icnsa_data <- updated_icnsa_data %>% arrange(date)

# Plot the graph with the original data and the smoothed data
ggplot(updated_icnsa_data, aes(x = date)) +
  geom_line(aes(y = value), color = "blue") +  # Original data
  geom_line(data = icnsa_without_covid, aes(y = fitted(spline_fit)), color = "red") +  # Smoothed data
  labs(title = "ICNSA Data with Original and Smoothed Values", x = "Date", y = "ICNSA Values") +
  theme_minimal()


# Fitting spline on non-covid data with lambda=0.5
spline_fit <- smooth.spline(x = as.numeric(icnsa_without_covid$date), y = icnsa_without_covid$value,spar = 0.5)

#impute new values 
imputed_values <- predict(spline_fit, x = as.numeric(icnsa_with_covid$date))

icnsa_with_covid$value <- imputed_values$y

#new values replacing covid values with imputed values
updated_icnsa_data <- bind_rows(icnsa_without_covid, icnsa_with_covid) %>%
  arrange(date)

anyNA(updated_icnsa_data$value)
#> [1] FALSE
# Plot the graph with the original data and the smoothed data
ggplot(updated_icnsa_data, aes(x = date)) +
  geom_line(aes(y = value), color = "blue") +  # Original data
  geom_line(data = icnsa_without_covid, aes(y = fitted(spline_fit)), color = "red") +  # Smoothed data
  labs(title = "ICNSA Data with Original and Smoothed Values", x = "Date", y = "ICNSA Values") +
  theme_minimal()


# Forecasting using Holt-Winters models for that let's create timeseries object as it is weekly data frequency = 52
icnsa_ts <- ts(updated_icnsa_data$value, frequency = 52) 

# Additive Holt-Winters model
holt_additive_model <- HoltWinters(icnsa_ts, seasonal = "additive")
forecast_additive_model <- forecast(holt_additive_model, h = 1)
print(forecast_additive_model)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 58.32692       163281.7 118912.4 207651.1 95424.73 231138.7

# Multiplicative Holt-Winters model
holt_multiplicative_model <- HoltWinters(icnsa_ts, seasonal = "multiplicative")
forecast_multiplicative_model <- forecast(holt_multiplicative_model, h = 1)
print(forecast_multiplicative_model)
#>          Point Forecast    Lo 80  Hi 80    Lo 95    Hi 95
#> 58.32692       182854.1 138777.3 226931 115444.4 250263.9

^{Created on 2024-02-28 with reprex v2.1.0}

library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
library(lubridate)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)

# Set FRED API key and read the ICNSA data
fredr_set_key("f4e804771fd1a4241706e9c4498f0839") 
icnsa_data <- fredr(series_id = "ICNSA") %>%
  mutate(date = as.Date(date)) %>%
  arrange(date)

# Plotting the time series ICNSA
icnsa_data %>%
  ggplot(aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA over Time", x = "Date", y = "ICNSA Value") +
  theme_minimal()


icnsa_data_org = icnsa_data
# Define the COVID period and set those values to NA
covid_start <- as.Date("2020-03-14")
covid_end <- as.Date("2021-01-02")

# Create a vector that indicates whether each date is within the COVID period
is_covid_period <- icnsa_data$date >= covid_start & icnsa_data$date <= covid_end

# Replace 'value' with NA for the COVID period
icnsa_data$value[is_covid_period] <- NA

# Ensure data is ordered by date
icnsa_data <- icnsa_data %>% arrange(date)

# We will use structTS() model to fit a best structural model to the given TS so that we can use KalmanSmooth technique to get NA values.
# But before that we need to decide on 'type' parameter of structTS(), for that we can perform some empirical analysis. 

# Visual inspection

# Plot the original time series with a rolling mean for trend visualization
icnsa_data %>%
  ggplot(aes(x = date, y = value)) +
  geom_line(alpha = 0.5) +
  geom_line(aes(y = rollapply(value, width = 52, FUN = median, na.pad = TRUE, align='right')), color = 'red') +
  labs(title = "ICNSA over Time with Rolling Median", x = "Date", y = "ICNSA Value") +
  theme_minimal()
#> Warning in rollapply.zoo(zoo(data), ...): na.pad argument is deprecated
#> Warning: Removed 51 rows containing missing values (`geom_line()`).


icnsa_data_without_na <- na.omit(icnsa_data)

# Convert to ts object
icnsa_ts_analysis <- ts(icnsa_data_without_na$value, frequency = 52)

# Seasonal decomposition using STL
decomposed <- stl(icnsa_ts_analysis, s.window = "periodic")

# Plot the decomposed components
plot(decomposed)


# ACF plot
acf(icnsa_ts_analysis)


# PACF plot
pacf(icnsa_ts_analysis)


# So from the above analysis specifically from ACF plot and trend component of decomposition there are indeed underlying trends in the data over time. So we can use type = "trend".

icnsa_ts_final <- ts(icnsa_data$value, start = c(start(icnsa_data$date)), frequency = 52)

# Using structTS
ss_model_icnsa <- StructTS(icnsa_ts_final, type = "trend")

summary(ss_model_icnsa)
#>           Length Class  Mode     
#> coef         3   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    5964   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric

# Let's use KalmanSmooth technique to obtain the smoothed state estimates to impute covid values
smoothed_icnsa <- tsSmooth(ss_model_icnsa)

# Recreate the index for the COVID period
covid_idx <- which(icnsa_data$date >= covid_start & icnsa_data$date <= covid_end)

# Update NA values with the smoothed estimates
icnsa_data$value[covid_idx] <- smoothed_icnsa[covid_idx]

ggplot() +
  geom_line(data = icnsa_data_without_na, aes(x = date, y = value), color = "red") + 
  geom_line(data = icnsa_data, aes(x = date, y = value), color = "blue") +
  labs(title = "ICNSA over Time with Original and Imputed Values", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# We have imputed covid period values, now we can fit a structural time series model with trend, seasonal and error using state space methodology.

icnsa_final_computed_ts <- ts(icnsa_data$value, start = c(start(icnsa_data$date)), frequency = 52)

# Fitting a structural time series model with both trend and seasonal components. For this we need to set type = 'BSM'. It is basic structural model which will automatically include a trend, seasonal, and error component in the state space model.

state_space_imputed_icnsa_model <- StructTS(icnsa_final_computed_ts, type = "BSM")

# Reading the IURSA data
iurnsa_data = fredr(series_id = "IURNSA")
column_names_iurnsa <- colnames(iurnsa_data)
print(column_names_iurnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

head(iurnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id value realtime_start realtime_end
#>   <date>     <chr>     <dbl> <date>         <date>      
#> 1 1971-01-02 IURNSA      5.2 2024-03-07     2024-03-07  
#> 2 1971-01-09 IURNSA      5.3 2024-03-07     2024-03-07  
#> 3 1971-01-16 IURNSA      5.3 2024-03-07     2024-03-07  
#> 4 1971-01-23 IURNSA      5.2 2024-03-07     2024-03-07  
#> 5 1971-01-30 IURNSA      5.2 2024-03-07     2024-03-07  
#> 6 1971-02-06 IURNSA      5.2 2024-03-07     2024-03-07

# Combining the data for IURNSA and ICNSA
combined_data_covariate <- left_join(icnsa_data, iurnsa_data, by = "date", suffix = c(".icnsa", ".iurnsa"))
#combined_data_covariate <- na.omit(combined_data_covariate)

# Converting combined data to timeseries object
combined_data_covariate_ts <- ts(combined_data_covariate$value.iurnsa, start = c(year(min(combined_data_covariate$date)), 1), frequency = 52)

# Fitting arima model with auto arima.

arima_model_cov <- auto.arima(icnsa_final_computed_ts, xreg = combined_data_covariate_ts, seasonal = TRUE)

next_week_icnsa <- tail(iurnsa_data$value, 1)
print(next_week_icnsa)
#> [1] 1.4
# Forecast the next week's ICNSA value using the ARIMA model
next_week_forecast <- forecast(arima_model_cov, xreg = matrix(next_week_icnsa, ncol = 1), h = 1)

# Print the forecast results
print(next_week_forecast)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 58.34615       229178.6 185816.4 272540.8 162861.9 295495.4

^{Created on 2024-03-07 with reprex v2.1.0}

HOMEWORK 4 - 10TH APRIL


library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tidyverse)
library(lubridate)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(readxl)
library(dplyr)
library(lubridate)
library(xts)
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
library(urca)
library(vars)
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> Loading required package: strucchange
#> Loading required package: sandwich
#> 
#> Attaching package: 'strucchange'
#> The following object is masked from 'package:stringr':
#> 
#>     boundary
#> Loading required package: lmtest
library(BigVAR)
#> Loading required package: lattice
# New file paths
file_paths <- list(
"Computer_Electronics" = '/Users/alisharuqshan/Downloads/GitHub/Computer_Electronics.xlsx',
"Elec_comp_manufacturing" = '/Users/alisharuqshan/Downloads/GitHub/Elec_comp_manufacturing.xlsx',
"Computer_storage_device_manufacturing" = '/Users/alisharuqshan/Downloads/GitHub/Computer_storage_device_manufacturing.xlsx',
"Other" = '/Users/alisharuqshan/Downloads/GitHub/Other.xlsx'
)

Adjusted function to read and preprocess each Excel file

read_and_process_excel_v2 <- function(file_path) { df <- read_excel(file_path) %>% mutate(Period = mdy(Period), Value = as.numeric(gsub(",", "", Value))) %>% na.omit()

Ensure Period is of Date class, not POSIXct, to avoid xts issues

df$Period <- as.Date(df$Period)

Creating an xts object

xts_object <- xts(x = df$Value, order.by = df$Period) return(xts_object) }

Using a loop instead of lapply for a change in approach

time_series_list <- list() for (name in names(file_paths)) { time_series_list[[name]] <- read_and_process_excel_v2(file_paths[[name]]) }

Extract each time series into its own variable

computer_electronics_ts <- time_series_list[["Computer_Electronics"]] elec_comp_manufacturing_ts <- time_series_list[["Elec_comp_manufacturing"]] computer_storage_device_manufacturing_ts <- time_series_list[["Computer_storage_device_manufacturing"]] other_ts <- time_series_list[["Other"]]

Set up the plotting space

par(mfrow = c(1, 1))

Plot each time series

plot(computer_electronics_ts, main = "Computer Electronics", xlab = "Date", ylab = "Value", col = "blue")


![](https://i.imgur.com/AiPFwrF.png)<!-- -->

``` r
plot(elec_comp_manufacturing_ts, main = "Electronic Computer Manufacturing", xlab = "Date", ylab = "Value", col = "red")

plot(computer_storage_device_manufacturing_ts, main = "Computer Storage Device Manufacturing", xlab = "Date", ylab = "Value", col = "green")

plot(other_ts, main = "Other", xlab = "Date", ylab = "Value", col = "purple")


# Reset to default plotting layout
par(mfrow = c(1, 1))

#Analysis
# 1. Correlation
# Merge all time series into a single xts object
merged_ts <- merge(computer_electronics_ts, elec_comp_manufacturing_ts, computer_storage_device_manufacturing_ts, other_ts, all = FALSE)

# Compute correlation matrix for the merged time series
correlation_matrix <- cor(merged_ts)

# Print the correlation matrix
print(correlation_matrix)
#>                                          computer_electronics_ts
#> computer_electronics_ts                                1.0000000
#> elec_comp_manufacturing_ts                             0.3462417
#> computer_storage_device_manufacturing_ts               0.2326591
#> other_ts                                               0.4640022
#>                                          elec_comp_manufacturing_ts
#> computer_electronics_ts                                   0.3462417
#> elec_comp_manufacturing_ts                                1.0000000
#> computer_storage_device_manufacturing_ts                  0.3970249
#> other_ts                                                  0.5093446
#>                                          computer_storage_device_manufacturing_ts
#> computer_electronics_ts                                                 0.2326591
#> elec_comp_manufacturing_ts                                              0.3970249
#> computer_storage_device_manufacturing_ts                                1.0000000
#> other_ts                                                                0.7975097
#>                                           other_ts
#> computer_electronics_ts                  0.4640022
#> elec_comp_manufacturing_ts               0.5093446
#> computer_storage_device_manufacturing_ts 0.7975097
#> other_ts                                 1.0000000

# 2. CCF Plot
# Set up multiple plots in a grid
par(mfrow=c(length(time_series_list) - 1, 1))

# Define the reference time series
reference_ts <- as.numeric(computer_electronics_ts)

# Loop through each time series and plot its CCF with the reference series
for(i in 2:length(time_series_list)) {  
  # Extract the time series data
  comparison_ts <- as.numeric(time_series_list[[i]])

  # Define names_list within the loop
  names_list <- names(time_series_list)

  # Calculate the CCF
  ccf_result <- ccf(reference_ts, comparison_ts, main = paste("CCF between", names_list[1], "and", names_list[i]))

  # Plot the CCF
  plot(ccf_result)
}


# 3. ACF and PACF plots

# Assuming your plotting window is large enough, or you're plotting to an external file
for(i in 1:length(time_series_list)) {  
  # Extract the time series data
  ts_data <- as.numeric(time_series_list[[i]])

  # Set up a new plot window for ACF
  par(mfrow=c(1, 1))
  acf_result <- acf(ts_data, main = paste("ACF for", names_list[i]))

  # Set up a new plot window for PACF
  par(mfrow=c(1, 1))
  pacf_result <- pacf(ts_data, main = paste("PACF for", names_list[i]))
}


# 4. ADF test

# Performing ADF Test
adf_test <- function(ts_data, name) {
  adf_result <- ur.df(ts_data, type = "trend", lags = 3)
  summary(adf_result)
}

for (i in 1:length(time_series_list)) {
  ts_data <- time_series_list[[i]]
  adf_result <- adf_test(ts_data, names_list[i])
  print(adf_result)
}
#> 
#> ############################################### 
#> # Augmented Dickey-Fuller Test Unit Root Test # 
#> ############################################### 
#> 
#> Test regression trend 
#> 
#> 
#> Call:
#> lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1919.7  -220.1     5.1   224.8  1529.1 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 475.600416 201.969387   2.355   0.0190 *  
#> z.lag.1      -0.009744   0.004033  -2.416   0.0162 *  
#> tt           -0.022014   0.200820  -0.110   0.9128    
#> z.diff.lag1   0.302031   0.048942   6.171 1.76e-09 ***
#> z.diff.lag2   0.122270   0.051006   2.397   0.0170 *  
#> z.diff.lag3   0.300490   0.049252   6.101 2.62e-09 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 427.1 on 376 degrees of freedom
#> Multiple R-squared:  0.3311, Adjusted R-squared:  0.3222 
#> F-statistic: 37.22 on 5 and 376 DF,  p-value: < 2.2e-16
#> 
#> 
#> Value of test-statistic is: -2.4163 2.0255 2.9508 
#> 
#> Critical values for test statistics: 
#>       1pct  5pct 10pct
#> tau3 -3.98 -3.42 -3.13
#> phi2  6.15  4.71  4.05
#> phi3  8.34  6.30  5.36
#> 
#> 
#> ############################################### 
#> # Augmented Dickey-Fuller Test Unit Root Test # 
#> ############################################### 
#> 
#> Test regression trend 
#> 
#> 
#> Call:
#> lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -593.34  -47.50    5.70   48.02  460.85 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 25.590274  27.549291   0.929  0.35354    
#> z.lag.1     -0.007249   0.004561  -1.589  0.11284    
#> tt          -0.051122   0.080107  -0.638  0.52375    
#> z.diff.lag1  0.103269   0.049362   2.092  0.03710 *  
#> z.diff.lag2  0.152573   0.048691   3.134  0.00186 ** 
#> z.diff.lag3  0.269676   0.049117   5.491 7.39e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 111.4 on 376 degrees of freedom
#> Multiple R-squared:  0.1455, Adjusted R-squared:  0.1342 
#> F-statistic: 12.81 on 5 and 376 DF,  p-value: 1.64e-11
#> 
#> 
#> Value of test-statistic is: -1.5893 1.3545 1.6443 
#> 
#> Critical values for test statistics: 
#>       1pct  5pct 10pct
#> tau3 -3.98 -3.42 -3.13
#> phi2  6.15  4.71  4.05
#> phi3  8.34  6.30  5.36
#> 
#> 
#> ############################################### 
#> # Augmented Dickey-Fuller Test Unit Root Test # 
#> ############################################### 
#> 
#> Test regression trend 
#> 
#> 
#> Call:
#> lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -429.92  -25.65   -2.17   24.14  253.82 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 56.45815   19.83251   2.847 0.004659 ** 
#> z.lag.1     -0.03200    0.01098  -2.915 0.003764 ** 
#> tt          -0.09201    0.03750  -2.453 0.014608 *  
#> z.diff.lag1  0.17199    0.05052   3.405 0.000734 ***
#> z.diff.lag2 -0.05015    0.05113  -0.981 0.327346    
#> z.diff.lag3  0.18193    0.05072   3.587 0.000379 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 57.1 on 376 degrees of freedom
#> Multiple R-squared:  0.0715, Adjusted R-squared:  0.05915 
#> F-statistic: 5.791 on 5 and 376 DF,  p-value: 3.634e-05
#> 
#> 
#> Value of test-statistic is: -2.9155 3.0338 4.4042 
#> 
#> Critical values for test statistics: 
#>       1pct  5pct 10pct
#> tau3 -3.98 -3.42 -3.13
#> phi2  6.15  4.71  4.05
#> phi3  8.34  6.30  5.36
#> 
#> 
#> ############################################### 
#> # Augmented Dickey-Fuller Test Unit Root Test # 
#> ############################################### 
#> 
#> Test regression trend 
#> 
#> 
#> Call:
#> lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -170.65  -24.36    1.21   24.66  182.37 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 43.532493  16.833890   2.586  0.01008 *  
#> z.lag.1     -0.014035   0.005557  -2.526  0.01196 *  
#> tt          -0.099489   0.037924  -2.623  0.00906 ** 
#> z.diff.lag1  0.009568   0.048399   0.198  0.84339    
#> z.diff.lag2  0.028286   0.048345   0.585  0.55884    
#> z.diff.lag3  0.323725   0.048369   6.693 7.98e-11 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 47.62 on 376 degrees of freedom
#> Multiple R-squared:  0.1255, Adjusted R-squared:  0.1139 
#> F-statistic: 10.79 on 5 and 376 DF,  p-value: 1.038e-09
#> 
#> 
#> Value of test-statistic is: -2.5257 2.5514 3.6749 
#> 
#> Critical values for test statistics: 
#>       1pct  5pct 10pct
#> tau3 -3.98 -3.42 -3.13
#> phi2  6.15  4.71  4.05
#> phi3  8.34  6.30  5.36

##### Fitting VAR Model #####
# Fit VAR(1) model

combined_ts_series <- do.call(cbind, time_series_list)

var_model_simple <- VAR(combined_ts_series, p = 1, type = "const")

# AIC BIC Values
aic_var1 <- AIC(var_model_simple)
bic_var1 <- BIC(var_model_simple)

# Let's try VAR for different p values 
var_model_p1 <- VAR(combined_ts_series, p = 1, type = "const")
var_model_p2 <- VAR(combined_ts_series, p = 2, type = "const")
var_model_p3 <- VAR(combined_ts_series, p = 3, type = "const")

# Calculate AIC and BIC for each model
aic_values <- c(AIC(var_model_p1), AIC(var_model_p2), AIC(var_model_p3))
bic_values <- c(BIC(var_model_p1), BIC(var_model_p2), BIC(var_model_p3))

# Print AIC and BIC values
print(aic_values)
#> [1] 18813.08 18683.99 18608.97
print(bic_values)
#> [1] 18892.14 18826.22 18814.27

# From AIC and BIC values we can say that VAR with p=3 is the best choice
var_model_final <- VAR(combined_ts_series, p = 3, type = "const")

# Get AIC and BIC for VAR(3) model
aic_varp <- AIC(var_model_final)
bic_varp <- BIC(var_model_final)

# Compare AIC and BIC
print(paste("AIC of VAR(1):", aic_var1, "BIC of VAR(1):", bic_var1))
#> [1] "AIC of VAR(1): 18813.078008483 BIC of VAR(1): 18892.1428751687"
print(paste("AIC of VAR(3):", aic_varp, "BIC of VAR(3):", bic_varp))
#> [1] "AIC of VAR(3): 18608.9678563367 BIC of VAR(3): 18814.2656757741"

# From above comparison we can say that VAR(3) is better that VAR(1)

# One month forecast
# Using selected model
forecast <- predict(var_model_final, n.ahead = 1)

# Forecasted value
print(forecast)
#> $Computer_Electronics
#>                               fcst    lower    upper       CI
#> Computer_Electronics.fcst 55475.59 54613.35 56337.83 862.2423
#> 
#> $Elec_comp_manufacturing
#>                                  fcst    lower    upper       CI
#> Elec_comp_manufacturing.fcst 1890.074 1666.251 2113.897 223.8228
#> 
#> $Computer_storage_device_manufacturing
#>                                                fcst    lower    upper       CI
#> Computer_storage_device_manufacturing.fcst 594.3616 483.2455 705.4776 111.1161
#> 
#> $Other
#>                fcst    lower   upper       CI
#> Other.fcst 998.7883 903.0867 1094.49 95.70155

# Granger test
granger_results_final <- causality(var_model_final)
#> Warning in causality(var_model_final): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Computer_Electronics) as cause variable.
# Print Granger causality test results
print(granger_results_final)
#> $Granger
#> 
#>  Granger causality H0: Computer_Electronics do not Granger-cause
#>  Elec_comp_manufacturing Computer_storage_device_manufacturing Other
#> 
#> data:  VAR object var_model_final
#> F-Test = 2.9609, df1 = 9, df2 = 1480, p-value = 0.001709
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Computer_Electronics and
#>  Elec_comp_manufacturing Computer_storage_device_manufacturing Other
#> 
#> data:  VAR object var_model_final
#> Chi-squared = 85.674, df = 3, p-value < 2.2e-16

# From the test result we can say that there is evidence of instantaneous causality between the "Computer_Electronics" variable and at least one of the other variables in the model ("Elec_comp_manufacturing", "Computer_storage_device_manufacturing", "Other").

# Sparse VAR model

# Convert the xts object to a matrix
combined_original_matrix <- as.matrix(combined_ts_series)

var_model_sparse <- BigVAR.fit(combined_original_matrix, p = 3, lambda = 0.1, struct = "Lag")

# Summary
print(summary(var_model_sparse))
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  -1.1258  -0.0285   0.0158  19.7869   0.1127 737.6810

^{Created on 2024-04-10 with reprex v2.1.0}

jlivsey / UB-sping24-time-series

Alisha Ruqshan Kadiri #18