jlivsey / UB-sping24-time-series

3 stars 1 forks source link

Sanket Gadhave #16

Open sanketgadhave opened 4 months ago

sanketgadhave commented 4 months ago
# homework 1
# Forecasting the unemployment number for current week with given data

library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(dygraphs)
#> Warning: package 'dygraphs' was built under R version 4.2.3
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.2.3
library(forecast)
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
fredr_set_key("052142bc981666b4ebcb1c8df98d006b")

# Reading the data
icnsa = fredr(series_id = "ICNSA")
column_names <- colnames(icnsa)
print(column_names)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

# Converting the date to date data type
icnsa$date <- as.Date(icnsa$date)

#Plotting the timeseries over date and values (Unemployment number)
ggplot(icnsa, aes(x = date, y = value)) +
  geom_line() + 
  labs(title = "Unemployment Numbers Over Time", x = "Date", y = "Unemployment Numbers") +
  theme_minimal()


# Plotting the Autocorrelation Function
Acf(icnsa$value, main = "ACF of Unemployment Numbers")


# Plotting the Partial Autocorrelation Function
Pacf(icnsa$value, main = "PACF of Unemployment Numbers")


# Fitting the Arima model using auto.arima
model <- auto.arima(icnsa$value)
summary(model)
#> Series: icnsa$value 
#> ARIMA(2,0,3) with non-zero mean 
#> 
#> Coefficients:
#>          ar1      ar2      ma1      ma2      ma3       mean
#>       1.7297  -0.7371  -0.3852  -0.3309  -0.1667  367051.06
#> s.e.  0.0403   0.0370   0.0433   0.0228   0.0218   24606.25
#> 
#> sigma^2 = 7.487e+09:  log likelihood = -38091.08
#> AIC=76196.15   AICc=76196.19   BIC=76238.15
#> 
#> Training set error measures:
#>                     ME    RMSE      MAE      MPE     MAPE     MASE         ACF1
#> Training set -180.7468 86438.5 41253.74 -1.83169 10.61079 1.094085 -0.003548791

# Get the last forecasted date from data.
last_date <- max(icnsa$date)

print(last_date)
#> [1] "2024-02-03"

# As it is a weekly data next forecast would be for 2024-02-10
forecast_date <- as.Date("2024-02-10")

# Calculate the difference in weeks
weeks_between <- as.numeric(difftime(forecast_date, last_date, units = "weeks"))

# Since we're working with weekly data, round up any fractional weeks to the nearest whole number
forecast_horizon <- ceiling(weeks_between)

print(paste("Last date in dataset:", last_date))
#> [1] "Last date in dataset: 2024-02-03"
print(paste("Forecast horizon (weeks):", forecast_horizon))
#> [1] "Forecast horizon (weeks): 1"

forecast_result <- forecast(model, h = forecast_horizon) # 'h' is the forecast horizon. 1 week in this case.

forecast_mean <- mean(forecast_result$mean, na.rm = TRUE)
print(forecast_mean)
#> [1] 225616

Created on 2024-02-09 with reprex v2.1.0

sanketgadhave commented 4 months ago
# homework 2
# Handling the Covid period. (2020 - 2022 2 years) by moving average

library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(dygraphs)
#> Warning: package 'dygraphs' was built under R version 4.2.3
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.2.3
library(forecast)
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
fredr_set_key("052142bc981666b4ebcb1c8df98d006b")

# Reading the data
icnsa = fredr(series_id = "ICNSA")
column_names <- colnames(icnsa)
print(column_names)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

# Converting the date to date data type
icnsa$date <- as.Date(icnsa$date)

window_size <- 52 # a 52-week (1 year) moving average
icnsa$smoothed_value <- zoo::rollmean(icnsa$value, k = window_size, fill = NA)

# Plot the smoothed data
ggplot(icnsa, aes(x = date, y = smoothed_value)) +
  geom_line() +
  labs(title = "Smoothed Unemployment Numbers Over Time", x = "Date", y = "Smoothed Unemployment Numbers") +
  theme_minimal()
#> Warning: Removed 51 rows containing missing values (`geom_line()`).


# Plotting the Autocorrelation Function
Acf(icnsa$value, main = "ACF of Unemployment Numbers")


# Plotting the Partial Autocorrelation Function
Pacf(icnsa$value, main = "PACF of Unemployment Numbers")


# Fitting the new arima model with smoothed data
updated_model <- auto.arima(icnsa$smoothed_value)
summary(updated_model)
#> Series: icnsa$smoothed_value 
#> ARIMA(1,1,3) 
#> 
#> Coefficients:
#>          ar1     ma1     ma2      ma3
#>       0.8521  0.7171  0.1644  -0.0587
#> s.e.  0.0134  0.0232  0.0303   0.0223
#> 
#> sigma^2 = 3670877:  log likelihood = -26274.86
#> AIC=52559.71   AICc=52559.73   BIC=52589.62
#> 
#> Training set error measures:
#>                      ME     RMSE     MAE        MPE      MAPE      MASE
#> Training set -0.1090695 1914.317 756.916 0.01319981 0.2107156 0.4144923
#>                      ACF1
#> Training set 0.0005282674

# Get the last forecasted date from data.
last_date <- max(icnsa$date)

print(last_date)
#> [1] "2024-02-03"

# As it is a weekly data next forecast would be for 2024-02-10
forecast_date <- as.Date("2024-02-10")

# Calculate the difference in weeks
weeks_between <- as.numeric(difftime(forecast_date, last_date, units = "weeks"))

# Since we're working with weekly data, round up any fractional weeks to the nearest whole number
forecast_horizon <- ceiling(weeks_between)

print(paste("Last date in dataset:", last_date))
#> [1] "Last date in dataset: 2024-02-03"
print(paste("Forecast horizon (weeks):", forecast_horizon))
#> [1] "Forecast horizon (weeks): 1"

# Forecasting for the same peroid but now with smoothed data
forecast_result_updated <- forecast(updated_model, h = forecast_horizon)

forecast_mean_updated <- mean(forecast_result_updated$mean, na.rm = TRUE)
print(forecast_mean_updated)
#> [1] 219972.3

Created on 2024-02-09 with reprex v2.1.0

sanketgadhave commented 4 months ago

fredr_set_key("052142bc981666b4ebcb1c8df98d006b")

Read Data Global price of Olive Oil

oliv_oil_data = fredr(series_id = "POLVOILUSDM") column_names <- colnames(oliv_oil_data) # Getting column details print(column_names)

> [1] "date" "series_id" "value" "realtime_start"

> [5] "realtime_end"

Printing few records of data

head(oliv_oil_data)

> # A tibble: 6 × 5

> date series_id value realtime_start realtime_end

>

> 1 1980-01-01 POLVOILUSDM NA 2024-02-12 2024-02-12

> 2 1980-02-01 POLVOILUSDM NA 2024-02-12 2024-02-12

> 3 1980-03-01 POLVOILUSDM NA 2024-02-12 2024-02-12

> 4 1980-04-01 POLVOILUSDM NA 2024-02-12 2024-02-12

> 5 1980-05-01 POLVOILUSDM NA 2024-02-12 2024-02-12

> 6 1980-06-01 POLVOILUSDM NA 2024-02-12 2024-02-12

Need to remove N/A values

oliv_oil_data = na.omit(oliv_oil_data) head(oliv_oil_data)

> # A tibble: 6 × 5

> date series_id value realtime_start realtime_end

>

> 1 1990-01-01 POLVOILUSDM 2973. 2024-02-12 2024-02-12

> 2 1990-02-01 POLVOILUSDM 3053. 2024-02-12 2024-02-12

> 3 1990-03-01 POLVOILUSDM 2925. 2024-02-12 2024-02-12

> 4 1990-04-01 POLVOILUSDM 2946. 2024-02-12 2024-02-12

> 5 1990-05-01 POLVOILUSDM 3021. 2024-02-12 2024-02-12

> 6 1990-06-01 POLVOILUSDM 3074. 2024-02-12 2024-02-12

olive_oil_ts <- ts(oliv_oil_data$value, start = c(1990, 1), frequency = 12)

plot(olive_oil_ts, xlab = "Time", ylab = "Price (USD)", main = "Monthly Global Price of Olive Oil")


![](https://i.imgur.com/6fRqUka.png)<!-- -->

``` r

plot(decompose(olive_oil_ts))


# From plot we cannot say the time series is stationary, thus we need to perform statistical test such as Augmented Dickey-Fuller (ADF) test 

adf_result <- adf.test(olive_oil_ts, alternative = "stationary")
print(adf_result)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  olive_oil_ts
#> Dickey-Fuller = -1.2825, Lag order = 7, p-value = 0.8805
#> alternative hypothesis: stationary

# After analysing the results of ADF test, high p-value indicates that we do not have sufficient evidence to conclude that the time series is stationary.
# Thus we need to perform differencing.
# Performing differencing
olive_oil_diff <- diff(olive_oil_ts)

plot(olive_oil_diff, main="Differenced Olive Oil Price Time Series", ylab="Differenced Price")


# Retesting for stationarity using the ADF test to see if the differencing has stabilized the mean of the series
adf_result_diff <- adf.test(olive_oil_diff, alternative = "stationary")
#> Warning in adf.test(olive_oil_diff, alternative = "stationary"): p-value
#> smaller than printed p-value
print(adf_result_diff)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  olive_oil_diff
#> Dickey-Fuller = -5.3733, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary

# Now the 0.01 p-value indicates that the time series is not stationary when differencing is performed.

# Plotting ACF on time series after differencing
Acf(olive_oil_diff, main="ACF for Olive Oil Prices")


# Plotting PACF on time series after differencing

Pacf(olive_oil_diff, main="PACF for Differenced Olive Oil Prices")


# Let's try to fit some ARIMA models and check which is better fit.
# Since we've already found that the first differencing makes the series stationary, we can set d=1 in our ARIMA models.
# For p and q values, from ACF and PACF plots we can say that ACF and PACF values were well within the significance bounds right from the first bound.
# So we can use small p and q values.

# Model 1: ARIMA(0,1,0)
arima_010 <- Arima(olive_oil_ts, order=c(0,1,0))
summary(arima_010)
#> Series: olive_oil_ts 
#> ARIMA(0,1,0) 
#> 
#> sigma^2 = 61693:  log likelihood = -2822.1
#> AIC=5646.19   AICc=5646.2   BIC=5650.2
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE         MPE     MAPE      MASE
#> Training set 15.92897 248.0755 141.7215 -0.03579033 3.729983 0.1821827
#>                   ACF1
#> Training set 0.2326054
checkresiduals(arima_010)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(0,1,0)
#> Q* = 48.433, df = 24, p-value = 0.00223
#> 
#> Model df: 0.   Total lags used: 24

# Model 2: ARIMA(1,1,1)
arima_111 <- Arima(olive_oil_ts, order=c(1,1,1))
summary(arima_111)
#> Series: olive_oil_ts 
#> ARIMA(1,1,1) 
#> 
#> Coefficients:
#>           ar1     ma1
#>       -0.1132  0.3863
#> s.e.   0.1444  0.1311
#> 
#> sigma^2 = 57688:  log likelihood = -2807.48
#> AIC=5620.96   AICc=5621.02   BIC=5632.99
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE         MPE     MAPE      MASE
#> Training set 13.09154 239.2992 136.7525 0.003007408 3.678121 0.1757952
#>                      ACF1
#> Training set -0.004306793
checkresiduals(arima_111)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(1,1,1)
#> Q* = 17.806, df = 22, p-value = 0.7174
#> 
#> Model df: 2.   Total lags used: 24

# Model 3: ARIMA(0,1,1)
arima_011 <- Arima(olive_oil_ts, order=c(0,1,1))
summary(arima_011)
#> Series: olive_oil_ts 
#> ARIMA(0,1,1) 
#> 
#> Coefficients:
#>          ma1
#>       0.2854
#> s.e.  0.0493
#> 
#> sigma^2 = 57629:  log likelihood = -2807.77
#> AIC=5619.54   AICc=5619.57   BIC=5627.56
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE         MPE     MAPE      MASE
#> Training set 12.73021 239.4705 136.5809 0.003674148 3.678025 0.1755745
#>                     ACF1
#> Training set -0.01473991
checkresiduals(arima_011)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(0,1,1)
#> Q* = 18.004, df = 23, p-value = 0.7573
#> 
#> Model df: 1.   Total lags used: 24

# From above three models we can see that the ARIMA(0,1,1) model has the lowest AIC and AICc, suggesting it is the best model among the three according to these criteria.

# Forecast values 
predicted_values <- forecast(arima_011, h = 12)
print(predicted_values)
#>          Point Forecast    Lo 80     Hi 80    Lo 95    Hi 95
#> Jan 2024       9647.078 9339.429  9954.727 9176.570 10117.59
#> Feb 2024       9647.078 9146.048 10148.108 8880.818 10413.34
#> Mar 2024       9647.078 9008.787 10285.368 8670.897 10623.26
#> Apr 2024       9647.078 8896.213 10397.943 8498.729 10795.43
#> May 2024       9647.078 8798.443 10495.713 8349.203 10944.95
#> Jun 2024       9647.078 8710.827 10583.328 8215.206 11078.95
#> Jul 2024       9647.078 8630.737 10663.418 8092.719 11201.44
#> Aug 2024       9647.078 8556.513 10737.643 7979.203 11314.95
#> Sep 2024       9647.078 8487.028 10807.127 7872.935 11421.22
#> Oct 2024       9647.078 8421.477 10872.679 7772.682 11521.47
#> Nov 2024       9647.078 8359.257 10934.898 7677.526 11616.63
#> Dec 2024       9647.078 8299.908 10994.247 7586.760 11707.40

Created on 2024-02-12 with reprex v2.1.0

sanketgadhave commented 4 months ago
# homework 1
# Forecasting the unemployment number for current week with given data

library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(dygraphs)
#> Warning: package 'dygraphs' was built under R version 4.2.3
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(tibbletime)
#> Warning: package 'tibbletime' was built under R version 4.2.3
#> 
#> Attaching package: 'tibbletime'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:xts':
#> 
#>     first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3

fredr_set_key("052142bc981666b4ebcb1c8df98d006b")

# Reading the ICNSA data
icnsa_data = fredr(series_id = "ICNSA")
column_names_icnsa <- colnames(icnsa_data)
print(column_names_icnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

head(icnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-21     2024-02-21  
#> 2 1967-01-14 ICNSA     334000 2024-02-21     2024-02-21  
#> 3 1967-01-21 ICNSA     277000 2024-02-21     2024-02-21  
#> 4 1967-01-28 ICNSA     252000 2024-02-21     2024-02-21  
#> 5 1967-02-04 ICNSA     274000 2024-02-21     2024-02-21  
#> 6 1967-02-11 ICNSA     276000 2024-02-21     2024-02-21

# Reading the IURSA data
iurnsa_data = fredr(series_id = "IURNSA")
column_names_iurnsa <- colnames(iurnsa_data)
print(column_names_iurnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

head(iurnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id value realtime_start realtime_end
#>   <date>     <chr>     <dbl> <date>         <date>      
#> 1 1971-01-02 IURNSA      5.2 2024-02-21     2024-02-21  
#> 2 1971-01-09 IURNSA      5.3 2024-02-21     2024-02-21  
#> 3 1971-01-16 IURNSA      5.3 2024-02-21     2024-02-21  
#> 4 1971-01-23 IURNSA      5.2 2024-02-21     2024-02-21  
#> 5 1971-01-30 IURNSA      5.2 2024-02-21     2024-02-21  
#> 6 1971-02-06 IURNSA      5.2 2024-02-21     2024-02-21

# Converting the date to date data type in both datasets
icnsa_data$date <- as.Date(icnsa_data$date)
iurnsa_data$date <- as.Date(iurnsa_data$date)

# We can see that the data available for ICNSA and IURSA cover different periods. We need to ensure that they are aligned in time 

# Convert data frames to time-aware tibbles
icnsa_ts <- icnsa_data %>% as_tbl_time(index = date)
iurnsa_ts <- iurnsa_data %>% as_tbl_time(index = date)

merged_data <- left_join(icnsa_ts, iurnsa_ts, by = "date", suffix = c("_icnsa", "_iurnsa"))
cleaned_data <- na.omit(merged_data)

head(cleaned_data)
#> # A time tibble: 6 × 9
#> # Index:         date
#>   date       series_id_icnsa value_icnsa realtime_start_icnsa realtime_end_icnsa
#>   <date>     <chr>                 <dbl> <date>               <date>            
#> 1 1971-01-02 ICNSA                443000 2024-02-21           2024-02-21        
#> 2 1971-01-09 ICNSA                500000 2024-02-21           2024-02-21        
#> 3 1971-01-16 ICNSA                452000 2024-02-21           2024-02-21        
#> 4 1971-01-23 ICNSA                399000 2024-02-21           2024-02-21        
#> 5 1971-01-30 ICNSA                353000 2024-02-21           2024-02-21        
#> 6 1971-02-06 ICNSA                375000 2024-02-21           2024-02-21        
#> # ℹ 4 more variables: series_id_iurnsa <chr>, value_iurnsa <dbl>,
#> #   realtime_start_iurnsa <date>, realtime_end_iurnsa <date>
#Plotting the timeseries ICNSA
ggplot(cleaned_data, aes(x = date, y = value_icnsa)) +
  geom_line() +
  labs(title = "ICNSA over Time", x = "Date", y = "ICNSA Value") +
  theme_minimal()


#Plotting the timeseries IURSA
ggplot(cleaned_data, aes(x = date, y = value_iurnsa)) +
  geom_line() +
  labs(title = "IURNSA over Time", x = "Date", y = "IURNSA Value") +
  theme_minimal()


# Correlation between ICNSA and IURSA
cor(cleaned_data$value_icnsa, cleaned_data$value_iurnsa, use = "complete.obs")
#> [1] 0.6052214

# Plotting scatter plot using CCF (cross-correlation function) for lagged relationships
ccf(cleaned_data$value_icnsa, cleaned_data$value_iurnsa, lag.max = 20, main = "CCF between ICNSA and IURSA")


# Based on the correlation coefficient of approximately 0.6052, there is a moderate positive linear relationship between ICNSA and IURSA. 
# This indicates that when ICNSA values increase, IURSA values also tend to increase, and vice versa.

# Now lets check for stationarity
# ADF test for ICNSA
adf.test(cleaned_data$value_icnsa, alternative = "stationary")
#> Warning in adf.test(cleaned_data$value_icnsa, alternative = "stationary"):
#> p-value smaller than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  cleaned_data$value_icnsa
#> Dickey-Fuller = -8.6593, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary

# ADF test for IURSA
adf.test(cleaned_data$value_iurnsa, alternative = "stationary")
#> Warning in adf.test(cleaned_data$value_iurnsa, alternative = "stationary"):
#> p-value smaller than printed p-value
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  cleaned_data$value_iurnsa
#> Dickey-Fuller = -7.1617, Lag order = 14, p-value = 0.01
#> alternative hypothesis: stationary

# Decomposing ICNSA series to understand seasonality
icnsa_decomposed <- stl(ts(cleaned_data$value_icnsa, frequency = 52), s.window = "periodic")
plot(icnsa_decomposed)


# Decomposing IURSA series to understand seasonality
iurnsa_decomposed <- stl(ts(cleaned_data$value_iurnsa, frequency = 52), s.window = "periodic")
plot(iurnsa_decomposed)


# Fiting preliminary ARIMA model for ICNSA
prelim_model_icnsa <- auto.arima(cleaned_data$value_icnsa)

# Summary of the model
summary(prelim_model_icnsa)
#> Series: cleaned_data$value_icnsa 
#> ARIMA(2,0,4) with non-zero mean 
#> 
#> Coefficients:
#>           ar1     ar2     ma1     ma2     ma3      ma4       mean
#>       -0.0112  0.7593  1.3613  0.5133  0.0034  -0.0840  375013.12
#> s.e.   0.0479  0.0391  0.0512  0.0511  0.0439   0.0228   18653.58
#> 
#> sigma^2 = 7.957e+09:  log likelihood = -35515.32
#> AIC=71046.64   AICc=71046.7   BIC=71094.06
#> 
#> Training set error measures:
#>                     ME     RMSE     MAE       MPE     MAPE     MASE
#> Training set -16.44255 89091.54 42860.2 -1.874048 10.73206 1.103865
#>                      ACF1
#> Training set 0.0001125957

# Convert the relevant columns to time series objects
icnsa_series <- ts(cleaned_data$value_icnsa, start = c(1971, 1), frequency = 52)
iurnsa_series <- ts(cleaned_data$value_iurnsa, start = c(1971, 1), frequency = 52)

# For ICNSA
acf(icnsa_series, main = "ACF for ICNSA")

pacf(icnsa_series, main = "PACF for ICNSA")


# For IURNSA
acf(iurnsa_series, main = "ACF for IURNSA")

pacf(iurnsa_series, main = "PACF for IURNSA")


# Fit a regARIMA model
model <- auto.arima(icnsa_series, xreg = iurnsa_series)

# View the model summary
summary(model)
#> Series: icnsa_series 
#> Regression with ARIMA(2,1,1)(1,0,1)[52] errors 
#> 
#> Coefficients:
#>          ar1      ar2     ma1    sar1     sma1        xreg
#>       0.5297  -0.2789  0.0597  0.5110  -0.2019  -65410.144
#> s.e.  0.0638   0.0337  0.0695  0.0373   0.0419    8052.756
#> 
#> sigma^2 = 7.345e+09:  log likelihood = -35394.32
#> AIC=70802.65   AICc=70802.69   BIC=70844.13
#> 
#> Training set error measures:
#>                     ME     RMSE      MAE        MPE     MAPE      MASE
#> Training set -78.08817 85592.44 38155.21 -0.5205096 9.312413 0.3860366
#>                       ACF1
#> Training set -0.0003122404

# To calculate the intercept value as we are not able to get it from summary, we need to perform differencing of 1 on icnsa data
# Calculate the differences of the icnsa_series to mimic the differencing
differences <- diff(icnsa_series)

# Calculate the mean of these differences
mean_difference <- mean(differences)
print(mean_difference)
#> [1] -75.18809

# We will be using last week value from iurnsa as we don't have a future value.
last_value_iurnsa <- tail(cleaned_data$value_iurnsa, 1)
print(last_value_iurnsa)
#> [1] 1.4

# Forcasted value
next_week_forecast <- forecast(model, xreg = last_value_iurnsa, h = 1)
print(next_week_forecast)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 2024.288       211649.7 101819.8 321479.7 43679.33 379620.1
forecast_mean <- mean(next_week_forecast$mean, na.rm = TRUE)
print(forecast_mean)
#> [1] 211649.7

Created on 2024-02-21 with reprex v2.1.0

Final regARIMA Model we got from auto.arima is ARIMA(2,1,1)(1,0,1)[52]

Explaination:

Regression Diagnostics:

  1. Coefficient Estimates:

    • ar1 and ar2 are the coefficients for the autoregressive part of the model. Both are statistically significant, as their standard errors are small relative to the coefficients themselves.
    • sar1 and sma1 are the seasonal autoregressive and moving average parts, respectively. They suggest some seasonal effects in the data that are being accounted for by the model.
  2. Information Criteria:

    • The presence of the regressor improves the AIC, AICc, and BIC scores compared to the model without the regressor, suggesting a better fit to the data.
  3. Training Set Error Measures:

    • MPE (Mean Percentage Error) and MAPE (Mean Absolute Percentage Error) provide percentage measures of the average error. The MAPE of 9.31% suggests that the predictions are, on average, about 9.31% off from the actual values, which may be acceptable depending on the context.
    • MASE (Mean Absolute Scaled Error) compares the MAE to a naive model. A MASE less than one suggests that the model performs better than a naive model.
    • ACF1 is the first lag autocorrelation of the residuals, which is very close to zero, indicating that there's little to no autocorrelation in the residuals, which is good.

Forcasted Value which we got is: 211649.7

sanketgadhave commented 4 months ago
# homework 2
# Forecasting the unemployment number for current week with given ICNSA data by handling covid period using cubic spline methodology

library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(dygraphs)
#> Warning: package 'dygraphs' was built under R version 4.2.3
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(tibbletime)
#> Warning: package 'tibbletime' was built under R version 4.2.3
#> 
#> Attaching package: 'tibbletime'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:xts':
#> 
#>     first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(zoo)

fredr_set_key("052142bc981666b4ebcb1c8df98d006b")

# Reading the ICNSA data
icnsa_data = fredr(series_id = "ICNSA")
column_names_icnsa <- colnames(icnsa_data)
print(column_names_icnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

head(icnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-02-28     2024-02-28  
#> 2 1967-01-14 ICNSA     334000 2024-02-28     2024-02-28  
#> 3 1967-01-21 ICNSA     277000 2024-02-28     2024-02-28  
#> 4 1967-01-28 ICNSA     252000 2024-02-28     2024-02-28  
#> 5 1967-02-04 ICNSA     274000 2024-02-28     2024-02-28  
#> 6 1967-02-11 ICNSA     276000 2024-02-28     2024-02-28

# Converting the date to date data type in both datasets
icnsa_data$date <- as.Date(icnsa_data$date)

# Plotting the timeseries ICNSA
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA over Time", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# Selecting the covid period from March 14, 2020, to January 2, 2021
covid_start <- as.Date("2020-03-14")
covid_end <- as.Date("2021-01-02")

# Identifying the index of rows between this covid period
covid_start_idx <- which(icnsa_data$date == covid_start)
covid_end_idx <- which(icnsa_data$date == covid_end)
covid_idx <- rep(0, nrow(icnsa_data))
covid_idx[covid_start_idx:covid_end_idx] <- 1
covid_idx <- as.logical(covid_idx)

icnsa_na <- icnsa_data |>
  select(date, value) 

icnsa_na$value[covid_idx] <- NA

# Ensuring data is ordered by date to avoid any potential issues during interpolation.
icnsa_na <- icnsa_na %>% arrange(date)

# Impute missing values using cubic spline interpolation
icnsa_na$value <- na.spline(icnsa_na$value)

# Applying smoothing spline outside COVID period and plotting
values_before_covid <- icnsa_na$value[!covid_idx] 

# As icnsa_na$date is of class Date, we need to convert it to numeric as smooth.spline() function expects numerical inputs for both fitting and prediction, and Date objects are not handled as numeric by default.
date_numeric <- as.numeric(icnsa_na$date - min(icnsa_na$date))

# Now, apply smooth.spline using the numeric dates

spar_values <- c(0.1, 0.5, 1)
# Prepare the plot window
par(mfrow=c(3, 1)) 

for(spar in spar_values) {
  # Fit smooth spline with current spar value
  fit_spline <- smooth.spline(date_numeric[!covid_idx], values_before_covid, spar=spar)

  # Predict using the fitted spline for the entire period
  predicted_values <- predict(fit_spline, date_numeric)

  # Plot
  plot(icnsa_na$date, icnsa_na$value, type='l', main=paste("Spar =", spar),
       xlab="Date", ylab="ICNSA Values", col="blue")
  lines(icnsa_na$date, predicted_values$y, col="red")
  legend("topleft", legend=c("Original", "Smoothed"), col=c("blue", "red"), lty=1)
}


# Reset par to default after plotting
par(mfrow=c(1, 1))

# Using the chosen spar value to impute the missing values
fit_spline <- smooth.spline(date_numeric[!covid_idx], values_before_covid, spar=0.5)
predicted_values <- predict(fit_spline, date_numeric)

#ggplot() +
# geom_line(aes(x = icnsa_na$date, y = icnsa_na$value), color = "blue") +
#geom_line(aes(x = icnsa_na$date, y = predicted_values$y), color = "red") +
#labs(title = "ICNSA Data with Cubic Spline Imputation", x = "Date", y = "Value") +
#theme_minimal()

# Replace the NA values in the COVID period with the predicted values
icnsa_na$value[covid_idx] <- predicted_values$y[covid_idx]

# Converting the complete series with imputed values into a time series object
icnsa_ts <- ts(icnsa_na$value, frequency = 52) 

# Fit additive Holt-Winters model
hw_add <- HoltWinters(icnsa_ts, seasonal = "additive")

# Forecast the next value
forecast_add <- forecast(hw_add, h = 1)
print(forecast_add)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 58.32692       163281.7 118912.4 207651.1 95424.73 231138.7
forecast_mean_add <- mean(forecast_add$mean, na.rm = TRUE)
print(forecast_mean_add)
#> [1] 163281.7

# Fit multiplicative Holt-Winters model
hw_mul <- HoltWinters(icnsa_ts, seasonal = "multiplicative")

# Forecast the next value
forecast_mul <- forecast(hw_mul, h = 1)
print(forecast_mul)
#>          Point Forecast    Lo 80  Hi 80    Lo 95    Hi 95
#> 58.32692       182854.1 138777.3 226931 115444.4 250263.9
forecast_mean_mul <- mean(forecast_mul$mean, na.rm = TRUE)
print(forecast_mean_mul)
#> [1] 182854.1

Created on 2024-02-28 with reprex v2.1.0

sanketgadhave commented 4 months ago

Homework - 3

# homework 3
# Forecasting the unemployment number for current week with given ICNSA data by handling covid period using state-space missing value methodology

library(fredr)
#> Warning: package 'fredr' was built under R version 4.2.3
library(forecast)
#> Warning: package 'forecast' was built under R version 4.2.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(dygraphs)
#> Warning: package 'dygraphs' was built under R version 4.2.3
library(xts)
#> Warning: package 'xts' was built under R version 4.2.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.2.3
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.2.3
library(reprex)
#> Warning: package 'reprex' was built under R version 4.2.3
library(tibbletime)
#> Warning: package 'tibbletime' was built under R version 4.2.3
#> 
#> Attaching package: 'tibbletime'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:xts':
#> 
#>     first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tseries)
#> Warning: package 'tseries' was built under R version 4.2.3
library(zoo)
library(stats)

fredr_set_key("052142bc981666b4ebcb1c8df98d006b")

# Reading the ICNSA data
icnsa_data = fredr(series_id = "ICNSA")
column_names_icnsa <- colnames(icnsa_data)
print(column_names_icnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

head(icnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id  value realtime_start realtime_end
#>   <date>     <chr>      <dbl> <date>         <date>      
#> 1 1967-01-07 ICNSA     346000 2024-03-07     2024-03-07  
#> 2 1967-01-14 ICNSA     334000 2024-03-07     2024-03-07  
#> 3 1967-01-21 ICNSA     277000 2024-03-07     2024-03-07  
#> 4 1967-01-28 ICNSA     252000 2024-03-07     2024-03-07  
#> 5 1967-02-04 ICNSA     274000 2024-03-07     2024-03-07  
#> 6 1967-02-11 ICNSA     276000 2024-03-07     2024-03-07

# Converting the date to date data type in both datasets
icnsa_data$date <- as.Date(icnsa_data$date)

# Plotting the timeseries ICNSA
ggplot(icnsa_data, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "ICNSA over Time", x = "Date", y = "ICNSA Value") +
  theme_minimal()


icnsa_data_orignal <- icnsa_data
# Selecting the covid period from March 14, 2020, to January 2, 2021
covid_start <- as.Date("2020-03-14")
covid_end <- as.Date("2021-01-02")

# Identifying the index of rows between this covid period
covid_start_idx <- which(icnsa_data$date == covid_start)
covid_end_idx <- which(icnsa_data$date == covid_end)
covid_idx <- rep(0, nrow(icnsa_data))
covid_idx[covid_start_idx:covid_end_idx] <- 1
covid_idx <- as.logical(covid_idx)

icnsa_na <- icnsa_data |>
  select(date, value) 

icnsa_na$value[covid_idx] <- NA

# Ensuring data is ordered by date to avoid any potential issues during interpolation.
icnsa_na <- icnsa_na %>% arrange(date)

# Now in order to predict the missing values for covid period usinf state-space methodology for handling missing values, we can fir a state-space model to a time series, which can then be used for imputing missing values.
# For this using StructTS we need to decide on the type of model to use. For that we need to do some empirical analysis

# Plot the time series data excluding the COVID period

ggplot(icnsa_na, aes(x = date, y = value)) +
  geom_line(na.rm = TRUE) +  # Removes NA values for plotting
  labs(title = "ICNSA over Time Excluding COVID Period", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# Summary Statistics
summary(icnsa_na$value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>  133000  264757  325019  346846  400886 1082696      43

# ACF and PACF plots
acf(icnsa_na$value, na.action = na.pass, main="ACF for ICNSA")

pacf(icnsa_na$value, na.action = na.pass, main="PACF for ICNSA")


# From the analysis The time series plot did not display any clear trend or seasonality, which could suggest a model with just a level (type = "level").
# The ACF plot showed a slow decay, which often suggests non-stationarity. In the context of StructTS, this might suggest the need for a trend component in the model (type = "trend") to account for non-stationary behavior.

# Fit a model with trend

icnsa_ts <- ts(icnsa_data$value, start = c(start(icnsa_data$date)), frequency = 52)
ss_model_trend <- StructTS(icnsa_ts, type = "trend")

print(ss_model_trend)
#> 
#> Call:
#> StructTS(x = icnsa_ts, type = "trend")
#> 
#> Variances:
#>     level      slope    epsilon  
#> 9.413e+09  0.000e+00  0.000e+00
# Use KalmanSmooth to obtain the smoothed state estimates
smoothed <- tsSmooth(ss_model_trend)

# Updating NA values
icnsa_data$value[covid_idx] <- ifelse(is.na(icnsa_data$value), smoothed[covid_idx], icnsa_data$value)
#> Warning in icnsa_data$value[covid_idx] <- ifelse(is.na(icnsa_data$value), :
#> number of items to replace is not a multiple of replacement length

ggplot() +
  geom_line(data = icnsa_data_orignal, aes(x = date, y = value), color = "blue") + 
  geom_line(data = icnsa_data, aes(x = date, y = value), color = "red") +
  labs(title = "ICNSA over Time with Original and Imputed Values", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# Now as we have imputed the values for covid period let's fit a structural time series model to the entire ICNSA dataset, which includes the imputed COVID period. 
# Converting the imputed data to a 'ts' object
icnsa_complete_ts <- ts(icnsa_data$value, start = c(start(icnsa_data$date)), frequency = 52)

# Fitting a structural time series model with both trend and seasonal components
ss_complete_model <- StructTS(icnsa_complete_ts, type = "BSM")

summary(ss_complete_model)
#>           Length Class  Mode     
#> coef         4   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    8946   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric
# Use tsSmooth to obtain the smoothed state estimates
smoothed_complete <- tsSmooth(ss_complete_model)

residuals_ss <- residuals(ss_complete_model)

# Plot residuals to visualize the error component
plot(residuals_ss, main="Residuals from Structural Model", ylab="Residuals", xlab="Time")


# Reading the CCNSA data
ccnsa_data = fredr(series_id = "CCNSA")
column_names_ccnsa <- colnames(ccnsa_data)
print(column_names_ccnsa)
#> [1] "date"           "series_id"      "value"          "realtime_start"
#> [5] "realtime_end"

head(ccnsa_data)
#> # A tibble: 6 × 5
#>   date       series_id   value realtime_start realtime_end
#>   <date>     <chr>       <dbl> <date>         <date>      
#> 1 1967-01-07 CCNSA     1594000 2024-03-07     2024-03-07  
#> 2 1967-01-14 CCNSA     1563000 2024-03-07     2024-03-07  
#> 3 1967-01-21 CCNSA     1551000 2024-03-07     2024-03-07  
#> 4 1967-01-28 CCNSA     1533000 2024-03-07     2024-03-07  
#> 5 1967-02-04 CCNSA     1534000 2024-03-07     2024-03-07  
#> 6 1967-02-11 CCNSA     1557000 2024-03-07     2024-03-07

# Merge ICNSA and CCNSA data
combined_data <- merge(icnsa_data, ccnsa_data, by = "date", all = TRUE)
print(colnames(combined_data))
#> [1] "date"             "series_id.x"      "value.x"          "realtime_start.x"
#> [5] "realtime_end.x"   "series_id.y"      "value.y"          "realtime_start.y"
#> [9] "realtime_end.y"
# Assuming no NAs, convert to 'ts' object
combined_ccnsa_ts <- ts(combined_data$value.y, start = c(1967, 1), frequency = 52)

# Fit an ARIMA model with a covariate
# Note: You'll likely need to use a different model-fitting function that supports exogenous regressors
arima_with_covariate <- auto.arima(icnsa_complete_ts, xreg = combined_ccnsa_ts, seasonal = TRUE)

# We will be using last week value from iurnsa as we don't have a future value.
last_value_ccnsa <- tail(combined_data$value.y, 2)
print(last_value_ccnsa)
#> [1] 2098851      NA

# Forecast the next week's value using the model with the covariate
next_week_forecast <- forecast(arima_with_covariate, h = 1, xreg = last_value_ccnsa)
#> Warning in forecast.forecast_ARIMA(arima_with_covariate, h = 1, xreg =
#> last_value_ccnsa): Upper prediction intervals are not finite.
print(next_week_forecast)
#>          Point Forecast    Lo 80    Hi 80  Lo 95    Hi 95
#> 58.34615         226137 171170.5 281103.4 142073 310200.9
#> 58.36538             NA       NA       NA     NA       NA
forecast_mean <- mean(next_week_forecast$mean, na.rm = TRUE)
print(forecast_mean)
#> [1] 226137

Created on 2024-03-07 with reprex v2.1.0

sanketgadhave commented 2 months ago

file_paths <- c('C:/Users/sanke/OneDrive/Documents/R/Machinery.xlsx', 'C:/Users/sanke/OneDrive/Documents/R/Mining.xlsx', 'C:/Users/sanke/OneDrive/Documents/R/Construction.xlsx', 'C:/Users/sanke/OneDrive/Documents/R/Farm.xlsx')

names_list <- c("Machinery", "Farm", "Construction", "Mining")

A function to read and preprocess each Excel file

read_and_process_excel <- function(file_path) { df <- read_excel(file_path)

Converting the first column to character in case it's being read in some other format

df$Period <- as.character(df[[1]])

Now converting 'Period' to Date format

df$Period <- as.Date(df$Period, format="%b-%Y")

df$Period <- mdy(df$Period)

If value column contains commas, removing them before conversion to numeric

df$Value <- as.numeric(gsub(",", "", as.character(df[[2]])))

Removing rows with NA values

df <- na.omit(df)

Creating an xts object for the time series

time_series <- xts(df$Value, order.by=df$Period)

return(time_series) }

Applying the function to each file and storing the resulting time series in a list

time_series_list <- lapply(file_paths, read_and_process_excel)

head(time_series_list)

> [[1]]

> m.c.seq.row..seq.n...seq.col..drop...FALSE.

> 1992-01-19 36225

> 1992-02-19 35846

> 1992-03-19 35610

> 1992-04-19 35691

> 1992-05-19 35815

> 1992-06-19 35786

> 1992-07-19 35878

> 1992-08-19 35760

> 1992-09-19 35908

> 1992-10-19 35812

> ...

> 2023-05-20 94545

> 2023-06-20 94737

> 2023-07-20 94687

> 2023-08-20 94753

> 2023-09-20 94925

> 2023-10-20 94877

> 2023-11-20 95060

> 2023-12-20 95407

> 2024-01-20 95340

> 2024-02-20 95257

>

> [[2]]

> m.c.seq.row..seq.n...seq.col..drop...FALSE.

> 1992-01-19 2022

> 1992-02-19 1996

> 1992-03-19 1959

> 1992-04-19 1943

> 1992-05-19 1926

> 1992-06-19 1919

> 1992-07-19 1908

> 1992-08-19 1897

> 1992-09-19 1855

> 1992-10-19 1853

> ...

> 2023-05-20 5865

> 2023-06-20 5868

> 2023-07-20 5798

> 2023-08-20 5866

> 2023-09-20 5939

> 2023-10-20 5953

> 2023-11-20 5879

> 2023-12-20 5840

> 2024-01-20 5667

> 2024-02-20 5509

>

> [[3]]

> m.c.seq.row..seq.n...seq.col..drop...FALSE.

> 1992-01-19 2898

> 1992-02-19 2854

> 1992-03-19 2795

> 1992-04-19 2765

> 1992-05-19 2753

> 1992-06-19 2751

> 1992-07-19 2736

> 1992-08-19 2731

> 1992-09-19 2783

> 1992-10-19 2702

> ...

> 2023-05-20 9397

> 2023-06-20 9456

> 2023-07-20 9470

> 2023-08-20 9608

> 2023-09-20 9714

> 2023-10-20 9807

> 2023-11-20 9855

> 2023-12-20 10010

> 2024-01-20 9971

> 2024-02-20 9982

>

> [[4]]

> m.c.seq.row..seq.n...seq.col..drop...FALSE.

> 1992-01-19 1121

> 1992-02-19 1124

> 1992-03-19 1129

> 1992-04-19 1149

> 1992-05-19 1167

> 1992-06-19 1158

> 1992-07-19 1154

> 1992-08-19 1153

> 1992-09-19 1167

> 1992-10-19 1134

> ...

> 2023-05-20 5711

> 2023-06-20 5765

> 2023-07-20 5766

> 2023-08-20 5651

> 2023-09-20 5605

> 2023-10-20 5522

> 2023-11-20 5634

> 2023-12-20 5720

> 2024-01-20 5601

> 2024-02-20 5555

Name the time series in the list

names(time_series_list) <- names_list

Summary for each timeseries

for (i in 1:length(time_series_list)) { cat("Time series:", names(time_series_list[i]), "\n") print(summary(time_series_list[[i]])) cat("\n") }

> Time series: Machinery

> Index time_series_list[[i]]

> Min. :1992-01-19 Min. :35610

> 1st Qu.:2000-01-27 1st Qu.:44751

> Median :2008-02-04 Median :51758

> Mean :2008-02-03 Mean :56001

> 3rd Qu.:2016-02-12 3rd Qu.:66149

> Max. :2024-02-20 Max. :95407

>

> Time series: Farm

> Index time_series_list[[i]]

> Min. :1992-01-19 Min. :1615

> 1st Qu.:2000-01-27 1st Qu.:2244

> Median :2008-02-04 Median :5014

> Mean :2008-02-03 Mean :4508

> 3rd Qu.:2016-02-12 3rd Qu.:6280

> Max. :2024-02-20 Max. :9504

>

> Time series: Construction

> Index time_series_list[[i]]

> Min. :1992-01-19 Min. : 2702

> 1st Qu.:2000-01-27 1st Qu.: 3479

> Median :2008-02-04 Median : 4482

> Mean :2008-02-03 Mean : 4774

> 3rd Qu.:2016-02-12 3rd Qu.: 5734

> Max. :2024-02-20 Max. :10010

>

> Time series: Mining

> Index time_series_list[[i]]

> Min. :1992-01-19 Min. :1121

> 1st Qu.:2000-01-27 1st Qu.:2046

> Median :2008-02-04 Median :2614

> Mean :2008-02-03 Mean :2894

> 3rd Qu.:2016-02-12 3rd Qu.:3720

> Max. :2024-02-20 Max. :6092

Ploting each time series

par(mfrow=c(2, 2))

for (i in 1:length(time_series_list)) { plot(index(time_series_list[[i]]), coredata(time_series_list[[i]]), type = "l", # Line plot main = names(time_series_list[i]), xlab = "Date", ylab = "Value", col = "blue") }


![](https://i.imgur.com/zbWdCSa.png)<!-- -->

``` r

par(mfrow=c(1, 1)) 

##### Empirical analysis #####
# Augmented Dickey-Fuller Test for each series
adf_results <- lapply(time_series_list, function(ts) {
  return(adf.test(ts))
})

# Check the results
adf_results
#> $Machinery
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.5934, Lag order = 7, p-value = 0.3266
#> alternative hypothesis: stationary
#> 
#> 
#> $Farm
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -1.6584, Lag order = 7, p-value = 0.7214
#> alternative hypothesis: stationary
#> 
#> 
#> $Construction
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.3139, Lag order = 7, p-value = 0.4446
#> alternative hypothesis: stationary
#> 
#> 
#> $Mining
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -3.2893, Lag order = 7, p-value = 0.07297
#> alternative hypothesis: stationary

# The Augmented Dickey-Fuller (ADF) test results for each of time series indicate that the null hypothesis of a unit root (i.e., non-stationarity) cannot be rejected for the Machinery, Farm, and Construction series, given their relatively high p-values. 
# For the Mining series, the p-value is closer to the typical significance level of 0.05, suggesting it may be stationary.
# Function to difference and test stationarity
difference_and_test <- function(ts) {
  # Apply differencing
  diff_ts <- diff(ts)

  # Remove NA values that result from differencing
  diff_ts <- na.omit(diff_ts)

  # Perform ADF test on the differenced series
  adf_test_diff <- adf.test(diff_ts)

  return(list(original_adf = adf.test(ts), diff_adf = adf_test_diff, diff_series = diff_ts))
}

# Applying the function to each time series in the list
diff_adf_results <- lapply(time_series_list, difference_and_test)
#> Warning in adf.test(diff_ts): p-value smaller than printed p-value
#> Warning in adf.test(diff_ts): p-value smaller than printed p-value

#> Warning in adf.test(diff_ts): p-value smaller than printed p-value
# Check the results
diff_adf_results
#> $Machinery
#> $Machinery$original_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.5934, Lag order = 7, p-value = 0.3266
#> alternative hypothesis: stationary
#> 
#> 
#> $Machinery$diff_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  diff_ts
#> Dickey-Fuller = -4.3587, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> 
#> 
#> $Machinery$diff_series
#>            m.c.seq.row..seq.n...seq.col..drop...FALSE.
#> 1992-02-19                                        -379
#> 1992-03-19                                        -236
#> 1992-04-19                                          81
#> 1992-05-19                                         124
#> 1992-06-19                                         -29
#> 1992-07-19                                          92
#> 1992-08-19                                        -118
#> 1992-09-19                                         148
#> 1992-10-19                                         -96
#> 1992-11-19                                          65
#>        ...                                            
#> 2023-05-20                                         576
#> 2023-06-20                                         192
#> 2023-07-20                                         -50
#> 2023-08-20                                          66
#> 2023-09-20                                         172
#> 2023-10-20                                         -48
#> 2023-11-20                                         183
#> 2023-12-20                                         347
#> 2024-01-20                                         -67
#> 2024-02-20                                         -83
#> 
#> 
#> $Farm
#> $Farm$original_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -1.6584, Lag order = 7, p-value = 0.7214
#> alternative hypothesis: stationary
#> 
#> 
#> $Farm$diff_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  diff_ts
#> Dickey-Fuller = -3.5924, Lag order = 7, p-value = 0.03387
#> alternative hypothesis: stationary
#> 
#> 
#> $Farm$diff_series
#>            m.c.seq.row..seq.n...seq.col..drop...FALSE.
#> 1992-02-19                                         -26
#> 1992-03-19                                         -37
#> 1992-04-19                                         -16
#> 1992-05-19                                         -17
#> 1992-06-19                                          -7
#> 1992-07-19                                         -11
#> 1992-08-19                                         -11
#> 1992-09-19                                         -42
#> 1992-10-19                                          -2
#> 1992-11-19                                          11
#>        ...                                            
#> 2023-05-20                                          69
#> 2023-06-20                                           3
#> 2023-07-20                                         -70
#> 2023-08-20                                          68
#> 2023-09-20                                          73
#> 2023-10-20                                          14
#> 2023-11-20                                         -74
#> 2023-12-20                                         -39
#> 2024-01-20                                        -173
#> 2024-02-20                                        -158
#> 
#> 
#> $Construction
#> $Construction$original_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -2.3139, Lag order = 7, p-value = 0.4446
#> alternative hypothesis: stationary
#> 
#> 
#> $Construction$diff_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  diff_ts
#> Dickey-Fuller = -4.2533, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> 
#> 
#> $Construction$diff_series
#>            m.c.seq.row..seq.n...seq.col..drop...FALSE.
#> 1992-02-19                                         -44
#> 1992-03-19                                         -59
#> 1992-04-19                                         -30
#> 1992-05-19                                         -12
#> 1992-06-19                                          -2
#> 1992-07-19                                         -15
#> 1992-08-19                                          -5
#> 1992-09-19                                          52
#> 1992-10-19                                         -81
#> 1992-11-19                                          33
#>        ...                                            
#> 2023-05-20                                          43
#> 2023-06-20                                          59
#> 2023-07-20                                          14
#> 2023-08-20                                         138
#> 2023-09-20                                         106
#> 2023-10-20                                          93
#> 2023-11-20                                          48
#> 2023-12-20                                         155
#> 2024-01-20                                         -39
#> 2024-02-20                                          11
#> 
#> 
#> $Mining
#> $Mining$original_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  ts
#> Dickey-Fuller = -3.2893, Lag order = 7, p-value = 0.07297
#> alternative hypothesis: stationary
#> 
#> 
#> $Mining$diff_adf
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  diff_ts
#> Dickey-Fuller = -5.3153, Lag order = 7, p-value = 0.01
#> alternative hypothesis: stationary
#> 
#> 
#> $Mining$diff_series
#>            m.c.seq.row..seq.n...seq.col..drop...FALSE.
#> 1992-02-19                                           3
#> 1992-03-19                                           5
#> 1992-04-19                                          20
#> 1992-05-19                                          18
#> 1992-06-19                                          -9
#> 1992-07-19                                          -4
#> 1992-08-19                                          -1
#> 1992-09-19                                          14
#> 1992-10-19                                         -33
#> 1992-11-19                                           1
#>        ...                                            
#> 2023-05-20                                          41
#> 2023-06-20                                          54
#> 2023-07-20                                           1
#> 2023-08-20                                        -115
#> 2023-09-20                                         -46
#> 2023-10-20                                         -83
#> 2023-11-20                                         112
#> 2023-12-20                                          86
#> 2024-01-20                                        -119
#> 2024-02-20                                         -46
# Extracting differenced time series from diff_adf_results
diff_series_list <- lapply(diff_adf_results, function(res) res$diff_series)

# Combining the differenced time series into a multivariate time series
combined_diff_series <- do.call(cbind, diff_series_list)

## Correlation
# Assign meaningful names to variables
names_list <- c("Machinery", "Farm", "Construction", "Mining")

# Extract data matrices from time series list and assign names
data_matrices <- lapply(1:length(time_series_list), function(i) {
  ts_matrix <- as.matrix(time_series_list[[i]])
  colnames(ts_matrix) <- names_list[i]
  return(ts_matrix)
})

# Combine data matrices into a single matrix
combined_data_matrix <- do.call(cbind, data_matrices)

# Calculate correlations between original series
correlation_matrix <- cor(combined_data_matrix)

# Print correlation matrix
print(correlation_matrix)
#>              Machinery      Farm Construction    Mining
#> Machinery    1.0000000 0.7356755    0.9766534 0.9766122
#> Farm         0.7356755 1.0000000    0.6912602 0.7468435
#> Construction 0.9766534 0.6912602    1.0000000 0.9610721
#> Mining       0.9766122 0.7468435    0.9610721 1.0000000

# ACF and PACF plots for the Machinery time series

machinery_ts <- time_series_list[["Machinery"]]

# Plotting ACF
acf(machinery_ts, main="ACF for Machinery Time Series")


# Plotting PACF
pacf(machinery_ts, main="PACF for Machinery Time Series")


# CCF plots between Machinery and each of the other sectors
# Define the machinery time series
machinery <- as.numeric(time_series_list[[1]])

# Set up multiple plots in a column
par(mfrow=c(length(time_series_list)-1, 1))  

for(i in 2:length(time_series_list)) {  
  # Extract the time series data
  ts <- as.numeric(time_series_list[[i]])

  # Calculate the CCF
  ccf_result <- ccf(machinery, ts, main=paste("CCF between Machinery and", names_list[i]))

  # Plot the CCF
  plot(ccf_result)
}


##### Fitting VAR Model #####
# Fit VAR(1) model
var_model_1 <- VAR(combined_diff_series, p = 1, type = "const")

# Get AIC and BIC for VAR(1) model
aic_var1 <- AIC(var_model_1)
bic_var1 <- BIC(var_model_1)

# To decide value of p let's fit VAR for different values of p
# Fit VAR models with different lag orders
var_model_p1 <- VAR(combined_diff_series, p = 1, type = "const")
var_model_p2 <- VAR(combined_diff_series, p = 2, type = "const")
var_model_p3 <- VAR(combined_diff_series, p = 3, type = "const")

# Calculate AIC and BIC for each model
aic_values <- c(AIC(var_model_p1), AIC(var_model_p2), AIC(var_model_p3))
bic_values <- c(BIC(var_model_p1), BIC(var_model_p2), BIC(var_model_p3))

# Print AIC and BIC values
print(aic_values)
#> [1] 19022.90 18893.30 18823.45
print(bic_values)
#> [1] 19101.92 19035.43 19028.61

# Both the AIC and BIC criteria suggest that a VAR(3) model may be the most appropriate choice for your data. 
# This model captures more complex dynamics compared to VAR(1) and VAR(2) while still being parsimonious enough to avoid overfitting.
var_model_p <- VAR(combined_diff_series, p = 3, type = "const")

# Get AIC and BIC for VAR(p) model
aic_varp <- AIC(var_model_p)
bic_varp <- BIC(var_model_p)

# Compare AIC and BIC
print(paste("VAR(1) AIC:", aic_var1, "BIC:", bic_var1))
#> [1] "VAR(1) AIC: 19022.9026643798 BIC: 19101.9155154316"
print(paste("VAR(p) AIC:", aic_varp, "BIC:", bic_varp))
#> [1] "VAR(p) AIC: 18823.4465925298 BIC: 19028.6084641773"

##### Forecasting one month ahead #####
# Forecast one month ahead using VAR(p) model (p = 3)
forecast_varp <- predict(var_model_p, n.ahead = 1)

# Print forecasted value
print(forecast_varp)
#> $Machinery
#>                   fcst     lower    upper       CI
#> Machinery.fcst 117.157 -535.5794 769.8935 652.7365
#> 
#> $Farm
#>                fcst     lower    upper       CI
#> Farm.fcst -21.15704 -230.9492 188.6351 209.7922
#> 
#> $Construction
#>                        fcst    lower    upper       CI
#> Construction.fcst -10.80041 -160.375 138.7742 149.5746
#> 
#> $Mining
#>                 fcst     lower    upper       CI
#> Mining.fcst 24.14532 -119.9318 168.2224 144.0771

# Extract last observed values for each time series
last_observed_values <- sapply(time_series_list, function(ts) tail(ts, 1))

# Convert list to numeric vector
last_observed_values <- unlist(last_observed_values)
# Check data types
print(class(last_observed_values))
#> [1] "numeric"

# Extract forecasted differences
forecasted_differences <- c(117.157, -21.15704, -10.80041, 24.14532)
print(class(forecasted_differences))
#> [1] "numeric"
# Convert forecasted differences to original scale
forecasted_values <- last_observed_values + forecasted_differences

# Print forecasted values
print(forecasted_values)
#>    Machinery         Farm Construction       Mining 
#>    95374.157     5487.843     9971.200     5579.145

##### Granger causality #####
# Perform Granger causality tests
granger_results <- causality(var_model_p)
#> Warning in causality(var_model_p): 
#> Argument 'cause' has not been specified;
#> using first variable in 'x$y' (Machinery) as cause variable.
# Print Granger causality test results
print(granger_results)
#> $Granger
#> 
#>  Granger causality H0: Machinery do not Granger-cause Farm Construction
#>  Mining
#> 
#> data:  VAR object var_model_p
#> F-Test = 5.0742, df1 = 9, df2 = 1476, p-value = 8.911e-07
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Machinery and Farm Construction
#>  Mining
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 98.688, df = 3, p-value < 2.2e-16

##### sparse VAR model #####
# Combining the Orignal time series into a multivariate time series
combined_original_series <- do.call(cbind, time_series_list)
# Convert the xts object to a matrix
combined_original_matrix <- as.matrix(combined_original_series)
# Specify the sparsity structure
library(BigVAR)
penalty <- "lasso"
penalty_param <- 0.1  

sparse_var_model <- BigVAR.fit(combined_original_matrix, p = 3, lambda = 0.1, struct = "Lag")

# Print summary of the sparse VAR model
print(summary(sparse_var_model))
#>      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#> -42.79728  -0.06492   0.00556   6.01234   0.08531 198.94642

Created on 2024-04-10 with reprex v2.1.0