Open tejeshpa opened 5 months ago
library(ggplot2)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(fpp2)
#> ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
#> ✔ fma 2.5 ✔ expsmooth 2.3
#>
# Load ICNSA and COVID data
icnsa <- read.csv("ICNSA.csv")
covid <- read.csv("owid-covid-data.csv")
# Convert date columns to Date format
icnsa$Date <- as.Date(icnsa$DATE)
covid$Date <- as.Date(covid$date)
# Plot ICNSA data over time with COVID data
icnsa_plot <- ggplot(icnsa, aes(x = Date, y = ICNSA)) +
geom_line(color = "blue") +
labs(title = "Initial Claims Over Time", x = "Date", y = "Number of Claims") +
geom_vline(xintercept = as.Date("2020-03-01"), linetype = "dashed", color = "red", alpha = 0.8) +
geom_vline(xintercept = as.Date("2021-06-01"), linetype = "dashed", color = "green", alpha = 0.8)
# Identify COVID period
covid_start_index <- which(icnsa$Date == as.Date('2020-03-14'))
covid_end_index <- which(icnsa$Date == as.Date('2021-05-01'))
# Cubic spline methodology to impute new values for the Covid period
spl_fit_excl_covid <- smooth.spline(x = icnsa$Date[!(icnsa$Date >= as.Date('2020-03-14') & icnsa$Date <= as.Date('2021-05-01'))],
y = icnsa$ICNSA[!(icnsa$Date >= as.Date('2020-03-14') & icnsa$Date <= as.Date('2021-05-01'))], df = 5)
imputed_values <- predict(spl_fit_excl_covid, x = as.numeric(covid_start_index:covid_end_index))$y
icnsa$ICNSA[covid_start_index:covid_end_index] <- imputed_values
# Plot original and imputed data
icnsa_plot_imputed <- ggplot(icnsa) +
geom_line(aes(x = Date, y = ICNSA), color = "blue") +
geom_line(data = icnsa[covid_start_index:covid_end_index, ], aes(x = Date, y = ICNSA),
color = "red", linetype = "dashed") +
labs(title = "Original and Imputed Data", x = "Time", y = "Value")
# Fit Holt-Winters models
icnsa_timeseries <- ts(icnsa$ICNSA, frequency = 52, start = as.numeric(format(min(icnsa$Date), "%Y")))
hw_additive <- HoltWinters(icnsa_timeseries, seasonal = "additive")
hw_multiplicative <- HoltWinters(icnsa_timeseries, seasonal = "multiplicative")
# Forecast next ICNSA values
forecast_additive <- forecast(hw_additive, h = 1)
forecast_multiplicative <- forecast(hw_multiplicative, h = 1)
# ACF and PACF plots
acf_plot <- autoplot(acf(icnsa_timeseries))
pacf_plot <- autoplot(pacf(icnsa_timeseries))
# Output plots
icnsa_plot
icnsa_plot_imputed
acf_plot
pacf_plot
cat("Next ICNSA value (Additive Holt-Winters):", forecast_additive$mean, "\n")
#> Next ICNSA value (Additive Holt-Winters): 204534.8
cat("Next ICNSA value (Multiplicative Holt-Winters):", forecast_multiplicative$mean, "\n")
#> Next ICNSA value (Multiplicative Holt-Winters): 192566.6
# Summary
# 1. Collected the COVID-19 and ICNSA datasets for study.
# 2. Preprocessed data: date columns were converted to the right format to ensure uniformity.
# 3. ICNSA data was visualised across time, emphasising the influence of the COVID-19 period on early claims.
# 4. For the COVID-19 period, missing ICNSA data were imputed using cubic spline interpolation.
# 5. ICNSA time data were fitted with Holt-Winters exponential smoothing models for forecasting purposes.
# 6. Investigated the use of multiplicative and additive seasonal components in Holt-Winters models to capture various patterns.
# 7. Assessed the predictive power and accuracy of the model for ICNSA values in the future.
# 8. Determined how well cubic spline imputation captured underlying patterns in the COVID-19 timeframe.
# 9. Concluded that an effective basis for analysing and forecasting ICNSA is provided by the combination of cubic spline imputation and Holt-Winters forecasting.
# 10. This approach assists in developing strategies and make well-informed decisions on the dynamics of the labour market both before and after the COVID-19 epidemic.
Created on 2024-02-28 with reprex v2.1.0
# libraries
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(fpp2)
#> ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
#> ✔ fma 2.5 ✔ expsmooth 2.3
#>
# frequency for weekly data
frequency <- 52
# Load the most current Initial Claims (ICNSA) data
icnsa_data <- read.csv("ICNSA.csv")
# Convert date column to Date format
icnsa_data$Date <- as.Date(icnsa_data$DATE)
# Empirical analysis
pandemic_start <- as.Date("2020-03-01")
pandemic_end <- as.Date("2021-12-31")
# Using the state-space for missing value methodology to impute new values for the Covid period
covid_period <- icnsa_data$Date >= pandemic_start & icnsa_data$Date <= pandemic_end
# ICNSA data for COVID period
icnsa_covid <- icnsa_data[covid_period, ]
# Use mean imputation for missing values during the COVID period
imputed_values <- mean(icnsa_covid$ICNSA, na.rm = TRUE)
# Replace missing values with imputed values
icnsa_data$ICNSA[covid_period] <- imputed_values
# ICNSA time series
icnsa_timeseries <- ts(icnsa_data$ICNSA, frequency = frequency, start = as.numeric(format(min(icnsa_data$Date), "%Y")))
# Fit structural time series model
model <- StructTS(icnsa_timeseries, type = "trend")
# Load CPI data as covariate data
cpi_data <- read.csv("CPIAUCSL.csv")
# Convert date column to Date format
cpi_data$Date <- as.Date(cpi_data$DATE)
# Merge CPI data with ICNSA data
icnsa_data <- merge(icnsa_data, cpi_data, by = "Date")
# Check if lengths of icnsa_timeseries and CPI match
length_icnsa <- length(icnsa_timeseries)
length_cpi <- nrow(cpi_data)
if (length_icnsa == length_cpi) {
combined_data <- cbind(icnsa_timeseries, cpi_data$CPI)
} else {
print("Lengths do not match.")
}
#> [1] "Lengths do not match."
if (exists("combined_data")) {
model_with_covariate <- StructTS(combined_data, type = "trend")
} else {
print("Cannot fit model, due to data mismatch.")
}
#> [1] "Cannot fit model, due to data mismatch."
cat("\nModel for Trend and Seasonal Components:\n")
#>
#> Model for Trend and Seasonal Components:
summary(model)
#> Length Class Mode
#> coef 3 -none- numeric
#> loglik 1 -none- numeric
#> loglik0 1 -none- numeric
#> data 2982 ts numeric
#> residuals 2982 ts numeric
#> fitted 5964 mts numeric
#> call 3 -none- call
#> series 1 -none- character
#> code 1 -none- numeric
#> model 7 -none- list
#> model0 7 -none- list
#> xtsp 3 -none- numeric
# Plot ACF and PACF
acf_plot <- autoplot(acf(icnsa_timeseries))
pacf_plot <- autoplot(pacf(icnsa_timeseries))
# Output plots
print(acf_plot)
print(pacf_plot)
# Forecast
forecast_model <- forecast(model, h = 12)
print(forecast_model)
#> Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 2024.346 195912.4 128044.03947 263780.8 92116.693 299708.1
#> 2024.365 195860.9 108433.25960 283288.6 62151.839 329570.0
#> 2024.385 195809.5 92450.31029 299168.6 37735.273 353883.7
#> 2024.404 195758.0 78605.68242 312910.3 16588.988 374927.0
#> 2024.423 195706.5 66214.25932 325198.8 -2334.813 393747.9
#> 2024.442 195655.1 54893.33276 336416.8 -19621.430 410931.6
#> 2024.462 195603.6 44403.34996 346803.9 -35637.229 426844.4
#> 2024.481 195552.1 34582.57517 356521.7 -50629.562 441733.8
#> 2024.500 195500.7 25315.72355 365685.6 -64774.742 455776.1
#> 2024.519 195449.2 16517.18994 374381.2 -78203.693 469102.1
#> 2024.538 195397.7 8121.34205 382674.1 -91016.788 481812.2
#> 2024.558 195346.3 76.54963 390616.0 -103292.991 493985.5
# Plot forecast
plot(forecast_model)
The forecasted point value for the next week is 195,912.4, with 80% prediction intervals from 128,044.0 to 263,780.8 and 95% prediction intervals from 92,116.7 to 299,708.1.
Created on 2024-03-07 with reprex v2.1.0
# HW4
# Load required packages
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(vars)
#> Warning: package 'vars' was built under R version 4.3.3
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Loading required package: lmtest
# Read data from CSV files
data1 <- read.csv("SeriesReport-202404102055.csv", header = TRUE, skip = 8)
data2 <- read.csv("SeriesReport-202404102057.csv", header = TRUE, skip = 8)
data3 <- read.csv("SeriesReport-202404102059.csv", header = TRUE, skip = 8)
data4 <- read.csv("SeriesReport-202404102100.csv", header = TRUE, skip = 8)
# Combine the data
combined_data <- bind_rows(data1, data2, data3, data4)
# Add column names and remove unnecessary columns
colnames(combined_data) <- c("Period", "Value")
combined_data <- combined_data[, c("Period", "Value")]
# Remove NA values
combined_data <- na.omit(combined_data)
# Descriptive statistics
summary(combined_data$Value)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 228860 329124 413563 403366 474502 589257
# Convert Period to Date format
combined_data$Period <- as.Date(paste0("01-", combined_data$Period), format = "%d-%b-%Y")
# Convert Value to numeric
combined_data$Value <- as.numeric(combined_data$Value)
# Plot the time series
plot(combined_data$Period, combined_data$Value, type = "l", xlab = "Period", ylab = "Value", main = "Time Series Plot")
# Seasonal decomposition
decomp <- decompose(ts(combined_data$Value, frequency = 12))
plot(decomp)
# Fit AR(1) model
ar1_model <- ar(combined_data$Value, order.max = 1)
# Fit AR(p) model with p > 1
p_value <- 2
arp_model <- ar(combined_data$Value, order.max = p_value)
# Extract AIC values
aic_ar1 <- ar1_model$aic
aic_arp <- arp_model$aic
# Compare AIC values
print(paste("AIC for AR(1) model:", aic_ar1))
#> [1] "AIC for AR(1) model: 1405.65884308694"
#> [2] "AIC for AR(1) model: 0"
print(paste("AIC for AR(p) model (p =", p_value, "):", aic_arp))
#> [1] "AIC for AR(p) model (p = 2 ): 1405.65884308694"
#> [2] "AIC for AR(p) model (p = 2 ): 0"
#> [3] "AIC for AR(p) model (p = 2 ): 1.7046613114162"
# AR(1) Model:
# AIC: 1405.65884308694
# AR(p) Model (p = 2):
# AIC: 1405.65884308694
# AIC: 1.7046613114162
# Comparison:
# The AIC values for both models are almost identical.
# The AR(1) model has a slightly lower AIC compared to the AR(p) model with p = 2.
# Generally, lower AIC values indicate better model fit.
# Based on the AIC comparison, the AR(1) model might be preferred over the AR(p) model with p = 2 in terms of model fit.
# However, it's important to consider other factors and conduct further analysis before making a final decision.
# Forecast one month ahead using AR(1) model
forecast_ar1 <- predict(ar1_model, n.ahead = 1)
# Extract the forecasted value
forecast_value_ar1 <- forecast_ar1$pred[1]
# Print the forecasted value
print(paste("One-month-ahead forecast using AR(1) model:", forecast_value_ar1))
#> [1] "One-month-ahead forecast using AR(1) model: 579258.465006995"
# Convert 'Value' to a time series object
value_ts <- ts(combined_data$Value, start = c(1992, 2), frequency = 12) # Adjust start date as needed
# Fit a simple AR(1) model to the data
ar1_model <- ar(value_ts, order.max = 1)
# Perform the Granger causality test using the formula interface
granger_test <- grangertest(combined_data$Value ~ lag(combined_data$Value, 1), order = 1)
# Print the results
print(granger_test)
#> Granger causality test
#>
#> Model 1: combined_data$Value ~ Lags(combined_data$Value, 1:1) + Lags(lag(combined_data$Value, 1), 1:1)
#> Model 2: combined_data$Value ~ Lags(combined_data$Value, 1:1)
#> Res.Df Df F Pr(>F)
#> 1 380
#> 2 381 -1 26.206 4.886e-07 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(granger_test)
#> Res.Df Df F Pr(>F)
#> Min. :380.0 Min. :-1 Min. :26.21 Min. :5e-07
#> 1st Qu.:380.2 1st Qu.:-1 1st Qu.:26.21 1st Qu.:5e-07
#> Median :380.5 Median :-1 Median :26.21 Median :5e-07
#> Mean :380.5 Mean :-1 Mean :26.21 Mean :5e-07
#> 3rd Qu.:380.8 3rd Qu.:-1 3rd Qu.:26.21 3rd Qu.:5e-07
#> Max. :381.0 Max. :-1 Max. :26.21 Max. :5e-07
#> NA's :1 NA's :1 NA's :1
# Load required libraries
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.3.3
#> Loading required package: lattice
# Fit a sparse VAR model using the BigVAR function with Basic structure
bigvar_obj <- constructModel(value_matrix, p = 1, struct = "Basic", gran = c(50, 10))
#> Error in eval(expr, envir, enclos): object 'value_matrix' not found
print(summary(bigvar_obj))
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'bigvar_obj' not found
plot(bigvar_obj)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'plot': object 'bigvar_obj' not found
Created on 2024-04-10 with reprex v2.1.0
HW_1-UBLearns On ICNSA And Unemployement rate as Covariant-data.
With a 95% confidence interval ranging from 170,753.7 to 521,754.0, the forecast number for the latest data release is 346,253.8, based on the final regARIMA model.
Created on 2024-02-19 with reprex v2.1.0