suppressMessages({
library(astsa)
library(dplyr)
library(dygraphs)
library(forecast)
library(fpp3)
library(fredr)
library(imputeTS)
library(lubridate)
library(seasonal)
library(tidyverse)
library(tsbox)
library(zoo)
library(reprex)
})
# My API Key
fredr_set_key("052142bc981666b4ebcb1c8df98d006b")
icnsa = fredr(series_id = "ICNSA")
# Date Preparation
icnsa$date <- as.Date(icnsa$date)
# Check for missing values
missing_values <- any(is.na(icnsa$value))
missing_values
#> [1] FALSE
# Handling extreme values
covid_threshold <- quantile(icnsa$value, 0.99)
icnsa$value <- pmin(icnsa$value, covid_threshold)
# Creating a time series object
icnsa_ts <- ts(icnsa$value, frequency = 52.18)
# Plot the time series
autoplot(icnsa_ts) +
labs(title = "ICNSA")
# Autocorrelation and Partial Autocorrelation plots to get an approx value of lag
par(mar = c(3, 3, 4, 2))
acf(icnsa_ts, main = "Autocorrelation Function (ACF)")
pacf(icnsa_ts, main = "Partial Autocorrelation Function (PACF)")
# Define the lag order based on autocorrelation function (ACF) and partial autocorrelation function (PACF)
# Based on the ACF & PACF plot we can see that at lag 26 or 27 there is a significant peak hence choosing 26 as it predicts slightly better than 27
lag_order <- 26
# Creating lagged variables
icnsa_lagged <- c()
for (i in 1:lag_order) {
icnsa_lagged <- cbind(icnsa_lagged, stats::lag(icnsa_ts, k = i))
}
# Combine lagged variables with the original time series
icnsa_combined <- cbind(icnsa_ts, icnsa_lagged)
colnames(icnsa_combined) <- c("ICNSA", paste0("ICNSA_Lagged_", 1:lag_order))
# Impute missing values using linear interpolation for each column
for (col in colnames(icnsa_combined)) {
icnsa_combined[, col] <- na_interpolation(icnsa_combined[, col], option = "linear")
}
# Split the data into training and testing sets
train_ratio <- 0.8
train_size <- floor(length(icnsa_combined[, "ICNSA"]) * train_ratio)
train_data <- icnsa_combined[1:train_size, ]
test_data <- icnsa_combined[(train_size + 1):nrow(icnsa_combined), ]
# Convert training and testing data to a data frame
train_data_df <- as.data.frame(train_data)
test_data_df <- as.data.frame(test_data)
# Lagged regression model
lagged_model <- lm(ICNSA ~ ., data = train_data_df)
#summary(lagged_model)
# Predictions on the test set
predictions <- predict(lagged_model, newdata = test_data_df)
# Convert predictions to a data frame
predictions_df <- data.frame(Period = index(test_data_df), Actual = test_data_df$ICNSA, Predicted = as.vector(predictions))
# Plot actual vs. predicted values using ggplot2
ggplot(predictions_df, aes(x = Period)) +
geom_line(aes(y = Actual, color = "Actual"), linetype = "solid", size = 1) +
geom_line(aes(y = Predicted, color = "Predicted"), linetype = "dashed", size = 1) +
labs(title = "Actual vs. Predicted Values", y = "ICNSA") +
scale_color_manual(name = "Values", values = c("Actual" = "blue", "Predicted" = "red")) +
theme_minimal()
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
# Print the predicted values
cat("Predicted Forecast value for the upcoming Thursday:", round(predictions[length(predictions)]), "\n")
#> Predicted Forecast value for the upcoming Thursday: 265025
Created on 2024-02-06 with reprex v2.1.0