jlivsey / UB-sping24-time-series

3 stars 1 forks source link

Shweta H #2 #12

Open SH3458 opened 5 months ago

SH3458 commented 5 months ago
suppressMessages({
  library(astsa)
  library(dplyr)
  library(dygraphs)
  library(forecast)
  library(fpp3)
  library(fredr)
  library(imputeTS)
  library(lubridate)
  library(seasonal)
  library(tidyverse)
  library(tsbox)
  library(zoo)
  library(reprex)
})

# My API Key
fredr_set_key("052142bc981666b4ebcb1c8df98d006b")
icnsa = fredr(series_id = "ICNSA")

# Date Preparation
icnsa$date <- as.Date(icnsa$date)

# Check for missing values
missing_values <- any(is.na(icnsa$value))
missing_values
#> [1] FALSE

# Handling extreme values
covid_threshold <- quantile(icnsa$value, 0.99)
icnsa$value <- pmin(icnsa$value, covid_threshold)

# Creating a time series object
icnsa_ts <- ts(icnsa$value, frequency = 52.18)  

# Plot the time series
autoplot(icnsa_ts) +
  labs(title = "ICNSA")


# Autocorrelation and Partial Autocorrelation plots to get an approx value of lag
par(mar = c(3, 3, 4, 2))
acf(icnsa_ts, main = "Autocorrelation Function (ACF)")

pacf(icnsa_ts, main = "Partial Autocorrelation Function (PACF)")


# Define the lag order based on autocorrelation function (ACF) and partial autocorrelation function (PACF)
# Based on the ACF & PACF plot we can see that at lag 26 or 27 there is a significant peak hence choosing 26 as it predicts slightly better than 27
lag_order <- 26 

# Creating lagged variables
icnsa_lagged <- c()
for (i in 1:lag_order) {
  icnsa_lagged <- cbind(icnsa_lagged, stats::lag(icnsa_ts, k = i))
}

# Combine lagged variables with the original time series
icnsa_combined <- cbind(icnsa_ts, icnsa_lagged)
colnames(icnsa_combined) <- c("ICNSA", paste0("ICNSA_Lagged_", 1:lag_order))

# Impute missing values using linear interpolation for each column
for (col in colnames(icnsa_combined)) {
  icnsa_combined[, col] <- na_interpolation(icnsa_combined[, col], option = "linear")
}

# Split the data into training and testing sets
train_ratio <- 0.8
train_size <- floor(length(icnsa_combined[, "ICNSA"]) * train_ratio)
train_data <- icnsa_combined[1:train_size, ]
test_data <- icnsa_combined[(train_size + 1):nrow(icnsa_combined), ]

# Convert training and testing data to a data frame
train_data_df <- as.data.frame(train_data)
test_data_df <- as.data.frame(test_data)

# Lagged regression model
lagged_model <- lm(ICNSA ~ ., data = train_data_df)
#summary(lagged_model)

# Predictions on the test set
predictions <- predict(lagged_model, newdata = test_data_df)

# Convert predictions to a data frame
predictions_df <- data.frame(Period = index(test_data_df), Actual = test_data_df$ICNSA, Predicted = as.vector(predictions))

# Plot actual vs. predicted values using ggplot2
ggplot(predictions_df, aes(x = Period)) +
  geom_line(aes(y = Actual, color = "Actual"), linetype = "solid", size = 1) +
  geom_line(aes(y = Predicted, color = "Predicted"), linetype = "dashed", size = 1) +
  labs(title = "Actual vs. Predicted Values", y = "ICNSA") +
  scale_color_manual(name = "Values", values = c("Actual" = "blue", "Predicted" = "red")) +
  theme_minimal()
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.


# Print the predicted values
cat("Predicted Forecast value for the upcoming Thursday:", round(predictions[length(predictions)]), "\n")
#> Predicted Forecast value for the upcoming Thursday: 265025

Created on 2024-02-06 with reprex v2.1.0

jlivsey commented 4 months ago

Some notes after looking at your code:

  1. You remove 20% of the data for a "test" data.frame but that data is better suited to help build your model.
  2. The method your using is not randomized over a window or doing any variable selection that a train/test subset is needed
SH3458 commented 4 months ago

@jlivsey Yes, I realize my errors now and will address them properly in my upcoming Thursday attempt.