jlivsey / UB-sping24-time-series


Bhupathiraju Sri Sai Pavan Varma #14

Open pavan-149 opened 10 months ago

pavan-149 commented 10 months ago
setwd("C:/Users/pavan varma/Desktop/Spring Sem")
library(reprex)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tsbox)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric

uninsured_claims <- read.csv("ICNSA.csv", header = TRUE)

#View(uninsured_claims)

uninsured_claims$DATE <- as.Date(uninsured_claims$DATE)

# Calculating the 4-week moving average
moving_avg <- rollmean(uninsured_claims$ICNSA, k = 4, align = "right", fill = NA)

# Flagging COVID-year observations with an indicator variable
uninsured_claims$covid_years <- as.numeric(uninsured_claims$DATE >= as.Date("2020-01-01") & uninsured_claims$DATE <= as.Date("2022-12-31"))

# Plotting ICNSA values over time
unemployment_df <- data.frame(Date = uninsured_claims$DATE, ICNSA = uninsured_claims$ICNSA)
moving_avg_df <- data.frame(Date = uninsured_claims$DATE, Moving_Avg = moving_avg)

plot(unemployment_df$Date, unemployment_df$ICNSA, type = "l", xlab = "Date", ylab = "Unemployment", main = "Unemployment over Time", col = "blue")
lines(moving_avg_df$Date, moving_avg_df$Moving_Avg, col = "red")

legend("topright", legend = c("ICNSA", "4-week Moving Average"), col = c("blue", "red"), lty = 1)

# Highlighting COVID years (the shaded window matches the indicator above)
rect(as.Date("2020-01-01"), par("usr")[3], as.Date("2022-12-31"), par("usr")[4], col = "gray", border = NA, density = 30)


# Forecast of this week's unemployment figure: carry forward the latest 4-week moving average
forecast_today <- moving_avg[length(moving_avg)]
print(paste("Forecasted unemployment figure for today:", forecast_today))
#> [1] "Forecasted unemployment figure for today: 280303"

Created on 2024-02-07 with reprex v2.1.0

- Forecasted unemployment figure for today: 280303

pavan-149 commented 9 months ago
setwd("C:/Users/pavan varma/Desktop/Spring Sem")
library(reprex)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tsbox)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric

# Set FRED API key
fredr_set_key("ad8fa632ef9ee5e564d414be20b7766b")

# Fetch ICNSA data from FRED
icnsa <- fredr(series_id = "ICNSA")

# Convert 'icnsa' to a time series object
icnsa_ts <- ts(icnsa$value, start = c(1978, 1), frequency = 365.25/7)  # Adjusted frequency considering leap years (365.25 days in a year) 

# Remove NA values
icnsa_ts <- na.omit(icnsa_ts)

# Define COVID years to highlight on the plot
covid_years <- c(2020, 2021)

# Plot ICNSA values over time
plot(icnsa_ts, type = "l", xlab = "Year", ylab = "ICNSA", main = "ICNSA Time Series Data")

# Highlight COVID years
for (year in covid_years) {
  # time(icnsa_ts) is in decimal years, so plain year values index the x-axis;
  # a date string coerced with as.numeric() would give NA
  abline(v = year, col = "red", lwd = 2)
  abline(v = year + 1, col = "red", lwd = 2)
}


# Calculate ACF and PACF
acf_values <- acf(icnsa_ts)

pacf_values <- pacf(icnsa_ts)


# Extract significant lag values from ACF and PACF
lag_acf <- which(abs(acf_values$acf) > 0.2)  # Adjust threshold as needed
lag_pacf <- which(abs(pacf_values$acf) > 0.2)  # Adjust threshold as needed

# Choose the maximum lag from ACF and PACF
lag_value <- max(max(lag_acf), max(lag_pacf))

# Create lagged variables (stats::lag only shifts the time attribute, so the
# values are aligned explicitly here before regressing)
n <- length(icnsa_ts)
icnsa_current <- icnsa_ts[(lag_value + 1):n]
icnsa_lagged <- icnsa_ts[1:(n - lag_value)]

# Fit lag regression model including COVID years
model_including_covid <- lm(icnsa_current ~ icnsa_lagged)

# Forecast next week's unemployment figure including COVID years: the
# predictor for week n + 1 is the observation lag_value weeks back
forecast_value_including_covid <- predict(model_including_covid, newdata = data.frame(icnsa_lagged = icnsa_ts[n + 1 - lag_value]))

# Print the forecasted value including COVID years
print(paste("Forecasted next week's unemployment figure including COVID years:", forecast_value_including_covid))
#> [1] "Forecasted next week's unemployment figure including COVID years: 232727"

Created on 2024-02-09 with reprex v2.1.0

- Forecasted next week's unemployment figure: 232727

pavan-149 commented 9 months ago
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(ggplot2)
library(zoo) 
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(tseries) 
library(reprex)

# Working directory
setwd("C:/Users/pavan varma/Desktop/Spring Sem/TSA")

# Loading data
data <- read.csv("OECD.csv", header = TRUE)

# Converting date column to Date type
data$date <- as.Date(data$DATE)

# Removing rows with missing values
data <- na.omit(data)

# Check the structure of your data
str(data)
#> 'data.frame':    275 obs. of  3 variables:
#>  $ DATE             : chr  "2001-01-01" "2001-02-01" "2001-03-01" "2001-04-01" ...
#>  $ OECDLRHUTTTTSTSAM: num  6.46 6.47 6.47 6.5 6.52 ...
#>  $ date             : Date, format: "2001-01-01" "2001-02-01" ...

# Visualizing data
ggplot(data, aes(x = date, y = OECDLRHUTTTTSTSAM)) +
  geom_line() +
  labs(title = "Harmonized Unemployment Monthly Rates", x = "Date", y = "Unemployment Rate")


# Checking for seasonality
decomposition <- decompose(ts(data$OECDLRHUTTTTSTSAM, frequency = 12))  
seasonal <- decomposition$seasonal
seasonal_plot <- ggplot() + 
  geom_line(aes(x = as.numeric(time(seasonal)), y = as.numeric(seasonal))) +
  labs(title = "Seasonal Component", x = "Time", y = "Seasonal Index")

print(seasonal_plot)


# ADF test for stationarity
adf_test <- adf.test(data$OECDLRHUTTTTSTSAM)
adf_test_result <- ifelse(adf_test$p.value < 0.05, "Stationary", "Non-Stationary")

print(paste("ADF Test Result:", adf_test_result))
#> [1] "ADF Test Result: Non-Stationary"

# Identifying Non-Seasonal Parameters (p, d, q)
acf_data <- acf(diff(data$OECDLRHUTTTTSTSAM), main = "ACF of Differenced Data")

pacf_data <- pacf(diff(data$OECDLRHUTTTTSTSAM), main = "PACF of Differenced Data")


# Identifying Seasonal Parameters (P, D, Q)
sacf_data <- acf(data$OECDLRHUTTTTSTSAM, lag.max = 12*2, main = "SACF of Data")

spacf_data <- pacf(data$OECDLRHUTTTTSTSAM, lag.max = 12*2, main = "SPACF of Data")


# Manual ARIMA model specification
p <- 1
d <- 0
q <- 1
P <- 1
D <- 1
Q <- 1

# Fitting ARIMA model
fit <- arima(data$OECDLRHUTTTTSTSAM, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = 12))
#> Warning in log(s2): NaNs produced

#> Warning in log(s2): NaNs produced

# Model summary
summary(fit)
#> 
#> Call:
#> arima(x = data$OECDLRHUTTTTSTSAM, order = c(p, d, q), seasonal = list(order = c(P, 
#>     D, Q), period = 12))
#> 
#> Coefficients:
#>          ar1     ma1    sar1     sma1
#>       0.9796  0.1598  0.0258  -0.9440
#> s.e.  0.0157  0.0636  0.0703   0.0723
#> 
#> sigma^2 estimated as 0.04811:  log likelihood = 12.8,  aic = -15.59
#> 
#> Training set error measures:
#>                        ME      RMSE        MAE        MPE     MAPE     MASE
#> Training set -0.007383039 0.2144984 0.07564254 -0.1920257 1.096264 1.009035
#>                       ACF1
#> Training set -0.0001888289

# Model diagnostics
checkresiduals(fit)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(1,0,1)(1,1,1)[12]
#> Q* = 7.0704, df = 6, p-value = 0.3144
#> 
#> Model df: 4.   Total lags used: 10

# Forecasting 1 period ahead
forecast_values <- forecast(fit, h = 1)

# Plot forecast
plot(forecast_values)


# Print forecast summary
summary(forecast_values)
#> 
#> Forecast method: ARIMA(1,0,1)(1,1,1)[12]
#> 
#> Model Information:
#> 
#> Call:
#> arima(x = data$OECDLRHUTTTTSTSAM, order = c(p, d, q), seasonal = list(order = c(P, 
#>     D, Q), period = 12))
#> 
#> Coefficients:
#>          ar1     ma1    sar1     sma1
#>       0.9796  0.1598  0.0258  -0.9440
#> s.e.  0.0157  0.0636  0.0703   0.0723
#> 
#> sigma^2 estimated as 0.04811:  log likelihood = 12.8,  aic = -15.59
#> 
#> Error measures:
#>                        ME      RMSE        MAE        MPE     MAPE     MASE
#> Training set -0.007383039 0.2144984 0.07564254 -0.1920257 1.096264 1.009035
#>                       ACF1
#> Training set -0.0001888289
#> 
#> Forecasts:
#>     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 276       4.862572 4.580266 5.144878 4.430822 5.294322
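
One way to validate the hand-picked (1,0,1)(1,1,1)[12] specification, sketched here under the assumption that the same data frame is in scope: auto.arima() searches over (p,d,q)(P,D,Q) by AICc, so its selection can be compared with the manual orders.

# Sketch (not part of the run above): automatic order selection as a cross-check
oecd_ts <- ts(data$OECDLRHUTTTTSTSAM, start = c(2001, 1), frequency = 12)
auto_fit <- auto.arima(oecd_ts, seasonal = TRUE)
summary(auto_fit)
forecast(auto_fit, h = 1)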

Created on 2024-02-13 with reprex v2.1.0

pavan-149 commented 9 months ago

# Set FRED API key
fredr_set_key("ad8fa632ef9ee5e564d414be20b7766b")

# Fetch ICNSA data from FRED (fredr returns lowercase column names)
icnsa <- fredr(series_id = "ICNSA")
icnsa$date <- as.Date(icnsa$date)

# Calculating the 4-week moving average
moving_avg <- rollmean(icnsa$value, k = 4, align = "right", fill = NA)

# Plotting ICNSA values over time
plot(icnsa$date, icnsa$value, type = "l", xlab = "Date", ylab = "Unemployment", main = "Unemployment over Time", col = "blue")
lines(icnsa$date, moving_avg, col = "red")

legend("topright", legend = c("ICNSA", "4-week Moving Average"), col = c("blue", "red"), lty = 1)

# Highlighting COVID years
rect(as.Date("2020-01-01"), par("usr")[3], as.Date("2022-12-31"), par("usr")[4], col = "gray", border = NA, density = 30)

![](https://i.imgur.com/OZIxViT.png)

# Forecast of this week's unemployment figure: carry forward the latest 4-week moving average
forecast_today <- moving_avg[length(moving_avg)]
print(paste("Forecasted unemployment figure for today:", forecast_today))
#> [1] "Forecasted unemployment figure for today: 242689.75"

Created on 2024-02-16 with reprex v2.1.0

pavan-149 commented 9 months ago
library(reprex)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(tsbox)

# Set FRED API key
fredr_set_key("ad8fa632ef9ee5e564d414be20b7766b")

# Fetching ICNSA data from FRED
icnsa <- fredr(series_id = "ICNSA")

# Fetching NYCCLAIMS data from FRED
nycclaims <- fredr(series_id = "NYCCLAIMS")

# Removing missing values
icnsa <- na.omit(icnsa)
nycclaims <- na.omit(nycclaims)

# Merging datasets based on date
merged_data <- merge(icnsa, nycclaims, by = "date", all = FALSE)

# Removing missing values
merged_data <- na.omit(merged_data)

# Visualize the relationship between ICNSA and NYCCLAIMS
plot(merged_data$date, merged_data$value.x, type = "l", main = "ICNSA vs. NYCCLAIMS", ylab = "ICNSA", xlab = "Date")
lines(merged_data$date, merged_data$value.y, col = "red")


# Calculating the correlation coefficient
correlation <- cor(merged_data$value.x, merged_data$value.y)

print(paste("Correlation coefficient between ICNSA and NYCCLAIMS:", correlation))
#> [1] "Correlation coefficient between ICNSA and NYCCLAIMS: 0.674691579063901"

# Cross-correlation function
ccf_result <- ccf(merged_data$value.x, merged_data$value.y, lag.max = 10)

plot(ccf_result, main = "Cross-correlation Function: ICNSA vs. NYCCLAIMS", xlab = "Lag", ylab = "Cross-correlation")


# Autocorrelation function (ACF)
acf_result <- acf(merged_data$value.x, main = "Autocorrelation Function: ICNSA")


# Partial autocorrelation function (PACF)
pacf_result <- pacf(merged_data$value.x, main = "Partial Autocorrelation Function: ICNSA")


# Removing missing values
merged_data <- na.omit(merged_data)

# Fitting regARIMA model with covariates using automatic model identification
regarima_model <- auto.arima(merged_data$value.x, xreg = merged_data$value.y)

# Summary of the model
summary(regarima_model)
#> Series: merged_data$value.x 
#> Regression with ARIMA(1,0,3) errors 
#> 
#> Coefficients:
#>          ar1     ma1     ma2      ma3  intercept    xreg
#>       0.8383  0.5869  0.1333  -0.0473  347024.69  0.0977
#> s.e.  0.0235  0.0296  0.0368   0.0289   32136.24  0.1025
#> 
#> sigma^2 = 9.369e+09:  log likelihood = -25577.49
#> AIC=51168.98   AICc=51169.03   BIC=51208.12
#> 
#> Training set error measures:
#>                     ME     RMSE     MAE       MPE     MAPE     MASE        ACF1
#> Training set 0.1140441 96647.05 45095.5 -2.021267 11.48395 1.142174 0.001482879

# Regression diagnostics
checkresiduals(regarima_model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from Regression with ARIMA(1,0,3) errors
#> Q* = 9.4649, df = 6, p-value = 0.1491
#> 
#> Model df: 4.   Total lags used: 10

# Forecast with the regARIMA model (the historical covariate path is supplied
# as future xreg here, so the horizon equals the sample length)
forecast_result <- forecast(regarima_model, xreg = merged_data$value.y)

# Plot of the forecast
plot(forecast_result)


# The most recent value of NYCCLAIMS
latest_nycclaims <- tail(merged_data$value.y, 1)

# Forecast of ICNSA for the current week using the merged model
forecast_icnsa <- forecast(regarima_model, xreg = latest_nycclaims)

# Extracting the point forecast for the current week
icnsa_point_forecast_current <- forecast_icnsa$mean[1]

# Printing the point forecast for ICNSA from the merged model for the current week
cat("Point forecast for ICNSA from the merged model for the current week:", icnsa_point_forecast_current, "\n")
#> Point forecast for ICNSA from the merged model for the current week: 226040.1
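
The forecast call above reuses the observed NYCCLAIMS history as the future xreg, so its horizon spans the whole sample. A cleaner sketch, assuming the same regarima_model and merged_data: forecast the covariate one step ahead first, then supply that single value as the future regressor.

# Sketch (not part of the run above): forecast the covariate, then the target
xreg_model <- auto.arima(merged_data$value.y)
xreg_future <- as.numeric(forecast(xreg_model, h = 1)$mean)
forecast(regarima_model, xreg = xreg_future, h = 1)$mean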

Created on 2024-02-21 with reprex v2.1.0

pavan-149 commented 9 months ago
library(reprex)
library(fredr)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(tsbox)

# Set FRED API key
fredr_set_key("ad8fa632ef9ee5e564d414be20b7766b")

# Fetching ICNSA data from FRED
icnsa <- fredr(series_id = "ICNSA")

# Convert 'date' column to Date type
icnsa$date <- as.Date(icnsa$date)

# Removing missing values
icnsa <- na.omit(icnsa)

# Plotting the ICNSA data
plot(icnsa$date, icnsa$value, type = "l", xlab = "Date", ylab = "Initial Claims", 
     main = "Initial Claims over Time")

# Adding vertical lines for potential start and end dates 
abline(v = as.Date("2020-03-01"), col = "red", lty = 2)
abline(v = as.Date("2021-06-30"), col = "blue", lty = 2)


# Fitting a cubic smoothing spline with lambda = 0.5
spline_fit <- smooth.spline(as.numeric(icnsa$date), icnsa$value, lambda = 0.5)

# Imputing values for the Covid period (predict.smooth.spline takes x, not newdata)
covid_period_start <- as.Date("2020-03-01")
covid_period_end <- as.Date("2021-06-30")
covid_idx <- icnsa$date >= covid_period_start & icnsa$date <= covid_period_end
covid_period_values <- predict(spline_fit, x = as.numeric(icnsa$date[covid_idx]))

# Replacing the Covid-period observations with the imputed values
icnsa_imputed <- icnsa[, c("date", "value")]
icnsa_imputed$value[covid_idx] <- covid_period_values$y

# Fitting multiplicative Holt-Winters model
hw_multiplicative <- HoltWinters(ts(icnsa_imputed$value, frequency = 52), seasonal = "multiplicative")

# Fitting additive Holt-Winters model
hw_additive <- HoltWinters(ts(icnsa_imputed$value, frequency = 52), seasonal = "additive")

# Forecasting next ICNSA value using both models
forecast_multiplicative <- forecast(hw_multiplicative, h = 1)
forecast_additive <- forecast(hw_additive, h = 1)

# Forecasts from both models
print(forecast_multiplicative)
#>          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#> 115.6538       385774.3 290594.9 480953.8 240209.9 531338.7
print(forecast_additive)
#>          Point Forecast    Lo 80    Hi 80  Lo 95    Hi 95
#> 115.6538       404580.3 306141.4 503019.2 254031 555129.6
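
To choose between the two fits, one quick criterion is the in-sample sum of squared errors that HoltWinters() stores on the fitted object (a holdout comparison would be more rigorous); a minimal sketch using the same objects as above:

# Sketch (not part of the run above): compare in-sample SSE, lower is better
print(c(multiplicative = hw_multiplicative$SSE, additive = hw_additive$SSE))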

Created on 2024-02-28 with reprex v2.1.0

pavan-149 commented 9 months ago
library(reprex)
library(fredr)
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(ggplot2)
library(changepoint)
#> Successfully loaded changepoint package version 2.2.4
#>  See NEWS for details of changes.
library(imputeTS)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
#> 
#> Attaching package: 'imputeTS'
#> The following object is masked from 'package:zoo':
#> 
#>     na.locf
library(forecast)
library(bfast)
#> Warning: package 'bfast' was built under R version 4.3.3
#> Loading required package: strucchangeRcpp
#> Warning: package 'strucchangeRcpp' was built under R version 4.3.3
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3

# Set FRED API key
fredr_set_key("ad8fa632ef9ee5e564d414be20b7766b")

# Fetching ICNSA data from FRED
icnsa <- fredr(series_id = "ICNSA")
ilcclaims <- fredr(series_id = "ILCClaims")

# Converting 'date' column to Date type
icnsa$date <- as.Date(icnsa$date)
ilcclaims$date <- as.Date(ilcclaims$date)

# Plotting initial ICNSA data
ggplot(icnsa, aes(x = date, y = value)) +
  geom_line(color = "blue") +
  labs(title = "Initial ICNSA Data", x = "Date", y = "ICNSA Value") +
  theme_minimal()


# Plotting initial ILCClaims data
ggplot(ilcclaims, aes(x = date, y = value)) +
  geom_line(color = "red") +
  labs(title = "Initial ILCClaims Data", x = "Date", y = "ILCClaims Value") +
  theme_minimal()


# Merging ICNSA and ILCClaims data
merged_data <- merge(icnsa, ilcclaims, by = "date", all = TRUE)

# Removing missing values
merged_data <- na.omit(merged_data)

# Fitting a trend line to the ICNSA data
trend_icnsa <- lm(value.x ~ date, data = merged_data)
summary(trend_icnsa)
#> 
#> Call:
#> lm(formula = value.x ~ date, data = merged_data)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -241796 -100052  -41853   31035 5770707 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 3.208e+05  2.211e+04  14.506   <2e-16 ***
#> date        3.801e+00  1.646e+00   2.309   0.0211 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 294700 on 1985 degrees of freedom
#> Multiple R-squared:  0.002678,   Adjusted R-squared:  0.002176 
#> F-statistic: 5.331 on 1 and 1985 DF,  p-value: 0.02106

# Calculating residuals (deviations from the trend line)
merged_data$residuals <- residuals(trend_icnsa)

# Setting a threshold level for residuals
threshold <- sd(merged_data$residuals)  # Adjust this threshold as needed

# Performing change point detection on the residuals
cpt <- cpt.mean(merged_data$residuals)

# Extracting the dates associated with the change points
change_point_dates <- merged_data$date[cpt@cpts]

# Identifying the start date based on the first change point
start_date <- change_point_dates[1]

# Finding the index of the last residual above the threshold
end_index <- tail(which(merged_data$residuals > threshold), 1)

# Using the date corresponding to the end_index as the end date
end_date <- merged_data$date[end_index]

# Printing the start and end dates (format() keeps them as dates; cat() on a
# Date prints the underlying day count)
cat("Start Date:", format(start_date), "\n")
#> Start Date: 2020-03-14
cat("End Date:", format(end_date), "\n")
#> End Date: 2021-03-27

# Plotting the detected change points
plot(cpt, type = "l", col = "red", main = "Change Points Detection")


# Imputing missing values using state-space methodology (note value.x has no
# NAs after na.omit(), so this reproduces the series exactly; see metrics below)
merged_data_imputed_state_space <- na_kalman(merged_data$value.x)

# Applying the spline imputation method for comparison
spline_fit <- smooth.spline(merged_data$date, merged_data$value.x)
merged_data_imputed_spline <- predict(spline_fit, xout = merged_data$date)$y

# Calculating metrics for state-space imputation
mae_state_space <- mean(abs(merged_data$value.x - merged_data_imputed_state_space))
rmse_state_space <- sqrt(mean((merged_data$value.x - merged_data_imputed_state_space)^2))
rsquared_state_space <- summary(lm(merged_data_imputed_state_space ~ merged_data$value.x))$r.squared
bias_state_space <- mean(merged_data_imputed_state_space - merged_data$value.x)

# Calculating metrics for spline imputation
mae_spline <- mean(abs(merged_data$value.x - merged_data_imputed_spline))
rmse_spline <- sqrt(mean((merged_data$value.x - merged_data_imputed_spline)^2))
rsquared_spline <- summary(lm(merged_data_imputed_spline ~ merged_data$value.x))$r.squared
bias_spline <- mean(merged_data_imputed_spline - merged_data$value.x)

# Printing the metrics
comparison_results <- data.frame(Method = c("State-Space Imputation", "Spline Imputation"),
                                  MAE = c(mae_state_space, mae_spline),
                                  RMSE = c(rmse_state_space, rmse_spline),
                                  R_squared = c(rsquared_state_space, rsquared_spline),
                                  Bias = c(bias_state_space, bias_spline))
print(comparison_results)
#>                   Method     MAE     RMSE R_squared         Bias
#> 1 State-Space Imputation     0.0      0.0 1.0000000 0.000000e+00
#> 2      Spline Imputation 49349.9 167394.3 0.6781726 3.188004e-11
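
Because value.x contains no missing values after na.omit(), na_kalman() returns the input unchanged, which is why the state-space row shows zero error and an R-squared of 1. A sketch of the intended comparison, using the same merged_data and the change-point dates found above: mask the Covid window first, impute it, and score the imputation against the held-out truth.

# Sketch (not part of the run above): mask the Covid window before imputing
covid_idx <- merged_data$date >= as.Date("2020-03-14") &
  merged_data$date <= as.Date("2021-03-27")
masked <- merged_data$value.x
masked[covid_idx] <- NA
imputed <- na_kalman(masked)  # state-space (structural model) imputation
print(sqrt(mean((imputed[covid_idx] - merged_data$value.x[covid_idx])^2)))  # RMSE on the masked window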

# Plotting the ICNSA data before and after state-space imputation
plot(merged_data$date, merged_data$value.x, type = "l", col = "blue", lty = 1, xlab = "Date", ylab = "Initial Claims", main = "ICNSA Data Before and After Imputation")
lines(merged_data$date, merged_data_imputed_state_space, col = "red", lty = 2)


# Plotting the ICNSA data before and after spline imputation
plot(merged_data$date, merged_data$value.x, type = "l", col = "black", lty = 1, xlab = "Date", ylab = "Initial Claims", main = "ICNSA Data Before and After Imputation")
lines(merged_data$date, merged_data_imputed_spline, col = "red", lty = 3)


# Calculating correlations and plots
cor_residuals <- cor(merged_data$residuals, merged_data$value.x)
ccf_plot <- ccf(merged_data$residuals, merged_data$value.x)

acf_plot <- acf(merged_data$residuals)

pacf_plot <- pacf(merged_data$residuals)


# Correlation Residuals
print(cor_residuals)
#> [1] 0.99866

# Plots of CCF, ACF, PACF
plot(ccf_plot)

plot(acf_plot)

plot(pacf_plot)


# Manually inspecting lagged correlations to guide the ARIMA order (built as an
# explicit shift; calling lag() inside sapply() resolves to stats::lag(), which
# only changes the time attribute and leaves the values unshifted)
lags <- 1:10
correlations <- sapply(lags, function(k) {
  x <- merged_data$value.x
  n <- length(x)
  cor(x[(k + 1):n], x[1:(n - k)])
})

# Identifying significant lag values
significant_lags <- lags[abs(correlations) > 0.5] 

# ARIMA model build with adjusted parameters
arima_model <- Arima(merged_data$value.x, order = c(1, 0, 0), seasonal = list(order = c(1, 0, 1), period = 52))

# Forecast of next week's value
forecast_next_week <- forecast(arima_model, h = 1)

# Printing the forecast
print(forecast_next_week)
#>      Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
#> 1988       209028.8 69045.38 349012.2 -5057.362 423114.9

Created on 2024-03-07 with reprex v2.1.0

pavan-149 commented 7 months ago
setwd("C:/Users/pavan varma/OneDrive - University at Buffalo/Desktop/Spring Sem/TSA")

library(reprex)
library(censusapi)
#> Warning: package 'censusapi' was built under R version 4.3.3
#> 
#> Attaching package: 'censusapi'
#> The following object is masked from 'package:methods':
#> 
#>     getFunction
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(readxl)
#> Warning: package 'readxl' was built under R version 4.3.3
library(tsbox)
library(ggplot2)
library(vars)
#> Warning: package 'vars' was built under R version 4.3.3
#> Loading required package: MASS
#> Warning: package 'MASS' was built under R version 4.3.3
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Loading required package: lmtest
library(lmtest)
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.3.3
#> Loading required package: lattice
#> Warning: package 'lattice' was built under R version 4.3.3

# Read the datasets
GS <- read_excel("SeriesReport-202404101652-V.xlsx")
FBS <- read_excel("SeriesReport-202404101653-V.xlsx")
HPCS <- read_excel("SeriesReport-202404101654-V.xlsx")
CCAS <- read_excel("SeriesReport-202404101655-V.xlsx")

# Convert "Value" column to numeric and "Period" to Date format for each dataset
convert_and_format <- function(data) {
  data$Value <- as.numeric(data$Value)
  data$Period <- as.Date(paste0("01-", data$Period), format = "%d-%b-%Y")
  return(data)
}

GS <- convert_and_format(GS)
#> Warning in convert_and_format(GS): NAs introduced by coercion
FBS <- convert_and_format(FBS)
#> Warning in convert_and_format(FBS): NAs introduced by coercion
HPCS <- convert_and_format(HPCS)
#> Warning in convert_and_format(HPCS): NAs introduced by coercion
CCAS <- convert_and_format(CCAS)
#> Warning in convert_and_format(CCAS): NAs introduced by coercion

# Merge the datasets based on the "Period" column
merged_data <- merge(GS, FBS, by = "Period", all = TRUE)
merged_data <- merge(merged_data, HPCS, by = "Period", all = TRUE)
merged_data <- merge(merged_data, CCAS, by = "Period", all = TRUE)
#> Warning in merge.data.frame(merged_data, CCAS, by = "Period", all = TRUE):
#> column names 'Value.x', 'Value.y' are duplicated in the result

# Define new column names
new_column_names <- c("Period", "GS_Value", "FBS_Value", "HPCS_Value", "CCAS_Value")

# Assign new column names to the merged dataset
colnames(merged_data) <- new_column_names

merged_data = na.omit(merged_data)

# Summary Statistics
summary(merged_data)
#>      Period              GS_Value       FBS_Value       HPCS_Value   
#>  Min.   :1992-01-01   Min.   :12601   Min.   :30382   Min.   : 7292  
#>  1st Qu.:2000-01-08   1st Qu.:19084   1st Qu.:36448   1st Qu.:12434  
#>  Median :2008-01-16   Median :34941   Median :46847   Median :20152  
#>  Mean   :2008-01-15   Mean   :32356   Mean   :48849   Mean   :19693  
#>  3rd Qu.:2016-01-24   3rd Qu.:43190   3rd Qu.:57561   3rd Qu.:26127  
#>  Max.   :2024-02-01   Max.   :68257   Max.   :82993   Max.   :37455  
#>    CCAS_Value   
#>  Min.   : 2849  
#>  1st Qu.:13386  
#>  Median :17496  
#>  Mean   :17281  
#>  3rd Qu.:21271  
#>  Max.   :26625

# Plot time series for each variable
ggplot(merged_data, aes(x = Period)) +
  geom_line(aes(y = GS_Value, color = "GS")) +
  geom_line(aes(y = FBS_Value, color = "FBS")) +
  geom_line(aes(y = HPCS_Value, color = "HPCS")) +
  geom_line(aes(y = CCAS_Value, color = "CCAS")) +
  labs(color = "Variables", title = "Time Series Plot of Merged Data") +
  theme_minimal()


# Correlation Analysis
correlation_matrix <- cor(merged_data[, -1]) # Exclude "Period" column
print(correlation_matrix)
#>             GS_Value FBS_Value HPCS_Value CCAS_Value
#> GS_Value   1.0000000 0.8814042  0.9213464  0.9350807
#> FBS_Value  0.8814042 1.0000000  0.9762019  0.8946275
#> HPCS_Value 0.9213464 0.9762019  1.0000000  0.9432814
#> CCAS_Value 0.9350807 0.8946275  0.9432814  1.0000000

# Compute and plot ACF for each variable
par(mfrow=c(2,2))
for (i in 2:5) {
  acf_result <- acf(merged_data[, i], main = paste("ACF for", new_column_names[i]))
}


# Compute and plot PACF for each variable
par(mfrow=c(2,2))
for (i in 2:5) {
  pacf_result <- pacf(merged_data[, i], main = paste("PACF for", new_column_names[i]))
}


# Compute and plot CCF between each pair of variables
par(mfrow=c(3,4))  # 12 pairwise CCF panels
for (i in 2:5) {
  for (j in 2:5) {
    if (i != j) {
      ccf_result <- ccf(merged_data[, i], merged_data[, j], main = paste("CCF between", new_column_names[i], "and", new_column_names[j]))
    }
  }
}


# Prepare the data for VAR modeling
merged_data_ts <- ts(merged_data[, -1], frequency = 12) # Exclude "Period" column

# Fit VAR(1) model
var_model_1 <- VAR(merged_data_ts, p = 1)

# Fit VAR(p) model with p > 1
var_model_p <- VAR(merged_data_ts, p = 2)  # Change p value as desired

# Compare the two models using AIC
AIC_1 <- AIC(var_model_1)
AIC_p <- AIC(var_model_p)

# Print AIC values
print(paste("AIC for VAR(1) model:", AIC_1))
#> [1] "AIC for VAR(1) model: 24718.1302816642"
print(paste("AIC for VAR(p) model with p > 1:", AIC_p))
#> [1] "AIC for VAR(p) model with p > 1: 24086.819078207"

# Decide which model is better based on AIC
if (AIC_1 < AIC_p) {
  print("VAR(1) model is better.")
} else {
  print("VAR(p) model with p > 1 is better.")
}
#> [1] "VAR(p) model with p > 1 is better."

# Forecast one month ahead using VAR(1) model
forecast_1 <- predict(var_model_1, n.ahead = 1)

# Forecast one month ahead using VAR(p) model with p > 1
forecast_p <- predict(var_model_p, n.ahead = 1)

# Print the forecasts
print("Forecast for VAR(1) model:")
#> [1] "Forecast for VAR(1) model:"
print(forecast_1$fcst)
#> $GS_Value
#>                   fcst    lower    upper       CI
#> GS_Value.fcst 53159.35 50479.12 55839.58 2680.232
#> 
#> $FBS_Value
#>                    fcst    lower    upper       CI
#> FBS_Value.fcst 82832.31 80770.73 84893.89 2061.577
#> 
#> $HPCS_Value
#>                     fcst    lower    upper       CI
#> HPCS_Value.fcst 36115.87 35367.09 36864.66 748.7856
#> 
#> $CCAS_Value
#>                     fcst    lower    upper       CI
#> CCAS_Value.fcst 26160.28 24309.91 28010.65 1850.369

print("Forecast for VAR(p) model with p > 1:")
#> [1] "Forecast for VAR(p) model with p > 1:"
print(forecast_p$fcst)
#> $GS_Value
#>                  fcst    lower    upper       CI
#> GS_Value.fcst 53627.2 51317.23 55937.16 2309.967
#> 
#> $FBS_Value
#>                    fcst    lower    upper       CI
#> FBS_Value.fcst 83023.89 81214.63 84833.15 1809.262
#> 
#> $HPCS_Value
#>                    fcst    lower    upper       CI
#> HPCS_Value.fcst 36186.4 35644.32 36728.48 542.0815
#> 
#> $CCAS_Value
#>                     fcst    lower    upper       CI
#> CCAS_Value.fcst 26381.57 24903.86 27859.28 1477.708

## Granger Causality Tests for VAR(1) Model
granger_test_1 <- causality(var_model_1, cause = "GS_Value")
print(granger_test_1)
#> $Granger
#> 
#>  Granger causality H0: GS_Value do not Granger-cause FBS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_1
#> F-Test = 1.8076, df1 = 3, df2 = 1520, p-value = 0.1438
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: GS_Value and FBS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_1
#> Chi-squared = 78.701, df = 3, p-value < 2.2e-16

granger_test_1 <- causality(var_model_1, cause = "FBS_Value")
print(granger_test_1)
#> $Granger
#> 
#>  Granger causality H0: FBS_Value do not Granger-cause GS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_1
#> F-Test = 0.4492, df1 = 3, df2 = 1520, p-value = 0.7179
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: FBS_Value and GS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_1
#> Chi-squared = 122.19, df = 3, p-value < 2.2e-16

granger_test_1 <- causality(var_model_1, cause = "HPCS_Value")
print(granger_test_1)
#> $Granger
#> 
#>  Granger causality H0: HPCS_Value do not Granger-cause GS_Value
#>  FBS_Value CCAS_Value
#> 
#> data:  VAR object var_model_1
#> F-Test = 1.6263, df1 = 3, df2 = 1520, p-value = 0.1814
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: HPCS_Value and GS_Value
#>  FBS_Value CCAS_Value
#> 
#> data:  VAR object var_model_1
#> Chi-squared = 137.87, df = 3, p-value < 2.2e-16

granger_test_1 <- causality(var_model_1, cause = "CCAS_Value")
print(granger_test_1)
#> $Granger
#> 
#>  Granger causality H0: CCAS_Value do not Granger-cause GS_Value
#>  FBS_Value HPCS_Value
#> 
#> data:  VAR object var_model_1
#> F-Test = 5.5748, df1 = 3, df2 = 1520, p-value = 0.0008387
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: CCAS_Value and GS_Value
#>  FBS_Value HPCS_Value
#> 
#> data:  VAR object var_model_1
#> Chi-squared = 136, df = 3, p-value < 2.2e-16

## Granger Causality Tests for VAR(p) Model with p > 1
granger_test_p <- causality(var_model_p, cause = "GS_Value")
print(granger_test_p)
#> $Granger
#> 
#>  Granger causality H0: GS_Value do not Granger-cause FBS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_p
#> F-Test = 2.8709, df1 = 6, df2 = 1500, p-value = 0.008747
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: GS_Value and FBS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 51.271, df = 3, p-value = 4.284e-11

granger_test_p <- causality(var_model_p, cause = "FBS_Value")
print(granger_test_p)
#> $Granger
#> 
#>  Granger causality H0: FBS_Value do not Granger-cause GS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_p
#> F-Test = 7.8538, df1 = 6, df2 = 1500, p-value = 2.398e-08
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: FBS_Value and GS_Value
#>  HPCS_Value CCAS_Value
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 140.45, df = 3, p-value < 2.2e-16

granger_test_p <- causality(var_model_p, cause = "HPCS_Value")
print(granger_test_p)
#> $Granger
#> 
#>  Granger causality H0: HPCS_Value do not Granger-cause GS_Value
#>  FBS_Value CCAS_Value
#> 
#> data:  VAR object var_model_p
#> F-Test = 8.9283, df1 = 6, df2 = 1500, p-value = 1.345e-09
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: HPCS_Value and GS_Value
#>  FBS_Value CCAS_Value
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 73.098, df = 3, p-value = 8.882e-16

granger_test_p <- causality(var_model_p, cause = "CCAS_Value")
print(granger_test_p)
#> $Granger
#> 
#>  Granger causality H0: CCAS_Value do not Granger-cause GS_Value
#>  FBS_Value HPCS_Value
#> 
#> data:  VAR object var_model_p
#> F-Test = 21.054, df1 = 6, df2 = 1500, p-value < 2.2e-16
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: CCAS_Value and GS_Value
#>  FBS_Value HPCS_Value
#> 
#> data:  VAR object var_model_p
#> Chi-squared = 142.15, df = 3, p-value < 2.2e-16

# Construct a Basic VAR-L model with maximum lag order 4 and a sparsity-inducing penalty
model <- constructModel(merged_data_ts, p = 4, struct = "Basic", gran = c(50, 10), verbose = FALSE)

# Perform cross-validation and select the optimal penalty parameter
results <- cv.BigVAR(model)
print(results)
#> *** BIGVAR MODEL Results *** 
#> Structure
#> [1] "Basic"
#> Loss 
#> [1] "L2"
#> Forecast Horizon 
#> [1] 1
#> Minnesota VAR
#> [1] FALSE
#> Maximum Lag Order 
#> [1] 4
#> Optimal Lambda 
#> [1] 625900000
#> Grid Depth 
#> [1] 50
#> Index of Optimal Lambda 
#> [1] 10
#> Fraction of active coefficients 
#> [1] 0.3841
#> In-Sample Loss
#> [1] 14300000
#> BigVAR Out of Sample Loss
#> [1] 15700000
#> *** Benchmark Results *** 
#> Conditional Mean Out of Sample Loss
#> [1] 1.01e+09
#> AIC Out of Sample Loss
#> [1] 10500000
#> BIC Out of Sample Loss
#> [1] 1e+07
#> RW Out of Sample Loss
#> [1] 9870000

# Visualize the sparsity pattern of the estimated coefficients
SparsityPlot.BigVAR.results(results)
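
For a one-step-ahead forecast from the cross-validated fit, a minimal sketch using the results object above: BigVAR provides a predict() method for cv.BigVAR results that applies the optimal lambda reported in the summary.

# Sketch (not part of the run above): one-month-ahead forecast at the chosen penalty
predict(results, n.ahead = 1)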

Created on 2024-04-10 with reprex v2.1.0