jlivsey / UB-sping24-time-series

3 stars 1 forks source link

Tejesh Pawar #3

Open tejeshpa opened 5 months ago

tejeshpa commented 5 months ago

HW_1-UBLearns On ICNSA And Unemployement rate as Covariant-data.

#libraries
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo

#ICNSA data
icnsa_data <- read.csv("ICNSA.csv")

#Covariate data
covariate_data <- read.csv("UNRATE.csv")

# Merge datasets by date
merged_data <- merge(icnsa_data, covariate_data, by = "DATE", all = TRUE)

# Check for missing values
if (any(is.na(merged_data))) {
  merged_data <- na.omit(merged_data)
}

# Convert DATE column to Date type
merged_data$DATE <- as.Date(merged_data$DATE)

# Perform EDA
summary(merged_data)
#>       DATE                ICNSA            UNRATE      
#>  Min.   :1967-04-01   Min.   :139000   Min.   : 3.400  
#>  1st Qu.:1980-05-01   1st Qu.:256055   1st Qu.: 4.725  
#>  Median :1995-05-16   Median :313424   Median : 5.800  
#>  Mean   :1995-02-16   Mean   :346766   Mean   : 5.909  
#>  3rd Qu.:2008-08-31   3rd Qu.:406500   3rd Qu.: 7.175  
#>  Max.   :2023-07-01   Max.   :982808   Max.   :10.400
plot(merged_data$DATE, merged_data$ICNSA, type = 'l', xlab = 'Time', ylab = 'ICNSA', 
     main = 'Initial Claims (ICNSA) Time Series')
lines(merged_data$DATE, merged_data$Covariate, col = 'red')
legend("topright", legend = c("ICNSA", "Covariate"), col = c("black", "red"), lty = 1)


#correlation between ICNSA and Covariate
correlation <- cor(merged_data[, c("ICNSA", "UNRATE")])
print(paste("Correlation between ICNSA and UNRATE:", correlation))
#> [1] "Correlation between ICNSA and UNRATE: 1"                
#> [2] "Correlation between ICNSA and UNRATE: 0.631535203210569"
#> [3] "Correlation between ICNSA and UNRATE: 0.631535203210569"
#> [4] "Correlation between ICNSA and UNRATE: 1"

#ARIMA model
model <- auto.arima(merged_data$ICNSA, xreg = merged_data$Covariate)
summary(model)
#> Series: merged_data$ICNSA 
#> ARIMA(1,0,0) with non-zero mean 
#> 
#> Coefficients:
#>          ar1      mean
#>       0.1874  346253.8
#> s.e.  0.0993   16514.9
#> 
#> sigma^2 = 1.809e+10:  log likelihood = -1295.39
#> AIC=2596.78   AICc=2597.03   BIC=2604.53
#> 
#> Training set error measures:
#>                    ME     RMSE      MAE       MPE     MAPE      MASE
#> Training set 257.9453 133137.5 97424.19 -12.47385 29.83116 0.7773316
#>                     ACF1
#> Training set -0.01440805

checkresiduals(model)

#> 
#>  Ljung-Box test
#> 
#> data:  Residuals from ARIMA(1,0,0) with non-zero mean
#> Q* = 5.1308, df = 9, p-value = 0.8228
#> 
#> Model df: 1.   Total lags used: 10

# Forecasting
forecast_horizon <- 12  # Example: Forecasting for the next 12 periods
forecast_result <- forecast(model, xreg = tail(merged_data$Covariate, forecast_horizon), h = forecast_horizon)
print(forecast_result)
#>     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
#>  99       328531.4 156140.7 500922.2 64882.51 592180.4
#> 100       342932.6 167540.7 518324.4 74693.90 611171.3
#> 101       345631.4 170135.1 521127.7 77232.97 614029.9
#> 102       346137.2 170637.2 521637.2 77733.13 614541.3
#> 103       346232.0 170731.9 521732.1 77827.72 614636.2
#> 104       346249.7 170749.6 521749.9 77845.47 614654.0
#> 105       346253.1 170753.0 521753.2 77848.80 614657.3
#> 106       346253.7 170753.6 521753.8 77849.42 614658.0
#> 107       346253.8 170753.7 521753.9 77849.54 614658.1
#> 108       346253.8 170753.7 521754.0 77849.56 614658.1
#> 109       346253.8 170753.7 521754.0 77849.57 614658.1
#> 110       346253.8 170753.7 521754.0 77849.57 614658.1

# Plot forecast
plot(forecast_result, main = 'Forecast of Initial Claims (ICNSA)')

With a 95% confidence interval ranging from 170,753.7 to 521,754.0, the forecast number for the latest data release is 346,253.8, based on the final regARIMA model.

Created on 2024-02-19 with reprex v2.1.0

tejeshpa commented 4 months ago

HW-2-UBLearns ICNSA AND Imputed Covid Data Forecast

library(ggplot2)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(fpp2)
#> ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
#> ✔ fma       2.5     ✔ expsmooth 2.3
#> 

# Load ICNSA and COVID data
icnsa <- read.csv("ICNSA.csv")
covid <- read.csv("owid-covid-data.csv")

# Convert date columns to Date format
icnsa$Date <- as.Date(icnsa$DATE)
covid$Date <- as.Date(covid$date)

# Plot ICNSA data over time with COVID data
icnsa_plot <- ggplot(icnsa, aes(x = Date, y = ICNSA)) +
  geom_line(color = "blue") +
  labs(title = "Initial Claims Over Time", x = "Date", y = "Number of Claims") +
  geom_vline(xintercept = as.Date("2020-03-01"), linetype = "dashed", color = "red", alpha = 0.8) +
  geom_vline(xintercept = as.Date("2021-06-01"), linetype = "dashed", color = "green", alpha = 0.8)

# Identify COVID period
covid_start_index <- which(icnsa$Date == as.Date('2020-03-14'))
covid_end_index <- which(icnsa$Date == as.Date('2021-05-01'))

# Cubic spline methodology to impute new values for the Covid period
spl_fit_excl_covid <- smooth.spline(x = icnsa$Date[!(icnsa$Date >= as.Date('2020-03-14') & icnsa$Date <= as.Date('2021-05-01'))],
                                    y = icnsa$ICNSA[!(icnsa$Date >= as.Date('2020-03-14') & icnsa$Date <= as.Date('2021-05-01'))], df = 5)

imputed_values <- predict(spl_fit_excl_covid, x = as.numeric(covid_start_index:covid_end_index))$y
icnsa$ICNSA[covid_start_index:covid_end_index] <- imputed_values

# Plot original and imputed data
icnsa_plot_imputed <- ggplot(icnsa) +
  geom_line(aes(x = Date, y = ICNSA), color = "blue") +
  geom_line(data = icnsa[covid_start_index:covid_end_index, ], aes(x = Date, y = ICNSA), 
            color = "red", linetype = "dashed") +
  labs(title = "Original and Imputed Data", x = "Time", y = "Value")

# Fit Holt-Winters models
icnsa_timeseries <- ts(icnsa$ICNSA, frequency = 52, start = as.numeric(format(min(icnsa$Date), "%Y")))
hw_additive <- HoltWinters(icnsa_timeseries, seasonal = "additive")
hw_multiplicative <- HoltWinters(icnsa_timeseries, seasonal = "multiplicative")

# Forecast next ICNSA values
forecast_additive <- forecast(hw_additive, h = 1)
forecast_multiplicative <- forecast(hw_multiplicative, h = 1)

# ACF and PACF plots
acf_plot <- autoplot(acf(icnsa_timeseries))

pacf_plot <- autoplot(pacf(icnsa_timeseries))


# Output plots
icnsa_plot

icnsa_plot_imputed

acf_plot

pacf_plot


cat("Next ICNSA value (Additive Holt-Winters):", forecast_additive$mean, "\n")
#> Next ICNSA value (Additive Holt-Winters): 204534.8
cat("Next ICNSA value (Multiplicative Holt-Winters):", forecast_multiplicative$mean, "\n")
#> Next ICNSA value (Multiplicative Holt-Winters): 192566.6

# Summary
# 1. Collected the COVID-19 and ICNSA datasets for study.
# 2. Preprocessed data: date columns were converted to the right format to ensure uniformity.
# 3. ICNSA data was visualised across time, emphasising the influence of the COVID-19 period on early claims.
# 4. For the COVID-19 period, missing ICNSA data were imputed using cubic spline interpolation.
# 5. ICNSA time data were fitted with Holt-Winters exponential smoothing models for forecasting purposes.
# 6. Investigated the use of multiplicative and additive seasonal components in Holt-Winters models to capture various patterns.
# 7. Assessed the predictive power and accuracy of the model for ICNSA values in the future.
# 8. Determined how well cubic spline imputation captured underlying patterns in the COVID-19 timeframe.
# 9. Concluded that an effective basis for analysing and forecasting ICNSA is provided by the combination of cubic spline imputation and Holt-Winters forecasting.
# 10. This approach assists in developing strategies and make well-informed decisions on the dynamics of the labour market both before and after the COVID-19 epidemic.

Created on 2024-02-28 with reprex v2.1.0

tejeshpa commented 4 months ago

HW3 - UBLearns

# libraries
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)
library(fpp2)
#> ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
#> ✔ fma       2.5     ✔ expsmooth 2.3
#> 

# frequency for weekly data
frequency <- 52

# Load the most current Initial Claims (ICNSA) data
icnsa_data <- read.csv("ICNSA.csv")

# Convert date column to Date format
icnsa_data$Date <- as.Date(icnsa_data$DATE)

# Empirical analysis
pandemic_start <- as.Date("2020-03-01")
pandemic_end <- as.Date("2021-12-31")

# Using the state-space for missing value methodology to impute new values for the Covid period
covid_period <- icnsa_data$Date >= pandemic_start & icnsa_data$Date <= pandemic_end

# ICNSA data for COVID period
icnsa_covid <- icnsa_data[covid_period, ]

# Use mean imputation for missing values during the COVID period
imputed_values <- mean(icnsa_covid$ICNSA, na.rm = TRUE)

# Replace missing values with imputed values
icnsa_data$ICNSA[covid_period] <- imputed_values

# ICNSA time series
icnsa_timeseries <- ts(icnsa_data$ICNSA, frequency = frequency, start = as.numeric(format(min(icnsa_data$Date), "%Y")))

# Fit structural time series model
model <- StructTS(icnsa_timeseries, type = "trend")

# Load CPI data as covariate data
cpi_data <- read.csv("CPIAUCSL.csv")

# Convert date column to Date format
cpi_data$Date <- as.Date(cpi_data$DATE)

# Merge CPI data with ICNSA data
icnsa_data <- merge(icnsa_data, cpi_data, by = "Date")

# Check if lengths of icnsa_timeseries and CPI match
length_icnsa <- length(icnsa_timeseries)
length_cpi <- nrow(cpi_data)

if (length_icnsa == length_cpi) {
  combined_data <- cbind(icnsa_timeseries, cpi_data$CPI)
} else {
  print("Lengths do not match.")
}
#> [1] "Lengths do not match."

if (exists("combined_data")) {
  model_with_covariate <- StructTS(combined_data, type = "trend")
} else {
  print("Cannot fit model, due to data mismatch.")
}
#> [1] "Cannot fit model, due to data mismatch."

cat("\nModel for Trend and Seasonal Components:\n")
#> 
#> Model for Trend and Seasonal Components:
summary(model)
#>           Length Class  Mode     
#> coef         3   -none- numeric  
#> loglik       1   -none- numeric  
#> loglik0      1   -none- numeric  
#> data      2982   ts     numeric  
#> residuals 2982   ts     numeric  
#> fitted    5964   mts    numeric  
#> call         3   -none- call     
#> series       1   -none- character
#> code         1   -none- numeric  
#> model        7   -none- list     
#> model0       7   -none- list     
#> xtsp         3   -none- numeric

# Plot ACF and PACF
acf_plot <- autoplot(acf(icnsa_timeseries))

pacf_plot <- autoplot(pacf(icnsa_timeseries))


# Output plots
print(acf_plot)

print(pacf_plot)


# Forecast 
forecast_model <- forecast(model, h = 12)

print(forecast_model)
#>          Point Forecast        Lo 80    Hi 80       Lo 95    Hi 95
#> 2024.346       195912.4 128044.03947 263780.8   92116.693 299708.1
#> 2024.365       195860.9 108433.25960 283288.6   62151.839 329570.0
#> 2024.385       195809.5  92450.31029 299168.6   37735.273 353883.7
#> 2024.404       195758.0  78605.68242 312910.3   16588.988 374927.0
#> 2024.423       195706.5  66214.25932 325198.8   -2334.813 393747.9
#> 2024.442       195655.1  54893.33276 336416.8  -19621.430 410931.6
#> 2024.462       195603.6  44403.34996 346803.9  -35637.229 426844.4
#> 2024.481       195552.1  34582.57517 356521.7  -50629.562 441733.8
#> 2024.500       195500.7  25315.72355 365685.6  -64774.742 455776.1
#> 2024.519       195449.2  16517.18994 374381.2  -78203.693 469102.1
#> 2024.538       195397.7   8121.34205 382674.1  -91016.788 481812.2
#> 2024.558       195346.3     76.54963 390616.0 -103292.991 493985.5

# Plot forecast
plot(forecast_model)

The forecasted point value for the next week is 195,912.4, with 80% prediction intervals from 128,044.0 to 263,780.8 and 95% prediction intervals from 92,116.7 to 299,708.1.

Created on 2024-03-07 with reprex v2.1.0

tejeshpa commented 2 months ago

# HW4
# Load required packages
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(vars)
#> Warning: package 'vars' was built under R version 4.3.3
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Loading required package: lmtest

# Read data from CSV files
data1 <- read.csv("SeriesReport-202404102055.csv", header = TRUE, skip = 8)
data2 <- read.csv("SeriesReport-202404102057.csv", header = TRUE, skip = 8)
data3 <- read.csv("SeriesReport-202404102059.csv", header = TRUE, skip = 8)
data4 <- read.csv("SeriesReport-202404102100.csv", header = TRUE, skip = 8)

# Combine the data
combined_data <- bind_rows(data1, data2, data3, data4)

# Add column names and remove unnecessary columns
colnames(combined_data) <- c("Period", "Value")
combined_data <- combined_data[, c("Period", "Value")]

# Remove NA values
combined_data <- na.omit(combined_data)

# Descriptive statistics
summary(combined_data$Value)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  228860  329124  413563  403366  474502  589257

# Convert Period to Date format
combined_data$Period <- as.Date(paste0("01-", combined_data$Period), format = "%d-%b-%Y")

# Convert Value to numeric
combined_data$Value <- as.numeric(combined_data$Value)

# Plot the time series
plot(combined_data$Period, combined_data$Value, type = "l", xlab = "Period", ylab = "Value", main = "Time Series Plot")


# Seasonal decomposition
decomp <- decompose(ts(combined_data$Value, frequency = 12))
plot(decomp)


# Fit AR(1) model
ar1_model <- ar(combined_data$Value, order.max = 1)

# Fit AR(p) model with p > 1
p_value <- 2
arp_model <- ar(combined_data$Value, order.max = p_value)

# Extract AIC values
aic_ar1 <- ar1_model$aic
aic_arp <- arp_model$aic

# Compare AIC values
print(paste("AIC for AR(1) model:", aic_ar1))
#> [1] "AIC for AR(1) model: 1405.65884308694"
#> [2] "AIC for AR(1) model: 0"
print(paste("AIC for AR(p) model (p =", p_value, "):", aic_arp))
#> [1] "AIC for AR(p) model (p = 2 ): 1405.65884308694"
#> [2] "AIC for AR(p) model (p = 2 ): 0"               
#> [3] "AIC for AR(p) model (p = 2 ): 1.7046613114162"

# AR(1) Model:

# AIC: 1405.65884308694
# AR(p) Model (p = 2):

# AIC: 1405.65884308694
# AIC: 1.7046613114162
# Comparison:

# The AIC values for both models are almost identical.
# The AR(1) model has a slightly lower AIC compared to the AR(p) model with p = 2.
# Generally, lower AIC values indicate better model fit.
# Based on the AIC comparison, the AR(1) model might be preferred over the AR(p) model with p = 2 in terms of model fit.
# However, it's important to consider other factors and conduct further analysis before making a final decision.

# Forecast one month ahead using AR(1) model
forecast_ar1 <- predict(ar1_model, n.ahead = 1)

# Extract the forecasted value
forecast_value_ar1 <- forecast_ar1$pred[1]

# Print the forecasted value
print(paste("One-month-ahead forecast using AR(1) model:", forecast_value_ar1))
#> [1] "One-month-ahead forecast using AR(1) model: 579258.465006995"

# Convert 'Value' to a time series object
value_ts <- ts(combined_data$Value, start = c(1992, 2), frequency = 12)  # Adjust start date as needed

# Fit a simple AR(1) model to the data
ar1_model <- ar(value_ts, order.max = 1)

# Perform the Granger causality test using the formula interface
granger_test <- grangertest(combined_data$Value ~ lag(combined_data$Value, 1), order = 1)

# Print the results
print(granger_test)
#> Granger causality test
#> 
#> Model 1: combined_data$Value ~ Lags(combined_data$Value, 1:1) + Lags(lag(combined_data$Value, 1), 1:1)
#> Model 2: combined_data$Value ~ Lags(combined_data$Value, 1:1)
#>   Res.Df Df      F    Pr(>F)    
#> 1    380                        
#> 2    381 -1 26.206 4.886e-07 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(granger_test)
#>      Res.Df            Df           F             Pr(>F)     
#>  Min.   :380.0   Min.   :-1   Min.   :26.21   Min.   :5e-07  
#>  1st Qu.:380.2   1st Qu.:-1   1st Qu.:26.21   1st Qu.:5e-07  
#>  Median :380.5   Median :-1   Median :26.21   Median :5e-07  
#>  Mean   :380.5   Mean   :-1   Mean   :26.21   Mean   :5e-07  
#>  3rd Qu.:380.8   3rd Qu.:-1   3rd Qu.:26.21   3rd Qu.:5e-07  
#>  Max.   :381.0   Max.   :-1   Max.   :26.21   Max.   :5e-07  
#>                  NA's   :1    NA's   :1       NA's   :1

# Load required libraries
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.3.3
#> Loading required package: lattice

# Fit a sparse VAR model using the BigVAR function with Basic structure
bigvar_obj <- constructModel(value_matrix, p = 1, struct = "Basic", gran = c(50, 10))
#> Error in eval(expr, envir, enclos): object 'value_matrix' not found

print(summary(bigvar_obj))
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'bigvar_obj' not found

plot(bigvar_obj)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'plot': object 'bigvar_obj' not found

Created on 2024-04-10 with reprex v2.1.0