jlivsey / UB-sping24-time-series

3 stars 1 forks source link

Devansh Pratap Singh #8

Open devanshpratapsingh opened 5 months ago

devanshpratapsingh commented 5 months ago

`` https://github.com/devanshpratapsingh/TimeSeriesAnalysis/blob/main/HW1/TSA_HW1.ipynb

devanshpratapsingh commented 5 months ago

Forecasted value: 252,678 Actual value: 261,029

vgunturi commented 4 months ago

@devanshpratapsingh The values chosen for order were in the range of (0,3) and a grid search was performed to find the best order in that range. But ACF plot of ICNSA data has non-decaying lag values till lag 12 or 13 which makes p value 12 - not in the range of (0,3)?

devanshpratapsingh commented 4 months ago

HW1(Brightspace)

https://github.com/devanshpratapsingh/TimeSeriesAnalysis/blob/main/HW1_Brightspace/HW1%20(2).pdf

devanshpratapsingh commented 4 months ago

@vgunturi Thankyou for pointing this out, the acf plot for this particular data has highest autocorrelation at lag 12 or 13 hence my model should have p value as 12 or 13. But I didn't used acf or pacf plots to determine p,q,d orders, I used grid search, I wasn't clear and thought the grid search already considers both lag 12 and lag 13 values for p-value so I fet it is not an issue. I rectified it later!

devanshpratapsingh commented 4 months ago
library(reprex)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(fredr)
library(ggplot2)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries)
library(urca)
library(tidyverse)
library(readxl)
library(forecast)

fredr_set_key("360481124fc765b815de2697f1bf8d62")

icnsa <- fredr(series_id = "ICNSA") 

icnsa$date <- as.Date(icnsa$date)
ggplot(icnsa, aes(x = date, y = value)) + geom_line() + labs(title = "Insurance Claim Over Time",x = "Year",y = "Number")

str(icnsa)
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
lower_limit <- 100
upper_limit <- 200
mean_value <- mean(icnsa$value)
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
# Filter the data within the range defined by the blue lines
icnsa_filtered <- icnsa %>%
  filter(date >= min(date) & date <= max(date), value >= lower_limit & value <= upper_limit)
#> Error in icnsa %>% filter(date >= min(date) & date <= max(date), value >= : could not find function "%>%"

# Calculate mean and standard deviation for the filtered data
mean_value_filtered <- mean(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found
sd_value_filtered <- sd(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found

# Plot for line representing the mean value and blue lines for the lower and upper limits of the standard deviation
ggplot(icnsa, aes(x = date, y = value)) + 
  geom_line() +
  geom_hline(yintercept = mean_value, linetype = "dashed", color = "red") +
  geom_segment(aes(x = min(date), y = lower_limit, xend = max(date), yend = lower_limit), color = "blue") +
  geom_segment(aes(x = min(date), y = upper_limit, xend = max(date), yend = upper_limit), color = "blue") +
  geom_hline(yintercept = mean_value_filtered, linetype = "dashed", color = "green") +  # Add mean line for filtered data
  labs(title = "Insurance Claim Over Time", x = "Year", y = "Number")
#> Error in ggplot(icnsa, aes(x = date, y = value)): could not find function "ggplot"
# Filter the data within the range defined by the blue lines
icnsa_filtered <- icnsa %>%
  filter(date >= min(date) & date <= max(date), value >= lower_limit & value <= upper_limit)
#> Error in icnsa %>% filter(date >= min(date) & date <= max(date), value >= : could not find function "%>%"

# Mean and standard deviation for the filtered data
mean_value_filtered <- mean(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found
sd_value_filtered <- sd(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found

print(paste("Mean value between the blue lines:", mean_value_filtered))
#> Error in eval(expr, envir, enclos): object 'mean_value_filtered' not found
# All points above the higher limit of the standard deviation
above_limit_points <- icnsa %>%
  filter(value > upper_limit)
#> Error in icnsa %>% filter(value > upper_limit): could not find function "%>%"

# Print the points
print(above_limit_points)
#> Error in eval(expr, envir, enclos): object 'above_limit_points' not found
# Calculate Z-score
icnsa <- icnsa %>%
  mutate(z_score = abs(value - mean(value)) / sd(value))
#> Error in icnsa %>% mutate(z_score = abs(value - mean(value))/sd(value)): could not find function "%>%"

# Set threshold for anomaly detection (e.g., 3 standard deviations)
threshold <- 3

# Identify anomalies
anomalies <- icnsa %>%
  filter(z_score > threshold)
#> Error in icnsa %>% filter(z_score > threshold): could not find function "%>%"

# Print detected anomalies
print(anomalies)
#> Error in eval(expr, envir, enclos): object 'anomalies' not found
library(ggplot2)

# Plot the data with anomalies highlighted
ggplot(icnsa, aes(x = date, y = value)) + 
  geom_line() +
  geom_point(data = anomalies, aes(x = date, y = value), color = "red", size = 2) +
  labs(title = "Insurance Claim Over Time", x = "Year", y = "Number")
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
start_date <- as.Date("2020-01-01")
end_date <- as.Date("2021-01-01")
# Filter the data for the specified range
filtered_icnsa <- icnsa %>%
  filter(date >= start_date & date <= end_date)
#> Error in icnsa %>% filter(date >= start_date & date <= end_date): could not find function "%>%"

# Plot the filtered data with the specified range on the x-axis
ggplot(filtered_icnsa, aes(x = date, y = value)) + 
  geom_line() +
  labs(title = "Insurance Claim Over Time", x = "Date", y = "Number") +
  scale_x_date(date_labels = "%b %Y")
#> Error in ggplot(filtered_icnsa, aes(x = date, y = value)): could not find function "ggplot"
start_date <- as.Date("2020-03-01")
end_date <- as.Date("2021-12-30")
# Filter the data for the specified range
filtered_icnsa <- icnsa %>%
  filter(date >= start_date & date <= end_date)
#> Error in icnsa %>% filter(date >= start_date & date <= end_date): could not find function "%>%"

# Plot the filtered data with the specified range on the x-axis
ggplot(filtered_icnsa, aes(x = date, y = value)) + 
  geom_line() +
  labs(title = "Insurance Claim Over Time", x = "Date", y = "Number") +
  scale_x_date(date_labels = "%b %Y") 
#> Error in ggplot(filtered_icnsa, aes(x = date, y = value)): could not find function "ggplot"
# Defining the range
start_date_start <- as.Date("2020-02-01")
end_date_start <- as.Date("2020-03-20")

# Filter the data for the specified range
filtered_icnsa <- icnsa %>%
  filter(date >= start_date_start & date <= end_date_start)
#> Error in icnsa %>% filter(date >= start_date_start & date <= end_date_start): could not find function "%>%"

# Calculate the differences in value and date
diff_value <- diff(filtered_icnsa$value)
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found
diff_date <- as.numeric(diff(filtered_icnsa$date))
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found

# Calculate the slopes
slopes <- diff_value / diff_date
#> Error in eval(expr, envir, enclos): object 'diff_value' not found

# Find the index of the segment with the maximum slope
max_slope_index <- which.max(slopes)
#> Error in eval(expr, envir, enclos): object 'slopes' not found

# Get the corresponding start and end dates for the segment with the highest slope
highest_slope_start_date <- filtered_icnsa$date[max_slope_index]
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found
highest_slope_end_date <- filtered_icnsa$date[max_slope_index + 1]
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found

# Plot the data with the specified range and slope highlighted
ggplot(filtered_icnsa, aes(x = date, y = value)) + 
  geom_line() +
  geom_segment(aes(x = filtered_icnsa$date[max_slope_index], 
                   y = filtered_icnsa$value[max_slope_index],
                   xend = filtered_icnsa$date[max_slope_index + 1], 
                   yend = filtered_icnsa$value[max_slope_index + 1]),
               color = "red", size = 1) +
  labs(title = "Insurance Claim Over Time", x = "Date", y = "Number")
#> Error in ggplot(filtered_icnsa, aes(x = date, y = value)): could not find function "ggplot"
# Print the value of the x-axis where the slope starts
print(paste("X-axis value where slope starts:", highest_slope_start_date))
#> Error in eval(expr, envir, enclos): object 'highest_slope_start_date' not found
# Define the start and end dates
start_date_end <- as.Date("2021-01-01")
end_date_end <- as.Date("2021-12-31")

# Define the specific value (y-value)
specific_value <- 349995  
# Filter the data for the specified date range
filtered_data <- icnsa_filtered %>%
  filter(date >= start_date_end & date <= end_date_end)
#> Error in icnsa_filtered %>% filter(date >= start_date_end & date <= end_date_end): could not find function "%>%"

# Plot the filtered data with a horizontal line at the specific value
ggplot(filtered_data, aes(x = date, y = value)) +
  geom_line(color = "blue") +  # Adding color to the line
  geom_hline(yintercept = specific_value, linetype = "dashed", color = "green") +
  labs(title = "Data with Horizontal Line", x = "Date", y = "Value")
#> Error in ggplot(filtered_data, aes(x = date, y = value)): could not find function "ggplot"
start_date <- as.Date("2020-03-07")
end_date <- as.Date("2021-07-31")
# Plotting Covid period data with vertical lines to visualize the covid region
plot(icnsa$date, icnsa$value, type = "l", 
     main = "Claims Marking the Covid period",
     xlab = "Year",
     ylab = "Number")
#> Error in eval(expr, envir, enclos): object 'icnsa' not found

abline(v = as.numeric(start_date), col = "red", lty = 2)
#> Error in eval(expr, envir, enclos): object 'start_date' not found
abline(v = as.numeric(end_date), col = "red", lty = 2)
#> Error in eval(expr, envir, enclos): object 'end_date' not found
# Define non-Covid and Covid periods
non_covid_period <- icnsa[icnsa$date < start_date | icnsa$date > end_date, ]
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
covid_period <- icnsa[icnsa$date >= start_date & icnsa$date <= end_date, ]
#> Error in eval(expr, envir, enclos): object 'icnsa' not found

# Initialize an empty list to store updated data frames for each spar value
updated_data_list <- list()

# Initialize an empty data frame to store lambda and MSE values
lambda_mse_df <- data.frame(lambda = numeric(), MSE = numeric())
spar_values <- seq(0.1, 1.0, by = 0.1)
# Implementing cubic spline with different values of spar
lambda_values <- seq(0.1, 1.0, by = 0.1)
for (lambda in lambda_values) {
  # Fit cubic spline to non-Covid period with current spar value
  spline_fit_covid <- smooth.spline(x = as.numeric(non_covid_period$date), y = non_covid_period$value, spar = lambda)

  # Impute new values using spline fit
  imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period$date))$y

  # Updating Covid period data with imputed values
  covid_period$value <- imputed_values

  # Combining non-Covid and imputed values
  updated_icnsa_data <- rbind(non_covid_period, covid_period)
  updated_icnsa_data <- updated_icnsa_data %>% arrange(date)

  # Store updated data frame in the list
  updated_data_list[[as.character(lambda)]] <- updated_icnsa_data

  # Plotting updated data
  plot(updated_icnsa_data$date, updated_icnsa_data$value, type = "l", col = "red", lwd = 2,
       main = paste("Comparison of Time Series (spar =", lambda, ")"), xlab = "Year", ylab = "Number")

  # Add original data for comparison
  lines(icnsa$date, icnsa$value, col = "blue", lwd = 2)

  # Add legend
  legend("topright", legend = c("Updated", "Original"), col = c("red", "blue"), lty = 1, lwd = 2)
}
#> Error in eval(expr, envir, enclos): object 'non_covid_period' not found

# Print the table of lambda and MSE values
print(lambda_mse_df)
#> [1] lambda MSE   
#> <0 rows> (or 0-length row.names)
# Initialize empty vectors to store AIC and BIC values
aic_values <- numeric()
bic_values <- numeric()

# Calculate AIC and BIC for the current model
residuals <- covid_period$value - imputed_values
#> Error in eval(expr, envir, enclos): object 'covid_period' not found
rss <- sum(residuals^2)
#> Error in residuals^2: non-numeric argument to binary operator
n <- length(covid_period$value)
#> Error in eval(expr, envir, enclos): object 'covid_period' not found
k <- 3  # Number of parameters in the model (intercept, lambda, and degree of freedom)
aic_values <- c(aic_values, n * log(rss/n) + 2 * k)
#> Error in eval(expr, envir, enclos): object 'n' not found
bic_values <- c(bic_values, n * log(rss/n) + k * log(n))
#> Error in eval(expr, envir, enclos): object 'n' not found

# Find the index of the minimum AIC and BIC values
min_aic_index <- which.min(aic_values)
min_bic_index <- which.min(bic_values)

# Get the corresponding lambda values
best_lambda_aic <- lambda_values[min_aic_index]
#> Error in eval(expr, envir, enclos): object 'lambda_values' not found
best_lambda_bic <- lambda_values[min_bic_index]
#> Error in eval(expr, envir, enclos): object 'lambda_values' not found

# Print the lambda values corresponding to the minimum AIC and BIC
cat("Lambda value corresponding to minimum AIC:", best_lambda_aic, "\n")
#> Error in eval(expr, envir, enclos): object 'best_lambda_aic' not found
cat("Lambda value corresponding to minimum BIC:", best_lambda_bic, "\n")
#> Error in eval(expr, envir, enclos): object 'best_lambda_bic' not found
#Choosing λ = 0.1 achieves a delicate equilibrium between smoothing and preserving data trends. While lower values such as 0.1 minimize RMSE, they raise concerns of overfitting to noise. Conversely, higher values like 1.0 tend to oversmooth, sacrificing data nuances. λ = 0.3 effectively tackles overfitting while still capturing essential trends, making it a judicious choice for interpolation.
# Fit cubic spline to non-Covid period with spar value of 0.3
spline_fit_covid <- smooth.spline(x = as.numeric(non_covid_period$date), y = non_covid_period$value, spar = 0.1)
#> Error in eval(expr, envir, enclos): object 'non_covid_period' not found

# Impute new values using spline fit
imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period$date))$y
#> Error in eval(expr, envir, enclos): object 'spline_fit_covid' not found

# Updating Covid period data with imputed values
covid_period$value <- imputed_values
#> Error in eval(expr, envir, enclos): object 'imputed_values' not found

# Combining non-Covid and imputed values
updated_icnsa_data <- rbind(non_covid_period, covid_period)
#> Error in eval(expr, envir, enclos): object 'non_covid_period' not found
updated_icnsa_data <- updated_icnsa_data %>% arrange(date)
#> Error in updated_icnsa_data %>% arrange(date): could not find function "%>%"

# Plotting original and updated data together
ggplot() +
  geom_line(data = icnsa, aes(x = date, y = value), color = "red") +
  geom_line(data = updated_icnsa_data, aes(x = date, y = value), color = "blue") +
  labs(title = "Comparison of Original and Updated Data", x = "Year", y = "Number")
#> Error in ggplot(): could not find function "ggplot"
icnsa_ts <- ts(updated_icnsa_data$value, frequency = 52)
#> Error in eval(expr, envir, enclos): object 'updated_icnsa_data' not found

#Applied Holt-Winters models both  additive and multiplicative
hw_add <- HoltWinters(icnsa_ts, seasonal = "additive")
#> Error in eval(expr, envir, enclos): object 'icnsa_ts' not found
hw_mult <- HoltWinters(icnsa_ts, seasonal = "multiplicative")
#> Error in eval(expr, envir, enclos): object 'icnsa_ts' not found

#Forecasted next value using both models
forecast_add <- forecast::forecast(hw_add, h = 1)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
#> Error in eval(expr, envir, enclos): object 'hw_add' not found
forecast_mult <- forecast::forecast(hw_mult, h = 1)
#> Error in eval(expr, envir, enclos): object 'hw_mult' not found

point_forecast_add <- forecast_add$mean
#> Error in eval(expr, envir, enclos): object 'forecast_add' not found
point_forecast_mult <- forecast_mult$mean
#> Error in eval(expr, envir, enclos): object 'forecast_mult' not found

#Printed Point Forecast value of both model
cat("Forecasted Unemployment using Holt-Winters additive Model:", point_forecast_add, "\n")
#> Error in eval(expr, envir, enclos): object 'point_forecast_add' not found
cat("Forecasted Unemployment using Holt-Winters Multiplicative Model:", point_forecast_mult, "\n")
#> Error in eval(expr, envir, enclos): object 'point_forecast_mult' not found

Created on 2024-02-28 with reprex v2.1.0

devanshpratapsingh commented 4 months ago

HW2 Python implementation: https://github.com/devanshpratapsingh/TimeSeriesAnalysis/blob/main/HW2/HW2_TSA.ipynb

devanshpratapsingh commented 4 months ago

03/07/2023 forecast: https://github.com/devanshpratapsingh/TimeSeriesAnalysis/blob/main/HW4.pdf (190698.2)

devanshpratapsingh commented 3 months ago
library(reprex)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(fredr)
library(ggplot2)
library(forecast)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries)
library(urca)
library(tidyverse)
library(readxl)
library(forecast)

fredr_set_key("360481124fc765b815de2697f1bf8d62")

icnsa <- fredr(series_id = "ICNSA") 

icnsa$date <- as.Date(icnsa$date)
ggplot(icnsa, aes(x = date, y = value)) + geom_line() + labs(title = "Insurance Claim Over Time",x = "Year",y = "Number")

str(icnsa)
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
lower_limit <- 100
upper_limit <- 200
mean_value <- mean(icnsa$value)
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
# Filter the data within the range defined by the blue lines
icnsa_filtered <- icnsa %>%
  filter(date >= min(date) & date <= max(date), value >= lower_limit & value <= upper_limit)
#> Error in icnsa %>% filter(date >= min(date) & date <= max(date), value >= : could not find function "%>%"

# Calculate mean and standard deviation for the filtered data
mean_value_filtered <- mean(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found
sd_value_filtered <- sd(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found

# Plot for line representing the mean value and blue lines for the lower and upper limits of the standard deviation
ggplot(icnsa, aes(x = date, y = value)) + 
  geom_line() +
  geom_hline(yintercept = mean_value, linetype = "dashed", color = "red") +
  geom_segment(aes(x = min(date), y = lower_limit, xend = max(date), yend = lower_limit), color = "blue") +
  geom_segment(aes(x = min(date), y = upper_limit, xend = max(date), yend = upper_limit), color = "blue") +
  geom_hline(yintercept = mean_value_filtered, linetype = "dashed", color = "green") +  # Add mean line for filtered data
  labs(title = "Insurance Claim Over Time", x = "Year", y = "Number")
#> Error in ggplot(icnsa, aes(x = date, y = value)): could not find function "ggplot"
# Filter the data within the range defined by the blue lines
icnsa_filtered <- icnsa %>%
  filter(date >= min(date) & date <= max(date), value >= lower_limit & value <= upper_limit)
#> Error in icnsa %>% filter(date >= min(date) & date <= max(date), value >= : could not find function "%>%"

# Mean and standard deviation for the filtered data
mean_value_filtered <- mean(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found
sd_value_filtered <- sd(icnsa_filtered$value)
#> Error in eval(expr, envir, enclos): object 'icnsa_filtered' not found

print(paste("Mean value between the blue lines:", mean_value_filtered))
#> Error in eval(expr, envir, enclos): object 'mean_value_filtered' not found
# All points above the higher limit of the standard deviation
above_limit_points <- icnsa %>%
  filter(value > upper_limit)
#> Error in icnsa %>% filter(value > upper_limit): could not find function "%>%"

# Print the points
print(above_limit_points)
#> Error in eval(expr, envir, enclos): object 'above_limit_points' not found
# Calculate Z-score
icnsa <- icnsa %>%
  mutate(z_score = abs(value - mean(value)) / sd(value))
#> Error in icnsa %>% mutate(z_score = abs(value - mean(value))/sd(value)): could not find function "%>%"

# Set threshold for anomaly detection (e.g., 3 standard deviations)
threshold <- 3

# Identify anomalies
anomalies <- icnsa %>%
  filter(z_score > threshold)
#> Error in icnsa %>% filter(z_score > threshold): could not find function "%>%"

# Print detected anomalies
print(anomalies)
#> Error in eval(expr, envir, enclos): object 'anomalies' not found
library(ggplot2)

# Plot the data with anomalies highlighted
ggplot(icnsa, aes(x = date, y = value)) + 
  geom_line() +
  geom_point(data = anomalies, aes(x = date, y = value), color = "red", size = 2) +
  labs(title = "Insurance Claim Over Time", x = "Year", y = "Number")
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
start_date <- as.Date("2020-01-01")
end_date <- as.Date("2021-01-01")
# Filter the data for the specified range
filtered_icnsa <- icnsa %>%
  filter(date >= start_date & date <= end_date)
#> Error in icnsa %>% filter(date >= start_date & date <= end_date): could not find function "%>%"

# Plot the filtered data with the specified range on the x-axis
ggplot(filtered_icnsa, aes(x = date, y = value)) + 
  geom_line() +
  labs(title = "Insurance Claim Over Time", x = "Date", y = "Number") +
  scale_x_date(date_labels = "%b %Y")
#> Error in ggplot(filtered_icnsa, aes(x = date, y = value)): could not find function "ggplot"
start_date <- as.Date("2020-03-01")
end_date <- as.Date("2021-12-30")
# Filter the data for the specified range
filtered_icnsa <- icnsa %>%
  filter(date >= start_date & date <= end_date)
#> Error in icnsa %>% filter(date >= start_date & date <= end_date): could not find function "%>%"

# Plot the filtered data with the specified range on the x-axis
ggplot(filtered_icnsa, aes(x = date, y = value)) + 
  geom_line() +
  labs(title = "Insurance Claim Over Time", x = "Date", y = "Number") +
  scale_x_date(date_labels = "%b %Y") 
#> Error in ggplot(filtered_icnsa, aes(x = date, y = value)): could not find function "ggplot"
# Defining the range
start_date_start <- as.Date("2020-02-01")
end_date_start <- as.Date("2020-03-20")

# Filter the data for the specified range
filtered_icnsa <- icnsa %>%
  filter(date >= start_date_start & date <= end_date_start)
#> Error in icnsa %>% filter(date >= start_date_start & date <= end_date_start): could not find function "%>%"

# Calculate the differences in value and date
diff_value <- diff(filtered_icnsa$value)
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found
diff_date <- as.numeric(diff(filtered_icnsa$date))
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found

# Calculate the slopes
slopes <- diff_value / diff_date
#> Error in eval(expr, envir, enclos): object 'diff_value' not found

# Find the index of the segment with the maximum slope
max_slope_index <- which.max(slopes)
#> Error in eval(expr, envir, enclos): object 'slopes' not found

# Get the corresponding start and end dates for the segment with the highest slope
highest_slope_start_date <- filtered_icnsa$date[max_slope_index]
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found
highest_slope_end_date <- filtered_icnsa$date[max_slope_index + 1]
#> Error in eval(expr, envir, enclos): object 'filtered_icnsa' not found

# Plot the data with the specified range and slope highlighted
ggplot(filtered_icnsa, aes(x = date, y = value)) + 
  geom_line() +
  geom_segment(aes(x = filtered_icnsa$date[max_slope_index], 
                   y = filtered_icnsa$value[max_slope_index],
                   xend = filtered_icnsa$date[max_slope_index + 1], 
                   yend = filtered_icnsa$value[max_slope_index + 1]),
               color = "red", size = 1) +
  labs(title = "Insurance Claim Over Time", x = "Date", y = "Number")
#> Error in ggplot(filtered_icnsa, aes(x = date, y = value)): could not find function "ggplot"
# Print the value of the x-axis where the slope starts
print(paste("X-axis value where slope starts:", highest_slope_start_date))
#> Error in eval(expr, envir, enclos): object 'highest_slope_start_date' not found
# Define the start and end dates
start_date_end <- as.Date("2021-01-01")
end_date_end <- as.Date("2021-12-31")

# Define the specific value (y-value)
specific_value <- 349995  
# Filter the data for the specified date range
filtered_data <- icnsa_filtered %>%
  filter(date >= start_date_end & date <= end_date_end)
#> Error in icnsa_filtered %>% filter(date >= start_date_end & date <= end_date_end): could not find function "%>%"

# Plot the filtered data with a horizontal line at the specific value
ggplot(filtered_data, aes(x = date, y = value)) +
  geom_line(color = "blue") +  # Adding color to the line
  geom_hline(yintercept = specific_value, linetype = "dashed", color = "green") +
  labs(title = "Data with Horizontal Line", x = "Date", y = "Value")
#> Error in ggplot(filtered_data, aes(x = date, y = value)): could not find function "ggplot"
start_date <- as.Date("2020-03-07")
end_date <- as.Date("2021-07-31")
# Plotting Covid period data with vertical lines to visualize the covid region
plot(icnsa$date, icnsa$value, type = "l", 
     main = "Claims Marking the Covid period",
     xlab = "Year",
     ylab = "Number")
#> Error in eval(expr, envir, enclos): object 'icnsa' not found

abline(v = as.numeric(start_date), col = "red", lty = 2)
#> Error in eval(expr, envir, enclos): object 'start_date' not found
abline(v = as.numeric(end_date), col = "red", lty = 2)
#> Error in eval(expr, envir, enclos): object 'end_date' not found
# Define non-Covid and Covid periods
non_covid_period <- icnsa[icnsa$date < start_date | icnsa$date > end_date, ]
#> Error in eval(expr, envir, enclos): object 'icnsa' not found
covid_period <- icnsa[icnsa$date >= start_date & icnsa$date <= end_date, ]
#> Error in eval(expr, envir, enclos): object 'icnsa' not found

# Initialize an empty list to store updated data frames for each spar value
updated_data_list <- list()

# Initialize an empty data frame to store lambda and MSE values
lambda_mse_df <- data.frame(lambda = numeric(), MSE = numeric())
spar_values <- seq(0.1, 1.0, by = 0.1)
# Implementing cubic spline with different values of spar
lambda_values <- seq(0.1, 1.0, by = 0.1)
for (lambda in lambda_values) {
  # Fit cubic spline to non-Covid period with current spar value
  spline_fit_covid <- smooth.spline(x = as.numeric(non_covid_period$date), y = non_covid_period$value, spar = lambda)

  # Impute new values using spline fit
  imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period$date))$y

  # Updating Covid period data with imputed values
  covid_period$value <- imputed_values

  # Combining non-Covid and imputed values
  updated_icnsa_data <- rbind(non_covid_period, covid_period)
  updated_icnsa_data <- updated_icnsa_data %>% arrange(date)

  # Store updated data frame in the list
  updated_data_list[[as.character(lambda)]] <- updated_icnsa_data

  # Plotting updated data
  plot(updated_icnsa_data$date, updated_icnsa_data$value, type = "l", col = "red", lwd = 2,
       main = paste("Comparison of Time Series (spar =", lambda, ")"), xlab = "Year", ylab = "Number")

  # Add original data for comparison
  lines(icnsa$date, icnsa$value, col = "blue", lwd = 2)

  # Add legend
  legend("topright", legend = c("Updated", "Original"), col = c("red", "blue"), lty = 1, lwd = 2)
}
#> Error in eval(expr, envir, enclos): object 'non_covid_period' not found

# Print the table of lambda and MSE values
print(lambda_mse_df)
#> [1] lambda MSE   
#> <0 rows> (or 0-length row.names)
# Initialize empty vectors to store AIC and BIC values
aic_values <- numeric()
bic_values <- numeric()

# Calculate AIC and BIC for the current model
residuals <- covid_period$value - imputed_values
#> Error in eval(expr, envir, enclos): object 'covid_period' not found
rss <- sum(residuals^2)
#> Error in residuals^2: non-numeric argument to binary operator
n <- length(covid_period$value)
#> Error in eval(expr, envir, enclos): object 'covid_period' not found
k <- 3  # Number of parameters in the model (intercept, lambda, and degree of freedom)
aic_values <- c(aic_values, n * log(rss/n) + 2 * k)
#> Error in eval(expr, envir, enclos): object 'n' not found
bic_values <- c(bic_values, n * log(rss/n) + k * log(n))
#> Error in eval(expr, envir, enclos): object 'n' not found

# Find the index of the minimum AIC and BIC values
min_aic_index <- which.min(aic_values)
min_bic_index <- which.min(bic_values)

# Get the corresponding lambda values
best_lambda_aic <- lambda_values[min_aic_index]
#> Error in eval(expr, envir, enclos): object 'lambda_values' not found
best_lambda_bic <- lambda_values[min_bic_index]
#> Error in eval(expr, envir, enclos): object 'lambda_values' not found

# Print the lambda values corresponding to the minimum AIC and BIC
cat("Lambda value corresponding to minimum AIC:", best_lambda_aic, "\n")
#> Error in eval(expr, envir, enclos): object 'best_lambda_aic' not found
cat("Lambda value corresponding to minimum BIC:", best_lambda_bic, "\n")
#> Error in eval(expr, envir, enclos): object 'best_lambda_bic' not found
#Choosing λ = 0.1 achieves a delicate equilibrium between smoothing and preserving data trends. While lower values such as 0.1 minimize RMSE, they raise concerns of overfitting to noise. Conversely, higher values like 1.0 tend to oversmooth, sacrificing data nuances. λ = 0.3 effectively tackles overfitting while still capturing essential trends, making it a judicious choice for interpolation.
# Fit cubic spline to non-Covid period with spar value of 0.3
spline_fit_covid <- smooth.spline(x = as.numeric(non_covid_period$date), y = non_covid_period$value, spar = 0.1)
#> Error in eval(expr, envir, enclos): object 'non_covid_period' not found

# Impute new values using spline fit
imputed_values <- predict(spline_fit_covid, x = as.numeric(covid_period$date))$y
#> Error in eval(expr, envir, enclos): object 'spline_fit_covid' not found

# Updating Covid period data with imputed values
covid_period$value <- imputed_values
#> Error in eval(expr, envir, enclos): object 'imputed_values' not found

# Combining non-Covid and imputed values
updated_icnsa_data <- rbind(non_covid_period, covid_period)
#> Error in eval(expr, envir, enclos): object 'non_covid_period' not found
updated_icnsa_data <- updated_icnsa_data %>% arrange(date)
#> Error in updated_icnsa_data %>% arrange(date): could not find function "%>%"

# Plotting original and updated data together
ggplot() +
  geom_line(data = icnsa, aes(x = date, y = value), color = "red") +
  geom_line(data = updated_icnsa_data, aes(x = date, y = value), color = "blue") +
  labs(title = "Comparison of Original and Updated Data", x = "Year", y = "Number")
#> Error in ggplot(): could not find function "ggplot"
icnsa_ts <- ts(updated_icnsa_data$value, frequency = 52)
#> Error in eval(expr, envir, enclos): object 'updated_icnsa_data' not found

#Applied Holt-Winters models both  additive and multiplicative
hw_add <- HoltWinters(icnsa_ts, seasonal = "additive")
#> Error in eval(expr, envir, enclos): object 'icnsa_ts' not found
hw_mult <- HoltWinters(icnsa_ts, seasonal = "multiplicative")
#> Error in eval(expr, envir, enclos): object 'icnsa_ts' not found

#Forecasted next value using both models
forecast_add <- forecast::forecast(hw_add, h = 1)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
#> Error in eval(expr, envir, enclos): object 'hw_add' not found
forecast_mult <- forecast::forecast(hw_mult, h = 1)
#> Error in eval(expr, envir, enclos): object 'hw_mult' not found

point_forecast_add <- forecast_add$mean
#> Error in eval(expr, envir, enclos): object 'forecast_add' not found
point_forecast_mult <- forecast_mult$mean
#> Error in eval(expr, envir, enclos): object 'forecast_mult' not found

#Printed Point Forecast value of both model
cat("Forecasted Unemployment using Holt-Winters additive Model:", point_forecast_add, "\n")
#> Error in eval(expr, envir, enclos): object 'point_forecast_add' not found
cat("Forecasted Unemployment using Holt-Winters Multiplicative Model:", point_forecast_mult, "\n")
#> Error in eval(expr, envir, enclos): object 'point_forecast_mult' not found

Created on 2024-02-28 with reprex v2.1.0

https://github.com/devanshpratapsingh/TimeSeriesAnalysis/blob/main/HW2/HW2.pdf

devanshpratapsingh commented 2 months ago

HW4: https://github.com/devanshpratapsingh/TimeSeriesAnalysis/blob/main/HW4/HW4%20(1).pdf