Open UpenderKaveti opened 5 months ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import boxcox
from statsmodels.tsa.api import AutoReg
#reading the data
data = pd.read_csv(r"C:\Users\upend\Downloads\ICNSA.csv")
#printing the data
data
#> DATE ICNSA
0 1967-01-07 346000
1 1967-01-14 334000
2 1967-01-21 277000
3 1967-01-28 252000
4 1967-02-04 274000
... ... ...
2973 2023-12-30 269409
2974 2024-01-06 318906
2975 2024-01-13 291330
2976 2024-01-20 249947
2977 2024-01-27 261029
[2978 rows x 2 columns]
data.plot()
#> <Axes: >
mask = (data.index < len(data)-3)
df_train = data[mask].copy()
df_test = data[~mask].copy()
df_test
#> DATE ICNSA
2975 2024-01-13 291330
2976 2024-01-20 249947
2977 2024-01-27 261029
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
acf_original = plot_acf(df_train['ICNSA'])
pacf_original = plot_pacf(df_train['ICNSA'])
# In[69]:
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(df_train['ICNSA'], order=(2,1,1))
model_fit = model.fit()
model_fit.summary()
#> <class 'statsmodels.iolib.summary.Summary'>
"""
SARIMAX Results
==============================================================================
Dep. Variable: ICNSA No. Observations: 2975
Model: ARIMA(2, 1, 1) Log Likelihood -38063.993
Date: Mon, 12 Feb 2024 AIC 76135.986
Time: 23:08:51 BIC 76159.976
Sample: 0 HQIC 76144.619
- 2975
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 1.2666 0.007 192.006 0.000 1.254 1.279
ar.L2 -0.4108 0.003 -158.750 0.000 -0.416 -0.406
ma.L1 -0.9643 0.006 -148.807 0.000 -0.977 -0.952
sigma2 7.941e+09 1.35e-13 5.86e+22 0.000 7.94e+09 7.94e+09
===================================================================================
Ljung-Box (L1) (Q): 6.13 Jarque-Bera (JB): 23122109.03
Prob(Q): 0.01 Prob(JB): 0.00
Heteroskedasticity (H): 4.96 Skew: 15.00
Prob(H) (two-sided): 0.00 Kurtosis: 433.92
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.23e+37. Standard errors may be unstable.
"""
final_answer = model_fit.predict(start = len(df_train), end=len(df_train)+3)
print(final_answer) #predicted data
#> 2975 328633.016942
#> 2976 320621.049412
#> 2977 306477.839593
#> 2978 291855.658206
#> Name: predicted_mean, dtype: float64
df_test # original data
#> DATE ICNSA
2975 2024-01-13 291330
2976 2024-01-20 249947
2977 2024-01-27 261029
final_answer.iloc[-1:]
#> 2978 291855.658206
Name: predicted_mean, dtype: float64
Dataset: Monthly Supply of New Houses in the United States (MSACSR) - https://fred.stlouisfed.org/series/MSACSR
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima.model import ARIMA
data = pd.read_csv(r"C:\Users\upend\Downloads\MSACSR.csv")
data
#> DATE MSACSR
0 1963-01-01 4.7
1 1963-02-01 6.6
2 1963-03-01 6.4
3 1963-04-01 5.3
4 1963-05-01 5.1
.. ... ...
727 2023-08-01 7.9
728 2023-09-01 7.5
729 2023-10-01 7.8
730 2023-11-01 8.8
731 2023-12-01 8.2
[732 rows x 2 columns]
data.plot()
#> <Axes: >
log_transformed_data = np.log(data['MSACSR'])
log_transformed_data.plot()
#> <Axes: >
plt.figure(figsize=(12, 6))
#> <Figure size 1200x600 with 0 Axes>
plt.subplot(2, 1, 1)
#> <Axes: >
plt.plot(data['MSACSR'], label='Original Data')
#> [<matplotlib.lines.Line2D at 0x1ce7e7fa3d0>]
plt.title('Original Data')
#> Text(0.5, 1.0, 'Original Data')
plt.subplot(2, 1, 2)
#> <Axes: >
plt.plot(log_transformed_data, label='Transformed Data', color='orange')
#> [<matplotlib.lines.Line2D at 0x1ce7e840050>]
plt.title('Log Transformed Data')
#> Text(0.5, 1.0, 'Log Transformed Data')
plt.tight_layout()
plt.show()
'''
Null Hypothesis is p-value > 5%
Alternate Hypothesis is p-value < 5%
'''
#> '\nNull Hypothesis is p-value > 5%\nAlternate Hypothesis is p-value < 5%\n'
adfuller(data['MSACSR'])
#> (-3.733882476547235,
0.0036610363673251546,
20,
711,
{'1%': -3.439580754053961,
'5%': -2.865613606467485,
'10%': -2.568939269723711},
1101.062338869638)
# p value is less than 5% so null hypothesis is rejected
# the data is stationary
plot_acf(data['MSACSR'])
#> <Figure size 640x480 with 1 Axes>
plot_pacf(data['MSACSR'])
#> <Figure size 640x480 with 1 Axes>
# p = 1
# q = 1
# d = 0
model = ARIMA(log_transformed_data, order=(1,0,1))
model_fit = model.fit()
model_fit.summary()
#> <class 'statsmodels.iolib.summary.Summary'>
"""
SARIMAX Results
==============================================================================
Dep. Variable: MSACSR No. Observations: 732
Model: ARIMA(1, 0, 1) Log Likelihood 791.129
Date: Mon, 12 Feb 2024 AIC -1574.259
Time: 23:14:21 BIC -1555.876
Sample: 0 HQIC -1567.168
- 732
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 1.7810 0.075 23.738 0.000 1.634 1.928
ar.L1 0.9672 0.010 94.401 0.000 0.947 0.987
ma.L1 -0.1854 0.029 -6.314 0.000 -0.243 -0.128
sigma2 0.0067 0.000 25.150 0.000 0.006 0.007
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 73.01
Prob(Q): 0.94 Prob(JB): 0.00
Heteroskedasticity (H): 1.05 Skew: 0.04
Prob(H) (two-sided): 0.71 Kurtosis: 4.55
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
"""
res_log = model_fit.predict(start = len(log_transformed_data), end=len(log_transformed_data))
final_answer = np.exp(res_log)
final_answer
#> 732 8.16215
dtype: float64
import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import ccf
import statsmodels.api as sm
# reading the data from API
fred = Fred(api_key='7d6f571eb06bbfe443c9fa3a4ee6e742')
data = fred.get_series('ICNSA')
data.head()
#> 1967-01-07 346000.0
1967-01-14 334000.0
1967-01-21 277000.0
1967-01-28 252000.0
1967-02-04 274000.0
dtype: float64
# plotting ICNSA data
data.plot()
#> <Axes: >
# covid period data
data.plot(xlim=['2020-01-01','2022-09-01'],ylim=[0,7500000],figsize=(12,4))
#> <Axes: >
# reading 2nd data
data_1 = fred.get_series('CCNSA')
data_1.head()
#> 1967-01-07 1594000.0
1967-01-14 1563000.0
1967-01-21 1551000.0
1967-01-28 1533000.0
1967-02-04 1534000.0
dtype: float64
#plotting the 2nd data
data_1.plot()
#> <Axes: >
print("Shape of ICNSA data", data.shape)
#> Shape of ICNSA data (2980,)
print("Shape of 2nd data", data_1.shape)
#> Shape of 2nd data (2979,)
# checking for missing values
print("Count of missing values in ICNSA:", data.isna().sum())
#> Count of missing values in ICNSA: 0
print("Count of missing values in Covariate data:", data_1.isna().sum())
#> Count of missing values in Covariate data: 0
data.head()
#> 1967-01-07 346000.0
1967-01-14 334000.0
1967-01-21 277000.0
1967-01-28 252000.0
1967-02-04 274000.0
dtype: float64
data.tail()
#> 2024-01-13 291330.0
2024-01-20 249947.0
2024-01-27 263919.0
2024-02-03 234729.0
2024-02-10 222164.0
dtype: float64
data_1.head()
#> 1967-01-07 1594000.0
1967-01-14 1563000.0
1967-01-21 1551000.0
1967-01-28 1533000.0
1967-02-04 1534000.0
dtype: float64
data_1.tail()
#> 2024-01-06 2122548.0
2024-01-13 2053298.0
2024-01-20 2181429.0
2024-01-27 2130008.0
2024-02-03 2146550.0
dtype: float64
data.info()
#> <class 'pandas.core.series.Series'>
#> DatetimeIndex: 2980 entries, 1967-01-07 to 2024-02-10
#> Series name: None
#> Non-Null Count Dtype
#> -------------- -----
#> 2980 non-null float64
#> dtypes: float64(1)
#> memory usage: 46.6 KB
data_1.info()
#> <class 'pandas.core.series.Series'>
#> DatetimeIndex: 2979 entries, 1967-01-07 to 2024-02-03
#> Series name: None
#> Non-Null Count Dtype
#> -------------- -----
#> 2979 non-null float64
#> dtypes: float64(1)
#> memory usage: 46.5 KB
# as there is no column name for ICNSA data, I am adding a column name 'data'
col = ['data']
d = pd.DataFrame(data.to_numpy(), columns=col)
# renaming 2nd dataset column name to 'data'
col_1 = ['data']
d_1 = pd.DataFrame(data_1.astype(float).to_numpy(), columns=col_1)
d.head()
#> data
0 346000.0
1 334000.0
2 277000.0
3 252000.0
4 274000.0
d_1.head()
#> data
0 1594000.0
1 1563000.0
2 1551000.0
3 1533000.0
4 1534000.0
# Correlation Analysis between ICNSA data and 2nd data
correlation_matrix = d.corrwith(d_1)
print("Pearson correlation coefficients between TICNSA data and 2nd data:",correlation_matrix)
#> Pearson correlation coefficients between TICNSA data and 2nd data: data 0.714954
#> dtype: float64
# Pairwise correlation is computed between columns of ICNSA with columns of 2nd dataset.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corrwith.html
# As the correlation value is near to +1 it indicates a positive linear relationship between ICNSA and 2nd dataset
def stationarity_test(series):
result = adfuller(series)
print('ADF Statistic:', result[0])
print('p-value:', result[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
print("Stationarity Test for ICNSA data:")
#> Stationarity Test for ICNSA data:
stationarity_test(d)
#> ADF Statistic: -6.872346491681085
#> p-value: 1.5044350273818226e-09
#> Critical Values:
#> 1%: -3.433
#> 5%: -2.863
#> 10%: -2.567
print("\nStationarity Test for 2nd data:")
#>
#> Stationarity Test for 2nd data:
stationarity_test(d_1)
#> ADF Statistic: -6.045172149046205
#> p-value: 1.3171244588684742e-07
#> Critical Values:
#> 1%: -3.433
#> 5%: -2.863
#> 10%: -2.567
# Based on the p-value, we can say that ICNSA and 2nd dataset is stationary.
# acf and pacf plots for ICNSA data
plot_acf(d,)
#> <Figure size 640x480 with 1 Axes>
plot_pacf(d)
#> <Figure size 640x480 with 1 Axes>
# acf and pacf plots for 2nd data
plot_acf(d_1)
#> <Figure size 640x480 with 1 Axes>
plot_pacf(d_1)
#> <Figure size 640x480 with 1 Axes>
# ACF plots of ICNSA and 2nd dataset is similar to the trend of exponential decay
# PACF plots of ICNSA and 2nd dataset is having a wave like pattern
# ccf plot
ccf_result = ccf(d.iloc[:-1,:], d_1)
plt.plot(ccf_result)
#> [<matplotlib.lines.Line2D at 0x1f36b7452d0>]
plt.title('Cross-correlation Function')
#> Text(0.5, 1.0, 'Cross-correlation Function')
plt.xlabel('Lag')
#> Text(0.5, 4.444444444444445, 'Lag')
plt.ylabel('CCF')
#> Text(4.444444444444452, 0.5, 'CCF')
plt.show()
# Fitting the model
order = (1, 0, 1) # (p, d, q)
model = sm.tsa.ARIMA(d.iloc[:-1,:], exog=d_1, order=order)
res = model.fit()
print(res.summary())
#> SARIMAX Results
#> ==============================================================================
#> Dep. Variable: data No. Observations: 2979
#> Model: ARIMA(1, 0, 1) Log Likelihood -38182.330
#> Date: Thu, 22 Feb 2024 AIC 76374.661
#> Time: 02:35:05 BIC 76404.657
#> Sample: 0 HQIC 76385.454
#> - 2979
#> Covariance Type: opg
#> ==============================================================================
#> coef std err z P>|z| [0.025 0.975]
#> ------------------------------------------------------------------------------
#> const 5.783e+04 2.99e-10 1.93e+14 0.000 5.78e+04 5.78e+04
#> data 0.0884 0.002 58.630 0.000 0.085 0.091
#> ar.L1 0.8088 0.004 216.010 0.000 0.801 0.816
#> ma.L1 0.3094 0.003 100.979 0.000 0.303 0.315
#> sigma2 8.093e+09 5.05e-13 1.6e+22 0.000 8.09e+09 8.09e+09
#> ===================================================================================
#> Ljung-Box (L1) (Q): 1.91 Jarque-Bera (JB): 18524038.62
#> Prob(Q): 0.17 Prob(JB): 0.00
#> Heteroskedasticity (H): 4.76 Skew: 13.82
#> Prob(H) (two-sided): 0.00 Kurtosis: 388.32
#> ===================================================================================
#>
#> Warnings:
#> [1] Covariance matrix calculated using the outer product of gradients (complex-step).
#> [2] Covariance matrix is singular or near-singular, with condition number 1.33e+37. Standard errors may be unstable.
l=[]
for i in range(0,2978):
k = res.predict(start=i, end=i,exog=d_1.iloc[:i])
l.append(k.tolist())
l_data = pd.DataFrame(l, columns=['data'])
forecast_values = res.forecast(steps=3, exog=d_1.iloc[-3:])
forecast_values
#>
2979 229064.731939
2980 228667.211960
2981 233487.327614
# predicted value is 233487.327614
Name: predicted_mean, dtype: float64
#pip install fredapi
#pip install hampel
# importing the required libraries
import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import ccf
import statsmodels.api as sm
from scipy.interpolate import UnivariateSpline
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from hampel import hampel
#> Traceback (most recent call last):
#> Cell In[14], line 1
#> ----> 1 from hampel import hampel
#> ModuleNotFoundError: No module named 'hampel'
import warnings
warnings.filterwarnings("ignore")
# reading the data from API
fred = Fred(api_key='7d6f571eb06bbfe443c9fa3a4ee6e742')
data = fred.get_series('ICNSA')
# visualizing the data
plt.plot(data)
#> [<matplotlib.lines.Line2D at 0x250c17bc090>]
plt.show()
plt.close()
# 1st 5 records in the data
data.head()
#> 1967-01-07 346000.0
1967-01-14 334000.0
1967-01-21 277000.0
1967-01-28 252000.0
1967-02-04 274000.0
dtype: float64
# dates in the dataset
data.index
#> DatetimeIndex(['1967-01-07', '1967-01-14', '1967-01-21', '1967-01-28',
'1967-02-04', '1967-02-11', '1967-02-18', '1967-02-25',
'1967-03-04', '1967-03-11',
...
'2023-12-16', '2023-12-23', '2023-12-30', '2024-01-06',
'2024-01-13', '2024-01-20', '2024-01-27', '2024-02-03',
'2024-02-10', '2024-02-17'],
dtype='datetime64[ns]', length=2981, freq=None)
# creating a dataframe named 'X'
X = pd.DataFrame()
X['date']= data.index
X['ICNSA'] = data.to_numpy()
# printing the dataframe 'X'
X
#> date ICNSA
0 1967-01-07 346000.0
1 1967-01-14 334000.0
2 1967-01-21 277000.0
3 1967-01-28 252000.0
4 1967-02-04 274000.0
... ... ...
2976 2024-01-20 249947.0
2977 2024-01-27 263919.0
2978 2024-02-03 234729.0
2979 2024-02-10 223985.0
2980 2024-02-17 197932.0
[2981 rows x 2 columns]
result = hampel(data, window_size=3, n_sigma=3.0)
#> Traceback (most recent call last):
#> Cell In[28], line 1
#> ----> 1 result = hampel(data, window_size=3, n_sigma=3.0)
#> NameError: name 'hampel' is not defined
# https://pypi.org/project/hampel/
filtered_data = result.filtered_data
#> Traceback (most recent call last):
#> Cell In[29], line 3
#> 1 # https://pypi.org/project/hampel/
#> ----> 3 filtered_data = result.filtered_data
#> NameError: name 'result' is not defined
outlier_indices = result.outlier_indices
#> Traceback (most recent call last):
#> Cell In[30], line 1
#> ----> 1 outlier_indices = result.outlier_indices
#> NameError: name 'result' is not defined
medians = result.medians
#> Traceback (most recent call last):
#> Cell In[31], line 1
#> ----> 1 medians = result.medians
#> NameError: name 'result' is not defined
mad_values = result.median_absolute_deviations
#> Traceback (most recent call last):
#> Cell In[32], line 1
#> ----> 1 mad_values = result.median_absolute_deviations
#> NameError: name 'result' is not defined
thresholds = result.thresholds
#> Traceback (most recent call last):
#> Cell In[33], line 1
#> ----> 1 thresholds = result.thresholds
#> NameError: name 'result' is not defined
outlier_indices
#> Traceback (most recent call last):
#> Cell In[34], line 1
#> ----> 1 outlier_indices
#> NameError: name 'outlier_indices' is not defined
plt.plot(X['ICNSA'])
#> [<matplotlib.lines.Line2D at 0x250c2cbb850>]
plt.show()
plt.close()
# from the above plot is clear that there is sudden change from the index '2700' approximately,
for i in outlier_indices:
if(i>=2700):
print(i)
break
#> Traceback (most recent call last):
#> Cell In[38], line 3
#> 1 # from the above plot is clear that there is sudden change from the index '2700' approximately,
#> ----> 3 for i in outlier_indices:
#> 4 if(i>=2700):
#> 5 print(i)
#> NameError: name 'outlier_indices' is not defined
# hence considering the index after '2700' from the 'outlier_indices', which is '2750'.
# to check for the exact sudden change
plt.plot(X['ICNSA'].iloc[2750:2785])
#> [<matplotlib.lines.Line2D at 0x250c5d8e2d0>]
plt.show()
plt.close()
# after a detailed check, the sudden change in the data is at the index '2775'.
sudden_change_index = 2775
# covid period start data: 1st week of March, 2020.
ax = data.iloc[sudden_change_index-5:2800].plot()
fig = ax.get_figure()
plt.show(block=False)
plt.close(fig)
# visualizing the median values within the sliding window
plt.plot(medians[sudden_change_index-1:])
#> Traceback (most recent call last):
#> Cell In[47], line 3
#> 1 # visualizing the median values within the sliding window
#> ----> 3 plt.plot(medians[sudden_change_index-1:])
#> NameError: name 'medians' is not defined
plt.show()
plt.close()
# the above graph is following a decreasing trend.
# from the index '100' graph is not decreasing much when compared to the graph before index '100'.
# hence an index after '100' is to be considered.
# Approimately index '125' is consider as the end od covid period as there is small increasing the graph (between 100 - 150).
ending_index = 125
#data.iloc[2753:(2753+150)].plot()
#data.iloc[2753:(2753+150)].plot()
#2753
data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index].plot() # covid range
#> <Axes: >
#plt.plot(data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index])
plt.show()
plt.close()
# covid start date is from 1st week of March, 2020 to 1st week of July, 2021.
def get_spline(smoothing_param = 0.5):
x = range(0,len(X['date'].iloc[:sudden_change_index]))
y = X['ICNSA'].iloc[:sudden_change_index]
# Adjust this parameter to control smoothing
spline = UnivariateSpline(x, y, k=3, s=smoothing_param)
x_interp = range(0,len(X['date'].iloc[sudden_change_index:sudden_change_index+ending_index]))
y_interp = spline(x_interp)
frames = [y, pd.DataFrame(y_interp), data.iloc[sudden_change_index + ending_index:]]
new_data = pd.DataFrame()
new_data['ICNSA'] = pd.concat(frames)
new_data['date'] = data.index
new_data = new_data.set_index('date')
return new_data
collection_of_splines = []
for i in [0.25, 0.5, 0.75]:
collection_of_splines.append(get_spline(smoothing_param = i))
def additive_model(new_data):
model = ExponentialSmoothing(new_data, trend="add", seasonal="add")
model_additive = model.fit()
forecast_additive = model_additive.forecast(1)
plt.plot(new_data, label="Data")
plt.plot(model_additive.fittedvalues, label="Additive Fitted Values")
plt.legend()
plt.show()
plt.close()
return forecast_additive
collection_of_additive = []
for i,j in zip(collection_of_splines,[0.25, 0.5, 0.75]):
print("Additive model with lambda = ", j)
collection_of_additive.append(additive_model(new_data = i))
#> Additive model with lambda = 0.25
#> Additive model with lambda = 0.5
#> Additive model with lambda = 0.75
def multiplicative_model(new_data):
model = ExponentialSmoothing(new_data, trend="add", seasonal="mul")
model_multiplicative = model.fit()
forecast_multiplicative = model_multiplicative.forecast(1)
plt.plot(new_data, label="Original Data")
plt.plot(model_multiplicative.fittedvalues, label="Multiplicative Fitted Values")
plt.legend()
plt.show()
plt.close()
return forecast_multiplicative
collection_of_multiplicative = []
for i,j in zip(collection_of_splines,[0.25, 0.5, 0.75]):
print("Multiplicative model with lambda = ", j)
collection_of_multiplicative.append(multiplicative_model(new_data = i))
#> Multiplicative model with lambda = 0.25
#> Multiplicative model with lambda = 0.5
#> Multiplicative model with lambda = 0.75
# Additive Holt-Winters
for i,j in zip(collection_of_additive,[0.25, 0.5, 0.75]):
print("Prediction with Additive Holt-Winters with lamdba {} = {}".format(j, i.to_string(index=False)))
#> Prediction with Additive Holt-Winters with lamdba 0.25 = 238179.045324
#> Freq: W-SAT
#> Prediction with Additive Holt-Winters with lamdba 0.5 = 238179.04623
#> Freq: W-SAT
#> Prediction with Additive Holt-Winters with lamdba 0.75 = 238179.046926
#> Freq: W-SAT
# Multiplicative Holt-Winters
for i,j in zip(collection_of_multiplicative, [0.25, 0.5, 0.75]):
print("Prediction with Multiplicative Holt-Winters with lamdba {} = {}".format(j, i.to_string(index=False)))
#> Prediction with Multiplicative Holt-Winters with lamdba 0.25 = 213876.611124
#> Freq: W-SAT
#> Prediction with Multiplicative Holt-Winters with lamdba 0.5 = 213876.612128
#> Freq: W-SAT
#> Prediction with Multiplicative Holt-Winters with lamdba 0.75 = 213876.612898
#> Freq: W-SAT
#pip install fredapi
#pip install hampel
# importing the required libraries
import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import ccf
import statsmodels.api as sm
from scipy.interpolate import UnivariateSpline
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from hampel import hampel
#> Traceback (most recent call last):
#> Cell In[14], line 1
#> ----> 1 from hampel import hampel
#> ModuleNotFoundError: No module named 'hampel'
import warnings
warnings.filterwarnings("ignore")
# reading the data from API
fred = Fred(api_key='7d6f571eb06bbfe443c9fa3a4ee6e742')
data = fred.get_series('ICNSA')
# visualizing the data
plt.plot(data)
#> [<matplotlib.lines.Line2D at 0x1f8e8310050>]
plt.show()
plt.close()
# 1st 5 records in the data
data.head()
#> 1967-01-07 346000.0
1967-01-14 334000.0
1967-01-21 277000.0
1967-01-28 252000.0
1967-02-04 274000.0
dtype: float64
# dates in the dataset
data.index
#> DatetimeIndex(['1967-01-07', '1967-01-14', '1967-01-21', '1967-01-28',
'1967-02-04', '1967-02-11', '1967-02-18', '1967-02-25',
'1967-03-04', '1967-03-11',
...
'2023-12-23', '2023-12-30', '2024-01-06', '2024-01-13',
'2024-01-20', '2024-01-27', '2024-02-03', '2024-02-10',
'2024-02-17', '2024-02-24'],
dtype='datetime64[ns]', length=2982, freq=None)
# creating a dataframe named 'X'
X = pd.DataFrame()
X['date']= data.index
X['ICNSA'] = data.to_numpy()
# printing the dataframe 'X'
X
#> date ICNSA
0 1967-01-07 346000.0
1 1967-01-14 334000.0
2 1967-01-21 277000.0
3 1967-01-28 252000.0
4 1967-02-04 274000.0
... ... ...
2977 2024-01-27 263919.0
2978 2024-02-03 234729.0
2979 2024-02-10 223985.0
2980 2024-02-17 199337.0
2981 2024-02-24 193988.0
[2982 rows x 2 columns]
result = hampel(data, window_size=3, n_sigma=3.0)
#> Traceback (most recent call last):
#> Cell In[28], line 1
#> ----> 1 result = hampel(data, window_size=3, n_sigma=3.0)
#> NameError: name 'hampel' is not defined
# https://pypi.org/project/hampel/
filtered_data = result.filtered_data
#> Traceback (most recent call last):
#> Cell In[29], line 3
#> 1 # https://pypi.org/project/hampel/
#> ----> 3 filtered_data = result.filtered_data
#> NameError: name 'result' is not defined
outlier_indices = result.outlier_indices
#> Traceback (most recent call last):
#> Cell In[30], line 1
#> ----> 1 outlier_indices = result.outlier_indices
#> NameError: name 'result' is not defined
medians = result.medians
#> Traceback (most recent call last):
#> Cell In[31], line 1
#> ----> 1 medians = result.medians
#> NameError: name 'result' is not defined
mad_values = result.median_absolute_deviations
#> Traceback (most recent call last):
#> Cell In[32], line 1
#> ----> 1 mad_values = result.median_absolute_deviations
#> NameError: name 'result' is not defined
thresholds = result.thresholds
#> Traceback (most recent call last):
#> Cell In[33], line 1
#> ----> 1 thresholds = result.thresholds
#> NameError: name 'result' is not defined
outlier_indices
#> Traceback (most recent call last):
#> Cell In[34], line 1
#> ----> 1 outlier_indices
#> NameError: name 'outlier_indices' is not defined
plt.plot(X['ICNSA'])
#> [<matplotlib.lines.Line2D at 0x1f8e87db6d0>]
plt.show()
plt.close()
# from the above plot is clear that there is sudden change from the index '2700' approximately,
for i in outlier_indices:
if(i>=2700):
print(i)
break
#> Traceback (most recent call last):
#> Cell In[38], line 3
#> 1 # from the above plot is clear that there is sudden change from the index '2700' approximately,
#> ----> 3 for i in outlier_indices:
#> 4 if(i>=2700):
#> 5 print(i)
#> NameError: name 'outlier_indices' is not defined
# hence considering the index after '2700' from the 'outlier_indices', which is '2750'.
# to check for the exact sudden change
plt.plot(X['ICNSA'].iloc[2750:2785])
#> [<matplotlib.lines.Line2D at 0x1f8e8350e90>]
plt.show()
plt.close()
# after a detailed check, the sudden change in the data is at the index '2775'.
sudden_change_index = 2775
# covid period start data: 1st week of March, 2020.
ax = data.iloc[sudden_change_index-5:2800].plot()
fig = ax.get_figure()
plt.show(block=False)
plt.close(fig)
# visualizing the median values within the sliding window
plt.plot(medians[sudden_change_index-1:])
#> Traceback (most recent call last):
#> Cell In[47], line 3
#> 1 # visualizing the median values within the sliding window
#> ----> 3 plt.plot(medians[sudden_change_index-1:])
#> NameError: name 'medians' is not defined
plt.show()
plt.close()
# the above graph is following a decreasing trend.
# from the index '100' graph is not decreasing much when compared to the graph before index '100'.
# hence an index after '100' is to be considered.
# Approimately index '125' is consider as the end od covid period as there is small increasing the graph (between 100 - 150).
ending_index = 125
#data.iloc[2753:(2753+150)].plot()
#data.iloc[2753:(2753+150)].plot()
#2753
data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index].plot() # covid range
#> <Axes: >
#plt.plot(data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index])
plt.show()
plt.close()
# covid start date is from 1st week of March, 2020 to 1st week of July, 2021.
data = data.reset_index()
data
#> index 0
0 1967-01-07 346000.0
1 1967-01-14 334000.0
2 1967-01-21 277000.0
3 1967-01-28 252000.0
4 1967-02-04 274000.0
... ... ...
2977 2024-01-27 263919.0
2978 2024-02-03 234729.0
2979 2024-02-10 223985.0
2980 2024-02-17 199337.0
2981 2024-02-24 193988.0
[2982 rows x 2 columns]
# Covid pandemic period
covid_start_date = pd.to_datetime("2020-03-01")
covid_end_date = pd.to_datetime("2021-07-03")
from statsmodels.tsa.statespace.structural import UnobservedComponents
# Filter data for Covid period
covid_data = data[(data['index'] >= covid_start_date) & (data['index'] <= covid_end_date)]
# Fit a local level state-space model with the Kalman filter
model = UnobservedComponents(covid_data[0], 'local level')
results = model.fit()
# Use fitted model to impute missing values
covid_data['ICNSA_imputed'] = results.filtered_state[0]
covid_data
#> index 0 ICNSA_imputed
2774 2020-03-07 199914.0 3.174537e-01
2775 2020-03-14 251875.0 1.341532e+05
2776 2020-03-21 2914268.0 1.873881e+06
2777 2020-03-28 5981838.0 4.496327e+06
2778 2020-04-04 6161268.0 5.561928e+06
... ... ... ...
2839 2021-06-05 364577.0 3.889787e+05
2840 2021-06-12 407148.0 4.006119e+05
2841 2021-06-19 396935.0 3.982577e+05
2842 2021-06-26 362899.0 3.756187e+05
2843 2021-07-03 382622.0 3.801027e+05
[70 rows x 3 columns]
# Compare the two methods visually
plt.figure(figsize=(10, 6))
#> <Figure size 1000x600 with 0 Axes>
plt.plot(covid_data['index'], covid_data[0], label='Actual Data')
#> [<matplotlib.lines.Line2D at 0x1f8e8993850>]
plt.plot(covid_data['index'], covid_data['ICNSA_imputed'], label='State-Space Imputation')
#> [<matplotlib.lines.Line2D at 0x1f8e9123390>]
plt.xlabel('Date')
#> Text(0.5, 0, 'Date')
plt.ylabel('Initial Claims (ICNSA)')
#> Text(0, 0.5, 'Initial Claims (ICNSA)')
plt.title('Comparison of imputated data during covid period')
#> Text(0.5, 1.0, 'Comparison of imputated data during covid period')
plt.legend()
#> <matplotlib.legend.Legend at 0x1f8e899d790>
plt.show()
plt.close()
y = X['ICNSA'].iloc[:sudden_change_index-1]
frames = [y, covid_data['ICNSA_imputed'], data.iloc[sudden_change_index + 69:][0]]
new_data = pd.DataFrame()
new_data['new_ICNSA'] = pd.concat(frames)
y
#> 0 346000.0
1 334000.0
2 277000.0
3 252000.0
4 274000.0
...
2769 224561.0
2770 219459.0
2771 209218.0
2772 198845.0
2773 216625.0
Name: ICNSA, Length: 2774, dtype: float64
covid_data['ICNSA_imputed']
#> 2774 3.174537e-01
2775 1.341532e+05
2776 1.873881e+06
2777 4.496327e+06
2778 5.561928e+06
...
2839 3.889787e+05
2840 4.006119e+05
2841 3.982577e+05
2842 3.756187e+05
2843 3.801027e+05
Name: ICNSA_imputed, Length: 70, dtype: float64
data.iloc[sudden_change_index + 69:][0]
#> 2844 388662.0
2845 411244.0
2846 342003.0
2847 325265.0
2848 322782.0
...
2977 263919.0
2978 234729.0
2979 223985.0
2980 199337.0
2981 193988.0
Name: 0, Length: 138, dtype: float64
from statsmodels.tsa.statespace.structural import UnobservedComponents
# Define and fit the structural time series model
model = UnobservedComponents(new_data, 'local linear trend', seasonal=52)
results = model.fit()
# Print model summary
print(results.summary())
#> Unobserved Components Results
#> =====================================================================================
#> Dep. Variable: new_ICNSA No. Observations: 2982
#> Model: local linear trend Log Likelihood -41286.538
#> + stochastic seasonal(52) AIC 82581.077
#> Date: Thu, 07 Mar 2024 BIC 82605.007
#> Time: 03:07:03 HQIC 82589.695
#> Sample: 0
#> - 2982
#> Covariance Type: opg
#> ====================================================================================
#> coef std err z P>|z| [0.025 0.975]
#> ------------------------------------------------------------------------------------
#> sigma2.irregular 1.953e+10 1.48e+10 1.322 0.186 -9.42e+09 4.85e+10
#> sigma2.level 6.765e+08 1.49e+10 0.046 0.964 -2.84e+10 2.98e+10
#> sigma2.trend 2.97e+10 4.03e+09 7.367 0.000 2.18e+10 3.76e+10
#> sigma2.seasonal 1.953e+10 5.76e+09 3.390 0.001 8.24e+09 3.08e+10
#> ===================================================================================
#> Ljung-Box (L1) (Q): 829.23 Jarque-Bera (JB): 12472704.31
#> Prob(Q): 0.00 Prob(JB): 0.00
#> Heteroskedasticity (H): 17.51 Skew: -0.63
#> Prob(H) (two-sided): 0.00 Kurtosis: 322.69
#> ===================================================================================
#>
#> Warnings:
#> [1] Covariance matrix calculated using the outer product of gradients (complex-step).
# https://fred.stlouisfed.org/series/IURNSA
# reading 2nd data
data_1 = fred.get_series('IURNSA')
data_1
#> 1971-01-02 5.2
1971-01-09 5.3
1971-01-16 5.3
1971-01-23 5.2
1971-01-30 5.2
...
2024-01-20 1.5
2024-01-27 1.4
2024-02-03 1.4
2024-02-10 1.4
2024-02-17 1.4
Length: 2773, dtype: float64
plt.plot(data_1)
#> [<matplotlib.lines.Line2D at 0x1f8e91a0290>]
plt.show()
plt.close()
plt.close()
# acf and pacf plots for ICNSA data
plot_acf(data_1,)
#> <Figure size 640x480 with 1 Axes>
plot_pacf(data_1)
#> <Figure size 640x480 with 1 Axes>
plt.show()
plt.close()
finaldata = pd.DataFrame({'ICNSA': new_data['new_ICNSA'].iloc[209:].tolist(),
'IURNSA': data_1[0].tolist()}, index=None)
finaldata
#> ICNSA IURNSA
0 500000.0 5.2
1 452000.0 5.2
2 399000.0 5.2
3 353000.0 5.2
4 375000.0 5.2
... ... ...
2768 263919.0 5.2
2769 234729.0 5.2
2770 223985.0 5.2
2771 199337.0 5.2
2772 193988.0 5.2
[2773 rows x 2 columns]
# ccf plot
ccf_result = ccf(finaldata['IURNSA'], finaldata['ICNSA'])
plt.plot(ccf_result)
#> [<matplotlib.lines.Line2D at 0x1f8ffe7f390>]
plt.title('Cross-correlation Function')
#> Text(0.5, 1.0, 'Cross-correlation Function')
plt.xlabel('Lag')
#> Text(0.5, 4.444444444444445, 'Lag')
plt.ylabel('CCF')
#> Text(4.444444444444452, 0.5, 'CCF')
plt.show()
plt.close()
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
model = sm.tsa.ARIMA( finaldata['ICNSA'], exog = finaldata['IURNSA'],
order=(1, 0, 1),
seasonal_order=(1, 1, 1, 52))
model_results = model.fit()
forecast_values = model_results.forecast(steps=2, exog=new_data['new_ICNSA'].iloc[-2:])
forecast_values.iloc[:-1]
#> 2773 214329.190072
Name: predicted_mean, dtype: float64
# Load library
library(readxl)
#> Warning: package 'readxl' was built under R version 4.3.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.3.3
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
library(lmtest)
#> Warning: package 'lmtest' was built under R version 4.3.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.3.2
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(reprex)
#> Warning: package 'reprex' was built under R version 4.3.3
library(zoo)
library(vars)
#> Warning: package 'vars' was built under R version 4.3.3
#> Loading required package: MASS
#> Warning: package 'MASS' was built under R version 4.3.2
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Warning: package 'urca' was built under R version 4.3.2
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.3.3
#> Loading required package: lattice
# Read the Excel file
wi_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Work-in-Process Inventory - 31SWI.xlsx")
fi_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Finished Good Inventory - 31SFI.xlsx")
mi_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Materials-and-Supplies Inventory – 31SMI.xlsx")
vs_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Shipments – 31SVS.xlsx")
# converting to Period to date format
wi_data$Period <- as.Date(paste0("01-", wi_data$Period), format = "%d-%b-%Y")
fi_data$Period <- as.Date(paste0("01-", fi_data$Period), format = "%d-%b-%Y")
mi_data$Period <- as.Date(paste0("01-", mi_data$Period), format = "%d-%b-%Y")
vs_data$Period <- as.Date(paste0("01-", vs_data$Period), format = "%d-%b-%Y")
# to check if the Period column is in date format
print(class(wi_data$Period))
#> [1] "Date"
plot(wi_data$Period, wi_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Work-in-Process Inventory")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
plot(fi_data$Period, fi_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Finished Good Inventory")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
plot(mi_data$Period, mi_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Materials-and-Supplies Inventory")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
plot(vs_data$Period, vs_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Value of Shipments")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
# Augmented Dickey-Fuller (ADF) test for stationarity
adf_result_wi <- adf.test(wi_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion
# Print the ADF test result
print("Work-in-Process Inventory")
#> [1] "Work-in-Process Inventory"
print(adf_result_wi)
#>
#> Augmented Dickey-Fuller Test
#>
#> data: wi_data$Value
#> Dickey-Fuller = -3.7347, Lag order = 7, p-value = 0.02247
#> alternative hypothesis: stationary
adf_result_fi <- adf.test(fi_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion
# Print the ADF test result
print("Finished Good Inventory")
#> [1] "Finished Good Inventory"
print(adf_result_fi)
#>
#> Augmented Dickey-Fuller Test
#>
#> data: fi_data$Value
#> Dickey-Fuller = -3.693, Lag order = 7, p-value = 0.02456
#> alternative hypothesis: stationary
adf_result_mi <- adf.test(mi_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion
# Print the ADF test result
print("Materials-and-Supplies Inventory")
#> [1] "Materials-and-Supplies Inventory"
print(adf_result_mi)
#>
#> Augmented Dickey-Fuller Test
#>
#> data: mi_data$Value
#> Dickey-Fuller = -2.9352, Lag order = 7, p-value = 0.1823
#> alternative hypothesis: stationary
adf_result_vs <- adf.test(vs_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion
# Print the ADF test result
print("Value of Shipments")
#> [1] "Value of Shipments"
print(adf_result_vs)
#>
#> Augmented Dickey-Fuller Test
#>
#> data: vs_data$Value
#> Dickey-Fuller = -3.4253, Lag order = 7, p-value = 0.04989
#> alternative hypothesis: stationary
# Granger causality test
granger_result <- grangertest(na.approx(fi_data$Value) ~ na.approx(wi_data$Value), order = 1)
#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion
#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion
#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion
#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion
# Print the Granger causality test result
print(granger_result)
#> Granger causality test
#>
#> Model 1: na.approx(fi_data$Value) ~ Lags(na.approx(fi_data$Value), 1:1) + Lags(na.approx(wi_data$Value), 1:1)
#> Model 2: na.approx(fi_data$Value) ~ Lags(na.approx(fi_data$Value), 1:1)
#> Res.Df Df F Pr(>F)
#> 1 382
#> 2 383 -1 18.965 1.713e-05 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Merge all the attributes by Period
final_data <- merge(wi_data, fi_data, by = "Period", all = TRUE, suffixes = c("_WI", "_FI"))
final_data <- merge(final_data, mi_data, by = "Period", all = TRUE, suffixes = c("", "_MI"))
final_data <- merge(final_data, vs_data, by = "Period", all = TRUE, suffixes = c("", "_VS"))
final_data <- na.omit(final_data)
final_data$Value_WI <- as.numeric(final_data$Value_WI)
#> Warning: NAs introduced by coercion
final_data$Value_FI <- as.numeric(final_data$Value_FI)
#> Warning: NAs introduced by coercion
final_data$Value <- as.numeric(final_data$Value)
#> Warning: NAs introduced by coercion
final_data$Value_VS <- as.numeric(final_data$Value_VS)
#> Warning: NAs introduced by coercion
final_data = final_data[1:386,]
# fitting the var(1) and var(p=2) model
ts_data<- ts(final_data, frequency = 1)
# Fit the VAR(1) model
var1 <- VAR(ts_data, p = 1, type = "const")
# Fit the VAR(2) model, p=2
var2 <- VAR(ts_data, p = 2, type = "const")
summary(var1)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: Period, Value_WI, Value_FI, Value, Value_VS
#> Deterministic variables: const
#> Sample size: 385
#> Log Likelihood: -10907.586
#> Roots of the characteristic polynomial:
#> 1 0.9649 0.9649 0.9642 0.933
#> Call:
#> VAR(y = ts_data, p = 1, type = "const")
#>
#>
#> Estimation results for equation Period:
#> =======================================
#> Period = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 1.000e+00 3.794e-05 26353.798 <2e-16 ***
#> Value_WI.l1 -3.866e-05 9.960e-05 -0.388 0.698
#> Value_FI.l1 6.318e-05 1.004e-04 0.629 0.529
#> Value.l1 -1.425e-05 4.845e-05 -0.294 0.769
#> Value_VS.l1 -7.569e-07 2.804e-05 -0.027 0.978
#> const 3.055e+01 4.278e-01 71.411 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 0.8195 on 379 degrees of freedom
#> Multiple R-Squared: 1, Adjusted R-squared: 1
#> F-statistic: 1.312e+09 on 5 and 379 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_WI:
#> =========================================
#> Value_WI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 3.933e-02 7.371e-03 5.336 1.64e-07 ***
#> Value_WI.l1 9.846e-01 1.935e-02 50.885 < 2e-16 ***
#> Value_FI.l1 -8.082e-02 1.950e-02 -4.145 4.20e-05 ***
#> Value.l1 -4.061e-02 9.412e-03 -4.315 2.04e-05 ***
#> Value_VS.l1 5.939e-02 5.448e-03 10.902 < 2e-16 ***
#> const -2.416e+02 8.310e+01 -2.907 0.00386 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 159.2 on 379 degrees of freedom
#> Multiple R-Squared: 0.9932, Adjusted R-squared: 0.9931
#> F-statistic: 1.113e+04 on 5 and 379 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_FI:
#> =========================================
#> Value_FI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 6.045e-02 7.377e-03 8.194 3.93e-15 ***
#> Value_WI.l1 8.264e-02 1.937e-02 4.267 2.50e-05 ***
#> Value_FI.l1 8.501e-01 1.952e-02 43.558 < 2e-16 ***
#> Value.l1 -6.082e-02 9.420e-03 -6.456 3.29e-10 ***
#> Value_VS.l1 6.581e-02 5.452e-03 12.070 < 2e-16 ***
#> const -6.714e+02 8.317e+01 -8.072 9.25e-15 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 159.3 on 379 degrees of freedom
#> Multiple R-Squared: 0.9967, Adjusted R-squared: 0.9967
#> F-statistic: 2.304e+04 on 5 and 379 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value:
#> ======================================
#> Value = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 3.250e-02 9.372e-03 3.468 0.000584 ***
#> Value_WI.l1 1.905e-02 2.460e-02 0.775 0.439069
#> Value_FI.l1 -2.801e-02 2.479e-02 -1.130 0.259220
#> Value.l1 8.979e-01 1.197e-02 75.032 < 2e-16 ***
#> Value_VS.l1 7.421e-02 6.926e-03 10.715 < 2e-16 ***
#> const -5.126e+02 1.057e+02 -4.852 1.79e-06 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 202.4 on 379 degrees of freedom
#> Multiple R-Squared: 0.9973, Adjusted R-squared: 0.9973
#> F-statistic: 2.799e+04 on 5 and 379 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_VS:
#> =========================================
#> Value_VS = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 0.15655 0.02135 7.334 1.37e-12 ***
#> Value_WI.l1 0.09016 0.05604 1.609 0.108473
#> Value_FI.l1 -0.30796 0.05647 -5.453 8.93e-08 ***
#> Value.l1 -0.06453 0.02726 -2.367 0.018414 *
#> Value_VS.l1 1.08425 0.01578 68.724 < 2e-16 ***
#> const -883.04504 240.66629 -3.669 0.000278 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 461.1 on 379 degrees of freedom
#> Multiple R-Squared: 0.9913, Adjusted R-squared: 0.9912
#> F-statistic: 8686 on 5 and 379 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> Period Value_WI Value_FI Value Value_VS
#> Period 0.6716 1.782 -3.952 5.106 15.43
#> Value_WI 1.7819 25343.285 4739.504 -1716.657 21069.26
#> Value_FI -3.9520 4739.504 25386.913 5211.478 16117.77
#> Value 5.1055 -1716.657 5211.478 40966.851 15823.27
#> Value_VS 15.4337 21069.259 16117.767 15823.274 212568.83
#>
#> Correlation matrix of residuals:
#> Period Value_WI Value_FI Value Value_VS
#> Period 1.00000 0.01366 -0.03027 0.03078 0.04085
#> Value_WI 0.01366 1.00000 0.18685 -0.05328 0.28706
#> Value_FI -0.03027 0.18685 1.00000 0.16160 0.21941
#> Value 0.03078 -0.05328 0.16160 1.00000 0.16956
#> Value_VS 0.04085 0.28706 0.21941 0.16956 1.00000
summary(var2)
#>
#> VAR Estimation Results:
#> =========================
#> Endogenous variables: Period, Value_WI, Value_FI, Value, Value_VS
#> Deterministic variables: const
#> Sample size: 384
#> Log Likelihood: -10750.069
#> Roots of the characteristic polynomial:
#> 1 0.9607 0.9607 0.9392 0.9392 0.4777 0.4737 0.3325 0.1318 0.05043
#> Call:
#> VAR(y = ts_data, p = 2, type = "const")
#>
#>
#> Estimation results for equation Period:
#> =======================================
#> Period = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 5.255e-01 4.542e-02 11.570 <2e-16 ***
#> Value_WI.l1 2.427e-06 2.477e-04 0.010 0.992
#> Value_FI.l1 2.435e-04 2.444e-04 0.996 0.320
#> Value.l1 -2.782e-04 1.901e-04 -1.463 0.144
#> Value_VS.l1 -3.419e-06 8.670e-05 -0.039 0.969
#> Period.l2 4.745e-01 4.542e-02 10.449 <2e-16 ***
#> Value_WI.l2 -6.169e-05 2.556e-04 -0.241 0.809
#> Value_FI.l2 -1.714e-04 2.343e-04 -0.731 0.465
#> Value.l2 2.628e-04 1.782e-04 1.475 0.141
#> Value_VS.l2 1.036e-05 9.168e-05 0.113 0.910
#> const 4.503e+01 1.447e+00 31.113 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 0.7227 on 373 degrees of freedom
#> Multiple R-Squared: 1, Adjusted R-squared: 1
#> F-statistic: 8.371e+08 on 10 and 373 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_WI:
#> =========================================
#> Value_WI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 4.13789 9.35573 0.442 0.65854
#> Value_WI.l1 1.02150 0.05103 20.017 < 2e-16 ***
#> Value_FI.l1 0.12384 0.05034 2.460 0.01435 *
#> Value.l1 0.05163 0.03916 1.319 0.18808
#> Value_VS.l1 0.12643 0.01786 7.079 7.26e-12 ***
#> Period.l2 -4.12014 9.35561 -0.440 0.65991
#> Value_WI.l2 -0.04232 0.05265 -0.804 0.42200
#> Value_FI.l2 -0.16601 0.04827 -3.439 0.00065 ***
#> Value.l2 -0.05922 0.03671 -1.613 0.10753
#> Value_VS.l2 -0.10034 0.01889 -5.312 1.86e-07 ***
#> const -164.24681 298.11885 -0.551 0.58200
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 148.9 on 373 degrees of freedom
#> Multiple R-Squared: 0.9942, Adjusted R-squared: 0.994
#> F-statistic: 6346 on 10 and 373 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_FI:
#> =========================================
#> Value_FI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 -0.31089 9.30970 -0.033 0.973378
#> Value_WI.l1 0.21341 0.05078 4.203 3.31e-05 ***
#> Value_FI.l1 1.02357 0.05009 20.433 < 2e-16 ***
#> Value.l1 0.10944 0.03896 2.809 0.005234 **
#> Value_VS.l1 0.10366 0.01777 5.832 1.18e-08 ***
#> Period.l2 0.34843 9.30958 0.037 0.970164
#> Value_WI.l2 -0.14227 0.05239 -2.715 0.006928 **
#> Value_FI.l2 -0.12870 0.04803 -2.679 0.007702 **
#> Value.l2 -0.13334 0.03653 -3.650 0.000299 ***
#> Value_VS.l2 -0.07519 0.01879 -4.001 7.62e-05 ***
#> const -414.17516 296.65208 -1.396 0.163495
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 148.1 on 373 degrees of freedom
#> Multiple R-Squared: 0.9972, Adjusted R-squared: 0.9971
#> F-statistic: 1.325e+04 on 10 and 373 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value:
#> ======================================
#> Value = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 3.16636 12.36548 0.256 0.798042
#> Value_WI.l1 0.19248 0.06745 2.854 0.004561 **
#> Value_FI.l1 0.06509 0.06654 0.978 0.328582
#> Value.l1 1.10735 0.05175 21.397 < 2e-16 ***
#> Value_VS.l1 0.01460 0.02361 0.618 0.536744
#> Period.l2 -3.14003 12.36532 -0.254 0.799683
#> Value_WI.l2 -0.18410 0.06959 -2.645 0.008505 **
#> Value_FI.l2 -0.07830 0.06380 -1.227 0.220494
#> Value.l2 -0.18609 0.04852 -3.836 0.000147 ***
#> Value_VS.l2 0.03879 0.02496 1.554 0.121015
#> const -462.32797 394.02416 -1.173 0.241404
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 196.8 on 373 degrees of freedom
#> Multiple R-Squared: 0.9975, Adjusted R-squared: 0.9974
#> F-statistic: 1.473e+04 on 10 and 373 DF, p-value: < 2.2e-16
#>
#>
#> Estimation results for equation Value_VS:
#> =========================================
#> Value_VS = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const
#>
#> Estimate Std. Error t value Pr(>|t|)
#> Period.l1 -1.10389 27.59273 -0.040 0.96811
#> Value_WI.l1 0.37714 0.15050 2.506 0.01264 *
#> Value_FI.l1 0.20463 0.14847 1.378 0.16896
#> Value.l1 -0.23008 0.11548 -1.992 0.04706 *
#> Value_VS.l1 1.26380 0.05267 23.993 < 2e-16 ***
#> Period.l2 1.19627 27.59238 0.043 0.96544
#> Value_WI.l2 -0.33497 0.15529 -2.157 0.03164 *
#> Value_FI.l2 -0.39122 0.14237 -2.748 0.00629 **
#> Value.l2 0.23909 0.10826 2.208 0.02782 *
#> Value_VS.l2 -0.24753 0.05570 -4.444 1.17e-05 ***
#> const -267.68827 879.23822 -0.304 0.76095
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#>
#> Residual standard error: 439.1 on 373 degrees of freedom
#> Multiple R-Squared: 0.9922, Adjusted R-squared: 0.992
#> F-statistic: 4764 on 10 and 373 DF, p-value: < 2.2e-16
#>
#>
#>
#> Covariance matrix of residuals:
#> Period Value_WI Value_FI Value Value_VS
#> Period 0.5222 3.474 -2.787 7.452 10.8
#> Value_WI 3.4735 22161.714 1368.620 -2929.184 13978.8
#> Value_FI -2.7875 1368.620 21944.175 3145.551 9962.8
#> Value 7.4521 -2929.184 3145.551 38714.190 15762.1
#> Value_VS 10.8032 13978.760 9962.828 15762.096 192769.2
#>
#> Correlation matrix of residuals:
#> Period Value_WI Value_FI Value Value_VS
#> Period 1.00000 0.03229 -0.02604 0.05241 0.03405
#> Value_WI 0.03229 1.00000 0.06206 -0.10000 0.21387
#> Value_FI -0.02604 0.06206 1.00000 0.10792 0.15318
#> Value 0.05241 -0.10000 0.10792 1.00000 0.18246
#> Value_VS 0.03405 0.21387 0.15318 0.18246 1.00000
# predict one month
forecast <- predict(var2, n.ahead = 1)
print(forecast)
#> $Period
#> fcst lower upper CI
#> Period.fcst 19784.18 19782.77 19785.6 1.416376
#>
#> $Value_WI
#> fcst lower upper CI
#> Value_WI.fcst 11838.31 11546.53 12130.09 291.7761
#>
#> $Value_FI
#> fcst lower upper CI
#> Value_FI.fcst 14877.72 14587.38 15168.06 290.3406
#>
#> $Value
#> fcst lower upper CI
#> Value.fcst 19241.84 18856.2 19627.48 385.641
#>
#> $Value_VS
#> fcst lower upper CI
#> Value_VS.fcst 26342.1 25481.57 27202.63 860.5317
print(causality(var2, cause = "Value_WI")) # for Work-in-Process Inventory
#> $Granger
#>
#> Granger causality H0: Value_WI do not Granger-cause Period Value_FI
#> Value Value_VS
#>
#> data: VAR object var2
#> F-Test = 4.7194, df1 = 8, df2 = 1865, p-value = 9.558e-06
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: Value_WI and Period Value_FI
#> Value Value_VS
#>
#> data: VAR object var2
#> Chi-squared = 24.635, df = 4, p-value = 5.956e-05
print(causality(var2, cause = "Value_FI")) # for Finished Good Inventory
#> $Granger
#>
#> Granger causality H0: Value_FI do not Granger-cause Period Value_WI
#> Value Value_VS
#>
#> data: VAR object var2
#> F-Test = 3.519, df1 = 8, df2 = 1865, p-value = 0.0004768
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: Value_FI and Period Value_WI
#> Value Value_VS
#>
#> data: VAR object var2
#> Chi-squared = 12.338, df = 4, p-value = 0.01501
print(causality(var2, cause = "Value")) # for Materials-and-Supplies Inventory
#> $Granger
#>
#> Granger causality H0: Value do not Granger-cause Period Value_WI
#> Value_FI Value_VS
#>
#> data: VAR object var2
#> F-Test = 4.6139, df1 = 8, df2 = 1865, p-value = 1.358e-05
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: Value and Period Value_WI
#> Value_FI Value_VS
#>
#> data: VAR object var2
#> Chi-squared = 22.96, df = 4, p-value = 0.000129
print(causality(var2, cause = "Value_VS")) # for value of shipments
#> $Granger
#>
#> Granger causality H0: Value_VS do not Granger-cause Period Value_WI
#> Value_FI Value
#>
#> data: VAR object var2
#> F-Test = 17.428, df1 = 8, df2 = 1865, p-value < 2.2e-16
#>
#>
#> $Instant
#>
#> H0: No instantaneous causality between: Value_VS and Period Value_WI
#> Value_FI Value
#>
#> data: VAR object var2
#> Chi-squared = 35.508, df = 4, p-value = 3.653e-07
data_matrix <- cbind(final_data$Value_WI,
final_data$Value_FI,
final_data$Value,
final_data$Value_VS)
# Fitting a sparse VAR model with Basic-Elastic Net (BasicEN) penalty
bigvar_model <- BigVAR.fit(data_matrix, p = 2, struct = "BasicEN", alpha = 0.5, lambda = 1)
summary(bigvar_model)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> -270.2544 -0.0799 0.0333 1.6343 0.2154 458.3565
print(bigvar_model)
#> , , 1
#>
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 22.42747 0.60195252 0.04274818 0.01844193 0.2032870 0.36650865
#> [2,] -155.55493 0.06884793 0.63349337 0.05175590 0.2102588 -0.07662389
#> [3,] -270.25436 0.02384631 0.07365336 0.66386800 0.1098656 -0.05855728
#> [4,] 458.35648 0.14535630 0.14965514 0.01697487 1.3282691 -0.22847822
#> [,7] [,8] [,9]
#> [1,] -0.08985074 -0.02628400 -0.15785515
#> [2,] 0.33991232 -0.09248895 -0.15644317
#> [3,] -0.03566235 0.23089930 -0.02817492
#> [4,] -0.16337012 0.01769926 -0.32421028
Created on 2024-04-11 with reprex v2.1.0