jlivsey / UB-sping24-time-series


Upender Kaveti #6

Open UpenderKaveti opened 5 months ago

UpenderKaveti commented 5 months ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import boxcox
from statsmodels.tsa.api import AutoReg

#reading the data
data = pd.read_csv(r"C:\Users\upend\Downloads\ICNSA.csv")

#printing the data
print(data)
#>             DATE   ICNSA
#> 0     1967-01-07  346000
#> 1     1967-01-14  334000
#> 2     1967-01-21  277000
#> 3     1967-01-28  252000
#> 4     1967-02-04  274000
#> ...          ...     ...
#> 2973  2023-12-30  269409
#> 2974  2024-01-06  318906
#> 2975  2024-01-13  291330
#> 2976  2024-01-20  249947
#> 2977  2024-01-27  261029
#> 
#> [2978 rows x 2 columns]

import plotly.graph_objects as go

# Create figure
fig = go.Figure()

# Add trace for line plot
fig.add_trace(go.Scatter(x=data['DATE'], y=data['ICNSA'], mode='lines', name='Line'))

# Update layout
fig.update_layout(title='ICNSA', xaxis_title='Date', yaxis_title='Numbers')

# Show plot
fig.show()

log_transformed_data = np.log(data['ICNSA'])

plt.figure(figsize=(12, 6))
#> <Figure size 1200x600 with 0 Axes>
plt.subplot(2, 1, 1)
#> <Axes: >
plt.plot(data['ICNSA'], label='Original Data')
#> [<matplotlib.lines.Line2D at 0x224323431d0>]
plt.title('Original Data')
#> Text(0.5, 1.0, 'Original Data')
plt.subplot(2, 1, 2)
#> <Axes: >
plt.plot(log_transformed_data, label='Transformed Data', color='orange')
#> [<matplotlib.lines.Line2D at 0x22432748550>]
plt.title('Log Transformed Data')
#> Text(0.5, 1.0, 'Log Transformed Data')

plt.tight_layout()
plt.show()


model_log = AutoReg(log_transformed_data, lags=2).fit()

res_log = model_log.predict(start = len(log_transformed_data), end=len(log_transformed_data))

final_answer = np.exp(res_log)

print(final_answer)
#> 2978    263825.711068
#> dtype: float64
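sqrt and mean_squared_error are imported above but never used; a minimal sketch that puts them to work back-testing the AR(2) fit on a 4-week holdout (illustrative only, not part of the recorded session):

# refit on all but the last 4 weeks and score the back-transformed forecasts
train = log_transformed_data[:-4]
fit = AutoReg(train, lags=2).fit()
pred = np.exp(fit.predict(start=len(train), end=len(train) + 3))
rmse = sqrt(mean_squared_error(data['ICNSA'].iloc[-4:], pred))
print("holdout RMSE:", rmse)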
UpenderKaveti commented 4 months ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import boxcox
from statsmodels.tsa.api import AutoReg

#reading the data
data = pd.read_csv(r"C:\Users\upend\Downloads\ICNSA.csv")

#printing the data
data
#>             DATE   ICNSA
#> 0     1967-01-07  346000
#> 1     1967-01-14  334000
#> 2     1967-01-21  277000
#> 3     1967-01-28  252000
#> 4     1967-02-04  274000
#> ...          ...     ...
#> 2973  2023-12-30  269409
#> 2974  2024-01-06  318906
#> 2975  2024-01-13  291330
#> 2976  2024-01-20  249947
#> 2977  2024-01-27  261029
#> 
#> [2978 rows x 2 columns]

data.plot()
#> <Axes: >

mask = (data.index < len(data)-3)
df_train = data[mask].copy()
df_test = data[~mask].copy()

df_test
#>             DATE   ICNSA
#> 2975  2024-01-13  291330
#> 2976  2024-01-20  249947
#> 2977  2024-01-27  261029

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

acf_original = plot_acf(df_train['ICNSA'])

pacf_original = plot_pacf(df_train['ICNSA'])


from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(df_train['ICNSA'], order=(2,1,1))
model_fit = model.fit()
model_fit.summary()
#> <class 'statsmodels.iolib.summary.Summary'>
"""
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                  ICNSA   No. Observations:                 2975
Model:                 ARIMA(2, 1, 1)   Log Likelihood              -38063.993
Date:                Mon, 12 Feb 2024   AIC                          76135.986
Time:                        23:08:51   BIC                          76159.976
Sample:                             0   HQIC                         76144.619
                               - 2975                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.2666      0.007    192.006      0.000       1.254       1.279
ar.L2         -0.4108      0.003   -158.750      0.000      -0.416      -0.406
ma.L1         -0.9643      0.006   -148.807      0.000      -0.977      -0.952
sigma2      7.941e+09   1.35e-13   5.86e+22      0.000    7.94e+09    7.94e+09
===================================================================================
Ljung-Box (L1) (Q):                   6.13   Jarque-Bera (JB):          23122109.03
Prob(Q):                              0.01   Prob(JB):                         0.00
Heteroskedasticity (H):               4.96   Skew:                            15.00
Prob(H) (two-sided):                  0.00   Kurtosis:                       433.92
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.23e+37. Standard errors may be unstable.
"""

final_answer = model_fit.predict(start = len(df_train), end=len(df_train)+3)

print(final_answer) #predicted data
#> 2975    328633.016942
#> 2976    320621.049412
#> 2977    306477.839593
#> 2978    291855.658206
#> Name: predicted_mean, dtype: float64

df_test # original data
#>             DATE   ICNSA
#> 2975  2024-01-13  291330
#> 2976  2024-01-20  249947
#> 2977  2024-01-27  261029

final_answer.iloc[-1:]
#> 2978    291855.658206
#> Name: predicted_mean, dtype: float64
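Since the first three predictions line up with df_test, the already-imported sqrt and mean_squared_error can score them; a minimal sketch (not from the recorded session):

# RMSE of the overlapping forecasts against the held-out weeks
rmse = sqrt(mean_squared_error(df_test['ICNSA'], final_answer.iloc[:3]))
print("holdout RMSE:", rmse)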
UpenderKaveti commented 4 months ago

Dataset: Monthly Supply of New Houses in the United States (MSACSR) - https://fred.stlouisfed.org/series/MSACSR

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima.model import ARIMA

data = pd.read_csv(r"C:\Users\upend\Downloads\MSACSR.csv")

data
#>            DATE  MSACSR
#> 0    1963-01-01     4.7
#> 1    1963-02-01     6.6
#> 2    1963-03-01     6.4
#> 3    1963-04-01     5.3
#> 4    1963-05-01     5.1
#> ..          ...     ...
#> 727  2023-08-01     7.9
#> 728  2023-09-01     7.5
#> 729  2023-10-01     7.8
#> 730  2023-11-01     8.8
#> 731  2023-12-01     8.2
#> 
#> [732 rows x 2 columns]

data.plot()
#> <Axes: >

log_transformed_data = np.log(data['MSACSR'])

log_transformed_data.plot()
#> <Axes: >

plt.figure(figsize=(12, 6))
#> <Figure size 1200x600 with 0 Axes>
plt.subplot(2, 1, 1)
#> <Axes: >
plt.plot(data['MSACSR'], label='Original Data')
#> [<matplotlib.lines.Line2D at 0x1ce7e7fa3d0>]
plt.title('Original Data')
#> Text(0.5, 1.0, 'Original Data')
plt.subplot(2, 1, 2)
#> <Axes: >
plt.plot(log_transformed_data, label='Transformed Data', color='orange')
#> [<matplotlib.lines.Line2D at 0x1ce7e840050>]
plt.title('Log Transformed Data')
#> Text(0.5, 1.0, 'Log Transformed Data')

plt.tight_layout()
plt.show()


# ADF test: the null hypothesis is that the series has a unit root (non-stationary);
# the alternative is that the series is stationary. Reject the null when the p-value < 0.05.

adfuller(data['MSACSR'])
#> (-3.733882476547235,
#>  0.0036610363673251546,
#>  20,
#>  711,
#>  {'1%': -3.439580754053961,
#>   '5%': -2.865613606467485,
#>   '10%': -2.568939269723711},
#>  1101.062338869638)

# the p-value is less than 0.05, so the null hypothesis is rejected:
# the data is stationary

plot_acf(data['MSACSR'])
#> <Figure size 640x480 with 1 Axes>
plot_pacf(data['MSACSR'])
#> <Figure size 640x480 with 1 Axes>

# p = 1
# q = 1
# d = 0
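As a cross-check on the orders read off the ACF/PACF, a small AIC scan over low ARMA orders is cheap; a minimal sketch (illustrative helper code, not part of the recorded session):

import itertools

# fit each small (p, 0, q) on the log-scale series and keep the AIC
aics = {
    (p, 0, q): ARIMA(log_transformed_data, order=(p, 0, q)).fit().aic
    for p, q in itertools.product(range(3), repeat=2)
}
best_order = min(aics, key=aics.get)
print("lowest-AIC order:", best_order, "AIC:", aics[best_order])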

model = ARIMA(log_transformed_data, order=(1,0,1))
model_fit = model.fit()
model_fit.summary()
#> <class 'statsmodels.iolib.summary.Summary'>
"""
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                 MSACSR   No. Observations:                  732
Model:                 ARIMA(1, 0, 1)   Log Likelihood                 791.129
Date:                Mon, 12 Feb 2024   AIC                          -1574.259
Time:                        23:14:21   BIC                          -1555.876
Sample:                             0   HQIC                         -1567.168
                                - 732                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.7810      0.075     23.738      0.000       1.634       1.928
ar.L1          0.9672      0.010     94.401      0.000       0.947       0.987
ma.L1         -0.1854      0.029     -6.314      0.000      -0.243      -0.128
sigma2         0.0067      0.000     25.150      0.000       0.006       0.007
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):                73.01
Prob(Q):                              0.94   Prob(JB):                         0.00
Heteroskedasticity (H):               1.05   Skew:                             0.04
Prob(H) (two-sided):                  0.71   Kurtosis:                         4.55
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
"""

res_log = model_fit.predict(start = len(log_transformed_data), end=len(log_transformed_data))
final_answer = np.exp(res_log)

final_answer
#> 732    8.16215
#> dtype: float64
UpenderKaveti commented 4 months ago
import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import ccf
import statsmodels.api as sm

# reading the data from API

fred = Fred(api_key='7d6f571eb06bbfe443c9fa3a4ee6e742')
data = fred.get_series('ICNSA')

data.head()
#> 1967-01-07    346000.0
#> 1967-01-14    334000.0
#> 1967-01-21    277000.0
#> 1967-01-28    252000.0
#> 1967-02-04    274000.0
#> dtype: float64

# plotting ICNSA data
data.plot()
#> <Axes: >

# covid period data
data.plot(xlim=['2020-01-01','2022-09-01'],ylim=[0,7500000],figsize=(12,4))
#> <Axes: >

# reading 2nd data
data_1 = fred.get_series('CCNSA')

data_1.head()
#> 1967-01-07    1594000.0
#> 1967-01-14    1563000.0
#> 1967-01-21    1551000.0
#> 1967-01-28    1533000.0
#> 1967-02-04    1534000.0
#> dtype: float64

#plotting the 2nd data
data_1.plot()
#> <Axes: >

print("Shape of ICNSA data", data.shape)
#> Shape of ICNSA data (2980,)
print("Shape of 2nd data", data_1.shape)
#> Shape of 2nd data (2979,)

# checking for missing values
print("Count of missing values in ICNSA:", data.isna().sum())
#> Count of missing values in ICNSA: 0
print("Count of missing values in Covariate data:", data_1.isna().sum())
#> Count of missing values in Covariate data: 0

data.head()
#> 1967-01-07    346000.0
#> 1967-01-14    334000.0
#> 1967-01-21    277000.0
#> 1967-01-28    252000.0
#> 1967-02-04    274000.0
#> dtype: float64

data.tail()
#> 2024-01-13    291330.0
#> 2024-01-20    249947.0
#> 2024-01-27    263919.0
#> 2024-02-03    234729.0
#> 2024-02-10    222164.0
#> dtype: float64

data_1.head()
#> 1967-01-07    1594000.0
#> 1967-01-14    1563000.0
#> 1967-01-21    1551000.0
#> 1967-01-28    1533000.0
#> 1967-02-04    1534000.0
#> dtype: float64

data_1.tail()
#> 2024-01-06    2122548.0
#> 2024-01-13    2053298.0
#> 2024-01-20    2181429.0
#> 2024-01-27    2130008.0
#> 2024-02-03    2146550.0
#> dtype: float64

data.info()
#> <class 'pandas.core.series.Series'>
#> DatetimeIndex: 2980 entries, 1967-01-07 to 2024-02-10
#> Series name: None
#> Non-Null Count  Dtype  
#> --------------  -----  
#> 2980 non-null   float64
#> dtypes: float64(1)
#> memory usage: 46.6 KB

data_1.info()
#> <class 'pandas.core.series.Series'>
#> DatetimeIndex: 2979 entries, 1967-01-07 to 2024-02-03
#> Series name: None
#> Non-Null Count  Dtype  
#> --------------  -----  
#> 2979 non-null   float64
#> dtypes: float64(1)
#> memory usage: 46.5 KB

# as there is no column name for ICNSA data, I am adding a column name 'data'
col = ['data']
d = pd.DataFrame(data.to_numpy(),  columns=col)

# renaming 2nd dataset column name to 'data'
col_1 = ['data']
d_1 = pd.DataFrame(data_1.astype(float).to_numpy(),  columns=col_1)

d.head()
#>        data
#> 0  346000.0
#> 1  334000.0
#> 2  277000.0
#> 3  252000.0
#> 4  274000.0

d_1.head()
#>         data
#> 0  1594000.0
#> 1  1563000.0
#> 2  1551000.0
#> 3  1533000.0
#> 4  1534000.0

# Correlation Analysis between ICNSA data and 2nd data
correlation_matrix = d.corrwith(d_1)
print("Pearson correlation coefficients between TICNSA data and 2nd data:",correlation_matrix)
#> Pearson correlation coefficients between TICNSA data and 2nd data: data    0.714954
#> dtype: float64

# Pairwise correlation is computed between columns of ICNSA with columns of 2nd dataset.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corrwith.html

# A correlation of about 0.71 indicates a fairly strong positive linear relationship between ICNSA and the 2nd dataset
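Because d and d_1 were rebuilt on plain positional indices (and differ in length by one row), a date-based join of the original FRED series is a safer way to pair observations; a minimal sketch:

# align the two FRED series on their common dates before correlating
joined = pd.concat({'ICNSA': data, 'CCNSA': data_1}, axis=1).dropna()
print(joined['ICNSA'].corr(joined['CCNSA']))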

def stationarity_test(series):
   result = adfuller(series)
   print('ADF Statistic:', result[0])
   print('p-value:', result[1])
   print('Critical Values:')
   for key, value in result[4].items():
       print('\t%s: %.3f' % (key, value))

print("Stationarity Test for ICNSA data:")
#> Stationarity Test for ICNSA data:
stationarity_test(d)
#> ADF Statistic: -6.872346491681085
#> p-value: 1.5044350273818226e-09
#> Critical Values:
#>  1%: -3.433
#>  5%: -2.863
#>  10%: -2.567
print("\nStationarity Test for 2nd data:")
#> 
#> Stationarity Test for 2nd data:
stationarity_test(d_1)
#> ADF Statistic: -6.045172149046205
#> p-value: 1.3171244588684742e-07
#> Critical Values:
#>  1%: -3.433
#>  5%: -2.863
#>  10%: -2.567

# Based on the p-values, we can say that both ICNSA and the 2nd dataset are stationary.

# acf and pacf plots for ICNSA data
plot_acf(d)
#> <Figure size 640x480 with 1 Axes>
plot_pacf(d)
#> <Figure size 640x480 with 1 Axes>

# acf and pacf plots for 2nd data
plot_acf(d_1)
#> <Figure size 640x480 with 1 Axes>
plot_pacf(d_1)
#> <Figure size 640x480 with 1 Axes>

# The ACF plots of ICNSA and the 2nd dataset both show a pattern of exponential decay
# The PACF plots of ICNSA and the 2nd dataset both show a wave-like pattern

# ccf plot

ccf_result = ccf(d.iloc[:-1,:], d_1)
plt.plot(ccf_result)
#> [<matplotlib.lines.Line2D at 0x1f36b7452d0>]
plt.title('Cross-correlation Function')
#> Text(0.5, 1.0, 'Cross-correlation Function')
plt.xlabel('Lag')
#> Text(0.5, 4.444444444444445, 'Lag')
plt.ylabel('CCF')
#> Text(4.444444444444452, 0.5, 'CCF')
plt.show()
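To turn the CCF plot into a concrete lead/lag statement, it helps to recompute it on 1-D series and locate the peak; a minimal sketch:

# find the lag at which the cross-correlation is largest
ccf_vals = ccf(d['data'].iloc[:-1], d_1['data'])
lag_max = int(np.argmax(ccf_vals))
print("max cross-correlation", ccf_vals[lag_max], "at lag", lag_max)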


# Fitting the model
order = (1, 0, 1)  # (p, d, q)
model = sm.tsa.ARIMA(d.iloc[:-1,:], exog=d_1, order=order)
res = model.fit()

print(res.summary())
#>                                SARIMAX Results                                
#> ==============================================================================
#> Dep. Variable:                   data   No. Observations:                 2979
#> Model:                 ARIMA(1, 0, 1)   Log Likelihood              -38182.330
#> Date:                Thu, 22 Feb 2024   AIC                          76374.661
#> Time:                        02:35:05   BIC                          76404.657
#> Sample:                             0   HQIC                         76385.454
#>                                - 2979                                         
#> Covariance Type:                  opg                                         
#> ==============================================================================
#>                  coef    std err          z      P>|z|      [0.025      0.975]
#> ------------------------------------------------------------------------------
#> const       5.783e+04   2.99e-10   1.93e+14      0.000    5.78e+04    5.78e+04
#> data           0.0884      0.002     58.630      0.000       0.085       0.091
#> ar.L1          0.8088      0.004    216.010      0.000       0.801       0.816
#> ma.L1          0.3094      0.003    100.979      0.000       0.303       0.315
#> sigma2      8.093e+09   5.05e-13    1.6e+22      0.000    8.09e+09    8.09e+09
#> ===================================================================================
#> Ljung-Box (L1) (Q):                   1.91   Jarque-Bera (JB):          18524038.62
#> Prob(Q):                              0.17   Prob(JB):                         0.00
#> Heteroskedasticity (H):               4.76   Skew:                            13.82
#> Prob(H) (two-sided):                  0.00   Kurtosis:                       388.32
#> ===================================================================================
#> 
#> Warnings:
#> [1] Covariance matrix calculated using the outer product of gradients (complex-step).
#> [2] Covariance matrix is singular or near-singular, with condition number 1.33e+37. Standard errors may be unstable.

# one-step in-sample predictions, collected into l_data below (not used further)
l=[]
for i in range(0,2978):
  k = res.predict(start=i, end=i,exog=d_1.iloc[:i])
  l.append(k.tolist())

l_data = pd.DataFrame(l, columns=['data'])

forecast_values = res.forecast(steps=3, exog=d_1.iloc[-3:])
forecast_values
#> 2979    229064.731939
#> 2980    228667.211960
#> 2981    233487.327614
#> Name: predicted_mean, dtype: float64

# predicted value for the target week is 233487.327614
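One caveat on the forecast call above: exog=d_1.iloc[-3:] feeds the last three observed covariate values in as if they were future ones. A hedged sketch of forecasting the covariate first (using a simple AutoReg purely for illustration, not the method required by the exercise):

from statsmodels.tsa.api import AutoReg

# forecast the covariate three steps ahead, then use those as future exog
cov_fit = AutoReg(d_1['data'], lags=2).fit()
future_cov = cov_fit.predict(start=len(d_1), end=len(d_1) + 2)
forecast_with_future_exog = res.forecast(steps=3, exog=future_cov)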
UpenderKaveti commented 4 months ago
#pip install fredapi

#pip install hampel

# importing the required libraries

import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import ccf
import statsmodels.api as sm
from scipy.interpolate import UnivariateSpline
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from hampel import hampel
#> Traceback (most recent call last):
#> Cell In[14], line 1
#> ----> 1 from hampel import hampel
#> ModuleNotFoundError: No module named 'hampel'
import warnings
warnings.filterwarnings("ignore")

# reading the data from API

fred = Fred(api_key='7d6f571eb06bbfe443c9fa3a4ee6e742')
data = fred.get_series('ICNSA')

# visualizing the data
plt.plot(data)
#> [<matplotlib.lines.Line2D at 0x250c17bc090>]
plt.show()

plt.close()

# 1st 5 records in the data
data.head()
#> 1967-01-07    346000.0
#> 1967-01-14    334000.0
#> 1967-01-21    277000.0
#> 1967-01-28    252000.0
#> 1967-02-04    274000.0
#> dtype: float64

# dates in the dataset
data.index
#> DatetimeIndex(['1967-01-07', '1967-01-14', '1967-01-21', '1967-01-28',
#>                '1967-02-04', '1967-02-11', '1967-02-18', '1967-02-25',
#>                '1967-03-04', '1967-03-11',
#>                ...
#>                '2023-12-16', '2023-12-23', '2023-12-30', '2024-01-06',
#>                '2024-01-13', '2024-01-20', '2024-01-27', '2024-02-03',
#>                '2024-02-10', '2024-02-17'],
#>               dtype='datetime64[ns]', length=2981, freq=None)

# creating a dataframe named 'X'
X = pd.DataFrame()

X['date']= data.index

X['ICNSA'] = data.to_numpy()

# printing the dataframe 'X'
X
#>            date     ICNSA
#> 0    1967-01-07  346000.0
#> 1    1967-01-14  334000.0
#> 2    1967-01-21  277000.0
#> 3    1967-01-28  252000.0
#> 4    1967-02-04  274000.0
#> ...         ...       ...
#> 2976 2024-01-20  249947.0
#> 2977 2024-01-27  263919.0
#> 2978 2024-02-03  234729.0
#> 2979 2024-02-10  223985.0
#> 2980 2024-02-17  197932.0
#> 
#> [2981 rows x 2 columns]

result = hampel(data, window_size=3, n_sigma=3.0)
#> Traceback (most recent call last):
#> Cell In[28], line 1
#> ----> 1 result = hampel(data, window_size=3, n_sigma=3.0)
#> NameError: name 'hampel' is not defined

# https://pypi.org/project/hampel/
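Since the hampel package is not installed in this session, a minimal fallback sketch of the same idea: points more than n_sigma scaled MADs away from a centered rolling median are flagged and replaced by that median. window_size here means points on each side, matching the package's convention; the rolling medians (med) could be returned the same way for the plots further down.

# fallback Hampel filter built on pandas rolling median / MAD
def hampel_filter(series, window_size=3, n_sigma=3.0):
    k = 1.4826  # scales the MAD to estimate the standard deviation under normality
    roll = series.rolling(window=2 * window_size + 1, center=True)
    med = roll.median()
    mad = k * roll.apply(lambda w: np.median(np.abs(w - np.median(w))), raw=True)
    outliers = (series - med).abs() > n_sigma * mad
    return series.where(~outliers, med), np.flatnonzero(outliers.to_numpy())

filtered_data, outlier_indices = hampel_filter(data)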

filtered_data = result.filtered_data
#> Traceback (most recent call last):
#> Cell In[29], line 3
#>       1 # https://pypi.org/project/hampel/
#> ----> 3 filtered_data = result.filtered_data
#> NameError: name 'result' is not defined
outlier_indices = result.outlier_indices
#> Traceback (most recent call last):
#> Cell In[30], line 1
#> ----> 1 outlier_indices = result.outlier_indices
#> NameError: name 'result' is not defined
medians = result.medians
#> Traceback (most recent call last):
#> Cell In[31], line 1
#> ----> 1 medians = result.medians
#> NameError: name 'result' is not defined
mad_values = result.median_absolute_deviations
#> Traceback (most recent call last):
#> Cell In[32], line 1
#> ----> 1 mad_values = result.median_absolute_deviations
#> NameError: name 'result' is not defined
thresholds = result.thresholds
#> Traceback (most recent call last):
#> Cell In[33], line 1
#> ----> 1 thresholds = result.thresholds
#> NameError: name 'result' is not defined

outlier_indices
#> Traceback (most recent call last):
#> Cell In[34], line 1
#> ----> 1 outlier_indices
#> NameError: name 'outlier_indices' is not defined

plt.plot(X['ICNSA'])
#> [<matplotlib.lines.Line2D at 0x250c2cbb850>]
plt.show()

plt.close()

# from the above plot it is clear that there is a sudden change starting around index 2700

for i in outlier_indices:
  if(i>=2700):
    print(i)
    break
#> Traceback (most recent call last):
#> Cell In[38], line 3
#>       1 # from the above plot is clear that there is sudden change from the index '2700' approximately,
#> ----> 3 for i in outlier_indices:
#>       4   if(i>=2700):
#>       5     print(i)
#> NameError: name 'outlier_indices' is not defined

# hence considering the index after '2700' from the 'outlier_indices', which is '2750'.

# to check for the exact sudden change

plt.plot(X['ICNSA'].iloc[2750:2785])
#> [<matplotlib.lines.Line2D at 0x250c5d8e2d0>]
plt.show()

plt.close()

# after a detailed check, the sudden change in the data is at the index '2775'.
sudden_change_index = 2775

# covid period start data: 1st week of March, 2020.

ax = data.iloc[sudden_change_index-5:2800].plot()
fig = ax.get_figure()
plt.show(block=False)

plt.close(fig)

# visualizing the median values within the sliding window

plt.plot(medians[sudden_change_index-1:])
#> Traceback (most recent call last):
#> Cell In[47], line 3
#>       1 # visualizing the median values within the sliding window
#> ----> 3 plt.plot(medians[sudden_change_index-1:])
#> NameError: name 'medians' is not defined
plt.show()
plt.close()

# the above graph follows a decreasing trend.
# from index 100 onwards the graph decreases much less than it did before index 100,
# so an index after 100 should be considered.
# approximately index 125 is taken as the end of the covid period, since there is a small increase in the graph between indices 100 and 150.
ending_index = 125

#data.iloc[2753:(2753+150)].plot()

#data.iloc[2753:(2753+150)].plot()
#2753
data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index].plot() # covid range
#> <Axes: >

#plt.plot(data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index])
plt.show()

plt.close()

# the covid period runs from the 1st week of March 2020 to the 1st week of July 2021.

def get_spline(smoothing_param = 0.5):
  x = range(0,len(X['date'].iloc[:sudden_change_index]))
  y = X['ICNSA'].iloc[:sudden_change_index]

  # Adjust this parameter to control smoothing
  spline = UnivariateSpline(x, y, k=3, s=smoothing_param)

  # note: these x values restart at 0, so the spline is re-evaluated over the
  # start of its fitted domain rather than extrapolated past sudden_change_index
  x_interp = range(0,len(X['date'].iloc[sudden_change_index:sudden_change_index+ending_index]))
  y_interp = spline(x_interp)

  frames = [y, pd.DataFrame(y_interp), data.iloc[sudden_change_index + ending_index:]]
  new_data = pd.DataFrame()
  new_data['ICNSA'] = pd.concat(frames)
  new_data['date'] = data.index
  new_data = new_data.set_index('date')

  return new_data

collection_of_splines = []
for i in [0.25, 0.5, 0.75]:
  collection_of_splines.append(get_spline(smoothing_param = i))

def additive_model(new_data):
  model = ExponentialSmoothing(new_data, trend="add", seasonal="add")
  model_additive = model.fit()

  forecast_additive = model_additive.forecast(1)

  plt.plot(new_data, label="Data")
  plt.plot(model_additive.fittedvalues, label="Additive Fitted Values")
  plt.legend()
  plt.show()
  plt.close()

  return forecast_additive

collection_of_additive = []
for i,j in zip(collection_of_splines,[0.25, 0.5, 0.75]):
  print("Additive model with lambda = ", j)
  collection_of_additive.append(additive_model(new_data = i))
#> Additive model with lambda =  0.25
#> Additive model with lambda =  0.5
#> Additive model with lambda =  0.75


def multiplicative_model(new_data):
  model = ExponentialSmoothing(new_data, trend="add", seasonal="mul")
  model_multiplicative = model.fit()

  forecast_multiplicative = model_multiplicative.forecast(1)

  plt.plot(new_data, label="Original Data")
  plt.plot(model_multiplicative.fittedvalues, label="Multiplicative Fitted Values")
  plt.legend()
  plt.show()
  plt.close()

  return forecast_multiplicative

collection_of_multiplicative = []
for i,j in zip(collection_of_splines,[0.25, 0.5, 0.75]):
  print("Multiplicative model with lambda = ", j)
  collection_of_multiplicative.append(multiplicative_model(new_data = i))
#> Multiplicative model with lambda =  0.25
#> Multiplicative model with lambda =  0.5
#> Multiplicative model with lambda =  0.75


# Additive Holt-Winters
for i,j in zip(collection_of_additive,[0.25, 0.5, 0.75]):
  print("Prediction with Additive Holt-Winters with lambda {} = {}".format(j, i.to_string(index=False)))
#> Prediction with Additive Holt-Winters with lambda 0.25 = 238179.045324
#> Freq: W-SAT
#> Prediction with Additive Holt-Winters with lambda 0.5 = 238179.04623
#> Freq: W-SAT
#> Prediction with Additive Holt-Winters with lambda 0.75 = 238179.046926
#> Freq: W-SAT

# Multiplicative Holt-Winters
for i,j in zip(collection_of_multiplicative, [0.25, 0.5, 0.75]):
  print("Prediction with Multiplicative Holt-Winters with lambda {} = {}".format(j, i.to_string(index=False)))
#> Prediction with Multiplicative Holt-Winters with lambda 0.25 = 213876.611124
#> Freq: W-SAT
#> Prediction with Multiplicative Holt-Winters with lambda 0.5 = 213876.612128
#> Freq: W-SAT
#> Prediction with Multiplicative Holt-Winters with lambda 0.75 = 213876.612898
#> Freq: W-SAT

Since the forecasted values do not change significantly across smoothing parameters, lambda = 0.5 is used.

Prediction with Additive Holt-Winters with lambda 0.5 = 238179.04623

Prediction with Multiplicative Holt-Winters with lambda 0.5 = 213876.612128
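Rather than choosing between the two seasonal specifications by eyeballing forecasts, they can be compared on in-sample criteria; a minimal sketch that refits both on the lambda = 0.5 spline series, assuming the same call signature as the helpers above:

# refit both specifications and compare information criteria / SSE
spline_05 = collection_of_splines[1]
add_fit = ExponentialSmoothing(spline_05, trend="add", seasonal="add").fit()
mul_fit = ExponentialSmoothing(spline_05, trend="add", seasonal="mul").fit()
print("additive       AIC:", add_fit.aic, " SSE:", add_fit.sse)
print("multiplicative AIC:", mul_fit.aic, " SSE:", mul_fit.sse)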

UpenderKaveti commented 4 months ago
#pip install fredapi

#pip install hampel

# importing the required libraries

import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import ccf
import statsmodels.api as sm
from scipy.interpolate import UnivariateSpline
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from hampel import hampel
#> Traceback (most recent call last):
#> Cell In[14], line 1
#> ----> 1 from hampel import hampel
#> ModuleNotFoundError: No module named 'hampel'
import warnings
warnings.filterwarnings("ignore")

# reading the data from API

fred = Fred(api_key='7d6f571eb06bbfe443c9fa3a4ee6e742')
data = fred.get_series('ICNSA')

# visualizing the data
plt.plot(data)
#> [<matplotlib.lines.Line2D at 0x1f8e8310050>]
plt.show()

plt.close()

# 1st 5 records in the data
data.head()
#> 1967-01-07    346000.0
#> 1967-01-14    334000.0
#> 1967-01-21    277000.0
#> 1967-01-28    252000.0
#> 1967-02-04    274000.0
#> dtype: float64

# dates in the dataset
data.index
#> DatetimeIndex(['1967-01-07', '1967-01-14', '1967-01-21', '1967-01-28',
#>                '1967-02-04', '1967-02-11', '1967-02-18', '1967-02-25',
#>                '1967-03-04', '1967-03-11',
#>                ...
#>                '2023-12-23', '2023-12-30', '2024-01-06', '2024-01-13',
#>                '2024-01-20', '2024-01-27', '2024-02-03', '2024-02-10',
#>                '2024-02-17', '2024-02-24'],
#>               dtype='datetime64[ns]', length=2982, freq=None)

# creating a dataframe named 'X'
X = pd.DataFrame()

X['date']= data.index

X['ICNSA'] = data.to_numpy()

# printing the dataframe 'X'
X
#>            date     ICNSA
#> 0    1967-01-07  346000.0
#> 1    1967-01-14  334000.0
#> 2    1967-01-21  277000.0
#> 3    1967-01-28  252000.0
#> 4    1967-02-04  274000.0
#> ...         ...       ...
#> 2977 2024-01-27  263919.0
#> 2978 2024-02-03  234729.0
#> 2979 2024-02-10  223985.0
#> 2980 2024-02-17  199337.0
#> 2981 2024-02-24  193988.0
#> 
#> [2982 rows x 2 columns]

result = hampel(data, window_size=3, n_sigma=3.0)
#> Traceback (most recent call last):
#> Cell In[28], line 1
#> ----> 1 result = hampel(data, window_size=3, n_sigma=3.0)
#> NameError: name 'hampel' is not defined

# https://pypi.org/project/hampel/

filtered_data = result.filtered_data
#> Traceback (most recent call last):
#> Cell In[29], line 3
#>       1 # https://pypi.org/project/hampel/
#> ----> 3 filtered_data = result.filtered_data
#> NameError: name 'result' is not defined
outlier_indices = result.outlier_indices
#> Traceback (most recent call last):
#> Cell In[30], line 1
#> ----> 1 outlier_indices = result.outlier_indices
#> NameError: name 'result' is not defined
medians = result.medians
#> Traceback (most recent call last):
#> Cell In[31], line 1
#> ----> 1 medians = result.medians
#> NameError: name 'result' is not defined
mad_values = result.median_absolute_deviations
#> Traceback (most recent call last):
#> Cell In[32], line 1
#> ----> 1 mad_values = result.median_absolute_deviations
#> NameError: name 'result' is not defined
thresholds = result.thresholds
#> Traceback (most recent call last):
#> Cell In[33], line 1
#> ----> 1 thresholds = result.thresholds
#> NameError: name 'result' is not defined

outlier_indices
#> Traceback (most recent call last):
#> Cell In[34], line 1
#> ----> 1 outlier_indices
#> NameError: name 'outlier_indices' is not defined

plt.plot(X['ICNSA'])
#> [<matplotlib.lines.Line2D at 0x1f8e87db6d0>]
plt.show()

plt.close()

# from the above plot it is clear that there is a sudden change starting around index 2700

for i in outlier_indices:
  if(i>=2700):
    print(i)
    break
#> Traceback (most recent call last):
#> Cell In[38], line 3
#>       1 # from the above plot is clear that there is sudden change from the index '2700' approximately,
#> ----> 3 for i in outlier_indices:
#>       4   if(i>=2700):
#>       5     print(i)
#> NameError: name 'outlier_indices' is not defined

# hence considering the index after '2700' from the 'outlier_indices', which is '2750'.

# to check for the exact sudden change

plt.plot(X['ICNSA'].iloc[2750:2785])
#> [<matplotlib.lines.Line2D at 0x1f8e8350e90>]
plt.show()

plt.close()

# after a detailed check, the sudden change in the data is at the index '2775'.
sudden_change_index = 2775

# covid period start data: 1st week of March, 2020.

ax = data.iloc[sudden_change_index-5:2800].plot()
fig = ax.get_figure()
plt.show(block=False)

plt.close(fig)

# visualizing the median values within the sliding window

plt.plot(medians[sudden_change_index-1:])
#> Traceback (most recent call last):
#> Cell In[47], line 3
#>       1 # visualizing the median values within the sliding window
#> ----> 3 plt.plot(medians[sudden_change_index-1:])
#> NameError: name 'medians' is not defined
plt.show()
plt.close()

# the above graph follows a decreasing trend.
# from index 100 onwards the graph decreases much less than it did before index 100,
# so an index after 100 should be considered.
# approximately index 125 is taken as the end of the covid period, since there is a small increase in the graph between indices 100 and 150.
ending_index = 125

#data.iloc[2753:(2753+150)].plot()

#data.iloc[2753:(2753+150)].plot()
#2753
data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index].plot() # covid range
#> <Axes: >

#plt.plot(data.iloc[sudden_change_index-1:(sudden_change_index-1)+ending_index])
plt.show()

plt.close()

# the covid period runs from the 1st week of March 2020 to the 1st week of July 2021.

data = data.reset_index()

data
#>           index         0
#> 0    1967-01-07  346000.0
#> 1    1967-01-14  334000.0
#> 2    1967-01-21  277000.0
#> 3    1967-01-28  252000.0
#> 4    1967-02-04  274000.0
#> ...         ...       ...
#> 2977 2024-01-27  263919.0
#> 2978 2024-02-03  234729.0
#> 2979 2024-02-10  223985.0
#> 2980 2024-02-17  199337.0
#> 2981 2024-02-24  193988.0
#> 
#> [2982 rows x 2 columns]

# Covid pandemic period
covid_start_date = pd.to_datetime("2020-03-01")
covid_end_date = pd.to_datetime("2021-07-03")

from statsmodels.tsa.statespace.structural import UnobservedComponents

# Filter data for Covid period
covid_data = data[(data['index'] >= covid_start_date) & (data['index'] <= covid_end_date)]

# Fit a local level state-space model with the Kalman filter
model = UnobservedComponents(covid_data[0], 'local level')
results = model.fit()

# Use the fitted model's filtered state to replace the covid-period observations
covid_data['ICNSA_imputed'] = results.filtered_state[0]

covid_data
#>           index          0  ICNSA_imputed
#> 2774 2020-03-07   199914.0   3.174537e-01
#> 2775 2020-03-14   251875.0   1.341532e+05
#> 2776 2020-03-21  2914268.0   1.873881e+06
#> 2777 2020-03-28  5981838.0   4.496327e+06
#> 2778 2020-04-04  6161268.0   5.561928e+06
#> ...         ...        ...            ...
#> 2839 2021-06-05   364577.0   3.889787e+05
#> 2840 2021-06-12   407148.0   4.006119e+05
#> 2841 2021-06-19   396935.0   3.982577e+05
#> 2842 2021-06-26   362899.0   3.756187e+05
#> 2843 2021-07-03   382622.0   3.801027e+05
#> 
#> [70 rows x 3 columns]
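One hedged note on the imputation above: filtered_state at time t conditions only on observations up to t, which is why the first imputed value (about 0.32) reflects the diffuse initialization rather than the data. The smoothed state conditions on the full sample and avoids that startup artifact:

# smoothed state uses all observations, so the first value is not degenerate
covid_data['ICNSA_smoothed'] = results.smoothed_state[0]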

# Compare the two methods visually
plt.figure(figsize=(10, 6))
#> <Figure size 1000x600 with 0 Axes>
plt.plot(covid_data['index'], covid_data[0], label='Actual Data')
#> [<matplotlib.lines.Line2D at 0x1f8e8993850>]
plt.plot(covid_data['index'], covid_data['ICNSA_imputed'], label='State-Space Imputation')
#> [<matplotlib.lines.Line2D at 0x1f8e9123390>]

plt.xlabel('Date')
#> Text(0.5, 0, 'Date')
plt.ylabel('Initial Claims (ICNSA)')
#> Text(0, 0.5, 'Initial Claims (ICNSA)')
plt.title('Comparison of imputed data during the covid period')
#> Text(0.5, 1.0, 'Comparison of imputed data during the covid period')
plt.legend()
#> <matplotlib.legend.Legend at 0x1f8e899d790>
plt.show()

plt.close()

y = X['ICNSA'].iloc[:sudden_change_index-1]

frames = [y, covid_data['ICNSA_imputed'], data.iloc[sudden_change_index + 69:][0]]
new_data = pd.DataFrame()
new_data['new_ICNSA'] = pd.concat(frames)

y
#> 0       346000.0
#> 1       334000.0
#> 2       277000.0
#> 3       252000.0
#> 4       274000.0
#>           ...
#> 2769    224561.0
#> 2770    219459.0
#> 2771    209218.0
#> 2772    198845.0
#> 2773    216625.0
#> Name: ICNSA, Length: 2774, dtype: float64

covid_data['ICNSA_imputed']
#> 2774    3.174537e-01
#> 2775    1.341532e+05
#> 2776    1.873881e+06
#> 2777    4.496327e+06
#> 2778    5.561928e+06
#>             ...
#> 2839    3.889787e+05
#> 2840    4.006119e+05
#> 2841    3.982577e+05
#> 2842    3.756187e+05
#> 2843    3.801027e+05
#> Name: ICNSA_imputed, Length: 70, dtype: float64

data.iloc[sudden_change_index + 69:][0]
#> 2844    388662.0
#> 2845    411244.0
#> 2846    342003.0
#> 2847    325265.0
#> 2848    322782.0
#>           ...
#> 2977    263919.0
#> 2978    234729.0
#> 2979    223985.0
#> 2980    199337.0
#> 2981    193988.0
#> Name: 0, Length: 138, dtype: float64

from statsmodels.tsa.statespace.structural import UnobservedComponents

# Define and fit the structural time series model
model = UnobservedComponents(new_data, 'local linear trend', seasonal=52)
results = model.fit()

# Print model summary
print(results.summary())
#>                             Unobserved Components Results                            
#> =====================================================================================
#> Dep. Variable:                     new_ICNSA   No. Observations:                 2982
#> Model:                    local linear trend   Log Likelihood              -41286.538
#>                    + stochastic seasonal(52)   AIC                          82581.077
#> Date:                       Thu, 07 Mar 2024   BIC                          82605.007
#> Time:                               03:07:03   HQIC                         82589.695
#> Sample:                                    0                                         
#>                                       - 2982                                         
#> Covariance Type:                         opg                                         
#> ====================================================================================
#>                        coef    std err          z      P>|z|      [0.025      0.975]
#> ------------------------------------------------------------------------------------
#> sigma2.irregular  1.953e+10   1.48e+10      1.322      0.186   -9.42e+09    4.85e+10
#> sigma2.level      6.765e+08   1.49e+10      0.046      0.964   -2.84e+10    2.98e+10
#> sigma2.trend       2.97e+10   4.03e+09      7.367      0.000    2.18e+10    3.76e+10
#> sigma2.seasonal   1.953e+10   5.76e+09      3.390      0.001    8.24e+09    3.08e+10
#> ===================================================================================
#> Ljung-Box (L1) (Q):                 829.23   Jarque-Bera (JB):          12472704.31
#> Prob(Q):                              0.00   Prob(JB):                         0.00
#> Heteroskedasticity (H):              17.51   Skew:                            -0.63
#> Prob(H) (two-sided):                  0.00   Kurtosis:                       322.69
#> ===================================================================================
#> 
#> Warnings:
#> [1] Covariance matrix calculated using the outer product of gradients (complex-step).

# https://fred.stlouisfed.org/series/IURNSA

# reading 2nd data
data_1 = fred.get_series('IURNSA')

data_1
#> 1971-01-02    5.2
#> 1971-01-09    5.3
#> 1971-01-16    5.3
#> 1971-01-23    5.2
#> 1971-01-30    5.2
#>              ...
#> 2024-01-20    1.5
#> 2024-01-27    1.4
#> 2024-02-03    1.4
#> 2024-02-10    1.4
#> 2024-02-17    1.4
#> Length: 2773, dtype: float64

plt.plot(data_1)
#> [<matplotlib.lines.Line2D at 0x1f8e91a0290>]
plt.show()

plt.close()

plt.close()

# acf and pacf plots for IURNSA data
plot_acf(data_1)
#> <Figure size 640x480 with 1 Axes>
plot_pacf(data_1)
#> <Figure size 640x480 with 1 Axes>
plt.show()

plt.close()

# note: data_1[0] is a scalar (the first IURNSA value, 5.2) that pandas broadcasts
# to every row, which is why IURNSA is constant below; data_1.tolist() was likely intended
finaldata = pd.DataFrame({'ICNSA': new_data['new_ICNSA'].iloc[209:].tolist(),
                   'IURNSA': data_1[0].tolist()}, index=None)

finaldata
#>          ICNSA  IURNSA
#> 0     500000.0     5.2
#> 1     452000.0     5.2
#> 2     399000.0     5.2
#> 3     353000.0     5.2
#> 4     375000.0     5.2
#> ...        ...     ...
#> 2768  263919.0     5.2
#> 2769  234729.0     5.2
#> 2770  223985.0     5.2
#> 2771  199337.0     5.2
#> 2772  193988.0     5.2
#> 
#> [2773 rows x 2 columns]
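Given the broadcast issue noted above, a date-aligned construction is the safer route; a minimal sketch using the raw FRED series, kept under a separate name so the recorded run below is unchanged:

# join ICNSA and IURNSA on their common dates so each week pairs correctly
aligned = pd.concat({'ICNSA': fred.get_series('ICNSA'),
                     'IURNSA': fred.get_series('IURNSA')}, axis=1).dropna()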

# ccf plot

ccf_result = ccf(finaldata['IURNSA'], finaldata['ICNSA'])
plt.plot(ccf_result)
#> [<matplotlib.lines.Line2D at 0x1f8ffe7f390>]
plt.title('Cross-correlation Function')
#> Text(0.5, 1.0, 'Cross-correlation Function')
plt.xlabel('Lag')
#> Text(0.5, 4.444444444444445, 'Lag')
plt.ylabel('CCF')
#> Text(4.444444444444452, 0.5, 'CCF')
plt.show()

plt.close()

from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

model = sm.tsa.ARIMA( finaldata['ICNSA'], exog = finaldata['IURNSA'],
                        order=(1, 0, 1),
                        seasonal_order=(1, 1, 1, 52))

model_results = model.fit()

# note: the exog passed here is the last two new_ICNSA values, not future IURNSA values
forecast_values = model_results.forecast(steps=2, exog=new_data['new_ICNSA'].iloc[-2:])

forecast_values.iloc[:-1]
#> 2773    214329.190072
#> Name: predicted_mean, dtype: float64
UpenderKaveti commented 2 months ago
# Load library
library(readxl)
#> Warning: package 'readxl' was built under R version 4.3.3
library(tseries)
#> Warning: package 'tseries' was built under R version 4.3.3
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(lmtest)
#> Warning: package 'lmtest' was built under R version 4.3.3
#> Loading required package: zoo
#> Warning: package 'zoo' was built under R version 4.3.2
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(reprex)
#> Warning: package 'reprex' was built under R version 4.3.3
library(zoo)
library(vars)
#> Warning: package 'vars' was built under R version 4.3.3
#> Loading required package: MASS
#> Warning: package 'MASS' was built under R version 4.3.2
#> Loading required package: strucchange
#> Warning: package 'strucchange' was built under R version 4.3.3
#> Loading required package: sandwich
#> Warning: package 'sandwich' was built under R version 4.3.3
#> Loading required package: urca
#> Warning: package 'urca' was built under R version 4.3.2
library(BigVAR)
#> Warning: package 'BigVAR' was built under R version 4.3.3
#> Loading required package: lattice

# Read the Excel file
wi_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Work-in-Process Inventory - 31SWI.xlsx")
fi_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Finished Good Inventory - 31SFI.xlsx")
mi_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Materials-and-Supplies Inventory – 31SMI.xlsx")
vs_data <- read_excel("C:\\Users\\upend\\Documents\\Time Series\\A4\\Shipments – 31SVS.xlsx")

# converting the Period column to date format
wi_data$Period <- as.Date(paste0("01-", wi_data$Period), format = "%d-%b-%Y")
fi_data$Period <- as.Date(paste0("01-", fi_data$Period), format = "%d-%b-%Y")
mi_data$Period <- as.Date(paste0("01-", mi_data$Period), format = "%d-%b-%Y")
vs_data$Period <- as.Date(paste0("01-", vs_data$Period), format = "%d-%b-%Y")

# to check if the Period column is in date format
print(class(wi_data$Period))
#> [1] "Date"

plot(wi_data$Period, wi_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Work-in-Process Inventory")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


plot(fi_data$Period, fi_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Finished Good Inventory")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


plot(mi_data$Period, mi_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Materials-and-Supplies Inventory")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


plot(vs_data$Period, vs_data$Value, type = "l", xlab = "Period", ylab = "Values", main = "Value of Shipments")
#> Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion


# Augmented Dickey-Fuller (ADF) test for stationarity
adf_result_wi <- adf.test(wi_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion

# Print the ADF test result
print("Work-in-Process Inventory")
#> [1] "Work-in-Process Inventory"
print(adf_result_wi)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  wi_data$Value
#> Dickey-Fuller = -3.7347, Lag order = 7, p-value = 0.02247
#> alternative hypothesis: stationary

adf_result_fi <- adf.test(fi_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion

# Print the ADF test result
print("Finished Good Inventory")
#> [1] "Finished Good Inventory"
print(adf_result_fi)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  fi_data$Value
#> Dickey-Fuller = -3.693, Lag order = 7, p-value = 0.02456
#> alternative hypothesis: stationary

adf_result_mi <- adf.test(mi_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion

# Print the ADF test result
print("Materials-and-Supplies Inventory")
#> [1] "Materials-and-Supplies Inventory"
print(adf_result_mi)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  mi_data$Value
#> Dickey-Fuller = -2.9352, Lag order = 7, p-value = 0.1823
#> alternative hypothesis: stationary

adf_result_vs <- adf.test(vs_data$Value)
#> Warning in as.vector(x, mode = "double"): NAs introduced by coercion

# Print the ADF test result
print("Value of Shipments")
#> [1] "Value of Shipments"
print(adf_result_vs)
#> 
#>  Augmented Dickey-Fuller Test
#> 
#> data:  vs_data$Value
#> Dickey-Fuller = -3.4253, Lag order = 7, p-value = 0.04989
#> alternative hypothesis: stationary

# Granger causality test
granger_result <- grangertest(na.approx(fi_data$Value) ~ na.approx(wi_data$Value), order = 1)
#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion
#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion

#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion

#> Warning in xy.coords(x, y, setLab = FALSE): NAs introduced by coercion

# Print the Granger causality test result
print(granger_result)
#> Granger causality test
#> 
#> Model 1: na.approx(fi_data$Value) ~ Lags(na.approx(fi_data$Value), 1:1) + Lags(na.approx(wi_data$Value), 1:1)
#> Model 2: na.approx(fi_data$Value) ~ Lags(na.approx(fi_data$Value), 1:1)
#>   Res.Df Df      F    Pr(>F)    
#> 1    382                        
#> 2    383 -1 18.965 1.713e-05 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Merge all the attributes by Period
final_data <- merge(wi_data, fi_data, by = "Period", all = TRUE, suffixes = c("_WI", "_FI"))
final_data <- merge(final_data, mi_data, by = "Period", all = TRUE, suffixes = c("", "_MI"))
final_data <- merge(final_data, vs_data, by = "Period", all = TRUE, suffixes = c("", "_VS"))
final_data <- na.omit(final_data)

final_data$Value_WI <- as.numeric(final_data$Value_WI)
#> Warning: NAs introduced by coercion
final_data$Value_FI <- as.numeric(final_data$Value_FI)
#> Warning: NAs introduced by coercion
final_data$Value <- as.numeric(final_data$Value)
#> Warning: NAs introduced by coercion
final_data$Value_VS <- as.numeric(final_data$Value_VS)
#> Warning: NAs introduced by coercion

final_data = final_data[1:386,]

# fitting the var(1) and var(p=2) model

ts_data<- ts(final_data, frequency = 1)

# Fit the VAR(1) model
var1 <- VAR(ts_data, p = 1, type = "const")

# Fit the VAR(2) model, p=2
var2 <- VAR(ts_data, p = 2, type = "const")

summary(var1)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Period, Value_WI, Value_FI, Value, Value_VS 
#> Deterministic variables: const 
#> Sample size: 385 
#> Log Likelihood: -10907.586 
#> Roots of the characteristic polynomial:
#>     1 0.9649 0.9649 0.9642 0.933
#> Call:
#> VAR(y = ts_data, p = 1, type = "const")
#> 
#> 
#> Estimation results for equation Period: 
#> ======================================= 
#> Period = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const 
#> 
#>               Estimate Std. Error   t value Pr(>|t|)    
#> Period.l1    1.000e+00  3.794e-05 26353.798   <2e-16 ***
#> Value_WI.l1 -3.866e-05  9.960e-05    -0.388    0.698    
#> Value_FI.l1  6.318e-05  1.004e-04     0.629    0.529    
#> Value.l1    -1.425e-05  4.845e-05    -0.294    0.769    
#> Value_VS.l1 -7.569e-07  2.804e-05    -0.027    0.978    
#> const        3.055e+01  4.278e-01    71.411   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 0.8195 on 379 degrees of freedom
#> Multiple R-Squared:     1,   Adjusted R-squared:     1 
#> F-statistic: 1.312e+09 on 5 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_WI: 
#> ========================================= 
#> Value_WI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1    3.933e-02  7.371e-03   5.336 1.64e-07 ***
#> Value_WI.l1  9.846e-01  1.935e-02  50.885  < 2e-16 ***
#> Value_FI.l1 -8.082e-02  1.950e-02  -4.145 4.20e-05 ***
#> Value.l1    -4.061e-02  9.412e-03  -4.315 2.04e-05 ***
#> Value_VS.l1  5.939e-02  5.448e-03  10.902  < 2e-16 ***
#> const       -2.416e+02  8.310e+01  -2.907  0.00386 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 159.2 on 379 degrees of freedom
#> Multiple R-Squared: 0.9932,  Adjusted R-squared: 0.9931 
#> F-statistic: 1.113e+04 on 5 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_FI: 
#> ========================================= 
#> Value_FI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1    6.045e-02  7.377e-03   8.194 3.93e-15 ***
#> Value_WI.l1  8.264e-02  1.937e-02   4.267 2.50e-05 ***
#> Value_FI.l1  8.501e-01  1.952e-02  43.558  < 2e-16 ***
#> Value.l1    -6.082e-02  9.420e-03  -6.456 3.29e-10 ***
#> Value_VS.l1  6.581e-02  5.452e-03  12.070  < 2e-16 ***
#> const       -6.714e+02  8.317e+01  -8.072 9.25e-15 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 159.3 on 379 degrees of freedom
#> Multiple R-Squared: 0.9967,  Adjusted R-squared: 0.9967 
#> F-statistic: 2.304e+04 on 5 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value: 
#> ====================================== 
#> Value = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1    3.250e-02  9.372e-03   3.468 0.000584 ***
#> Value_WI.l1  1.905e-02  2.460e-02   0.775 0.439069    
#> Value_FI.l1 -2.801e-02  2.479e-02  -1.130 0.259220    
#> Value.l1     8.979e-01  1.197e-02  75.032  < 2e-16 ***
#> Value_VS.l1  7.421e-02  6.926e-03  10.715  < 2e-16 ***
#> const       -5.126e+02  1.057e+02  -4.852 1.79e-06 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 202.4 on 379 degrees of freedom
#> Multiple R-Squared: 0.9973,  Adjusted R-squared: 0.9973 
#> F-statistic: 2.799e+04 on 5 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_VS: 
#> ========================================= 
#> Value_VS = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1      0.15655    0.02135   7.334 1.37e-12 ***
#> Value_WI.l1    0.09016    0.05604   1.609 0.108473    
#> Value_FI.l1   -0.30796    0.05647  -5.453 8.93e-08 ***
#> Value.l1      -0.06453    0.02726  -2.367 0.018414 *  
#> Value_VS.l1    1.08425    0.01578  68.724  < 2e-16 ***
#> const       -883.04504  240.66629  -3.669 0.000278 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 461.1 on 379 degrees of freedom
#> Multiple R-Squared: 0.9913,  Adjusted R-squared: 0.9912 
#> F-statistic:  8686 on 5 and 379 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>           Period  Value_WI  Value_FI     Value  Value_VS
#> Period    0.6716     1.782    -3.952     5.106     15.43
#> Value_WI  1.7819 25343.285  4739.504 -1716.657  21069.26
#> Value_FI -3.9520  4739.504 25386.913  5211.478  16117.77
#> Value     5.1055 -1716.657  5211.478 40966.851  15823.27
#> Value_VS 15.4337 21069.259 16117.767 15823.274 212568.83
#> 
#> Correlation matrix of residuals:
#>            Period Value_WI Value_FI    Value Value_VS
#> Period    1.00000  0.01366 -0.03027  0.03078  0.04085
#> Value_WI  0.01366  1.00000  0.18685 -0.05328  0.28706
#> Value_FI -0.03027  0.18685  1.00000  0.16160  0.21941
#> Value     0.03078 -0.05328  0.16160  1.00000  0.16956
#> Value_VS  0.04085  0.28706  0.21941  0.16956  1.00000

summary(var2)
#> 
#> VAR Estimation Results:
#> ========================= 
#> Endogenous variables: Period, Value_WI, Value_FI, Value, Value_VS 
#> Deterministic variables: const 
#> Sample size: 384 
#> Log Likelihood: -10750.069 
#> Roots of the characteristic polynomial:
#>     1 0.9607 0.9607 0.9392 0.9392 0.4777 0.4737 0.3325 0.1318 0.05043
#> Call:
#> VAR(y = ts_data, p = 2, type = "const")
#> 
#> 
#> Estimation results for equation Period: 
#> ======================================= 
#> Period = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1    5.255e-01  4.542e-02  11.570   <2e-16 ***
#> Value_WI.l1  2.427e-06  2.477e-04   0.010    0.992    
#> Value_FI.l1  2.435e-04  2.444e-04   0.996    0.320    
#> Value.l1    -2.782e-04  1.901e-04  -1.463    0.144    
#> Value_VS.l1 -3.419e-06  8.670e-05  -0.039    0.969    
#> Period.l2    4.745e-01  4.542e-02  10.449   <2e-16 ***
#> Value_WI.l2 -6.169e-05  2.556e-04  -0.241    0.809    
#> Value_FI.l2 -1.714e-04  2.343e-04  -0.731    0.465    
#> Value.l2     2.628e-04  1.782e-04   1.475    0.141    
#> Value_VS.l2  1.036e-05  9.168e-05   0.113    0.910    
#> const        4.503e+01  1.447e+00  31.113   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 0.7227 on 373 degrees of freedom
#> Multiple R-Squared:     1,   Adjusted R-squared:     1 
#> F-statistic: 8.371e+08 on 10 and 373 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_WI: 
#> ========================================= 
#> Value_WI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1      4.13789    9.35573   0.442  0.65854    
#> Value_WI.l1    1.02150    0.05103  20.017  < 2e-16 ***
#> Value_FI.l1    0.12384    0.05034   2.460  0.01435 *  
#> Value.l1       0.05163    0.03916   1.319  0.18808    
#> Value_VS.l1    0.12643    0.01786   7.079 7.26e-12 ***
#> Period.l2     -4.12014    9.35561  -0.440  0.65991    
#> Value_WI.l2   -0.04232    0.05265  -0.804  0.42200    
#> Value_FI.l2   -0.16601    0.04827  -3.439  0.00065 ***
#> Value.l2      -0.05922    0.03671  -1.613  0.10753    
#> Value_VS.l2   -0.10034    0.01889  -5.312 1.86e-07 ***
#> const       -164.24681  298.11885  -0.551  0.58200    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 148.9 on 373 degrees of freedom
#> Multiple R-Squared: 0.9942,  Adjusted R-squared: 0.994 
#> F-statistic:  6346 on 10 and 373 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_FI: 
#> ========================================= 
#> Value_FI = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1     -0.31089    9.30970  -0.033 0.973378    
#> Value_WI.l1    0.21341    0.05078   4.203 3.31e-05 ***
#> Value_FI.l1    1.02357    0.05009  20.433  < 2e-16 ***
#> Value.l1       0.10944    0.03896   2.809 0.005234 ** 
#> Value_VS.l1    0.10366    0.01777   5.832 1.18e-08 ***
#> Period.l2      0.34843    9.30958   0.037 0.970164    
#> Value_WI.l2   -0.14227    0.05239  -2.715 0.006928 ** 
#> Value_FI.l2   -0.12870    0.04803  -2.679 0.007702 ** 
#> Value.l2      -0.13334    0.03653  -3.650 0.000299 ***
#> Value_VS.l2   -0.07519    0.01879  -4.001 7.62e-05 ***
#> const       -414.17516  296.65208  -1.396 0.163495    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 148.1 on 373 degrees of freedom
#> Multiple R-Squared: 0.9972,  Adjusted R-squared: 0.9971 
#> F-statistic: 1.325e+04 on 10 and 373 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value: 
#> ====================================== 
#> Value = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1      3.16636   12.36548   0.256 0.798042    
#> Value_WI.l1    0.19248    0.06745   2.854 0.004561 ** 
#> Value_FI.l1    0.06509    0.06654   0.978 0.328582    
#> Value.l1       1.10735    0.05175  21.397  < 2e-16 ***
#> Value_VS.l1    0.01460    0.02361   0.618 0.536744    
#> Period.l2     -3.14003   12.36532  -0.254 0.799683    
#> Value_WI.l2   -0.18410    0.06959  -2.645 0.008505 ** 
#> Value_FI.l2   -0.07830    0.06380  -1.227 0.220494    
#> Value.l2      -0.18609    0.04852  -3.836 0.000147 ***
#> Value_VS.l2    0.03879    0.02496   1.554 0.121015    
#> const       -462.32797  394.02416  -1.173 0.241404    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 196.8 on 373 degrees of freedom
#> Multiple R-Squared: 0.9975,  Adjusted R-squared: 0.9974 
#> F-statistic: 1.473e+04 on 10 and 373 DF,  p-value: < 2.2e-16 
#> 
#> 
#> Estimation results for equation Value_VS: 
#> ========================================= 
#> Value_VS = Period.l1 + Value_WI.l1 + Value_FI.l1 + Value.l1 + Value_VS.l1 + Period.l2 + Value_WI.l2 + Value_FI.l2 + Value.l2 + Value_VS.l2 + const 
#> 
#>               Estimate Std. Error t value Pr(>|t|)    
#> Period.l1     -1.10389   27.59273  -0.040  0.96811    
#> Value_WI.l1    0.37714    0.15050   2.506  0.01264 *  
#> Value_FI.l1    0.20463    0.14847   1.378  0.16896    
#> Value.l1      -0.23008    0.11548  -1.992  0.04706 *  
#> Value_VS.l1    1.26380    0.05267  23.993  < 2e-16 ***
#> Period.l2      1.19627   27.59238   0.043  0.96544    
#> Value_WI.l2   -0.33497    0.15529  -2.157  0.03164 *  
#> Value_FI.l2   -0.39122    0.14237  -2.748  0.00629 ** 
#> Value.l2       0.23909    0.10826   2.208  0.02782 *  
#> Value_VS.l2   -0.24753    0.05570  -4.444 1.17e-05 ***
#> const       -267.68827  879.23822  -0.304  0.76095    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> 
#> Residual standard error: 439.1 on 373 degrees of freedom
#> Multiple R-Squared: 0.9922,  Adjusted R-squared: 0.992 
#> F-statistic:  4764 on 10 and 373 DF,  p-value: < 2.2e-16 
#> 
#> 
#> 
#> Covariance matrix of residuals:
#>           Period  Value_WI  Value_FI     Value Value_VS
#> Period    0.5222     3.474    -2.787     7.452     10.8
#> Value_WI  3.4735 22161.714  1368.620 -2929.184  13978.8
#> Value_FI -2.7875  1368.620 21944.175  3145.551   9962.8
#> Value     7.4521 -2929.184  3145.551 38714.190  15762.1
#> Value_VS 10.8032 13978.760  9962.828 15762.096 192769.2
#> 
#> Correlation matrix of residuals:
#>            Period Value_WI Value_FI    Value Value_VS
#> Period    1.00000  0.03229 -0.02604  0.05241  0.03405
#> Value_WI  0.03229  1.00000  0.06206 -0.10000  0.21387
#> Value_FI -0.02604  0.06206  1.00000  0.10792  0.15318
#> Value     0.05241 -0.10000  0.10792  1.00000  0.18246
#> Value_VS  0.03405  0.21387  0.15318  0.18246  1.00000

# predict one month
forecast <- predict(var2, n.ahead = 1)
print(forecast)
#> $Period
#>                 fcst    lower   upper       CI
#> Period.fcst 19784.18 19782.77 19785.6 1.416376
#> 
#> $Value_WI
#>                   fcst    lower    upper       CI
#> Value_WI.fcst 11838.31 11546.53 12130.09 291.7761
#> 
#> $Value_FI
#>                   fcst    lower    upper       CI
#> Value_FI.fcst 14877.72 14587.38 15168.06 290.3406
#> 
#> $Value
#>                fcst   lower    upper      CI
#> Value.fcst 19241.84 18856.2 19627.48 385.641
#> 
#> $Value_VS
#>                  fcst    lower    upper       CI
#> Value_VS.fcst 26342.1 25481.57 27202.63 860.5317

print(causality(var2, cause = "Value_WI")) # for Work-in-Process Inventory  
#> $Granger
#> 
#>  Granger causality H0: Value_WI do not Granger-cause Period Value_FI
#>  Value Value_VS
#> 
#> data:  VAR object var2
#> F-Test = 4.7194, df1 = 8, df2 = 1865, p-value = 9.558e-06
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_WI and Period Value_FI
#>  Value Value_VS
#> 
#> data:  VAR object var2
#> Chi-squared = 24.635, df = 4, p-value = 5.956e-05

print(causality(var2, cause = "Value_FI")) # for Finished Good Inventory 
#> $Granger
#> 
#>  Granger causality H0: Value_FI do not Granger-cause Period Value_WI
#>  Value Value_VS
#> 
#> data:  VAR object var2
#> F-Test = 3.519, df1 = 8, df2 = 1865, p-value = 0.0004768
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_FI and Period Value_WI
#>  Value Value_VS
#> 
#> data:  VAR object var2
#> Chi-squared = 12.338, df = 4, p-value = 0.01501

print(causality(var2, cause = "Value")) # for Materials-and-Supplies Inventory
#> $Granger
#> 
#>  Granger causality H0: Value do not Granger-cause Period Value_WI
#>  Value_FI Value_VS
#> 
#> data:  VAR object var2
#> F-Test = 4.6139, df1 = 8, df2 = 1865, p-value = 1.358e-05
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value and Period Value_WI
#>  Value_FI Value_VS
#> 
#> data:  VAR object var2
#> Chi-squared = 22.96, df = 4, p-value = 0.000129

print(causality(var2, cause = "Value_VS")) # for value of shipments
#> $Granger
#> 
#>  Granger causality H0: Value_VS do not Granger-cause Period Value_WI
#>  Value_FI Value
#> 
#> data:  VAR object var2
#> F-Test = 17.428, df1 = 8, df2 = 1865, p-value < 2.2e-16
#> 
#> 
#> $Instant
#> 
#>  H0: No instantaneous causality between: Value_VS and Period Value_WI
#>  Value_FI Value
#> 
#> data:  VAR object var2
#> Chi-squared = 35.508, df = 4, p-value = 3.653e-07

data_matrix <- cbind(final_data$Value_WI,
                     final_data$Value_FI,
                     final_data$Value,
                     final_data$Value_VS)

# Fitting a sparse VAR model with Basic-Elastic Net (BasicEN) penalty
bigvar_model <- BigVAR.fit(data_matrix, p = 2, struct = "BasicEN", alpha = 0.5, lambda = 1)

summary(bigvar_model)
#>      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#> -270.2544   -0.0799    0.0333    1.6343    0.2154  458.3565

print(bigvar_model)
#> , , 1
#> 
#>            [,1]       [,2]       [,3]       [,4]      [,5]        [,6]
#> [1,]   22.42747 0.60195252 0.04274818 0.01844193 0.2032870  0.36650865
#> [2,] -155.55493 0.06884793 0.63349337 0.05175590 0.2102588 -0.07662389
#> [3,] -270.25436 0.02384631 0.07365336 0.66386800 0.1098656 -0.05855728
#> [4,]  458.35648 0.14535630 0.14965514 0.01697487 1.3282691 -0.22847822
#>             [,7]        [,8]        [,9]
#> [1,] -0.08985074 -0.02628400 -0.15785515
#> [2,]  0.33991232 -0.09248895 -0.15644317
#> [3,] -0.03566235  0.23089930 -0.02817492
#> [4,] -0.16337012  0.01769926 -0.32421028

Created on 2024-04-11 with reprex v2.1.0