cdt15 / lingam

Python package for causal discovery based on LiNGAM.
https://sites.google.com/view/sshimizu06/lingam
MIT License
379 stars 59 forks source link

Bootstrap Speed #130

Open firmai opened 8 months ago

firmai commented 8 months ago
# Slow path: a bootstrap with a single resample (n_sampling=1) is reported
# to take roughly 30 times longer ...
model = lingam.VARLiNGAM()
result = model.bootstrap(process_df, n_sampling=1)
# ... than the plain fit below.

model = lingam.VARLiNGAM(lags=lags)
process_df = variable_improvement(df_accounting, variable)
model.fit(process_df)

Do you know why this is the case, and how these times could be improved?

firmai commented 8 months ago

I wonder whether we could speed this up significantly just by changing a bit of the underlying code.


import numpy as np
from sklearn.utils import resample
from numba import jit
from sklearn.utils.validation import check_array

# ... assuming other necessary imports and class definitions ...

class YourModel:
    """Sketch of a model whose ``bootstrap`` uses a numba-compiled resampler.

    This excerpt relies on members defined outside it: ``fit`` and the
    attributes ``_ar_coefs``, ``_lags``, ``_criterion``, ``_residuals`` and
    ``_adjacency_matrices``.
    """
    # ... assuming other methods of your class ...

    # The kernel is a staticmethod on purpose: numba's nopython mode cannot
    # type an arbitrary Python instance, so a jitted method that receives
    # ``self`` fails to compile on its first call.
    @staticmethod
    @jit(nopython=True)
    def _resample_and_create_X(n_samples, n_features, lags, residuals, fitted_ar_coefs):
        """Build one resampled series from resampled residuals and AR matrices.

        Parameters
        ----------
        n_samples : int
            Number of rows to generate.
        n_features : int
            Number of columns to generate.
        lags : int
            Rows with index below ``lags`` are copied from ``residuals``
            unchanged.
        residuals : ndarray
            Indexed by row; ``residuals[j]`` is added to row ``j``.
        fitted_ar_coefs : ndarray
            Indexed along its first axis; ``fitted_ar_coefs[t]`` multiplies
            the row ``t + 1`` steps back.
            NOTE(review): presumably shape (lags, n_features, n_features) -
            confirm; if it holds more than ``lags`` matrices, ``j - t - 1``
            goes negative and wraps around.

        Returns
        -------
        ndarray of shape (n_samples, n_features)
        """
        resampled_X = np.zeros((n_samples, n_features))
        for j in range(n_samples):
            if j < lags:
                # Not enough history yet: seed the series with the residual.
                resampled_X[j, :] = residuals[j]
            else:
                for t in range(len(fitted_ar_coefs)):
                    resampled_X[j, :] += np.dot(fitted_ar_coefs[t], resampled_X[j - t - 1, :])
                resampled_X[j, :] += residuals[j]
        return resampled_X

    def bootstrap(self, X, n_sampling):
        """Refit the model on ``n_sampling`` residual-resampled series.

        Parameters
        ----------
        X : array-like
            Validated with ``check_array``; must be two-dimensional.
        n_sampling : int
            Number of bootstrap iterations.

        Returns
        -------
        VARBootstrapResult
            Built from the per-iteration adjacency matrices (each the
            column-wise concatenation of ``self._adjacency_matrices``) and
            the ``total_effects`` array.

        Notes
        -----
        ``total_effects`` is allocated as zeros and is never written in this
        method, so the returned total effects are all zero.
        TODO(review): fill it in (e.g. via ``estimate_total_effect2``) before
        relying on that part of the result.
        """
        X = check_array(X)
        n_samples, n_features = X.shape

        # Store initial settings
        ar_coefs = self._ar_coefs
        lags = self._lags
        criterion = self._criterion
        self._criterion = None

        try:
            self.fit(X)
            # Contiguous float64 inputs keep numba's np.dot on matching,
            # BLAS-friendly operands whatever container fit() produced.
            fitted_ar_coefs = np.ascontiguousarray(self._ar_coefs, dtype=np.float64)

            total_effects = np.zeros((n_sampling, n_features, n_features * (1 + self._lags)))
            adjacency_matrices = []

            for _ in range(n_sampling):
                sampled_residuals = np.ascontiguousarray(
                    resample(self._residuals, n_samples=n_samples), dtype=np.float64
                )
                resampled_X = self._resample_and_create_X(
                    n_samples, n_features, lags, sampled_residuals, fitted_ar_coefs
                )

                # Restore initial settings so every refit starts from the
                # same configuration as the first fit.
                self._ar_coefs = ar_coefs
                self._lags = lags

                self.fit(resampled_X)
                am = np.concatenate([*self._adjacency_matrices], axis=1)
                adjacency_matrices.append(am)
        finally:
            # Put the caller's criterion back even if a fit raises, so a
            # failed bootstrap does not leave the model silently altered.
            self._criterion = criterion

        return VARBootstrapResult(adjacency_matrices, total_effects)