The running time of this example goes from 3.355 sec down to 0.0039 sec.
from tqdm import tqdm
from time import time
from sklearn.linear_model import Ridge, LinearRegression, RidgeCV
from sklearn.model_selection import LeaveOneOut, ShuffleSplit
from scipy.linalg import pinv, svd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
def make_data(seed=0):
    np.random.seed(seed)
    # make data
    n = 300      # number of samples
    dim_x = 20   # dimensionality of X
    dim_y = 21   # dimensionality of Y
    # random covariance for X (symmetric positive semi-definite)
    Cx = np.random.randn(dim_x, dim_x)
    Cx = Cx.dot(Cx.T) / dim_x
    X = np.random.multivariate_normal(np.zeros(dim_x), Cx, n)
    N = np.random.randn(n, dim_x)      # noise added to X before mixing
    F = np.random.randn(dim_y, dim_x)  # forward (mixing) matrix
    # ground-truth selection matrix: only columns 5-9 of X influence Y
    E = np.eye(dim_x)
    E[:5] = 0
    E[10:] = 0
    Y = (X @ E + N) @ F.T
    Y += np.random.randn(*Y.shape)     # observation noise on Y
    return X, Y, E
# make data
X, Y, E = make_data()
n_samples, dim_x = X.shape
alphas = np.logspace(-3, 3, 10)
ridge = RidgeCV(alphas=alphas, fit_intercept=False)
# Slow implementation: explicit cross-validation loop
start = time()
# cv = LeaveOneOut()  # exact but much slower; ShuffleSplit is used instead
cv = ShuffleSplit(100)
X, Y, E = make_data(seed=2)
E_hats = list()
for set1, set2 in tqdm(cv.split(X, Y), total=cv.n_splits):
    E_hat = np.zeros((dim_x, dim_x))
    # for each column of X
    for i in range(dim_x):
        # decode: predict X[:, i] from Y on the first split
        G = ridge.fit(Y[set1], X[set1, i])
        X_hat = G.predict(Y[set2])
        # encode: regress the decoded X[:, i] back onto X on the second split
        E_hat[i, :] = ridge.fit(X[set2], X_hat).coef_
    E_hats.append(E_hat)
print(time()-start)
plt.fill_between(range(X.shape[1]), np.diag(np.mean(E_hats, 0)))
# Fast implementation of ridge with leave-one-out CV over multiple alphas
def ridge_cv(X, Y, alphas, independent_alphas):
    """Similar to sklearn RidgeCV, but
    (1) can optimize a different alpha for each column of Y
    (2) returns the leave-one-out predictions Y_hat
    """
    if isinstance(alphas, (float, int)):
        alphas = np.array([alphas, ], np.float64)
    n, n_x = X.shape
    n, n_y = Y.shape
    # Decompose X once; the SVD is reused for every alpha
    U, s, _ = svd(X, full_matrices=False)
    v = s ** 2
    UY = U.T @ Y
    # For each alpha, compute the dual coefficients and leave-one-out errors
    cv_duals = np.zeros((len(alphas), n, n_y))
    cv_errors = np.zeros((len(alphas), n, n_y))
    for alpha_idx, alpha in enumerate(alphas):
        # dual coefficients c = (X X^T + alpha I)^-1 Y, computed from the SVD
        w = ((v + alpha) ** -1) - alpha ** -1
        c = U @ np.diag(w) @ UY + alpha ** -1 * Y
        cv_duals[alpha_idx] = c
        # diagonal of G = (X X^T + alpha I)^-1 = U diag(w) U^T + I / alpha
        G_diag = (w * U ** 2).sum(axis=-1) + alpha ** -1
        # leave-one-out residuals: e_i = c_i / G_ii
        error = c / G_diag[:, np.newaxis]
        cv_errors[alpha_idx] = error
    if independent_alphas:
        # identify the best alpha for each column of Y independently
        best_alphas = (cv_errors ** 2).mean(axis=1).argmin(axis=0)
        duals = np.transpose([cv_duals[b, :, i]
                              for i, b in enumerate(best_alphas)])
        cv_errors = np.transpose([cv_errors[b, :, i]
                                  for i, b in enumerate(best_alphas)])
    else:
        # identify a single best alpha shared across all columns of Y
        _cv_errors = cv_errors.reshape(len(alphas), -1)
        best_alphas = (_cv_errors ** 2).mean(axis=1).argmin(axis=0)
        duals = cv_duals[best_alphas]
        cv_errors = cv_errors[best_alphas]
    # back to primal coefficients and leave-one-out predictions
    coefs = duals.T @ X
    Y_hat = Y - cv_errors
    return coefs, best_alphas, Y_hat
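As a quick sanity check (not part of the original notebook), the shared-alpha branch of ridge_cv can be compared against sklearn's RidgeCV, which runs the same efficient leave-one-out search; Xs and Ys below are just toy placeholders for this sketch.
rng = np.random.RandomState(0)
Xs = rng.randn(50, 5)
Ys = Xs @ rng.randn(5, 3) + rng.randn(50, 3)
coefs, best_alpha, _ = ridge_cv(Xs, Ys, alphas, independent_alphas=False)
sk = RidgeCV(alphas=alphas, fit_intercept=False).fit(Xs, Ys)
print(alphas[best_alpha], sk.alpha_)            # selected alphas should match
print(np.allclose(coefs, sk.coef_, atol=1e-6))  # coefficients should agree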
# Fast JRR: decode, then encode, each in a single vectorized call
start = time()
_, _, X_hat = ridge_cv(Y, X, alphas, True)        # decode every column of X from Y
E_hat, _, _ = ridge_cv(X, X_hat, alphas, False)   # encode X_hat back onto X
print(time()-start)
plt.fill_between(range(X.shape[1]), np.diag(E_hat))
The above is a super fast implementation of the JRR.
It works by breaking each of the two regressions down into its alpha-dependent and alpha-independent parts, in a way similar to the ideas explained in http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf and https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf.
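The key identity from these references is the closed-form leave-one-out residual of ridge regression: with G = (X X^T + alpha * I)^-1 and dual coefficients c = G @ y, the residual of sample i under the model trained without sample i is c[i] / G[i, i]. Below is a minimal verification sketch (not part of the original notebook; the explicit refitting loop is there only to check the identity).
alpha = 1.0
Xs, Ys, _ = make_data(seed=1)
ys = Ys[:, 0]                   # a single target, for readability
K = Xs @ Xs.T                   # linear kernel
G = pinv(K + alpha * np.eye(len(ys)))
c = G @ ys
loo_shortcut = c / np.diag(G)   # leave-one-out residuals in closed form
loo_explicit = np.array([
    ys[i] - Ridge(alpha=alpha, fit_intercept=False).fit(
        np.delete(Xs, i, axis=0), np.delete(ys, i)).predict(Xs[[i]])[0]
    for i in range(len(ys))])
print(np.allclose(loo_shortcut, loo_explicit))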