kingjr / b2b

Algorithm: LOO JRR #9

Open kingjr opened 5 years ago

kingjr commented 5 years ago

Here is a super fast implementation of the JRR.

It consists of splitting each of the two ridge regressions into a one-off SVD of the design matrix and a cheap per-regularization solve, so that the leave-one-out predictions come out in closed form, in a way similar to the ideas explained in http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf and https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf

On this example, the running time drops from 3.355 sec to 0.0039 sec.
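
The speed-up relies on the classic closed-form leave-one-out identity for ridge regression; here is a minimal illustrative sketch (not part of the issue's code, toy problem and names only) checking that identity against brute-force refitting:

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
n, p, alpha = 50, 8, 1.0
X = rng.randn(n, p)
y = X @ rng.randn(p) + rng.randn(n)

# Closed form: with G = (X X' + alpha I)^-1 and dual coefficients c = G y,
# the leave-one-out error for sample i is c[i] / G[i, i].
G = np.linalg.inv(X @ X.T + alpha * np.eye(n))
c = G @ y
loo_pred_fast = y - c / np.diag(G)

# Brute force: refit the ridge n times, leaving one sample out each time.
loo_pred_slow = np.empty(n)
for i in range(n):
    keep = np.arange(n) != i
    model = Ridge(alpha=alpha, fit_intercept=False).fit(X[keep], y[keep])
    loo_pred_slow[i] = model.predict(X[i:i + 1])[0]

print(np.allclose(loo_pred_fast, loo_pred_slow))  # expected: True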

from tqdm import tqdm
from time import time
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import LeaveOneOut, ShuffleSplit
from scipy.linalg import svd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

def make_data(seed=0):
    np.random.seed(seed)

    # make data
    n = 300  # number of samples
    dim_x = 20  # dimensionality of X
    dim_y = 21  # dimensionality of Y

    Cx = np.random.randn(dim_x, dim_x)
    Cx = Cx.dot(Cx.T) / dim_x  # symmetric positive semi-definite covariance

    X = np.random.multivariate_normal(np.zeros(dim_x), Cx, n)
    N = np.random.randn(n, dim_x)  # noise passed through the same forward model as X
    F = np.random.randn(dim_y, dim_x)  # forward (mixing) matrix from X space to Y space
    E = np.eye(dim_x)  # causal selection matrix: only dimensions 5-9 of X drive Y
    E[:5] = 0
    E[10:] = 0

    Y = (X @ E + N) @ F.T
    Y += np.random.randn(*Y.shape)  # observation noise on Y
    return X, Y, E

# make data
X, Y, E = make_data()
n_samples, dim_x = X.shape

alphas = np.logspace(-3, 3, 10)  # grid of regularization strengths
ridge = RidgeCV(alphas=alphas, fit_intercept=False)

# Slow reference implementation: cross-validated decode then encode, column by column
start = time()
# cv = LeaveOneOut()  # exhaustive leave-one-out alternative
cv = ShuffleSplit(100)  # 100 random train/test splits
X, Y, E = make_data(seed=2)
E_hats = list()
for set1, set2 in tqdm(cv.split(X, Y), total=cv.n_splits):
    E_hat = np.zeros((dim_x, dim_x))

    # for each column of X
    for i in range(dim_x):
        # Decode: predict column i of X from Y (fit on set1, predict on set2)
        G = ridge.fit(Y[set1], X[set1, i])
        X_hat = G.predict(Y[set2])

        # Encode: regress the decoded column back onto X
        E_hat[i, :] = ridge.fit(X[set2], X_hat).coef_
    E_hats.append(E_hat)
print(time() - start)
# Plot the diagonal of the averaged E_hat: one value per dimension of X
plt.fill_between(range(X.shape[1]), np.diag(np.mean(E_hats, 0)))

# Fast implementation of ridge for multiple alphas
def ridge_cv(X, Y, alphas, independent_alphas):
    """
    Similar to sklearn RidgeCV but
    (1) optimize a different alpha for each column of Y
    (2) return leave-one-out Y_hat
    """
    if isinstance(alphas, (float, int)):
        alphas = np.array([alphas, ], np.float64)
    n, n_x = X.shape
    n, n_y = Y.shape
    # Decompose X once; the SVD does not depend on alpha
    U, s, _ = svd(X, full_matrices=False)
    v = s ** 2  # squared singular values
    UY = U.T @ Y

    # For each alpha, get dual coefficients and leave-one-out errors in closed form
    cv_duals = np.zeros((len(alphas), n, n_y))
    cv_errors = np.zeros((len(alphas), n, n_y))
    for alpha_idx, alpha in enumerate(alphas):
        # Solve in the dual: c = (X X' + alpha I)^-1 Y
        w = ((v + alpha) ** -1) - alpha ** -1
        c = U @ np.diag(w) @ UY + alpha ** -1 * Y
        cv_duals[alpha_idx] = c

        # Diagonal of (X X' + alpha I)^-1 = diag(U @ diag(w) @ U.T) + 1 / alpha
        G_diag = (w * U ** 2).sum(axis=-1) + alpha ** -1
        # Leave-one-out errors via the closed-form identity: e_i = c_i / G_ii
        error = c / G_diag[:, np.newaxis]
        cv_errors[alpha_idx] = error

    # identify best alpha for each column of Y independently
    if independent_alphas:
        best_alphas = (cv_errors ** 2).mean(axis=1).argmin(axis=0)
        duals = np.transpose([cv_duals[b, :, i]
                              for i, b in enumerate(best_alphas)])
        cv_errors = np.transpose([cv_errors[b, :, i]
                                  for i, b in enumerate(best_alphas)])
    else:
        _cv_errors = cv_errors.reshape(len(alphas), -1)
        best_alphas = (_cv_errors ** 2).mean(axis=1).argmin(axis=0)
        duals = cv_duals[best_alphas]
        cv_errors = cv_errors[best_alphas]

    coefs = duals.T @ X  # primal coefficients recovered from the dual ones
    Y_hat = Y - cv_errors  # leave-one-out predictions
    return coefs, best_alphas, Y_hat
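
# Quick sanity check (illustrative sketch, not from the original comment): with a single
# shared alpha, ridge_cv should agree with sklearn's RidgeCV fitted without an intercept,
# since both pick alpha by leave-one-out error and then refit on the full data.
coefs_check, best_idx, _ = ridge_cv(X, Y, alphas, independent_alphas=False)
sk = RidgeCV(alphas=alphas, fit_intercept=False).fit(X, Y)
print(np.allclose(coefs_check, sk.coef_), alphas[best_idx], sk.alpha_)  # expected: True, same alpha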

# Fast JRR: decode with leave-one-out predictions, then encode
start = time()
_, _, X_hat = ridge_cv(Y, X, alphas, True)  # decode: LOO predictions of each column of X from Y
E_hat, _, _ = ridge_cv(X, X_hat, alphas, False)  # encode: regress the decoded X back onto X
print(time() - start)
plt.fill_between(range(X.shape[1]), np.diag(E_hat))
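
# Optional check (illustrative, not in the original comment): in make_data only dimensions
# 5-9 of X actually drive Y, so the diagonal of E_hat is expected to be markedly larger on
# those dimensions than on the rest.
d = np.diag(E_hat)
causal = np.diag(E) != 0
print(d[causal].mean(), d[~causal].mean())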