QB3 / sparse-ho

Fast hyperparameter settings for non-smooth estimators:
http://qb3.github.io/sparse-ho
BSD 3-Clause "New" or "Revised" License

Good initialization does not seem to help sparse-ho #79

Closed · mathurinm closed this 3 years ago

mathurinm commented 3 years ago

I find weights that work well for the Lasso and initialize sparse-ho with them. At the first iteration, the MSE is 3 times the MSE at initialization. Am I doing something wrong?

from sklearn.metrics import mean_squared_error
from celer import AdaptiveLasso, Lasso
from sklearn.model_selection import train_test_split

from libsvmdata import fetch_libsvm
from sparse_ho.optimizers import GradientDescent
from sparse_ho.ho import grad_search
from sparse_ho.utils import Monitor
from sparse_ho import ImplicitForward
from sparse_ho.criterion import HeldOutMSE
from sparse_ho.models import WeightedLasso
import numpy as np

X, y = fetch_libsvm('rcv1_train')

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42)

train_indices, val_indices = train_test_split(
    np.arange(len(y_train_val)), test_size=0.5, random_state=42)

X_train_val = X_train_val.tocsc()
X_test = X_test.tocsc()

# I compute some good weights using the adaptive lasso:
clf1 = AdaptiveLasso(fit_intercept=False, alpha=7e-05).fit(
    X_train_val, y_train_val)

# adaptive-lasso reweighting: weights_j = alpha / |coef_j|; features with a
# zero coefficient get an infinite weight and are effectively excluded
weights = clf1.alpha / np.abs(clf1.coef_)
# running Lasso with these weights gives a good MSE:
clf2 = Lasso(alpha=1, weights=weights, fit_intercept=False).fit(
    X_train_val, y_train_val)
print("MSE with weighted lasso, adalasso weights "
      f"{mean_squared_error(clf2.predict(X_test), y_test):.4f}")

# Now I run sparse-ho with these weights as init:
estimator = Lasso(fit_intercept=False, warm_start=True)
model = WeightedLasso(estimator=estimator, max_iter=1e5)
criterion = HeldOutMSE(train_indices, val_indices)
algo = ImplicitForward()
objs_test = []

# record the test MSE at each outer iteration: `mask` is the support of the
# sparse solution, `dense` the coefficients on that support
def callback(val, grad, mask, dense, log_alpha):
    objs_test.append(mean_squared_error(X_test[:, mask] @ dense, y_test))

monitor_grad = Monitor(callback=callback)
optimizer = GradientDescent(n_outer=5, tol=1e-7, p_grad0=1)
grad_search(algo, criterion, model, optimizer,
            X, y, np.log10(weights), monitor_grad)

val_sparse = np.array(monitor_grad.objs)
alpha_sparse = np.array(monitor_grad.log_alphas[-1])
mspe_sparse = objs_test

print("Sparse-HO + ACV MSPE on test data at first iteration %.5f" %
      mspe_sparse[0])
# this is way higher than the initial value
QB3 commented 3 years ago

We chose a (subjective) natural-log parametrization for the parameters, but you are passing a log10 initialization. Another problem is that you are giving X and y to grad_search; to be consistent, I would pass X_train_val and y_train_val.
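
Concretely, only the final call changes (this is exactly what the fixed script below does):

grad_search(algo, criterion, model, optimizer,
            X_train_val, y_train_val, np.log(weights), monitor_grad)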

In the long term I think we should hide this log parametrization from users and do it under the hood.
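
A minimal sketch of what that could look like (grad_search_linear is hypothetical, not part of sparse-ho's API):

import numpy as np

def grad_search_linear(algo, criterion, model, optimizer, X, y,
                       init_params, monitor):
    # hypothetical convenience wrapper: accept parameters on the linear
    # scale and apply the natural-log parametrization under the hood
    return grad_search(algo, criterion, model, optimizer,
                       X, y, np.log(init_params), monitor)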

mathurinm commented 3 years ago

Right, thank you! Agreed about the parametrization.

QB3 commented 3 years ago

I modified the script a little bit and this should work:

from sklearn.metrics import mean_squared_error
from celer import AdaptiveLasso, Lasso
from sklearn.model_selection import train_test_split

from libsvmdata import fetch_libsvm
from sparse_ho.optimizers import GradientDescent
from sparse_ho.ho import grad_search
from sparse_ho.utils import Monitor
from sparse_ho import ImplicitForward
from sparse_ho.criterion import HeldOutMSE
from sparse_ho.models import WeightedLasso
import numpy as np

X, y = fetch_libsvm('rcv1_train')

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42)

train_indices, val_indices = train_test_split(
    np.arange(len(y_train_val)), test_size=0.5, random_state=42)

X_train_val = X_train_val.tocsc()
X_test = X_test.tocsc()

# I compute some good weights using the adaptive lasso:
clf1 = AdaptiveLasso(fit_intercept=False, alpha=7e-05).fit(
    X_train_val, y_train_val)

weights = clf1.alpha / np.abs(clf1.coef_)
# running Lasso with these weights gives a good MSE:
clf2 = Lasso(alpha=1, weights=weights, fit_intercept=False).fit(
    X_train_val[train_indices, :], y_train_val[train_indices])
print(
    "MSE on test with weighted lasso, adalasso weights %.2e" %
    mean_squared_error(clf2.predict(X_test), y_test))

print(
    "MSE on validation with weighted lasso, adalasso weights %.2e" %
    mean_squared_error(clf2.predict(X_train_val[val_indices]), y_train_val[val_indices]))

# Now I run sparse-ho with these weights as init:
estimator = Lasso(fit_intercept=False, warm_start=True)
model = WeightedLasso(estimator=estimator, max_iter=1e5)
criterion = HeldOutMSE(train_indices, val_indices)
algo = ImplicitForward()
objs_test = []

def callback(val, grad, mask, dense, log_alpha):
    objs_test.append(mean_squared_error(X_test[:, mask] @ dense, y_test))

monitor_grad = Monitor(callback=callback)
optimizer = GradientDescent(n_outer=50, tol=1e-7, p_grad0=2, verbose=True)
grad_search(algo, criterion, model, optimizer,
            X_train_val, y_train_val, np.log(weights), monitor_grad)

val_sparse = np.array(monitor_grad.objs)
alpha_sparse = np.array(monitor_grad.log_alphas[-1])
mspe_sparse = objs_test

print("Sparse-HO + ACV MSPE on test data at first iteration %.5f" %
      mspe_sparse[0])
# the overfitting problem still remains
QB3 commented 3 years ago

Note that the overfitting problem still remains; this is a bit unfortunate for us.
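
One way to visualize it with quantities the script above already tracks (a sketch, assuming matplotlib is installed): the validation MSE, which grad_search optimizes, keeps decreasing while the test MSE stalls or increases.

import matplotlib.pyplot as plt

plt.plot(monitor_grad.objs, label="validation MSE (optimized)")
plt.plot(objs_test, label="test MSE")
plt.xlabel("outer iteration")
plt.ylabel("MSE")
plt.legend()
plt.show()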