Tracing the error down, the LogisticRegression classifier's fit fails with AttributeError: 'str' object has no attribute 'decode'
Full stack trace:
F                                                                        [100%]
====================================================================== FAILURES =======================================================================
___ test_estimators_feature_name_with_random_ascii ____
self = LogisticRegressionClassifier(penalty='l2', C=1.0, n_jobs=1, multi_class='auto', solver='lbfgs')
X = column_0 column_1 column_2 column_3 column_4 column_5 column6 ... column~ column column\t column_...11629 0.264954 ... 0.262116 0.735502 0.550447 0.397151 0.758430 0.023787 0.813575
[100 rows x 100 columns]
y = 0     1
1     0
2     0
3     0
4     1
     ..
95    1
96    0
97    0
98    0
99    1
Length: 100, dtype: int64
def fit(self, X, y=None):
"""Fits component to data
Arguments:
X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]
Returns:
self
"""
import pdb; pdb.set_trace()
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
if y is not None:
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())
try:
self._component_obj.fit(X, y)
evalml/pipelines/components/component_base.py:102:
self = LogisticRegression(n_jobs=1, random_state=0)
X = array([[0.5488135 , 0.71518937, 0.60276338, ..., 0.02010755, 0.82894003, 0.00469548], [0.67781654, 0.27...99, 0.1419334 ], [0.88498232, 0.19701397, 0.56861333, ..., 0.75842952, 0.02378743, 0.81357508]])
y = array([1, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 0, 1, 2, 0, 0, 2, 1, 0, 2, 0, 1, 0, 1, 1, 2, 2, 2, 0, 1, 2,...0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 1, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1])
sample_weight = None
def fit(self, X, y, sample_weight=None):
"""
Fit the model according to the given training data.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples,)
Target vector relative to X.
sample_weight : array-like of shape (n_samples,) default=None
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight.
.. versionadded:: 0.17
*sample_weight* support to LogisticRegression.
Returns
-------
self
Fitted estimator.
Notes
-----
The SAGA solver supports both float64 and float32 bit arrays.
"""
solver = _check_solver(self.solver, self.penalty, self.dual)
if not isinstance(self.C, numbers.Number) or self.C < 0:
raise ValueError("Penalty term must be positive; got (C=%r)"
% self.C)
if self.penalty == 'elasticnet':
if (not isinstance(self.l1_ratio, numbers.Number) or
self.l1_ratio < 0 or self.l1_ratio > 1):
raise ValueError("l1_ratio must be between 0 and 1;"
" got (l1_ratio=%r)" % self.l1_ratio)
elif self.l1_ratio is not None:
warnings.warn("l1_ratio parameter is only used when penalty is "
"'elasticnet'. Got "
"(penalty={})".format(self.penalty))
if self.penalty == 'none':
if self.C != 1.0: # default values
warnings.warn(
"Setting penalty='none' will ignore the C and l1_ratio "
"parameters"
)
# Note that check for l1_ratio is done right above
C_ = np.inf
penalty = 'l2'
else:
C_ = self.C
penalty = self.penalty
if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
" got (max_iter=%r)" % self.max_iter)
if not isinstance(self.tol, numbers.Number) or self.tol < 0:
raise ValueError("Tolerance for stopping criteria must be "
"positive; got (tol=%r)" % self.tol)
if solver == 'lbfgs':
_dtype = np.float64
else:
_dtype = [np.float64, np.float32]
X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
order="C",
accept_large_sparse=solver != 'liblinear')
check_classification_targets(y)
self.classes_ = np.unique(y)
multi_class = _check_multi_class(self.multi_class, solver,
len(self.classes_))
if solver == 'liblinear':
if effective_n_jobs(self.n_jobs) != 1:
warnings.warn("'n_jobs' > 1 does not have any effect when"
" 'solver' is set to 'liblinear'. Got 'n_jobs'"
" = {}.".format(effective_n_jobs(self.n_jobs)))
self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
X, y, self.C, self.fit_intercept, self.intercept_scaling,
self.class_weight, self.penalty, self.dual, self.verbose,
self.max_iter, self.tol, self.random_state,
sample_weight=sample_weight)
self.n_iter_ = np.array([n_iter_])
return self
if solver in ['sag', 'saga']:
max_squared_sum = row_norms(X, squared=True).max()
else:
max_squared_sum = None
n_classes = len(self.classes_)
classes_ = self.classes_
if n_classes < 2:
raise ValueError("This solver needs samples of at least 2 classes"
" in the data, but the data contains only one"
" class: %r" % classes_[0])
if len(self.classes_) == 2:
n_classes = 1
classes_ = classes_[1:]
if self.warm_start:
warm_start_coef = getattr(self, 'coef_', None)
else:
warm_start_coef = None
if warm_start_coef is not None and self.fit_intercept:
warm_start_coef = np.append(warm_start_coef,
self.intercept_[:, np.newaxis],
axis=1)
self.coef_ = list()
self.intercept_ = np.zeros(n_classes)
# Hack so that we iterate only once for the multinomial case.
if multi_class == 'multinomial':
classes_ = [None]
warm_start_coef = [warm_start_coef]
if warm_start_coef is None:
warm_start_coef = [None] * n_classes
path_func = delayed(_logistic_regression_path)
# The SAG solver releases the GIL so it's more efficient to use
# threads for this solver.
if solver in ['sag', 'saga']:
prefer = 'threads'
else:
prefer = 'processes'
fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
**_joblib_parallel_args(prefer=prefer))(
path_func(X, y, pos_class=class_, Cs=[C_],
          l1_ratio=self.l1_ratio, fit_intercept=self.fit_intercept,
          tol=self.tol, verbose=self.verbose, solver=solver,
          multi_class=multi_class, max_iter=self.max_iter,
          class_weight=self.class_weight, check_input=False,
          random_state=self.random_state, coef=warm_start_coef_,
          penalty=penalty, max_squared_sum=max_squared_sum,
          sample_weight=sample_weight)
for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
../venv/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1409:
self = Parallel(n_jobs=1), iterable = <generator object LogisticRegression.fit.
def __call__(self, iterable):
if self._jobs:
raise ValueError('This Parallel instance is already running')
# A flag used to abort the dispatching of jobs in case an
# exception is found
self._aborting = False
if not self._managed_backend:
n_jobs = self._initialize_backend()
else:
n_jobs = self._effective_n_jobs()
# self._effective_n_jobs should be called in the Parallel.__call__
# thread only -- store its value in an attribute for further queries.
self._cached_effective_n_jobs = n_jobs
backend_name = self._backend.__class__.__name__
if n_jobs == 0:
raise RuntimeError("%s has no active worker." % backend_name)
self._print("Using backend %s with %d concurrent workers.",
(backend_name, n_jobs))
if hasattr(self._backend, 'start_call'):
self._backend.start_call()
iterator = iter(iterable)
pre_dispatch = self.pre_dispatch
if pre_dispatch == 'all' or n_jobs == 1:
# prevent further dispatch via multiprocessing callback thread
self._original_iterator = None
self._pre_dispatch_amount = 0
else:
self._original_iterator = iterator
if hasattr(pre_dispatch, 'endswith'):
pre_dispatch = eval(pre_dispatch)
self._pre_dispatch_amount = pre_dispatch = int(pre_dispatch)
# The main thread will consume the first pre_dispatch items and
# the remaining items will later be lazily dispatched by async
# callbacks upon task completions.
# TODO: this iterator should be batch_size * n_jobs
iterator = itertools.islice(iterator, self._pre_dispatch_amount)
self._start_time = time.time()
self.n_dispatched_batches = 0
self.n_dispatched_tasks = 0
self.n_completed_tasks = 0
# Use a caching dict for callables that are pickled with cloudpickle to
# improve performances. This cache is used only in the case of
# functions that are defined in the __main__ module, functions that are
# defined locally (inside another function) and lambda expressions.
self._pickle_cache = dict()
try:
# Only set self._iterating to True if at least a batch
# was dispatched. In particular this covers the edge
# case of Parallel used with an exhausted iterator. If
# self._original_iterator is None, then this means either
# that pre_dispatch == "all", n_jobs == 1 or that the first batch
# was very quick and its callback already dispatched all the
# remaining jobs.
self._iterating = False
if self.dispatch_one_batch(iterator):
../venv/lib/python3.7/site-packages/joblib/parallel.py:1003:
self = Parallel(n_jobs=1), iterator = <generator object LogisticRegression.fit.
def dispatch_one_batch(self, iterator):
"""Prefetch the tasks for the next batch and dispatch them.
The effective size of the batch is computed here.
If there are no more jobs to dispatch, return False, else return True.
The iterator consumption and dispatching is protected by the same
lock so calling this function should be thread safe.
"""
if self.batch_size == 'auto':
batch_size = self._backend.compute_batch_size()
else:
# Fixed batch size strategy
batch_size = self.batch_size
with self._lock:
# to ensure an even distribution of the workload between workers,
# we look ahead in the original iterators more than batch_size
# tasks - However, we keep consuming only one batch at each
# dispatch_one_batch call. The extra tasks are stored in a local
# queue, _ready_batches, that is looked-up prior to re-consuming
# tasks from the original iterator.
try:
tasks = self._ready_batches.get(block=False)
except queue.Empty:
# slice the iterator n_jobs * batchsize items at a time. If the
# slice returns less than that, then the current batchsize puts
# too much weight on a subset of workers, while other may end
# up starving. So in this case, re-scale the batch size
# accordingly to distribute evenly the last items between all
# workers.
n_jobs = self._cached_effective_n_jobs
big_batch_size = batch_size * n_jobs
islice = list(itertools.islice(iterator, big_batch_size))
if len(islice) == 0:
return False
elif (iterator is self._original_iterator
and len(islice) < big_batch_size):
# We reached the end of the original iterator (unless
# iterator is the ``pre_dispatch``-long initial slice of
# the original iterator) -- decrease the batch size to
# account for potential variance in the batches running
# time.
final_batch_size = max(1, len(islice) // (10 * n_jobs))
else:
final_batch_size = max(1, len(islice) // n_jobs)
# enqueue n_jobs batches in a local queue
for i in range(0, len(islice), final_batch_size):
tasks = BatchedCalls(islice[i:i + final_batch_size],
self._backend.get_nested_backend(),
self._pickle_cache)
self._ready_batches.put(tasks)
# finally, get one task.
tasks = self._ready_batches.get(block=False)
if len(tasks) == 0:
# No more tasks available in the iterator: tell caller to stop.
return False
else:
self._dispatch(tasks)
../venv/lib/python3.7/site-packages/joblib/parallel.py:834:
self = Parallel(n_jobs=1), batch = <joblib.parallel.BatchedCalls object at 0x147349d50>
def _dispatch(self, batch):
"""Queue the batch for computing, with or without multiprocessing
WARNING: this method is not thread-safe: it should be only called
indirectly via dispatch_one_batch.
"""
# If job.get() catches an exception, it closes the queue:
if self._aborting:
return
self.n_dispatched_tasks += len(batch)
self.n_dispatched_batches += 1
dispatch_timestamp = time.time()
cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
with self._lock:
job_idx = len(self._jobs)
job = self._backend.apply_async(batch, callback=cb)
../venv/lib/python3.7/site-packages/joblib/parallel.py:753:
self = <joblib._parallel_backends.SequentialBackend object at 0x147349a10>, func = <joblib.parallel.BatchedCalls object at 0x147349d50> callback = <joblib.parallel.BatchCompletionCallBack object at 0x147349bd0>
def apply_async(self, func, callback=None):
"""Schedule a func to be run"""
result = ImmediateResult(func)
../venv/lib/python3.7/site-packages/joblib/_parallel_backends.py:201:
self = <joblib._parallel_backends.ImmediateResult object at 0x147349f10>, batch = <joblib.parallel.BatchedCalls object at 0x147349d50>
def __init__(self, batch):
# Don't delay the application, to avoid keeping the input
# arguments in memory
self.results = batch()
../venv/lib/python3.7/site-packages/joblib/_parallel_backends.py:582:
self = <joblib.parallel.BatchedCalls object at 0x147349d50>
def __call__(self):
# Set the default nested backend to self._backend but do not set the
# change the default number of processes to -1
with parallel_backend(self._backend, n_jobs=self._n_jobs):
return [func(*args, **kwargs)
for func, args, kwargs in self.items]
../venv/lib/python3.7/site-packages/joblib/parallel.py:256:
.0 = <list_iterator object at 0x147349c50>
return [func(*args, **kwargs)
for func, args, kwargs in self.items]
../venv/lib/python3.7/site-packages/joblib/parallel.py:256:
X = array([[0.5488135 , 0.71518937, 0.60276338, ..., 0.02010755, 0.82894003, 0.00469548], [0.67781654, 0.27...99, 0.1419334 ], [0.88498232, 0.19701397, 0.56861333, ..., 0.75842952, 0.02378743, 0.81357508]])
y = array([1, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 0, 1, 2, 0, 0, 2, 1, 0, 2, 0, 1, 0, 1, 1, 2, 2, 2, 0, 1, 2,...0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 1, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1])
pos_class = None, Cs = [1.0], fit_intercept = True, max_iter = 100, tol = 0.0001, verbose = 0, solver = 'lbfgs', coef = None, class_weight = None
dual = False, penalty = 'l2', intercept_scaling = 1.0, multi_class = 'multinomial', random_state = RandomState(MT19937) at 0x107875E20
check_input = False, max_squared_sum = None
sample_weight = array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ...1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
l1_ratio = None
def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
max_iter=100, tol=1e-4, verbose=0,
solver='lbfgs', coef=None,
class_weight=None, dual=False, penalty='l2',
intercept_scaling=1., multi_class='auto',
random_state=None, check_input=True,
max_squared_sum=None, sample_weight=None,
l1_ratio=None):
"""Compute a Logistic Regression model for a list of regularization
parameters.
This is an implementation that uses the result of the previous model
to speed up computations along the set of solutions, making it faster
than sequentially calling LogisticRegression for the different parameters.
Note that there will be no speedup with liblinear solver, since it does
not handle warm-starting.
Read more in the :ref:`User Guide <logistic_regression>`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
y : array-like of shape (n_samples,) or (n_samples, n_targets)
Input data, target values.
pos_class : int, default=None
The class with respect to which we perform a one-vs-all fit.
If None, then it is assumed that the given problem is binary.
Cs : int or array-like of shape (n_cs,), default=10
List of values for the regularization parameter or integer specifying
the number of regularization parameters that should be used. In this
case, the parameters will be chosen in a logarithmic scale between
1e-4 and 1e4.
fit_intercept : bool, default=True
Whether to fit an intercept for the model. In this case the shape of
the returned array is (n_cs, n_features + 1).
max_iter : int, default=100
Maximum number of iterations for the solver.
tol : float, default=1e-4
Stopping criterion. For the newton-cg and lbfgs solvers, the iteration
will stop when ``max{|g_i | i = 1, ..., n} <= tol``
where ``g_i`` is the i-th component of the gradient.
verbose : int, default=0
For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity.
solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, \
default='lbfgs'
Numerical solver to use.
coef : array-like of shape (n_features,), default=None
Initialization value for coefficients of logistic regression.
Useless for liblinear solver.
class_weight : dict or 'balanced', default=None
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``.
Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.
dual : bool, default=False
Dual or primal formulation. Dual formulation is only implemented for
l2 penalty with liblinear solver. Prefer dual=False when
n_samples > n_features.
penalty : {'l1', 'l2', 'elasticnet'}, default='l2'
Used to specify the norm used in the penalization. The 'newton-cg',
'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is
only supported by the 'saga' solver.
intercept_scaling : float, default=1.
Useful only when the solver 'liblinear' is used
and self.fit_intercept is set to True. In this case, x becomes
[x, self.intercept_scaling],
i.e. a "synthetic" feature with constant value equal to
intercept_scaling is appended to the instance vector.
The intercept becomes ``intercept_scaling * synthetic_feature_weight``.
Note! the synthetic feature weight is subject to l1/l2 regularization
as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) intercept_scaling has to be increased.
multi_class : {'ovr', 'multinomial', 'auto'}, default='auto'
If the option chosen is 'ovr', then a binary problem is fit for each
label. For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'multinomial' is unavailable when solver='liblinear'.
'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
and otherwise selects 'multinomial'.
.. versionadded:: 0.18
Stochastic Average Gradient descent solver for 'multinomial' case.
.. versionchanged:: 0.22
Default changed from 'ovr' to 'auto' in 0.22.
random_state : int, RandomState instance, default=None
Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
data. See :term:`Glossary <random_state>` for details.
check_input : bool, default=True
If False, the input arrays X and y will not be checked.
max_squared_sum : float, default=None
Maximum squared sum of X over samples. Used only in SAG solver.
If None, it will be computed, going through all the samples.
The value should be precomputed to speed up cross validation.
sample_weight : array-like of shape(n_samples,), default=None
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight.
l1_ratio : float, default=None
The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only
used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent
to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent
to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a
combination of L1 and L2.
Returns
-------
coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)
List of coefficients for the Logistic Regression model. If
fit_intercept is set to True then the second dimension will be
n_features + 1, where the last item represents the intercept. For
``multiclass='multinomial'``, the shape is (n_classes, n_cs,
n_features) or (n_classes, n_cs, n_features + 1).
Cs : ndarray
Grid of Cs used for cross-validation.
n_iter : array of shape (n_cs,)
Actual number of iteration for each Cs.
Notes
-----
You might get slightly different results with the solver liblinear than
with the others since this uses LIBLINEAR which penalizes the intercept.
.. versionchanged:: 0.19
The "copy" parameter was removed.
"""
if isinstance(Cs, numbers.Integral):
Cs = np.logspace(-4, 4, Cs)
solver = _check_solver(solver, penalty, dual)
# Preprocessing.
if check_input:
X = check_array(X, accept_sparse='csr', dtype=np.float64,
accept_large_sparse=solver != 'liblinear')
y = check_array(y, ensure_2d=False, dtype=None)
check_consistent_length(X, y)
_, n_features = X.shape
classes = np.unique(y)
random_state = check_random_state(random_state)
multi_class = _check_multi_class(multi_class, solver, len(classes))
if pos_class is None and multi_class != 'multinomial':
if (classes.size > 2):
raise ValueError('To fit OvR, use the pos_class argument')
# np.unique(y) gives labels in sorted order.
pos_class = classes[1]
# If sample weights exist, convert them to array (support for lists)
# and check length
# Otherwise set them to 1 for all examples
sample_weight = _check_sample_weight(sample_weight, X,
dtype=X.dtype)
# If class_weights is a dict (provided by the user), the weights
# are assigned to the original labels. If it is "balanced", then
# the class_weights are assigned after masking the labels with a OvR.
le = LabelEncoder()
if isinstance(class_weight, dict) or multi_class == 'multinomial':
class_weight_ = compute_class_weight(class_weight,
classes=classes, y=y)
sample_weight *= class_weight_[le.fit_transform(y)]
# For doing a ovr, we need to mask the labels first. for the
# multinomial case this is not necessary.
if multi_class == 'ovr':
w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
mask_classes = np.array([-1, 1])
mask = (y == pos_class)
y_bin = np.ones(y.shape, dtype=X.dtype)
y_bin[~mask] = -1.
# for compute_class_weight
if class_weight == "balanced":
class_weight_ = compute_class_weight(class_weight,
classes=mask_classes,
y=y_bin)
sample_weight *= class_weight_[le.fit_transform(y_bin)]
else:
if solver not in ['sag', 'saga']:
lbin = LabelBinarizer()
Y_multi = lbin.fit_transform(y)
if Y_multi.shape[1] == 1:
Y_multi = np.hstack([1 - Y_multi, Y_multi])
else:
# SAG multinomial solver needs LabelEncoder, not LabelBinarizer
le = LabelEncoder()
Y_multi = le.fit_transform(y).astype(X.dtype, copy=False)
w0 = np.zeros((classes.size, n_features + int(fit_intercept)),
order='F', dtype=X.dtype)
if coef is not None:
# it must work both giving the bias term and not
if multi_class == 'ovr':
if coef.size not in (n_features, w0.size):
raise ValueError(
'Initialization coef is of shape %d, expected shape '
'%d or %d' % (coef.size, n_features, w0.size))
w0[:coef.size] = coef
else:
# For binary problems coef.shape[0] should be 1, otherwise it
# should be classes.size.
n_classes = classes.size
if n_classes == 2:
n_classes = 1
if (coef.shape[0] != n_classes or
coef.shape[1] not in (n_features, n_features + 1)):
raise ValueError(
'Initialization coef is of shape (%d, %d), expected '
'shape (%d, %d) or (%d, %d)' % (
coef.shape[0], coef.shape[1], classes.size,
n_features, classes.size, n_features + 1))
if n_classes == 1:
w0[0, :coef.shape[1]] = -coef
w0[1, :coef.shape[1]] = coef
else:
w0[:, :coef.shape[1]] = coef
if multi_class == 'multinomial':
# scipy.optimize.minimize and newton-cg accepts only
# ravelled parameters.
if solver in ['lbfgs', 'newton-cg']:
w0 = w0.ravel()
target = Y_multi
if solver == 'lbfgs':
def func(x, *args): return _multinomial_loss_grad(x, *args)[0:2]
elif solver == 'newton-cg':
def func(x, *args): return _multinomial_loss(x, *args)[0]
def grad(x, *args): return _multinomial_loss_grad(x, *args)[1]
hess = _multinomial_grad_hess
warm_start_sag = {'coef': w0.T}
else:
target = y_bin
if solver == 'lbfgs':
func = _logistic_loss_and_grad
elif solver == 'newton-cg':
func = _logistic_loss
def grad(x, *args): return _logistic_loss_and_grad(x, *args)[1]
hess = _logistic_grad_hess
warm_start_sag = {'coef': np.expand_dims(w0, axis=1)}
coefs = list()
n_iter = np.zeros(len(Cs), dtype=np.int32)
for i, C in enumerate(Cs):
if solver == 'lbfgs':
iprint = [-1, 50, 1, 100, 101][
np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
opt_res = optimize.minimize(
func, w0, method="L-BFGS-B", jac=True,
args=(X, target, 1. / C, sample_weight),
options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}
)
n_iter_i = _check_optimize_result(
solver, opt_res, max_iter,
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
../venv/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:764:
solver = 'lbfgs'
result = fun: 43.61597578773504 hess_inv: <303x303 LbfgsInvHessProduct with dtype=float64> jac: array([-2.69720198...2783e-01, -3.65757762e-01, 5.32847587e-02, -1.22963842e-01, -2.34838587e-01, 8.15949341e-01, 2.31130362e+00])
max_iter = 100
extra_warning_msg = 'Please also refer to the documentation for alternative solver options:\n https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression'
def _check_optimize_result(solver, result, max_iter=None,
extra_warning_msg=None):
"""Check the OptimizeResult for successful convergence
Parameters
----------
solver: str
solver name. Currently only `lbfgs` is supported.
result: OptimizeResult
result of the scipy.optimize.minimize function
max_iter: {int, None}
expected maximum number of iterations
Returns
-------
n_iter: int
number of iterations
"""
# handle both scipy and scikit-learn solver names
if solver == "lbfgs":
if result.status != 0:
warning_msg = (
"{} failed to converge (status={}):\n{}.\n\n"
"Increase the number of iterations (max_iter) "
"or scale the data as shown in:\n"
" https://scikit-learn.org/stable/modules/"
"preprocessing.html"
).format(solver, result.status, result.message.decode("latin1"))
E AttributeError: 'str' object has no attribute 'decode'
Repro (X_y_multi is a fixture from our test suite):

import string
import pandas as pd
from evalml.pipelines import LogisticRegressionClassifier

X, y = X_y_multi()
log_classifier = LogisticRegressionClassifier(random_state=0, penalty='l2', C=1.0, n_jobs=1, multi_class='auto', solver='lbfgs')
X = log_classifier.random_state.random((X.shape[0], len(string.printable)))
col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
X = pd.DataFrame(X, columns=col_names)
log_classifier.fit(X, y)
I tried

col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
for name in col_names:
    name.decode("latin1")

and got the same error. Combining this with the stack trace leads me to think that when the LogisticRegression classifier fails to converge, it tries to build a warning message, which requires calling .decode("latin1") on the solver's result message, and that call breaks on an already-decoded string.
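To make that failure mode concrete, here's a minimal sketch (the message text below is illustrative; the gist is that newer scipy hands back the optimizer message as str, where older versions returned bytes):

msg = b"ABNORMAL_TERMINATION_IN_LNSRCH"  # older scipy: message arrives as bytes
print(msg.decode("latin1"))              # bytes -> str, works

msg = "ABNORMAL_TERMINATION_IN_LNSRCH"   # scipy 1.6.0: message is already a str
msg.decode("latin1")                     # AttributeError: 'str' object has no attribute 'decode'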
@dsherry @chukarsten
I tried to dig into this a bit more and found that someone posted a similar issue: https://github.com/scikit-optimize/scikit-optimize/issues/981
Not really sure if there's anything we can do on our end to get around this, since we can't prevent LogisticRegression from failing to converge. We could catch the error, but that doesn't feel right. Perhaps it's okay to keep disallowing scipy==1.6.0 and wait for the next release. Thoughts?
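If we stick with disallowing it, the requirements change is small; a sketch of the core-requirements.txt line (any existing version bounds would stay as they are):

scipy!=1.6.0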
Wanted to follow up, so I dug into this more by seeing if I could repro it with just scikit-learn. I couldn't:
import string

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.utils import check_random_state

from evalml.utils.gen_utils import (
    _convert_to_woodwork_structure,
    _convert_woodwork_types_wrapper
)

X, y = X_y_multi()  # same test fixture as above
random_state = check_random_state(0)
evalml_params = {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1,
                 'l1_ratio': None, 'max_iter': 100,
                 'multi_class': 'auto', 'n_jobs': 1, 'penalty': 'l2', 'random_state': random_state,
                 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
log_classifier_sk = LogisticRegression(**evalml_params)
# log_classifier_sk = LogisticRegression(random_state=random_state, penalty='l2', C=1.0, n_jobs=1, multi_class='auto', solver='lbfgs')
X = random_state.random((X.shape[0], len(string.printable)))
col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
X = pd.DataFrame(X, columns=col_names)
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())
log_classifier_sk.fit(X, y)
This works fine... but using EvalML doesn't:
import string

import pandas as pd
from sklearn.utils import check_random_state

from evalml.pipelines import LogisticRegressionClassifier

X, y = X_y_multi()
log_classifier = LogisticRegressionClassifier(random_state=0, penalty='l2', C=1.0, n_jobs=1,
                                              multi_class='auto', solver='lbfgs')
X = check_random_state(0).random((X.shape[0], len(string.printable)))
col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
X = pd.DataFrame(X, columns=col_names)
log_classifier.fit(X, y)
Ugh! I tried debugging but am quite stuck. It could be of interest to note that https://github.com/scikit-learn/scikit-learn/pull/18711 fixes this issue. Does this mean that upgrading scikit-learn to >0.24.0 could resolve this?
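For reference, the gist of that fix is to only decode the solver message when it is still bytes; a rough sketch of the compatibility handling (paraphrased, the exact upstream change may differ):

# Rough sketch; see scikit-learn PR #18711 for the actual change.
message = result.message
if isinstance(message, bytes):  # scipy < 1.6.0 returned the message as bytes
    message = message.decode("latin1")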
Update: let's hold off on this for now, and see if updating scikit-learn to >0.24.0 resolves this issue.
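If the upgrade route works out, the corresponding core-requirements.txt change would be along these lines (the exact bound is a sketch, pending verification):

scikit-learn>=0.24.0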
When scipy version 1.6.0 was released, our unit tests broke. This issue tracks the scipy version restriction in core-requirements.txt.
Failed tests here: https://github.com/alteryx/evalml/pull/1628