alteryx / evalml

EvalML is an AutoML library written in Python.
https://evalml.alteryx.com
BSD 3-Clause "New" or "Revised" License

Integrate `scipy` version 1.6.0 #1630

Closed angela97lin closed 3 years ago

angela97lin commented 3 years ago

When scipy version 1.6.0 was released, our unit tests broke. This issue tracks the following:

Failed tests here: https://github.com/alteryx/evalml/pull/1628

angela97lin commented 3 years ago

Tracing the error down, LogisticRegression classifier's fit fails with *** AttributeError: 'str' object has no attribute 'decode'

Full stack trace:

F [100%]

====================================================================== FAILURES =======================================================================
____________________________________________ test_estimators_feature_name_with_random_ascii ____________________________________________

self = LogisticRegressionClassifier(penalty='l2', C=1.0, n_jobs=1, multi_class='auto', solver='lbfgs') X = column_0 column_1 column_2 column_3 column_4 column_5 column6 ... column~ column column\t column_...11629 0.264954 ... 0.262116 0.735502 0.550447 0.397151 0.758430 0.023787 0.813575

[100 rows x 100 columns] y = 0 1 1 0 2 0 3 0 4 1 .. 95 1 96 0 97 0 98 0 99 1 Length: 100, dtype: int64

def fit(self, X, y=None):
    """Fits component to data

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray, optional): The target training data of length [n_samples]

    Returns:
        self
    """
    import pdb; pdb.set_trace()
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    if y is not None:
        y = _convert_to_woodwork_structure(y)
        y = _convert_woodwork_types_wrapper(y.to_series())
    try:
      self._component_obj.fit(X, y)

evalml/pipelines/components/component_base.py:102:


self = LogisticRegression(n_jobs=1, random_state=0) X = array([[0.5488135 , 0.71518937, 0.60276338, ..., 0.02010755, 0.82894003, 0.00469548], [0.67781654, 0.27...99, 0.1419334 ], [0.88498232, 0.19701397, 0.56861333, ..., 0.75842952, 0.02378743, 0.81357508]]) y = array([1, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 0, 1, 2, 0, 0, 2, 1, 0, 2, 0, 1, 0, 1, 1, 2, 2, 2, 0, 1, 2,...0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 1, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1]) sample_weight = None

def fit(self, X, y, sample_weight=None):
    """
    Fit the model according to the given training data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like of shape (n_samples,)
        Target vector relative to X.

    sample_weight : array-like of shape (n_samples,) default=None
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

        .. versionadded:: 0.17
           *sample_weight* support to LogisticRegression.

    Returns
    -------
    self
        Fitted estimator.

    Notes
    -----
    The SAGA solver supports both float64 and float32 bit arrays.
    """
    solver = _check_solver(self.solver, self.penalty, self.dual)

    if not isinstance(self.C, numbers.Number) or self.C < 0:
        raise ValueError("Penalty term must be positive; got (C=%r)"
                         % self.C)
    if self.penalty == 'elasticnet':
        if (not isinstance(self.l1_ratio, numbers.Number) or
                self.l1_ratio < 0 or self.l1_ratio > 1):
            raise ValueError("l1_ratio must be between 0 and 1;"
                             " got (l1_ratio=%r)" % self.l1_ratio)
    elif self.l1_ratio is not None:
        warnings.warn("l1_ratio parameter is only used when penalty is "
                      "'elasticnet'. Got "
                      "(penalty={})".format(self.penalty))
    if self.penalty == 'none':
        if self.C != 1.0:  # default values
            warnings.warn(
                "Setting penalty='none' will ignore the C and l1_ratio "
                "parameters"
            )
            # Note that check for l1_ratio is done right above
        C_ = np.inf
        penalty = 'l2'
    else:
        C_ = self.C
        penalty = self.penalty
    if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
        raise ValueError("Maximum number of iteration must be positive;"
                         " got (max_iter=%r)" % self.max_iter)
    if not isinstance(self.tol, numbers.Number) or self.tol < 0:
        raise ValueError("Tolerance for stopping criteria must be "
                         "positive; got (tol=%r)" % self.tol)

    if solver == 'lbfgs':
        _dtype = np.float64
    else:
        _dtype = [np.float64, np.float32]

    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
                               order="C",
                               accept_large_sparse=solver != 'liblinear')
    check_classification_targets(y)
    self.classes_ = np.unique(y)

    multi_class = _check_multi_class(self.multi_class, solver,
                                     len(self.classes_))

    if solver == 'liblinear':
        if effective_n_jobs(self.n_jobs) != 1:
            warnings.warn("'n_jobs' > 1 does not have any effect when"
                          " 'solver' is set to 'liblinear'. Got 'n_jobs'"
                          " = {}.".format(effective_n_jobs(self.n_jobs)))
        self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
            X, y, self.C, self.fit_intercept, self.intercept_scaling,
            self.class_weight, self.penalty, self.dual, self.verbose,
            self.max_iter, self.tol, self.random_state,
            sample_weight=sample_weight)
        self.n_iter_ = np.array([n_iter_])
        return self

    if solver in ['sag', 'saga']:
        max_squared_sum = row_norms(X, squared=True).max()
    else:
        max_squared_sum = None

    n_classes = len(self.classes_)
    classes_ = self.classes_
    if n_classes < 2:
        raise ValueError("This solver needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % classes_[0])

    if len(self.classes_) == 2:
        n_classes = 1
        classes_ = classes_[1:]

    if self.warm_start:
        warm_start_coef = getattr(self, 'coef_', None)
    else:
        warm_start_coef = None
    if warm_start_coef is not None and self.fit_intercept:
        warm_start_coef = np.append(warm_start_coef,
                                    self.intercept_[:, np.newaxis],
                                    axis=1)

    self.coef_ = list()
    self.intercept_ = np.zeros(n_classes)

    # Hack so that we iterate only once for the multinomial case.
    if multi_class == 'multinomial':
        classes_ = [None]
        warm_start_coef = [warm_start_coef]
    if warm_start_coef is None:
        warm_start_coef = [None] * n_classes

    path_func = delayed(_logistic_regression_path)

    # The SAG solver releases the GIL so it's more efficient to use
    # threads for this solver.
    if solver in ['sag', 'saga']:
        prefer = 'threads'
    else:
        prefer = 'processes'
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                           **_joblib_parallel_args(prefer=prefer))(
      path_func(X, y, pos_class=class_, Cs=[C_],
                l1_ratio=self.l1_ratio, fit_intercept=self.fit_intercept,
                tol=self.tol, verbose=self.verbose, solver=solver,
                multi_class=multi_class, max_iter=self.max_iter,
                class_weight=self.class_weight, check_input=False,
                random_state=self.random_state, coef=warm_start_coef_,
                penalty=penalty, max_squared_sum=max_squared_sum,
                sample_weight=sample_weight)
      for class_, warm_start_coef_ in zip(classes_, warm_start_coef))

../venv/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1409:


self = Parallel(n_jobs=1), iterable = <generator object LogisticRegression.fit.<locals>.<genexpr> at 0x14734b6d0>

def __call__(self, iterable):
    if self._jobs:
        raise ValueError('This Parallel instance is already running')
    # A flag used to abort the dispatching of jobs in case an
    # exception is found
    self._aborting = False

    if not self._managed_backend:
        n_jobs = self._initialize_backend()
    else:
        n_jobs = self._effective_n_jobs()

    # self._effective_n_jobs should be called in the Parallel.__call__
    # thread only -- store its value in an attribute for further queries.
    self._cached_effective_n_jobs = n_jobs

    backend_name = self._backend.__class__.__name__
    if n_jobs == 0:
        raise RuntimeError("%s has no active worker." % backend_name)

    self._print("Using backend %s with %d concurrent workers.",
                (backend_name, n_jobs))
    if hasattr(self._backend, 'start_call'):
        self._backend.start_call()
    iterator = iter(iterable)
    pre_dispatch = self.pre_dispatch

    if pre_dispatch == 'all' or n_jobs == 1:
        # prevent further dispatch via multiprocessing callback thread
        self._original_iterator = None
        self._pre_dispatch_amount = 0
    else:
        self._original_iterator = iterator
        if hasattr(pre_dispatch, 'endswith'):
            pre_dispatch = eval(pre_dispatch)
        self._pre_dispatch_amount = pre_dispatch = int(pre_dispatch)

        # The main thread will consume the first pre_dispatch items and
        # the remaining items will later be lazily dispatched by async
        # callbacks upon task completions.

        # TODO: this iterator should be batch_size * n_jobs
        iterator = itertools.islice(iterator, self._pre_dispatch_amount)

    self._start_time = time.time()
    self.n_dispatched_batches = 0
    self.n_dispatched_tasks = 0
    self.n_completed_tasks = 0
    # Use a caching dict for callables that are pickled with cloudpickle to
    # improve performances. This cache is used only in the case of
    # functions that are defined in the __main__ module, functions that are
    # defined locally (inside another function) and lambda expressions.
    self._pickle_cache = dict()
    try:
        # Only set self._iterating to True if at least a batch
        # was dispatched. In particular this covers the edge
        # case of Parallel used with an exhausted iterator. If
        # self._original_iterator is None, then this means either
        # that pre_dispatch == "all", n_jobs == 1 or that the first batch
        # was very quick and its callback already dispatched all the
        # remaining jobs.
        self._iterating = False
      if self.dispatch_one_batch(iterator):

../venv/lib/python3.7/site-packages/joblib/parallel.py:1003:


self = Parallel(n_jobs=1), iterator = <generator object LogisticRegression.fit.<locals>.<genexpr> at 0x14734b6d0>

def dispatch_one_batch(self, iterator):
    """Prefetch the tasks for the next batch and dispatch them.

    The effective size of the batch is computed here.
    If there are no more jobs to dispatch, return False, else return True.

    The iterator consumption and dispatching is protected by the same
    lock so calling this function should be thread safe.

    """
    if self.batch_size == 'auto':
        batch_size = self._backend.compute_batch_size()
    else:
        # Fixed batch size strategy
        batch_size = self.batch_size

    with self._lock:
        # to ensure an even distribution of the workolad between workers,
        # we look ahead in the original iterators more than batch_size
        # tasks - However, we keep consuming only one batch at each
        # dispatch_one_batch call. The extra tasks are stored in a local
        # queue, _ready_batches, that is looked-up prior to re-consuming
        # tasks from the origal iterator.
        try:
            tasks = self._ready_batches.get(block=False)
        except queue.Empty:
            # slice the iterator n_jobs * batchsize items at a time. If the
            # slice returns less than that, then the current batchsize puts
            # too much weight on a subset of workers, while other may end
            # up starving. So in this case, re-scale the batch size
            # accordingly to distribute evenly the last items between all
            # workers.
            n_jobs = self._cached_effective_n_jobs
            big_batch_size = batch_size * n_jobs

            islice = list(itertools.islice(iterator, big_batch_size))
            if len(islice) == 0:
                return False
            elif (iterator is self._original_iterator
                  and len(islice) < big_batch_size):
                # We reached the end of the original iterator (unless
                # iterator is the ``pre_dispatch``-long initial slice of
                # the original iterator) -- decrease the batch size to
                # account for potential variance in the batches running
                # time.
                final_batch_size = max(1, len(islice) // (10 * n_jobs))
            else:
                final_batch_size = max(1, len(islice) // n_jobs)

            # enqueue n_jobs batches in a local queue
            for i in range(0, len(islice), final_batch_size):
                tasks = BatchedCalls(islice[i:i + final_batch_size],
                                     self._backend.get_nested_backend(),
                                     self._pickle_cache)
                self._ready_batches.put(tasks)

            # finally, get one task.
            tasks = self._ready_batches.get(block=False)
        if len(tasks) == 0:
            # No more tasks available in the iterator: tell caller to stop.
            return False
        else:
          self._dispatch(tasks)

../venv/lib/python3.7/site-packages/joblib/parallel.py:834:


self = Parallel(n_jobs=1), batch = <joblib.parallel.BatchedCalls object at 0x147349d50>

def _dispatch(self, batch):
    """Queue the batch for computing, with or without multiprocessing

    WARNING: this method is not thread-safe: it should be only called
    indirectly via dispatch_one_batch.

    """
    # If job.get() catches an exception, it closes the queue:
    if self._aborting:
        return

    self.n_dispatched_tasks += len(batch)
    self.n_dispatched_batches += 1

    dispatch_timestamp = time.time()
    cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
    with self._lock:
        job_idx = len(self._jobs)
      job = self._backend.apply_async(batch, callback=cb)

../venv/lib/python3.7/site-packages/joblib/parallel.py:753:


self = <joblib._parallel_backends.SequentialBackend object at 0x147349a10>, func = <joblib.parallel.BatchedCalls object at 0x147349d50> callback = <joblib.parallel.BatchCompletionCallBack object at 0x147349bd0>

def apply_async(self, func, callback=None):
    """Schedule a func to be run"""
  result = ImmediateResult(func)

../venv/lib/python3.7/site-packages/joblib/_parallel_backends.py:201:


self = <joblib._parallel_backends.ImmediateResult object at 0x147349f10>, batch = <joblib.parallel.BatchedCalls object at 0x147349d50>

def __init__(self, batch):
    # Don't delay the application, to avoid keeping the input
    # arguments in memory
  self.results = batch()

../venv/lib/python3.7/site-packages/joblib/_parallel_backends.py:582:


self = <joblib.parallel.BatchedCalls object at 0x147349d50>

def __call__(self):
    # Set the default nested backend to self._backend but do not set the
    # change the default number of processes to -1
    with parallel_backend(self._backend, n_jobs=self._n_jobs):
        return [func(*args, **kwargs)
              for func, args, kwargs in self.items]

../venv/lib/python3.7/site-packages/joblib/parallel.py:256:


.0 = <list_iterator object at 0x147349c50>

return [func(*args, **kwargs)
      for func, args, kwargs in self.items]

../venv/lib/python3.7/site-packages/joblib/parallel.py:256:


X = array([[0.5488135 , 0.71518937, 0.60276338, ..., 0.02010755, 0.82894003, 0.00469548], [0.67781654, 0.27...99, 0.1419334 ], [0.88498232, 0.19701397, 0.56861333, ..., 0.75842952, 0.02378743, 0.81357508]]) y = array([1, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 0, 1, 2, 0, 0, 2, 1, 0, 2, 0, 1, 0, 1, 1, 2, 2, 2, 0, 1, 2,...0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 1, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1]) pos_class = None, Cs = [1.0], fit_intercept = True, max_iter = 100, tol = 0.0001, verbose = 0, solver = 'lbfgs', coef = None, class_weight = None dual = False, penalty = 'l2', intercept_scaling = 1.0, multi_class = 'multinomial', random_state = RandomState(MT19937) at 0x107875E20 check_input = False, max_squared_sum = None sample_weight = array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ...1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) l1_ratio = None

def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
                              max_iter=100, tol=1e-4, verbose=0,
                              solver='lbfgs', coef=None,
                              class_weight=None, dual=False, penalty='l2',
                              intercept_scaling=1., multi_class='auto',
                              random_state=None, check_input=True,
                              max_squared_sum=None, sample_weight=None,
                              l1_ratio=None):
    """Compute a Logistic Regression model for a list of regularization
    parameters.

    This is an implementation that uses the result of the previous model
    to speed up computations along the set of solutions, making it faster
    than sequentially calling LogisticRegression for the different parameters.
    Note that there will be no speedup with liblinear solver, since it does
    not handle warm-starting.

    Read more in the :ref:`User Guide <logistic_regression>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input data.

    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Input data, target values.

    pos_class : int, default=None
        The class with respect to which we perform a one-vs-all fit.
        If None, then it is assumed that the given problem is binary.

    Cs : int or array-like of shape (n_cs,), default=10
        List of values for the regularization parameter or integer specifying
        the number of regularization parameters that should be used. In this
        case, the parameters will be chosen in a logarithmic scale between
        1e-4 and 1e4.

    fit_intercept : bool, default=True
        Whether to fit an intercept for the model. In this case the shape of
        the returned array is (n_cs, n_features + 1).

    max_iter : int, default=100
        Maximum number of iterations for the solver.

    tol : float, default=1e-4
        Stopping criterion. For the newton-cg and lbfgs solvers, the iteration
        will stop when ``max{|g_i | i = 1, ..., n} <= tol``
        where ``g_i`` is the i-th component of the gradient.

    verbose : int, default=0
        For the liblinear and lbfgs solvers set verbose to any positive
        number for verbosity.

    solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, \
            default='lbfgs'
        Numerical solver to use.

    coef : array-like of shape (n_features,), default=None
        Initialization value for coefficients of logistic regression.
        Useless for liblinear solver.

    class_weight : dict or 'balanced', default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.

    dual : bool, default=False
        Dual or primal formulation. Dual formulation is only implemented for
        l2 penalty with liblinear solver. Prefer dual=False when
        n_samples > n_features.

    penalty : {'l1', 'l2', 'elasticnet'}, default='l2'
        Used to specify the norm used in the penalization. The 'newton-cg',
        'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is
        only supported by the 'saga' solver.

    intercept_scaling : float, default=1.
        Useful only when the solver 'liblinear' is used
        and self.fit_intercept is set to True. In this case, x becomes
        [x, self.intercept_scaling],
        i.e. a "synthetic" feature with constant value equal to
        intercept_scaling is appended to the instance vector.
        The intercept becomes ``intercept_scaling * synthetic_feature_weight``.

        Note! the synthetic feature weight is subject to l1/l2 regularization
        as all other features.
        To lessen the effect of regularization on synthetic feature weight
        (and therefore on the intercept) intercept_scaling has to be increased.

    multi_class : {'ovr', 'multinomial', 'auto'}, default='auto'
        If the option chosen is 'ovr', then a binary problem is fit for each
        label. For 'multinomial' the loss minimised is the multinomial loss fit
        across the entire probability distribution, *even when the data is
        binary*. 'multinomial' is unavailable when solver='liblinear'.
        'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
        and otherwise selects 'multinomial'.

        .. versionadded:: 0.18
           Stochastic Average Gradient descent solver for 'multinomial' case.
        .. versionchanged:: 0.22
            Default changed from 'ovr' to 'auto' in 0.22.

    random_state : int, RandomState instance, default=None
        Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
        data. See :term:`Glossary <random_state>` for details.

    check_input : bool, default=True
        If False, the input arrays X and y will not be checked.

    max_squared_sum : float, default=None
        Maximum squared sum of X over samples. Used only in SAG solver.
        If None, it will be computed, going through all the samples.
        The value should be precomputed to speed up cross validation.

    sample_weight : array-like of shape(n_samples,), default=None
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    l1_ratio : float, default=None
        The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only
        used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent
        to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent
        to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a
        combination of L1 and L2.

    Returns
    -------
    coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)
        List of coefficients for the Logistic Regression model. If
        fit_intercept is set to True then the second dimension will be
        n_features + 1, where the last item represents the intercept. For
        ``multiclass='multinomial'``, the shape is (n_classes, n_cs,
        n_features) or (n_classes, n_cs, n_features + 1).

    Cs : ndarray
        Grid of Cs used for cross-validation.

    n_iter : array of shape (n_cs,)
        Actual number of iteration for each Cs.

    Notes
    -----
    You might get slightly different results with the solver liblinear than
    with the others since this uses LIBLINEAR which penalizes the intercept.

    .. versionchanged:: 0.19
        The "copy" parameter was removed.
    """
    if isinstance(Cs, numbers.Integral):
        Cs = np.logspace(-4, 4, Cs)

    solver = _check_solver(solver, penalty, dual)

    # Preprocessing.
    if check_input:
        X = check_array(X, accept_sparse='csr', dtype=np.float64,
                        accept_large_sparse=solver != 'liblinear')
        y = check_array(y, ensure_2d=False, dtype=None)
        check_consistent_length(X, y)
    _, n_features = X.shape

    classes = np.unique(y)
    random_state = check_random_state(random_state)

    multi_class = _check_multi_class(multi_class, solver, len(classes))
    if pos_class is None and multi_class != 'multinomial':
        if (classes.size > 2):
            raise ValueError('To fit OvR, use the pos_class argument')
        # np.unique(y) gives labels in sorted order.
        pos_class = classes[1]

    # If sample weights exist, convert them to array (support for lists)
    # and check length
    # Otherwise set them to 1 for all examples
    sample_weight = _check_sample_weight(sample_weight, X,
                                         dtype=X.dtype)

    # If class_weights is a dict (provided by the user), the weights
    # are assigned to the original labels. If it is "balanced", then
    # the class_weights are assigned after masking the labels with a OvR.
    le = LabelEncoder()
    if isinstance(class_weight, dict) or multi_class == 'multinomial':
        class_weight_ = compute_class_weight(class_weight,
                                             classes=classes, y=y)
        sample_weight *= class_weight_[le.fit_transform(y)]

    # For doing a ovr, we need to mask the labels first. for the
    # multinomial case this is not necessary.
    if multi_class == 'ovr':
        w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
        mask_classes = np.array([-1, 1])
        mask = (y == pos_class)
        y_bin = np.ones(y.shape, dtype=X.dtype)
        y_bin[~mask] = -1.
        # for compute_class_weight

        if class_weight == "balanced":
            class_weight_ = compute_class_weight(class_weight,
                                                 classes=mask_classes,
                                                 y=y_bin)
            sample_weight *= class_weight_[le.fit_transform(y_bin)]

    else:
        if solver not in ['sag', 'saga']:
            lbin = LabelBinarizer()
            Y_multi = lbin.fit_transform(y)
            if Y_multi.shape[1] == 1:
                Y_multi = np.hstack([1 - Y_multi, Y_multi])
        else:
            # SAG multinomial solver needs LabelEncoder, not LabelBinarizer
            le = LabelEncoder()
            Y_multi = le.fit_transform(y).astype(X.dtype, copy=False)

        w0 = np.zeros((classes.size, n_features + int(fit_intercept)),
                      order='F', dtype=X.dtype)

    if coef is not None:
        # it must work both giving the bias term and not
        if multi_class == 'ovr':
            if coef.size not in (n_features, w0.size):
                raise ValueError(
                    'Initialization coef is of shape %d, expected shape '
                    '%d or %d' % (coef.size, n_features, w0.size))
            w0[:coef.size] = coef
        else:
            # For binary problems coef.shape[0] should be 1, otherwise it
            # should be classes.size.
            n_classes = classes.size
            if n_classes == 2:
                n_classes = 1

            if (coef.shape[0] != n_classes or
                    coef.shape[1] not in (n_features, n_features + 1)):
                raise ValueError(
                    'Initialization coef is of shape (%d, %d), expected '
                    'shape (%d, %d) or (%d, %d)' % (
                        coef.shape[0], coef.shape[1], classes.size,
                        n_features, classes.size, n_features + 1))

            if n_classes == 1:
                w0[0, :coef.shape[1]] = -coef
                w0[1, :coef.shape[1]] = coef
            else:
                w0[:, :coef.shape[1]] = coef

    if multi_class == 'multinomial':
        # scipy.optimize.minimize and newton-cg accepts only
        # ravelled parameters.
        if solver in ['lbfgs', 'newton-cg']:
            w0 = w0.ravel()
        target = Y_multi
        if solver == 'lbfgs':
            def func(x, *args): return _multinomial_loss_grad(x, *args)[0:2]
        elif solver == 'newton-cg':
            def func(x, *args): return _multinomial_loss(x, *args)[0]
            def grad(x, *args): return _multinomial_loss_grad(x, *args)[1]
            hess = _multinomial_grad_hess
        warm_start_sag = {'coef': w0.T}
    else:
        target = y_bin
        if solver == 'lbfgs':
            func = _logistic_loss_and_grad
        elif solver == 'newton-cg':
            func = _logistic_loss
            def grad(x, *args): return _logistic_loss_and_grad(x, *args)[1]
            hess = _logistic_grad_hess
        warm_start_sag = {'coef': np.expand_dims(w0, axis=1)}

    coefs = list()
    n_iter = np.zeros(len(Cs), dtype=np.int32)
    for i, C in enumerate(Cs):
        if solver == 'lbfgs':
            iprint = [-1, 50, 1, 100, 101][
                np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
            opt_res = optimize.minimize(
                func, w0, method="L-BFGS-B", jac=True,
                args=(X, target, 1. / C, sample_weight),
                options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}
            )
            n_iter_i = _check_optimize_result(
                solver, opt_res, max_iter,
              extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

../venv/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:764:


solver = 'lbfgs' result = fun: 43.61597578773504 hess_inv: <303x303 LbfgsInvHessProduct with dtype=float64> jac: array([-2.69720198...2783e-01, -3.65757762e-01, 5.32847587e-02, -1.22963842e-01, -2.34838587e-01, 8.15949341e-01, 2.31130362e+00]) max_iter = 100 extra_warning_msg = 'Please also refer to the documentation for alternative solver options:\n https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression'

def _check_optimize_result(solver, result, max_iter=None,
                           extra_warning_msg=None):
    """Check the OptimizeResult for successful convergence

    Parameters
    ----------
    solver: str
       solver name. Currently only `lbfgs` is supported.
    result: OptimizeResult
       result of the scipy.optimize.minimize function
    max_iter: {int, None}
       expected maximum number of iterations

    Returns
    -------
    n_iter: int
       number of iterations
    """
    # handle both scipy and scikit-learn solver names
    if solver == "lbfgs":
        if result.status != 0:
            warning_msg = (
                "{} failed to converge (status={}):\n{}.\n\n"
                "Increase the number of iterations (max_iter) "
                "or scale the data as shown in:\n"
                "    https://scikit-learn.org/stable/modules/"
                "preprocessing.html"
          ).format(solver, result.status, result.message.decode("latin1"))

E AttributeError: 'str' object has no attribute 'decode'
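
For context on that last frame: `_check_optimize_result` unconditionally calls `result.message.decode("latin1")`, but scipy 1.6.0 returns the L-BFGS-B status message as a `str` where older releases returned `bytes`, so the decode blows up. A minimal sketch of the mismatch (the message text below is only illustrative):

# scipy < 1.6.0 handed back bytes, which decode() accepts; scipy 1.6.0 hands back str.
old_style_message = b"ABNORMAL_TERMINATION_IN_LNSRCH"
new_style_message = "ABNORMAL_TERMINATION_IN_LNSRCH"

old_style_message.decode("latin1")  # fine: bytes -> str
new_style_message.decode("latin1")  # AttributeError: 'str' object has no attribute 'decode'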

angela97lin commented 3 years ago

Repro:

import string
import pandas as pd
from evalml.pipelines import LogisticRegressionClassifier

X, y = X_y_multi()  # multiclass data fixture from evalml's test suite
log_classifier = LogisticRegressionClassifier(random_state=0, penalty='l2', C=1.0, n_jobs=1, multi_class='auto', solver='lbfgs')
X = log_classifier.random_state.random((X.shape[0], len(string.printable)))
col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
X = pd.DataFrame(X, columns=col_names)
log_classifier.fit(X, y)

angela97lin commented 3 years ago

Tried

col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
for name in col_names:
    name.decode("latin1")

And got the same error. This, combined with the stack trace, leads me to think that when the LogisticRegression classifier fails to converge, it tries to build a warning message by calling .decode("latin1") on the solver's message, which breaks because the message is already a decoded string.

angela97lin commented 3 years ago

@dsherry @chukarsten

Tried to dig into this a bit more, found that someone posted a similar issue: https://github.com/scikit-optimize/scikit-optimize/issues/981

Not really sure if there's anything we can do on our end to get around this, since we can't prevent LogisticRegression from failing to converge. We could catch the error, but that doesn't feel right. Perhaps it's okay to keep disallowing scipy==1.6.0 and wait for the next release. Thoughts?
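
If we do pin it out, a sketch of what the exclusion could look like in the dependency spec (the bounds and location here are illustrative, not copied from evalml's actual requirements):

install_requires = [
    "scipy>=1.4.0,!=1.6.0",  # illustrative: skip the release whose lbfgs message type breaks scikit-learn < 0.24
]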

angela97lin commented 3 years ago

Wanted to follow up, so I dug into this more by seeing if I could repro it with just scikit-learn. I couldn't 😭:

import string

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.utils import check_random_state

from evalml.utils.gen_utils import (
    _convert_to_woodwork_structure,
    _convert_woodwork_types_wrapper
)

random_state = check_random_state(0)
evalml_params = {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 
                 'l1_ratio': None, 'max_iter': 100,
                 'multi_class': 'auto', 'n_jobs': 1, 'penalty': 'l2', 'random_state': random_state,
                 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
log_classifier_sk = LogisticRegression(**evalml_params)
# log_classifier_sk = LogisticRegression(random_state=random_state, penalty='l2', C=1.0, n_jobs=1, multi_class='auto', solver='lbfgs')
# X, y here are the same multiclass fixture data as in the earlier repro
X = random_state.random((X.shape[0], len(string.printable)))
col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
X = pd.DataFrame(X, columns=col_names)
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())
log_classifier_sk.fit(X, y)

This works fine... but using EvalML doesn't:

import string

import pandas as pd
from sklearn.utils import check_random_state

from evalml.pipelines import LogisticRegressionClassifier

X, y = X_y_multi()  # multiclass data fixture from evalml's test suite
log_classifier = LogisticRegressionClassifier(random_state=0, penalty='l2', C=1.0, n_jobs=1, 
                                              multi_class='auto', solver='lbfgs')
X = check_random_state(0).random((X.shape[0], len(string.printable)))
col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
X = pd.DataFrame(X, columns=col_names)
log_classifier.fit(X, y)

Ugh! I tried debugging but am quite stuck. It could be of interest to note that https://github.com/scikit-learn/scikit-learn/pull/18711 fixes this issue. Does this mean that upgrading scikit-learn to >0.24.0 could resolve this?
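
For reference, the fix in that PR boils down to not assuming the solver message is bytes; roughly this kind of guard (paraphrased sketch, not the exact upstream diff):

# Paraphrase of the scikit-learn fix: only decode when the solver message is bytes.
message = result.message
if isinstance(message, bytes):
    message = message.decode("latin1")
warning_msg = "{} failed to converge (status={}):\n{}.\n".format(solver, result.status, message)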

angela97lin commented 3 years ago

Update: let's hold off on this for now, and see if updating scikit-learn to >0.24.0 resolves this issue.
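
A quick way to confirm which combination a given environment has installed (the error shows up with scipy 1.6.0 alongside scikit-learn releases that predate that PR):

import scipy
import sklearn
print("scipy", scipy.__version__, "| scikit-learn", sklearn.__version__)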