diff_test raises ValueError: Input X contains infinity or a value too large for dtype('float64').

wkl1990 commented 3 days ago

Hi Kai, I am currently using diff_test to identify cell type-specific peaks, but I encountered a ValueError with my data. Do you have any insights or suggestions on how to resolve this issue? Below is the error message for your reference. Thank you!

`File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/snapatac2/tools/_diff.py:148, in diff_test(data, cell_group1, cell_group2, features, covariates, direction, min_log_fc, min_pct) 146 features, log_fc = zip(*filtered) 147 logging.info("Testing {} features ...".format(len(features))) --> 148 pvals = _diff_test_helper(cell_by_peak, test_var, features, covariates) 149 var_names = data.var_names 150 return pl.DataFrame({ 151 "feature name": [var_names[i] for i in features], 152 "log2(fold_change)": np.array(log_fc), 153 "p-value": np.array(pvals), 154 "adjusted p-value": _p_adjust_bh(pvals), 155 }).sort("adjusted p-value")

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/snapatac2/tools/_diff.py:219, in _diff_test_helper(mat, z, peaks, covariate) 216 if peaks is not None: 217 mat = mat[:, peaks] --> 219 return _likelihood_ratio_test_many(np.asarray(X), np.asarray(z), mat)

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/snapatac2/tools/_diff.py:248, in _likelihood_ratio_test_many(X, z, Y) 245 result = [] 246 for i in tqdm(range(n)): 247 result.append( --> 248 _likelihood_ratio_test(X0, X1, np.asarray(np.ravel(Y[:, i].todense()))) 249 ) 250 return result

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/snapatac2/tools/_diff.py:280, in _likelihood_ratio_test(X0, X1, y) 274 from sklearn.linear_model import LogisticRegression 275 from sklearn.metrics import log_loss 277 model = LogisticRegression(penalty=None, random_state=0, n_jobs=1, 278 solver="lbfgs", multi_class='ovr', warm_start=False, 279 max_iter = 1000, --> 280 ).fit(X0, y) 281 reduced = -log_loss(y, model.predict_proba(X0), normalize=False) 283 model = LogisticRegression(penalty=None, random_state=0, n_jobs=1, 284 solver="lbfgs", multi_class='ovr', warm_start=False, 285 max_iter = 1000, 286 ).fit(X1, y)

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/base.py:1351, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs) 1344 estimator._validate_params() 1346 with config_context( 1347 skip_parameter_validation=( 1348 prefer_skip_nested_validation or global_skip_validation 1349 ) 1350 ): -> 1351 return fit_method(estimator, *args, **kwargs)

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:1201, in LogisticRegression.fit(self, X, y, sample_weight) 1198 else: 1199 _dtype = [np.float64, np.float32] -> 1201 X, y = self._validate_data( 1202 X, 1203 y, 1204 accept_sparse="csr", 1205 dtype=_dtype, 1206 order="C", 1207 accept_large_sparse=solver not in ["liblinear", "sag", "saga"], 1208 ) 1209 check_classification_targets(y) 1210 self.classes_ = np.unique(y)

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/base.py:650, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params) 648 y = check_array(y, input_name="y", **check_y_params) 649 else: --> 650 X, y = check_X_y(X, y, **check_params) 651 out = X, y 653 if not no_val_X and check_params.get("ensure_2d", True):

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/utils/validation.py:1192, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 1187 estimator_name = _check_estimator_name(estimator) 1188 raise ValueError( 1189 f"{estimator_name} requires y to be passed, but the target y is None" 1190 ) -> 1192 X = check_array( 1193 X, 1194 accept_sparse=accept_sparse, 1195 accept_large_sparse=accept_large_sparse, 1196 dtype=dtype, 1197 order=order, 1198 copy=copy, 1199 force_all_finite=force_all_finite, 1200 ensure_2d=ensure_2d, 1201 allow_nd=allow_nd, 1202 ensure_min_samples=ensure_min_samples, 1203 ensure_min_features=ensure_min_features, 1204 estimator=estimator, 1205 input_name="X", 1206 ) 1208 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) 1210 check_consistent_length(X, y)

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/utils/validation.py:1003, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 997 raise ValueError( 998 "Found array with dim %d. %s expected <= 2." 999 % (array.ndim, estimator_name) 1000 ) 1002 if force_all_finite: -> 1003 _assert_all_finite( 1004 array, 1005 input_name=input_name, 1006 estimator_name=estimator_name, 1007 allow_nan=force_all_finite == "allow-nan", 1008 ) 1010 if copy: 1011 if _is_numpy_namespace(xp): 1012 # only make a copy if array and array_orig may share memory

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/utils/validation.py:126, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name) 123 if first_pass_isfinite: 124 return --> 126 _assert_all_finite_element_wise( 127 X, 128 xp=xp, 129 allow_nan=allow_nan, 130 msg_dtype=msg_dtype, 131 estimator_name=estimator_name, 132 input_name=input_name, 133 )

File ~/softwares/miniconda3/envs/SnapATAC2/lib/python3.10/site-packages/sklearn/utils/validation.py:175, in _assert_all_finite_element_wise(X, xp, allow_nan, msg_dtype, estimator_name, input_name) 158 if estimator_name and input_name == "X" and has_nan_error: 159 # Improve the error message on how to handle missing values in 160 # scikit-learn. 161 msg_err += ( 162 f"\n{estimator_name} does not accept missing values" 163 " encoded as NaN natively. For supervised learning, you might want" (...) 173 "#estimators-that-handle-nan-values" 174 ) --> 175 raise ValueError(msg_err)

ValueError: Input X contains infinity or a value too large for dtype('float64').`

kaizhang commented 3 days ago

You may want to check whether the matrix contains any NaN or inf values.

wkl1990 commented 3 days ago

I checked the matrix, and it seems there are no NaN or infinite values present.

```
In [39]: np.any(np.isnan(cell_by_peak_array))
Out[39]: False

In [40]: np.all(np.isfinite(cell_by_peak_array))
Out[40]: True
```

kaizhang commented 3 days ago

Can you send me test data and code for reproducing the error?

wkl1990 commented 2 days ago

Here is the test data and code: Dropbox Link. Thank you for your assistance!

kaizhang / SnapATAC2

diff_test raises ValueError: Input X contains infinity or a value too large for dtype('float64'). #364