gao-lab / GLUE

Graph-linked unified embedding for single-cell multi-omics data integration
MIT License
385 stars 57 forks source link

Input contains NaN when LSI #103

Open cynthier opened 1 year ago

cynthier commented 1 year ago

Hello, when I use scglue.data.lsi, it raises the error: Input contains NaN. Do you know how to figure it out? Thank you.

Jeff1995 commented 1 year ago

Hi @cynthier, thanks for your interest in GLUE!

As the error suggests, you might have NaN values in the input matrix, which is not supported. Could you confirm whether there are NaN values? Maybe you need to replace them with proper fillers or do an imputation before running lsi.

cynthier commented 1 year ago

Thank you so much for your rapid reply! I used nan = (True in np.isnan(atac.X.data)) to confirm that there are no NaN values.

Jeff1995 commented 1 year ago

Okay, thanks. Could you also provide a complete traceback so I can check where exactly the error was raised?

cynthier commented 1 year ago

Okay. Thank you in advance.


/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scglue/num.py:289: RuntimeWarning: divide by zero encountered in divide
  idf = X.shape[0] / X.sum(axis=0)
/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scipy/sparse/_compressed.py:466: RuntimeWarning: invalid value encountered in multiply
  data = np.multiply(ret.data, other[:, ret.col].ravel())
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[41], line 1
----> 1 scglue.data.lsi(atac, n_components=50, n_iter=15)

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scglue/data.py:60, in lsi(adata, n_components, use_highly_variable, **kwargs)
     58 adata_use = adata[:, adata.var["highly_variable"]] if use_highly_variable else adata
     59 X = num.tfidf(adata_use.X)
---> 60 X_norm = normalize(X, norm="l1")
     61 X_norm = np.log1p(X_norm * 1e4)
     62 X_lsi = sklearn.utils.extmath.randomized_svd(X_norm, n_components, **kwargs)[0]

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/sklearn/utils/_param_validation.py:214, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    208 try:
    209     with config_context(
    210         skip_parameter_validation=(
    211             prefer_skip_nested_validation or global_skip_validation
    212         )
    213     ):
--> 214         return func(*args, **kwargs)
    215 except InvalidParameterError as e:
    216     # When the function is just a wrapper around an estimator, we allow
    217     # the function to delegate validation to the estimator, but we replace
    218     # the name of the estimator by the name of the function in the error
    219     # message to avoid confusion.
    220     msg = re.sub(
    221         r"parameter of \w+ must be",
    222         f"parameter of {func.__qualname__} must be",
    223         str(e),
    224     )

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/sklearn/preprocessing/_data.py:1841, in normalize(X, norm, axis, copy, return_norm)
   1838 else:  # axis == 1:
   1839     sparse_format = "csr"
-> 1841 X = check_array(
   1842     X,
   1843     accept_sparse=sparse_format,
   1844     copy=copy,
   1845     estimator="the normalize function",
   1846     dtype=FLOAT_DTYPES,
   1847 )
   1848 if axis == 0:
   1849     X = X.T

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/sklearn/utils/validation.py:881, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    879 if sp.issparse(array):
    880     _ensure_no_complex_data(array)
--> 881     array = _ensure_sparse_format(
    882         array,
    883         accept_sparse=accept_sparse,
    884         dtype=dtype,
    885         copy=copy,
    886         force_all_finite=force_all_finite,
    887         accept_large_sparse=accept_large_sparse,
    888         estimator_name=estimator_name,
    889         input_name=input_name,
    890     )
    891 else:
    892     # If np.array(..) gives ComplexWarning, then we convert the warning
    893     # to an error. This is needed because specifying a non complex
    894     # dtype to the function converts complex to real dtype,
    895     # thereby passing the test made in the lines following the scope
    896     # of warnings context manager.
    897     with warnings.catch_warnings():

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/sklearn/utils/validation.py:571, in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse, estimator_name, input_name)
    566         warnings.warn(
    567             "Can't check %s sparse matrix for nan or inf." % spmatrix.format,
    568             stacklevel=2,
    569         )
    570     else:
--> 571         _assert_all_finite(
    572             spmatrix.data,
    573             allow_nan=force_all_finite == "allow-nan",
    574             estimator_name=estimator_name,
    575             input_name=input_name,
    576         )
    578 return spmatrix

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/sklearn/utils/validation.py:122, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
    119 if first_pass_isfinite:
    120     return
--> 122 _assert_all_finite_element_wise(
    123     X,
    124     xp=xp,
    125     allow_nan=allow_nan,
    126     msg_dtype=msg_dtype,
    127     estimator_name=estimator_name,
    128     input_name=input_name,
    129 )

File /Conda/R4b/pkgs/glu/lib/python3.10/site-packages/sklearn/utils/validation.py:171, in _assert_all_finite_element_wise(X, xp, allow_nan, msg_dtype, estimator_name, input_name)
    154 if estimator_name and input_name == "X" and has_nan_error:
    155     # Improve the error message on how to handle missing values in
    156     # scikit-learn.
    157     msg_err += (
    158         f"\n{estimator_name} does not accept missing values"
    159         " encoded as NaN natively. For supervised learning, you might want"
   (...)
    169         "#estimators-that-handle-nan-values"
    170     )
--> 171 raise ValueError(msg_err)

ValueError: Input contains NaN.
```'
Jeff1995 commented 1 year ago

Based on the traceback, I suppose the NaN values are produced when computing TF-IDF, specifically this line:

/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scglue/num.py:289: RuntimeWarning: divide by zero encountered in divide
  idf = X.shape[0] / X.sum(axis=0)

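This suggests that there are all-zero columns in your adata object, i.e. features with no counts in any cell: X.sum(axis=0) sums each column over all cells, so an all-zero feature produces a division by zero. Could you try removing these features?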

cynthier commented 1 year ago

Thank you so much. It works now. But I feel curious about another error when running guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)

: AttributeError: module 'scglue.genomics' has no attribute 'rna_anchored_guidance_graph'

Jeff1995 commented 1 year ago

Well that could be a version problem. Which version of scglue are you using?

cynthier commented 1 year ago

It is version 0.2.3. I installed it using command conda install -c conda-forge -c bioconda scglue

Jeff1995 commented 1 year ago

I see. That's the reason. The rna_anchored_guidance_graph was introduced in v0.3.1. I'd recommend updating to the latest version (v0.3.2).

cynthier commented 1 year ago

I updated the package and it works. Thank you so much! But I ran into another error (crying). My input data are both count matrices. But the error occurred when running scglue.genomics.rna_anchored_guidance_graph(rna, atac)

IntCastingNaNError                        Traceback (most recent call last)
Cell In[27], line 1
----> 1 guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scglue/genomics.py:528, in rna_anchored_guidance_graph(rna, gene_region, promoter_len, extend_range, extend_fn, signs, propagate_highly_variable, corrupt_rate, random_state, *others)
    525 if set(signs).difference({-1, 1}):
    526     raise RuntimeError("``signs`` can only contain {-1, 1}!")
--> 528 rna_bed = Bed(rna.var.assign(name=rna.var_names))
    529 other_beds = [Bed(other.var.assign(name=other.var_names)) for other in others]
    530 if gene_region == "promoter":

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scglue/utils.py:495, in ConstrainedDataFrame.__init__(self, *args, **kwargs)
    493 def __init__(self, *args, **kwargs) -> None:
    494     df = pd.DataFrame(*args, **kwargs)
--> 495     df = self.rectify(df)
    496     self.verify(df)
    497     super().__init__(df)

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/scglue/genomics.py:52, in Bed.rectify(cls, df)
     50 if item in df:
     51     if item in ("chromStart", "chromEnd"):
---> 52         df[item] = df[item].astype(int)
     53     else:
     54         df[item] = df[item].astype(str)

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/generic.py:6324, in NDFrame.astype(self, dtype, copy, errors)
   6317     results = [
   6318         self.iloc[:, i].astype(dtype, copy=copy)
   6319         for i in range(len(self.columns))
   6320     ]
   6322 else:
   6323     # else, only a single dtype is given
-> 6324     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6325     return self._constructor(new_data).__finalize__(self, method="astype")
   6327 # GH 33113: handle empty frame or series

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/internals/managers.py:451, in BaseBlockManager.astype(self, dtype, copy, errors)
    448 elif using_copy_on_write():
    449     copy = False
--> 451 return self.apply(
    452     "astype",
    453     dtype=dtype,
    454     copy=copy,
    455     errors=errors,
    456     using_cow=using_copy_on_write(),
    457 )

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/internals/managers.py:352, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    350         applied = b.apply(f, **kwargs)
    351     else:
--> 352         applied = getattr(b, f)(**kwargs)
    353     result_blocks = extend_blocks(applied, result_blocks)
    355 out = type(self).from_blocks(result_blocks, self.axes)

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/internals/blocks.py:511, in Block.astype(self, dtype, copy, errors, using_cow)
    491 """
    492 Coerce to the new dtype.
    493 
   (...)
    507 Block
    508 """
    509 values = self.values
--> 511 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    513 new_values = maybe_coerce_values(new_values)
    515 refs = None

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:242, in astype_array_safe(values, dtype, copy, errors)
    239     dtype = dtype.numpy_dtype
    241 try:
--> 242     new_values = astype_array(values, dtype, copy=copy)
    243 except (ValueError, TypeError):
    244     # e.g. _astype_nansafe can fail on object-dtype of strings
    245     #  trying to convert to float
    246     if errors == "ignore":

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:187, in astype_array(values, dtype, copy)
    184     values = values.astype(dtype, copy=copy)
    186 else:
--> 187     values = _astype_nansafe(values, dtype, copy=copy)
    189 # in pandas we don't store numpy str dtypes, so convert to object
    190 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:105, in _astype_nansafe(arr, dtype, copy, skipna)
    100     return lib.ensure_string_array(
    101         arr, skipna=skipna, convert_na_value=False
    102     ).reshape(shape)
    104 elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype):
--> 105     return _astype_float_to_int_nansafe(arr, dtype, copy)
    107 elif is_object_dtype(arr.dtype):
    108     # if we have a datetime/timedelta array of objects
    109     # then coerce to datetime64[ns] and use DatetimeArray.astype
    111     if is_datetime64_dtype(dtype):

File /cluster2/huanglab/lliu/Conda/R4b/pkgs/glu/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:150, in _astype_float_to_int_nansafe(values, dtype, copy)
    146 """
    147 astype with a check preventing converting NaN to an meaningless integer value.
    148 """
    149 if not np.isfinite(values).all():
--> 150     raise IntCastingNaNError(
    151         "Cannot convert non-finite values (NA or inf) to integer"
    152     )
    153 if dtype.kind == "u":
    154     # GH#45151
    155     if not (values >= 0).all():

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
Jeff1995 commented 1 year ago

It seems that you do not have "chromStart", "chromEnd" columns in adata.var, or there are NA or inf values in these columns. Could you check on that?