Starlitnightly / omicverse

A Python library for multi-omics analysis, covering bulk, single-cell, and spatial RNA-seq.
https://starlitnightly.github.io/omicverse/
GNU General Public License v3.0

n_pcs bug in harmony method of batch_correction #102

Closed · liuxiawei closed this issue 13 hours ago

liuxiawei commented 2 weeks ago

Describe the bug
When I use code like the following with `n_pcs` lower than 13 (e.g., 12), it raises an error:

```python
adata_harmony = ov.single.batch_correction(adata, batch_key='batch',
                                           methods='harmony', n_pcs=15)  # changing n_pcs to less than 13 causes an error, why?
```

Error message:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[56], line 1
----> 1 adata_harmony=ov.single.batch_correction(adata,batch_key='batch',
      2                                         methods='harmony',n_pcs=12,) # change the npcs to less than 13 cause error, why?
      3 adata

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/omicverse/single/_batch.py:39, in batch_correction(adata, batch_key, use_rep, methods, n_pcs, **kwargs)
     37 scale(adata3)
     38 pca(adata3,layer='scaled',n_pcs=n_pcs)
---> 39 sc.external.pp.harmony_integrate(adata3, batch_key,basis=use_rep,**kwargs)
     40 adata.obsm['X_harmony']=adata3.obsm['X_pca_harmony'].copy()
     41 return adata3

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/legacy_api_wrap/__init__.py:80, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
     77 @wraps(fn)
     78 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
     79     if len(args_all) <= n_positional:
---> 80         return fn(*args_all, **kw)
     82     args_pos: P.args
     83     args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/scanpy/external/pp/_harmony_integrate.py:95, in harmony_integrate(adata, key, basis, adjusted_basis, **kwargs)
     91     raise ImportError("\nplease install harmonypy:\n\n\tpip install harmonypy")
     93 X = adata.obsm[basis].astype(np.float64)
---> 95 harmony_out = harmonypy.run_harmony(X, adata.obs, key, **kwargs)
     97 adata.obsm[adjusted_basis] = harmony_out.Z_corr.T

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/harmonypy/harmony.py:124, in run_harmony(data_mat, meta_data, vars_use, theta, lamb, sigma, nclust, tau, block_size, max_iter_harmony, max_iter_kmeans, epsilon_cluster, epsilon_harmony, plot_convergence, verbose, reference_values, cluster_prior, random_state)
    120 phi_moe = np.vstack((np.repeat(1, N), phi))
    122 np.random.seed(random_state)
--> 124 ho = Harmony(
    125     data_mat, phi, phi_moe, Pr_b, sigma, theta, max_iter_harmony, max_iter_kmeans,
    126     epsilon_cluster, epsilon_harmony, nclust, block_size, lamb_mat, verbose
    127 )
    129 return ho

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/harmonypy/harmony.py:172, in Harmony.__init__(self, Z, Phi, Phi_moe, Pr_b, sigma, theta, max_iter_harmony, max_iter_kmeans, epsilon_kmeans, epsilon_harmony, K, block_size, lamb, verbose)
    169 self.kmeans_rounds  = []
    171 self.allocate_buffers()
--> 172 self.init_cluster()
    173 self.harmonize(self.max_iter_harmony, self.verbose)

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/harmonypy/harmony.py:191, in Harmony.init_cluster(self)
    188 logger.info("Computing initial centroids with sklearn.KMeans...")
    189 model = KMeans(n_clusters=self.K, init='k-means++',
    190                n_init=10, max_iter=25)
--> 191 model.fit(self.Z_cos.T)
    192 km_centroids, km_labels = model.cluster_centers_, model.labels_
    193 logger.info("sklearn.KMeans initialization complete.")

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1464, in KMeans.fit(self, X, y, sample_weight)
   1436 @_fit_context(prefer_skip_nested_validation=True)
   1437 def fit(self, X, y=None, sample_weight=None):
   1438     """Compute k-means clustering.
   1439 
   1440     Parameters
   (...)
   1462         Fitted estimator.
   1463     """
-> 1464     X = self._validate_data(
   1465         X,
   1466         accept_sparse="csr",
   1467         dtype=[np.float64, np.float32],
   1468         order="C",
   1469         copy=self.copy_x,
   1470         accept_large_sparse=False,
   1471     )
   1473     self._check_params_vs_input(X)
   1475     random_state = check_random_state(self.random_state)

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/sklearn/base.py:633, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    631         out = X, y
    632 elif not no_val_X and no_val_y:
--> 633     out = check_array(X, input_name="X", **check_params)
    634 elif no_val_X and not no_val_y:
    635     out = _check_y(y, **check_params)

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/sklearn/utils/validation.py:1064, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1058     raise ValueError(
   1059         "Found array with dim %d. %s expected <= 2."
   1060         % (array.ndim, estimator_name)
   1061     )
   1063 if force_all_finite:
-> 1064     _assert_all_finite(
   1065         array,
   1066         input_name=input_name,
   1067         estimator_name=estimator_name,
   1068         allow_nan=force_all_finite == "allow-nan",
   1069     )
   1071 if copy:
   1072     if _is_numpy_namespace(xp):
   1073         # only make a copy if `array` and `array_orig` may share memory`

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/sklearn/utils/validation.py:123, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
    120 if first_pass_isfinite:
    121     return
--> 123 _assert_all_finite_element_wise(
    124     X,
    125     xp=xp,
    126     allow_nan=allow_nan,
    127     msg_dtype=msg_dtype,
    128     estimator_name=estimator_name,
    129     input_name=input_name,
    130 )

File ~/micromamba/envs/omicverse/lib/python3.10/site-packages/sklearn/utils/validation.py:172, in _assert_all_finite_element_wise(X, xp, allow_nan, msg_dtype, estimator_name, input_name)
    155 if estimator_name and input_name == "X" and has_nan_error:
    156     # Improve the error message on how to handle missing values in
    157     # scikit-learn.
    158     msg_err += (
    159         f"\n{estimator_name} does not accept missing values"
    160         " encoded as NaN natively. For supervised learning, you might want"
   (...)
    170         "#estimators-that-handle-nan-values"
    171     )
--> 172 raise ValueError(msg_err)

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


Starlitnightly commented 2 weeks ago

This doesn't seem to be a code error but rather a boundary-check error. The following answer is from GPT, for reference:

The error you're encountering with the ov.single.batch_correction function when setting n_pcs (number of principal components) to less than 13 in your code is due to a mismatch between the number of principal components and the requirements of the Harmony integration process. Let's break down the possible reasons and solutions for this issue:

Understanding the Error

1. **Principal Component Analysis (PCA)**:

   * PCA is used to reduce the dimensionality of the dataset while preserving as much variability as possible.
   * When you set `n_pcs=12`, PCA reduces the dataset to 12 principal components.

2. **Harmony Integration**:

   * Harmony is used for batch-effect correction in single-cell RNA-seq data.
   * The `harmony_integrate` function from the `scanpy` package is called to perform this integration using the specified principal components.

3. **Error Source**:

   * The error is a `ValueError` caused by `NaN` values encountered during the KMeans clustering step inside the Harmony algorithm (a minimal reproduction follows this list).
   * This suggests that reducing `n_pcs` below 13 might be producing `NaN` values or leaving too few features for clustering.
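For reference, the bottom of the traceback can be reproduced in isolation. A minimal sketch with made-up random data (the array shape and cluster count are arbitrary), showing that a single `NaN` is enough to make scikit-learn's `KMeans.fit` raise exactly this `ValueError`:

```python
import numpy as np
from sklearn.cluster import KMeans

# Made-up stand-in for the PCA embedding that Harmony hands to KMeans
X = np.random.default_rng(0).random((100, 12))
X[0, 0] = np.nan  # a single non-finite entry triggers sklearn's input validation

# Raises: ValueError: Input X contains NaN.
KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=25).fit(X)
```

So the question is not whether KMeans rejects `NaN` input (it always does), but why the PCA embedding contains `NaN` in the first place when `n_pcs < 13`.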

Possible Reasons for the Error

1. **Insufficient Dimensionality**:

   * Reducing the number of principal components below a certain threshold (in your case, 13) might leave too few features to capture the essential variability of the dataset.
   * Harmony and KMeans clustering can struggle with too few features, leading to unstable or ill-defined clustering solutions.

2. **Data Quality Issues**:

   * With fewer components, the PCA-transformed data might include `NaN` or near-zero-variance components, which are problematic for subsequent steps in Harmony's algorithm.

3. **Specific Implementation Requirements**:

   * The Harmony integration process might have internal checks or requirements for a minimum number of principal components to function correctly.

Solutions

1. **Increase `n_pcs`**:

   * Ensure that `n_pcs` is high enough to capture the variability in the dataset. In your case, setting `n_pcs` to at least 13 seems to work.
   * You can try different values above 12 to find the minimum number of components that avoids the error.

2. **Check for NaNs**:

   * Before running Harmony, inspect the PCA output to ensure there are no `NaN` values.
   * You can use `numpy` to check for `NaN` values in your dataset:

   ```python
   import numpy as np

   # Assuming adata is your AnnData object
   pca_output = adata.obsm['X_pca']
   if np.isnan(pca_output).any():
       print("NaN values found in PCA output")
   ```

3. **Review Harmony's Requirements**:

   * Review the Harmony documentation to ensure your settings and inputs align with its expected usage.
   * Harmony might require a certain number of principal components to function effectively.

4. **Data Preprocessing**:

   * Ensure that the preprocessing steps (scaling, normalization, etc.) are correctly applied before running PCA and Harmony (a quick inspection sketch follows this list).
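The traceback shows that `batch_correction` scales the data and then runs PCA on a `'scaled'` layer before handing the embedding to Harmony, so checking each intermediate for non-finite values narrows down where the `NaN` is introduced. A minimal inspection sketch, assuming the scaled matrix ends up in `adata.layers['scaled']` and the embedding in `adata.obsm['X_pca']`, as the call chain suggests:

```python
import numpy as np

def report_nonfinite(name, mat):
    """Print how many non-finite (NaN/inf) entries a dense matrix contains."""
    mat = np.asarray(mat)
    n_bad = mat.size - int(np.isfinite(mat).sum())
    print(f"{name}: {n_bad} non-finite entries out of {mat.size}")

# Assumed storage locations, based on the call chain in the traceback above
report_nonfinite("scaled layer", adata.layers['scaled'])
report_nonfinite("PCA embedding", adata.obsm['X_pca'])
```

If the scaled layer is already non-finite, the problem is upstream of PCA; if only the embedding is, the PCA step itself is at fault.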

Example Code Adjustment

Here's how you might adjust your code to ensure it works properly:

```python
import numpy as np
import omicverse as ov
import scanpy as sc

# Check whether the PCA output contains NaNs
adata3 = adata.copy()
sc.pp.scale(adata3)  # example scaling step
sc.tl.pca(adata3, n_comps=12)  # trying with 12 principal components

if np.isnan(adata3.obsm['X_pca']).any():
    raise ValueError("PCA output contains NaNs; increase n_pcs or preprocess the data")

# Now perform batch correction
adata_harmony = ov.single.batch_correction(adata, batch_key='batch', methods='harmony', n_pcs=15)
```

Summary

The error arises because the number of principal components (`n_pcs`) is too low for Harmony's integration to handle properly. To resolve it, try increasing `n_pcs`, ensure there are no `NaN` values in the PCA output, and check Harmony's requirements for the number of principal components.

liuxiawei commented 2 weeks ago

Dear Starlitnightly, I tried running PCA manually, and it works, but I think the problem still exists because of the PCA code in ov. The error example: [screenshot] When I change `ov.pp.pca` to `sc.pp.pca`, it works: [screenshot]
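Until this is fixed, a workaround sketch based on the finding above: compute the embedding with scanpy's PCA and run Harmony on it directly, bypassing `ov.pp.pca`. This assumes `adata` is already normalized; the scaling parameters are illustrative, and the `harmony_integrate` call mirrors the one in the traceback:

```python
import numpy as np
import scanpy as sc

# Workaround sketch: use scanpy's PCA instead of ov.pp.pca, as reported above
sc.pp.scale(adata, max_value=10)  # illustrative scaling step
sc.tl.pca(adata, n_comps=12)      # the n_pcs value that previously failed

# Sanity check before Harmony: the embedding must be fully finite
assert np.isfinite(adata.obsm['X_pca']).all(), "PCA output still contains NaNs"

# Run Harmony directly on the scanpy PCA embedding
sc.external.pp.harmony_integrate(adata, key='batch', basis='X_pca',
                                 adjusted_basis='X_pca_harmony')
```

The corrected embedding lands in `adata.obsm['X_pca_harmony']`, matching what the `batch_correction` wrapper copies in the traceback above.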

Starlitnightly commented 2 weeks ago

Thank you for your help; we will fix this issue in the next version.

Zehua