Can't open saved .h5ad after cell2location

BenjaminDEMAILLE commented 1 year ago

Hi ! I got an error when trying to read the Visium h5ad after cell2location

my script is :


# Import modules
import os
# silence scanpy that prints a lot of warnings
import warnings
warnings.filterwarnings('ignore')

os.environ["THEANO_FLAGS"] = 'device=mps,floatX=float32,force_device=True'
import torch
torch.set_num_interop_threads(10)
torch.set_num_threads(10)
import scanpy as sc
import numpy as np
import matplotlib as mpl
import cell2location
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text for PDFs
torch.set_default_dtype(torch.float32)
mpl.use('Agg')
#
import matplotlib.pyplot as plt

results_folder = "results"
# create paths and names to results folders for reference regression and cell2location models
ref_run_name = f'{results_folder}/reference_signatures'
run_name = f'{results_folder}/cell2location_map'

# Loading Visium and scRNA-seq reference data
adata_vis = sc.read_h5ad(filename="Plasticity_spatial.h5ad")
adata_vis.layers['logcounts'] = adata_vis.X.copy()
adata_vis.layers['raw'] = adata_vis.raw.X.copy()
adata_ref = sc.read_h5ad(filename="Grout_Leader_data.h5ad")
adata_ref.layers['raw'] = adata_ref.X.copy()

#scRNA part
# filter data
from cell2location.utils.filtering import filter_genes
selected =filter_genes(adata_ref, cell_count_cutoff=1, cell_percentage_cutoff2=0.01)
adata_ref = adata_ref[:, selected].copy()
# Estimation of reference cell type signatures (NB regression)
cell2location.models.RegressionModel.setup_anndata(adata=adata_ref,
                        # 10X reaction / sample / batch
                        batch_key="sample",
                        # cell type, covariate used for constructing signatures
                        labels_key="Cell_subtype",
                        layer="raw",  
                        )

# create the regression model
from cell2location.models import RegressionModel
mod = RegressionModel(adata_ref)
# view anndata_setup as a sanity check
mod.view_anndata_setup()
mod.train(max_epochs=300, 
          use_gpu=True)

# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata_ref = mod.export_posterior(
    adata_ref, sample_kwargs={'num_samples': 1000, 
                              'batch_size': 2500, 
                              'use_gpu': True}
)

# Save model
mod.save(f"{ref_run_name}", overwrite=True)

# Save anndata object with results
adata_file = f"{ref_run_name}/sc.h5ad"
adata_ref.write(adata_file)
adata_file

# Examine QC plots
#mod.plot_QC()

# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
    inf_aver = adata_ref.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
                                    for i in adata_ref.uns['mod']['factor_names']]].copy()
else:
    inf_aver = adata_ref.var[[f'means_per_cluster_mu_fg_{i}'
                                    for i in adata_ref.uns['mod']['factor_names']]].copy()
inf_aver.columns = adata_ref.uns['mod']['factor_names']

# Spatial part
# find shared genes and subset both anndata and reference signatures
intersect = np.intersect1d(adata_vis.var_names, inf_aver.index)
adata_vis = adata_vis[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()

# prepare anndata for cell2location model
cell2location.models.Cell2location.setup_anndata(adata=adata_vis, 
                                                 batch_key="Patient",   
                                                 layer="raw")

# create and train the model
mod = cell2location.models.Cell2location(
    adata_vis, cell_state_df=inf_aver,
    # the expected average cell abundance: tissue-dependent
    # hyper-prior which can be estimated from paired histology:
    N_cells_per_location=np.float32(30),
    # hyperparameter controlling normalisation of
    # within-experiment variation in RNA detection:
    detection_alpha=np.float32(20)
)
mod.view_anndata_setup()
mod.train(max_epochs=30000,
          # train using full data (batch_size=None)
          batch_size=None,
          # use all data points in training because
          # we need to estimate cell abundance at all locations
          train_size=1,
          use_gpu=True,     

         )

# plot ELBO loss history during training, removing first 100 epochs from the plot
#mod.plot_history(1000)
plt.legend(labels=['full data training']);

# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata_vis = mod.export_posterior(
    adata_vis, 
    sample_kwargs={'num_samples': 1000, 
                   'batch_size': mod.adata.n_obs, 
                       'use_gpu': True}
)

# Save model
mod.save(f"{run_name}", overwrite=True)

# Save anndata object with results
adata_file = f"{run_name}/sp.h5ad"
adata_vis.write(adata_file)
adata_file

#mod.plot_QC()

And reading the h5ad after like :

adata_file = "/Users/benjamin/Dropbox (U932)/Salmon team/Benjamin/BD2lab/BD2AA/for_cell2location/results/results/cell2location_map/sp.h5ad"

adata_vis = sc.read_h5ad(adata_file)

I got :

--------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[35], line 1
----> 1 adata_vis = sc.read_h5ad(adata_file)

File [/opt/homebrew/lib/python3.11/site-packages/anndata/_io/h5ad.py:252](https://untitled+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/anndata/_io/h5ad.py:252), in read_h5ad(filename, backed, as_sparse, as_sparse_fmt, chunk_size)
    249         adata.raw = raw
    251     # Backwards compat to <0.7
--> 252     if isinstance(f["obs"], h5py.Dataset):
    253         _clean_uns(adata)
    255 return adata

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File [/opt/homebrew/lib/python3.11/site-packages/h5py/_hl/group.py:357](https://untitled+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/h5py/_hl/group.py:357), in Group.__getitem__(self, name)
    355         raise ValueError("Invalid HDF5 object reference")
    356 elif isinstance(name, (bytes, str)):
--> 357     oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    358 else:
    359     raise TypeError("Accessing a group is done with bytes or str, "
    360                     "not {}".format(type(name)))

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()
...
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File h5py/h5o.pyx:190, in h5py.h5o.open()

KeyError: "Unable to open object (object 'obs' doesn't exist)"

How can I solve this ?

vitkl commented 1 year ago

This likely means that saving failed - an issue with anndata rather than cell2location. Check errors/log from your saving script.

dyinboisry4u commented 8 months ago

Hi, @vitkl When I load the c2l h5ad by sc.read_h5ad, I got same error: KeyError: 'Unable to synchronously open object (message type not found)' Error raised while reading key '/varm/means_per_cluster_mu_fg' of <class 'h5py._hl.group.Group'> to / But saving looks ok

vitkl commented 8 months ago

Try examining adata.varm['means_per_cluster_mu_fg'] before saving to make sure that there are no issues. I suspect this error is due to non-allowed characters in column names (allowed by pandas, not allowed by h5py). Column names are cell clusters names so you probably need to rename some of your cell types to avoid these characters (likely / + etc).

dyinboisry4u commented 8 months ago

Try examining adata.varm['means_per_cluster_mu_fg'] before saving to make sure that there are no issues. I suspect this error is due to non-allowed characters in column names (allowed by pandas, not allowed by h5py). Column names are cell clusters names so you probably need to rename some of your cell types to avoid these characters (likely / + etc).

thanks, you are right, "/" is an illegal col name for h5py, but "+" is ok.

BayraktarLab / cell2location

Can't open saved .h5ad after cell2location #288