Cell labels do not match UMAP clusters

goepp commented 3 months ago

Thank you for making this data available.

I have stumbled upon an unexpected behavior: for the organ "blood", the cell labels provided do not fit the clusters as seen on a UMAP:

Here is how to reproduce my result:

import os
import json
import warnings
import pandas as pd
import scanpy as sc
import anndata as ad

adata = sc.read_h5ad(output_file)

adata.layers["counts"] = adata.X
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)

sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

sc.pl.umap(adata, color="label", save = "label_umap.png")

where adata is obtained from contatenating all blood samples which do not have missing data or metadata. (The complete code is available hereafter.) The adata file can be downloaded here until 11/06/2024: https://gigamove.rwth-aachen.de/en/download/1620d4d0178dafce5708a6df917b067d.

Code used to concatenate the blood samples.

```python # General utility import os import json import warnings # Computation import pandas as pd import scanpy as sc import anndata as ad import scanpy.external as sce # Define project directory proj_dir = '/Users/vgoepp/work/aachen/human_cell_atlas/' file_list = [] for root, dirs, files in os.walk(proj_dir+'data'): for file in files: file_list.append(file) print(file_list) generator = os.walk(proj_dir+'data') first_item = next(generator) for item in generator: print(item) missing_data_samples = ['GSM2560246', 'GSM2560249', 'GSM2560247', 'CRX102286', 'CRX102288', 'GSM1676569'] organ = "blood" data_dir = os.path.join(proj_dir, 'data/', organ) output_file = proj_dir+'data/'+"whole_"+organ+"_raw.h5ad" filenames = next(os.walk(data_dir))[1] # Discard folders which don't include data for file in missing_data_samples: if file in filenames: filenames.remove(file) print(str(len(filenames)) + " samples for organ " + organ) adatas = [None] * len(filenames) for ind, filename in enumerate(filenames): filepath = data_dir+"/"+filename adata = sc.read_mtx(filepath+'/filter/matrix.mtx.gz') # Load cell barcodes adata_bc = pd.read_csv(filepath + '/filter/barcodes.txt.gz', header=None, names=["cell_id"]) assert adata_bc.shape[0] == adata.shape[0] # Save barcodes as adata.obs adata.obs = adata_bc adata.obs["sample_id"] = filename # Load gene names adata_features = pd.read_csv( filepath + '/filter/genes.txt.gz', header=None) assert adata_features.shape[0] == adata.shape[1] adata.var['gene_name'] = adata_features[0].tolist() # Load metadata try: with open(filepath+'/metadata.json', "r") as file: metadata = json.load(file) metadata = metadata[filename] if "sample" in metadata.keys(): metadata = list(metadata['sample'].values())[0] except: # if file not present or empty: metadata = None print("Metadata missing for filename: "+filename) # Add metadata as obs fields # Equivalent to if else, but works in metadata is None try: assert "attributes" in metadata.keys() tmp = metadata['attributes'] adata.obs["developmental stage"] = tmp.get("developmental stage") adata.obs["sex"] = tmp.get("sex") adata.obs["age"] = tmp.get("age") adata.obs["disease"] = tmp.get("disease") adata.obs["individual"] = tmp.get("individual") adata.uns = dict({"metadata": metadata}) except: adata.obs["developmental stage"] = "none" adata.obs["sex"] = "none" adata.obs["age"] = "none" adata.obs["disease"] = "none" adata.obs["individual"] = "none" # Load observation data try: with open(filepath+'/observation.json', "r") as file: observation = json.load(file) # Add observation as obs fields adata.obs["paper"] = observation.get("paper") adata.obs["technology"] = observation.get('technology') adata.obs["organ"] = observation.get('organ') adata.obs["database_id"] = observation.get('database_id') adata.uns = dict({"observation": observation}) except: adata.obs["paper"] = "none" adata.obs["technology"] = "none" adata.obs["organ"] = "none" adata.obs["database_id"] = "none" observation = None print("Observation missing for filename: "+filename) # Save metadata and observation data in adata object # if they exist if metadata is not None or observation is not None: adata.uns = dict() if metadata is not None: adata.uns['metadata'] = metadata if observation is not None: adata.uns['observation'] = observation # Load cell label assignement data try: assignement_df = pd.read_table(filepath+"/mx_out/assignments_rank_mx.tsv", usecols=["barcodes", "label_id", "label"]) # Take the intersection of cells in annotation and aa = set(adata.obs['cell_id'].tolist()) bb = set(assignement_df["barcodes"].tolist()) if len(aa.symmetric_difference(bb)) != 0: warnings.warn( "The cells in the barcode and label files don't overlap perfectly.") if len(aa.difference(bb)) != 0: warnings.warn( f'{len(aa.difference(bb))} cells are in barcode.tsv and not in assignements.tsv: {aa.difference(bb)}') if len(bb.difference(aa)) != 0: warnings.warn( f'{len(bb.difference(aa))} cells are in assignements.tsv and not in barcode.tsv: {bb.difference(aa)}') # Add cell label data: adata.obs = pd.concat([adata.obs.set_index("cell_id"), assignement_df.set_index("barcodes")], axis=1, join="outer") adata.obs = adata.obs.rename_axis("cell_id").reset_index() except: warnings.warn("Assignement file assignments_rank_mx.tsv at location '"+filepath + "/mx_out/' could not be read") # Save adata to list of adatas adatas[ind] = adata # Remove none elements in adata adatas = [adata for adata in adatas if adata is not None] # Concatenate list of adatas adata = ad.concat(adatas, join="outer", merge="same") ## Remove cells with no label adata.obs["label"].value_counts(dropna=False) adata = adata[adata.obs["label"].notna()] adata = adata[adata.obs["label"] != "none", :] ## Remove cells with no database_id adata.obs["database_id"].value_counts(dropna=False) adata = adata[adata.obs["database_id"] != "none", :] adata.write(output_file) ```

As a note, the problem persists after removing batch effects with respect to samples using harmony:

sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sce.pp.harmony_integrate(adata, 'sample_id')

adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']
sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 15)
sc.tl.umap(adata)

sc.pl.umap(adata, color="label", save = "_label_umap_harmony.png")

umap_label_umap_harmony

sbooeshaghi commented 2 months ago

Hi, I noticed that you are using the cp10k normalization strategy before finding highly variable genes, scaling, pca, neighborhood graph creation, followed by UMAP. The cell type assignments with mx assign are built from the log1pPF normalization where the normalization factor is the average of the total umi counts per cell. This is generally computed on a dataset by dataset basis (and so is assignment). Can you try performing normalization with log1pPF and then assignment on a dataset by dataset basis?

goepp commented 2 months ago

Hi, I am not sure what log1pPF refers to. Could you provide a code example?

sbooeshaghi commented 2 months ago

https://github.com/cellatlas/mx/blob/4cf6e079d84bcf8bcee64e1ab35574f63cd894dc/mx/mx_normalize.py#L47-L52

cellatlas / human

Cell labels do not match UMAP clusters #6