STOmics / Stereopy

A toolkit of spatial transcriptomic analysis.
MIT License
184 stars 64 forks source link

Error in MSData.find_marker_genes #306

Closed Zjianglin closed 1 month ago

Zjianglin commented 1 month ago

Hi, I tried to find markers/DEG between clusters of integrated MSdata object. It seems the res_key option can no be set except default marker_genes.

I have seven Stereo-seq objects and made filter, merge, integrate and clustering. Here is my running code:

#stereo version is 1.3.1

data_dt = OrderedDict({ "WT": st.io.read_gef(geffp_mps['WT'], bin_type='cell_bins'), 
            "LowMM1479": st.io.read_gef(geffp_mps['LowMM1479'], bin_type='cell_bins'), 
            "LowMM1496": st.io.read_gef(geffp_mps['LowMM1496'], bin_type='cell_bins'), 
            "MedianLL1084": st.io.read_gef(geffp_mps['MedianLL1084'], bin_type='cell_bins'), 
            "Median1362": st.io.read_gef(geffp_mps['Median1362'], bin_type='cell_bins'), 
            "HighPT2203": st.io.read_gef(geffp_mps['HighPT2203'], bin_type='cell_bins'),
            "HighCF142": st.io.read_gef(geffp_mps['HighCF142'], bin_type='cell_bins')
          })

for kx, vx in data_dt.items():
    vx.cells['sample'] = kx
    print("{}({}) : {}\n".format(kx, geffp_mps[kx],  vx))

for kx, vx in data_dt.items():
    vx.tl.cal_qc()
    vx.tl.raw_checkpoint()
    ## filter cells
    vx.tl.filter_cells(min_gene=150, max_gene=4000, 
                       min_n_genes_by_counts=100, 
                       max_n_genes_by_counts=1500, 
                       pct_counts_mt=10, inplace=True)
    vx.tl.filter_genes(min_cell=3, inplace=True)

ms_data = MSData(_data_list=list(data_dt.values()), _names=list(data_dt.keys()), _relationship='other', _var_type='intersect')
ms_data.tl.raw_checkpoint()
print(ms_data.tl.raw)

ms_data.tl.normalize_total(scope=slice_generator[:], mode='integrate')
ms_data.tl.log1p(scope=slice_generator[:], mode='integrate')
ms_data.tl.pca(scope=slice_generator[:], mode='integrate', use_highly_genes=False, n_pcs=50, res_key='pca')
ms_data.tl.batches_integrate(scope=slice_generator[:], mode='integrate', pca_res_key='pca', res_key='pca_integrated')

data.tl.neighbors(scope=slice_generator[:], mode='integrate', pca_res_key='pca_integrated', n_pcs=50, res_key='neighbors_integrated')
data.tl.umap(scope=slice_generator[:], mode='integrate', pca_res_key='pca_integrated', neighbors_res_key='neighbors_integrated', res_key='umap_integrated')
data.plt.batches_umap(scope=slice_generator[:], mode='integrate', res_key='umap_integrated')
#Clustering
ms_data.tl.leiden(scope=slice_generator[:], mode='integrate', neighbors_res_key='neighbors_integrated',
                  resolution=1.5, res_key='leiden1.5')
ms_data.tl.leiden(scope=slice_generator[:], mode='integrate', neighbors_res_key='neighbors_integrated',
                  resolution=1.0, res_key='leiden1.0')

cls_key = 'leiden1.5'
#https://stereopy.readthedocs.io/en/latest/Tutorials(Multi-sample)/Comparative_Analysis.html#Marker-for-tissue
ms_data.tl.find_marker_genes(cluster_res_key=cls_key, use_highly_genes=False, use_raw =False,
#                             res_key='marker_genes_{}'.format(cls_key)) #_{}
                            res_key='marker_genes') #_{}
print(ms_data)

When I run the ms_data.tl.find_marker_genes with res_key='marker_genes_{}'.format(cls_key) to set the DEG result of laiden1.5 to a marker_genes_laiden1.5, it crash out as below:

[2024-07-12 17:31:36][Stereo][73872][MainThread][23448250165056][ms_pipeline][167][INFO]: data_obj(idx=0) in ms_data start to run find_marker_genes
[2024-07-12 17:31:36][Stereo][73872][MainThread][23448250165056][st_pipeline][41][INFO]: start to run find_marker_genes...
[2024-07-12 17:32:11][Stereo][73872][MainThread][23448250165056][tool_base][122][INFO]: read group information, grouping by group column.
[2024-07-12 17:32:11][Stereo][73872][MainThread][23448250165056][tool_base][151][INFO]: start to run...
[2024-07-12 17:32:39][Stereo][73872][MainThread][23448250165056][tool_base][153][INFO]: end to run.
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[55], line 7
      5 cls_key = 'leiden1.5'
      6 #https://stereopy.readthedocs.io/en/latest/Tutorials(Multi-sample)/Comparative_Analysis.html#Marker-for-tissue
----> 7 ms_data.tl.find_marker_genes(cluster_res_key=cls_key, use_highly_genes=False, use_raw =False,
      8                             res_key='marker_genes_{}'.format(cls_key)) #_{}
      9 #                             res_key='marker_genes') #_{}
     10 print(ms_data)

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/ms_pipeline.py:266, in MSDataPipeLine.__getattr__.<locals>.temp(*args, **kwargs)
    263     kwargs["mode"] = self.__mode
    265 if kwargs["mode"] == "integrate":
--> 266     return self._use_integrate_method(item, *args, **kwargs)
    267 elif kwargs["mode"] == "isolated":
    268     self._run_isolated_method(item, *args, **kwargs)

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/ms_pipeline.py:168, in MSDataPipeLine._use_integrate_method(self, item, *args, **kwargs)
    165             return new_attr(*args, **kwargs)
    167 logger.info(f'data_obj(idx=0) in ms_data start to run {item}')
--> 168 return new_attr(
    169     ms_data_view.merged_data.__getattribute__(self.__class__.ATTR_NAME),
    170     *args,
    171     **kwargs
    172 )

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/st_pipeline.py:43, in logit.<locals>.wrapped(*args, **kwargs)
     41 logger.info('start to run {}...'.format(func.__name__))
     42 tk = tc.start()
---> 43 res = func(*args, **kwargs)
     44 logger.info('{} end, consume time {:.4f}s.'.format(func.__name__, tc.get_time_consumed(key=tk, restart=False)))
     45 return res

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/st_pipeline.py:1101, in StPipeline.find_marker_genes(self, cluster_res_key, method, case_groups, control_groups, corr_method, use_raw, use_highly_genes, hvg_res_key, res_key, output, sort_by, n_genes, ascending, n_jobs)
   1093 result = tool.result
   1094 result['parameters'] = {
   1095     'cluster_res_key': cluster_res_key,
   1096     'method': method,
   (...)
   1099     'use_raw': use_raw
   1100 }
-> 1101 self.result[res_key] = result
   1102 if output is not None:
   1103     import natsort

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/result.py:175, in Result.__setitem__(self, key, value)
    172 for name_type, name_dict in Result.TYPE_NAMES_DICT.items():
    173     for like_name in name_dict:
    174         # if not key.startswith('gene_exp_') and like_name in key and self._real_set_item(name_type, key, value):
--> 175         if like_name in key and self._real_set_item(name_type, key, value):
    176             return
    177 if type(value) is pd.DataFrame:

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/result.py:150, in Result._real_set_item(self, type, key, value)
    148         if key.startswith(prefix):
    149             return False
--> 150     self._set_cluster_res(key, value)
    151 elif type == Result.CONNECTIVITY:
    152     self._set_connectivities_res(key, value)

File ~/miniforge3/envs/stomicsPy38/lib/python3.8/site-packages/stereo/core/result.py:198, in Result._set_cluster_res(self, key, value)
    197 def _set_cluster_res(self, key, value):
--> 198     assert type(value) is pd.DataFrame and 'group' in value.columns.values, "this is not cluster res"
    199     # warn(
    200     #     f'FutureWarning: {key} will be moved from `StereoExpData.tl.result` to `StereoExpData.cells` in the '
    201     #     'future, make sure your code set the property correctly. ',
    202     #     category=FutureWarning
    203     # )
    204     self.__stereo_exp_data.cells._obs[key] = value['group'].values

AssertionError: this is not cluster res

I have two questions:

  1. my workflow of integration, cluster and find DEG is correct or not? and which data should I use for DEG between laiden1.5 clusters? i.e, should I use the normalize_total + log1p processed data or the use_raw=True from ms_data.tl.raw_checkpoint() before normalize_total?
  2. does find_marker_genes function can only save a res_key as marker_genes? Could we set it as our demand? Thanks.
tanliwei-coder commented 1 month ago

Don't use leiden in find_marker_genes res_key, because of some reasons, it will be considered as a cluster result.

Similarly, louvain, phenograph, annotation, celltype and cell_type will also cause this error.

We can set the find_marker_genes res_key as a string that starts with marker_genes but doesn't contain the key words mentioned above.

Whether to use normalized data or raw data depends on the calculating effect, you can try to use both to see which is more effective.

Zjianglin commented 1 month ago

Thanks for your reply @tanliwei-coder . The res_key now can be set without leiden in it.

As for the use_raw for the find_marker_genes, it has the use_raw (bool) – whether to use raw express matrix for analysis, default True. in API page. I see that the similar function rank_genes_groups in scanpy noted that Expects logarithmized data. .

Because the raw data backup code position is different in tutorial codes between scanpy and stereopy . I'd like to know for the find_marker_genes in stereopy, which kind of data is expected in the official method code implementation? i.e. the raw means the entirely raw from read_gef without any preprocess or the logarithmized data from normalize_total + log1p?

tanliwei-coder commented 1 month ago

What the raw data is depends on when you run the data.tl.raw_checkpoint, raw_checkpoint will saves the current state of data, that is to say, if you run raw_checkpoint after read_gef, it will saves the data without any processing, and if you run raw_checkpoint after normalize_total or log1p, the raw is the one which has been normalized or logarithmized, normally, in some workflows of my company, we set the use_raw as True, because the normalized data sometimes may cause some calculating errors which is casued by the internal logic of the algorithm.

Zjianglin commented 1 month ago

Thanks for your reply. I tried it with use_raw as False. After filter_marker_genes, the results for some clusters is all NaN except genes column. is this means there is no DEG for this culster?

ms_data.tl.find_marker_genes(cluster_res_key=cls_key, use_highly_genes=False, use_raw =False,
                            res_key='marker_genes_{}'.format(cls_key.replace('leiden', 'ld')))
ms_data.tl.filter_marker_genes(marker_genes_res_key='marker_genes_{}'.format(cls_key.replace('leiden', 'ld')), 
                              min_fold_change=0.5, min_in_group_fraction=0.25, max_out_group_fraction=0.5,
                              compare_abs=False, res_key='marker_genes_filtered_{}'.format(cls_key.replace('leiden', 'ld')),
                              output=ofp)

print( ms_data.merged_data.tl.result['marker_genes_filtered_ld1.5'].keys())
print( ms_data.merged_data.tl.result['marker_genes_filtered_ld1.5']['1.vs.rest'].head())
print( ms_data.merged_data.tl.result['marker_genes_filtered_ld1.5']['1.vs.rest'].isna().sum())

#==>

dict_keys(['parameters', 'pct', 'pct_rest', 'mean_count', '4.vs.rest', '1.vs.rest', '2.vs.rest', '3.vs.rest', '6.vs.rest', '7.vs.rest', '8.vs.rest', '5.vs.rest', '11.vs.rest', '10.vs.rest', '12.vs.rest', '9.vs.rest', '15.vs.rest', '13.vs.rest', '14.vs.rest'])
   scores  pvalues  pvalues_adj  log2fc      genes  pct  pct_rest  mean_count
0     NaN      NaN          NaN     NaN  Serpinb6a  NaN       NaN         NaN
1     NaN      NaN          NaN     NaN      Crip1  NaN       NaN         NaN
2     NaN      NaN          NaN     NaN      Itgav  NaN       NaN         NaN
3     NaN      NaN          NaN     NaN       Spp1  NaN       NaN         NaN
4     NaN      NaN          NaN     NaN     Col1a1  NaN       NaN         NaN
scores         18003
pvalues        18003
pvalues_adj    18003
log2fc         18003
genes              0
pct            18003
pct_rest       18003
mean_count     18003
dtype: int64
tanliwei-coder commented 1 month ago

Thanks for your reply. I tried it with use_raw as False. After filter_marker_genes, the results for some clusters is all NaN except genes column. is this means there is no DEG for this culster?

ms_data.tl.find_marker_genes(cluster_res_key=cls_key, use_highly_genes=False, use_raw =False,
                            res_key='marker_genes_{}'.format(cls_key.replace('leiden', 'ld')))
ms_data.tl.filter_marker_genes(marker_genes_res_key='marker_genes_{}'.format(cls_key.replace('leiden', 'ld')), 
                              min_fold_change=0.5, min_in_group_fraction=0.25, max_out_group_fraction=0.5,
                              compare_abs=False, res_key='marker_genes_filtered_{}'.format(cls_key.replace('leiden', 'ld')),
                              output=ofp)

print( ms_data.merged_data.tl.result['marker_genes_filtered_ld1.5'].keys())
print( ms_data.merged_data.tl.result['marker_genes_filtered_ld1.5']['1.vs.rest'].head())
print( ms_data.merged_data.tl.result['marker_genes_filtered_ld1.5']['1.vs.rest'].isna().sum())

#==>

dict_keys(['parameters', 'pct', 'pct_rest', 'mean_count', '4.vs.rest', '1.vs.rest', '2.vs.rest', '3.vs.rest', '6.vs.rest', '7.vs.rest', '8.vs.rest', '5.vs.rest', '11.vs.rest', '10.vs.rest', '12.vs.rest', '9.vs.rest', '15.vs.rest', '13.vs.rest', '14.vs.rest'])
   scores  pvalues  pvalues_adj  log2fc      genes  pct  pct_rest  mean_count
0     NaN      NaN          NaN     NaN  Serpinb6a  NaN       NaN         NaN
1     NaN      NaN          NaN     NaN      Crip1  NaN       NaN         NaN
2     NaN      NaN          NaN     NaN      Itgav  NaN       NaN         NaN
3     NaN      NaN          NaN     NaN       Spp1  NaN       NaN         NaN
4     NaN      NaN          NaN     NaN     Col1a1  NaN       NaN         NaN
scores         18003
pvalues        18003
pvalues_adj    18003
log2fc         18003
genes              0
pct            18003
pct_rest       18003
mean_count     18003
dtype: int64

That is the result after filtering via ms_data.tl.filter_marker_genes, it means that all the recoads are filtered out by the conditions you set.

Zjianglin commented 1 month ago

Thanks for your reply. As the thresholds were relatively small, it may suggest that the cellbins are too similar to distinguish them.