Closed pchiang5 closed 11 months ago
I could proceed a bit by removing nonoverlapping topics:
from ctxcore.rnkdb import FeatherRankingDatabase
from pycistarget.utils import target_to_query
db = FeatherRankingDatabase(rankings_db, name='homo_sapiens')
db_regions = db.genes
for topic in region_sets['topics_otsu'].keys():
try:
overlapping_regions = target_to_query(
region_sets['topics_otsu'][topic],
list(db_regions),
fraction_overlap = 0.4)
except:
print(topic)
# Topic10
# Topic14
del region_sets['topics_otsu']['Topic10']
del region_sets['topics_otsu']['Topic14']
However, the same error appeared below:
from scenicplus.wrappers.run_pycistarget import run_pycistarget
run_pycistarget(
region_sets = region_sets,
species = 'homo_sapiens',
save_path = os.path.join(work_dir, 'motifs'),
ctx_db_path = rankings_db,
dem_db_path = scores_db,
path_to_motif_annotations = motif_annotation,
run_without_promoters = True,
n_cpu = 40,
# _temp_dir = os.path.join(tmp_dir, 'ray_spill'),
annotation_version = 'v10nr_clust',
)
> 2023-08-09 15:07:42,212 pycisTarget_wrapper INFO /mnt/c/Users/pc/Downloads/motifs folder already exists.
> 2023-08-09 15:07:43,496 pycisTarget_wrapper INFO Loading cisTarget database for topics_otsu
> 2023-08-09 15:07:43,497 cisTarget INFO Reading cisTarget database
> 2023-08-09 15:10:07,267 pycisTarget_wrapper INFO Running cisTarget for topics_otsu
> 2023-08-09 15:10:45,577 INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8272
> (ctx_internal_ray pid=1149855) 2023-08-09 15:10:55,864 cisTarget INFO Running cisTarget for Topic1 which has 2820 regions
> (ctx_internal_ray pid=1149877) 2023-08-09 15:11:01,083 cisTarget INFO Running cisTarget for Topic15 which has 1 regions [repeated 12x across cluster]
> (ctx_internal_ray pid=1149860) 2023-08-09 15:13:25,800 cisTarget INFO Running cisTarget for Topic15 which has 1 regions [repeated 2x across cluster]
> (ctx_internal_ray pid=1149883) 2023-08-09 15:13:30,649 cisTarget INFO Annotating motifs for Topic2
> (ctx_internal_ray pid=1149883) 2023-08-09 15:13:30,658 cisTarget INFO Unable to load annotation for homo_sapiens
> (ctx_internal_ray pid=1149859) 2023-08-09 15:13:27,286 cisTarget INFO Running cisTarget for Topic13 which has 2373 regions [repeated 2x across cluster]
> (ctx_internal_ray pid=1149883) 2023-08-09 15:13:31,224 cisTarget INFO Getting cistromes for Topic2
> (ctx_internal_ray pid=1149879) 2023-08-09 15:13:35,256 cisTarget INFO Annotating motifs for Topic12 [repeated 9x across cluster]
> (ctx_internal_ray pid=1149879) 2023-08-09 15:13:35,265 cisTarget INFO Unable to load annotation for homo_sapiens [repeated 9x across cluster]
> (ctx_internal_ray pid=1149878) 2023-08-09 15:13:36,091 cisTarget INFO Getting cistromes for Topic8 [repeated 9x across cluster]
> (ctx_internal_ray pid=1149867) 2023-08-09 15:13:36,553 cisTarget INFO Annotating motifs for Topic11
> (ctx_internal_ray pid=1149867) 2023-08-09 15:13:36,562 cisTarget INFO Unable to load annotation for homo_sapiens
> (ctx_internal_ray pid=1149867) 2023-08-09 15:13:37,070 cisTarget INFO Getting cistromes for Topic11
> 2023-08-09 15:14:21,253 cisTarget INFO Done!
> 2023-08-09 15:14:21,262 pycisTarget_wrapper INFO Created folder : /mnt/c/Users/pc/Downloads/motifs/CTX_topics_otsu_All
> 2023-08-09 15:14:21,610 pycisTarget_wrapper INFO Running cisTarget without promoters for topics_otsu
> ---------------------------------------------------------------------------
> AttributeError Traceback (most recent call last)
> Cell In[99], line 2
> 1 from scenicplus.wrappers.run_pycistarget import run_pycistarget
> ----> 2 run_pycistarget(
> 3 region_sets = region_sets,
> 4 species = 'homo_sapiens',
> 5 save_path = os.path.join(work_dir, 'motifs'),
> 6 ctx_db_path = rankings_db,
> 7 dem_db_path = scores_db,
> 8 path_to_motif_annotations = motif_annotation,
> 9 run_without_promoters = True,
> 10 n_cpu = 40,
> 11 # _temp_dir = os.path.join(tmp_dir, 'ray_spill'),
> 12 annotation_version = 'v10nr_clust',
> 13 )
>
> File /mnt/c/Users/pc/Downloads/scenicplus/src/scenicplus/wrappers/run_pycistarget.py:224, in run_pycistarget(region_sets, species, save_path, custom_annot, save_partial, ctx_db_path, dem_db_path, run_without_promoters, biomart_host, promoter_space, ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold, dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions, annotation, motif_similarity_fdr, path_to_motif_annotations, annotation_version, n_cpu, _temp_dir, exclude_motifs, exclude_collection, **kwargs)
> 222 regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()}
> 223 db_regions = set(pd.concat([ctx_db.regions_to_db[x] for x in ctx_db.regions_to_db.keys()])['Query'])
> --> 224 ctx_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
> 225 menr['CTX_'+key+'_No_promoters'] = run_cistarget(ctx_db = ctx_db,
> 226 region_sets = regions_np,
> 227 specie = species,
> (...)
> 236 annotation_version = annotation_version,
> 237 **kwargs)
> 238 out_folder = os.path.join(save_path,'CTX_'+key+'_No_promoters')
>
> File /mnt/c/Users/pc/Downloads/scenicplus/src/scenicplus/wrappers/run_pycistarget.py:224, in <dictcomp>(.0)
> 222 regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()}
> 223 db_regions = set(pd.concat([ctx_db.regions_to_db[x] for x in ctx_db.regions_to_db.keys()])['Query'])
> --> 224 ctx_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
> 225 menr['CTX_'+key+'_No_promoters'] = run_cistarget(ctx_db = ctx_db,
> 226 region_sets = regions_np,
> 227 specie = species,
> (...)
> 236 annotation_version = annotation_version,
> 237 **kwargs)
> 238 out_folder = os.path.join(save_path,'CTX_'+key+'_No_promoters')
>
> File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/utils.py:283, in target_to_query(target, query, fraction_overlap)
> 280 query_pr=query
> 282 join_pr = target_pr.join(query_pr, report_overlap = True)
> --> 283 join_pr.Overlap_query = join_pr.Overlap/(join_pr.End_b - join_pr.Start_b)
> 284 join_pr.Overlap_target = join_pr.Overlap/(join_pr.End - join_pr.Start)
> 285 join_pr = join_pr[(join_pr.Overlap_query > fraction_overlap) | (join_pr.Overlap_target > fraction_overlap)]
>
> File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/pyranges/pyranges_main.py:265, in PyRanges.__getattr__(self, name)
> 240 """Return column.
> 241
> 242 Parameters
> (...)
> 260 Name: Start, dtype: int64
> 261 """
> 263 from pyranges.methods.attr import _getattr
> --> 265 return _getattr(self, name)
>
> File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/pyranges/methods/attr.py:65, in _getattr(self, name)
> 63 return pd.concat([df[name] for df in self.values()])
> 64 else:
> ---> 65 raise AttributeError("PyRanges object has no attribute", name)
>
> AttributeError: ('PyRanges object has no attribute', 'Overlap')
Hi @pchiang5
I guess for some topics all regions overlap with promotor regions, you can check by running:
def get_species_annotation(species: str):
dataset = pbm.Dataset(name=species, host=biomart_host)
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
annot['Chromosome'] = annot['Chromosome'].astype('str')
filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
annot = annot[~filterf]
annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
annot = annot[annot.Transcript_type == 'protein_coding']
annot = annot.dropna(subset = ['Chromosome', 'Start'])
# Check if chromosomes have chr
check = region_sets[list(region_sets.keys())[0]]
if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
annot.Chromosome = annot.Chromosome.str.replace('chr', '')
if not any(['chr' in x for x in annot.Chromosome]):
annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
annot_dem=annot.copy()
# Define promoter space
annot['End'] = annot['Start'].astype(int)+promoter_space
annot['Start'] = annot['Start'].astype(int)-promoter_space
annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
return annot, annot_dem
annot, annot_dem = get_species_annotation('hsapiens_gene_ensembl')
for topic in region_sets['topics_otsu'].keys():
regions = region_sets['topics_otsu'][topic]
regions_overlaps = {
topic: region_sets["topic_otsu"][topic].count_overlaps(annot)
for topic in region_sets['topics_otsu'].keys()}
regions_np = {
topic: regions_overlaps[topic][regions_overlaps[topic].NumberOverlaps == 0][["Chromosome", "Start", "End"]]
for topic in region_sets['topics_otsu'].keys()}
regions_np
That being said, there should be a check in the code for this.
Best,
Seppe
Hi@SeppeDeWinter
All the topics have at least one region not in promoters (defined by +- 500bp). I tried to remove topic15 which only contains seven regions (6 in promoters), and it worked.
However, I got another immediate error:
ValueError Traceback (most recent call last) Cell In[128], line 2 1 from scenicplus.wrappers.run_pycistarget import run_pycistarget ----> 2 run_pycistarget( 3 region_sets = region_sets, 4 species = 'homo_sapiens', 5 save_path = os.path.join(work_dir, 'motifs'), 6 ctx_db_path = rankings_db, 7 dem_db_path = scores_db, 8 path_to_motif_annotations = motif_annotation, 9 run_without_promoters = True, 10 n_cpu = 1, 11 _temp_dir = os.path.join(tmp_dir, 'ray_spill'), 12 annotation_version = 'v10nr_clust', 13 )
File /mnt/c/Users/pc/Downloads/scenicplus/src/scenicplus/wrappers/run_pycistarget.py:256, in run_pycistarget(region_sets, species, save_path, custom_annot, save_partial, ctx_db_path, dem_db_path, run_without_promoters, biomart_host, promoter_space, ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold, dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions, annotation, motif_similarity_fdr, path_to_motif_annotations, annotation_version, n_cpu, _temp_dir, exclude_motifs, exclude_collection, **kwargs) 254 log.info('Running DEM for '+key) 255 regions = region_sets[key] --> 256 dem_db = DEMDatabase(dem_db_path, regions) 257 if exclude_motifs is not None: 258 out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/motif_enrichment_dem.py:67, in DEMDatabase.init(self, fname, region_sets, name, fraction_overlap) 48 def init(self, 49 fname: str, 50 region_sets: Dict[str, pr.PyRanges] = None, 51 name: str = None, 52 fraction_overlap: float = 0.4): 53 """ 54 Initialize DEMDatabase 55 (...) 65 Minimal overlap between query and regions in the database for the mapping. 66 """ ---> 67 self.regions_to_db, self.db_scores = self.load_db(fname, 68 region_sets, 69 name, 70 fraction_overlap)
File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/motif_enrichment_dem.py:111, in DEMDatabase.load_db(self, fname, region_sets, name, fraction_overlap) 109 if name is None: 110 name = os.path.basename(fname) --> 111 db = FeatherRankingDatabase(fname, name=name) 112 db_regions = db.genes 113 if region_sets is not None:
File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/rnkdb.py:109, in FeatherRankingDatabase.init(self, fname, name) 106 assert os.path.isfile(fname), """Database "{fname}" doesn't exist.""" 108 self._fname = fname --> 109 self.ct_db = CisTargetDatabase.init_ct_db( 110 ct_db_filename=self._fname, engine="pyarrow" 111 )
File /home/pc/miniconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:170, in CisTargetDatabase.init_ct_db(ct_db_filename, engine) 167 feather_v1_or_v2 = is_feather_v1_or_v2(ct_db_filename) 169 if not feather_v1_or_v2: --> 170 raise ValueError( 171 f'"{ct_db_filename}" is not a cisTarget Feather database in Feather v1 or v2 format.' 172 ) 173 elif feather_v1_or_v2 == 1: 174 raise ValueError( 175 f'"{ct_db_filename}" is a cisTarget Feather database in Feather v1 format, which is not supported ' 176 f'anymore. Convert them with "convert_cistarget_databases_v1_to_v2.py" ' 177 "(https://github.com/aertslab/create_cisTarget_databases/) to Feather v2 format." 178 )
ValueError: "/mnt/c/Users/pc/Downloads/hg38_screen_v10_clust.regions_vs_motifs.scores.feather" is not a cisTarget Feather database in Feather v1 or v2 format.
The file was downloaded from https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/screen/mc_v10_clust/region_based/
Does it mean I have to perform a conversion on the feather file? Thanks
The error was resolved after I re-downloaded the feather from the website.
Describe the bug Hi,
The error jumped out when I ran the following code.
To Reproduce
Error output
Expected behavior Like what showed in the tutorial
Version (please complete the following information):
Additional context Add any other context about the problem here.