Genentech / gReLU

gReLU is a python library to train, interpret, and apply deep learning models to DNA sequences.
https://genentech.github.io/gReLU/
MIT License
230 stars 23 forks source link

accessing genomes using genomepy #22

Closed mhfzsharmin closed 4 months ago

mhfzsharmin commented 4 months ago
  1. The genomepy issue occurred when I tried to run from my local machine.
  2. The genomepy issue was fixed when I started using the company cluster. The genome FASTA was available, but it did not get the annotation.gtf file, which caused the following error:

exons = grelu.io.genome.read_gtf("hg38", features="exon")


16:46:31 | WARNING | Could not find 'hg38.annotation.bed(.gz)' in directory /gstore/home/sharmim1/.local/share/genomes/hg38. Methods using this file won't work! 16:46:31 | WARNING | Could not find 'hg38.annotation.gtf(.gz)' in directory /gstore/home/sharmim1/.local/share/genomes/hg38. Methods using this file won't work! Genome annotation files not found. Installing genome annotation files. 16:46:32 | INFO | Downloading the ncbiRefSeq annotation from the UCSC MySQL database.

FileNotFoundError Traceback (most recent call last) File ~/.conda/envs/grelu/lib/python3.11/site-packages/grelu/io/genome.py:73, in read_gtf(genome, features) 72 try: ---> 73 gtf = genomepy.Annotation(genome).named_gtf 74 except FileNotFoundError:

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/annotation/__init__.py:115, in Annotation.__getattribute__(self, name) 114 elif name == "named_gtf": --> 115 val = self.gtf.join( 116 self.from_attributes("gene_name"), how="inner" 117 ).set_index("gene_name") 118 setattr(self, name, val)

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/annotation/__init__.py:110, in Annotation.__getattribute__(self, name) 109 elif name == "gtf": --> 110 _check_property(self.annotation_gtf_file, f"{self.name}.annotation.gtf") 111 val = read_annot(self.annotation_gtf_file)

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/annotation/utils.py:44, in _check_property(prop, fname) 43 if prop is None: ---> 44 raise FileNotFoundError(f"'{fname}' required.")

FileNotFoundError: 'hg38.annotation.gtf' required.

During handling of the above exception, another exception occurred:

OSError Traceback (most recent call last) File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/providers/ucsc.py:343, in UcscProvider.download_annotation(self, name, genomes_dir, localname, **kwargs) 342 try: --> 343 download_annotation(name, annot, genomes_dir, localname) 344 logger.info("Annotation download successful")

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/providers/ucsc.py:549, in download_annotation(name, annot, genomes_dir, localname, n) 545 """ 546 Download the extended genePred file from the UCSC MySQL database. 547 Next convert this to a BED and GTF file. 548 """ --> 549 check_ucsc_tools() 551 out_dir = os.path.join(genomes_dir, localname)

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/utils.py:179, in check_ucsc_tools(tools) 178 if shutil.which(tool) is None: --> 179 raise OSError( 180 "Installing gene annotations requires missing UCSC tools. " 181 "See https://github.com/vanheeringen-lab/genomepy#pip " 182 "for download links and details." 183 )

OSError: Installing gene annotations requires missing UCSC tools. See https://github.com/vanheeringen-lab/genomepy#pip for download links and details.

During handling of the above exception, another exception occurred:

GenomeDownloadError Traceback (most recent call last) Cell In[14], line 3 1 import grelu.io.genome ----> 3 exons = grelu.io.genome.read_gtf("hg38", features="exon") 4 exons.head(3)

File ~/.conda/envs/grelu/lib/python3.11/site-packages/grelu/io/genome.py:76, in read_gtf(genome, features) 74 except FileNotFoundError: 75 print("Genome annotation files not found. Installing genome annotation files.") ---> 76 genomepy.install_genome(genome, only_annotation=True) 77 gtf = genomepy.Annotation(genome).named_gtf 79 gtf = gtf.reset_index()

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/functions.py:264, in install_genome(name, provider, genomes_dir, localname, mask, keep_alt, regex, invert_match, bgzip, annotation, only_annotation, skip_matching, skip_filter, threads, force, **kwargs) 262 if force: 263 _delete_extensions(out_dir, ["annotation.gtf", "annotation.bed"]) --> 264 provider.download_annotation(name, genomes_dir, localname=localname, **kwargs) 265 annotation_downloaded = bool( 266 glob_ext_files(out_dir, "annotation.gtf") 267 ) and bool(glob_ext_files(out_dir, "annotation.bed")) 269 if annotation_downloaded:

File ~/.conda/envs/grelu/lib/python3.11/site-packages/genomepy/providers/ucsc.py:346, in UcscProvider.download_annotation(self, name, genomes_dir, localname, **kwargs) 344 logger.info("Annotation download successful") 345 except Exception as e: --> 346 raise GenomeDownloadError( 347 f"An error occured while installing the gene annotation for {name} from {self.name}.\n" 348 "If you think the annotation should be there, please file a bug report at: " 349 "https://github.com/vanheeringen-lab/genomepy/issues\n\n" 350 f"Error: {e.args[0]}" 351 ) 353 # Add annotation URL to readme 354 readme = os.path.join(genomes_dir, localname, "README.txt")

GenomeDownloadError: An error occured while installing the gene annotation for hg38 from UCSC. If you think the annotation should be there, please file a bug report at: https://github.com/vanheeringen-lab/genomepy/issues

Error: Installing gene annotations requires missing UCSC tools. See https://github.com/vanheeringen-lab/genomepy#pip for download links and details.

gokceneraslan commented 4 months ago

This typically happens either when the additional UCSC dependencies are not installed (please see the genomepy installation instructions linked in the error message above) or when the machine cannot access the MySQL ports because of a firewall. So it's very likely that this is not a gReLU bug.

mhfzsharmin commented 4 months ago

Resolved with a manual hg38 installation via genomepy.