Open jpjarnoux opened 1 month ago
Hi!
I share part of my script to ensure that it's not me who is wrong here.
def digit_family_sequences(pangenome: Pangenome,
disable_bar: bool = False) -> tuple[List[DigitalSequence], bool]:
"""
Digitalised pangenome gene family sequences for HMM alignment
Args:
pangenome: Pangenome object with gene families
disable_bar: Flag to disable the progress bar
Returns:
List[pyhmmer.easel.Sequence]: list of digitalised gene family sequences
"""
sequences = []
logging.getLogger("PANORAMA").info("Begin to digitalized gene families sequences...")
for family in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families, unit="gene families",
desc="Digitalized gene families sequences", disable=disable_bar):
bit_name = family.name.encode('UTF-8')
sequence = family.sequence if family.HMM is None else family.HMM.consensus.upper()
if sequence[-1] == '*':
sequence = sequence[:-1]
seq = TextSequence(name=bit_name, sequence=sequence)
sequences.append(seq.digitize(Alphabet.amino()))
available_memory = psutil.virtual_memory().available
return sequences, True if sys.getsizeof(sequences) < available_memory else False
def annot_with_hmmsearch(hmms: Dict[str, List[HMM]], gf_sequences: SequenceBlock, meta: pd.DataFrame = None,
threads: int = 1, disable_bar: bool = False
) -> Tuple[List[Tuple[str, str, str, float, float, float, float, str, str]], List[TopHits]]:
"""
Compute HMMer alignment between gene families sequences and HMM
Args:
hmms: List of HMM classified by bit_cutoffs
gf_sequences: List of digitalized gene families sequences
meta: Metadata associate with HMM
threads: Number of available threads
disable_bar: Disable progress bar
Returns:
Alignment results
"""
def hmmsearch_callback(hmm, _):
"""HMMSearch callback function for debugging
"""
logging.getLogger('PANORAMA').debug(f"Finished annotation with HMM {hmm.name.decode()}")
bar.update()
res = []
result = namedtuple("Result", res_col_names)
logging.getLogger("PANORAMA").info("Begin alignment to HMM with HMMSearch")
seq_file = SequenceFile("P.aeruginosa/all_protein_families.faa", digital=True)
with tqdm(total=sum(len(hmm_list) for hmm_list in hmms.values()), unit="hmm",
desc="Align target to HMM", disable=disable_bar) as bar:
all_top_hits = []
for cutoff, hmm_list in hmms.items():
options = {}
if cutoff != "other":
options["bit_cutoffs"] = cutoff
for top_hits in hmmsearch(hmm_list, seq_file, cpus=threads, callback=hmmsearch_callback, **options):
all_top_hits.append(top_hits)
for hit in top_hits.included:
assign = assign_hit(hit, meta)
if assign is not None:
res.append(result(*assign))
return res, all_top_hits
sequences, fit_memory = digit_family_sequences(pangenome, disable_bar=disable_bar)
if fit_memory:
logging.getLogger("PANORAMA").debug("Launch pyHMMer-HMMSearch")
sequence_block = DigitalSequenceBlock(alphabet=Alphabet.amino(), iterable=sequences)
res, all_top_hits = annot_with_hmmsearch(hmms, sequence_block, meta, threads, disable_bar)
else:
logging.getLogger("PANORAMA").debug("Launch pyHMMer-HMMScan")
res, all_top_hits = annot_with_hmmscan(hmms, sequences, meta, threads, tmp, disable_bar)
Hi @jpjarnoux, if the E-values are different but the P-values are the same, then it means the only difference between the two runs is the Z
and domZ
parameters, which are normally counted by how many sequences are contained in the target database, so the number of HMMs (with hmmsearch
) or the number of sequences (with hmmscan
). This is documented a bit more here: https://pyhmmer.readthedocs.io/en/stable/examples/performance_tips.html#Search-and-scan
This means in the above example block, you wont get the same E-values in the annot_with_hmmscan
and annot_with_hmmsearch
unless you set the Z
and domZ
parameters to the same value. It should be always the same value across calls to your tool, so this probably should be set to your HMM database size.
By the way, sys.getsizeof(sequences)
will not give you the total memory footprint of the sequences, only of the list that stores the references to the TextSequence
objects. You could do sum(map(sys.getsizeof, sequences))
to sum the size of each sequence instead.
Hi Thanks for your reply and sorry for my delay. I set the Z parameters as you suggest and problem solve thanks.
Thanks for the tips too.
Hi! I'm having some trouble understanding where this can come from. I tried to align a sequence with HMMER and filter on the e-value at 1e-05. With HMMER, I have this result in the domtblout:
With pyHMMER:
All values are the same except e-value, i-evalue, c-evalue.
Maybe I missed the information, but is it expected to be different?
I'm using pyhmmer=0.10.7=py310h4b81fae_0 and hmmer=3.4
Thanks for your help