Closed jyaacoub closed 3 months ago
Outcome: FAIL. Results are shown below in the comments next to each file; none of the parameter tweaking I tried reduced the number of gaps in the sequences.
# %%
from src.utils.af_clust import AF_Clust
from pathlib import Path
import logging
# Set the root logger's threshold to DEBUG for this session.
logging.getLogger().setLevel(logging.DEBUG)
HOME = Path.home()
pid = 'ABL1'  # sample identifier; also the stem of every MSA file name below
# Directory holding the .a3m alignments for this sample. It already ends with
# "/", so every path built from it below contains a doubled slash.
tmp_p = f"{HOME}/projects/tmp/{pid}/"
# One path per alignment file; the file-name suffixes record the options that
# were varied (cov, preeval, pregap, maxfilt, qid, prepre_smax).
# The trailing comment on each line is the recorded result: rem / total
# NOTE(review): presumably "rem" is the number of sequences remaining after
# AF_Clust's filtering and "total" the number of sequences in the MSA - confirm.
msa_uniref30_2023 = f"{tmp_p}/{pid}.a3m" # 2 / 4627
msa_uniref30_2023_v1 = f"{tmp_p}/{pid}_cov25.a3m" # 2 / 113
msa_uniref30_2023_v2 = f"{tmp_p}/{pid}_cov26_preeval200_pregap40.a3m" # 2 / 95
msa_uniref30_2023_v3 = f"{tmp_p}/{pid}_cov26_preeval200_pregap80_maxfilt40k.a3m" # 2 / 74
msa_uniref30_2023_v4 = f"{tmp_p}/{pid}_cov40_preeval100_pregap160_maxfilt50k.a3m"# 1 / 29
msa_uniref30_2023_v5 = f"{tmp_p}/{pid}_preeval100_pregap160_maxfilt50k.a3m" # 1 / 3461
msa_uniref30_2023_v6 = f"{tmp_p}/{pid}_preeval100_pregap160_maxfilt50k_qid25.a3m"# 1 / 3523
msa_uniref30_2023_v7 = f"{tmp_p}/{pid}_cov25_preeval50_pregap160_maxfilt50k.a3m" # 1 / 48
msa_uniref30_2023_v8 = f"{tmp_p}/{pid}_prepre_smax100.a3m" # 2 / 4772
msa_uniref30_2023_v9 = f"{tmp_p}/{pid}_preeval10.a3m" # 2 / 5359
msa_uniref30_2023_v10 = f"{tmp_p}/{pid}_preeval2k.a3m" # 2 / 4400
# Build an AF_Clust instance for the latest variant (v10, the "_preeval2k" file).
# NOTE(review): every cell in this script passes the same keyword and
# output_dir; confirm whether AF_Clust overwrites results from earlier runs.
af = AF_Clust(
    keyword="test-" + pid,
    input_msa=msa_uniref30_2023_v10,
    output_dir=f"{tmp_p}/af_clust",
)
#%%
# Baseline: the unsuffixed alignment file for this sample.
print("DAVIS sample -", pid)
print("UNICLUST30_2023:")
af = AF_Clust(
    keyword="test-" + pid,
    input_msa=msa_uniref30_2023,
    output_dir=f"{tmp_p}/af_clust",
)
#%%
# Variant v1: the "_cov25" alignment file.
print("UNICLUST30_2023_COV25:")
af = AF_Clust(
    keyword="test-" + pid,
    input_msa=msa_uniref30_2023_v1,
    output_dir=f"{tmp_p}/af_clust",
)
#%%
# Variant v2: the "_cov26_preeval200_pregap40" alignment file.
print("UNICLUST30_2023_COV26_PREEVAL200_PREGAP40:")
af = AF_Clust(
    keyword="test-" + pid,
    input_msa=msa_uniref30_2023_v2,
    output_dir=f"{tmp_p}/af_clust",
)
## CMD TO RUN HHBLITS ON /home/jean/projects/ABL1.fa
# hhblits -i /home/jean/projects/ABL1.fa -oa3m /home/jean/projects/ABL1.a3m -d /home/jean/projects/data/uniclust50_2018_08/uniclust50_2018_08 -n 3
#%%
from src.utils.seq_alignment import MSARunner

# `mine_p` is not defined anywhere in the visible part of this script, so this
# cell raised NameError when run in order. The hhblits command recorded in the
# comment just above this cell reads /home/jean/projects/ABL1.fa and writes
# /home/jean/projects/ABL1.a3m, i.e. the "projects" directory under HOME.
# NOTE(review): confirm this is the directory that was intended.
mine_p = f"{HOME}/projects"
in_fp = f"{mine_p}/{pid}.fa"
out_fp = f"{mine_p}/{pid}.a3m"

# return_cmd=True is passed so the call yields the hhblits command line
# (presumably without executing it - confirm against MSARunner.hhblits).
# NOTE(review): the dataset path is hard-coded to one user's home directory.
MSARunner.hhblits(
    in_fp,
    out_fp,
    dataset='/home/jean/projects/data/uniclust50_2018_08/uniclust50_2018_08',
    return_cmd=True,
)
# %%
# Prefilter options
# -pre_gap_open is the gap opening penalty for the prefiltering step. (default is 20)
# filter option applied to query MSA:
# -cov is a threshold for the coverage of the alignment (default is 0)
# hhblits -i <FASTA> -oa3m <A3M> -d <DB> -cpu 8 -n 2 -pre_evalue_thresh 500 -gapd 0.5 -cov 25
Using the Big Fantastic Database (https://bfd.mmseqs.com/) is the only option left; none of the other solutions worked.
Many of the matching sequences contain >25% gaps, so we should redo the MSA with hhblits using >uniref30.
Code: