import os
import shutil
import subprocess
import pandas as pd
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio import SeqIO
from Bio.Blast.NCBIXML import parse
# MMseqs clustering function
def run_mmseqs_clustering(db_path, output_dir, params):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
else:
shutil.rmtree(output_dir)
os.makedirs(output_dir)
cluster_out = os.path.join(output_dir, "mmseq_clu")
tmp_dir = os.path.join(output_dir, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
cmd = ["mmseqs", "cluster", db_path, cluster_out, tmp_dir] + params
subprocess.run(cmd, check=True)
# Convert to TSV
tsv_out = os.path.join(output_dir, "mmseq_clu.tsv")
subprocess.run(["mmseqs", "createtsv", db_path, db_path, cluster_out, tsv_out], check=True)
return cluster_out, tsv_out
def params_to_cmd_args(params):
args = []
for key, value in params.items():
args.append(str(key))
args.append(str(value))
return args
def local_blastp_query(input_fasta, db, output_path, exclusion_list):
cmd = [
'blastp', '-db', db, '-query', input_fasta,
'-evalue', '1.0', '-outfmt', '5', '-out', output_path,
'-num_threads', '32', '-word_size', '3',
'-matrix', 'BLOSUM62', '-qcov_hsp_perc', '80', '-negative_seqidlist', exclusion_list
]
subprocess.run(cmd, check=True)
def main():
DB_PATH = './mms_smallDB'
OUTPUT_DIR = './mmseqs_output'
if os.path.exists(OUTPUT_DIR):
shutil.rmtree(OUTPUT_DIR)
# Input parameters for MMseqs
MMSEQS_PARAMS = {
'--cluster-mode': 1,
'--cluster-steps': 3,
'--cluster-reassign': 1,
'--cov-mode': 0,
'-c': 0.8,
'-e': 0.001,
'-s': 7,
'--min-seq-id': 0.4,
'--threads': 40,
'--max-seqs': 200,
'--max-iterations': 1000,
'--alignment-mode': 3,
'--similarity-type': 2
}
params = params_to_cmd_args(MMSEQS_PARAMS)
# Cluster using MMseqs
cluster_out, tsv_out = run_mmseqs_clustering(DB_PATH, OUTPUT_DIR, params)
# Map sequences to their respective clusters
cluster_mapping = {}
with open(tsv_out, 'r') as f:
for line in f:
cluster, sequence = line.strip().split('\t')
cluster_mapping[sequence] = cluster
# Sample a sequence from 20 distinct clusters
sampled_sequences = {}
for sequence, cluster in cluster_mapping.items():
if cluster not in sampled_sequences and len(sampled_sequences) < 20:
sampled_sequences[cluster] = sequence
# Create exclusion list
exclusion_list = os.path.join(OUTPUT_DIR, "exclusion_list.txt")
with open(exclusion_list, 'w') as f:
for sequence, cluster in cluster_mapping.items():
if cluster in sampled_sequences:
f.write(f"{sequence}\n")
# write query sequences to a file
# we need to go get the sequences from the original database
index = SeqIO.index('./mmseqs_input.fasta', "fasta")
query_sequences = os.path.join(OUTPUT_DIR, "blast_query_sequences.fasta")
with open (query_sequences, 'w') as f:
for _, sequence in sampled_sequences.items():
f.write(f">{sequence}\n{index[sequence].seq}\n")
# Blast the sampled sequences against the MMseqs clustered database
blast_output = os.path.join(OUTPUT_DIR, "blast_output.xml")
local_blastp_query(query_sequences, './blast_smallDB', blast_output, exclusion_list)
# Parse and return alignments
es = []
perc_identities = []
records_iter = parse(open(blast_output, 'r'))
for record in records_iter:
# get only the best alignment and hsp for each query
# (there should only be one)
try:
alignment = record.alignments[0]
except IndexError:
es.append(None)
perc_identities.append(None)
continue
hsp = alignment.hsps[0]
e = hsp.expect
# check if both strands are covered
# calculate average coverege
coverage = (hsp.query_end - hsp.query_start + 1 + hsp.sbjct_end - hsp.sbjct_start + 1) / (alignment.length + record.query_length)
if coverage < 0.9:
es.append(None)
perc_identities.append(None)
# check that the first hsp of the first align is indeed the best
for align in record.alignments:
for hsp in align.hsps:
if hsp.expect < e:
raise ValueError('Not the best hsp')
es.append(e)
perc_identities.append(hsp.identities / ((alignment.length + record.query_length)/2))
num_aligned = len([e for e in es if e is not None])
# remove nans
es = [e for e in es if e is not None]
perc_identities = [p for p in perc_identities if p is not None]
mean_percid = sum(perc_identities)/len(perc_identities)
mean_e = sum(es)/len(es)
max_percid = max(perc_identities)
min_e = min(es)
result_dict = MMSEQS_PARAMS
result_dict['num_aligned'] = num_aligned
result_dict['mean_percid'] = mean_percid
result_dict['mean_e'] = mean_e
result_dict['max_percid'] = max_percid
result_dict['min_e'] = min_e
# load the current results (tsv) and append the new results
results_path = os.path.join('./', "results.tsv")
if os.path.exists(results_path):
results = pd.read_csv(results_path, sep='\t')
results = results.append(result_dict, ignore_index=True)
else:
results = pd.DataFrame([result_dict])
results.to_csv(results_path, sep='\t', index=False)
if __name__ == "__main__":
main()
MMseqs Output (for bugs)
Here is one output example, though I have run the above script varying the parameters for a number of params.
cluster ./mms_smallDB ./mmseqs_output/mmseq_clu ./mmseqs_output/tmp --cluster-mode 1 --cluster-steps 3 --cluster-reassign 1 --cov-mode 0 -c 0.8 -e 0.001 -s 7 --min-seq-id 0.4 --threads 40 --max-seqs 200 --max-iterations 1000 --alignment-mode 3 --similarity-type 2
MMseqs Version: 14.7e284
Substitution matrix aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix aa:VTML80.out,nucl:nucleotide.out
Sensitivity 7
k-mer length 0
k-score seq:2147483647,prof:2147483647
Alphabet size aa:21,nucl:5
Max sequence length 65535
Max results per query 200
Split database 0
Split mode 2
Split memory limit 0
Coverage threshold 0.8
Coverage mode 0
Compositional bias 1
Compositional bias 1
Diagonal scoring true
Exact k-mer matching 0
Mask residues 1
Mask residues probability 0.9
Mask lower case residues 0
Minimum diagonal score 15
Selected taxa
Include identical seq. id. false
Spaced k-mers 1
Preload mode 0
Pseudo count a substitution:1.100,context:1.400
Pseudo count b substitution:4.100,context:5.800
Spaced k-mer pattern
Local temporary path
Threads 40
Compressed 0
Verbosity 3
Add backtrace false
Alignment mode 3
Alignment mode 0
Allow wrapped scoring false
E-value threshold 0.001
Seq. id. threshold 0.4
Min alignment length 0
Seq. id. mode 0
Alternative alignments 0
Max reject 2147483647
Max accept 2147483647
Score bias 0
Realign hits false
Realign score bias -0.2
Realign max seqs 2147483647
Correlation score weight 0
Gap open cost aa:11,nucl:5
Gap extension cost aa:1,nucl:2
Zdrop 40
Rescore mode 0
Remove hits by seq. id. and coverage false
Sort results 0
Cluster mode 1
Max connected component depth 1000
Similarity type 2
Single step clustering false
Cascaded clustering steps 3
Cluster reassign true
Remove temporary files false
Force restart with latest tmp false
MPI runner
k-mers per sequence 21
Scale k-mers per sequence aa:0.000,nucl:0.200
Adjust k-mer length false
Shift hash 67
Include only extendable false
Skip repeating k-mers false
Connected component clustering produces less clusters in a single step clustering.
Please use --single-step-clusterlinclust ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/clu_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 1 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --alph-size aa:13,nucl:5 --kmer-per-seq 21 --spaced-kmer-mode 1 --kmer-per-seq-scale aa:0.000,nucl:0.200 --adjust-kmer-len 0 --mask 0 --mask-prob 0.9 --mask-lower-case 0 -k 0 --hash-shift 67 --split-memory-limit 0 --include-only-extendable 0 --ignore-multi-kmer 0 --rescore-mode 0 --filter-hits 0 --sort-results 0 --remove-tmp-files 0 --force-reuse 0
kmermatcher ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --alph-size aa:13,nucl:5 --min-seq-id 0.4 --kmer-per-seq 21 --spaced-kmer-mode 1 --kmer-per-seq-scale aa:0.000,nucl:0.200 --adjust-kmer-len 0 --mask 0 --mask-prob 0.9 --mask-lower-case 0 --cov-mode 0 -k 0 -c 0.8 --max-seq-len 65535 --hash-shift 67 --split-memory-limit 0 --include-only-extendable 0 --ignore-multi-kmer 0 --threads 40 --compressed 0 -v 3
kmermatcher ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --alph-size aa:13,nucl:5 --min-seq-id 0.4 --kmer-per-seq 21 --spaced-kmer-mode 1 --kmer-per-seq-scale aa:0.000,nucl:0.200 --adjust-kmer-len 0 --mask 0 --mask-prob 0.9 --mask-lower-case 0 --cov-mode 0 -k 0 -c 0.8 --max-seq-len 65535 --hash-shift 67 --split-memory-limit 0 --include-only-extendable 0 --ignore-multi-kmer 0 --threads 40 --compressed 0 -v 3
Database size: 100000 type: Aminoacid
Reduced amino acid alphabet: (A S T) (C) (D B N) (E Q Z) (F Y) (G) (H) (I V) (K R) (L J M) (P) (W) (X)
Generate k-mers list for 1 split
[=================================================================] 100.00K 0s 853ms
Sort kmer 0h 0m 0s 789ms
Sort by rep. sequence 0h 0m 0s 951ms
Time for fill: 0h 0m 0s 155ms
Time for merging to pref: 0h 0m 0s 0ms
Time for processing: 0h 0m 3s 166ms
rescorediagonal ./mms_smallDB ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_rescore1 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --rescore-mode 0 --wrapped-scoring 0 --filter-hits 0 -e 0.001 -c 0.8 -a 0 --cov-mode 0 --min-seq-id 0.5 --min-aln-len 0 --seq-id-mode 0 --add-self-matches 0 --sort-results 0 --db-load-mode 0 --threads 40 --compressed 0 -v 3
[=================================================================] 100.00K 0s 206ms
Time for merging to pref_rescore1: 0h 0m 0s 757ms
Time for processing: 0h 0m 1s 988ms
clust ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_rescore1 ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pre_clust --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3
Clustering mode: Connected Component
[=================================================================] 100.00K 0s 367ms
Sort entries
Find missing connections
Found 245160 new connections.
Reconstruct initial order
[=================================================================] 100.00K 0s 304ms
Add missing connections
[=================================================================] 100.00K 0s 8ms
Time for read in: 0h 0m 1s 971ms
connected component mode
Total time: 0h 0m 3s 258ms
Size of the sequence database: 100000
Size of the alignment database: 100000
Number of clusters: 31321
Writing results 0h 0m 0s 6ms
Time for merging to pre_clust: 0h 0m 0s 0ms
Time for processing: 0h 0m 3s 597ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/order_redundancy ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/input_step_redundancy -v 3 --subdb-mode 1
Time for merging to input_step_redundancy: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 14ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/order_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_filter1 -v 3 --subdb-mode 1
Time for merging to pref_filter1: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 422ms
filterdb ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_filter1 ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_filter2 --filter-file ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/order_redundancy --threads 40 --compressed 0 -v 3
Filtering using file(s)
[=================================================================] 31.32K 0s 100ms
Time for merging to pref_filter2: 0h 0m 0s 137ms
Time for processing: 0h 0m 0s 847ms
rescorediagonal ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_filter2 ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_rescore2 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --rescore-mode 1 --wrapped-scoring 0 --filter-hits 1 -e 0.001 -c 0.8 -a 0 --cov-mode 0 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --add-self-matches 0 --sort-results 0 --db-load-mode 0 --threads 40 --compressed 0 -v 3
[=================================================================] 31.32K 0s 42ms
Time for merging to pref_rescore2: 0h 0m 0s 90ms
Time for processing: 0h 0m 0s 772ms
align ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pref_rescore2 ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/aln --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 1 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --threads 40 --compressed 0 -v 3
Compute score, coverage and sequence identity
Query database size: 31321 type: Aminoacid
Target database size: 31321 type: Aminoacid
Calculation of alignments
[=================================================================] 31.32K 3s 713ms
Time for merging to aln: 0h 0m 0s 107ms
53166 alignments calculated
45707 sequence pairs passed the thresholds (0.859704 of overall calculated)
1.459308 hits per query sequence
Time for processing: 0h 0m 4s 203ms
clust ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/aln ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/clust --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3
Clustering mode: Connected Component
[=================================================================] 31.32K 0s 215ms
Sort entries
Find missing connections
Found 14386 new connections.
Reconstruct initial order
[=================================================================] 31.32K 0s 218ms
Add missing connections
[=================================================================] 31.32K 0s 1ms
Time for read in: 0h 0m 1s 273ms
connected component mode
Total time: 0h 0m 1s 458ms
Size of the sequence database: 31321
Size of the alignment database: 31321
Number of clusters: 20942
Writing results 0h 0m 0s 96ms
Time for merging to clust: 0h 0m 0s 0ms
Time for processing: 0h 0m 1s 683ms
mergeclusters ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/clu_redundancy ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/pre_clust ./mmseqs_output/tmp/5351426679731834765/linclust/262265298633898384/clust --threads 40 --compressed 0 -v 3
Clustering step 1
[=================================================================] 31.32K 0s 36ms
Clustering step 2
[=================================================================] 20.94K 0s 74ms
Write merged clustering
[=================================================================] 100.00K 0s 404ms
Time for merging to clu_redundancy: 0h 0m 0s 145ms
Time for processing: 0h 0m 0s 639ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/clu_redundancy ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy -v 3 --subdb-mode 1
Time for merging to input_step_redundancy: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 13ms
prefilter ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/pref_step0 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --seed-sub-mat 'aa:VTML80.out,nucl:nucleotide.out' -s 1 -k 0 --k-score seq:2147483647,prof:2147483647 --alph-size aa:21,nucl:5 --max-seq-len 65535 --max-seqs 200 --split 0 --split-mode 2 --split-memory-limit 0 -c 0.8 --cov-mode 0 --comp-bias-corr 0 --comp-bias-corr-scale 1 --diag-score 0 --exact-kmer-matching 0 --mask 1 --mask-prob 0.9 --mask-lower-case 0 --min-ungapped-score 0 --add-self-matches 0 --spaced-kmer-mode 1 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --threads 40 --compressed 0 -v 3
Query database size: 20942 type: Aminoacid
Estimated memory consumption: 1G
Target database size: 20942 type: Aminoacid
Index table k-mer threshold: 154 at k-mer size 6
Index table: counting k-mers
[=================================================================] 20.94K 0s 601ms
Index table: Masked residues: 6638
Index table: fill
[=================================================================] 20.94K 0s 645ms
Index statistics
Entries: 1435009
DB size: 496 MB
Avg k-mer size: 0.022422
Top 10 k-mers
GPGGTL 342
LDMPDG 185
LGDYKP 145
DVLDMP 119
PFLEAR 69
PFPEAR 65
FDDTDS 59
ADYTFS 55
LITRGY 55
GPGGTT 44
Time for index table init: 0h 0m 2s 668ms
Process prefiltering step 1 of 1
k-mer similarity threshold: 154
Starting prefiltering scores calculation (step 1 of 1)
Query db start 1 to 20942
Target db start 1 to 20942
[=================================================================] 20.94K 0s 928ms
1.256278 k-mers per position
118 DB matches per sequence
0 overflows
0 queries produce too many hits (truncated result)
8 sequences passed prefiltering per query sequence
3 median result list length
0 sequences with 0 size result lists
Time for merging to pref_step0: 0h 0m 0s 51ms
Time for processing: 0h 0m 6s 669ms
align ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/pref_step0 ./mmseqs_output/tmp/5351426679731834765/aln_step0 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 0 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --threads 40 --compressed 0 -v 3
Compute score, coverage and sequence identity
Query database size: 20942 type: Aminoacid
Target database size: 20942 type: Aminoacid
Calculation of alignments
[=================================================================] 20.94K 15s 380ms
Time for merging to aln_step0: 0h 0m 0s 75ms
172065 alignments calculated
67554 sequence pairs passed the thresholds (0.392607 of overall calculated)
3.225766 hits per query sequence
Time for processing: 0h 0m 16s 166ms
clust ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/aln_step0 ./mmseqs_output/tmp/5351426679731834765/clu_step0 --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3
Clustering mode: Connected Component
[=================================================================] 20.94K 0s 211ms
Sort entries
Find missing connections
Found 98 new connections.
Reconstruct initial order
[=================================================================] 20.94K 0s 218ms
Add missing connections
[=================================================================] 20.94K 0s 1ms
Time for read in: 0h 0m 1s 264ms
connected component mode
Total time: 0h 0m 1s 477ms
Size of the sequence database: 20942
Size of the alignment database: 20942
Number of clusters: 10966
Writing results 0h 0m 0s 66ms
Time for merging to clu_step0: 0h 0m 0s 4ms
Time for processing: 0h 0m 1s 628ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/clu_step0 ./mmseqs_output/tmp/5351426679731834765/input_step_redundancy ./mmseqs_output/tmp/5351426679731834765/input_step1 -v 3 --subdb-mode 1
Time for merging to input_step1: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 7ms
prefilter ./mmseqs_output/tmp/5351426679731834765/input_step1 ./mmseqs_output/tmp/5351426679731834765/input_step1 ./mmseqs_output/tmp/5351426679731834765/pref_step1 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --seed-sub-mat 'aa:VTML80.out,nucl:nucleotide.out' -s 4 -k 0 --k-score seq:2147483647,prof:2147483647 --alph-size aa:21,nucl:5 --max-seq-len 65535 --max-seqs 200 --split 0 --split-mode 2 --split-memory-limit 0 -c 0.8 --cov-mode 0 --comp-bias-corr 1 --comp-bias-corr-scale 1 --diag-score 1 --exact-kmer-matching 0 --mask 1 --mask-prob 0.9 --mask-lower-case 0 --min-ungapped-score 15 --add-self-matches 0 --spaced-kmer-mode 1 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --threads 40 --compressed 0 -v 3
Query database size: 10966 type: Aminoacid
Estimated memory consumption: 1010M
Target database size: 10966 type: Aminoacid
Index table k-mer threshold: 127 at k-mer size 6
Index table: counting k-mers
[=================================================================] 10.97K 0s 560ms
Index table: Masked residues: 4144
Index table: fill
[=================================================================] 10.97K 0s 667ms
Index statistics
Entries: 1798942
DB size: 498 MB
Avg k-mer size: 0.028108
Top 10 k-mers
IGAALA 68
GPGGTL 58
GIVAPG 43
ALTAGI 42
ALGNGK 34
GLGNGK 32
ELPGVN 31
DLLDLP 29
GQQVAR 24
GEQVAR 23
Time for index table init: 0h 0m 2s 664ms
Process prefiltering step 1 of 1
k-mer similarity threshold: 127
Starting prefiltering scores calculation (step 1 of 1)
Query db start 1 to 10966
Target db start 1 to 10966
[=================================================================] 10.97K 3s 91ms
46.510777 k-mers per position
438 DB matches per sequence
0 overflows
0 queries produce too many hits (truncated result)
13 sequences passed prefiltering per query sequence
7 median result list length
0 sequences with 0 size result lists
Time for merging to pref_step1: 0h 0m 0s 41ms
Time for processing: 0h 0m 8s 706ms
align ./mmseqs_output/tmp/5351426679731834765/input_step1 ./mmseqs_output/tmp/5351426679731834765/input_step1 ./mmseqs_output/tmp/5351426679731834765/pref_step1 ./mmseqs_output/tmp/5351426679731834765/aln_step1 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 1 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --threads 40 --compressed 0 -v 3
Compute score, coverage and sequence identity
Query database size: 10966 type: Aminoacid
Target database size: 10966 type: Aminoacid
Calculation of alignments
[=================================================================] 10.97K 9s 362ms
Time for merging to aln_step1: 0h 0m 0s 91ms
128470 alignments calculated
17027 sequence pairs passed the thresholds (0.132537 of overall calculated)
1.552708 hits per query sequence
Time for processing: 0h 0m 9s 872ms
clust ./mmseqs_output/tmp/5351426679731834765/input_step1 ./mmseqs_output/tmp/5351426679731834765/aln_step1 ./mmseqs_output/tmp/5351426679731834765/clu_step1 --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3
Clustering mode: Connected Component
[=================================================================] 10.97K 0s 3ms
Sort entries
Find missing connections
Found 475 new connections.
Reconstruct initial order
[=================================================================] 10.97K 0s 5ms
Add missing connections
[=================================================================] 10.97K 0s 0ms
Time for read in: 0h 0m 0s 613ms
connected component mode
Total time: 0h 0m 0s 705ms
Size of the sequence database: 10966
Size of the alignment database: 10966
Number of clusters: 8338
Writing results 0h 0m 0s 47ms
Time for merging to clu_step1: 0h 0m 0s 4ms
Time for processing: 0h 0m 0s 815ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/clu_step1 ./mmseqs_output/tmp/5351426679731834765/input_step1 ./mmseqs_output/tmp/5351426679731834765/input_step2 -v 3 --subdb-mode 1
Time for merging to input_step2: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 8ms
prefilter ./mmseqs_output/tmp/5351426679731834765/input_step2 ./mmseqs_output/tmp/5351426679731834765/input_step2 ./mmseqs_output/tmp/5351426679731834765/pref_step2 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --seed-sub-mat 'aa:VTML80.out,nucl:nucleotide.out' -s 7 -k 0 --k-score seq:2147483647,prof:2147483647 --alph-size aa:21,nucl:5 --max-seq-len 65535 --max-seqs 200 --split 0 --split-mode 2 --split-memory-limit 0 -c 0.8 --cov-mode 0 --comp-bias-corr 1 --comp-bias-corr-scale 1 --diag-score 1 --exact-kmer-matching 0 --mask 1 --mask-prob 0.9 --mask-lower-case 0 --min-ungapped-score 15 --add-self-matches 0 --spaced-kmer-mode 1 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --threads 40 --compressed 0 -v 3
Query database size: 8338 type: Aminoacid
Estimated memory consumption: 1003M
Target database size: 8338 type: Aminoacid
Index table k-mer threshold: 100 at k-mer size 6
Index table: counting k-mers
[=================================================================] 8.34K 0s 514ms
Index table: Masked residues: 3074
Index table: fill
[=================================================================] 8.34K 0s 572ms
Index statistics
Entries: 1408015
DB size: 496 MB
Avg k-mer size: 0.022000
Top 10 k-mers
GPGGTL 37
GLGNGK 26
ALGNGK 23
DLLDLP 21
FDDTDS 20
NGGSLK 17
DLLDMP 17
DVLDMP 17
GEQVAR 16
FDDTDT 16
Time for index table init: 0h 0m 2s 591ms
Process prefiltering step 1 of 1
k-mer similarity threshold: 100
Starting prefiltering scores calculation (step 1 of 1)
Query db start 1 to 8338
Target db start 1 to 8338
[=================================================================] 8.34K 26s 907ms
903.365687 k-mers per position
4641 DB matches per sequence
0 overflows
0 queries produce too many hits (truncated result)
88 sequences passed prefiltering per query sequence
76 median result list length
0 sequences with 0 size result lists
Time for merging to pref_step2: 0h 0m 0s 36ms
Time for processing: 0h 0m 32s 520ms
align ./mmseqs_output/tmp/5351426679731834765/input_step2 ./mmseqs_output/tmp/5351426679731834765/input_step2 ./mmseqs_output/tmp/5351426679731834765/pref_step2 ./mmseqs_output/tmp/5351426679731834765/aln_step2 --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 1 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --threads 40 --compressed 0 -v 3
Compute score, coverage and sequence identity
Query database size: 8338 type: Aminoacid
Target database size: 8338 type: Aminoacid
Calculation of alignments
[=================================================================] 8.34K 17s 958ms
Time for merging to aln_step2: 0h 0m 0s 88ms
489475 alignments calculated
8622 sequence pairs passed the thresholds (0.017615 of overall calculated)
1.034061 hits per query sequence
Time for processing: 0h 0m 18s 545ms
clust ./mmseqs_output/tmp/5351426679731834765/input_step2 ./mmseqs_output/tmp/5351426679731834765/aln_step2 ./mmseqs_output/tmp/5351426679731834765/clu_step2 --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3
Clustering mode: Connected Component
[=================================================================] 8.34K 0s 2ms
Sort entries
Find missing connections
Found 28 new connections.
Reconstruct initial order
[=================================================================] 8.34K 0s 2ms
Add missing connections
[=================================================================] 8.34K 0s 0ms
Time for read in: 0h 0m 0s 408ms
connected component mode
Total time: 0h 0m 0s 491ms
Size of the sequence database: 8338
Size of the alignment database: 8338
Number of clusters: 8185
Writing results 0h 0m 0s 23ms
Time for merging to clu_step2: 0h 0m 0s 4ms
Time for processing: 0h 0m 0s 572ms
mergeclusters ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/clu ./mmseqs_output/tmp/5351426679731834765/clu_redundancy ./mmseqs_output/tmp/5351426679731834765/clu_step0 ./mmseqs_output/tmp/5351426679731834765/clu_step1 ./mmseqs_output/tmp/5351426679731834765/clu_step2
Clustering step 1
[=================================================================] 20.94K 0s 219ms
Clustering step 2
[=================================================================] 10.97K 0s 427ms
Clustering step 3
[=================================================================] 8.34K 0s 657ms
Clustering step 4
[=================================================================] 8.19K 0s 758ms
Write merged clustering
[=================================================================] 100.00K 0s 956ms
Time for merging to clu: 0h 0m 0s 164ms
Time for processing: 0h 0m 1s 268ms
align ./mms_smallDB ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/clu ./mmseqs_output/tmp/5351426679731834765/aln --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 1 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --threads 40 --compressed 0 -v 3
Compute score, coverage and sequence identity
Query database size: 100000 type: Aminoacid
Target database size: 100000 type: Aminoacid
Calculation of alignments
[=================================================================] 8.19K 8s 160ms
Time for merging to aln: 0h 0m 0s 15ms
99829 alignments calculated
73771 sequence pairs passed the thresholds (0.738974 of overall calculated)
9.012951 hits per query sequence
Time for processing: 0h 0m 8s 437ms
subtractdbs ./mmseqs_output/tmp/5351426679731834765/clu ./mmseqs_output/tmp/5351426679731834765/aln ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted --e-profile 100000000 -e 100000000 --threads 40 --compressed 0 -v 3
subtractdbs ./mmseqs_output/tmp/5351426679731834765/clu ./mmseqs_output/tmp/5351426679731834765/aln ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted --e-profile 100000000 -e 100000000 --threads 40 --compressed 0 -v 3
Remove ./mmseqs_output/tmp/5351426679731834765/aln ids from ./mmseqs_output/tmp/5351426679731834765/clu
[=================================================================] 8.19K 0s 263ms
Time for merging to clu_not_accepted: 0h 0m 0s 69ms
Time for processing: 0h 0m 0s 514ms
swapdb ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted_swap --threads 40 --compressed 0 -v 3
[=================================================================] 8.19K 0s 2ms
Computing offsets.
[=================================================================] 8.19K 0s 1ms
Reading results.
[=================================================================] 8.19K 0s 1ms
Output database: ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted_swap
[=================================================================] 100.00K 0s 116ms
Time for merging to clu_not_accepted_swap: 0h 0m 0s 143ms
Time for processing: 0h 0m 0s 577ms
subtractdbs ./mmseqs_output/tmp/5351426679731834765/clu ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted ./mmseqs_output/tmp/5351426679731834765/clu_accepted --e-profile 100000000 -e 100000000 --threads 40 --compressed 0 -v 3
subtractdbs ./mmseqs_output/tmp/5351426679731834765/clu ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted ./mmseqs_output/tmp/5351426679731834765/clu_accepted --e-profile 100000000 -e 100000000 --threads 40 --compressed 0 -v 3
Remove ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted ids from ./mmseqs_output/tmp/5351426679731834765/clu
[=================================================================] 8.19K 0s 41ms
Time for merging to clu_accepted: 0h 0m 0s 137ms
Time for processing: 0h 0m 0s 277ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/clu_not_accepted_swap ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned -v 3
Time for merging to seq_wrong_assigned: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 28ms
createsubdb ./mmseqs_output/tmp/5351426679731834765/clu ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/seq_seeds -v 3
Time for merging to seq_seeds: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 16ms
prefilter ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned ./mmseqs_output/tmp/5351426679731834765/seq_seeds.merged ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' --seed-sub-mat 'aa:VTML80.out,nucl:nucleotide.out' -s 7 -k 0 --k-score seq:2147483647,prof:2147483647 --alph-size aa:21,nucl:5 --max-seq-len 65535 --max-seqs 200 --split 0 --split-mode 2 --split-memory-limit 0 -c 0.8 --cov-mode 0 --comp-bias-corr 1 --comp-bias-corr-scale 1 --diag-score 1 --exact-kmer-matching 0 --mask 1 --mask-prob 0.9 --mask-lower-case 0 --min-ungapped-score 15 --add-self-matches 0 --spaced-kmer-mode 1 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --threads 40 --compressed 0 -v 3
Query database size: 26229 type: Aminoacid
Estimated memory consumption: 1G
Target database size: 34414 type: Aminoacid
Index table k-mer threshold: 100 at k-mer size 6
Index table: counting k-mers
[=================================================================] 34.41K 1s 394ms
Index table: Masked residues: 8741
Index table: fill
[=================================================================] 34.41K 1s 378ms
Index statistics
Entries: 6295744
DB size: 524 MB
Avg k-mer size: 0.098371
Top 10 k-mers
DVLDMP 2320
PDVMRM 1368
DRQVAY 1181
PFPEAR 738
MPLGAT 728
MPMGAT 703
GQQVAR 620
ADYTFS 597
LTFLYV 568
VLLALS 518
Time for index table init: 0h 0m 4s 142ms
Process prefiltering step 1 of 1
k-mer similarity threshold: 100
Starting prefiltering scores calculation (step 1 of 1)
Query db start 1 to 26229
Target db start 1 to 34414
[=================================================================] 26.23K 1m 58s 7ms
775.834912 k-mers per position
60277 DB matches per sequence
0 overflows
0 queries produce too many hits (truncated result)
193 sequences passed prefiltering per query sequence
200 median result list length
0 sequences with 0 size result lists
Time for merging to seq_wrong_assigned_pref: 0h 0m 0s 56ms
Time for processing: 0h 2m 5s 612ms
swapdb ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped --threads 40 --compressed 0 -v 3
[=================================================================] 26.23K 0s 396ms
Computing offsets.
[=================================================================] 26.23K 0s 384ms
Reading results.
[=================================================================] 26.23K 0s 441ms
Output database: ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped
[=================================================================] 100.00K 0s 144ms
Time for merging to seq_wrong_assigned_pref_swaped: 0h 0m 0s 19ms
Time for processing: 0h 0m 2s 119ms
align ./mmseqs_output/tmp/5351426679731834765/seq_seeds.merged ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped_aln --sub-mat 'aa:blosum62.out,nucl:nucleotide.out' -a 0 --alignment-mode 3 --alignment-output-mode 0 --wrapped-scoring 0 -e 0.001 --min-seq-id 0.4 --min-aln-len 0 --seq-id-mode 0 --alt-ali 0 -c 0.8 --cov-mode 0 --max-seq-len 65535 --comp-bias-corr 1 --comp-bias-corr-scale 1 --max-rejected 2147483647 --max-accept 2147483647 --add-self-matches 0 --db-load-mode 0 --pca substitution:1.100,context:1.400 --pcb substitution:4.100,context:5.800 --score-bias 0 --realign 0 --realign-score-bias -0.2 --realign-max-seqs 2147483647 --corr-score-weight 0 --gap-open aa:11,nucl:5 --gap-extend aa:1,nucl:2 --zdrop 40 --threads 40 --compressed 0 -v 3
Compute score, coverage and sequence identity
Query database size: 34414 type: Aminoacid
Target database size: 26229 type: Aminoacid
Calculation of alignments
[=================================================================] 34.29K 6m 32s 543ms
Time for merging to seq_wrong_assigned_pref_swaped_aln: 0h 0m 0s 85ms
4335308 alignments calculated
2294027 sequence pairs passed the thresholds (0.529150 of overall calculated)
66.900757 hits per query sequence
Time for processing: 0h 6m 33s 544ms
filterdb ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped_aln ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped_aln_ocol --trim-to-one-column --threads 40 --compressed 0 -v 3
Filtering using regular expression
[=================================================================] 34.29K 1s 15ms
Time for merging to seq_wrong_assigned_pref_swaped_aln_ocol: 0h 0m 0s 70ms
Time for processing: 0h 0m 1s 765ms
mergedbs ./mmseqs_output/tmp/5351426679731834765/seq_seeds.merged ./mmseqs_output/tmp/5351426679731834765/clu_accepted_plus_wrong ./mmseqs_output/tmp/5351426679731834765/clu_accepted ./mmseqs_output/tmp/5351426679731834765/seq_wrong_assigned_pref_swaped_aln_ocol --merge-stop-empty 0 --compressed 0 -v 3
Merging the results to ./mmseqs_output/tmp/5351426679731834765/clu_accepted_plus_wrong
[=================================================================] 34.41K 0s 26ms
Time for merging to clu_accepted_plus_wrong: 0h 0m 0s 5ms
Time for processing: 0h 0m 0s 53ms
tsv2db ./mmseqs_output/tmp/5351426679731834765/missing.single.seqs ./mmseqs_output/tmp/5351426679731834765/missing.single.seqs.db --output-dbtype 6 --compressed 0 -v 3
Output database type: Clustering
Time for merging to missing.single.seqs.db: 0h 0m 0s 12ms
Time for processing: 0h 0m 0s 34ms
mergedbs ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/clu_accepted_plus_wrong_plus_single ./mmseqs_output/tmp/5351426679731834765/clu_accepted_plus_wrong ./mmseqs_output/tmp/5351426679731834765/missing.single.seqs.db --merge-stop-empty 0 --compressed 0 -v 3
Merging the results to ./mmseqs_output/tmp/5351426679731834765/clu_accepted_plus_wrong_plus_single
[=================================================================] 100.00K 0s 35ms
Time for merging to clu_accepted_plus_wrong_plus_single: 0h 0m 0s 12ms
Time for processing: 0h 0m 0s 66ms
clust ./mms_smallDB ./mmseqs_output/tmp/5351426679731834765/clu_accepted_plus_wrong_plus_single ./mmseqs_output/mmseq_clu --cluster-mode 1 --max-iterations 1000 --similarity-type 2 --threads 40 --compressed 0 -v 3
Clustering mode: Connected Component
[=================================================================] 100.00K 0s 609ms
Sort entries
Find missing connections
Found 596106 new connections.
Reconstruct initial order
[=================================================================] 100.00K 0s 572ms
Add missing connections
[=================================================================] 100.00K 0s 324ms
Time for read in: 0h 0m 2s 881ms
connected component mode
Total time: 0h 0m 4s 86ms
Size of the sequence database: 100000
Size of the alignment database: 100000
Number of clusters: 8463
Writing results 0h 0m 0s 21ms
Time for merging to mmseq_clu: 0h 0m 0s 0ms
Time for processing: 0h 0m 4s 446ms
createtsv ./mms_smallDB ./mms_smallDB ./mmseqs_output/mmseq_clu ./mmseqs_output/mmseq_clu.tsv
MMseqs Version: 14.7e284
First sequence as representative false
Target column 1
Add full header false
Sequence source 0
Database output false
Threads 40
Compressed 0
Verbosity 3
Time for merging to mmseq_clu.tsv: 0h 0m 0s 64ms
Time for processing: 0h 0m 0s 533ms
Context
Providing context helps us come up with a solution and improve our documentation for the future.
Your Environment
Include as many relevant details about the environment you experienced the bug in.
Git commit used (The string after "MMseqs Version:" 14.7e284:
Which MMseqs version was used (Statically-compiled, self-compiled, Homebrew, etc.): conda
For self-compiled and Homebrew: Compiler and Cmake versions used and their invocation:
Server specifications (especially CPU support for AVX2/SSE and amount of system memory):
Perhaps I am reading your example wrong, but Isn't your maximum sequence identity in the tsv table 0.4268774703557312? if so, it would seem that your intercluster sequence identity is below 50%.
Expected Behavior
When min-seq-id is eg 50%, I would expect identity across clusters to be upper bounded by 50%
Current Behavior
Running blastp of sequences from a cluster against sequences from all other cluster yields high percent id and low e value hits.
Here is a table of some results I have tried (tsv). No matter what i do, inter cluster identity remains high:
Steps to Reproduce (for bugs)
My scripts
MMseqs Output (for bugs)
Here is one output example, though I have run the above script varying the parameters for a number of params.
Context
Providing context helps us come up with a solution and improve our documentation for the future.
Your Environment
Include as many relevant details about the environment you experienced the bug in.