sourmash-bio / sourmash

Quickly search, compare, and analyze genomic and metagenomic data sets.
http://sourmash.readthedocs.io/en/latest/
Other
477 stars 79 forks source link

Example: Individual sketches for each sequence in the multiFasta file concurrently. #2537

Closed mr-eyes closed 1 month ago

mr-eyes commented 1 year ago
import sys
import gzip
from Bio import SeqIO
import sourmash
from sourmash import MinHash
import concurrent.futures

def create_sourmash_sketch(genome, ksizes, scale):
    # Create a list to store MinHash objects for different k-mer sizes
    minhash_list = []

    # Create a MinHash object for each k-mer size
    for ksize in ksizes:
        mh = MinHash(n = 0, scaled=scale, ksize=ksize, track_abundance=False)
        mh.add_sequence(str(genome.seq).upper(), force=True)
        minhash_list.append(mh)

    sigs = [sourmash.SourmashSignature(mh, name=genome.id) for mh in minhash_list]
    with open(f"{genome.id}.sig", "w") as fSig:
        sourmash.save_signatures(sigs, fSig)

    return f"Generated sourmash sketch for genome: {genome.id}"

def create_sourmash_sketches(input_fna_gz, num_cores):
    # Read the input .fna.gz file
    # check if file is gzipped or not
    if input_fna_gz.endswith(".gz"):
        with gzip.open(input_fna_gz, "rt") as handle:
            genomes = list(SeqIO.parse(handle, "fasta"))
    else:
        with open(input_fna_gz, "rt") as handle:
            genomes = list(SeqIO.parse(handle, "fasta"))

    # Set the parameters for the MinHash sketch
    ksizes = [21, 31, 51]
    scale = 1000

    # Process genomes concurrently
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:
        futures = [executor.submit(create_sourmash_sketch, genome, ksizes, scale) for genome in genomes]
        for future in concurrent.futures.as_completed(futures):
            print(future.result())

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python multifasta_sketch.py <input_fasta> <num_cores>")
        sys.exit(1)

    input_fna_gz = sys.argv[1]
    num_cores = int(sys.argv[2])
    create_sourmash_sketches(input_fna_gz, num_cores)
ctb commented 1 year ago

neat!

A few notes -

using sourmash_args.SaveSignaturesToLocation

    with sourmash_args.SaveSignaturesToLocation(sigfile) as save_sig:
        for sigs in sigslist:
            set_sig_name(sigs, filename, name)
            for ss in sigs:
                save_sig.add(ss)
ctb commented 1 month ago

Closing in favor of sourmash_plugin_branchwater which provides manysketch ... --singleton and is very fast!