MDU-PHL / ngmaster

In silico multi-antigen sequence typing for Neisseria gonorrhoeae (NG-MAST)
GNU General Public License v3.0
6 stars 5 forks source link

www.ng-mast.net is down #34

Open mschmerer opened 3 years ago

mschmerer commented 3 years ago

Updating the local NG-MAST database via the update function fails: NG-MAST is now hosted on PubMLST.org, and the original website (www.ng-mast.net) is down.

Here is a function I wrote that updates NG-MAST for ngmaster successfully.

import subprocess, os
import pandas as pd
from Bio import SeqIO

def _read_renamed_alleles(fasta_path, prefix):
    """
    Parse a downloaded allele FASTA and rename records for ngmaster.

    Parameters
    ----------
    fasta_path : PATH
        FASTA file downloaded from pubMLST.
    prefix : str
        Locus prefix to put in front of the allele number ("POR" or "TBPB").

    Returns
    -------
    list
        The parsed records with ``id`` set to ``prefix`` + allele number and
        ``name``/``description`` blanked so only the new id is written.

    """
    records = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        # The third "_"-separated field of the downloaded id is kept;
        # presumably ids look like "NG-MAST_porB_<number>" -- confirm
        # against the pubMLST output.
        record.id = prefix + str(record.id).split("_")[2]
        record.name = ""
        record.description = ""
        records.append(record)
    return records


def updateNGMAST(dbdir):
    """
    Downloads allele sequences and schema from pubMLST and converts to correct
    formats for ngmaster.

    All three files are downloaded and parsed before any existing database
    file is touched, so a failed download or an unparsable file leaves the
    current database in place. The previous files are kept next to the new
    ones with an ".old" suffix.

    Parameters
    ----------
    dbdir : PATH
        Path to database directory (the one containing "alleledb/ngmast").

    Returns
    -------
    None.

    Raises
    ------
    subprocess.CalledProcessError
        If one of the wget downloads exits with a non-zero status.

    """
    ngmast_dir = os.path.join(dbdir, "alleledb", "ngmast")
    base_url = "https://rest.pubmlst.org/db/pubmlst_neisseria_seqdef"
    downloads = [
        ("POR.tfa", base_url + "/loci/NG-MAST_porB/alleles_fasta"),
        ("TBPB.tfa", base_url + "/loci/NG-MAST_tbpB/alleles_fasta"),
        ("ng_mast.txt", base_url + "/schemes/71/profiles_csv"),
    ]

    def current(name):
        return os.path.join(ngmast_dir, name)

    try:
        # Download new allele files and profile from pubMLST.org.
        # Arguments are passed as a list (no shell) so paths containing
        # spaces or shell metacharacters are handled safely, and a failed
        # download raises instead of being silently ignored.
        for name, url in downloads:
            subprocess.check_call(
                ["wget", "--no-check-certificate", "-O",
                 current(name) + ".new", url]
            )

        # Convert everything in memory first; nothing on disk is replaced
        # until all new data has been parsed successfully.
        profiles = pd.read_csv(current("ng_mast.txt") + ".new", sep="\t")
        profiles = profiles.rename(
            columns={"NG-MAST_porB": "POR", "NG-MAST_tbpB": "TBPB"}
        ).set_index("ST")
        por_records = _read_renamed_alleles(current("POR.tfa") + ".new", "POR")
        tbpb_records = _read_renamed_alleles(
            current("TBPB.tfa") + ".new", "TBPB"
        )

        # Backup current files (overwriting any earlier backup).
        for name, _ in downloads:
            os.replace(current(name), current(name) + ".old")

        # Write the converted files under the names ngmaster expects.
        profiles.to_csv(current("ng_mast.txt"))
        SeqIO.write(por_records, current("POR.tfa"), "fasta-2line")
        # The original wrote por_records here, filling TBPB.tfa with porB
        # sequences.
        SeqIO.write(tbpb_records, current("TBPB.tfa"), "fasta-2line")
    finally:
        # Clean up after update: remove every non-hidden "*.new" file in the
        # directory (same set the shell glob "*.new" would match), also when
        # the update was aborted.
        if os.path.isdir(ngmast_dir):
            for entry in os.listdir(ngmast_dir):
                if entry.endswith(".new") and not entry.startswith("."):
                    os.remove(os.path.join(ngmast_dir, entry))

    return