phasegenomics / FALCON-Phase

FALCON-Phase integrates PacBio long-read assemblies with Phase Genomics Hi-C data to create phased, diploid, chromosome-scale scaffolds
Other
74 stars 17 forks source link

Compatibility with IPA output #75

Closed Adamtaranto closed 4 years ago

Adamtaranto commented 4 years ago

Is FALCON-Phase compatible with the output of IPA? The naming convention for the primary contigs looks the same, but the alternate contigs from IPA have a slightly different format from the haplotigs produced by FALCON-Unzip.

zeeev commented 4 years ago

Hi @Adamtaranto,

You're correct: IPA headers aren't compatible with FALCON-Phase. In the next release of IPA, we are adding a conversion script under falconc:

Usage:
  ipa2-to-falcon-unzip [required&optional-params]
Rename IPA2 fasta header names to match falcon
Options:
  -h, --help                               print this cligen-erated help
  --help-syntax                            advanced: prepend,plurals,..
  -i=, --input-p-fn=     string  REQUIRED  input primary contigs
  --input-a-fn=          string  REQUIRED  input associate contigs
  -o=, --output-prefix=  string  REQUIRED  prefix for output files

In the meantime here is the source code. The logic is trivial.

from ./util import nil
from strformat import fmt
import hts
import strutils

proc renamedSeq*(name: string): string =
    ## Converts an IPA2 FASTA header name into its Falcon-style equivalent.
    ##
    ## `name` is split on '.' and '-'. The result depends on the number of
    ## fields produced:
    ## * exactly two fields: the second field is returned on its own;
    ## * three or more fields: "<field 1>_<field 2>" (zero-based) is
    ##   returned when field 2 equals "01"; otherwise "" is returned, which
    ##   callers treat as "skip this record".
    ##
    ## Raises `ValueError` when `name` yields fewer than two fields (for
    ## example an empty name, or one without any '.' or '-'), instead of
    ## failing with an index-out-of-range Defect.
    let name_parts = name.split({'.', '-'})
    if name_parts.len < 2:
        raise newException(ValueError,
            "[FATAL] Cannot rename sequence '" & name &
            "': expected at least one '.' or '-' separator")
    if name_parts.len == 2:
        return name_parts[1]
    # Three or more fields: only records whose third field is "01" are kept.
    if name_parts[2] != "01":
        return ""
    return name_parts[1] & "_" & name_parts[2]

type
    POrA = enum
        ## Selects which contig set a rename pass handles. The string value
        ## of each member is interpolated into the output file name by
        ## `renameSeqs`.
        pCtg = "p"  ## passed by `main` together with `input_p_fn`
        aCtg = "a"  ## passed by `main` together with `input_a_fn`

proc renameSeqs(seq_fn, output_prefix: string, extension: POrA) =
    ## Copies the sequences of the indexed FASTA `seq_fn` into
    ## "<output_prefix>.<extension>.fasta", renaming each header with
    ## `renamedSeq`.
    ##
    ## Records for which `renamedSeq` returns "" are left out of the output.
    ## Each kept record is written as a ">name" line followed by the whole
    ## sequence on a single line.
    ##
    ## Calls `util.raiseEx` when `seq_fn` cannot be opened through `hts`.
    var refx: hts.Fai
    if not hts.open(refx, seq_fn):
        util.raiseEx(format("[FATAL] Could not open '$#'", seq_fn))

    let f = open("{output_prefix}.{extension}.fasta".fmt, fmWrite)
    # Guarantee the output handle is released even if fetching or renaming
    # a sequence raises part-way through the loop.
    defer: f.close()

    for i in 0 ..< refx.len:
        let old_name = refx[i]
        let new_name = renamedSeq(old_name)
        if new_name == "":
            # Dropped record: skip before paying for the sequence fetch.
            continue
        f.write('>')
        f.writeLine(new_name)
        f.writeLine(refx.get(old_name))

proc main*(input_p_fn, input_a_fn, output_prefix: string) =
    ##Rename IPA2 fasta header names to match falcon
    # NOTE(review): the doc comment above matches the CLI help text, so it
    # is presumably surfaced to users; keep its wording stable.
    # First pass: `input_p_fn`, tagged pCtg.
    renameSeqs(seq_fn = input_p_fn, output_prefix = output_prefix,
               extension = pCtg)
    # Second pass: `input_a_fn`, tagged aCtg.
    renameSeqs(seq_fn = input_a_fn, output_prefix = output_prefix,
               extension = aCtg)