gen_fusion_file.jl is out of date

ZKai0801 commented 4 years ago

Hi,

I notice that the scripts/gen_fusion_file.jl is out of date and can't be used anymore. So I wrote a simple python script to replace its function, and I would like to share it with others. However any pushes to this repo is banned, would you mind to add this script manually?

"""
Generate a user-defined fusion file 

Input file should be a text file containing 
1. gene name and 
2. transcript name (optional)

For example:
gene1   transcript1
gene2
gene3   transcript3
"""
__author__ = "Kai"
__date__ = "20/05/2020"

import argparse

def read_genelist(inputf):
    with open(inputf, "r") as fh:
        for line in fh:
            yield line.rstrip("\n").split()

def make_fusion_gene(gene, fw, refflat):
    # no transcript specified --> use the longest transcript
    if len(gene) == 1:
        transcripts = {}
        with open(refflat, "r") as fh:
            for line in fh:
                if gene[0] not in line:
                    continue
                _, transcript, chrom, _, start, end, _, _, _, exonstart, exonend = line.rstrip("\n").split("\t")
                transcripts[transcript] = (chrom, start, end, exonstart, exonend)
        transcript = get_longest_transcript(transcripts.keys(), refflat)
        chrom, start, end, exonstart, exonend  = transcripts[transcript]

    # use user-specified transcript
    elif len(gene) == 2:
        with open(refflat, "r") as fh:
            for line in fh:
                if gene[1] not in line:
                    continue
                _, transcript, chrom, _, start, end, _, _, _, exonstart, exonend = line.rstrip("\n").split("\t")
                break

    # write to a file
    header = f">{gene[0]}_{transcript},{chrom}:{start}-{end}\n"
    fw.write(header)
    exons = list(zip(exonstart.split(","), exonend.split(",")))[:-1]
    for index, each_exon in enumerate(exons, start=1):
        fw.write(f'{index},{each_exon[0]},{each_exon[1]}\n')
    fw.write("\n")

def get_longest_transcript(transcripts, refflat):
    longest_length = 0
    longest_transcript = ""
    with open(refflat, "r") as fh:
        for line in fh:
            line = line.strip().split()
            if line[1] in transcripts:
                length = int(line[5]) - int(line[4])
                if length > longest_length:
                    longest_length = length
                    longest_transcript = line[1]
    return longest_transcript

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input", help="Input filename")
    parser.add_argument("-r", "--refflat", required=True, help="Path to the refFlat.txt file, need to be downloaded from UCSC in advance")
    parser.add_argument("-o", "--output", required=True, help="The output filename")
    args = parser.parse_args()

    with open(args.output, "w") as fw:
        for gene in read_genelist(args.input):
            make_fusion_gene(gene, fw, args.refflat)

sfchen commented 4 years ago

Thanks, can you provide a pull request?

ZKai0801 commented 4 years ago

yeah sure, just done that

OpenGene / GeneFuse

gen_fusion_file.jl is out of date #22