BED output files - Githubissues

lskatz commented 4 months ago

I would like to suggest a feature where you have a BED output file, one per genome. https://genome.ucsc.edu/FAQ/FAQformat.html#format1

It has the advantages that

Can be put into most genome browsers like IGV.
Makes it easier to overlay with important features like coding sequences in IGV but also on the command line with tools like bedtools intersect.
If desired, you could also add in colors or thickness in the standard BED field to help with genome browser viz

I think this would also mean that you'd want to have an output folder option to place all the BED files in.

dr-joe-wirth commented 3 months ago

something like this:

bedtools intersect -a <(grep -v source file.gff) -b <(grep -v '\b0$' coverage.bed) | head

dr-joe-wirth commented 3 months ago

Could not quite get this running. The following code almost works, but I am running into an error I don't totally understand. I will revisit this issue later.

Code:

import os
from Bio import SeqIO
from Bio.Seq import Seq

from pybedtools import BedTool, chromsizes
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature
from bin.Parameters import Parameters
from bin.AnalysisData import AnalysisData, Level

def __genbankToGffFasta(gbFn:str) -> None:
    # initialize variables
    rec:SeqRecord
    feat:SeqFeature
    records = list()
    gffFn = os.path.splitext(gbFn)[0] + ".gff"
    fnaFn = os.path.splitext(gbFn)[0] + ".fna"

    # open the gff file
    with open(gffFn, 'w') as fh:
        # for each feature
        for rec in SeqIO.parse(gbFn, 'genbank'):
            for feat in rec.features:
                # do not write the source entry
                if feat.type != 'source':
                    # extract the coordinates
                    start = int(feat.location.start) + 1
                    end = int(feat.location.end) + 1
                    strand = '+' if feat.location.strand == 1 else '-'

                    # build the row
                    row = [rec.id,
                        'ncbi',
                        feat.type,
                        str(start),
                        str(end),
                        '.',
                        strand,
                        '.',
                        ';'.join([f"{key}={value}" for key, value in feat.qualifiers.items()])]

                    # write the row to file
                    fh.write('\t'.join(row) + '\n')

            # save the record
            records.append(rec)

    # write the records as a fasta file
    SeqIO.write(records, fnaFn, 'fasta')

def __fastaToFai(fnaFn:str) -> None:
    # initialize variables
    rec:SeqRecord
    outFn = fnaFn + ".fai"
    rowNum = 0
    offset = 0

    # get the linebases and linewidth of the file
    with open(fnaFn, 'r') as fh:
        # for each line
        for line in fh:
            # only count the number of bases in the second row
            if rowNum == 1:
                linewidth = len(line)
                linebases = linewidth - 1
                break

            # move to the next line if at the first row
            else:
                rowNum += 1

    # start writing the output
    with open(outFn, 'w') as fh:
        # for each contig
        for rec in SeqIO.parse(fnaFn, 'fasta'):
            # update the offset
            offset += len(str(rec.seq)) + 1

            # write the data
            fh.write(f"{rec.id}\t{len(rec.seq)}\t{offset}\t{linebases}\t{linewidth}\n")

def __restructureData(analysis:dict[tuple[Seq,str],AnalysisData]) -> dict[str,dict[Level,list[tuple[str,AnalysisData]]]]:
    # initialize output
    out = dict()

    # go through each entry
    for (kmer,contig),data in analysis.items():
        # restructure: {name:{Level:[(contig,AnalysisData)]}}
        out[data.name] = out.get(data.name, dict())
        out[data.name][data.getLevel()] = out[data.name].get(data.getLevel(), list())
        out[data.name][data.getLevel()].append((contig, data))

    return out

def __getGenomeData(params:Parameters, name:str) -> BedTool:
    # initialize variables
    rec:SeqRecord
    rows = list()
    out = dict()

    # get the genome filename
    fn = [x for x in params.ingroupFns if name == os.path.basename(x)].pop()

    # create an entry for each contig
    for rec in SeqIO.parse(fn, params.format):
        out[rec.id] = (0, len(rec.seq))

    return out

def proto(analysisData:dict[tuple[str,Seq],AnalysisData], params:Parameters):
    # restructure the analysis data
    restructured = __restructureData(analysisData)

    # for each genome
    for name in restructured.keys():
        # get the genome data
        genome = __getGenomeData(params, name)

        # get the gff file as a BedTool object
        gffFn = os.path.join(os.path.dirname(params.ingroupFns[0]), os.path.splitext(name)[0] + ".gff")
        genomeGff = BedTool(gffFn)

        # for each level
        for level in restructured[name].keys():
            # get the genomoic coordinates for the kmers as a BedTool object
            kmerCoords = list()
            for contig,data in restructured[name][level]:
                kmerCoords.append((contig, data.primer.start, data.primer.end + 1))
            kmerCoords = BedTool(kmerCoords)

            # get the kmer coverage for this genome
            coverageFn = kmerCoords.genome_coverage(dz=True, genome=genome).saveas()

            # get the intersection of the gff with the coverage
            intersect = genomeGff.intersect(b=coverageFn)
            outFn = f'{name}_{level}_coverage.gff'
            intersect.saveas(outFn)

Error:

    121 bedIntervals = BedTool(intervals)
    123 coverageFn = bedIntervals.genome_coverage(dz=True, genome=genome).saveas()
--> 125 intersect = genomeGff.intersect(b=coverageFn, header=True)
    126 outFn = f'{name}_{level}_coverage.gff'
    127 intersect.saveas(outFn)

File [REDACTED]/python3.11/site-packages/pybedtools/bedtool.py:909, in BedTool._log_to_history.<locals>.decorated(self, *args, **kwargs)
    905 def decorated(self, *args, **kwargs):
    906 
    907     # this calls the actual method in the first place; *result* is
    908     # whatever you get back
--> 909     result = method(self, *args, **kwargs)
    911     # add appropriate tags
    912     parent_tag = self._tag

File [REDACTED]/python3.11/site-packages/pybedtools/bedtool.py:388, in _wraps.<locals>.decorator.<locals>.wrapped(self, *args, **kwargs)
    385 decode_output = not result_is_bam
    387 # Do the actual call
--> 388 stream = call_bedtools(
    389     cmds,
    390     tmp,
    391     stdin=stdin,
    392     check_stderr=check_stderr,
    393     decode_output=decode_output,
    394 )
    396 if does_not_return_bedtool:
    397     return does_not_return_bedtool(stream, **kwargs)

File [REDACTED]/python3.11/site-packages/pybedtools/helpers.py:460, in call_bedtools(cmds, tmpfn, stdin, check_stderr, decode_output, encode_input)
    458             sys.stderr.write(stderr)
    459         else:
--> 460             raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)
    462 except (OSError, IOError) as err:
    463     print("%s: %s" % (type(err), os.strerror(err.errno)))

BEDToolsError: 
Command was:

        bedtools intersect -a [REDACTED]/i1_tiny.gff -header -b [REDACTED]/pybedtools.pm8zlpqu.tmp

Error message was:
Error: unable to open file or unable to determine types for file [REDACTED]/pybedtools.pm8zlpqu.tmp

- Please ensure that your file is TAB delimited (e.g., cat -t FILE).
- Also ensure that your file has integer chromosome coordinates in the 
  expected columns (e.g., cols 2 and 3 for BED).

dr-joe-wirth / primerForge

BED output files #14