Open lskatz opened 4 months ago
something like this:
bedtools intersect -a <(grep -v source file.gff) -b <(grep -v '\b0$' coverage.bed) | head
Could not quite get this running. The following code almost works, but I am running into an error I don't totally understand. I will revisit this issue later.
Code:
import os
from Bio import SeqIO
from Bio.Seq import Seq
from pybedtools import BedTool, chromsizes
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature
from bin.Parameters import Parameters
from bin.AnalysisData import AnalysisData, Level
def __genbankToGffFasta(gbFn:str) -> None:
# initialize variables
rec:SeqRecord
feat:SeqFeature
records = list()
gffFn = os.path.splitext(gbFn)[0] + ".gff"
fnaFn = os.path.splitext(gbFn)[0] + ".fna"
# open the gff file
with open(gffFn, 'w') as fh:
# for each feature
for rec in SeqIO.parse(gbFn, 'genbank'):
for feat in rec.features:
# do not write the source entry
if feat.type != 'source':
# extract the coordinates
start = int(feat.location.start) + 1
end = int(feat.location.end) + 1
strand = '+' if feat.location.strand == 1 else '-'
# build the row
row = [rec.id,
'ncbi',
feat.type,
str(start),
str(end),
'.',
strand,
'.',
';'.join([f"{key}={value}" for key, value in feat.qualifiers.items()])]
# write the row to file
fh.write('\t'.join(row) + '\n')
# save the record
records.append(rec)
# write the records as a fasta file
SeqIO.write(records, fnaFn, 'fasta')
def __fastaToFai(fnaFn:str) -> None:
# initialize variables
rec:SeqRecord
outFn = fnaFn + ".fai"
rowNum = 0
offset = 0
# get the linebases and linewidth of the file
with open(fnaFn, 'r') as fh:
# for each line
for line in fh:
# only count the number of bases in the second row
if rowNum == 1:
linewidth = len(line)
linebases = linewidth - 1
break
# move to the next line if at the first row
else:
rowNum += 1
# start writing the output
with open(outFn, 'w') as fh:
# for each contig
for rec in SeqIO.parse(fnaFn, 'fasta'):
# update the offset
offset += len(str(rec.seq)) + 1
# write the data
fh.write(f"{rec.id}\t{len(rec.seq)}\t{offset}\t{linebases}\t{linewidth}\n")
def __restructureData(analysis:dict[tuple[Seq,str],AnalysisData]) -> dict[str,dict[Level,list[tuple[str,AnalysisData]]]]:
# initialize output
out = dict()
# go through each entry
for (kmer,contig),data in analysis.items():
# restructure: {name:{Level:[(contig,AnalysisData)]}}
out[data.name] = out.get(data.name, dict())
out[data.name][data.getLevel()] = out[data.name].get(data.getLevel(), list())
out[data.name][data.getLevel()].append((contig, data))
return out
def __getGenomeData(params:Parameters, name:str) -> BedTool:
# initialize variables
rec:SeqRecord
rows = list()
out = dict()
# get the genome filename
fn = [x for x in params.ingroupFns if name == os.path.basename(x)].pop()
# create an entry for each contig
for rec in SeqIO.parse(fn, params.format):
out[rec.id] = (0, len(rec.seq))
return out
def proto(analysisData:dict[tuple[str,Seq],AnalysisData], params:Parameters):
# restructure the analysis data
restructured = __restructureData(analysisData)
# for each genome
for name in restructured.keys():
# get the genome data
genome = __getGenomeData(params, name)
# get the gff file as a BedTool object
gffFn = os.path.join(os.path.dirname(params.ingroupFns[0]), os.path.splitext(name)[0] + ".gff")
genomeGff = BedTool(gffFn)
# for each level
for level in restructured[name].keys():
# get the genomoic coordinates for the kmers as a BedTool object
kmerCoords = list()
for contig,data in restructured[name][level]:
kmerCoords.append((contig, data.primer.start, data.primer.end + 1))
kmerCoords = BedTool(kmerCoords)
# get the kmer coverage for this genome
coverageFn = kmerCoords.genome_coverage(dz=True, genome=genome).saveas()
# get the intersection of the gff with the coverage
intersect = genomeGff.intersect(b=coverageFn)
outFn = f'{name}_{level}_coverage.gff'
intersect.saveas(outFn)
Error:
121 bedIntervals = BedTool(intervals)
123 coverageFn = bedIntervals.genome_coverage(dz=True, genome=genome).saveas()
--> 125 intersect = genomeGff.intersect(b=coverageFn, header=True)
126 outFn = f'{name}_{level}_coverage.gff'
127 intersect.saveas(outFn)
File [REDACTED]/python3.11/site-packages/pybedtools/bedtool.py:909, in BedTool._log_to_history.<locals>.decorated(self, *args, **kwargs)
905 def decorated(self, *args, **kwargs):
906
907 # this calls the actual method in the first place; *result* is
908 # whatever you get back
--> 909 result = method(self, *args, **kwargs)
911 # add appropriate tags
912 parent_tag = self._tag
File [REDACTED]/python3.11/site-packages/pybedtools/bedtool.py:388, in _wraps.<locals>.decorator.<locals>.wrapped(self, *args, **kwargs)
385 decode_output = not result_is_bam
387 # Do the actual call
--> 388 stream = call_bedtools(
389 cmds,
390 tmp,
391 stdin=stdin,
392 check_stderr=check_stderr,
393 decode_output=decode_output,
394 )
396 if does_not_return_bedtool:
397 return does_not_return_bedtool(stream, **kwargs)
File [REDACTED]/python3.11/site-packages/pybedtools/helpers.py:460, in call_bedtools(cmds, tmpfn, stdin, check_stderr, decode_output, encode_input)
458 sys.stderr.write(stderr)
459 else:
--> 460 raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)
462 except (OSError, IOError) as err:
463 print("%s: %s" % (type(err), os.strerror(err.errno)))
BEDToolsError:
Command was:
bedtools intersect -a [REDACTED]/i1_tiny.gff -header -b [REDACTED]/pybedtools.pm8zlpqu.tmp
Error message was:
Error: unable to open file or unable to determine types for file [REDACTED]/pybedtools.pm8zlpqu.tmp
- Please ensure that your file is TAB delimited (e.g., cat -t FILE).
- Also ensure that your file has integer chromosome coordinates in the
expected columns (e.g., cols 2 and 3 for BED).
I would like to suggest a feature where you have a BED output file, one per genome. https://genome.ucsc.edu/FAQ/FAQformat.html#format1
It has the advantages that
bedtools intersect
.I think this would also mean that you'd want to have an output folder option to place all the BED files in.