rkern / line_profiler

(OLD REPO) Line-by-line profiling for Python - Current repo ->
https://github.com/pyutils/line_profiler
Other
3.6k stars 254 forks source link

TypeError: source code string cannot contain null bytes #108

Closed juancresc closed 6 years ago

juancresc commented 6 years ago

I'm getting the following error while trying to analyze a script of my own:

 kernprof -l python3 novoMITE.py -g test/example.fasta -o test/mites2
Wrote profile results to python3.lprof
Traceback (most recent call last):
  File "/vagrant/bio/novoMITE/venv3/bin/kernprof", line 9, in <module>
    load_entry_point('line-profiler==2.0', 'console_scripts', 'kernprof')()
  File "/vagrant/bio/novoMITE/venv3/lib/python3.4/site-packages/kernprof.py", line 222, in main
    execfile(script_file, ns, ns)
  File "/vagrant/bio/novoMITE/venv3/lib/python3.4/site-packages/kernprof.py", line 35, in execfile
    exec_(compile(f.read(), filename, 'exec'), globals, locals)
TypeError: source code string cannot contain null bytes

There's nothing odd with the script at first sight

juancresc commented 6 years ago

Also, if I leave the script empty and pass no parameters, I get the same result. Using: line-profiler==2.0

caethan commented 6 years ago

What python version are you using? Can you provide a minimal example that causes the failure?

juancresc commented 6 years ago

I'm using python3 (could not run with 2)

kernprof -l python3 novoMITE.py -g test/some.fa -o test/some.mite

and this is my full script

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function
import time, os, csv
from subprocess import Popen, PIPE
from threading import Thread
from math import ceil
from Queue import Queue
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import pairwise2

def blast(query, db, filename):
    """Run ``blastn`` for *query* against *db* and save the output to *filename*.

    If ``db + ".nin"`` does not exist, ``makeblastdb`` is invoked first to
    build a nucleotide database from *db*.

    Args:
        query: Path of the FASTA file passed to ``blastn -query``.
        db: Path of the database (also used as ``makeblastdb`` input/output).
        filename: Path of the file that receives the ``blastn`` standard output
            (requested with ``-outfmt 6``).

    Returns:
        Whatever ``blastn`` wrote to standard error.
    """
    # check if db is created
    if not os.path.exists(db + ".nin"):
        cmd_list = ['makeblastdb', '-dbtype', 'nucl', '-in', db, '-out', db]
        p = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
        # TODO error handling: the makeblastdb output and exit status are ignored
        p.communicate()
    cmd_list = ['blastn', '-db', db, '-query', query, '-max_target_seqs', '1',
                '-evalue', '1e-3', '-outfmt', '6']
    blast_process = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
    out, err = blast_process.communicate()
    # communicate() yields bytes for PIPE streams, so write in binary mode;
    # the context manager guarantees the file is closed (the previous code
    # referenced ``outfile.close`` without calling it).
    with open(filename, "wb") as outfile:
        outfile.write(out)
    return err

def mismatches(a, b):
    """Count the positions at which two strings differ.

    Args:
        a: First string.
        b: Second string.

    Returns:
        ``False`` if either argument is the empty string; otherwise the number
        of index positions where the characters differ. Characters are paired
        with ``zip``, so the comparison stops at the end of the shorter string.
        Note that a result of ``0`` compares equal to ``False``; callers that
        need to tell the two apart must test with ``is False``.
    """
    if a == "" or b == "":
        return False
    # The tuple-parameter lambda variant only parses on Python 2, so it was
    # dropped in favour of this form, which works on both major versions.
    return sum(1 for x, y in zip(a, b) if x != y)

def find_tsd(sequence, tsd_min_len, tsd_max_len, min_score):
    """Look for an identical prefix/suffix pair (a TSD) flanking *sequence*.

    Candidate lengths are tried from longest to shortest, so the longest
    exact match wins.

    Args:
        sequence: String to inspect.
        tsd_min_len: Shortest prefix/suffix length to try (inclusive).
        tsd_max_len: Longest prefix/suffix length to try (inclusive).
        min_score: Currently unused; kept so existing callers keep working.

    Returns:
        ``(tsd, tsd_len)`` for the first length whose prefix equals its
        suffix, or ``False`` when no length matches.
    """
    sequence_len = len(sequence)
    # A prefix and suffix longer than half the sequence would overlap, and for
    # lengths beyond the sequence both slices collapse to the whole string and
    # compare equal trivially. Cap the search so neither can happen.
    longest = min(tsd_max_len, sequence_len // 2)
    for tsd_len in range(longest, tsd_min_len - 1, -1):
        tsd_1 = sequence[:tsd_len]
        tsd_2 = sequence[sequence_len - tsd_len:]
        if tsd_1 == tsd_2:
            return (tsd_1, tsd_len)
    return False

def find_tir(sequence, tir_min_len, score_min=None):
    """Check whether the ends of *sequence* form a terminal inverted repeat.

    The first ``tir_min_len`` characters are globally aligned against the
    reverse complement of the last ``tir_min_len`` characters
    (match +1, mismatch -1, gap open -1, gap extend -1).

    Args:
        sequence: String to inspect.
        tir_min_len: Number of characters taken from each end.
        score_min: Minimum alignment score to accept. Defaults to
            ``ceil(tir_min_len * 0.8)``, the same value the script previously
            read from the module-level ``args.score_min``.

    Returns:
        The leading ``tir_min_len`` characters when the alignment score
        reaches ``score_min``; otherwise ``False``. Also ``False`` when the
        sequence is too short to hold two non-overlapping ends.
    """
    sequence_len = len(sequence)
    # Two ends of tir_min_len characters must fit without overlapping.
    if sequence_len < 2 * tir_min_len:
        return False
    if score_min is None:
        score_min = ceil(tir_min_len * 0.8)
    tir_1 = sequence[:tir_min_len]
    tir_2 = str(Seq(sequence[sequence_len - tir_min_len:]).reverse_complement())
    score = pairwise2.align.globalms(tir_1, tir_2, 1, -1, -1, -1, score_only=True)
    if score >= score_min:
        return tir_1
    return False

def search_mites(q, buffer_seq, buffer_gff, args):
    """Scan every record in *q* for MITE candidates and record the hits.

    For each start position a window of every length from
    ``args.mite_max_len`` down to ``args.mite_min_len + 1`` is tested: it must
    have a TSD (``find_tsd``) and, once the TSD is trimmed from both ends, a
    TIR (``find_tir``). Each hit is written to ``args.output_file`` in FASTA
    format and registered in *buffer_gff*.

    Args:
        q: Queue of sequence records; each must expose ``id`` and ``seq``.
        buffer_seq: Unused; kept for interface compatibility.
        buffer_gff: Dict updated in place, mapping ``'MITE_<n>'`` to
            ``(record.id, start, end, n)``.
        args: Namespace providing ``mite_min_len``, ``mite_max_len``,
            ``tsd_min_len``, ``tsd_max_len``, ``tir_min_len``, ``count`` and
            ``output_file``. ``args.count`` is incremented for every hit.

    Returns:
        ``True`` once the queue is empty.
    """
    while not q.empty():
        record = q.get(0)
        print("Processing sequence %s" % (record.id))
        clean_seq = ''.join(str(record.seq).splitlines())
        seq_len = len(clean_seq)
        cursor_start = 0
        max_seq_end = seq_len - args.mite_min_len
        while cursor_start < max_seq_end:
            # NOTE(review): the stop value excludes args.mite_min_len itself,
            # so windows of exactly the minimum length are never tried.
            # Confirm whether the bound is meant to be inclusive.
            for mite_len in range(args.mite_max_len, args.mite_min_len, -1):
                current_seq = clean_seq[cursor_start:cursor_start + mite_len]
                current_seq_len = len(current_seq)
                tsd_found = find_tsd(current_seq, args.tsd_min_len, args.tsd_max_len, 0)
                if not tsd_found:
                    continue
                tsd, tsd_len = tsd_found
                remaining_seq = current_seq[tsd_len:current_seq_len - tsd_len]
                if not find_tir(remaining_seq, args.tir_min_len):
                    continue

                # Capture the coordinates of the window that was actually
                # extracted *before* moving the cursor; previously they were
                # computed after the shift and did not match the sequence
                # written to the FASTA file.
                mite_start = cursor_start
                mite_end = mite_start + current_seq_len
                cursor_start += args.tir_min_len

                # One id for both the FASTA record and the GFF key; previously
                # the key was built after the increment and ended up one ahead
                # of the FASTA id.
                args.count += 1
                mite_id = 'MITE_' + str(args.count)

                desc = "seq:" + str(record.id)
                desc += " start:" + str(mite_start)
                desc += " end:" + str(mite_end)
                desc += " TSD_len:" + str(tsd_len)
                desc += " TSD:" + str(tsd)
                seq = SeqRecord(Seq(current_seq), id=mite_id, description=desc)
                SeqIO.write(seq, args.output_file, "fasta")
                buffer_gff[mite_id] = (record.id, mite_start, mite_end, args.count)
                # Floor division keeps the percentage an integer on Python 3 too.
                perc = cursor_start * 100 // seq_len
                print("Found " + str(args.count) + " MITEs " + str(perc) + "% of current seq")
            cursor_start += 1
        q.task_done()
    return True

def novo_mite(args):
    """Run the MITE search on a genome and write the FASTA and GFF3 results.

    Args:
        args: Namespace providing ``genome``, ``outfile``, ``database`` and
            ``tir_min_len``, plus whatever ``search_mites`` reads from it.
            This function sets ``args.count``, ``args.score_min`` and
            ``args.output_file`` on it.

    Returns:
        The number of MITEs found (``args.count``).

    Outputs:
        ``<outfile>.fasta`` with the candidate sequences, ``<outfile>.gff3``
        with one row per candidate and, when ``args.database`` is set,
        ``<outfile>.allhits`` with the BLAST output.

    Note:
        The search runs in the calling thread; ``args.cores`` is not used.
    """
    args.count = 0
    buffer_seq = []
    buffer_gff = {}

    # read the genome file
    use_genome = args.genome
    print("Reading processed genome file and starting search...")
    q = Queue(maxsize=0)
    for record in SeqIO.parse(use_genome, 'fasta'):
        q.put(record)

    args.score_min = ceil(args.tir_min_len * 0.8)
    output_file_name = args.outfile + ".fasta"
    args.output_file = open(output_file_name, 'w')
    try:
        search_mites(q, buffer_seq, buffer_gff, args)
    finally:
        # Close (and therefore flush) the FASTA file before BLAST reads it;
        # previously the handle was left open and buffered data could be
        # missing from the file handed to blastn.
        args.output_file.close()

    if args.database:
        hits_file_name = args.outfile + ".allhits"
        blast(output_file_name, args.database, hits_file_name)
        with open(hits_file_name, 'r') as csvfile:
            for r in csv.reader(csvfile, delimiter='\t'):
                row = buffer_gff.get(r[0])
                # Skip ids we did not emit, and keep only the first hit per
                # MITE so the row stays at the five fields the writer expects.
                if row is None or len(row) != 4:
                    continue
                annotation = ('ID=MITE_' + str(row[3]) + ';ELEMENT=' + str(r[1])
                              + ';EVALUE=' + str(r[10]))
                buffer_gff[r[0]] = row + (annotation,)

    with open(args.outfile + ".gff3", "w") as output_gff:
        output_gff.write("##gff-version 3\n")
        # .values() works on both Python 2 and 3 (iteritems is Python 2 only).
        for row in buffer_gff.values():
            desc = str(row[4]) if len(row) == 5 else ""
            write_row = '\t'.join([str(row[0]), 'novoMITE', 'MITE', str(row[1]),
                                   str(row[2]), '.', '+', '.', desc])
            output_gff.write(write_row + '\n')
    return args.count

# Script entry point: parse the command line, run the search, report timing.
# ``args`` is deliberately left at module level because other functions in
# this script may read it as a global.
if __name__ == "__main__":
    start_time = time.time()
    import argparse
    parser = argparse.ArgumentParser()#pylint: disable=invalid-name
    parser.add_argument("-g", "--genome", help="Genome file in fasta format", required=True)
    parser.add_argument("-o","--outfile", help="Output files prefix (will append .fasta and .gff3)", required=True)
    parser.add_argument("-c","--cores", help="Max number of processes to use simultaneously", type=int, default=1)
    parser.add_argument("--tir_min_len", help="TIR min lenght", type=int, default=12)
    parser.add_argument("--tsd_min_len", help="TSD min lenght", type=int, default=2)
    parser.add_argument("--tsd_max_len", help="TSD max lenght", type=int, default=10)
    parser.add_argument("--mite_min_len", help="MITE min lenght", type=int, default=50)
    parser.add_argument("--mite_max_len", help="MITE max lenght", type=int, default=800)
    parser.add_argument("--flank_seq_len", help="Flanking sequence length", type=int, default=50)
    parser.add_argument("-d","--database", help="Database for calssification", default=False)
    args = parser.parse_args()#pylint: disable=invalid-name
    count = novo_mite(args)
    print("100%% found %i MITEs in %s seconds" % ((count), time.time() - start_time))

juancresc commented 6 years ago

now with python 2 I got:

(venv)➜  novoMITE git:(master) ✗ kernprof -l python novoMITE.py -g test/some.fa -o test/some.mite
Wrote profile results to python.lprof
Traceback (most recent call last):
  File "/vagrant/bio/novoMITE/venv/bin/kernprof", line 11, in <module>
    load_entry_point('line-profiler==2.1.2', 'console_scripts', 'kernprof')()
  File "/vagrant/bio/novoMITE/venv/local/lib/python2.7/site-packages/kernprof.py", line 222, in main
    execfile(script_file, ns, ns)
  File "/vagrant/bio/novoMITE/venv/bin/python", line 1
SyntaxError: Non-ASCII character '\x86' in file /vagrant/bio/novoMITE/venv/bin/python on line 2, but no encoding declared; see http://www.python.org/peps/pep-0263.html for details

encoding is declared in my script