hail-is / hail

Cloud-native genomic dataframes and batch computing
https://hail.is
MIT License
982 stars 246 forks source link

BGZIP bug #566

Closed tpoterba closed 8 years ago

tpoterba commented 8 years ago

Replicable bug:

hail -b 0 importvcf src/test/resources/multipleChromosomes.vcf -n 10 exportvcf -o /tmp/out.vcf.bgz importvcf /tmp/out.vcf.bgz -n 10 count --genotypes

hail: count: caught exception: Job aborted due to stage failure: Task 4 in stage 0.0 failed 1 times, most recent failure: Lost task 4.0 in stage 0.0 (TID 4, localhost): htsjdk.samtools.SAMFormatException: Invalid GZIP header
    at htsjdk.samtools.util.BlockGunzipper.unzipBlock(BlockGunzipper.java:72)
    at htsjdk.samtools.util.BlockCompressedInputStream.inflateBlock(BlockCompressedInputStream.java:410)
    at htsjdk.samtools.util.BlockCompressedInputStream.readBlock(BlockCompressedInputStream.java:392)
    at htsjdk.samtools.util.BlockCompressedInputStream.available(BlockCompressedInputStream.java:127)
    at org.seqdoop.hadoop_bam.util.BGZFSplitCompressionInputStream.readWithinBlock(BGZFSplitCompressionInputStream.java:81)
    at org.seqdoop.hadoop_bam.util.BGZFSplitCompressionInputStream.read(BGZFSplitCompressionInputStream.java:48)
    at java.io.InputStream.read(InputStream.java:101)
    at org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader.fillBuffer(CompressedSplitLineReader.java:130)
    at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:216)
    at org.apache.hadoop.util.LineReader.readLine(LineReader.java:174)
    at org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader.readLine(CompressedSplitLineReader.java:159)
    at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:134)
    at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
    at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:239)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:216)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
    at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:87)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:88)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
tpoterba commented 8 years ago

reverted in a3d504c