Pheniqs only processes a small fraction of reads

rspreafico-sgi commented 6 years ago

Hi, I am trying trying to demux a PE run using inline barcodes on read 2:

{
    "input": [
        "LS01-2018-09-24-LS-UA-CAAF-001_R1_001.fastq.gz",
        "LS01-2018-09-24-LS-UA-CAAF-001_R2_001.fastq.gz"
    ],
    "transform": { 
        "token": [ "0::", "1:14:" ],
        "segment pattern": [ "0", "1" ]
    },
    "multiplex": {
        "transform": { "token": [ "1::12" ] },
        "codec": {
            "@TATGAACGTCCG": { "barcode": [ "TATGAACGTCCG" ], "output": [ "BC4CAAF1-10m-20180912-1120_L001_R1_001.fastq.gz", "BC4CAAF1-10m-20180912-1120_L001_R2_001.fastq.gz" ] },
            "@CCACATTGGGTC": { "barcode": [ "CCACATTGGGTC" ], "output": [ "BC1CAAF1-200m-20180912-0920_L001_R1_001.fastq.gz", "BC1CAAF1-200m-20180912-0920_L001_R2_001.fastq.gz" ] },
            "@TCAGTCAGATGA": { "barcode": [ "TCAGTCAGATGA" ], "output": [ "BC4CAAF1-10m-20180912-1015_L001_R1_001.fastq.gz", "BC4CAAF1-10m-20180912-1015_L001_R2_001.fastq.gz" ] },
            "@AAGTCACACACA": { "barcode": [ "AAGTCACACACA" ], "output": [ "BC1CAAF1-200m-20180912-1020_L001_R1_001.fastq.gz", "BC1CAAF1-200m-20180912-1020_L001_R2_001.fastq.gz" ] },
            "@GCTGTGATTCGA": { "barcode": [ "GCTGTGATTCGA" ], "output": [ "BC2CAAF2-200m-20180912-0920_L001_R1_001.fastq.gz", "BC2CAAF2-200m-20180912-0920_L001_R2_001.fastq.gz" ] }
        },
        "undetermined": { "output": [ "LS01-undetermined_L001_R1_001.fastq.gz", "LS01-undetermined_L001_R2_001.fastq.gz" ] },
        "algorithm": "mdd",
        "include filtered": true
    }
}

However, I always get back only 116 reads, despite the fact that the two input files have thousands of reads. I have tried with PAML too, same results.

{
    "multiplex": {
        "average classified distance": 0.219298245614035,
        "average pf classified distance": 0.219298245614035,
        "classified": [
            {
                "ID": "AAGTCACACACA",
                "PU": "AAGTCACACACA",
                "average distance": 0.461538461538461,
                "average pf distance": 0.461538461538461,
                "barcode": [
                    "AAGTCACACACA"
                ],
                "concentration": 0.198,
                "count": 13,
                "index": 1,
                "pf count": 13,
                "pf fraction": 1.0,
                "pf pooled classified fraction": 0.114035087719298,
                "pf pooled fraction": 0.112068965517241,
                "pooled classified fraction": 0.114035087719298,
                "pooled fraction": 0.112068965517241
            },
            {
                "ID": "CCACATTGGGTC",
                "PU": "CCACATTGGGTC",
                "average distance": 0.071428571428571,
                "average pf distance": 0.071428571428571,
                "barcode": [
                    "CCACATTGGGTC"
                ],
                "concentration": 0.198,
                "count": 28,
                "index": 2,
                "pf count": 28,
                "pf fraction": 1.0,
                "pf pooled classified fraction": 0.245614035087719,
                "pf pooled fraction": 0.241379310344827,
                "pooled classified fraction": 0.245614035087719,
                "pooled fraction": 0.241379310344827
            },
            {
                "ID": "GCTGTGATTCGA",
                "PU": "GCTGTGATTCGA",
                "average distance": 0.260869565217391,
                "average pf distance": 0.260869565217391,
                "barcode": [
                    "GCTGTGATTCGA"
                ],
                "concentration": 0.198,
                "count": 46,
                "index": 3,
                "pf count": 46,
                "pf fraction": 1.0,
                "pf pooled classified fraction": 0.403508771929824,
                "pf pooled fraction": 0.396551724137931,
                "pooled classified fraction": 0.403508771929824,
                "pooled fraction": 0.396551724137931
            },
            {
                "ID": "TATGAACGTCCG",
                "PU": "TATGAACGTCCG",
                "average distance": 1.0,
                "average pf distance": 1.0,
                "barcode": [
                    "TATGAACGTCCG"
                ],
                "concentration": 0.198,
                "count": 1,
                "index": 4,
                "pf count": 1,
                "pf fraction": 1.0,
                "pf pooled classified fraction": 0.008771929824561,
                "pf pooled fraction": 0.008620689655172,
                "pooled classified fraction": 0.008771929824561,
                "pooled fraction": 0.008620689655172
            },
            {
                "ID": "TCAGTCAGATGA",
                "PU": "TCAGTCAGATGA",
                "average distance": 0.153846153846153,
                "average pf distance": 0.153846153846153,
                "barcode": [
                    "TCAGTCAGATGA"
                ],
                "concentration": 0.198,
                "count": 26,
                "index": 5,
                "pf count": 26,
                "pf fraction": 1.0,
                "pf pooled classified fraction": 0.228070175438596,
                "pf pooled fraction": 0.224137931034482,
                "pooled classified fraction": 0.228070175438596,
                "pooled fraction": 0.224137931034482
            }
        ],
        "classified count": 114,
        "classified fraction": 0.982758620689655,
        "classified pf fraction": 1.0,
        "count": 116,
        "pf classified count": 114,
        "pf classified fraction": 0.982758620689655,
        "pf count": 116,
        "pf fraction": 1.0,
        "unclassified": {
            "ID": "undetermined",
            "PU": "undetermined",
            "count": 2,
            "index": 0,
            "pf count": 2,
            "pf fraction": 1.0,
            "pf pooled classified fraction": 0.017543859649122,
            "pf pooled fraction": 0.017241379310344,
            "pooled classified fraction": 0.017543859649122,
            "pooled fraction": 0.017241379310344
        }
    }
}

moonwatcher commented 5 years ago

Hi Sorry for the late reply.

Have you tried this with several datasets or is it only with those particular fastq files? it is possible that it is something in the gzip lacing. Can you please try extracting the gz files into uncompressed files and try feeding those? I am afraid not all gzip files are the same...

moonwatcher commented 5 years ago

To be clear. It is theoretically possible that it’s a more subtle bug. The multi threaded double buffering mechanism in Pheniqs is quite a complex thing... but it has been in production for quite a while and those kind of bugs are VERY rare these days.

I have seen many zip files that were generated with slight violations that are not compatible. For instance the gzip on MacOS and the gzip on an an average gnu based Linux machine are incompatible is some scenarios and it can make the compression engine in HTSLib read a few blocks fine and stop. Usually just decompressing the file with standard gzip will work and you can then recomporess it.

You can also just feed Pheniqs from stdin but only one input... (for instance an interleaved FASTQ or SAM).

rspreafico-sgi commented 5 years ago

Reads came gzipped straight off Basespace, they were not compressed by a user in Linux or Mac.

moonwatcher commented 5 years ago

Do you observe the same effect when feeding the same files uncompressed? Did Pheniqs print out any error message?

I would love to have s look at the files if that is possible.

rspreafico-sgi commented 5 years ago

Uncompressed FASTQ: works
Recompressed on CentOS 7: works
Originally compressed gzipped file straight off Basespace: confirmed it still doesn't work

Unfortunately I am not at liberty of sharing these files...

rspreafico-sgi commented 5 years ago

Both recompressed and originally compressed file pass the test of gzip -tv

moonwatcher commented 5 years ago

Decompression is actually preformed by HTSlib, except for an initial snoop done with zlib to verify the format and structure of the input. Do you have similar files from Basespace you can test to see if the behavior is the same?

Since you say it stops after 116 reads, that sounds like about the time it would hit the first block end. its possible that just examining the first 2mb or so of the file would be sufficient to figure it out. would be it ok for you to share the first 2mb of each file?

moonwatcher commented 5 years ago

Which version of pheniqs is this? have you built it yourself or installed it from conda? if you built it yourself, which version of htslib? has htslib been built with libdeflate? support for libdeflate has been added in htslib over the 1.8-1.9 revisions and is significantly faster at decoding gzip but perhaps its introducing some incompatibilities...

moonwatcher commented 5 years ago

another thing to try is bgzip --test on the file. Pheniqs uses bgzf to decompress gzip compressed fastq files.

rspreafico-sgi commented 5 years ago

I originally filed the bug report when using 2.0.4, I am currently using 2.0.6 from bioconda (which installs htslib version 1.9 - and libdeflate seems included, see here). If I recall correctly, back then I had tried with both the conda version as well as compiling the latest snapshot from Github, but I no longer have details on the htslib and zlib I had installed on that system to enable compilation.

bgzip --test works fine on all gzipped version, both those failing and those succeeding from demux.

Unfortunately I cannot share even a portion of the gene - trust me, I would if I could, but it's a rabbit hole on my end. However, I am inquiring with the sequencing provider whether there is anything I am unaware they are doing on their end before releasing the files.

rspreafico-sgi commented 5 years ago

If it helps, here is how the sequencing providers generates compressed fastq files:

Fastqs are compressed during bcl2fastq conversion and we use the --no-bgzf-compression option.

moonwatcher commented 5 years ago

So the files are gzip not bgzf compressed. But you say bgzip --test doesn't complain. try actually extract the file with bgzip and see that it doesn't stop after 116 reads...

rspreafico-sgi commented 5 years ago

That's it. bgzip --test does not complain but decompressing with bgzip -d produces a file with 116 reads:

wc -l *.fastq
464 LS02-2018-09-24-LS-UA-CAAF-002_R1_001.fastq

rspreafico-sgi commented 5 years ago

Decompressing the file same with gunzip works fine

moonwatcher commented 5 years ago

Interesting... it probably sees and EOF marker and thinks its the end, since it is not complaining or giving out some error. perhaps we can post this question on the htslib GitHub project. They will either know why this is happening or will need to look at the file to debug it...

For your purpose right now I would say recompress the data or even better use pheniqs to convert it to CRAM and save space :) but it is an interesting question generally. It will require an example file.

Maybe you can find a file delivered to you from the same facility that shows the same behavior and can be shared?

moonwatcher commented 5 years ago

If bgzip turns out to have too many incompatiblity issues I will look into adding an option to decode with standard gzip.

Gzip is generally the bottle neck in processing genetic data since it’s really old and doesn’t scale well with multithreading.

rspreafico-sgi commented 5 years ago

Thanks for addressing this.

It will be hard for me to get other data from the same facility, since they belong to other customers. However, it seems that similar FASTQ files could be regenerated from any run you have BCL files for using bcl2fastq with the option above.

Yep for me I will use the workaround of decompressing/recompressing with gzip.

moonwatcher commented 5 years ago

Notice that IO with unaligned BAM, and even more so with CRAM is usually much faster, at least on machines with multiple cores, since gzip basically doesn't parallelize. If you intend to play around with the data I suggest making a single Pheniqs run to collapse all your fastq files into a single, interleaved cram file and use that as input.

For instance: pheniqs mux --input foo_R1.fastq.gz --input foo_I1.fastq.gz --input foo_I2.fastq.gz --input foo_R2.fastq.gz --output foo.cram

will interleave dual index paired end reads into a single unaligned foo.cram. it will take less space and allow pheniqs to use multiple threads for reading from it.

for bonus points you should install the zsh_completion, if haven't already. That is if you use zsh. if you don't you should be asking yourself why :)

moonwatcher commented 5 years ago

using interleaved files also has the bonus of allowing you to feed them from standard input (pheniqs will automatically detect the format) so if you want to preprocess before feeding the data into pheniqs you can do that without a temporary file.

moonwatcher commented 5 years ago

I added in the latest commit support for query parameters and it is now possible to specify compression parameters for HTSlib https://biosails.github.io/pheniqs/2.0/manual.html#query-parameters

That means you can now pick bgzf vs gz compression and zlib compression level. Not sure exactly what HTSlib does with those but it did yield different file sizes playing with them so potentially this might help with your issue.

moonwatcher commented 5 years ago

I am going to close this ticket. Please report if you have any further issues with this.

biosails / pheniqs

Pheniqs only processes a small fraction of reads #16