MIT-LCP / wfdb-python

Native Python WFDB package
MIT License
738 stars 300 forks source link

Broken with libFLAC 1.4.0 through 1.4.2 ("internal psf_fseek() failed") #488

Open bemoody opened 4 months ago

bemoody commented 4 months ago

Some versions of libFLAC report an error when we try to seek to particular sample numbers:

(All tested only on Linux amd64.)

In the case of wfdb-python, this manifests as an exception "internal psf_fseek() failed" (but see also issue #486, which appears to be something different.)

Some examples from mimic4wdb/0.1.0:

filename                                       sample     possible byte range
p100/p10079700/85594648/85594648_0003e.dat     18229140   16277832-16286024
p101/p10100546/83268087/83268087_0113p.dat     2288000    1049471-1057663
p110/p11013146/82432904/82432904_0011e.dat     1686714    2026933-2084277
p132/p13208026/82342224/82342224_0006e.dat     18576787   17925601-17950177
p132/p13240081/87706224/87706224_0102p.dat     363771     322161-330353
p142/p14285792/87867111/87867111_0009e.dat     4161139    5495737-5503929
p155/p15552902/85078610/85078610_0017r.dat     3313501    1682872-1691064
p158/p15857793/84686667/84686667_0004e.dat     13244922   12714501-12722693
p160/p16096029/87730914/87730914_0010e.dat     9319846    12328985-12337177
p160/p16096029/87730914/87730914_0045r.dat     3254418    1713774-1721966
p166/p16644640/84209926/84209926_0022e.dat     1012957    1128680-1136872
p171/p17107892/83439223/83439223_0001e.dat     2026051    1807349-1815541
p174/p17459141/80111237/80111237_0014e.dat     4904188    7866536-7882920
p179/p17973277/83801243/83801243_0016e.dat     25970274   33443196-33451388
p179/p17973277/83801243/83801243_0010r.dat     267184     126505-134697
p193/p19305085/87468061/87468061_0019e.dat     22393823   22401189-22409381
p193/p19313794/88537698/88537698_0016e.dat     8141191    11785911-11794103

It'd be good to figure out why these examples are broken, and find a minimal example that we can use as a test case (and also share with the upstream flac and soundfile developers.)

bemoody commented 4 months ago

Here's the tool I'm using to search for these examples:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include <sys/stat.h>
#include <FLAC/stream_decoder.h>

struct filehandle {
    const char *filename;
    int fd;
    long long read_pos;
    struct stat st;
    int nseeks;
    int abort_nseeks;
    long long seek_pos;

    struct {
        long long t;
        long long seek_pos;
        long long read_pos;
        int status;
        int nseeks;
    } results_cache[64];
};

static FLAC__StreamDecoderReadStatus
read_cb(const FLAC__StreamDecoder *dec, FLAC__byte buffer[],
        size_t *bytes, void *client_data)
{
    struct filehandle *fh = client_data;
    ssize_t n;
    size_t max = *bytes;
    *bytes = 0;
    while (*bytes < max) {
        n = pread(fh->fd, buffer, max - *bytes, fh->read_pos);
        if (n < 0)
            return FLAC__STREAM_DECODER_READ_STATUS_ABORT;
        buffer += n;
        *bytes += n;
        fh->read_pos += n;
        if (n == 0)
            return FLAC__STREAM_DECODER_READ_STATUS_END_OF_STREAM;
    }
    return FLAC__STREAM_DECODER_READ_STATUS_CONTINUE;
}

static FLAC__StreamDecoderSeekStatus
seek_cb(const FLAC__StreamDecoder *dec, FLAC__uint64 pos, void *client_data)
{
    struct filehandle *fh = client_data;

    if (++fh->nseeks == fh->abort_nseeks)
        return FLAC__STREAM_DECODER_SEEK_STATUS_ERROR;

    fh->seek_pos = pos;
    fh->read_pos = pos;
    return FLAC__STREAM_DECODER_SEEK_STATUS_OK;
}

static FLAC__StreamDecoderTellStatus
tell_cb(const FLAC__StreamDecoder *dec, FLAC__uint64 *pos, void *client_data)
{
    struct filehandle *fh = client_data;
    *pos = fh->read_pos;
    return FLAC__STREAM_DECODER_TELL_STATUS_OK;
}

static FLAC__StreamDecoderLengthStatus
length_cb(const FLAC__StreamDecoder *dec, FLAC__uint64 *len, void *client_data)
{
    struct filehandle *fh = client_data;
    *len = fh->st.st_size;
    return FLAC__STREAM_DECODER_LENGTH_STATUS_OK;
}

static FLAC__bool eof_cb(const FLAC__StreamDecoder *dec, void *client_data)
{
    struct filehandle *fh = client_data;
    return (fh->read_pos >= fh->st.st_size);
}

static FLAC__StreamDecoderWriteStatus
write_cb(const FLAC__StreamDecoder *dec, const FLAC__Frame *frame,
         const FLAC__int32 *const buf[], void *client_data)
{
    return FLAC__STREAM_DECODER_WRITE_STATUS_CONTINUE;
}

static void
error_cb(const FLAC__StreamDecoder *decoder,
         FLAC__StreamDecoderErrorStatus status,
         void *client_data)
{
}

static FLAC__StreamDecoder *init_decoder(struct filehandle *fh)
{
    FLAC__StreamDecoder *dec;

    fh->read_pos = 0;
    fh->seek_pos = 0;
    fh->nseeks = 0;
    fh->abort_nseeks = -1;

    if (!(dec = FLAC__stream_decoder_new())) {
        printf("%s: stream_decoder_new failed\n", fh->filename);
        return NULL;
    } else if (FLAC__stream_decoder_init_stream(dec, &read_cb,
                                                &seek_cb, &tell_cb,
                                                &length_cb, &eof_cb,
                                                &write_cb, NULL,
                                                &error_cb, fh)) {
        printf("%s: stream_decoder_init_stream failed\n", fh->filename);
        FLAC__stream_decoder_delete(dec);
        return NULL;
    } else if (!FLAC__stream_decoder_process_until_end_of_metadata(dec)) {
        printf("%s: stream_decoder_process_until_end_of_metadata failed\n",
               fh->filename);
        FLAC__stream_decoder_delete(dec);
        return NULL;
    } else {
        return dec;
    }
}

static int try_seek(struct filehandle *fh, int depth, long long t,
                    long long *seek_pos, long long *read_pos, int *nseeks)
{
    FLAC__StreamDecoder *dec;
    int status;

    if (t == fh->results_cache[depth].t) {
        *seek_pos = fh->results_cache[depth].seek_pos;
        *read_pos = fh->results_cache[depth].read_pos;
        *nseeks = fh->results_cache[depth].nseeks;
        return fh->results_cache[depth].status;
    }

    if (!(dec = init_decoder(fh)))
        return 2;

    fh->nseeks = 0;
    fh->abort_nseeks = 2;
    fh->seek_pos = 0;
    status = !!FLAC__stream_decoder_seek_absolute(dec, t);
    FLAC__stream_decoder_delete(dec);

    fh->results_cache[depth].t = t;
    fh->results_cache[depth].status = status;
    fh->results_cache[depth].seek_pos = *seek_pos = fh->seek_pos;
    fh->results_cache[depth].read_pos = *read_pos = fh->read_pos;
    fh->results_cache[depth].nseeks = *nseeks = fh->nseeks;
    return status;
}

static int probe_bad_seek(struct filehandle *fh, long long total_samples,
                          long long sync_pos)
{
    long long a, b, t, seek_pos, read_pos,
        best_seek_pos = -1, best_read_pos = -1, best_t = -1;
    int depth = 0, status, nseeks, best_status = 0, best_nseeks = -1;

    a = 0;
    b = total_samples;

    /* Try to pick a sample number such that seek_absolute's first
       seek lands at or before sync_pos, as close as possible. */

    while (a < b) {
        t = (a + b) / 2;
        status = try_seek(fh, depth, t, &seek_pos, &read_pos, &nseeks);
        if (status > 1)
            return status;

        if (seek_pos > sync_pos)
            b = t;
        else {
            if (seek_pos > best_seek_pos) {
                best_seek_pos = seek_pos;
                best_read_pos = read_pos;
                best_t = t;
                best_status = status;
                best_nseeks = nseeks;
            }
            a = t + 1;
        }
        depth++;
    }

    if (best_seek_pos > sync_pos || best_read_pos < sync_pos + 2) {
        printf("%s: unable to force seeking to %lld\n",
               fh->filename, sync_pos);
        return 2;
    } else if (best_status != 0) {
        /* libFLAC successfully parsed the given frame header
           (or skipped a fake frame header and parsed the next
           real frame header), and continued reading */
        return 0;
    } else if (best_nseeks > 1) {
        /* libFLAC successfully parsed the given frame header (or
           skipped a fake frame header and parsed the next real frame
           header), and decided to seek somewhere else */
        return 0;
    } else {
        /* libFLAC threw an error after reading the data at sync_pos,
           without trying to perform another seek */
        printf("%s: FAILED at t=%lld (bytes %lld to %lld)\n",
               fh->filename, best_t, best_seek_pos, best_read_pos);
        return 1;
    }
}

int main(int argc, char **argv)
{
    struct filehandle fh;
    FLAC__StreamDecoder *dec;
    FILE *fp;
    long long pos, npos = 0, total_samples;
    int status, failed = 0, c;

    if (argc != 2) {
        printf("usage: %s file.flac\n", argv[0]);
        return 2;
    }

    fh.filename = argv[1];
    if (!(fp = fopen(fh.filename, "r"))) {
        printf("%s: open failed: %s\n", fh.filename, strerror(errno));
        return 2;
    }
    fh.fd = fileno(fp);
    if (fstat(fh.fd, &fh.st)) {
        printf("%s: fstat failed: %s\n", fh.filename, strerror(errno));
        return 2;
    }

    if (!(dec = init_decoder(&fh)))
        return 2;
    total_samples = FLAC__stream_decoder_get_total_samples(dec);
    FLAC__stream_decoder_delete(dec);

    fseeko(fp, fh.read_pos, SEEK_SET);
    while ((c = fgetc(fp)) != EOF) {
        if (c == 0xff) {
            c = fgetc(fp);
            if (c == 0xf8 || c == 0xf9) {
                pos = ftello(fp) - 2;
                if (pos > npos) {
                    fprintf(stderr, "%15lld/%15lld (%.0f%%)\r",
                            pos, (long long) fh.st.st_size,
                            (pos * 100.0 / fh.st.st_size));
                    npos = pos + 500000;
                }
                status = probe_bad_seek(&fh, total_samples, pos);
                if (status == 1)
                    failed = 1;
                else if (status != 0)
                    return status;
            }
        }
    }
    fprintf(stderr, "\n");
    return failed;
}

(My first attempt took 5 hours to run on a 300 MB file, this one takes 4 minutes.)

bemoody commented 2 months ago

Here are the exact commands you can run to check if your installation is affected:

wget https://physionet.org/files/mimic4wdb/0.1.0/waves/p179/p17973277/83801243/83801243_0010r.dat
python3 -c 'import soundfile; soundfile.SoundFile("83801243_0010r.dat").seek(267184)'
tompollard commented 2 months ago

On MacOS 14.6 Beta (23G5061b) M1 Max:

soundfile==0.11.0

python3 -c 'import soundfile; soundfile.SoundFile("83801243_0010r.dat").seek(267184)'

Runs without error.

soundfile==0.12.1

python3 -c 'import soundfile; soundfile.SoundFile("83801243_0010r.dat").seek(267184)'
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/tompollard/projects/wfdb-python/env/lib/python3.9/site-packages/soundfile.py", line 802, in seek
    _error_check(self._errorcode)
  File "/Users/tompollard/projects/wfdb-python/env/lib/python3.9/site-packages/soundfile.py", line 1407, in _error_check
    raise LibsndfileError(err, prefix=prefix)
soundfile.LibsndfileError: Internal psf_fseek() failed.

Latest version of soundfile

Installing the latest version of soundfile from the top of https://github.com/bastibe/python-soundfile raises the same error :/

pip install git+https://github.com/bastibe/python-soundfile.git@8ebb523725e315f24c5592677353c43c4562be54

(Master branch, commit: 8ebb523725e315f24c5592677353c43c4562be54)

Latest commit on soundfile branch v1.2.2

Installing the version of soundfile at: https://github.com/bastibe/python-soundfile/commits/v1.2.2/ also raises the error.

pip install git+https://github.com/bastibe/python-soundfile.git@cbaaad4fd06366b7624d0ce08354463ec7917a17

(v1.2.2 branch, commit: cbaaad4fd06366b7624d0ce08354463ec7917a17)

tompollard commented 2 months ago

@bemoody have you following the discussion at: https://github.com/bastibe/python-soundfile/issues/274 ? Sounds related?

bemoody commented 2 months ago

Not entirely following but it looks like that is a bug in libsndfile's float-to-integer conversion. That doesn't seem relevant.

bemoody commented 2 months ago

@tompollard: In the libsndfile-binaries repository there is a script for building libsndfile_arm64.dylib. It might be helpful for you to test this locally.

after which you should be able to reinstall and test the package as before.

eagomez2 commented 2 months ago

On MacOS 14.6 Beta (23G5061b) M1 Max:

soundfile==0.11.0

python3 -c 'import soundfile; soundfile.SoundFile("83801243_0010r.dat").seek(267184)'

Runs without error.

soundfile==0.12.1

python3 -c 'import soundfile; soundfile.SoundFile("83801243_0010r.dat").seek(267184)'
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/tompollard/projects/wfdb-python/env/lib/python3.9/site-packages/soundfile.py", line 802, in seek
    _error_check(self._errorcode)
  File "/Users/tompollard/projects/wfdb-python/env/lib/python3.9/site-packages/soundfile.py", line 1407, in _error_check
    raise LibsndfileError(err, prefix=prefix)
soundfile.LibsndfileError: Internal psf_fseek() failed.

Latest version of soundfile

Installing the latest version of soundfile from the top of https://github.com/bastibe/python-soundfile raises the same error :/

pip install git+https://github.com/bastibe/python-soundfile.git@8ebb523725e315f24c5592677353c43c4562be54

(Master branch, commit: 8ebb523725e315f24c5592677353c43c4562be54)

Latest commit on soundfile branch v1.2.2

Installing the version of soundfile at: https://github.com/bastibe/python-soundfile/commits/v1.2.2/ also raises the error.

pip install git+https://github.com/bastibe/python-soundfile.git@cbaaad4fd06366b7624d0ce08354463ec7917a17

(v1.2.2 branch, commit: cbaaad4fd06366b7624d0ce08354463ec7917a17)

Hi @tompollard ,

Did soundfile==0.11.0 work fine always? I am experiencing the exact same error with soundfile==0.12.1 in Debian 10

bemoody commented 2 months ago

@eagomez2: to the best of my knowledge, soundfile==0.11.0 should work correctly on Debian testing/unstable (you will also need to install the system libsndfile1 package.)

eagomez2 commented 2 months ago

@eagomez2: to the best of my knowledge, soundfile==0.11.0 should work correctly on Debian testing/unstable (you will also need to install the system libsndfile1 package.)

I though it was working because I was not hitting the error that I could earlier replicate in my script, but after running for some hours more I still hit the same error, even using soundfile==0.11.0 🙁

bemoody commented 2 months ago

Sorry I wasn't clear. Yes, if you are using Debian 10, then you're susceptible to either bug #486 or #488, depending on which version of soundfile you installed.

soundfile 0.11.0 would work if you upgraded your system to Debian 13, which isn't yet released.

There is also an unreleased version of soundfile that fixes the bugs for Linux x86-64, Linux arm64, Windows x86, Windows x86-64, and MacOS x86-64. That's the version that Tom mentioned above, which you can install by running

pip install --force-reinstall git+https://github.com/bastibe/python-soundfile.git@cbaaad4fd06366b7624d0ce08354463ec7917a17
eagomez2 commented 2 months ago

Sorry I wasn't clear. Yes, if you are using Debian 10, then you're susceptible to either bug #486 or #488, depending on which version of soundfile you installed.

soundfile 0.11.0 would work if you upgraded your system to Debian 13, which isn't yet released.

There is also an unreleased version of soundfile that fixes the bugs for Linux x86-64, Linux arm64, Windows x86, Windows x86-64, and MacOS x86-64. That's the version that Tom mentioned above, which you can install by running

pip install --force-reinstall git+https://github.com/bastibe/python-soundfile.git@cbaaad4fd06366b7624d0ce08354463ec7917a17

Thank you! So far I bypassed the problem by reading the entire file and then selecting the section I want by indexing, but I will give it a try to the aforementioned solutions.