isciences / exactextract

Fast and accurate raster zonal statistics
Apache License 2.0
260 stars 33 forks source link

Unexpected result for stats with no result using integer rasters #138

Closed theroggy closed 3 months ago

theroggy commented 3 months ago

It seems that for geotiff files, when the nodata value is not np.nan, nodata is not respected.

Tested with exactextract 0.2.0.dev199 from python

Results of some tests:

EDIT: changed the nodata value so the example with scale and offset set gives a more interesting result. TLDR: the scale and offset are applied to the nodata value returned.

exact_extract for gdal_type=1, nodata=2, scale=None, offset=None
    -> results=[{'type': 'Feature', 'properties': {'mode': 2}}]
exact_extract for gdal_type=6, nodata=nan, scale=None, offset=None
    -> results=[{'type': 'Feature', 'properties': {'mode': nan}}]
exact_extract for gdal_type=1, nodata=255, scale=0.004, offset=-0.08
    -> results=[{'type': 'Feature', 'properties': {'mode': 0.9400000000000001}}]
exact_extract for gdal_type=6, nodata=nan, scale=0.004, offset=-0.08
    -> results=[{'type': 'Feature', 'properties': {'mode': nan}}]

Script to reproduce:

from pathlib import Path

import numpy as np
import pytest
from numpy import nan
from osgeo import gdal

from exactextract import exact_extract

gdal.UseExceptions()

def create_gdal_raster(
    fname, values, *, gt=None, gdal_type=None, nodata=None, scale=None, offset=None
):
    gdal = pytest.importorskip("osgeo.gdal")
    gdal_array = pytest.importorskip("osgeo.gdal_array")
    drv = gdal.GetDriverByName("GTiff")
    bands = 1 if len(values.shape) == 2 else values.shape[0]
    if gdal_type is None:
        gdal_type = gdal_array.NumericTypeCodeToGDALTypeCode(values.dtype)
    ds = drv.Create(
        str(fname), values.shape[-2], values.shape[-1], bands=bands, eType=gdal_type
    )
    if gt is None:
        ds.SetGeoTransform((0.0, 1.0, 0.0, values.shape[-2], 0.0, -1.0))
    else:
        ds.SetGeoTransform(gt)
    if nodata:
        if type(nodata) in {list, tuple}:
            for i, v in enumerate(nodata):
                ds.GetRasterBand(i + 1).SetNoDataValue(v)
        else:
            ds.GetRasterBand(1).SetNoDataValue(nodata)
    if scale:
        ds.GetRasterBand(1).SetScale(scale)
    if offset:
        ds.GetRasterBand(1).SetOffset(offset)
    if len(values.shape) == 2:
        ds.WriteArray(values)
    else:
        for i in range(bands):
            ds.GetRasterBand(i + 1).WriteArray(values[i, :, :])

def make_rect(xmin, ymin, xmax, ymax, id=None, properties=None):
    f = {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            "coordinates": [
                [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax], [xmin, ymin]]
            ],
        },
    }
    if id is not None:
        f["id"] = id
    if properties is not None:
        f["properties"] = properties
    return f

def test_tiff_nodata_scale_offset(tmp_path, gdal_type, nodata, offset, scale):
    print(
        "exact_extract for "
        f"gdal_type={gdal_type}, nodata={nodata}, scale={scale}, offset={offset}"
    )
    raster_fname = tmp_path / f"test_nodata_{offset}_{scale}.tif"
    raster_fname.unlink(missing_ok=True)
    raster_fname = str(raster_fname)
    create_gdal_raster(
        raster_fname,
        np.array(
            [
                [nodata, nodata, nodata],
                [nodata, nodata, nodata],
                [nodata, nodata, nodata],
            ]
        ),
        gdal_type=gdal_type,
        nodata=nodata,
        scale=scale,
        offset=offset,
    )
    square = make_rect(0, 0, 2, 2)

    results = exact_extract(raster_fname, square, "mode")

    # assert np.isnan(results[0]["properties"]["mode"])
    print(f"    -> {results=}")

if __name__ == "__main__":
    tmp_path = Path("C:/temp")

    # Test with nodata=2 and gdal_type=gdal.GDT_Byte
    test_tiff_nodata_scale_offset(
        tmp_path, gdal_type=gdal.GDT_Byte, nodata=2, scale=None, offset=None
    )
    # Test with nodata=nan and gdal_type=gdal.GDT_Float32
    test_tiff_nodata_scale_offset(
        tmp_path, gdal_type=gdal.GDT_Float32, nodata=np.nan, scale=None, offset=None
    )
    # Test with nodata=2 and gdal_type=gdal.GDT_Byte
    test_tiff_nodata_scale_offset(
        tmp_path, gdal_type=gdal.GDT_Byte, nodata=255, scale=0.004, offset=-0.08
    )
    # Test with nodata=nan and gdal_type=gdal.GDT_Float32
    test_tiff_nodata_scale_offset(
        tmp_path, gdal_type=gdal.GDT_Float32, nodata=np.nan, scale=0.004, offset=-0.08
    )
dbaston commented 3 months ago

The nodata values are not being ignored.

The issue is that the type of "mode" is the same as the type of the input raster, i.e. byte. Since there are no pixels with an actual data value, there is no result for "mode". How do you represent "no result" ? If the input raster is a floating point type, you can return NaN. There being no equivalent of NaN for integers, it returns ... the nodata value.

I can see that this is not ideal. It'll take some thought to see how it can be improved.

theroggy commented 3 months ago

Maybe return None?

dbaston commented 3 months ago

None exists only in Python, and it's a distinct type from integer. This is all happening in C++.

theroggy commented 3 months ago

std::monostate or std::optional?

theroggy commented 3 months ago

My example didn't quite show the behaviour with a tiff with scale and offset specified, so I updated the code sample above.

TLDR: the scale and offset are applied to the nodata value returned.

dbaston commented 3 months ago

Fix for both issues available on test PyPI: https://test.pypi.org/project/exactextract/0.2.0.dev241/