[General question] Figuring out the correct index type for `gather` operations

This is not an issue but rather a query for advice. I actually couldn't figure whether it would me more relevant to post this here or on the Mitsuba issue tracker.

Background: the general goal consists in adding spectral interpolation to Mitsuba's gridvolume plugin. I'm writing an 4-linear interpolant which uses data loaded into a linear buffer. The first 3 dimensions are space dimensions, the last is the spectral dimension. I want to vectorize my interpolation code.

I came up with a flawed implementation which has two (I think related) huge issues:

the index type I could create to query the buffer and store its values to Spectrum elements has the wrong shape and can't be used to perform a gather operation without being transposed;
if I'm not mistaken, the transpose code I wrote creates temporary dynamic arrays, which is terrible.

A consequence of this is that this code doesn't work if I use Packet<float, 2> as my basic Float type: that's also terrible.

Question: Could you hint me at what I'm doing wrong and how I should actually do?

The following MWE provides easily checkable results. I can upload it as a GitHub repository with a working CMake setup if it's more convenient.

#include <iostream>
#include <fstream>
#include <enoki/array.h>
#include <enoki/dynamic.h>

using namespace enoki;

// Basic types
using Float = float;
// using Float = Packet<float, 2>;  // Broken a.t.m
using Spectrum = Array<Float, 2>;

using ScalarFloat = scalar_t<Float>;
using Int32 = int32_array_t<Float>;
using Point3i = Array<Int32, 3>;
using Point3f = Array<Float, 3>;
using ScalarVector3f = Array<ScalarFloat, 3>;

template <
    typename Value,
    typename T = std::conditional_t<
        is_static_array_v<Value>,
        value_t<Value>,
        Value>>
using DynamicBuffer = std::conditional_t<
    is_dynamic_array_v<T>,
    T,
    DynamicArray<Packet<scalar_t<T>>>>;

// Our main testing routine
int main()
{
    // Create data (simulate load from hard drive)
    std::cout << "-- Data generation --" << std::endl;
    size_t nx = 3, ny = 3, nz = 3, channel_count = 12;
    size_t size_data = nx * ny * nz * channel_count;
    auto raw_data = std::unique_ptr<ScalarFloat[]>(new ScalarFloat[size_data]);

    {
        size_t p = 0;
        for (size_t i = 0; i < nx; ++i)
            for (size_t j = 0; j < ny; ++j)
                for (size_t k = 0; k < nz; ++k)
                    for (size_t l = 0; l < channel_count; ++l)
                    {
                        raw_data[p] = i + j + k + l;
                        ++p;
                    }
    }

    auto data = DynamicBuffer<Float>::copy(raw_data.get(), size_data);
    std::cout << "data = " << data << std::endl;
    std::cout << "packets(data) = " << packets(data) << std::endl;

    // Our data is loaded: let's perform a 4-linear interpolation
    std::cout << std::endl;

    // Interpolation point definition
    // We assume:
    // - x in [0, nx-1]
    // - y in [0, ny-1]
    // - z in [0, nz-1]
    // - wavelength in [0, channel_count-1]
    std::cout << "-- Interpolation point definition --" << std::endl;
    Point3f p(0.8f, 1.0f, 1.4f);
    std::cout << "p = " << p << std::endl;
    Spectrum wavelengths(5.2, 1.4);
    std::cout << "wavelengths = " << wavelengths << std::endl;

    std::cout << std::endl;

    // Lookup neighbouring data points
    /// Spatial component
    std::cout << "-- Nearest neighbour lookup --" << std::endl;
    Point3i p_i = floor2int<Point3i>(p);
    std::cout << "p_i = " << p_i << std::endl;
    /// Spectral component
    using SpectrumIndex = uint32_array_t<Spectrum>;
    SpectrumIndex wavelengths_i = floor2int<SpectrumIndex>(wavelengths);
    std::cout << "wavelengths_i = " << wavelengths_i << std::endl;

    std::cout << std::endl;

    // Retrieve data points from storage
    std::cout << "-- Data point retrieval --" << std::endl;
    using Int16 = Array<Int32, 16>;
    using Int316 = Array<Int16, 3>;

    /// Compute per-axis index values for spatial component
    Int316 index_i_space(
        Int16(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1) + p_i.x(),
        Int16(0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1) + p_i.y(),
        Int16(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1) + p_i.z());
    std::cout << "index_i_space = " << index_i_space << std::endl;

    /// Compute per-axis index values for spectral component
    using IntN16 = Array<Int16, Spectrum::Size>;
    IntN16 index_i_spectrum =
        IntN16(Int16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)) +
        wavelengths_i;
    std::cout << "index_i_spectrum = " << index_i_spectrum << std::endl;

    /// Compute linear index (in storage) for each spectral component
    IntN16 index =
        fmadd(
            fmadd(
                fmadd(index_i_space[0], ny, index_i_space[1]),
                nx,
                index_i_space[2]),
            channel_count,
            index_i_spectrum);
    std::cout << "index = " << index << std::endl;
    // Problem: wrong array shape, we need to swap dimensions

    Array<SpectrumIndex, 16> index_transpose;
    for (size_t i = 0; i < 16; ++i)
        // Terrible: slice allocates dynamically! Also doesn't work with a packet Float type
        index_transpose[i] = gather<SpectrumIndex>(slice(index, i), 0);
    std::cout << "index_transpose = " << index_transpose << std::endl;

    std::cout << std::endl;

    // We now apply the 4-linear interpolation formula
    std::cout << "-- Interpolation --" << std::endl;
    Spectrum
        d0000 = gather<Spectrum>(data, index_transpose[0]),
        d0100 = gather<Spectrum>(data, index_transpose[1]),
        d0010 = gather<Spectrum>(data, index_transpose[2]),
        d0110 = gather<Spectrum>(data, index_transpose[3]),
        d0001 = gather<Spectrum>(data, index_transpose[4]),
        d0101 = gather<Spectrum>(data, index_transpose[5]),
        d0011 = gather<Spectrum>(data, index_transpose[6]),
        d0111 = gather<Spectrum>(data, index_transpose[7]),
        d1000 = gather<Spectrum>(data, index_transpose[8]),
        d1100 = gather<Spectrum>(data, index_transpose[9]),
        d1010 = gather<Spectrum>(data, index_transpose[10]),
        d1110 = gather<Spectrum>(data, index_transpose[11]),
        d1001 = gather<Spectrum>(data, index_transpose[12]),
        d1101 = gather<Spectrum>(data, index_transpose[13]),
        d1011 = gather<Spectrum>(data, index_transpose[14]),
        d1111 = gather<Spectrum>(data, index_transpose[15]);

    Point3f w_space_1 = p - Point3f(p_i),
            w_space_0 = 1.f - w_space_1;
    Spectrum w_spectrum_1 = wavelengths - Spectrum(wavelengths_i),
             w_spectrum_0 = 1.f - w_spectrum_1;

    Spectrum d000 = fmadd(w_space_0.x(), d0000, w_space_1.x() * d1000),
             d001 = fmadd(w_space_0.x(), d0001, w_space_1.x() * d1001),
             d010 = fmadd(w_space_0.x(), d0010, w_space_1.x() * d1010),
             d011 = fmadd(w_space_0.x(), d0011, w_space_1.x() * d1011),
             d100 = fmadd(w_space_0.x(), d0100, w_space_1.x() * d1100),
             d101 = fmadd(w_space_0.x(), d0101, w_space_1.x() * d1101),
             d110 = fmadd(w_space_0.x(), d0110, w_space_1.x() * d1110),
             d111 = fmadd(w_space_0.x(), d0111, w_space_1.x() * d1111);
    Spectrum d00 = fmadd(w_space_0.y(), d000, w_space_1.y() * d100),
             d01 = fmadd(w_space_0.y(), d001, w_space_1.y() * d101),
             d10 = fmadd(w_space_0.y(), d010, w_space_1.y() * d110),
             d11 = fmadd(w_space_0.y(), d011, w_space_1.y() * d111);
    Spectrum d0 = fmadd(w_space_0.z(), d00, w_space_1.z() * d10),
             d1 = fmadd(w_space_0.z(), d01, w_space_1.z() * d11);
    Spectrum result = fmadd(w_spectrum_0, d0, w_spectrum_1 * d1);

    std::cout << "result = " << result << std::endl;
}

It turns out that the index type I had chosen for the spectral part was wrong. I fixed it as follows:

#include <iostream>
#include <fstream>
#include <enoki/array.h>
#include <enoki/dynamic.h>

using namespace enoki;

// Basic types
// using Float = float;
using Float = Packet<float, 2>;
using Spectrum = Array<Float, 2>;

using ScalarFloat = scalar_t<Float>;
using Int32 = int32_array_t<Float>;
using Point3i = Array<Int32, 3>;
using Point3f = Array<Float, 3>;
using ScalarVector3f = Array<ScalarFloat, 3>;

template <
    typename Value,
    typename T = std::conditional_t<
        is_static_array_v<Value>,
        value_t<Value>,
        Value>>
using DynamicBuffer = std::conditional_t<
    is_dynamic_array_v<T>,
    T,
    DynamicArray<Packet<scalar_t<T>>>>;

// Our main testing routine
int main()
{
    // Create data (simulate load from hard drive)
    std::cout << "-- Data generation --" << std::endl;
    size_t nx = 3, ny = 3, nz = 3, channel_count = 12;
    size_t size_data = nx * ny * nz * channel_count;
    auto raw_data = std::unique_ptr<ScalarFloat[]>(new ScalarFloat[size_data]);

    {
        size_t p = 0;
        for (size_t i = 0; i < nx; ++i)
            for (size_t j = 0; j < ny; ++j)
                for (size_t k = 0; k < nz; ++k)
                    for (size_t l = 0; l < channel_count; ++l)
                    {
                        raw_data[p] = i + j + k + l;
                        ++p;
                    }
    }

    auto data = DynamicBuffer<Float>::copy(raw_data.get(), size_data);
    std::cout << "data = " << data << std::endl;
    std::cout << "packets(data) = " << packets(data) << std::endl;

    // Our data is loaded: let's perform a 4-linear interpolation
    std::cout << std::endl;

    // Interpolation point definition
    // We assume:
    // - x in [0, nx-1]
    // - y in [0, ny-1]
    // - z in [0, nz-1]
    // - wavelength in [0, channel_count-1]
    std::cout << "-- Interpolation point definition --" << std::endl;
    Point3f p(0.8f, 1.0f, 1.4f);
    std::cout << "p = " << p << std::endl;
    Spectrum wavelengths(5.2, 1.4);
    std::cout << "wavelengths = " << wavelengths << std::endl;

    std::cout << std::endl;

    // Lookup neighbouring data points
    /// Spatial component
    std::cout << "-- Nearest neighbour lookup --" << std::endl;
    Point3i p_i = floor2int<Point3i>(p);
    std::cout << "p_i = " << p_i << std::endl;
    /// Spectral component
    using SpectrumIndex = uint32_array_t<Spectrum>;
    SpectrumIndex wavelengths_i = floor2int<SpectrumIndex>(wavelengths);
    std::cout << "wavelengths_i = " << wavelengths_i << std::endl;

    std::cout << std::endl;

    // Retrieve data points from storage
    std::cout << "-- Data point retrieval --" << std::endl;
    using Int16 = Array<SpectrumIndex, 16>;
    using Int316 = Array<Int16, 3>;

    /// Compute per-axis index values for spatial component
    Int316 index_i_space(
        Int16(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1) + p_i.x(),
        Int16(0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1) + p_i.y(),
        Int16(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1) + p_i.z());
    std::cout << "index_i_space = " << index_i_space << std::endl;

    /// Compute per-axis index values for spectral component
    Int16 index_i_spectrum =
        Int16(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) +
        wavelengths_i;
    std::cout << "index_i_spectrum = " << index_i_spectrum << std::endl;

    /// Compute linear index (in storage) for each spectral component
    Int16 index = fmadd(
        fmadd(
            fmadd(index_i_space[0], ny, index_i_space[1]), 
            nx, 
            index_i_space[2]
        ), 
        channel_count, 
        index_i_spectrum
    );
    std::cout << "index = " << index << std::endl;

    std::cout << std::endl;

    // We now apply the 4-linear interpolation formula
    std::cout << "-- Interpolation --" << std::endl;
    Spectrum
        d0000 = gather<Spectrum>(data, index[0]),
        d0100 = gather<Spectrum>(data, index[1]),
        d0010 = gather<Spectrum>(data, index[2]),
        d0110 = gather<Spectrum>(data, index[3]),
        d0001 = gather<Spectrum>(data, index[4]),
        d0101 = gather<Spectrum>(data, index[5]),
        d0011 = gather<Spectrum>(data, index[6]),
        d0111 = gather<Spectrum>(data, index[7]),
        d1000 = gather<Spectrum>(data, index[8]),
        d1100 = gather<Spectrum>(data, index[9]),
        d1010 = gather<Spectrum>(data, index[10]),
        d1110 = gather<Spectrum>(data, index[11]),
        d1001 = gather<Spectrum>(data, index[12]),
        d1101 = gather<Spectrum>(data, index[13]),
        d1011 = gather<Spectrum>(data, index[14]),
        d1111 = gather<Spectrum>(data, index[15]);

    Point3f w_space_1 = p - Point3f(p_i),
            w_space_0 = 1.f - w_space_1;
    Spectrum w_spectrum_1 = wavelengths - Spectrum(wavelengths_i),
             w_spectrum_0 = 1.f - w_spectrum_1;

    Spectrum d000 = fmadd(w_space_0.x(), d0000, w_space_1.x() * d1000),
             d001 = fmadd(w_space_0.x(), d0001, w_space_1.x() * d1001),
             d010 = fmadd(w_space_0.x(), d0010, w_space_1.x() * d1010),
             d011 = fmadd(w_space_0.x(), d0011, w_space_1.x() * d1011),
             d100 = fmadd(w_space_0.x(), d0100, w_space_1.x() * d1100),
             d101 = fmadd(w_space_0.x(), d0101, w_space_1.x() * d1101),
             d110 = fmadd(w_space_0.x(), d0110, w_space_1.x() * d1110),
             d111 = fmadd(w_space_0.x(), d0111, w_space_1.x() * d1111);
    Spectrum d00 = fmadd(w_space_0.y(), d000, w_space_1.y() * d100),
             d01 = fmadd(w_space_0.y(), d001, w_space_1.y() * d101),
             d10 = fmadd(w_space_0.y(), d010, w_space_1.y() * d110),
             d11 = fmadd(w_space_0.y(), d011, w_space_1.y() * d111);
    Spectrum d0 = fmadd(w_space_0.z(), d00, w_space_1.z() * d10),
             d1 = fmadd(w_space_0.z(), d01, w_space_1.z() * d11);
    Spectrum result = fmadd(w_spectrum_0, d0, w_spectrum_1 * d1);

    std::cout << "result = " << result << std::endl;
}

Works like a charm with packet Float types! I'll close this issue.

mitsuba-renderer / enoki

[General question] Figuring out the correct index type for `gather` operations #107