gfx-rs / wgpu

A cross-platform, safe, pure-Rust graphics API.
https://wgpu.rs
Apache License 2.0
12.49k stars 914 forks source link

Possibly incorrect WGSL->GLSL transformation: `textureNumLayers -> textureSize` #6435

Open cdata opened 6 days ago

cdata commented 6 days ago

Description I make use of wgpu transitively via bevy.

When I build my bevy app for WebGL2, my use of textureNumLayers is transformed to a use textureSize that does not seem to be compatible with GLSL 3.00.

The usage in the source looks like this:

const MAX_MAP_LAYERS: u32 = 4;

@group(2) @binding(4) var map_texture: texture_2d_array<u32>;

fn sample_map(source_xy: vec2<u32>, length: vec2<f32>, layer_mask: u32, palette: vec3<u32>, uv: vec2<f32>) -> vec4<f32> {
    let map_layers: u32 = min(textureNumLayers(map_texture), MAX_MAP_LAYERS);
    // ...
}

And the resulting GLSL looks like this:

vec4 sample_map(uvec2 source_xy_1, vec2 length_1, uint layer_mask, uvec3 palette_3, vec2 uv_2) {
    // ...
    uint map_layers = min(uint(textureSize(_group_2_binding_4_fs).z), MAX_MAP_LAYERS);
    // ...
}

Note that the resulting call to textureSize is only passed one argument. This produces an error that there is no matching overload for textureSize. Upon consulting the spec, textureSize does indeed seem to expect a second positional argument for the mip level:

highp ivec2 textureSize (gsampler2D sampler, int lod)
highp ivec3 textureSize (gsampler3D sampler, int lod)
highp ivec2 textureSize (gsamplerCube sampler, int lod)

highp ivec2 textureSize (sampler2DShadow sampler, int lod)
highp ivec2 textureSize (samplerCubeShadow sampler, int lod)
highp ivec3 textureSize (gsampler2DArray sampler, int lod)
highp ivec3 textureSize (sampler2DArrayShadow sampler, int lod)

However, textureNumLayers does not accept a similar second argument (https://www.w3.org/TR/WGSL/#texturenumlayers).

Repro steps

  1. Use textureNumLayers in a WGSL shader

Expected vs observed behavior The resulting GLSL is compatible with WebGL2 in web browsers, however it is not.

Extra materials Here is the full transformed output of the offending fragment shader (captured via Chrome dev tools): https://gist.github.com/cdata/22a420fee91854366e13f01983e04dfd

Platform Chrome 129, Linux / Wayland bevy 0.14.2, wgpu 0.20.1

teoxoy commented 5 days ago

That's odd since it looks like the code should write the lod as well.

https://github.com/gfx-rs/wgpu/blob/36fab5ce3dead02f191509980e38dc6792a77e5a/naga/src/back/glsl/mod.rs#L3095

We even have snapshot tests for this (naga\tests\in\image.wgsl -> naga\tests\out\glsl\image.queries.Vertex.glsl).

cdata commented 5 days ago

@teoxoy thanks for taking a look and for referencing that test!

It may be relevant context: in the shader, I sample a texture within a non-uniform condition (which I ran into when getting it to work under WebGPU). I'm not sure if this kind of conditional logic could change the generated GLSL in a meaningful way, but it may be worth noting.

Here is the WGSL (pre-naga-oil), in case it is helpful.

#import bevy_pbr::mesh_functions::{get_world_from_local, mesh_position_local_to_clip}

struct Vertex {
    @location(0) position: vec3<f32>,
    @location(1) normal: vec3<f32>,
    @location(2) uv: vec2<f32>,

    @location(3) operation: u32,
    @location(4) palette: vec3<u32>,
    @location(5) atlas_index: u32,
    @location(6) source_xy: vec2<u32>,
    @location(7) screen_xy: vec2<i32>,
    @location(8) length: vec2<u32>,
    @location(9) layer_mask: u32,
};

struct VertexOutput {
    @builtin(position) clip_position: vec4<f32>,
    @location(0) uv: vec2<f32>,
    @location(1) operation: u32,
    @location(2) palette: vec3<u32>,
    @location(3) atlas_index: u32,
    @location(4) source_xy: vec2<u32>,
    @location(5) length: vec2<f32>,
    @location(6) layer_mask: u32
};

@vertex
fn vertex(vertex: Vertex) -> VertexOutput {
    let w: f32 = f32(vertex.length.x);
    let h: f32 = f32(vertex.length.y);

    let screen_space_offset = vec3<f32>(0.0, 240., 0.0);
    let scaled_size = (vertex.position + vec3<f32>(0.5, -0.5, 0.0)) * vec3<f32>(w, h, 1.);
    let draw_offset = vec3<f32>(f32(vertex.screen_xy.x), -1. * f32(vertex.screen_xy.y), 0.);

    let position = screen_space_offset + scaled_size + draw_offset;

    var out: VertexOutput;

    out.clip_position = mesh_position_local_to_clip(
        get_world_from_local(0u),
        vec4<f32>(position, 1.0)
    );
    out.uv = vertex.uv;
    out.operation = vertex.operation;
    out.atlas_index = vertex.atlas_index;
    out.palette = vertex.palette;
    out.source_xy = vertex.source_xy;
    out.length = vec2(w, h);
    out.layer_mask = vertex.layer_mask;

    return out;
}

// TODO: Atlas binding should be a non-sampled textre_2d<u32>
@group(2) @binding(0) var atlas_texture: texture_2d<f32>;
@group(2) @binding(1) var atlas_texture_sampler: sampler;

@group(2) @binding(2) var palette_texture: texture_2d<f32>;
@group(2) @binding(3) var palette_texture_sampler: sampler;

@group(2) @binding(4) var map_texture: texture_2d_array<u32>;

const CHARACTER_PIXEL_SIZE: vec2<f32> = vec2(16.0);
const ATLAS_CHARACTER_SIZE: vec2<f32> = vec2(16.0);
const PALETTE_SIZE: u32 = 16;

const ATLAS_PIXEL_SIZE: vec2<f32> = CHARACTER_PIXEL_SIZE * ATLAS_CHARACTER_SIZE;
const PALETTE_PIXEL_SIZE: vec2<f32> = vec2(f32(PALETTE_SIZE));
const MAX_MAP_LAYERS: u32 = 4;

const SPRITE_OPERATION: u32 = 0;
const MAP_OPERATION: u32 = 1;

const PALETTE_LOCK_MASK: u32 = 65535;
const LOCK_COUNTER_MASK: u32 = 32768;

fn sample_palette(index: u32, palette: vec3<u32>) -> vec4<f32> {
    let palette_offset = palette.y;
    let palette_lock = palette.z;
    var palette_index = index % PALETTE_SIZE;

    // We only need to do offset math if the color is "free" (not locked)
    let is_free = ((palette_lock >> palette_index) & 1) == 0;

    if (is_free) {
        if (palette_lock > 0) {
            // At least one other color is locked.

            // 8 bytes (64 bits) to store "free" indices in order, noting
            // Indices are in the range [0,15] and therefor
            // can be represented by 4 bits each
            var free_indices_0: u32 = 0;
            var free_indices_1: u32 = 0;

            // NOTE: Tried to be clever with a pointer, but there are usage
            // restrictions imposed by Naga's IR that seem to apply but that
            // I don't fully understand.
            // SEE: https://github.com/gfx-rs/wgpu/issues/4451
            // var free_indices = &free_indices_0;

            var free_bits: u32 = 0;
            var lock_counter = palette_lock;

            for (var i: u32 = 0; i < PALETTE_SIZE; i += 1u) {
                if ((lock_counter & 1) == 0) {
                    // This bit is "free"
                    let buffer_offset: u32 = free_bits - free_bits / 8 * 8;
                    let value = u32(i) << (4 * buffer_offset);
                    // OR the index into the buffer at the appropriate offset
                    if (buffer_offset < free_bits) {
                        free_indices_1 |= value;
                    } else {
                        free_indices_0 |= value;
                    }
                    free_bits += 1u;
                } else if (i < index) {
                    // This bit is "locked"
                    // Decrement the starting index to account for proceding
                    // "locked" indices
                    palette_index -= 1u;
                }

                // Shift the lock counter to the next bit
                lock_counter >>= 1u;
            }

            // The total set of slots that a color may shift through is
            // equal to the free bit count, so modulo that to get the
            // real offset.
            palette_index = (palette_index + palette_offset) % free_bits;

            var free_indices = free_indices_0;

            if (palette_index > 7u) {
                free_indices = free_indices_1;
                palette_index -= 8u;
            }

            palette_index = (free_indices >> ((palette_index) * 4)) & (PALETTE_SIZE - 1);

        } else {
            // All colors are free. Modulus will suffice to determine the
            // final index.
            palette_index = (palette_index + palette_offset) % PALETTE_SIZE;
        }
    }

    let palette_uv = vec2(
        fract(f32(palette_index) / f32(PALETTE_SIZE)),
        0.0
    );

    return textureSampleLevel(palette_texture, palette_texture_sampler, palette_uv, 0.);
}

fn sample_sprite(atlas_index: u32, palette: vec3<u32>, uv: vec2<f32>) -> vec4<f32> {
    let atlas_coordinates = vec2(
        f32(atlas_index % u32(ATLAS_CHARACTER_SIZE.x)),
        f32(atlas_index / u32(ATLAS_CHARACTER_SIZE.y) % u32(ATLAS_CHARACTER_SIZE.y))
    );

    let atlas_uv = vec2(uv.x, 1. - uv.y) / 16. + (atlas_coordinates / 16.);

    // This gets us a color in the range [0, 15]
    // TODO: Use textureLoad and get a u32 directly / save upload bandwidth
    // NOTE: We cannot use a simple `textureSample` in these functions, because
    // the sample happens conditionally within the fragment and WebGPU does not
    // allow MIP-aware sampling within non-uniform-based conditions (sheesh).
    var palette_index = u32(textureSampleLevel(atlas_texture, atlas_texture_sampler, atlas_uv, 0.).r * 256.);

    return sample_palette(palette_index, palette);
}

fn sample_map_layer(layer: u32, source_xy: vec2<u32>, length: vec2<f32>, palette: vec3<u32>, uv: vec2<f32>) -> vec4<f32> {
    let map_uv = vec2(uv.x, 1. - uv.y);
    let map_size: vec2<u32> = textureDimensions(map_texture, 0).xy;
    let map_source_xy = vec2<f32>(source_xy) / CHARACTER_PIXEL_SIZE;
    let map_length = floor(length / CHARACTER_PIXEL_SIZE);
    let map_coords = vec2<u32>(map_source_xy + map_length * map_uv);

    let atlas_index = textureLoad(map_texture, map_coords, layer, 0).x;

    let tile_length: vec2<f32> = length / CHARACTER_PIXEL_SIZE;
    let tile = tile_length * map_uv + fract(map_source_xy);
    let tile_uv: vec2<f32> = fract(tile);

    return sample_sprite(atlas_index, palette, vec2(tile_uv.x, 1 - tile_uv.y));
}

fn sample_map(source_xy: vec2<u32>, length: vec2<f32>, layer_mask: u32, palette: vec3<u32>, uv: vec2<f32>) -> vec4<f32> {
    // NOTE: Currently naga desugars this incorrectly when targeting WebGL2, so
    // we hardcode the max value; this trades off some perf in circumstances where
    // a map is used that has fewer than 4 layers,
    // https://github.com/gfx-rs/wgpu/issues/6435
    // let map_layers: u32 = min(textureNumLayers(map_texture), MAX_MAP_LAYERS);
    let map_layers: u32 = MAX_MAP_LAYERS;

    var mask_includes: array<bool,4> = array(true, true, true, true);

    if (layer_mask > 0) {
        mask_includes = array(
            (layer_mask & 1) == 1,
            ((layer_mask >> 1) & 1) == 1,
            ((layer_mask >> 1) & 1) == 1,
            ((layer_mask >> 1) & 1) == 1
        );
    }

    var final_color = vec4(0.0);

    for (var i = 0u; i < map_layers; i += 1u) {
        let include_layer = mask_includes[i];
        if (!include_layer) {
            continue;
        }

        let layer_color = sample_map_layer(i, source_xy, length, palette, uv);

        if (layer_color.a > 0.0) {
            final_color = layer_color;
        }
    }

    return final_color;
}

@fragment
fn fragment(in: VertexOutput) -> @location(0) vec4<f32> {
    let operation = in.operation;
    let atlas_index = in.atlas_index;
    let uv = in.uv;
    let palette = in.palette;
    let length = in.length;
    let source_xy = in.source_xy;
    let layer_mask = in.layer_mask;

    var palette_uv = vec2<f32>(1./16. * 3., 0.0);

    if (operation == SPRITE_OPERATION) {
        return sample_sprite(atlas_index, palette, uv);
    } else if (operation == MAP_OPERATION) {
        return sample_map(source_xy, length, layer_mask, palette, uv);
    }

    return vec4(0.0);
}
teoxoy commented 4 days ago

I sample a texture within a non-uniform condition (which I ran into when getting it to work under WebGPU). I'm not sure if this kind of conditional logic could change the generated GLSL in a meaningful way.

It shouldn't.

Could you create a minimal repro and try to pass it to the naga CLI? To rule out any transforms that naga-oil might be doing.