error X3504: literal loop terminated early due to out of bounds array access

TobTobXX commented 2 years ago

Hi there, I wrote this wgpu application on Linux (Wayland) and it worked just fine. Now that someone wanted to compile it on Windows, it doesn't work:

noisy log output

``` 2022-04-27T15:20:32.196Z WARN [wgpu_hal::dx12::instance] Unable to enable D3D12 debug interface: 0x887A002D 2022-04-27T15:20:32.197Z WARN [wgpu_hal::dx12::instance] Unable to enable DXGI debug interface: 0x887A002D 2022-04-27T15:21:29.310Z WARN [wgpu_hal::dx12::device] Naga generated shader for "fs_main" at Fragment: struct NagaConstants { int base_vertex; int base_instance; uint other; }; ConstantBuffer _NagaConstants: register(b2); static const int PATHS_BUF_SIZE = 200; struct CameraUniform { row_major float4x4 view_proj; float zoom; }; struct Path { uint kind; float2 p0_; float2 p1_; float2 p2_; float2 p3_; }; struct Paths { Path paths[200]; }; struct Bezier { float2 a; float2 b; float2 c; float2 d; }; struct Poly5_ { float c5_; float c4_; float c3_; float c2_; float c1_; float c0_; }; struct VertexInput { linear float2 pos : LOC0; linear float2 tex_coord : LOC1; }; struct VertexOutput { float4 projected : SV_Position; linear float2 tex_coord : LOC0; }; cbuffer camera : register(b0) { CameraUniform camera; } cbuffer paths : register(b1) { Paths paths; } struct VertexOutput_vs_main { float2 tex_coord : LOC0; float4 projected : SV_Position; }; struct FragmentInput_fs_main { float2 tex_coord_1 : LOC0; float4 projected_1 : SV_Position; }; float2 cubic_bezier(Bezier b, float t) { return ((((b.a * pow(t, 3.0)) + (b.b * pow(t, 2.0))) + (b.c * t)) + b.d); } float4 poly5_mult(Poly5_ p, float4 x) { return ((((((p.c5_ * pow(x, float4(5.0.xxxx))) + (p.c4_ * pow(x, float4(4.0.xxxx)))) + (p.c3_ * pow(x, float4(3.0.xxxx)))) + (p.c2_ * pow(x, float4(2.0.xxxx)))) + (p.c1_ * x)) + float4(p.c0_.xxxx)); } float4 poly5_d1_mult(Poly5_ p_1, float4 x_1) { return ((((((5.0 * p_1.c5_) * pow(x_1, float4(4.0.xxxx))) + ((4.0 * p_1.c4_) * pow(x_1, float4(3.0.xxxx)))) + ((3.0 * p_1.c3_) * pow(x_1, float4(2.0.xxxx)))) + ((2.0 * p_1.c2_) * x_1)) + float4(p_1.c1_.xxxx)); } float4 find_roots4_newton(Poly5_ poly) { float4 approx = (float4)0; int i_1 = 0; approx = float4(0.0, 
0.33000001311302185, 0.6600000262260437, 1.0); bool loop_init = true; while(true) { if (!loop_init) { int _expr15 = i_1; i_1 = (_expr15 + 1); } loop_init = false; int _expr13 = i_1; if ((_expr13 < 3)) { } else { break; } float4 _expr18 = approx; float4 _expr19 = approx; const float4 _e20 = poly5_mult(poly, _expr19); float4 _expr21 = approx; const float4 _e22 = poly5_d1_mult(poly, _expr21); approx = (_expr18 - (_e20 / _e22)); } float4 _expr25 = approx; return clamp(_expr25, float4(0.0.xxxx), float4(1.0.xxxx)); } Bezier ConstructBezier(float2 arg0, float2 arg1, float2 arg2, float2 arg3) { Bezier ret; ret.a = arg0; ret.b = arg1; ret.c = arg2; ret.d = arg3; return ret; } Poly5_ ConstructPoly5_(float arg0, float arg1, float arg2, float arg3, float arg4, float arg5) { Poly5_ ret; ret.c5_ = arg0; ret.c4_ = arg1; ret.c3_ = arg2; ret.c2_ = arg3; ret.c1_ = arg4; ret.c0_ = arg5; return ret; } float cubic_bezier_sd(float2 p0_, float2 p1_, float2 p2_, float2 p3_, float2 p_2, float R) { float dist_1 = (float)0; int i_2 = 1; float2 a_1 = ((((-1.0 * p0_) + (3.0 * p1_)) - (3.0 * p2_)) + (1.0 * p3_)); float2 b_2 = (((3.0 * p0_) - (6.0 * p1_)) + (3.0 * p2_)); float2 c = ((-3.0 * p0_) + (3.0 * p1_)); float2 d = (1.0 * p0_); Bezier curve = ConstructBezier(a_1, b_2, c, d); float2 v_c5_ = ((-3.0 * a_1) * a_1); float c5_ = (v_c5_.x + v_c5_.y); float2 v_c4_ = ((-5.0 * a_1) * b_2); float c4_ = (v_c4_.x + v_c4_.y); float2 v_c3_ = (((-4.0 * a_1) * c) - ((2.0 * b_2) * b_2)); float c3_ = (v_c3_.x + v_c3_.y); float2 v_c2_ = ((((-3.0 * a_1) * d) - ((3.0 * b_2) * c)) + ((3.0 * a_1) * p_2)); float c2_ = (v_c2_.x + v_c2_.y); float2 v_c1_ = ((((-2.0 * b_2) * d) - (c * c)) + ((2.0 * b_2) * p_2)); float c1_ = (v_c1_.x + v_c1_.y); float2 v_c0_ = ((-c * d) + (c * p_2)); float c0_ = (v_c0_.x + v_c0_.y); Poly5_ poly_1 = ConstructPoly5_(c5_, c4_, c3_, c2_, c1_, c0_); const float4 _e94 = find_roots4_newton(poly_1); const float2 _e97 = cubic_bezier(curve, _e94.x); dist_1 = distance(_e97, p_2); bool 
loop_init_1 = true; while(true) { if (!loop_init_1) { int _expr105 = i_2; i_2 = (_expr105 + 1); } loop_init_1 = false; int _expr102 = i_2; if ((_expr102 < 5)) { } else { break; } int _expr108 = i_2; const float2 _e110 = cubic_bezier(curve, _e94[_expr108]); float this_dist = distance(_e110, p_2); float _expr112 = dist_1; dist_1 = min(_expr112, this_dist); } float _expr114 = dist_1; return (_expr114 - R); } float line_sd(float2 a, float2 b_1, float2 p_3, float R_1) { float2 r = (b_1 - a); float t_1 = clamp((dot(r, (p_3 - a)) / length(r)), 0.0, length(r)); float2 n = ((t_1 * normalize(r)) + a); return (distance(n, p_3) - R_1); } VertexOutput_vs_main vs_main(VertexInput v_in) { VertexOutput v_out = (VertexOutput)0; float4x4 _expr7 = camera.view_proj; v_out.projected = mul(float4(v_in.pos, 0.0, 1.0), _expr7); v_out.tex_coord = v_in.tex_coord; VertexOutput _expr15 = v_out; const VertexOutput vertexoutput = _expr15; const VertexOutput_vs_main vertexoutput_1 = { vertexoutput.tex_coord, vertexoutput.projected }; return vertexoutput_1; } float4 fs_main(FragmentInput_fs_main fragmentinput_fs_main) : SV_Target0 { VertexOutput f_in = { fragmentinput_fs_main.projected_1, fragmentinput_fs_main.tex_coord_1 }; float dist = 1000000.0; int i = 0; bool loop_init_2 = true; while(true) { if (!loop_init_2) { int _expr11 = i; i = (_expr11 + 1); } loop_init_2 = false; int _expr9 = i; if ((_expr9 < PATHS_BUF_SIZE)) { } else { break; } int _expr15 = i; Path path = paths.paths[_expr15]; switch(path.kind) { case 1u: { const float _e22 = line_sd(path.p0_, path.p1_, f_in.tex_coord, 3.0); float _expr23 = dist; dist = min(_expr23, _e22); break; } case 2u: { float2 p0_1 = path.p0_; float2 p1_1 = (path.p0_ + ((2.0 / 3.0) * (path.p1_ - path.p0_))); float2 p2_1 = (path.p2_ + ((2.0 / 3.0) * (path.p1_ - path.p2_))); float2 p3_1 = path.p2_; const float _e46 = cubic_bezier_sd(p0_1, p1_1, p2_1, p3_1, f_in.tex_coord, 3.0); float _expr47 = dist; dist = min(_expr47, _e46); break; } case 3u: { const float _e54 
= cubic_bezier_sd(path.p0_, path.p1_, path.p2_, path.p3_, f_in.tex_coord, 3.0); float _expr55 = dist; dist = min(_expr55, _e54); break; } default: { break; } } } float _expr57 = dist; float _expr59 = camera.zoom; float adj_distance = (_expr57 / _expr59); return float4(float3(adj_distance.xxx), 1.0); } 2022-04-27T15:21:29.310Z WARN [wgpu::backend::direct] Shader translation error for stage FRAGMENT: D3DCompile error (0x80004005): C:\Users\Tobias\Downloads\font-renderer-master.tar\font-renderer-master\Shader(168,50-63): error X3504: literal loop terminated early due to out of bounds array access ``` (note that this is not the stderr output, since Windows shell SUUUCKS to copy-paste from. But stderr doesn't have more infos.)

Here's the important part (I think, line breaks and indentation mine):

2022-04-27T15:21:29.310Z WARN  [wgpu::backend::direct] Shader translation error for stage FRAGMENT:
  D3DCompile error (0x80004005):
    C:\Users\Tobias\Downloads\font-renderer-master.tar\font-renderer-master\Shader(168,50-63):
      error X3504: literal loop terminated early due to out of bounds array access

This is the shader I used: shader.wgsl (pls don't laugh. I know these algorithms can be done better, but I had to do it without copy-pasting.)

And this is a link to the project source. It's just a standard `cargo run`, in case anyone wants to try: sources.tar.gz.

Is there a way to work around this?

cwfitzgerald commented 2 years ago

Thanks for filing!

This is ultimately an issue on our end, as this is valid wgsl, but the HLSL compiler isn't happy with the output.

That being said, the loop on line 209 indexes the vector out of bounds: it indexes it at 1, 2, 3, and 4, and 4 isn't an in-bounds index into a vec4. If you were to keep those in bounds, it should compile.

TobTobXX commented 2 years ago

Yes, it compiles now. Sorry for the inconvenience. I didn't catch it because I guess Vulkan just ignores it...

(at least now someone searching for it should find this ;) )

jimblandy commented 2 years ago

Could we get a reduced version of the test case here?

Here's the loop @cwfitzgerald called out at line 209:

for (var i = 1; i < 5; i=i+1) {
    let this_dist = distance(cubic_bezier(curve, roots_n[i]), p);
    // And accept any smaller distance
    dist = min(dist, this_dist);
}

This could be a problem for us. roots_n[i] is a dynamic access, and it's apparently only because the HLSL compiler is unrolling the loop that it recognizes that this expression will definitely perform an out-of-bounds access. We don't want to have to do this level of analysis in Naga.

If that's what's going on, then Naga may just have to accept that HLSL will reject shaders that we pass, because it analyzes them more thoroughly.

jimblandy commented 2 years ago

@TobTobXX Would you be able to put together a reduced version of the test case, based on the loop at line 209?

cwfitzgerald commented 2 years ago

I think a simple example would be an explicit vector_4[4]. I think this might be a limitation in DXBC or FXC where it doesn't allow non-constant indexing into vectors.

jimblandy commented 2 years ago

> I think a simple example would be an explicit vector_4[4]. I think this might be a limitation in DXBC or FXC where it doesn't allow non-constant indexing into vectors.

We already detect this:

fn f(v: vec4<f32>) -> f32 {
   return v[4];
}

That elicits:

error: Index 4 is out of bounds for expression [1]

Could not parse WGSL

jimblandy commented 2 years ago

But, for example, Naga validates this and I suspect the HLSL compiler will reject it:

fn f(v: vec4<f32>) -> f32 {
   var sum: f32 = 0.0;
   for (var i = 1; i <= 4; i = i + 1) {
       sum = sum + v[i];
   }
   return sum;
}

TobTobXX commented 2 years ago

Yes, as @jimblandy has guessed, this shader does work on Linux, but doesn't on Windows (tested):

let foo = vec4<f32>(0.1, 0.2, 0.3, -0.6);

[[stage(vertex)]]
fn vs_main(
    [[builtin(vertex_index)]] in_vertex_index: u32,
) -> [[builtin(position)]] vec4<f32> {
    let x = f32(1 - i32(in_vertex_index)) * 0.5;
    let y = f32(i32(in_vertex_index & 1u) * 2 - 1) * 0.5;
    return vec4<f32>(x, y, 0.0, 1.0);
}

[[stage(fragment)]]
fn fs_main() -> [[location(0)]] vec4<f32> {
    var sum = 0.0;
    // Note the off-by-one error
    for (var i = 0; i < 5; i=i+1) {
        sum = sum + foo[i];
    }
    return vec4<f32>(1.0, sum, 1.0, 1.0);
}

jimblandy commented 2 years ago

Unless @kvark has some clever way out for us, I think that leaves us at:

> If that's what's going on, then Naga may just have to accept that HLSL will reject shaders that we pass, because it analyzes them more thoroughly.

teoxoy commented 2 years ago

This only seems to be an issue with FXC; DXC seems to compile it just fine (see https://shader-playground.timjones.io/70cb272b067ff17ae1ab1dc6ee12a513 ). I also found this related issue: https://github.com/microsoft/DirectXShaderCompiler/issues/1879#issuecomment-463415562

teoxoy commented 2 years ago

I think a simple example would be an explicit vector_4[4]. I think this might be a limitation in DXBC or FXC where it doesn't allow non-constant indexing into vectors.

We already detect this:
fn f(v: vec4<f32>) -> f32 {
   return v[4];
}
That elicits:
error: Index 4 is out of bounds for expression [1]

Could not parse WGSL

@jimblandy according to the WGSL spec this should work though.

Implementing the OOB behavior above would fix the issue.

jimblandy commented 2 years ago

> Implementing the OOB behavior above would fix the issue.

The OOB behavior is already implemented, actually. We're just being over-eager about detecting errors at compile time.

teoxoy commented 2 years ago

So, solving both gfx-rs/wgpu#4389 and gfx-rs/wgpu#4390 should hopefully fix this issue.

gfx-rs / wgpu

error X3504: literal loop terminated early due to out of bounds array access #4381