gpuweb / cts

WebGPU Conformance Test Suite
https://gpuweb.github.io/cts/
BSD 3-Clause "New" or "Revised" License

Intel + DXC failure: webgpu:shader,execution,flow_control,loop:nested_loops:preventValueOptimizations=false #3063

Open amaiorano opened 1 year ago

amaiorano commented 1 year ago
crbug.com/tint/0000 [ intel-0x9bc5 webgpu-dxc-enabled win10 ] webgpu:shader,execution,flow_control,loop:nested_loops:preventValueOptimizations=false [ Failure ]

Tested on Intel UHD 620, and was able to reproduce when using DXC to compile:

$ ./tools/run run-cts --flag=enable-dawn-features=use_dxc --bin=./build-vs/Debug 'webgpu:shader,execution,flow_control,loop:nested_loops:preventValueOptimizations=false'
Running in parallel with server...
Testing 1 test cases...
webgpu:shader,execution,flow_control,loop:nested_loops:preventValueOptimizations=false - fail:
  - EXPECTATION FAILED: expect_order(10, 20) expected event 10, got 4
    Error
        at Object.expect_order (C:\src\dawn\third_party\webgpu-cts\out-node\webgpu\shader\execution\flow_control\harness.js:59:24)
        at C:\src\dawn\third_party\webgpu-cts\out-node\webgpu\shader\execution\flow_control\loop.spec.js:90:13
        at runFlowControlTest (C:\src\dawn\third_party\webgpu-cts\out-node\webgpu\shader\execution\flow_control\harness.js:42:31)
        at RunCaseSpecific.fn (C:\src\dawn\third_party\webgpu-cts\out-node\webgpu\shader\execution\flow_control\loop.spec.js:75:41)
        at RunCaseSpecific.runTest (C:\src\dawn\third_party\webgpu-cts\out-node\common\internal\test_group.js:255:28)
        at processTicksAndRejections (node:internal/process/task_queues:96:5)
        at async RunCaseSpecific.run (C:\src\dawn\third_party\webgpu-cts\out-node\common\internal\test_group.js:420:21)
        at async runTestcase (C:\src\dawn\third_party\webgpu-cts\out-node\common\runtime\server.js:118:9)
        at async Server.<anonymous> (C:\src\dawn\third_party\webgpu-cts\out-node\common\runtime\server.js:153:36)
    WGSL:

    struct Outputs {
      count : u32,
      data  : array<u32>,
    };
    @group(0) @binding(0) var<storage, read>       inputs  : array<i32>;
    @group(0) @binding(1) var<storage, read_write> outputs : Outputs;

    fn push_output(value : u32) {
      outputs.data[outputs.count] = value;
      outputs.count++;
    }

    @compute @workgroup_size(1)
    fn main() {
      _ = &inputs;
      _ = &outputs;

      push_output(0);
      var i = 0;
      loop {
        push_output(1);
        if i == 6 {
          push_output(2);
          break;
          push_output(3);
        }
        push_output(4);
        loop {
          i++;
          push_output(5);
          if (i % 3) == 0 {
            push_output(6);
            break;
            push_output(7);
          }
          push_output(8);
          if (i & 1) == 0 {
            push_output(9);
            continue;
            push_output(10);
          }
          push_output(11);
        }
      }
      push_output(12);

    }

    Error
        at GPUTest.eventualAsyncExpectation (C:\src\dawn\third_party\webgpu-cts\out-node\common\framework\fixture.js:169:28)
        at GPUTest.eventualExpectOK (C:\src\dawn\third_party\webgpu-cts\out-node\common\framework\fixture.js:288:14)
        at runFlowControlTest (C:\src\dawn\third_party\webgpu-cts\out-node\webgpu\shader\execution\flow_control\harness.js:131:7)
        at RunCaseSpecific.fn (C:\src\dawn\third_party\webgpu-cts\out-node\webgpu\shader\execution\flow_control\loop.spec.js:75:41)
  ⣽ [▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉] 100.0%
Completed in 721.781ms
    FAIL: 1         (100.0%)

Running with FXC (the default), the test passes:

$ ./tools/run run-cts --bin=./build-vs/Debug 'webgpu:shader,execution,flow_control,loop:nested_loops:preventValueOptimizations=false'
Running in parallel with server...
Testing 1 test cases...
  ⣽ [▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉] 100.0%
Completed in 686.6627ms
    PASS: 1         (100.0%)

Tested this on another machine with an NVIDIA GPU, and the test passes with both DXC and FXC, so this looks to be an Intel driver bug, specifically when compiling against the DXIL produced by DXC.

Using dump_shaders, here's the HLSL generated for this test:

/* Dumped generated HLSL */
ByteAddressBuffer inputs : register(t0);
RWByteAddressBuffer outputs : register(u1);

void push_output(uint value) {
  outputs.Store((4u + (4u * outputs.Load(0u))), asuint(value));
  outputs.Store(0u, asuint((outputs.Load(0u) + 1u)));
}

int tint_mod(int lhs, int rhs) {
  const int rhs_or_one = (((rhs == 0) | ((lhs == -2147483648) & (rhs == -1))) ? 1 : rhs);
  if (any(((uint((lhs | rhs_or_one)) & 2147483648u) != 0u))) {
    return (lhs - ((lhs / rhs_or_one) * rhs_or_one));
  } else {
    return (lhs % rhs_or_one);
  }
}

[numthreads(1, 1, 1)]
void main() {
  push_output(0u);
  int i = 0;
  while (true) {
    push_output(1u);
    if ((i == 6)) {
      push_output(2u);
      break;
      push_output(3u);
    }
    push_output(4u);
    while (true) {
      i = (i + 1);
      push_output(5u);
      if ((tint_mod(i, 3) == 0)) {
        push_output(6u);
        break;
        push_output(7u);
      }
      push_output(8u);
      if (((i & 1) == 0)) {
        push_output(9u);
        continue;
        push_output(10u);
      }
      push_output(11u);
    }
  }
  push_output(12u);
  return;
}

I'll post the DXIL we're sending to the GPU soon. Right now, Dawn is failing to get the DXIL when dumping shaders.

amaiorano commented 1 year ago

I've attached the DXIL disassembly for the above HLSL. cts_3063_dxil_asm.txt

amaiorano commented 1 year ago

Just to add to the analysis, here is the HLSL again:

01: [numthreads(1, 1, 1)]
02: void main() {
03:   push_output(0u);
04:   int i = 0;
05:   while (true) {
06:     push_output(1u);
07:     if ((i == 6)) {
08:       push_output(2u);
09:       break;
10:       push_output(3u);
11:     }
12:     push_output(4u);
13:     while (true) {
14:       i = (i + 1); <------------ i becomes 1, 2, 3
15:       push_output(5u);
16:       if ((tint_mod(i, 3) == 0)) { <--------- when i == 3, we should enter the branch, but we don't
17:         push_output(6u); <----------- WE NEVER REACH HERE
18:         break;
19:         push_output(7u);
20:       }
21:       push_output(8u); <----------- when i == 3, we end up here, which is incorrect, and is why the test fails
22:       if (((i & 1) == 0)) {
23:         push_output(9u);
24:         continue;
25:         push_output(10u);
26:       }
27:       push_output(11u);
28:     }
29:   }
30:   push_output(12u);
31:   return;
32: }

The test makes sure that the loop starting at line 13 executes 3 times, and that on the third iteration (when i reaches 3) it reaches line 17 and breaks at line 18.

The miscompilation is probably related to tint_mod, whose HLSL is shown here:

int tint_mod(int lhs, int rhs) {
  const int rhs_or_one = (((rhs == 0) | ((lhs == -2147483648) & (rhs == -1))) ? 1 : rhs);
  if (any(((uint((lhs | rhs_or_one)) & 2147483648u) != 0u))) {
    return (lhs - ((lhs / rhs_or_one) * rhs_or_one));
  } else {
    return (lhs % rhs_or_one);
  }
}

Tint emits this function to replace the built-in modulo operator in order to avoid divide-by-zero and integer overflows.

Intel's driver seems to be miscompiling this function, resulting in the return value not being zero when i is 3 in tint_mod(i, 3).
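
For reference, below is a small host-side sketch (plain C, not part of Tint or the CTS; the name tint_mod_ref is mine) that mirrors the logic of the generated tint_mod above. A correct compilation must return 0 for tint_mod(3, 3); the sketch also exercises the two cases the polyfill guards against (x % 0 and INT_MIN % -1).

#include <assert.h>
#include <limits.h>
#include <stdio.h>

/* Host-side mirror of the generated tint_mod; tint_mod_ref is an illustrative name. */
static int tint_mod_ref(int lhs, int rhs) {
  /* Swap rhs for 1 when the native operation would be undefined:
     division by zero, or INT_MIN % -1 (signed overflow). */
  int rhs_or_one = ((rhs == 0) | ((lhs == INT_MIN) & (rhs == -1))) ? 1 : rhs;
  if (((unsigned)(lhs | rhs_or_one) & 2147483648u) != 0u) {
    /* Either operand is negative: compute the remainder as lhs - (lhs / rhs) * rhs. */
    return lhs - ((lhs / rhs_or_one) * rhs_or_one);
  } else {
    return lhs % rhs_or_one;
  }
}

int main(void) {
  assert(tint_mod_ref(1, 3) == 1);
  assert(tint_mod_ref(2, 3) == 2);
  assert(tint_mod_ref(3, 3) == 0);        /* the case the Intel driver appears to get wrong */
  assert(tint_mod_ref(5, 0) == 0);        /* guarded: x % 0 behaves like x % 1 */
  assert(tint_mod_ref(INT_MIN, -1) == 0); /* guarded: avoids signed overflow */
  printf("tint_mod_ref behaves as expected\n");
  return 0;
}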

jzm-intel commented 1 year ago

I checked the output buffer of the flow control test webgpu:shader,execution,flow_control,loop:nested_loops, run on Intel UHD 630. The following are the preventValueOptimizations=true/false cases run with FXC and DXC:

FXC, prevent:         Output values (length: 24): 0, 1, 4, 5, 8, 11, 5, 8, 9, 5, 6, 1, 4, 5, 8, 9, 5, 8, 11, 5, 6, 1, 2, 12
FXC, non-prevent:     Output values (length: 24): 0, 1, 4, 5, 8, 11, 5, 8, 9, 5, 6, 1, 4, 5, 8, 9, 5, 8, 11, 5, 6, 1, 2, 12
DXC, prevent:         Output values (length: 24): 0, 1, 4, 5, 8, 11, 5, 8, 9, 5, 6, 1, 4, 5, 8, 9, 5, 8, 11, 5, 6, 1, 2, 12
DXC, non-prevent (!): Output values (length: 24): 0, 1, 4, 5, 6, 11, 5, 0, 9, 5, 0, 1, 4, 5, 6, 9, 5, 0, 11, 5, 0, 1, 2, 12

In the failed DXC output, it is very strange to see 0 in multiple places, since the value 0 should be pushed only at the very beginning of the shader.

jzm-intel commented 1 year ago

By further making the event IDs start from 1 instead of 0, it turns out that, apart from the very first 0, the 0s in the failed DXC output buffer come from initialization: no value was ever written into the buffer at those positions.

Detailed output for the failed DXC case:

Output values (length: 24): 1, 2, 5, 6, 7, 12, 6, 0, 10, 6, 0, 2, 5, 6, 7, 10, 6, 0, 12, 6, 0, 2, 3, 13

WGSL:

struct Outputs {
  count : u32,
  data  : array<u32>,
};
@group(0) @binding(0) var<storage, read>       inputs  : array<i32>;
@group(0) @binding(1) var<storage, read_write> outputs : Outputs;

fn push_output(value : u32) {
  outputs.data[outputs.count] = value;
  outputs.count++;
}

@compute @workgroup_size(1)
fn main() {
  _ = &inputs;
  _ = &outputs;

  push_output(1);
  var i = 0;
  loop {
    push_output(2);
    if i == 6 {
      push_output(3);
      break;
      push_output(4);
    }
    push_output(5);
    loop {
      i++;
      push_output(6);
      if (i % 3) == 0 {
        push_output(7);
        break;
        push_output(8);
      }
      push_output(9);
      if (i & 1) == 0 {
        push_output(10);
        continue;
        push_output(11);
      }
      push_output(12);
    }
  }
  push_output(13);

}
jzm-intel commented 11 months ago

This issue has been reproduced on Intel UHD 630 with a native D3D12 case using DXC. Newer devices like UHD 770 or A770 seem not to be affected. An internal issue has been filed.

With some further investigation, this looks like a storage buffer read/write issue rather than a general control flow issue. The errors in the output seem to come from certain storage buffer writes not taking place and others landing in the wrong place. Adding statements with side effects around the suspected buffer reads/writes may or may not make the result correct, while the side effects themselves always seem to happen as expected.

Some detailed observations below.

1. Simplified WGSL shader that reproduces the issue

struct Outputs {
  count : u32,
  data  : array<u32>,
};

@group(0) @binding(0) var<storage, read>       inputs  : array<i32>;
@group(0) @binding(1) var<storage, read_write> outputs : Outputs;

fn push_output(value : u32) {
  outputs.data[outputs.count] = value;
  outputs.count++;
}

@compute @workgroup_size(1)
fn main() {
  _ = &inputs;
  _ = &outputs;

  outputs.count = 0;
  var i = 1;
  loop {
    // This expectation is needed
    push_output(1); // expect_order(0)
    loop {
      i++;
      // Before-expectation is needed
      push_output(2); // expect_order(1, 3, 5)
      if (i % 3) == 1 {
      // Passes if the condition is changed to:
      // if (u32(i) % u32(3)) == 1 {
        push_output(3); // expect_order(6)
        break;
      }
      // After-expectation is needed
      push_output(4); // expect_order(2, 4)
    }
    if (i >= 4) {
      break;
    }
  }
}

The expected and the incorrect output are:

Expected:  1, 2, 4, 2, 4, 2, 3
Incorrect: 1, 2, 3, 2, 0, 2, 0

The 0s in the incorrect output are actually the initial values of the buffer.
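
As a sanity check on the expected sequence, here is a small host-side sketch (plain C, mine, not part of the CTS) that walks the same control flow as the simplified shader, with push_output reduced to a print. It produces 1 2 4 2 4 2 3, matching the expected output above.

#include <stdio.h>

/* Mirrors the control flow of the simplified WGSL shader above;
   push_output just prints the value instead of writing to a storage buffer. */
static void push_output(unsigned value) { printf("%u ", value); }

int main(void) {
  int i = 1;
  for (;;) {
    push_output(1);          /* expect_order(0) */
    for (;;) {
      i++;
      push_output(2);        /* expect_order(1, 3, 5) */
      if ((i % 3) == 1) {    /* taken only once i reaches 4 */
        push_output(3);      /* expect_order(6) */
        break;
      }
      push_output(4);        /* expect_order(2, 4) */
    }
    if (i >= 4) {
      break;
    }
  }
  printf("\n");              /* prints: 1 2 4 2 4 2 3 */
  return 0;
}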

2. Changing the storage buffer read/write behavior makes the issue disappear

Currently, the WGSL template used in the flow_control tests directly reads back the output count from the storage buffer, uses it as the index to write the output value, and then updates the output count in the buffer again. If we track the count in a private variable (a local register) instead of reading it back, the result is correct.

Modified WGSL shader that creates correct output:

struct Outputs {
  count : u32,
  data  : array<u32>,
};
@group(0) @binding(0) var<storage, read>       inputs  : array<i32>;
@group(0) @binding(1) var<storage, read_write> outputs : Outputs;

// Track the output count in private variable
var<private> output_count = 0u;

fn push_output(value : u32) {
  // Use output count as index to store the pushed value
  outputs.data[output_count] = value;
  // Update the output count in private variable
  output_count++;
  // Only write the updated output count, never read back from it
  outputs.count = output_count;
}

@compute @workgroup_size(1)
fn main() {
  _ = &inputs;
  _ = &outputs;

  outputs.count = 0;
  var i = 1;
  loop {
    // This expectation is needed
    push_output(1); // expect_order(0)
    loop {
      i++;
      // Before-expectation is needed
      push_output(2); // expect_order(1, 3, 5)
      if (i % 3) == 1 {
        push_output(3); // expect_order(6)
        break;
      }
      // After-expectation is needed
      push_output(4); // expect_order(2, 4)
    }
    if (i >= 4) {
      break;
    }
  }

}

3. Adding statements with side effects around the suspected push_output calls

When adding statements with side effects around the suspected push_output(3) and push_output(4) (using the read-back version of push_output), the output of these push_output calls may (or may not) become correct, while the side effects always seem to happen as expected.

For example, updating a counter acc around these push_output calls:

struct Outputs {
  count : u32,
  data  : array<u32>,
};
@group(0) @binding(0) var<storage, read>       inputs  : array<i32>;
@group(0) @binding(1) var<storage, read_write> outputs : Outputs;

fn push_output(value : u32) {
  outputs.data[outputs.count] = value;
  outputs.count++;
}

@compute @workgroup_size(1)
fn main() {
  _ = &inputs;
  _ = &outputs;

  outputs.count = 0;
  var acc = 0u;
  var i = 1;
  loop {
    // This expectation is needed
    push_output(1); // expect_order(0)
    loop {
      i++;
      // Before-expectation is needed
      push_output(2); // expect_order(1, 3, 5)
      if (i % 3) == 1 {
        acc = acc + 1;  // <--------------------- Acc point 1
        push_output(3); // expect_order(6)
        acc = acc + 1;  // <--------------------- Acc point 2
        break;
      }
      acc = acc + 1;  // <--------------------- Acc point 3
      // After-expectation is needed
      push_output(4); // expect_order(2, 4)
    }
    if (i >= 4) {
      break;
    }
  }

  // Push the acc value to the buffer at the end
  push_output(acc);

}

Denoting each of the three Acc points as present (T) or absent/commented out (F), the corresponding outputs are:

F, F, F: 1, 2, 3, 2, 0, 2, 0, 0
F, F, T: 1, 2, 4, 2, 4, 2, 3, 2 (Correct)
F, T, F: 1, 2, 3, 2, 0, 2, 0, 1
F, T, T: 1, 2, 3, 2, 0, 2, 0, 3
T, F, F: 1, 2, 4, 2, 4, 2, 3, 1 (Correct)
T, F, T: 1, 2, 3, 2, 0, 2, 0, 3
T, T, F: 1, 2, 3, 2, 0, 2, 0, 2
T, T, T: 1, 2, 4, 2, 4, 2, 3, 4 (Correct)

Although the 3 and 4 outputs are correct in only three of the cases, the output acc values are correct in all of them. I think this suggests that the control flow does go as expected, but that certain storage buffer writes are broken.
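
To back up that last observation, here is a small host-side sketch (plain C, mine) that, for each of the eight Acc-point combinations, walks the same control flow as the shader above and prints the acc value a correct execution would produce (acc = p1 + p2 + 2*p3). The values it prints match the final entry of every row in the table, including the rows where the 3 and 4 outputs are wrong.

#include <stdio.h>

/* For each combination of the three Acc points being present (T) or commented
   out (F), simulate the shader's control flow and print the expected acc. */
int main(void) {
  for (int mask = 0; mask < 8; mask++) {
    int p1 = (mask >> 2) & 1, p2 = (mask >> 1) & 1, p3 = mask & 1;
    unsigned acc = 0u;
    int i = 1;
    for (;;) {
      /* push_output(1) */
      for (;;) {
        i++;
        /* push_output(2) */
        if ((i % 3) == 1) {
          acc += (unsigned)p1;  /* Acc point 1 */
          /* push_output(3) */
          acc += (unsigned)p2;  /* Acc point 2 */
          break;
        }
        acc += (unsigned)p3;    /* Acc point 3 */
        /* push_output(4) */
      }
      if (i >= 4) break;
    }
    printf("%c, %c, %c: acc = %u\n",
           p1 ? 'T' : 'F', p2 ? 'T' : 'F', p3 ? 'T' : 'F', acc);
  }
  return 0;
}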