can't compile compute shader when using `SHADER_INT64`

Jianqoq commented 2 months ago

currently I am trying to compile the wgsl code but it keep saysing

NVVM compilation failed: 1
thread 'main' panicked at C:\Users\123\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-22.1.0\src\backend\wgpu_core.rs:3411:5:
wgpu error: Validation Error

Caused by:
  In Device::create_compute_pipeline, label = 'compute_pipeline'
    Parent device is lost`

source_code:

@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> a_strides : array<i64>;

@group(0) @binding(2) var<storage, read> b : array<f32>;
@group(0) @binding(3) var<storage, read> b_strides : array<i64>;

@group(0) @binding(4) var<storage, read_write> c : array<f32>;
@group(0) @binding(5) var<storage, read> c_strides : array<i64>;
@group(0) @binding(6) var<storage, read> c_shape : array<i64>;

@group(0) @binding(7) var<storage, read> outer_loop_size : i64;
@group(0) @binding(8) var<storage, read> inner_loop_size : i64;

@group(0) @binding(9) var<storage, read> res_ndim : i64;

@compute
@workgroup_size(16, 16, 1)
fn main(
@builtin(workgroup_id) workgroup_id : vec3 <u32>,
@builtin(local_invocation_id) local_id : vec3 <u32>
)
{
   let global_id_x: i64 = i64(workgroup_id.x) * 16 + i64(local_id.x);

   let tmp: i64 = outer_loop_size % (NUM_GRP_X * 16);
   let start_idx: i64 = global_id_x * (outer_loop_size / (NUM_GRP_X * 16)) + min(global_id_x, tmp);
   var end_idx: i64 = start_idx + (outer_loop_size / (NUM_GRP_X * 16)) + i64(global_id_x < tmp);

   if end_idx - start_idx == 0 {
      return;
   }
   var amount : i64 = start_idx * inner_loop_size;
   var c_offset : i64 = 0;
   var a_offset : i64 = 0;
   var b_offset : i64 = 0;
   var prg : array<i64, 3>;
   for (var i : i64 = res_ndim - 1; i >= 0; i--)
   {
      let tmp : i64 = amount % c_shape[i];
      c_offset += tmp * c_strides[i];
      a_offset += tmp * a_strides[i];
      b_offset += tmp * b_strides[i];
      prg[i] = tmp;
      amount /= c_shape[i];
   }
   let global_id_y: i64 = i64(workgroup_id.y) * 16 + i64(local_id.y);

   let tmp2: i64 = inner_loop_size % (NUM_GRP_Y * 16);
   let start_idx2: i64 = global_id_y * (inner_loop_size / NUM_GRP_Y * 16) + min(global_id_y, tmp2);
   var end_idx2: i64 = start_idx2 + (inner_loop_size / NUM_GRP_Y * 16) + i64(global_id_y < tmp2);

   let c_last_stride: i64 = c_strides[res_ndim - 1];
   let a_last_stride: i64 = a_strides[res_ndim - 1];
   let b_last_stride: i64 = b_strides[res_ndim - 1];

   c_offset += c_last_stride * start_idx2;
   a_offset += a_last_stride * start_idx2;
   b_offset += b_last_stride * start_idx2;

   if end_idx2 - start_idx2 == 0 {
      return;
   }

   let inner_loop_size: i64 = end_idx2 - start_idx2;
   let outer_loop_size: i64 = end_idx - start_idx;

   for (var j : i64 = 0; j < outer_loop_size; j++)
   {
      for (var i : i64 = 0; i < inner_loop_size; i++)
      {
         c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];
      }
      for (var i : i64 = 0; i < res_ndim; i++)
      {
         if (prg[i] + 1 < c_shape[i])
         {
            prg[i]++;
            c_offset += c_strides[i];
            a_offset += a_strides[i];
            b_offset += b_strides[i];
            break;
         }
         else
         {
            prg[i] = i64(0);
            c_offset -= c_strides[i] * (c_shape[i] - 1);
            a_offset -= a_strides[i] * (c_shape[i] - 1);
            b_offset -= b_strides[i] * (c_shape[i] - 1);
         }
      }
   }
}

async fn create_device() -> (wgpu::Device, wgpu::Queue) {
    // Instantiates instance of WebGPU
    let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
        backends: wgpu::Backends::VULKAN | wgpu::Backends::METAL | wgpu::Backends::DX12 | wgpu::Backends::GL | wgpu::Backends::BROWSER_WEBGPU,
        flags: InstanceFlags::VALIDATION,
        dx12_shader_compiler: Dx12Compiler::Fxc,
        gles_minor_version: Gles3MinorVersion::default(),
    });

    // `request_adapter` instantiates the general connection to the GPU
    let adapter = instance
        .request_adapter(
            &(RequestAdapterOptions {
                power_preference: wgpu::PowerPreference::HighPerformance,
                compatible_surface: None,
                force_fallback_adapter: false,
            })
        ).await
        .unwrap();

    // `request_device` instantiates the feature specific connection to the GPU, defining some parameters,
    //  `features` being the available features.
    let limits = wgpu::Limits {
        max_buffer_size: 20 * 1024 * 1024 * 1024,
        max_storage_buffers_per_shader_stage: 12,
        ..wgpu::Limits::default()
    };
    adapter
        .request_device(
            &(wgpu::DeviceDescriptor {
                label: None,
                required_features: wgpu::Features::SHADER_INT64,
                required_limits: limits,
                memory_hints: wgpu::MemoryHints::MemoryUsage,
            }),
            None
        ).await
        .unwrap()
}

Jianqoq commented 2 months ago

I am able to compile now, when I debug, the error is actually StageError, and the error says the int type has problem but I can't see the actual msg by using the debugger. I casted all the index from i64 to u32

Jianqoq commented 2 months ago

@group(0) @binding(0) var<storage, read> a : array<a_ty>;
@group(0) @binding(1) var<storage, read> a_strides : array<i64>;

@group(0) @binding(2) var<storage, read> b : array<b_ty>;
@group(0) @binding(3) var<storage, read> b_strides : array<i64>;

@group(0) @binding(4) var<storage, read_write> c : array<c_ty>;
@group(0) @binding(5) var<storage, read> c_strides : array<i64>;
@group(0) @binding(6) var<storage, read> c_shape : array<i64>;

@group(0) @binding(7) var<storage, read> outer_loop_size : i64;
@group(0) @binding(8) var<storage, read> inner_loop_size : i64;

@group(0) @binding(9) var<storage, read> res_ndim : i64;

@compute
@workgroup_size(GRP_SIZE_X, GRP_SIZE_Y, 1)
fn main(
@builtin(workgroup_id) workgroup_id : vec3 <u32>,
@builtin(local_invocation_id) local_id : vec3 <u32>
)
{
   let global_id_x: i64 = i64(workgroup_id.x) * GRP_SIZE_X + i64(local_id.x);

   let tmp: i64 = outer_loop_size % (NUM_GRP_X * GRP_SIZE_X);
   let start_idx: i64 = global_id_x * (outer_loop_size / (NUM_GRP_X * GRP_SIZE_X)) + min(global_id_x, tmp);
   var end_idx: i64 = start_idx + (outer_loop_size / (NUM_GRP_X * GRP_SIZE_X)) + i64(global_id_x < tmp);

   if end_idx - start_idx == 0 {
      return;
   }
   var amount : i64 = start_idx * inner_loop_size;
   var c_offset : i64 = 0;
   var a_offset : i64 = 0;
   var b_offset : i64 = 0;
   var prg : array<i64, prg_place_holder>;
   for (var i : i64 = res_ndim - 1; i >= 0; i--)
   {
      let idx: u32 = u32(i);
      let tmp : i64 = amount % c_shape[idx];
      c_offset += tmp * c_strides[idx];
      a_offset += tmp * a_strides[idx];
      b_offset += tmp * b_strides[idx];
      prg[idx] = tmp;
      amount /= c_shape[idx];
   }
   let global_id_y: i64 = i64(workgroup_id.y) * GRP_SIZE_Y + i64(local_id.y);

   let tmp2: i64 = inner_loop_size % (NUM_GRP_Y * GRP_SIZE_Y);
   let start_idx2: i64 = global_id_y * (inner_loop_size / NUM_GRP_Y * GRP_SIZE_Y) + min(global_id_y, tmp2);
   var end_idx2: i64 = start_idx2 + (inner_loop_size / NUM_GRP_Y * GRP_SIZE_Y) + i64(global_id_y < tmp2);

   let c_last_stride: i64 = c_strides[res_ndim - 1];
   let a_last_stride: i64 = a_strides[res_ndim - 1];
   let b_last_stride: i64 = b_strides[res_ndim - 1];

   c_offset += c_last_stride * start_idx2;
   a_offset += a_last_stride * start_idx2;
   b_offset += b_last_stride * start_idx2;

   if end_idx2 - start_idx2 == 0 {
      return;
   }

   let inner_loop_size: i64 = end_idx2 - start_idx2;
   let outer_loop_size: i64 = end_idx - start_idx;

   for (var j : i64 = 0; j < outer_loop_size; j++)
   {
      for (var i : i64 = 0; i < inner_loop_size; i++)
      {
         c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];
      }
      for (var k : i64 = res_ndim - 2; k >= 0; k--)
      {
         let idx: u32 = u32(k);
         if (prg[idx] + 1 < c_shape[idx])
         {
            prg[idx]++;
            c_offset += c_strides[idx];
            a_offset += a_strides[idx];
            b_offset += b_strides[idx];
            break;
         }
         else
         {
            prg[idx] = i64(0);
            c_offset -= c_strides[idx] * (c_shape[idx] - 1);
            a_offset -= a_strides[idx] * (c_shape[idx] - 1);
            b_offset -= b_strides[idx] * (c_shape[idx] - 1);
         }
      }
   }
}

Jianqoq commented 2 months ago

Maybe the validation should also check the var type for indexing

Jianqoq commented 2 months ago

actually, the i64 indexing is also ok, but just not sure why the validation failed c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];, this is using i64 for indexing,

let idx: u32 = u32(k);
         if (prg[idx] + 1 < c_shape[idx])

but this must use u32

teoxoy commented 2 months ago

A bunch of declarations are missing from both shaders (a_type, b_type, NUM_GRP_X, GRP_SIZE_X, prg_place_holder, ...). Please provide the full shader code or a reduced version if those items are not relevant.

Also, which OS and backend are you seeing the issue on?

Jianqoq commented 2 months ago

I am using windows11 and vulkan | DX12 backend, RTX 4090.

@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> a_strides : array<i64>;

@group(0) @binding(2) var<storage, read> b : array<f32>;
@group(0) @binding(3) var<storage, read> b_strides : array<i64>;

@group(0) @binding(4) var<storage, read_write> c : array<f32>;
@group(0) @binding(5) var<storage, read> c_strides : array<i64>;
@group(0) @binding(6) var<storage, read> c_shape : array<i64>;

@group(0) @binding(7) var<storage, read> outer_loop_size : i64;
@group(0) @binding(8) var<storage, read> inner_loop_size : i64;

@group(0) @binding(9) var<storage, read> res_ndim : i64;

@compute
@workgroup_size(16, 16, 1)
fn main(
@builtin(workgroup_id) workgroup_id : vec3 <u32>,
@builtin(local_invocation_id) local_id : vec3 <u32>
)
{
   let global_id_x: i64 = i64(workgroup_id.x) * 16 + i64(local_id.x);

   let tmp: i64 = outer_loop_size % (1024* 16);
   let start_idx: i64 = global_id_x * (outer_loop_size / (1024 * 16)) + min(global_id_x, tmp);
   var end_idx: i64 = start_idx + (outer_loop_size / (1024 * 16)) + i64(global_id_x < tmp);

   if end_idx - start_idx == 0 {
      return;
   }
   var amount : i64 = start_idx * inner_loop_size;
   var c_offset : i64 = 0;
   var a_offset : i64 = 0;
   var b_offset : i64 = 0;
   var prg : array<i64, 3>;
   for (var i : i64 = res_ndim - 1; i >= 0; i--)
   {
      let tmp : i64 = amount % c_shape[i];
      c_offset += tmp * c_strides[i];
      a_offset += tmp * a_strides[i];
      b_offset += tmp * b_strides[i];
      prg[i] = tmp;
      amount /= c_shape[i];
   }
   let global_id_y: i64 = i64(workgroup_id.y) * 16 + i64(local_id.y);

   let tmp2: i64 = inner_loop_size % (1024 * 16);
   let start_idx2: i64 = global_id_y * (inner_loop_size / 1024 * 16) + min(global_id_y, tmp2);
   var end_idx2: i64 = start_idx2 + (inner_loop_size / 1024 * 16) + i64(global_id_y < tmp2);

   let c_last_stride: i64 = c_strides[res_ndim - 1];
   let a_last_stride: i64 = a_strides[res_ndim - 1];
   let b_last_stride: i64 = b_strides[res_ndim - 1];

   c_offset += c_last_stride * start_idx2;
   a_offset += a_last_stride * start_idx2;
   b_offset += b_last_stride * start_idx2;

   if end_idx2 - start_idx2 == 0 {
      return;
   }

   let inner_loop_size: i64 = end_idx2 - start_idx2;
   let outer_loop_size: i64 = end_idx - start_idx;

   for (var j : i64 = 0; j < outer_loop_size; j++)
   {
      for (var i : i64 = 0; i < inner_loop_size; i++)
      {
         c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];
      }
      for (var i : i64 = 0; i < res_ndim; i++)
      {
         if (prg[i] + 1 < c_shape[i])
         {
            prg[i]++;
            c_offset += c_strides[i];
            a_offset += a_strides[i];
            b_offset += b_strides[i];
            break;
         }
         else
         {
            prg[i] = i64(0);
            c_offset -= c_strides[i] * (c_shape[i] - 1);
            a_offset -= a_strides[i] * (c_shape[i] - 1);
            b_offset -= b_strides[i] * (c_shape[i] - 1);
         }
      }
   }
}

this is the full shader code

teoxoy commented 2 months ago

SHADER_INT64 will only be exposed on DX12 if Dx12Compiler::Dxc is used. So, I guess the "Parent device is lost" error is coming from the Vulkan backend. I ran the SPIR-V shader generated by naga through spirv-val and there were no errors. This might be a driver bug.

I don't fully understand this comment https://github.com/gfx-rs/wgpu/issues/6081#issuecomment-2270171617, did you manage to find the issue and work around it? If so, can you go into the details?

Jianqoq commented 2 months ago

Yup, I worked around it and I found that when I cast some number to u32, then the validation will pass

gfx-rs / wgpu

can't compile compute shader when using `SHADER_INT64` #6081