Open Jianqoq opened 2 months ago
I am able to compile now, when I debug, the error is actually StageError, and the error says the int type has problem but I can't see the actual msg by using the debugger. I casted all the index from i64 to u32
@group(0) @binding(0) var<storage, read> a : array<a_ty>;
@group(0) @binding(1) var<storage, read> a_strides : array<i64>;
@group(0) @binding(2) var<storage, read> b : array<b_ty>;
@group(0) @binding(3) var<storage, read> b_strides : array<i64>;
@group(0) @binding(4) var<storage, read_write> c : array<c_ty>;
@group(0) @binding(5) var<storage, read> c_strides : array<i64>;
@group(0) @binding(6) var<storage, read> c_shape : array<i64>;
@group(0) @binding(7) var<storage, read> outer_loop_size : i64;
@group(0) @binding(8) var<storage, read> inner_loop_size : i64;
@group(0) @binding(9) var<storage, read> res_ndim : i64;
@compute
@workgroup_size(GRP_SIZE_X, GRP_SIZE_Y, 1)
fn main(
@builtin(workgroup_id) workgroup_id : vec3 <u32>,
@builtin(local_invocation_id) local_id : vec3 <u32>
)
{
let global_id_x: i64 = i64(workgroup_id.x) * GRP_SIZE_X + i64(local_id.x);
let tmp: i64 = outer_loop_size % (NUM_GRP_X * GRP_SIZE_X);
let start_idx: i64 = global_id_x * (outer_loop_size / (NUM_GRP_X * GRP_SIZE_X)) + min(global_id_x, tmp);
var end_idx: i64 = start_idx + (outer_loop_size / (NUM_GRP_X * GRP_SIZE_X)) + i64(global_id_x < tmp);
if end_idx - start_idx == 0 {
return;
}
var amount : i64 = start_idx * inner_loop_size;
var c_offset : i64 = 0;
var a_offset : i64 = 0;
var b_offset : i64 = 0;
var prg : array<i64, prg_place_holder>;
for (var i : i64 = res_ndim - 1; i >= 0; i--)
{
let idx: u32 = u32(i);
let tmp : i64 = amount % c_shape[idx];
c_offset += tmp * c_strides[idx];
a_offset += tmp * a_strides[idx];
b_offset += tmp * b_strides[idx];
prg[idx] = tmp;
amount /= c_shape[idx];
}
let global_id_y: i64 = i64(workgroup_id.y) * GRP_SIZE_Y + i64(local_id.y);
let tmp2: i64 = inner_loop_size % (NUM_GRP_Y * GRP_SIZE_Y);
let start_idx2: i64 = global_id_y * (inner_loop_size / NUM_GRP_Y * GRP_SIZE_Y) + min(global_id_y, tmp2);
var end_idx2: i64 = start_idx2 + (inner_loop_size / NUM_GRP_Y * GRP_SIZE_Y) + i64(global_id_y < tmp2);
let c_last_stride: i64 = c_strides[res_ndim - 1];
let a_last_stride: i64 = a_strides[res_ndim - 1];
let b_last_stride: i64 = b_strides[res_ndim - 1];
c_offset += c_last_stride * start_idx2;
a_offset += a_last_stride * start_idx2;
b_offset += b_last_stride * start_idx2;
if end_idx2 - start_idx2 == 0 {
return;
}
let inner_loop_size: i64 = end_idx2 - start_idx2;
let outer_loop_size: i64 = end_idx - start_idx;
for (var j : i64 = 0; j < outer_loop_size; j++)
{
for (var i : i64 = 0; i < inner_loop_size; i++)
{
c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];
}
for (var k : i64 = res_ndim - 2; k >= 0; k--)
{
let idx: u32 = u32(k);
if (prg[idx] + 1 < c_shape[idx])
{
prg[idx]++;
c_offset += c_strides[idx];
a_offset += a_strides[idx];
b_offset += b_strides[idx];
break;
}
else
{
prg[idx] = i64(0);
c_offset -= c_strides[idx] * (c_shape[idx] - 1);
a_offset -= a_strides[idx] * (c_shape[idx] - 1);
b_offset -= b_strides[idx] * (c_shape[idx] - 1);
}
}
}
}
Maybe the validation should also check the var type for indexing
actually, the i64 indexing is also ok, but just not sure why the validation failed
c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];
, this is using i64 for indexing,
let idx: u32 = u32(k);
if (prg[idx] + 1 < c_shape[idx])
but this must use u32
A bunch of declarations are missing from both shaders (a_type
, b_type
, NUM_GRP_X
, GRP_SIZE_X
, prg_place_holder
, ...).
Please provide the full shader code or a reduced version if those items are not relevant.
Also, which OS and backend are you seeing the issue on?
I am using windows11 and vulkan | DX12 backend, RTX 4090.
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> a_strides : array<i64>;
@group(0) @binding(2) var<storage, read> b : array<f32>;
@group(0) @binding(3) var<storage, read> b_strides : array<i64>;
@group(0) @binding(4) var<storage, read_write> c : array<f32>;
@group(0) @binding(5) var<storage, read> c_strides : array<i64>;
@group(0) @binding(6) var<storage, read> c_shape : array<i64>;
@group(0) @binding(7) var<storage, read> outer_loop_size : i64;
@group(0) @binding(8) var<storage, read> inner_loop_size : i64;
@group(0) @binding(9) var<storage, read> res_ndim : i64;
@compute
@workgroup_size(16, 16, 1)
fn main(
@builtin(workgroup_id) workgroup_id : vec3 <u32>,
@builtin(local_invocation_id) local_id : vec3 <u32>
)
{
let global_id_x: i64 = i64(workgroup_id.x) * 16 + i64(local_id.x);
let tmp: i64 = outer_loop_size % (1024* 16);
let start_idx: i64 = global_id_x * (outer_loop_size / (1024 * 16)) + min(global_id_x, tmp);
var end_idx: i64 = start_idx + (outer_loop_size / (1024 * 16)) + i64(global_id_x < tmp);
if end_idx - start_idx == 0 {
return;
}
var amount : i64 = start_idx * inner_loop_size;
var c_offset : i64 = 0;
var a_offset : i64 = 0;
var b_offset : i64 = 0;
var prg : array<i64, 3>;
for (var i : i64 = res_ndim - 1; i >= 0; i--)
{
let tmp : i64 = amount % c_shape[i];
c_offset += tmp * c_strides[i];
a_offset += tmp * a_strides[i];
b_offset += tmp * b_strides[i];
prg[i] = tmp;
amount /= c_shape[i];
}
let global_id_y: i64 = i64(workgroup_id.y) * 16 + i64(local_id.y);
let tmp2: i64 = inner_loop_size % (1024 * 16);
let start_idx2: i64 = global_id_y * (inner_loop_size / 1024 * 16) + min(global_id_y, tmp2);
var end_idx2: i64 = start_idx2 + (inner_loop_size / 1024 * 16) + i64(global_id_y < tmp2);
let c_last_stride: i64 = c_strides[res_ndim - 1];
let a_last_stride: i64 = a_strides[res_ndim - 1];
let b_last_stride: i64 = b_strides[res_ndim - 1];
c_offset += c_last_stride * start_idx2;
a_offset += a_last_stride * start_idx2;
b_offset += b_last_stride * start_idx2;
if end_idx2 - start_idx2 == 0 {
return;
}
let inner_loop_size: i64 = end_idx2 - start_idx2;
let outer_loop_size: i64 = end_idx - start_idx;
for (var j : i64 = 0; j < outer_loop_size; j++)
{
for (var i : i64 = 0; i < inner_loop_size; i++)
{
c[c_offset + i * c_last_stride] = a[a_offset + i * a_last_stride] + b[b_offset + i * b_last_stride];
}
for (var i : i64 = 0; i < res_ndim; i++)
{
if (prg[i] + 1 < c_shape[i])
{
prg[i]++;
c_offset += c_strides[i];
a_offset += a_strides[i];
b_offset += b_strides[i];
break;
}
else
{
prg[i] = i64(0);
c_offset -= c_strides[i] * (c_shape[i] - 1);
a_offset -= a_strides[i] * (c_shape[i] - 1);
b_offset -= b_strides[i] * (c_shape[i] - 1);
}
}
}
}
this is the full shader code
SHADER_INT64
will only be exposed on DX12
if Dx12Compiler::Dxc
is used. So, I guess the "Parent device is lost" error is coming from the Vulkan backend. I ran the SPIR-V shader generated by naga through spirv-val
and there were no errors. This might be a driver bug.
I don't fully understand this comment https://github.com/gfx-rs/wgpu/issues/6081#issuecomment-2270171617, did you manage to find the issue and work around it? If so, can you go into the details?
Yup, I worked around it and I found that when I cast some number to u32, then the validation will pass
currently I am trying to compile the wgsl code but it keep saysing
source_code: