Glavnokoman / vuh

Vulkan compute for people
https://glavnokoman.github.io/vuh
MIT License
346 stars 34 forks source link

Cannot compile #69

Open shangjiaxuan opened 2 years ago

shangjiaxuan commented 2 years ago

Cannot compile sample, error message:

error C2580: 'vuh::Delayed<vuh::detail::Noop>::Delayed(vuh::Delayed<vuh::detail::Noop> &&)': multiple versions of a defaulted special member functions are not allowed
shangjiaxuan commented 2 years ago

Related to #45. vuh2 works, but my dct8x8 compiled from HLSL gives all-zero output data when executed, and no error code is returned. I changed the code to explicitly write array[0]=1.0, and the result is still 0. Resorting to hand-coded Vulkan to find the error.

shangjiaxuan commented 2 years ago

Code in hlsl:

// Image description read by every invocation (constant buffer, register b0).
cbuffer image_info : register(b0)
{
    // Input coordinates are clamped to [0, size - 1] before indexing.
    uint2 size;
    // Added to each tile's origin in main before the input is read.
    int2 offset;
    // Multiplied by the y coordinate to form the row offset into data_in.
    uint x_stride;
    // Per-pixel multiplier used when forming the input index.
    uint pix_stride;
    // .x * 8 is used as the output row width; .z scales the flat output
    // pixel index and the tile origin computed in main.
    uint3 global_size;
};

// Thread-group size constant. It is not referenced in the code shown here;
// the numthreads attribute on main spells the values out literally.
static const uint3 local_size = uint3(1u, 1u, 1u);

// Input samples, read in main through input_local_to_global_idx (register t0).
// NOTE(review): Buffer/RWBuffer are typed buffers; when compiled to SPIR-V they
// presumably become texel buffers rather than storage buffers - confirm that
// the host binds matching descriptor types.
Buffer<float> data_in : register(t0);
// Output values, written in main (register u0).
RWBuffer<float> data_out : register(u0);

// Entry-point input: the dispatch-wide thread id supplied by the system.
// main uses xy to pick the 8x8 tile and passes z through to the index helpers.
struct compute_input {
    uint3 global_idx : SV_DispatchThreadID;
};

// Maps a pixel of the current 8x8 tile to a flat element index into data_in.
//   local_idx   - (x, y) position of the pixel inside the tile.
//   local_start - tile origin in xy; z is added to the resulting index unchanged.
// Coordinates outside [0, size - 1] are clamped, so out-of-range pixels
// resolve to the nearest border pixel.
uint input_local_to_global_idx(uint2 local_idx, int3 local_start)
{
    int2 global_location = local_start.xy + int2(local_idx);
    // if out of global range, clamp to global range here
    global_location = clamp(global_location, int2(0, 0), int2(size) - int2(1, 1));
    uint line_offset = x_stride * global_location.y;
    // The in-row term must use x. The previous code used y here, so the x
    // coordinate never influenced the index and every pixel of a row mapped
    // to the same element.
    return line_offset + global_location.x * pix_stride + local_start.z;
}

// Computes the flat index into data_out for one pixel of an 8x8 tile.
//   local_idx         - (x, y) position of the pixel inside the tile.
//   global_invocation - dispatch thread id; xy selects the tile, z is added
//                       to the final index.
uint output_global_to_local_idx(uint2 local_idx, uint3 global_invocation)
{
    // Width of one output row in pixels: eight per unit of global_size.x.
    uint row_width = global_size.x * 8;
    // Absolute pixel position of this tile element.
    uint2 pixel = local_idx + global_invocation.xy * uint2(8, 8);
    uint pixel_index = pixel.y * row_width + pixel.x;
    return pixel_index * global_size.z + global_invocation.z;
}

// Compute entry point: each thread handles one 8x8 tile selected by
// input.global_idx.xy. It loads the tile, runs dct_8x8_impl on it (defined
// outside the code shown here) and writes 64 output elements.
// NOTE: in its current debugging state the output loop stores the constant 1.0
// instead of the transformed data; the real expression is commented out.
//[numthreads(int(gl_WorkGroupSize.x), int(gl_WorkGroupSize.y), int(gl_WorkGroupSize.z))]
[numthreads(1, 1, 1)]
void main(compute_input input)
{
    // Origin of this thread's 8x8 region: (tile index * 8 + offset), scaled by
    // global_size.z, with the thread's z index carried along as the third component.
    int3 local_start = int3((int2(input.global_idx.xy * uint2(8, 8)) + offset) * int(global_size.z), input.global_idx.z);
    // 2x2 array of 4x4 matrices = storage for the 8x8 tile.
    float4x4 invocation_data[2][2];
    // Load the tile from data_in into invocation_data.
    // j is passed as local_idx.x and i as local_idx.y (the coordinate that is
    // multiplied by x_stride), so i is the vertical index in the input.
    // In invocation_data, j selects the outer array index and the first matrix
    // subscript; i selects the inner array index and the second matrix subscript.
    {
        for (uint i = 0; i < 8; ++i) {
            for (uint j = 0; j < 8; ++j) {
                invocation_data[j / 4][i / 4][j % 4][i % 4] = data_in[input_local_to_global_idx(uint2(j, i), local_start)];
            }
        }
    }
    dct_8x8_impl(invocation_data);
    // Write all 64 elements of the tile to the output (currently the debug
    // constant 1.0, see the note above the entry point).
    {
        for (uint i = 0; i < 8; ++i) {
            for (uint j = 0; j < 8; ++j) {
                data_out[output_global_to_local_idx(uint2(j, i), input.global_idx)] = 1.0;// invocation_data[j / 4][i / 4][j % 4][i % 4];
            }
        }
    }
    // Debug marker: every thread also stores 1 into the first output element.
    data_out[0] = 1;
}

It seems the GLSL code is also not working:

#version 450
#extension GL_GOOGLE_include_directive:enable

#include "dct.glsl"

// One invocation per work group in every dimension.
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
// in pixel units
// use z coordinate for multi channel
// Image description supplied by the host as push constants.
layout(push_constant) uniform Image_info
{
    // Input coordinates are clamped to [0, size - 1] before indexing.
    uvec2 size;
    // Added to each tile's origin in main before the input is read.
    ivec2 offset;
    // Multiplied by the y coordinate to form the row offset into data_in.
    uint x_stride;
    // NOTE(review): not referenced by the GLSL shown here - confirm whether
    // dct.glsl or the host relies on it.
    uint pix_stride;
    // NOTE(review): not referenced by the GLSL shown here - confirm whether
    // dct.glsl or the host relies on it.
    uvec3 global_size;
} image_info;

// Input samples, read-only, bound at binding 1.
// NOTE(review): the bindings declared here start at 1 and no binding 0 is
// visible in this snippet - confirm that the host's descriptor set layout
// binds the buffers at 1 and 2 rather than 0 and 1.
layout(binding = 1) readonly buffer image_input
{
    float data_in[];
};

// Output values, write-only, bound at binding 2.
layout(binding = 2) writeonly buffer image_output
{
    float data_out[];
};

// Maps a pixel of the current 8x8 tile to a flat element index into data_in.
//   local_idx   - (x, y) position of the pixel inside the tile.
//   local_start - origin of the tile.
// Coordinates outside [0, image_info.size - 1] are clamped, so out-of-range
// pixels resolve to the nearest border pixel. The local z invocation id is
// added to the resulting index.
uint input_local_to_global_idx(uvec2 local_idx, ivec2 local_start) {
    ivec2 global_idx = local_start + ivec2(local_idx);
    // if out of global range, clamp to global range here
    global_idx = clamp(global_idx, ivec2(0,0), ivec2(image_info.size)-ivec2(1,1));
    uint line_offset = image_info.x_stride*global_idx.y;
    // The in-row term must use x. The previous code used y here, so the x
    // coordinate never influenced the index and every pixel of a row mapped
    // to the same element.
    // NOTE(review): the per-pixel multiplier is gl_WorkGroupSize.z, not
    // image_info.pix_stride - confirm this is intended.
    return line_offset + global_idx.x * gl_WorkGroupSize.z + gl_LocalInvocationID.z;
}

// Computes the flat index into data_out for one pixel of this invocation's
// 8x8 tile. local_idx is the (x, y) position of the pixel inside the tile;
// the tile itself is selected by gl_GlobalInvocationID.xy.
uint output_global_to_local_idx(uvec2 local_idx) {
    // Width of one output row in pixels: eight per invocation along x.
    uint row_width = gl_NumWorkGroups.x * gl_WorkGroupSize.x * 8u;
    // Absolute pixel position of this tile element.
    uvec2 pixel = local_idx + gl_GlobalInvocationID.xy * uvec2(8u, 8u);
    uint pixel_index = pixel.y * row_width + pixel.x;
    return pixel_index * gl_WorkGroupSize.z + gl_LocalInvocationID.z;
}

// Compute entry point: each invocation handles one 8x8 tile selected by
// gl_GlobalInvocationID.xy. It loads the tile, runs dct_8x8_impl on it
// (not defined in the code shown here; presumably provided by dct.glsl)
// and writes the 64 results to data_out.
void main(void) {
    // Origin of this invocation's 8x8 region: (tile index * 8 + offset),
    // scaled by gl_WorkGroupSize.z.
    ivec2 local_start = (ivec2(gl_GlobalInvocationID.xy*uvec2(8,8))+image_info.offset)*ivec2(gl_WorkGroupSize.z);
    // 2x2 array of 4x4 matrices = storage for the 8x8 tile.
    mat4x4 invocation_data[2][2];
    // Load the tile from data_in into invocation_data.
    // i: vertical index in input,
    //    horizontal index in invocation_data
    for(uint i = 0; i<8; ++i){
        for(uint j = 0; j<8; ++j){
            invocation_data[i/4][j/4][i%4][j%4] = data_in[input_local_to_global_idx(uvec2(j,i), local_start)];
        }
    }
    dct_8x8_impl(invocation_data);
    //copy all data to output
    for(uint i = 0; i<8; ++i){
        for(uint j = 0; j<8; ++j){
            data_out[output_global_to_local_idx(uvec2(j, i))] = invocation_data[i/4][j/4][i%4][j%4];
        }
    }
    // Debug marker: every invocation also stores 1.0 into the first output element.
    data_out[0] = 1.0;
}