microsoft / DirectXShaderCompiler

This repo hosts the source for the DirectX Shader Compiler which is based on LLVM/Clang.
Other
3.09k stars 690 forks source link

struct value on "stack" #4497

Open laurentcau opened 2 years ago

laurentcau commented 2 years ago

Hi,

I noticed a strange behavior of the compiler when using intermediate value. It doesn't generate the same code when using a structuredbuffer directly or through a variable. Here is an example:

struct SData
{
    float3 value;
    uint type;
    float4 value2;
};

StructuredBuffer<SData> dataBuffer;

void fct1(SData data)
{
    [branch]if (data.type == 0)
        [branch] if(data.value.x < 0.0f)
            discard;
}

void test1()
{
    fct1(dataBuffer[0]);
}

void fct2(int id)
{
    [branch] if (dataBuffer[id].type == 0)
        [branch] if (dataBuffer[id].value.x < 0.0f)
              discard;
}

void test2()
{
    fct2(0);
}

I expected the compiler to generate the same code but it doesn’t.

Test1 generates code:

define void @main() {
  %dataBuffer_texture_structbuf = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 0, i1 false), !dbg !472  ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 0.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 0.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 0.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 0.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 1, float 0.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 2, float 0.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 3, float 1.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  %RawBufferLoad11 = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %dataBuffer_texture_structbuf, i32 0, i32 0, i8 7, i32 4), !dbg !476  ; RawBufferLoad(srv,index,elementOffset,mask,alignment)
  %1 = extractvalue %dx.types.ResRet.f32 %RawBufferLoad11, 0, !dbg !476
  %RawBufferLoad = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %dataBuffer_texture_structbuf, i32 0, i32 12, i8 1, i32 4), !dbg !476  ; RawBufferLoad(srv,index,elementOffset,mask,alignment)
  %2 = extractvalue %dx.types.ResRet.i32 %RawBufferLoad, 0, !dbg !476
  call void @llvm.dbg.value(metadata i32 %2, i64 0, metadata !477, metadata !478), !dbg !479
  %3 = icmp eq i32 %2, 0, !dbg !481
  %4 = fcmp fast olt float %1, 0.000000e+00, !dbg !483
  %or.cond = and i1 %4, %3, !dbg !485
  br i1 %or.cond, label %5, label %"\01?ps_test1@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit", !dbg !485, !dx.controlflow.hints !486

; <label>:5                                       ; preds = %0
  call void @dx.op.discard(i32 82, i1 true), !dbg !487  ; Discard(condition)
  br label %"\01?ps_test1@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit", !dbg !487

"\01?ps_test1@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit": ; preds = %5, %0
  ret void, !dbg !488
} 

Test2 generates code:

define void @main() {
  %dataBuffer_texture_structbuf = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 0, i1 false), !dbg !472  ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 0.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 0.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 0.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+00), !dbg !473  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 0.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 1, float 0.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 2, float 0.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 3, float 1.000000e+00), !dbg !475  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !476, metadata !477), !dbg !478
  %RawBufferLoad7 = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %dataBuffer_texture_structbuf, i32 0, i32 12, i8 1, i32 4), !dbg !480  ; RawBufferLoad(srv,index,elementOffset,mask,alignment)
  %1 = extractvalue %dx.types.ResRet.i32 %RawBufferLoad7, 0, !dbg !480
  %2 = icmp eq i32 %1, 0, !dbg !482
  br i1 %2, label %3, label %"\01?ps_test2@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit", !dbg !483, !dx.controlflow.hints !484

; <label>:3                                       ; preds = %0
  %RawBufferLoad = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %dataBuffer_texture_structbuf, i32 0, i32 0, i8 7, i32 4), !dbg !485  ; RawBufferLoad(srv,index,elementOffset,mask,alignment)
  %4 = extractvalue %dx.types.ResRet.f32 %RawBufferLoad, 0, !dbg !485
  %5 = fcmp fast olt float %4, 0.000000e+00, !dbg !487
  br i1 %5, label %6, label %"\01?ps_test2@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit", !dbg !488, !dx.controlflow.hints !489

; <label>:6                                       ; preds = %3
  call void @dx.op.discard(i32 82, i1 true), !dbg !488  ; Discard(condition)
  br label %"\01?ps_test2@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit", !dbg !488

"\01?ps_test2@@YA?AUPS_HDR_OUTPUT@@UVS_PS_COMMON@@UPSGPUVar@@@Z.exit": ; preds = %6, %3, %0
  ret void, !dbg !490
}

The second version looks better since the memory fetch is done only when the fist condition is true.

laurentcau commented 2 years ago

removed

tex3d commented 2 years ago

Technically, HLSL argument passing is all by-value, meaning copy-in (and copy-out for inout/out args). This means that if you pass a structure from the StructuredBuffer to the function, it's loading the entire structure, then copying it into the input argument of the function.

After inlining and some optimization, the aliasing is known, and in theory the load could sink into a branch. However, the branch gets eliminated by simplifycfg fairly early, even with the [branch] attribute, since the operations in the branch at this point are just trivial extract+fcmp+branch again.

Even if the branch was preserved until later, DXC is very conservative with sinking/hoisting loads/stores due to the potential for certain properties not tracked well in the IR to complicate things and make the transformation illegal. Generally, back-ends should know best where to move/group loads and stores for performance.

We can keep this issue open to track improving cases such as this, and potentially:

  1. teach simplifycfg to preserve even trivial branches with the [branch] attribute
  2. sink loads into control flow when trivially legal
llvm-beanz commented 1 month ago

Compiler Explorer