ytgui / temp

0 stars 0 forks source link

Vulkan 3: LLVM-Based Pipeline Compiler (AMD YES !) #71

Closed ytgui closed 5 years ago

ytgui commented 5 years ago

Build and Run

https://github.com/GPUOpen-Drivers/AMDVLK

1. repo

# Create ~/bin if it is missing; -p keeps the step re-runnable when the
# directory already exists (plain mkdir would fail).
mkdir -p ~/bin
# Download the repo launcher script into ~/bin.
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
# Make the downloaded script executable for all users.
chmod a+x ~/bin/repo

2. Source

# Create the workspace directory; -p avoids an error if it already exists.
mkdir -p vulkandriver
cd vulkandriver
# Point repo at the AMDVLK manifest (master branch), then fetch all projects.
~/bin/repo init -u https://github.com/GPUOpen-Drivers/AMDVLK.git -b master
~/bin/repo sync

3. build

# <vulkandriver_path> is a placeholder: substitute the workspace directory
# created in the previous step before running this line.
cd <vulkandriver_path>/drivers/xgl
# -p lets the step be re-run: with plain mkdir an existing build/ directory
# makes mkdir fail and the `&& cd build` part is skipped.
mkdir -p build && cd build
# Configure a Debug build from the parent directory's CMake project.
cmake -DCMAKE_BUILD_TYPE=Debug ..
# Build with one job per CPU reported by nproc.
make -j"$(nproc)"

4. run

# Compile the GLSL vertex shader to SPIR-V (-V). No output name is given;
# the next command reads vert.spv, presumably the default output name for a
# .vert input — confirm against the glslangValidator usage text.
./glslangValidator -V vulkan_101.vert
# Feed the SPIR-V to amdllpc with debug output enabled, printing the IR after
# each pass (-print-after-all).
./amdllpc vert.spv -debug -print-after-all
ytgui commented 5 years ago

Basic VS I/O

#version 450
// Basic vertex shader: reads a position and a color attribute, writes the
// position to gl_Position and forwards the color unchanged.

precision highp float;
// Inputs at locations 0 and 1; a single output at location 0.
layout (location = 0) in vec3 inPosition;
layout (location = 1) in vec3 inColor;
layout (location = 0) out vec3 outColor;

void main()
{
  // NOTE(review): w is set to 0.0 here, whereas a position normally uses
  // w = 1.0 — confirm this is intentional. The IR dump that follows was
  // produced from this exact constant.
  gl_Position = vec4(inPosition, 0.0);
  // Pass the color straight through.
  outColor = inColor;
}
; LLPC IR for the basic vertex shader: imports both inputs, widens the
; position to four components, and exports the position, the remaining
; built-in outputs and the color.
define spir_func void @llpc.shader.VS.main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32) local_unnamed_addr #0 !spirv.ExecutionModel !6 !llpc.shaderstage !6 {
.entry:
  ; inColor: generic input import; first operand 1 matches `location = 1`
  %7 = call <3 x float> @llpc.input.import.generic.v3f32.i32.i32(i32 1, i32 0) #0
  ; inPosition: generic input import; first operand 0 matches `location = 0`
  %8 = call <3 x float> @llpc.input.import.generic.v3f32.i32.i32(i32 0, i32 0) #0
  ; vec3 -> vec4: extract x/y/z and insert them into a <4 x float> whose
  ; fourth lane is already the constant 0.0 (the w from the GLSL constructor)
  %9 = extractelement <3 x float> %8, i32 0
  %10 = extractelement <3 x float> %8, i32 1
  %11 = extractelement <3 x float> %8, i32 2
  %12 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %9, i32 0
  %13 = insertelement <4 x float> %12, float %10, i32 1
  %14 = insertelement <4 x float> %13, float %11, i32 2
  ; gl_Position
  call void @llpc.output.export.builtin.Position.i32.i32.v4f32(i32 0, i32 0, <4 x float> %14) #0
  ; Built-in outputs the GLSL source never writes (PointSize, ClipDistance,
  ; CullDistance): exported with undef values.
  ; NOTE(review): the leading i32 operands (0, 1, 3, 4) are presumably the
  ; SPIR-V BuiltIn enum values — confirm.
  call void @llpc.output.export.builtin.PointSize.i32.i32.f32(i32 1, i32 0, float undef) #0
  call void @llpc.output.export.builtin.ClipDistance.i32.a1f32(i32 3, [1 x float] undef) #0
  call void @llpc.output.export.builtin.CullDistance.i32.a1f32(i32 4, [1 x float] undef) #0
  ; outColor: generic output export; first operand 0 matches `location = 0`
  call void @llpc.output.export.generic.i32.i32.v3f32(i32 0, i32 0, <3 x float> %7) #0
  ret void
}

Basic FS I/O

#version 450
// Basic fragment shader: outputs the incoming color with alpha set to 1.0.

precision highp float;
// One input and one output, both at location 0.
layout (location = 0) in vec3 inColor;
layout (location = 0) out vec4 fragColor;

void main()
{
  // Widen the RGB input to RGBA with a constant alpha of 1.0.
  fragColor = vec4(inColor, 1.0);
}
; LLPC IR for the basic fragment shader: imports the color input, widens it
; to four components with a constant 1.0 in the last lane, and exports it.
define spir_func void @llpc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !6 !llpc.shaderstage !6 {
.entry:
  ; inColor: generic input import; first operand 0 matches `location = 0`.
  ; This import takes four i32 operands, unlike the two-operand form in the
  ; vertex-shader dump; the meaning of the extra operands is not shown here.
  %0 = call <3 x float> @llpc.input.import.generic.v3f32.i32.i32.i32.i32(i32 0, i32 0, i32 0, i32 1) #0
  ; vec3 -> vec4: extract r/g/b and insert them into a <4 x float> whose
  ; fourth lane is already the constant 1.0
  %1 = extractelement <3 x float> %0, i32 0
  %2 = extractelement <3 x float> %0, i32 1
  %3 = extractelement <3 x float> %0, i32 2
  %4 = insertelement <4 x float> <float undef, float undef, float undef, float 1.000000e+00>, float %1, i32 0
  %5 = insertelement <4 x float> %4, float %2, i32 1
  %6 = insertelement <4 x float> %5, float %3, i32 2
  ; fragColor: generic output export; first operand 0 matches `location = 0`
  call void @llpc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> %6) #0
  ret void
}

FS UBO

#version 450
// Fragment shader that outputs a color read from a uniform buffer.

precision highp float;
// Uniform block at descriptor set 0, binding 0, holding a single vec4.
layout (set = 0, binding = 0) uniform UBO
{
  vec4 inColor;
} ubo;
layout (location = 0) out vec4 fragColor;

void main()
{
  // ubo.inColor is already a vec4, so the constructor is a plain copy.
  fragColor = vec4(ubo.inColor);
}
; LLPC IR for the UBO fragment shader: loads the buffer descriptor, turns it
; into an addrspace(7) pointer, reads one <4 x float> and exports it.
define spir_func void @llpc.shader.FS.main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, <2 x float>, <2 x float>, <2 x float>, <3 x float>, <2 x float>, <2 x float>, <2 x float>, float, float, float, float, float, i32, i32, i32, i32) local_unnamed_addr #3 !spirv.ExecutionModel !6 !llpc.shaderstage !6 {
.entry:
  ; descriptor: load the buffer descriptor as <4 x i32>.
  ; NOTE(review): the first two operands (0, 0) presumably are the descriptor
  ; set and binding from `layout (set = 0, binding = 0)` — confirm.
  %20 = call <4 x i32> @llpc.descriptor.load.buffer(i32 0, i32 0, i32 0, i1 false) #0
  ; pointer: convert the descriptor into an i8 addrspace(7) pointer, mark the
  ; memory behind it invariant (size operand -1), then retype the pointer to
  ; <4 x float>
  %21 = call i8 addrspace(7)* @llpc.late.launder.fat.pointer(<4 x i32> %20) #2
  %22 = call {}* @llvm.invariant.start.p7i8(i64 -1, i8 addrspace(7)* %21)
  %23 = bitcast i8 addrspace(7)* %21 to <4 x float> addrspace(7)*
  ; buffer: 16-byte-aligned load of ubo.inColor
  %24 = load <4 x float>, <4 x float> addrspace(7)* %23, align 16
  ; fragColor: generic output export; first operand 0 matches `location = 0`
  call void @llpc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> %24) #0
  ret void
}
ytgui commented 5 years ago

CS Image

#version 450
// Compute shader that copies one texel per invocation from in_image to
// out_image at the invocation's global x/y coordinate.
// Local workgroup size of 1 in x (y and z are left unspecified).
layout (local_size_x = 1) in;
// Source image (read-only) at binding 0 and destination image (write-only)
// at binding 1, both declared with the rgba16f format.
layout(binding=0, rgba16f) readonly uniform image2D in_image;
layout(binding=1, rgba16f) writeonly uniform image2D out_image;
void main(void)
{
  // Texel coordinate: x/y of the global invocation ID converted to ivec2.
  ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
  // Load the texel from the source and store it at the same coordinate.
  imageStore(out_image, pos, imageLoad(in_image, pos));
}
; LLPC IR for the image-copy compute shader: imports the global invocation
; ID, keeps its x/y lanes as the texel coordinate, reads a texel through one
; resource descriptor and writes it through another.
define spir_func void @llpc.shader.CS.main() local_unnamed_addr #0 !spirv.ExecutionModel !9 !llpc.shaderstage !9 {
.entry:
  ; index: import gl_GlobalInvocationID as <3 x i32>, then keep lanes 0 and 1
  ; (the .xy swizzle).
  ; NOTE(review): the operand 28 is presumably the SPIR-V BuiltIn enum value
  ; for GlobalInvocationId — confirm.
  %0 = call <3 x i32> @llpc.input.import.builtin.GlobalInvocationId.v3i32.i32(i32 28) #0
  %1 = shufflevector <3 x i32> %0, <3 x i32> undef, <2 x i32> <i32 0, i32 1>
  ; imageLoad: load the <8 x i32> resource descriptor (second operand 0
  ; matches `binding=0` of in_image), then read the texel at %1
  %2 = call <8 x i32> @llpc.descriptor.load.resource(i32 0, i32 0, i32 0, i1 false) #0
  %3 = call <4 x float> @llpc.image.read.f32.2D.dimaware(<8 x i32> %2, <2 x i32> %1, i32 0, i32 69) #0
  ; imageStore: load the resource descriptor (second operand 1 matches
  ; `binding=1` of out_image), then write %3 at the same coordinate.
  ; The trailing i32 operands of the read/write calls (69, 8262) are not
  ; explained in this dump.
  %4 = call <8 x i32> @llpc.descriptor.load.resource(i32 0, i32 1, i32 0, i1 false) #0
  call void @llpc.image.write.f32.2D.dimaware(<8 x i32> %4, <2 x i32> %1, <4 x float> %3, i32 0, i32 8262) #0
  ret void
}
ytgui commented 5 years ago
  1. reserved
ytgui commented 5 years ago

GPU 与 LLVM

Graphics Pipeline

Shader Memory Model (VS FS IR)

GPGPU

CUDA Memory Model (CS IR)

Intrinsic

优化点:寄存器分配、Texture 优化、不要使用过于偏门的 builtin、行优先列优先与透视变换

OpenGPU