Shader validator error with linear broadcast kernel

maleadt commented 6 months ago

The linear broadcast kernel introduced in https://github.com/JuliaGPU/Metal.jl/pull/304 causes a shader validation error on M1-M3. The metallib is attached, and the failure can be reproduced using the following loader:

#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

// Loader for reproducing Metal compute-pipeline compilation failures.
//
// Usage: loader <library.metallib> <kernel-function-name>
//
// Loads the precompiled Metal library at argv[1], looks up the function named
// by argv[2], and asks the default device to build a compute pipeline state
// for it. Returns 0 when the pipeline state is created, and 1 on any failure
// (missing arguments, no Metal device, library load failure, unknown function
// name, or pipeline compilation error), logging the reason via NSLog.
int main(int argc, const char *argv[]) {
  @autoreleasepool {
    // Boxing a NULL C string with @(...) is invalid, so make sure both
    // positional arguments are present before touching argv[1]/argv[2].
    if (argc < 3) {
      NSLog(@"usage: %s <library.metallib> <function-name>",
            argc > 0 ? argv[0] : "loader");
      return 1;
    }

    // MTLCreateSystemDefaultDevice returns nil when no Metal device exists;
    // report that directly instead of failing later on a nil library.
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    if (device == nil) {
      NSLog(@"No Metal device available");
      return 1;
    }

    NSError *error = nil;

    NSURL *url = [NSURL fileURLWithPath:@(argv[1])];
    id<MTLLibrary> library = [device newLibraryWithURL:url error:&error];
    if (library == nil) {
      // Check the return value (not the error pointer) and surface the
      // NSError, which a bare assert would have discarded.
      NSLog(@"Failed to load library %@: %@", url.path, error);
      return 1;
    }

    NSString *functionName = @(argv[2]);
    id<MTLFunction> function = [library newFunctionWithName:functionName];
    if (function == nil) {
      NSLog(@"Function %@ not found in library; available functions: %@",
            functionName, library.functionNames);
      return 1;
    }

    // Building the pipeline state is the step under test: this is where the
    // backend compiler runs and where the reported failure surfaces.
    id<MTLComputePipelineState> pipeline_state =
        [device newComputePipelineStateWithFunction:function error:&error];
    if (pipeline_state == nil) {
      NSLog(@"%@", error);
      return 1;
    }
  }
  return 0;
}

❯ ./loader.exe original.metallib _Z16broadcast_linear14MtlDeviceArrayI4BoolLi1ELi1EE11BroadcastedI13MtlArrayStyleILi1E39Metal_MTL_MTLResourceStorageModePrivateE5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS_I4Int8Li1ELi1EES3_IS0_ES3_IS5_EES5_EE

❯ MTL_SHADER_VALIDATION=1 ./loader.exe original.metallib _Z16broadcast_linear14MtlDeviceArrayI4BoolLi1ELi1EE11BroadcastedI13MtlArrayStyleILi1E39Metal_MTL_MTLResourceStorageModePrivateE5TupleI5OneToI5Int64EE2__S3_I8ExtrudedIS_I4Int8Li1ELi1EES3_IS0_ES3_IS5_EES5_EE
2024-03-07 11:59:56.338 loader.exe[97724:11930823] Metal GPU Validation Enabled

2024-03-07 11:59:56.522 loader.exe[97724:11930823] Error Domain=AGXMetalG15X_B0 Code=3 "Compiler encountered an internal error" UserInfo={NSLocalizedDescription=Compiler encountered an internal error}

Looking at the crash log (also attached), this is an ISEL failure:

  "termination": {
    "code": 1,
    "flags": 518,
    "namespace": "METAL",
    "reasons": [
      "unable to legalize instruction: %363:_(p0) = 224 %362:_(s64), 1",
      "Context:",
      "%363:_(p0) = 224 %362:_(s64), 1",
      "%362:_(s64) = 120 i64 16",
      "(in function: agc.main)"
    ]
  },

original.zip

I reduced it to the following IR:

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.3.1"

; Reduced reproducer kernel for the legalizer failure quoted above.
; %0 and %1 are addrspace(1) pointers to aggregate structs; the two trailing
; i32 arguments are bound by the kernel metadata (!5, !6) to
; air.threads_per_grid and air.thread_position_in_grid.
; The control flow is an unconditional loop (L19 -> L54 -> L139 -> L19) with
; no ret instruction.
define void @my_kernel({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { { { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, i64 }, [1 x [1 x i64]] } addrspace(1)* %1, i32 %threads_per_grid, i32 %thread_position_in_grid) {
conversion:
  ; Load the first field of the struct behind %0: an i8 pointer in addrspace(1).
  %.elt = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, i64 0, i32 0
  %.unpack = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.elt, align 8
  ; Load the innermost aggregate (field path 0, 0) of the struct behind %1 as a
  ; single first-class value.
  %.unpack7.elt = getelementptr inbounds { { { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, i64 }, [1 x [1 x i64]] }, { { { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, i64 }, [1 x [1 x i64]] } addrspace(1)* %1, i64 0, i32 0, i32 0
  %.unpack7.unpack = load { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } addrspace(1)* %.unpack7.elt, align 8
  ; Two stack slots. Note that %2 is an alloca with an element count of zero.
  ; Neither slot is ever stored to in this function.
  %.sroa.6 = alloca i64, align 8
  %2 = alloca i32, i32 0, align 4
  br label %L19.lr.ph

L19.lr.ph:                                        ; preds = %conversion
  ; Extract the i8 at index (1, 0) of the loaded aggregate and use "is it zero"
  ; to choose between the two stack slots; the chosen i32* is then
  ; reinterpreted as an i64*.
  %.fca.0.0.1.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack7.unpack, 1, 0
  %.not2 = icmp eq i8 %.fca.0.0.1.0.extract, 0
  %.sroa.6.0.sroa_cast = bitcast i64* %.sroa.6 to i32*
  %ifelse_result = select i1 %.not2, i32* %.sroa.6.0.sroa_cast, i32* %2
  %memcpy_refined_src = bitcast i32* %ifelse_result to i64*
  br label %L19

L19:                                              ; preds = %L139, %L19.lr.ph
  ; Index for the store below: thread_position_in_grid on entry,
  ; threads_per_grid on every back edge from L139.
  %value_phi17 = phi i32 [ %thread_position_in_grid, %L19.lr.ph ], [ %threads_per_grid, %L139 ]
  br label %L54

L54:                                              ; preds = %L19
  ; 8-byte load through the selected stack pointer, declared with only
  ; 4-byte alignment.
  %3 = load i64, i64* %memcpy_refined_src, align 4
  br label %L139

L139:                                             ; preds = %L54
  ; Use the loaded i64 as a byte offset from a null addrspace(1) pointer and
  ; read one byte from the resulting address.
  %value_phi15.in = getelementptr inbounds i8, i8 addrspace(1)* null, i64 %3
  %value_phi15 = load i8, i8 addrspace(1)* %value_phi15.in, align 1
  ; Write that byte to element %value_phi17 (sign-extended) of the buffer
  ; whose pointer was loaded from %0.
  %4 = sext i32 %value_phi17 to i64
  %5 = getelementptr inbounds i8, i8 addrspace(1)* %.unpack, i64 %4
  store i8 %value_phi15, i8 addrspace(1)* %5, align 1
  ; Unconditional back edge; there is no exit from this loop.
  br label %L19
}

!air.kernel = !{!0}
!air.version = !{!7}

!0 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { { { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, i64 }, [1 x [1 x i64]] } addrspace(1)*, i32, i32)* @my_kernel, !1, !2}
!1 = !{}
!2 = !{!3, !4, !5, !6}
!3 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Bool, 1}", !"air.arg_name", !"dest"}
!4 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 48, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"Base.Broadcast.Broadcasted{Metal.MtlArrayStyle{1, Metal.MTL.MTLResourceStorageModePrivate}, Tuple{Base.OneTo{Int64}}, typeof(==), Tuple{Base.Broadcast.Extruded{MtlDeviceVector{Int8, 1}, Tuple{Bool}, Tuple{Int64}}, Int64}}", !"air.arg_name", !"bc"}
!5 = !{i32 2, !"air.threads_per_grid", !"air.arg_type_name", !"uint"}
!6 = !{i32 3, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!7 = !{i32 2, i32 6, i32 0}

maleadt commented 6 months ago

Another one that fails similarly:

; NOTE: this single-i64-argument signature is a by-product of test-case
; reduction; the unreduced IR declares the intrinsic as (float, i64).
declare float @air.sincos.f32(i64)

; Second reduced reproducer. %a is a float pointer in addrspace(1), %e is an
; addrspace(1) pointer to a { pointer, [1 x i64] } struct, and %arg2 is bound
; by the kernel metadata (!5) to air.thread_position_in_grid.
define void @my_kernel(float addrspace(1)* %a, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %e, i32 %arg2) {
  ; Reinterpret %e as a pointer-to-pointer and load its first 8 bytes as a
  ; float pointer in addrspace(1).
  %tmp4 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %e to float addrspace(1)* addrspace(1)*
  %tmp5 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %tmp4, align 8
  ; Stack slot whose address is converted to an integer and handed to the
  ; intrinsic call below.
  %b = alloca i32, align 4
  %f = sext i32 %arg2 to i64
  %c = ptrtoint i32* %b to i64
  ; The call's result (%1) is not used by any later instruction.
  %1 = call float @air.sincos.f32(i64 %c)
  ; Store 0.0 to element %f of %a and to the location loaded from %e.
  %d = getelementptr float, float addrspace(1)* %a, i64 %f
  store float 0.000000e+00, float addrspace(1)* %d, align 4
  store float 0.000000e+00, float addrspace(1)* %tmp5, align 4
  ret void
}

!air.kernel = !{!0}
!air.version = !{!6}

!0 = !{void (float addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @my_kernel, !1, !2}
!1 = !{}
!2 = !{!3, !4, !5}
!3 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.arg_type_name", !"arr_sin"}
!4 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, !"air.arg_type_name", !"air.arg_name", !"arr_cos"}
!5 = !{i32 2, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!6 = !{i32 2, i32 6, i32 0}

  "termination": {
    "code": 1,
    "flags": 518,
    "namespace": "METAL",
    "reasons": [
      "unable to legalize instruction: %101:_(p0) = 224 %115:_(s64), 1",
      "Context:",
      "%101:_(p0) = 224 %115:_(s64), 1",
      "%115:_(s64) = 120 i64 16",
      "(in function: agc.main)"
    ]
  },

tgymnich commented 6 months ago

%1 = call float @air.sincos.f32(i64 %c) i64??

maleadt commented 6 months ago

Yeah, ok, that's nonsensical. It's an artifact from the reduction, though. Here's the original IR:

; Intrinsic as declared by the unreduced kernel: a float input plus an i64
; that the caller below fills with the integer value of a stack address.
declare float @air.sincos.f32(float, i64) local_unnamed_addr

; Unreduced sincos kernel. %0 and %1 are addrspace(1) pointers to
; { pointer, [1 x i64] } structs, named "arr_sin" and "arr_cos" by the kernel
; metadata (!44, !45); the i32 argument is bound to
; air.thread_position_in_grid (!46).
define void @my_kernel({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1, i32 %thread_position_in_grid) local_unnamed_addr {
conversion:
  ; Load the first field of each struct, reinterpreted as a float pointer in
  ; addrspace(1).
  %2 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0 to float addrspace(1)* addrspace(1)*
  %.unpack8 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %2, align 8
  %3 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %1 to float addrspace(1)* addrspace(1)*
  %.unpack12 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %3, align 8
  ; 4-byte stack slot; its address is passed to the intrinsic as an integer.
  %4 = alloca i32, align 8
  ; Read the input value: element [thread_position_in_grid] of the second
  ; buffer.
  %5 = sext i32 %thread_position_in_grid to i64
  %6 = getelementptr inbounds float, float addrspace(1)* %.unpack12, i64 %5
  %7 = load float, float addrspace(1)* %6, align 4
  %bitcast_coercion5 = ptrtoint i32* %4 to i64
  ; NOTE(review): presumably the intrinsic writes its second result through
  ; the address passed as i64 -- that is not visible in this IR.
  %8 = call float @air.sincos.f32(float %7, i64 %bitcast_coercion5)
  %9 = bitcast i32* %4 to float*

  ; Read the stack slot back as a float after the call.
  %10 = load float, float* %9, align 8
  ; The call's return value goes to the same index of the first buffer; the
  ; value read from the stack slot overwrites the input element in the second.
  %11 = getelementptr inbounds float, float addrspace(1)* %.unpack8, i64 %5
  store float %8, float addrspace(1)* %11, align 4
  store float %10, float addrspace(1)* %6, align 4
  ret void
}

attributes #0 = { argmemonly nocallback nofree nosync nounwind willreturn }

!air.kernel = !{!41}
!air.version = !{!48}
!air.language_version = !{!49}

!9 = distinct !DICompileUnit(language: DW_LANG_Julia, file: !10, producer: "julia", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
!10 = !DIFile(filename: "julia", directory: ".")
!41 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { i8 addrspace(1)*, [1 x i64] } addrspace(1)*, i32)* @my_kernel, !42, !43}
!42 = !{}
!43 = !{!44, !45, !46}
!44 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Float32, 1}", !"air.arg_name", !"arr_sin"}
!45 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Float32, 1}", !"air.arg_name", !"arr_cos"}
!46 = !{i32 2, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!48 = !{i32 2, i32 5, i32 0}
!49 = !{!"Metal", i32 3, i32 1, i32 0}

  "termination": {
    "code": 1,
    "flags": 518,
    "namespace": "METAL",
    "reasons": [
      "unable to legalize instruction: %259:_(p0) = 224 %258:_(s64), 1",
      "Context:",
      "%259:_(p0) = 224 %258:_(s64), 1",
      "%258:_(s64) = 120 i64 16",
      "(in function: agc.main)"
    ]
  },

EDIT: the sincos intrinsic signature still doesn't look nice here though. It comes from ccall("extern air.sincos.f32", llvmcall, Cfloat, (Cfloat, Ptr{Cfloat}), x, c), which should probably be an LLVMPtr. In any case, I don't think that's the cause of this issue, as changing the signature still reproduces the failure.

JuliaGPU / Metal.jl

Shader validator error with linear broadcast kernel #308