Open Alexander-Barth opened 6 hours ago
Hi. That is because the regular `floor` (with conversion to an integer type) performs an exactness check on the conversion, and if the check fails, it throws an `InexactError`, boxing the original value, which launches a malloc hostcall.
To use a more GPU-friendly function, you can use `floor` without conversion, followed by `unsafe_trunc`:
julia> @code_llvm unsafe_trunc(Int, floor(1f0))
; Function Signature: unsafe_trunc(Type{Int64}, Float32)
; @ float.jl:416 within `unsafe_trunc`
define i64 @julia_unsafe_trunc_836(float %"x::Float32") #0 {
top:
%0 = fptosi float %"x::Float32" to i64
%1 = freeze i64 %0
ret i64 %1
}
You can also compare it with the original to see how much less work it does:
julia> @code_llvm floor(Int, 1f0)
; Function Signature: floor(Type{Int64}, Float32)
; @ rounding.jl:475 within `floor`
define i64 @julia_floor_794(float %"x::Float32") #0 {
top:
%jlcallframe1 = alloca [3 x ptr], align 8
%gcframe2 = alloca [4 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe2, i8 0, i64 32, i1 true)
%thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #9
%tls_ppgcstack = getelementptr i8, ptr %thread_ptr, i64 -8
%tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
store i64 8, ptr %gcframe2, align 16
%frame.prev = getelementptr inbounds ptr, ptr %gcframe2, i64 1
%task.gcstack = load ptr, ptr %tls_pgcstack, align 8
store ptr %task.gcstack, ptr %frame.prev, align 8
store ptr %gcframe2, ptr %tls_pgcstack, align 8
; ┌ @ rounding.jl:479 within `round` @ float.jl:463
%0 = call float @llvm.floor.f32(float %"x::Float32")
; │ @ rounding.jl:479 within `round`
; │┌ @ rounding.jl:480 within `_round_convert`
; ││┌ @ number.jl:7 within `convert`
; │││┌ @ float.jl:991 within `Int64`
; ││││┌ @ float.jl:619 within `<=`
%1 = fcmp ult float %0, 0xC3E0000000000000
; ││││└
%2 = fcmp uge float %0, 0x43E0000000000000
%narrow.not = or i1 %1, %2
%3 = fsub float %0, %0
%4 = fcmp une float %3, 0.000000e+00
%or.cond = or i1 %narrow.not, %4
br i1 %or.cond, label %L17, label %L15
L15: ; preds = %top
; ││││ @ float.jl:992 within `Int64`
; ││││┌ @ float.jl:416 within `unsafe_trunc`
%5 = fptosi float %0 to i64
%6 = freeze i64 %5
%frame.prev9 = load ptr, ptr %frame.prev, align 8
store ptr %frame.prev9, ptr %tls_pgcstack, align 8
; ││││└
ret i64 %6
L17: ; preds = %top
; ││││ @ float.jl:994 within `Int64`
%7 = load ptr, ptr getelementptr (i8, ptr @jl_small_typeof, i64 256), align 8
%gc_slot_addr_1 = getelementptr inbounds ptr, ptr %gcframe2, i64 3
store ptr %7, ptr %gc_slot_addr_1, align 8
%box_Float32 = call ptr @ijl_box_float32(float %0)
%gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe2, i64 2
store ptr %box_Float32, ptr %gc_slot_addr_0, align 16
store ptr @"jl_sym#Int64#807.jit", ptr %jlcallframe1, align 8
%8 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 1
store ptr %7, ptr %8, align 8
%9 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 2
store ptr %box_Float32, ptr %9, align 8
%10 = call nonnull ptr @j1_InexactError_805(ptr nonnull @"+Core.InexactError#806.jit", ptr nonnull %jlcallframe1, i32 3)
call void @ijl_throw(ptr nonnull %10)
unreachable
; └└└└
}
And the reason it works on CUDA is that CUDA has a malloc intrinsic for that.
I am trying to port CUDA code to AMDGPU. A lot of things already work, but I have a problem with the `floor` function, which seems to trigger a host call. I guess the warning means that `floor` is not implemented for AMDGPUs and that the CPU version is used instead?
The Julia code:
The output:
I use AMDGPU v1.1.2 on julia 1.11.1. Is there some information about how to implement this function?
It seems that there is a floating-point version (single and double precision) of the floor function defined in ROCm: https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/reference/kernel_language.html
And `floor(Float32,x)` does seem to work. However, in my case I would need an integer, as I will use it as an index into an array.
In any case, thanks a lot for this great package!