EnzymeAD / Enzyme.jl

Julia bindings for the Enzyme automatic differentiator
https://enzyme.mit.edu
MIT License
439 stars 62 forks source link

dot product on Vector{Float32} fails #495

Closed freddycct closed 1 year ago

freddycct commented 1 year ago

Julia Version 1.8.1 (2022-09-06) Enzyme v0.10.6 OSX Apple Silicon

Note that this works; it is basically a dot product of x with itself:

xx = rand(Float32, 10)
grads = zeros(Float32, size(xx))
autodiff(Reverse, (y) -> mapreduce(x -> x*x, +, y), Duplicated(xx, grads))
@assert xx .* 2 == grads

This works too

xx = rand(Float32, 10)
grads = zeros(Float32, size(xx))
autodiff(Reverse, (x) -> sum(x .* x), Duplicated(xx, grads))
@assert xx .* 2 == grads

This doesn’t work on my computer. It is also a dot product, similar to the above but written more concisely.

xx = rand(Float32, 10)
grads = zeros(Float32, size(xx))
autodiff(Reverse, (x) -> x' * x, Duplicated(xx, grads))

However, changing Float32 to Float64 will work.

Log on Pluto:

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

task switch not allowed from inside staged nor pure functions

try_yieldto(::typeof(Base.ensure_rescheduled))@task.jl:861
wait()@task.jl:931
uv_write(::Base.PipeEndpoint, ::Ptr{UInt8}, ::UInt64)@stream.jl:1043
unsafe_write(::Base.PipeEndpoint, ::Ptr{UInt8}, ::UInt64)@stream.jl:1115
write@io.jl:244[inlined]
print@io.jl:246[inlined]
print(::Base.PipeEndpoint, ::String, ::String)@io.jl:46
println@io.jl:75[inlined]
var"#handle_message#101"(::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, ::typeof(Base.CoreLogging.handle_message), ::Main.PlutoRunner.PlutoLogger, ::Base.CoreLogging.LogLevel, ::String, ::Module, ::Symbol, ::Symbol, ::String, ::Int64)@PlutoRunner.jl:2222
handle_message(::Main.PlutoRunner.PlutoLogger, ::Base.CoreLogging.LogLevel, ::String, ::Module, ::Symbol, ::Symbol, ::String, ::Int64)@PlutoRunner.jl:2197
macro expansion@logging.jl:330[inlined]
macro expansion@utils.jl:43[inlined]
var"#codegen#132"(::Bool, ::Bool, ::Bool, ::LLVM.Context, ::Bool, ::Bool, ::Bool, ::Nothing, ::typeof(GPUCompiler.codegen), ::Symbol, ::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{var"#9#10"{typeof(*)}, Tuple{Vector{Float32}}}})@compiler.jl:5472
_thunk@compiler.jl:6170[inlined]
_thunk(::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{var"#9#10"{typeof(*)}, Tuple{Vector{Float32}}}})@compiler.jl:6164
cached_compilation(::GPUCompiler.CompilerJob, ::UInt64, ::UInt64)@compiler.jl:6208
#s766#159@compiler.jl:6268[inlined]
var"#s766#159"(::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Type, ::Any, ::Any, ::Type, ::Any, ::Type, ::Type, ::Type, ::Type, ::Any)@none:0
(::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any})@boot.jl:582
thunk@compiler.jl:6301[inlined]
thunk@compiler.jl:6294[inlined]
autodiff@Enzyme.jl:311[inlined]
autodiff(::Enzyme.ReverseMode, ::var"#9#10"{typeof(*)}, ::Enzyme.Duplicated{Vector{Float32}})@Enzyme.jl:348
top-level scope@Local: 4[inlined]

On REPL:

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

┌ Warning: Using fallback BLAS replacements, performance may be degraded
└ @ Enzyme.Compiler ~/.julia/packages/GPUCompiler/07qaN/src/utils.jl:35
warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

┌ Warning: Using fallback BLAS replacements, performance may be degraded
└ @ Enzyme.Compiler ~/.julia/packages/GPUCompiler/07qaN/src/utils.jl:35
warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

warning: Linking two modules of different target triples: 'bcloader' is 'arm64-apple-macosx11.0.0' whereas 'text' is 'arm64-apple-darwin21.6.0'

┌ Warning: Using fallback BLAS replacements, performance may be degraded
└ @ Enzyme.Compiler ~/.julia/packages/GPUCompiler/07qaN/src/utils.jl:35
ERROR: Enzyme compilation failed due to illegal type analysis.
Current scope:
; Function Attrs: alwaysinline argmemonly mustprogress nofree nosync nounwind readonly ssp uwtable willreturn
define internal double @preprocess_sdot_64_(i64* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) %0, float* nocapture nofree readonly %1, i64* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) %2, float* nocapture nofree readonly %3, i64* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) %4) unnamed_addr #13 {
  %6 = load i64, i64* %0, align 8, !tbaa !81
  %7 = trunc i64 %6 to i32
  %8 = load i64, i64* %2, align 8, !tbaa !81
  %9 = trunc i64 %8 to i32
  %10 = load i64, i64* %4, align 8, !tbaa !81
  %11 = trunc i64 %10 to i32
  %12 = sub i32 1, %7
  %13 = icmp sgt i32 %7, 0
  br i1 %13, label %14, label %cblas_sdot.exit

14:                                               ; preds = %5
  %15 = icmp sgt i32 %11, 0
  %16 = mul i32 %12, %11
  %17 = select i1 %15, i32 0, i32 %16
  %18 = icmp sgt i32 %9, 0
  %19 = mul i32 %12, %9
  %20 = select i1 %18, i32 0, i32 %19
  %21 = sext i32 %17 to i64
  %sext = shl i64 %10, 32
  %22 = ashr exact i64 %sext, 32
  %23 = sext i32 %20 to i64
  %sext1 = shl i64 %8, 32
  %24 = ashr exact i64 %sext1, 32
  %25 = sext i32 %9 to i64
  %26 = sext i32 %11 to i64
  br label %27

27:                                               ; preds = %27, %14
  %iv = phi i64 [ %iv.next, %27 ], [ 0, %14 ]
  %28 = phi float [ 0.000000e+00, %14 ], [ %39, %27 ]
  %29 = trunc i64 %iv to i32
  %iv.next = add nuw nsw i64 %iv, 1
  %30 = mul i64 %26, %iv
  %31 = add i64 %21, %30
  %32 = mul i64 %25, %iv
  %33 = add i64 %23, %32
  %34 = getelementptr inbounds float, float* %1, i64 %33
  %35 = load float, float* %34, align 4, !tbaa !85
  %36 = getelementptr inbounds float, float* %3, i64 %31
  %37 = load float, float* %36, align 4, !tbaa !85
  %38 = fmul float %35, %37
  %39 = fadd float %28, %38
  %40 = add i64 %33, %24
  %41 = add i64 %31, %22
  %42 = add nuw nsw i32 %29, 1
  %43 = icmp eq i32 %42, %7
  br i1 %43, label %cblas_sdot.exit.loopexit, label %27, !llvm.loop !87

cblas_sdot.exit.loopexit:                         ; preds = %27
  %phi.cast = fpext float %39 to double
  br label %cblas_sdot.exit

cblas_sdot.exit:                                  ; preds = %cblas_sdot.exit.loopexit, %5
  %44 = phi double [ 0.000000e+00, %5 ], [ %phi.cast, %cblas_sdot.exit.loopexit ]
  ret double %44
}

 Type analysis state:
<analysis>
  %36 = getelementptr inbounds float, float* %3, i64 %31: {[-1]:Pointer, [-1,-1]:Float@float}, intvals: {}
  %34 = getelementptr inbounds float, float* %1, i64 %33: {[-1]:Pointer, [-1,-1]:Float@float}, intvals: {}
  %17 = select i1 %15, i32 0, i32 %16: {[-1]:Integer}, intvals: {}
  %20 = select i1 %18, i32 0, i32 %19: {[-1]:Integer}, intvals: {}
i64* %0: {[-1]:Pointer, [-1,-1]:Integer}, intvals: {}
float* %1: {[-1]:Pointer, [-1,-1]:Float@float}, intvals: {}
i64* %2: {[-1]:Pointer, [-1,-1]:Integer}, intvals: {}
float* %3: {[-1]:Pointer, [-1,-1]:Float@float}, intvals: {}
i64* %4: {[-1]:Pointer, [-1,-1]:Integer}, intvals: {}
  %9 = trunc i64 %8 to i32: {[-1]:Integer}, intvals: {}
  %23 = sext i32 %20 to i64: {[-1]:Integer}, intvals: {}
  %35 = load float, float* %34, align 4, !tbaa !20: {[-1]:Float@float}, intvals: {}
  %37 = load float, float* %36, align 4, !tbaa !20: {[-1]:Float@float}, intvals: {}
  %11 = trunc i64 %10 to i32: {[-1]:Integer}, intvals: {}
  %10 = load i64, i64* %4, align 8, !tbaa !16: {[-1]:Integer}, intvals: {}
  %6 = load i64, i64* %0, align 8, !tbaa !16: {[-1]:Integer}, intvals: {}
  %8 = load i64, i64* %2, align 8, !tbaa !16: {[-1]:Integer}, intvals: {}
  %7 = trunc i64 %6 to i32: {[-1]:Integer}, intvals: {}
  %phi.cast = fpext float %39 to double: {[-1]:Float@double}, intvals: {}
  %29 = trunc i64 %iv to i32: {[-1]:Integer}, intvals: {0,}
  %26 = sext i32 %11 to i64: {[-1]:Integer}, intvals: {}
  %25 = sext i32 %9 to i64: {[-1]:Integer}, intvals: {}
  %21 = sext i32 %17 to i64: {[-1]:Integer}, intvals: {}
  %28 = phi float [ 0.000000e+00, %14 ], [ %39, %27 ]: {[-1]:Float@float}, intvals: {}
  %44 = phi double [ 0.000000e+00, %5 ], [ %phi.cast, %cblas_sdot.exit.loopexit ]: {[-1]:Float@float}, intvals: {}
  %iv = phi i64 [ %iv.next, %27 ], [ 0, %14 ]: {[-1]:Integer}, intvals: {0,}
  %13 = icmp sgt i32 %7, 0: {[-1]:Integer}, intvals: {}
  %19 = mul i32 %12, %9: {[-1]:Integer}, intvals: {}
  %18 = icmp sgt i32 %9, 0: {[-1]:Integer}, intvals: {}
  %sext = shl i64 %10, 32: {[-1]:Integer}, intvals: {}
  %33 = add i64 %23, %32: {[-1]:Integer}, intvals: {}
  %iv.next = add nuw nsw i64 %iv, 1: {[-1]:Integer}, intvals: {1,}
  %24 = ashr exact i64 %sext1, 32: {[-1]:Integer}, intvals: {}
  %38 = fmul float %35, %37: {[-1]:Float@float}, intvals: {}
  %sext1 = shl i64 %8, 32: {[-1]:Integer}, intvals: {}
  %31 = add i64 %21, %30: {[-1]:Integer}, intvals: {}
  %42 = add nuw nsw i32 %29, 1: {[-1]:Integer}, intvals: {1,}
  %16 = mul i32 %12, %11: {[-1]:Integer}, intvals: {}
  %43 = icmp eq i32 %42, %7: {[-1]:Integer}, intvals: {}
  %15 = icmp sgt i32 %11, 0: {[-1]:Integer}, intvals: {}
  %41 = add i64 %31, %22: {[-1]:Integer}, intvals: {}
  %30 = mul i64 %26, %iv: {[-1]:Integer}, intvals: {0,}
  %39 = fadd float %28, %38: {[-1]:Float@float}, intvals: {}
  %32 = mul i64 %25, %iv: {[-1]:Integer}, intvals: {0,}
  %22 = ashr exact i64 %sext, 32: {[-1]:Integer}, intvals: {}
  %40 = add i64 %33, %24: {[-1]:Integer}, intvals: {}
  %12 = sub i32 1, %7: {[-1]:Integer}, intvals: {}
float 0.000000e+00: {[-1]:Anything}, intvals: {}
i64 32: {[-1]:Integer}, intvals: {32,}
i64 0: {[-1]:Anything}, intvals: {0,}
i64 1: {[-1]:Integer}, intvals: {1,}
i32 1: {[-1]:Integer}, intvals: {1,}
i32 0: {[-1]:Anything}, intvals: {0,}
</analysis>

Illegal updateAnalysis prev:{[-1]:Float@double} new: {[-1]:Float@float}
val:   %phi.cast = fpext float %39 to double origin=  %44 = phi double [ 0.000000e+00, %5 ], [ %phi.cast, %cblas_sdot.exit.loopexit ]

Caused by:

Stacktrace:
  [1] julia_error(cstr::Cstring, val::Ptr{LLVM.API.LLVMOpaqueValue}, errtype::Enzyme.API.ErrorType, data::Ptr{Nothing})
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:3373
  [2] EnzymeCreatePrimalAndGradient(logic::Enzyme.Logic, todiff::LLVM.Function, retType::Enzyme.API.CDIFFE_TYPE, constant_args::Vector{Enzyme.API.CDIFFE_TYPE}, TA::Enzyme.TypeAnalysis, returnValue::Bool, dretUsed::Bool, mode::Enzyme.API.CDerivativeMode, width::Int64, additionalArg::Ptr{Nothing}, typeInfo::Enzyme.FnTypeInfo, uncacheable_args::Vector{Bool}, augmented::Ptr{Nothing}, atomicAdd::Bool)
    @ Enzyme.API ~/.julia/packages/Enzyme/3t61Z/src/api.jl:118
  [3] enzyme!(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{var"#1#2", Tuple{Vector{Float32}}}}, mod::LLVM.Module, primalf::LLVM.Function, adjoint::GPUCompiler.FunctionSpec{var"#1#2", Tuple{Duplicated{Vector{Float32}}}}, mode::Enzyme.API.CDerivativeMode, width::Int64, parallel::Bool, actualRetType::Type, dupClosure::Bool, wrap::Bool, modifiedBetween::Bool, returnPrimal::Bool, jlrules::Vector{String})
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:4611
  [4] codegen(output::Symbol, job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{var"#1#2", Tuple{Vector{Float32}}}}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, ctx::LLVM.Context, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:5703
  [5] _thunk
    @ ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:6170 [inlined]
  [6] _thunk(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{var"#1#2", Tuple{Vector{Float32}}}})
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:6164
  [7] cached_compilation(job::GPUCompiler.CompilerJob, key::UInt64, specid::UInt64)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:6208
  [8] #s766#159
    @ ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:6268 [inlined]
  [9] var"#s766#159"(F::Any, Fn::Any, DF::Any, A::Any, TT::Any, Mode::Any, ModifiedBetween::Any, width::Any, specid::Any, ReturnPrimal::Any, ::Any, #unused#::Type, f::Any, df::Any, #unused#::Type, tt::Any, #unused#::Type, #unused#::Type, #unused#::Type, #unused#::Type, #unused#::Any)
    @ Enzyme.Compiler ./none:0
 [10] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any})
    @ Core ./boot.jl:582
 [11] thunk
    @ ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:6301 [inlined]
 [12] thunk (repeats 2 times)
    @ ~/.julia/packages/Enzyme/3t61Z/src/compiler.jl:6294 [inlined]
 [13] autodiff
    @ ~/.julia/packages/Enzyme/3t61Z/src/Enzyme.jl:311 [inlined]
 [14] autodiff(mode::Enzyme.ReverseMode, f::var"#1#2", args::Duplicated{Vector{Float32}})
    @ Enzyme ~/.julia/packages/Enzyme/3t61Z/src/Enzyme.jl:348
 [15] top-level scope
    @ REPL[4]:1
wsmoses commented 1 year ago

Looks like a bug in clapack. Fixed here: https://github.com/EnzymeAD/Enzyme/pull/886 You'll have to wait for a jll bump, though.

freddycct commented 1 year ago

This problem also occurs on x86 machines. Flux and Lux use Float32 vectors by default, so it will be a blocker until we have this fix. Thanks for the quick fix, though.