oolong-dev / OpenTelemetry.jl

An unofficial implementation of OpenTelemetry in Julia.
https://oolong.dev/OpenTelemetry.jl/
Apache License 2.0
30 stars 9 forks source link

OpenTelemetry with PythonCall can yield LLVM ERROR: Cannot emit physreg copy instruction #80

Open MartinWolke opened 1 year ago

MartinWolke commented 1 year ago

Hi!

Thank you for writing this package; it's awesome!

I've been experimenting with OpenTelemetry (OpenTelemetry v0.3.0) and stumbled over a weird interaction when combined with the PythonCall (PythonCall v0.9.13) Package. Running the following code reliably crashes Julia (I tried with Julia 1.9.0, Julia 1.9.1, and Julia 1.8.5; all installed with Juliaup) for me.

using PythonCall
using OpenTelemetry

with_span("test") do
    error()
end

The weird thing is that PythonCall isn't even used.

Julia crashes with

LLVM ERROR: Cannot emit physreg copy instruction

[28652] signal (6.-6): Aborted
[...]

Do you have any idea what might cause the issue? I couldn't find anything so far. I'll post a complete stack trace below.

LLVM ERROR: Cannot emit physreg copy instruction

[15626] signal (6.-6): Aborted
in expression starting at /home/martin/.julia/dev/AzureDev/startup.jl:9
gsignal at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
abort at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
_ZN4llvm18report_fatal_errorERKNS_5TwineEb at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm18report_fatal_errorEPKcb at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZNK4llvm12X86InstrInfo11copyPhysRegERNS_17MachineBasicBlockENS_26MachineInstrBundleIteratorINS_12MachineInstrELb0EEERKNS_8DebugLocENS_10MCRegisterES9_b at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN12_GLOBAL__N_112ExpandPostRA20runOnMachineFunctionERN4llvm15MachineFunctionE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm19MachineFunctionPass13runOnFunctionERNS_8FunctionE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm13FPPassManager13runOnFunctionERNS_8FunctionE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm13FPPassManager11runOnModuleERNS_6ModuleE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm6legacy15PassManagerImpl3runERNS_6ModuleE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc14SimpleCompilerclERNS_6ModuleE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
operator() at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jitlayers.cpp:1206
_ZN4llvm3orc14IRCompileLayer4emitESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EENS0_16ThreadSafeModuleE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16IRTransformLayer4emitESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EENS0_16ThreadSafeModuleE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
emit at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jitlayers.cpp:631
_ZN4llvm3orc31BasicIRLayerMaterializationUnit11materializeESt10unique_ptrINS0_29MaterializationResponsibilityESt14default_deleteIS3_EE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc19MaterializationTask3runEv at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm6detail18UniqueFunctionBaseIvJSt10unique_ptrINS_3orc4TaskESt14default_deleteIS4_EEEE8CallImplIPFvS7_EEEvPvRS7_ at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession22dispatchOutstandingMUsEv at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession17OL_completeLookupESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EESt10shared_ptrINS0_23AsynchronousSymbolQueryEESt8functionIFvRKNS_8DenseMapIPNS0_8JITDylibENS_8DenseSetINS0_15SymbolStringPtrENS_12DenseMapInfoISF_vEEEENSG_ISD_vEENS_6detail12DenseMapPairISD_SI_EEEEEE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc25InProgressFullLookupState8completeESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession19OL_applyQueryPhase1ESt10unique_ptrINS0_21InProgressLookupStateESt14default_deleteIS3_EENS_5ErrorE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupENS0_10LookupKindERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS8_EENS0_15SymbolLookupSetENS0_11SymbolStateENS_15unique_functionIFvNS_8ExpectedINS_8DenseMapINS0_15SymbolStringPtrENS_18JITEvaluatedSymbolENS_12DenseMapInfoISI_vEENS_6detail12DenseMapPairISI_SJ_EEEEEEEEESt8functionIFvRKNSH_IS6_NS_8DenseSetISI_SL_EENSK_IS6_vEENSN_IS6_SV_EEEEEE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS7_EERKNS0_15SymbolLookupSetENS0_10LookupKindENS0_11SymbolStateESt8functionIFvRKNS_8DenseMapIS5_NS_8DenseSetINS0_15SymbolStringPtrENS_12DenseMapInfoISK_vEEEENSL_IS5_vEENS_6detail12DenseMapPairIS5_SN_EEEEEE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupERKSt6vectorISt4pairIPNS0_8JITDylibENS0_19JITDylibLookupFlagsEESaIS7_EENS0_15SymbolStringPtrENS0_11SymbolStateE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupENS_8ArrayRefIPNS0_8JITDylibEEENS0_15SymbolStringPtrENS0_11SymbolStateE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
_ZN4llvm3orc16ExecutionSession6lookupENS_8ArrayRefIPNS0_8JITDylibEEENS_9StringRefENS0_11SymbolStateE at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/bin/../lib/julia/libLLVM-14jl.so (unknown line)
addModule at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jitlayers.cpp:1420
jl_add_to_ee at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jitlayers.cpp:1815
_jl_compile_codeinst at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jitlayers.cpp:241
jl_generate_fptr_impl at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jitlayers.cpp:460
jl_compile_method_internal at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2348 [inlined]
jl_compile_method_internal at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2237
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2750 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
#32 at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/trace/tracer_provider.jl:346
task_local_storage at ./task.jl:296
unknown function (ip: 0x7f193597942d)
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
#with_context#6 at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/common/context.jl:31
with_context at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/common/context.jl:31
unknown function (ip: 0x7f1935976de6)
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
#with_context#5 at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/common/context.jl:22
unknown function (ip: 0x7f1935976936)
with_context at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/common/context.jl:22 [inlined]
#with_span#31 at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/trace/tracer_provider.jl:337 [inlined]
with_span at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/trace/tracer_provider.jl:327
unknown function (ip: 0x7f193597397d)
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
with_span at /home/martin/.julia/packages/OpenTelemetryAPI/898mf/src/trace/tracer_provider.jl:327
unknown function (ip: 0x7f1935972a26)
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
jl_apply at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/julia.h:1879 [inlined]
do_call at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/interpreter.c:126
eval_value at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/interpreter.c:226
eval_stmt_value at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/interpreter.c:177 [inlined]
eval_body at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/interpreter.c:624
jl_interpret_toplevel_thunk at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/interpreter.c:762
jl_toplevel_eval_flex at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/toplevel.c:912
jl_toplevel_eval_flex at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/toplevel.c:856
ijl_toplevel_eval_in at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/toplevel.c:971
eval at ./boot.jl:370 [inlined]
include_string at ./loading.jl:1899
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
_include at ./loading.jl:1959
include at ./Base.jl:457
jfptr_include_31033.clone_1 at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
exec_options at ./client.jl:307
_start at ./client.jl:522
jfptr__start_29455.clone_1 at /home/martin/.julia/juliaup/julia-1.9.1+0.x64.linux.gnu/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2758 [inlined]
ijl_apply_generic at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/gf.c:2940
jl_apply at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/julia.h:1879 [inlined]
true_main at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jlapi.c:573
jl_repl_entrypoint at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/src/jlapi.c:717
main at /cache/build/default-amdci4-6/julialang/julia-release-1-dot-9/cli/loader_exe.c:59
__libc_start_main at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
unknown function (ip: 0x401098)
Allocations: 4082098 (Pool: 4079787; Big: 2311); GC: 6
Aborted (core dumped)
findmyway commented 1 year ago

Hi @MartinWolke ,

I can't reproduce this error on my linux environment with Julia@v1.9.0 . This error seems to be found in several issues in the Julia repo:

https://github.com/JuliaLang/julia/issues?q=is%3Aissue+physreg+is%3Aclosed

So I'm tagging @vtjnash here. Maybe Jameson is familiar with such errors.

MartinWolke commented 1 year ago

Hi @findmyway ,

thanks for checking this out! That's very interesting; Maybe it's something particular to the Linux versions or the Python versions that I was using. I'll have a look and try different setups and document my findings here.


I was able to narrow down the point at which Julia crashes:


using OpenTelemetry
using OpenTelemetry.OpenTelemetryAPI
using OpenTelemetry.OpenTelemetrySDK
using PythonCall

tracer = Tracer()
span = OpenTelemetryAPI.create_span("foo", tracer)

# this works
span.status[] = OpenTelemetryAPI.SpanStatus(
    OpenTelemetryAPI.SPAN_STATUS_ERROR, nothing
)

## this works, too
function foo(status)
    return span.status[] = status
end
status = OpenTelemetryAPI.SpanStatus(
    OpenTelemetryAPI.SPAN_STATUS_ERROR, nothing
)
foo(status)

## this crashes Julia
function foo(s, code, description=nothing)
    return s.status[] = OpenTelemetryAPI.SpanStatus(code, description)
end
foo(span, OpenTelemetryAPI.SPAN_STATUS_ERROR)

I have no clue why the last example crashes Julia while the prior ones do not. Tested with:

Julia Version 1.9.1
Commit 147bdf428cd (2023-06-07 08:27 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 32 × Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, icelake-server)
  Threads: 1 on 32 virtual cores

and

Julia Version 1.9.0
Commit 8e630552924 (2023-05-07 11:25 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 2 × Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, broadwell)
  Threads: 1 on 2 virtual cores

With this information, I tried the following code which seems to work for me:

 function my_span_status!(
    span::OpenTelemetrySDK.Span, status::OpenTelemetryAPI.SpanStatus
)
    if OpenTelemetryAPI.is_recording(span)
        if span.status[].code === OpenTelemetryAPI.SPAN_STATUS_OK
            # no further updates
        else
            if code === OpenTelemetryAPI.SPAN_STATUS_UNSET
                # ignore
            else
                span.status[] = status
            end
        end
    else
        @warn "the span is not recording."
    end
end

function OpenTelemetryAPI.with_span(
    f::Function,
    name::String,
    tracer::OpenTelemetryAPI.Tracer=OpenTelemetryAPI.Tracer();
    end_on_exit=true,
    record_exception=true,
    set_status_on_exception=true,
    kw...,
)
    s = OpenTelemetryAPI.create_span(name, tracer; kw...)
    OpenTelemetryAPI.with_context(;
        OpenTelemetryAPI.SPAN_KEY_IN_CONTEXT => s
    ) do
        try
            f()
        catch ex
            if OpenTelemetryAPI.is_recording(s)
                if record_exception
                    push!(s, ex; is_rethrow_followed=true)
                end
                if set_status_on_exception
                    status = OpenTelemetryAPI.SpanStatus(
                        OpenTelemetryAPI.SPAN_STATUS_ERROR, string(ex)
                    )
                    my_span_status!(s, status)
                end
            end
            rethrow(ex)
        finally
            if end_on_exit
                OpenTelemetryAPI.end_span!(s)
            end
        end
    end
end
vtjnash commented 1 year ago

I get an assertion error in codegen on this, so it is like a Julia bug. I see there is a package being used (by PythonCall) called UnsafePointers, and inference is accidentally inferring it might be used on a struct called SpanStatus. The struct SpanStatus does not have a C-compatible definition, so it is not well defined what happens when you use it with unsafe pointer-based operations. But this is in dead-code, so codegen should not crash.

The error could likely also be avoided by removing uses of Ref from this package (those are probably slowing things down anyways over making the struct mutable instead, so it is likely a good QOL change too).