Open polytypic opened 1 month ago
I was able to reproduce the Undefined
exception with the following:
let () =
let barrier = Atomic.make 2 in
let parse () =
Atomic.decr barrier;
while Atomic.get barrier <> 0 do
Domain.cpu_relax ()
done;
Uri.of_string "http://localhost:8080/" |> ignore
in
let d1 = Domain.spawn parse and d2 = Domain.spawn parse in
Domain.join d1 |> ignore;
Domain.join d2 |> ignore;
()
NOTE: It may take a few retries!
EDIT: I updated the above to use a different URI.
Interestingly Uri-re isn't safe either:
module Uri = Uri_legacy
let () =
Printexc.record_backtrace true;
let barrier = Atomic.make 2 in
let parse () =
Printexc.record_backtrace true;
Atomic.decr barrier;
while Atomic.get barrier <> 0 do
Domain.cpu_relax ()
done;
try
Uri.of_string "http://[::1]:8080" |> ignore
with e ->
(* Domain backtraces are broken on 5.2 *)
let bt = Printexc.get_raw_backtrace () in
Printexc.print_raw_backtrace stderr bt;
Printexc.to_string e |> prerr_endline;
Printexc.raise_with_backtrace e bt
in
let d1 = Domain.spawn parse and d2 = Domain.spawn parse in
Domain.join d1 |> ignore;
Domain.join d2 |> ignore;
()
fails after a couple of tries with:
Raised at Stdlib.invalid_arg in file "stdlib.ml", line 30, characters 20-45
Called from Stdlib__String.sub in file "string.ml" (inlined), line 43, characters 2-23
Called from Re__Group.get in file "lib/group.ml", line 20, characters 2-29
Called from Uri_legacy.of_string.get_opt in file "lib_re/uri_legacy.ml", line 708, characters 33-51
Called from Uri_legacy.of_string in file "lib_re/uri_legacy.ml", line 713, characters 15-29
Called from Dune__exe__Lazybug.parse in file "lazybug.ml", line 12, characters 6-39
Invalid_argument("String.sub / Bytes.sub")
Fatal error: exception Invalid_argument("String.sub / Bytes.sub")
Raised at Stdlib__Domain.join in file "domain.ml", line 296, characters 16-24
Called from Dune__exe__Lazybug in file "lazybug.ml", line 21, characters 2-16
Uri_legacy uses Lazy too internally :(
Here is a slightly tweaked testcase that crashes on OCaml 4.14.2 too (with a segfault too sometimes :( ):
let check _ =
Thread.yield ();
None
let () =
let tracker = Gc.Memprof.{null_tracker with alloc_minor = check; alloc_major = check } in
Gc.Memprof.start ~sampling_rate:1e-1 ~callstack_size:0 tracker;
let n = 100 in
let barrier = Atomic.make n in
let parse _ =
Atomic.decr barrier;
while Atomic.get barrier <> 0 do
Thread.yield ()
done;
for _ = 1 to 100 do
Sys.opaque_identity (Uri.of_string "http://[::1]:8080") |> ignore
done
in
let threads = Array.init n (Thread.create parse) in
Array.iter Thread.join threads
dune exec ./lazybug.exe
Thread 91 killed on uncaught exception CamlinternalLazy.Undefined
Thread 88 killed on uncaught exception CamlinternalLazy.Undefined
Thread 86 killed on uncaught exception CamlinternalLazy.Undefined
Thread 83 killed on uncaught exception CamlinternalLazy.Undefined
Thread 78 killed on uncaught exception CamlinternalLazy.Undefined
Thread 57 killed on uncaught exception CamlinternalLazy.Undefined
[3] 233673 segmentation fault (core dumped) dune exec ./lazybug.exe
FWIW I couldn't get the equivalent uri-re code to crash/fail on OCaml 4 yet, although that doesn't mean it is thread safe there.
Potential workaround is to do this in Angstrom:
diff --git a/lib/angstrom.ml b/lib/angstrom.ml
index e546fbc63856..286992f4e870 100644
--- a/lib/angstrom.ml
+++ b/lib/angstrom.ml
@@ -459,6 +459,7 @@ let fix_direct f =
and r = { run = fun buf pos more fail succ ->
(Lazy.force p).run buf pos more fail succ }
in
+ (Sys.opaque_identity (Lazy.force p) |> ignore);
r
Or this:
diff --git a/lib/angstrom.ml b/lib/angstrom.ml
index e546fbc63856..304d3f3014de 100644
--- a/lib/angstrom.ml
+++ b/lib/angstrom.ml
@@ -455,10 +455,11 @@ let choice ?(failure_msg="no more choices") ps =
List.fold_right (<|>) ps (fail failure_msg)
let fix_direct f =
- let rec p = lazy (f r)
- and r = { run = fun buf pos more fail succ ->
- (Lazy.force p).run buf pos more fail succ }
+ let p = ref None in
+ let r = { run = fun buf pos more fail succ ->
+ (Option.get !p).run buf pos more fail succ }
in
+ p := Some (f r);
r
There might be other ways of fixing it too
It's somewhat worrying to see that test case segfaulting. Do you have a backtrace for that, @edwintorok?
It's somewhat worrying to see that test case segfaulting. Do you have a backtrace for that, @edwintorok?
OCaml 4.14.2, uri 4.4.0 and angstrom 0.16.0
$ ocamlfind ocamlopt lazybug.ml -g -o lazybug.exe -package uri,threads.posix -thread -linkpkg
$ ulimit -c unlimited
$ ./lazybug.exe
$ gdb -c core
(gdb) bt
#0 0x0000000000447ed9 in caml_apply5 ()
#1 0x0000000000458155 in camlAngstrom__fun_2531 () at lib/angstrom.ml:615
#2 0x0000000000000001 in ?? ()
#3 0x00007f38fbdfff70 in ?? ()
#4 0x0000000000458155 in camlAngstrom__fun_2531 () at lib/angstrom.ml:615
#5 0x0000000000448434 in camlUri__compare_opt_377 () at lib/uri.ml:65
#6 0x000000000000001d in ?? ()
#7 0x00007f38a8000b70 in ?? ()
#8 0x00007f38fbdffd50 in ?? ()
#9 0x00000000000038c0 in ?? ()
#10 0x00007f38a8000030 in ?? ()
#11 0x00000000004c8e0d in caml_start_program ()
#12 0x9e612da82678d500 in ?? ()
#13 0x00007f38fbe00cdc in ?? ()
#14 0x00007fff786e2200 in ?? ()
#15 0x000000000000001d in ?? ()
#16 0xffffffffffffff88 in ?? ()
#17 0x00007f38fbe006c0 in ?? ()
#18 0x00007f38fbdfff70 in ?? ()
#19 0x00007f399a7f5180 in ?? ()
#20 0x00000000004bf3bd in default_fatal_uncaught_exception (exn=1) at printexc.c:130
#21 caml_fatal_uncaught_exception (exn=1) at printexc.c:155
#22 0x00000000018e5140 in ?? ()
#23 0x0000000000000000 in ?? ()
(gdb) disass 0x0000000000447ed9
Dump of assembler code for function caml_apply5:
0x0000000000447ea0 <+0>: sub $0x28,%rsp
0x0000000000447ea4 <+4>: mov 0x8(%rcx),%r8
0x0000000000447ea8 <+8>: sar $0x38,%r8
0x0000000000447eac <+12>: cmp $0x5,%r8
0x0000000000447eb0 <+16>: jne 0x447ec0 <caml_apply5+32>
0x0000000000447eb2 <+18>: mov 0x10(%rcx),%r8
0x0000000000447eb6 <+22>: add $0x28,%rsp
0x0000000000447eba <+26>: jmp *%r8
0x0000000000447ebd <+29>: nopl (%rax)
0x0000000000447ec0 <+32>: mov %rdx,0x18(%rsp)
0x0000000000447ec5 <+37>: mov %rsi,0x10(%rsp)
0x0000000000447eca <+42>: mov %rdi,0x8(%rsp)
0x0000000000447ecf <+47>: mov %rbx,(%rsp)
0x0000000000447ed3 <+51>: mov (%rcx),%rdi
0x0000000000447ed6 <+54>: mov %rcx,%rbx
=> 0x0000000000447ed9 <+57>: call *%rdi
0x0000000000447edb <+59>: mov %rax,%rbx
0x0000000000447ede <+62>: mov (%rbx),%rdi
0x0000000000447ee1 <+65>: mov (%rsp),%rax
0x0000000000447ee5 <+69>: call *%rdi
0x0000000000447ee7 <+71>: mov %rax,%rbx
0x0000000000447eea <+74>: mov (%rbx),%rdi
0x0000000000447eed <+77>: mov 0x8(%rsp),%rax
0x0000000000447ef2 <+82>: call *%rdi
0x0000000000447ef4 <+84>: mov %rax,%rbx
0x0000000000447ef7 <+87>: mov (%rbx),%rdi
0x0000000000447efa <+90>: mov 0x10(%rsp),%rax
0x0000000000447eff <+95>: call *%rdi
0x0000000000447f01 <+97>: mov %rax,%rbx
0x0000000000447f04 <+100>: mov (%rbx),%rdi
0x0000000000447f07 <+103>: mov 0x18(%rsp),%rax
0x0000000000447f0c <+108>: add $0x28,%rsp
0x0000000000447f10 <+112>: jmp *%rdi
End of assembler dump.
(gdb) print rdi
No symbol "rdi" in current context.
(gdb) info registers
rax 0x7ffff7be6fe8 140737349840872
rbx 0x466da0 4615584
rcx 0x466da0 4615584
rdx 0x7ffff7b49470 140737349194864
rsi 0x7ffff7b5e830 140737349281840
rdi 0x10438b4808ec8348 1171933469550084936
rbp 0x7ffec59fff70 0x7ffec59fff70
rsp 0x7ffec59ffc50 0x7ffec59ffc50
r8 0x0 0
r9 0x502580 5252480
r10 0x4 4
r11 0x7ffec59ffc10 140732214017040
r12 0x7ffff7b4c9c0 140737349208512
r13 0x7ffff7b4ca48 140737349208648
r14 0x55c2a0 5620384
r15 0x7ffff7b49468 140737349194856
rip 0x447ed9 0x447ed9 <caml_apply5+57>
eflags 0x10293 [ CF AF SF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
fs_base 0x7ffec5a006c0 140732214019776
gs_base 0x0 0
(gdb) print 0x10438b4808ec8348
$1 = 1171933469550084936
(gdb) print *0x10438b4808ec8348
Cannot access memory at address 0x10438b4808ec8348
Disabling compaction doesn't help and the debug runtime doesn't give any particularly useful clues either.
Perhaps we should move debugging this part to an ocaml/ocaml issue on 4.14 if it is not meant to SIGSEGV (which I asume is not, it is meant to raise the exception).
Thank you Edwin for the fixes to Angstrom! Now released into opam repo as 0.16.1
I've looked at the OCaml 5 failure with Uri_re (Uri_re is safe to use on OCaml 4, I wasn't able to find any bugs there).
It looks like the (p2-p1
) in group.ml
sometimes become negative, which points to some potential corruption of the group match positions, but these group matches are supposed to be per 'Re.exec' invocation, and not shared with other threads.
There are a lot of complaints from the thread sanitizer: log.txt
This has been discussed before, although in the context of Re.Str not being thread-safe https://discuss.ocaml.org/t/re-str-is-not-thread-safe/9425/4?u=edwin, and I think we reasonably believed Re itself to be domain-safe though.
Although https://github.com/ocaml/ocaml-re/issues/287#issuecomment-2124229502 claims that Re
is not thread-safe (I thought it was, except for Str, that is the reason we migrated most of our regexes to it...), which means that Uri_re
is probably not thread safe either.
Uri_re
is deprecated anyway, although probably still the safer choice on OCaml <=4.14.2 or Angstrom<0.16.1.
It's about time to remove Uri_re -- it's been deprecated for some years now, and the older version will still be available in opam.
We encountered the following:
Our program runs multiple domains in parallel.
I spent a brief moment to look through the Uri and Angstrom code bases. I strongly suspect the problem is that the
Uri.Parser
module uses the Angstromfix
combinator, which useslazy
internally. The problem with that is thatLazy
is not safe to use concurrently / in parallel — an attempt to do so may raise theUndefined
exception. As a workaround we will try to force the parser before spawning domains, but this should really be fixed properly.