ocaml-community / ocaml-mariadb

OCaml bindings to MariaDB, supporting the nonblocking API
55 stars 18 forks source link

fix memory leak in free_res #39

Closed ygrek closed 3 years ago

ygrek commented 3 years ago

https://dev.mysql.com/doc/c-api/8.0/en/mysql-stmt-result-metadata.html

When you are done with the metadata result set structure, free it by passing it to mysql_free_result().

Repro case

dune (devkit is used for Memory.reclaim) :

(executable
  (name testdb)
  (modes exe)
  (libraries devkit mariadb lwt.unix)
  (modules testdb)
  (preprocess
    (pps lwt_ppx)))

testdb.ml:

open Devkit
module S = Mariadb.Nonblocking.Status

module M = Mariadb.Nonblocking.Make (struct
  module IO = struct
    type 'a future = 'a Lwt.t

    let ( >>= ) = Lwt.bind

    let return = Lwt.return
  end

  let wait mariadb status =
    let fd = Lwt_unix.of_unix_file_descr @@ Mariadb.Nonblocking.fd mariadb in
    assert (S.read status || S.write status || S.timeout status);
    let idle, _ = Lwt.task () in
    let rt = if S.read status then Lwt_unix.wait_read fd else idle in
    let wt = if S.write status then Lwt_unix.wait_write fd else idle in
    let tt =
      match S.timeout status, Mariadb.Nonblocking.timeout mariadb with
      | true, 0 -> Lwt.return ()
      | true, tmout -> Lwt_unix.timeout (float tmout)
      | false, _ -> idle
    in
    Lwt.catch
      (fun () ->
        let%lwt _ = Lwt.nchoose [ rt; wt; tt ] in
        Lwt.return @@ S.create ~read:(Lwt_unix.readable fd) ~write:(Lwt_unix.writable fd) ())
      (function
        | Lwt_unix.Timeout -> Lwt.return @@ S.create ~timeout:true ()
        | e -> Lwt.fail e)
end)

let run () =
  let connect () = M.connect ~host:"127.0.0.1" ~port:3306 ~user:"test" ~pass:"test" ~db:"test" () in
  let or_die x =
    match%lwt x with
    | Error (i, s) ->
      eprintfn "Error %d %s" i s;
      exit 2
    | Ok r -> Lwt.return r
  in
  let%lwt db = connect () |> or_die in
  let query = "SELECT 1" in
  let%lwt () =
    while%lwt true do
      Memory.reclaim ();
      for%lwt i = 0 to 1000 do
        let%lwt stmt = M.prepare db query |> or_die in
        let%lwt () = M.Stmt.close stmt |> or_die in
        Lwt.return_unit
      done
    done
  in
  Memory.reclaim ();
  M.close db

let () = Lwt_main.run @@ run ()

before the fix:

$ GLIBC_TUNABLES=glibc.malloc.trim_threshold=0 ./testdb.exe 
[2021-03-08T21:15:05.9545] 176132: [memory:info] Memory.reclaim: heap 992KB -> 480KB live 35KB -> 153KB freelist = 1 (0.0008 secs), rss 12.8MB -> 13.3MB (0.0000 secs)
[2021-03-08T21:15:16.3621] 176132: [memory:info] Memory.reclaim: heap = 480KB live 294KB -> 154KB freelist 5018 -> 1 (0.0005 secs), rss 14.8MB -> 15.1MB (0.0000 secs)
[2021-03-08T21:15:27.6536] 176132: [memory:info] Memory.reclaim: heap = 480KB live 227KB -> 154KB freelist 5490 -> 1 (0.0005 secs), rss 15.2MB -> 15.2MB (0.0000 secs)
[2021-03-08T21:15:38.8880] 176132: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 154KB freelist 4977 -> 1 (0.0005 secs), rss 15.3MB -> 15.3MB (0.0000 secs)
[2021-03-08T21:15:49.7611] 176132: [memory:info] Memory.reclaim: heap = 480KB live 227KB -> 154KB freelist 5491 -> 1 (0.0005 secs), rss 15.3MB -> 15.5MB (0.0000 secs)
[2021-03-08T21:16:00.6917] 176132: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 154KB freelist 4977 -> 1 (0.0005 secs), rss 15.6MB -> 15.6MB (0.0000 secs)
[2021-03-08T21:16:11.4848] 176132: [memory:info] Memory.reclaim: heap = 480KB live 227KB -> 154KB freelist 5491 -> 1 (0.0005 secs), rss 15.6MB -> 15.8MB (0.0000 secs)
[2021-03-08T21:16:23.4493] 176132: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 154KB freelist 4977 -> 1 (0.0005 secs), rss 15.8MB -> 15.9MB (0.0000 secs)
[2021-03-08T21:16:33.5215] 176132: [memory:info] Memory.reclaim: heap = 480KB live 227KB -> 154KB freelist 5491 -> 1 (0.0006 secs), rss 15.9MB -> 16.1MB (0.0000 secs)
[2021-03-08T21:16:44.8638] 176132: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 154KB freelist 4977 -> 1 (0.0005 secs), rss 16.1MB -> 16.2MB (0.0000 secs)
[2021-03-08T21:16:56.4649] 176132: [memory:info] Memory.reclaim: heap = 480KB live 227KB -> 154KB freelist 5491 -> 1 (0.0007 secs), rss 16.2MB -> 16.3MB (0.0000 secs)
[2021-03-08T21:17:08.5156] 176132: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 154KB freelist 4977 -> 1 (0.0005 secs), rss 16.4MB -> 16.4MB (0.0000 secs)
[2021-03-08T21:17:18.9161] 176132: [memory:info] Memory.reclaim: heap = 480KB live 227KB -> 154KB freelist 5491 -> 1 (0.0006 secs), rss 16.4MB -> 16.6MB (0.0000 secs)

(important number is the last bytes measurement, which is resident set size after malloc_release)

after the fix:

$ GLIBC_TUNABLES=glibc.malloc.trim_threshold=0 ./testdb.exe 
[2021-03-08T21:19:15.3249] 179674: [memory:info] Memory.reclaim: heap 992KB -> 480KB live 35KB -> 153KB freelist = 1 (0.0008 secs), rss 12.7MB -> 13.2MB (0.0000 secs)
[2021-03-08T21:19:24.8679] 179674: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 155KB freelist 5020 -> 1 (0.0005 secs), rss 14.8MB -> 14.9MB (0.0000 secs)
[2021-03-08T21:19:34.7832] 179674: [memory:info] Memory.reclaim: heap = 480KB live 228KB -> 155KB freelist 5491 -> 1 (0.0006 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:19:44.7059] 179674: [memory:info] Memory.reclaim: heap = 480KB live 296KB -> 155KB freelist 4967 -> 1 (0.0006 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:19:54.4285] 179674: [memory:info] Memory.reclaim: heap = 480KB live 225KB -> 155KB freelist 5466 -> 1 (0.0006 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:20:04.7968] 179674: [memory:info] Memory.reclaim: heap = 480KB live 296KB -> 155KB freelist 4967 -> 1 (0.0005 secs), rss 14.9MB -> 14.9MB (0.0000 secs)
[2021-03-08T21:20:14.6548] 179674: [memory:info] Memory.reclaim: heap = 480KB live 228KB -> 155KB freelist 5491 -> 1 (0.0005 secs), rss 14.9MB -> 14.9MB (0.0000 secs)
[2021-03-08T21:20:24.4880] 179674: [memory:info] Memory.reclaim: heap = 480KB live 295KB -> 155KB freelist 4978 -> 1 (0.0005 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:20:35.3080] 179674: [memory:info] Memory.reclaim: heap = 480KB live 225KB -> 155KB freelist 5466 -> 1 (0.0007 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:20:45.6080] 179674: [memory:info] Memory.reclaim: heap = 480KB live 296KB -> 155KB freelist 4967 -> 1 (0.0005 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:20:55.9028] 179674: [memory:info] Memory.reclaim: heap = 480KB live 225KB -> 155KB freelist 5466 -> 1 (0.0007 secs), rss = 14.9MB (0.0000 secs)
[2021-03-08T21:21:06.4356] 179674: [memory:info] Memory.reclaim: heap = 480KB live 296KB -> 155KB freelist 4967 -> 1 (0.0005 secs), rss = 14.9MB (0.0000 secs)
ygrek commented 3 years ago

(this may be related to #29)

andrenth commented 3 years ago

This is great, thanks!

andrenth commented 3 years ago

I think a new release is due after this fix and the ones from your other PR. However I don't have an OCaml development environment right now. Would you like developer access to this repository?

ygrek commented 3 years ago

I can look into that time permitting, yes

ygrek commented 2 years ago

took me one year https://github.com/ocaml/opam-repository/pull/21009 :)

paurkedal commented 2 years ago

I started to see ABRT and SEGV signals from the Caqti testsuite after upgrading to mariadb 1.1.5. Compiling against the master branch with this commit reverted, fixes that issue, though of that still leaves the original memory issue.

ygrek commented 2 years ago

do you have a stacktrace? there is indeed not only free done in mysql_free_result but some interaction with reading from server..

paurkedal commented 2 years ago

In one case it terminates in OCaml:

$ OCAMLRUNPARAM=b dune exec -- examples/bikereg.exe -u mariadb://urkedal:XXXXXXXXXXXXXX@localhost/urkedal
BIKE-0003 is owned by Trillian.       
bikereg: internal error, uncaught exception:
         Out of memory
         Raised by primitive operation at Ctypes_memory.allocate_n in file "src/ctypes/ctypes_memory.ml", line 137, characters 12-48
         Called from Common.Stmt.malloc in file "lib/common.ml", line 365, characters 12-34
         Called from Common.Stmt.alloc_buffer in file "lib/common.ml", line 375, characters 30-62
         Called from Common.Stmt.bind_result in file "lib/common.ml", line 389, characters 10-29
         Called from Nonblocking.Stmt.store_result_start in file "lib/nonblocking.ml" (inlined), line 298, characters 4-89
         Called from Nonblocking.Stmt.store_result in file "lib/nonblocking.ml", line 306, characters 5-28
         Called from Nonblocking.Make.Stmt.handle_execute in file "lib/nonblocking.ml", line 631, characters 56-80
         Called from Lwt.Sequential_composition.bind.create_result_promise_and_callback_if_deferred.callback in file "src/core/lwt.ml", line 1860, characters 23-26
         Re-raised at Lwt.Miscellaneous.poll in file "src/core/lwt.ml", line 3068, characters 20-29
         Called from Lwt_main.run.run_loop in file "src/unix/lwt_main.ml", line 31, characters 10-20
         Called from Lwt_main.run in file "src/unix/lwt_main.ml", line 118, characters 8-13
         Re-raised at Lwt_main.run in file "src/unix/lwt_main.ml", line 124, characters 4-13
         Called from Cmdliner_term.app.(fun) in file "cmdliner_term.ml", line 24, characters 19-24
         Called from Cmdliner_eval.run_parser in file "cmdliner_eval.ml", line 34, characters 37-44

In the other case, I get a core dump with backtrace from gdb:

Core was generated by `_build/default/testsuite/main_lwt.exe -u mariadb://urkedal:XXXXXXXXXXXXXX@localhost/urkedal'.
Program terminated with signal SIGABRT, Aborted.
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007f3ce7dfb859 in __GI_abort () at abort.c:79
#2  0x00007f3ce7e6629e in __libc_message (action=action@entry=do_abort, 
    fmt=fmt@entry=0x7f3ce7f90298 "%s\n") at ../sysdeps/posix/libc_fatal.c:155
#3  0x00007f3ce7e6e32c in malloc_printerr (
    str=str@entry=0x7f3ce7f92690 "double free or corruption (!prev)") at malloc.c:5347
#4  0x00007f3ce7e6ffdc in _int_free (av=0x7f3ce7fc5b80 <main_arena>, p=0x557ee4e49e80, 
    have_lock=<optimized out>) at malloc.c:4317
#5  0x00007f3ce77d9205 in mariadb_stub_21_mysql_free_result (x94=<optimized out>)
    at lib/ffi_generated_stubs.c:138
#6  0x00007f3ce77d4953 in camlFfi_generated__fun_2466 () at lib/ffi_generated.ml:781
#7  0x00007f3ce77c893c in camlNonblocking__free_res_3558 () at lib/nonblocking.ml:647
#8  0x00007f3ce77c8b97 in camlNonblocking__close_3748 () at lib/nonblocking.ml:657
#9  0x00007f3ce779b4e8 in camlCaqti_driver_mariadb__deallocate_3192 ()
    at caqti-driver-mariadb/lib/caqti_driver_mariadb.ml:475
#10 0x0000557ee3a460e6 in camlTest_sql__fun_2166 () at testsuite/test_sql.ml:243
#11 0x0000557ee3ae2281 in camlLwt__callback_1370 () at src/core/lwt.ml:1860
#12 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#13 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#14 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#15 0x0000557ee3ae22ad in camlLwt__callback_1370 () at src/core/lwt.ml:1874
#16 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#17 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#18 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#19 0x0000557ee3ae2ead in camlLwt__callback_1497 () at src/core/lwt.ml:2035
#20 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#21 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#22 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#23 0x0000557ee3ae2a8b in camlLwt__callback_1455 () at src/core/lwt.ml:1983
#24 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#25 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#26 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#27 0x0000557ee3ae22ad in camlLwt__callback_1370 () at src/core/lwt.ml:1874
#28 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#29 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#30 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#31 0x0000557ee3ae22ad in camlLwt__callback_1370 () at src/core/lwt.ml:1874
#32 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#33 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#34 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#35 0x0000557ee3ae2a8b in camlLwt__callback_1455 () at src/core/lwt.ml:1983
#36 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#37 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#38 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#39 0x0000557ee3ae22ad in camlLwt__callback_1370 () at src/core/lwt.ml:1874
#40 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#41 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#42 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#43 0x0000557ee3ae2ead in camlLwt__callback_1497 () at src/core/lwt.ml:2035
#44 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#45 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#46 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#47 0x0000557ee3ae2a8b in camlLwt__callback_1455 () at src/core/lwt.ml:1983
#48 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#49 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#50 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#51 0x0000557ee3ae5f89 in camlLwt__callback_2083 () at src/core/lwt.ml:2714
#52 0x0000557ee3ae0f88 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1214
#53 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#54 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#55 0x0000557ee3ae2ead in camlLwt__callback_1497 () at src/core/lwt.ml:2035
#56 0x0000557ee3ae0f58 in camlLwt__iter_callback_list_953 () at src/core/lwt.ml:1207
#57 0x0000557ee3ae10d5 in camlLwt__run_in_resolution_loop_1006 () at src/core/lwt.ml:1273
#58 0x0000557ee3ae1275 in camlLwt__resolve_1026 () at src/core/lwt.ml:1309
#59 0x0000557ee3ae14c2 in camlLwt__wakeup_general_1063 () at src/core/lwt.ml:1383
#60 0x0000557ee3adfa32 in camlLwt_sequence__loop_158 () at src/core/lwt_sequence.ml:132
#61 0x0000557ee3adfa32 in camlLwt_sequence__loop_158 () at src/core/lwt_sequence.ml:132
#62 0x0000557ee3b9d2a6 in camlStdlib__List__iter_261 () at list.ml:110
#63 0x0000557ee3ab1440 in camlLwt_engine__fun_2253 () at src/unix/lwt_engine.ml:359
#64 0x0000557ee3ab3c52 in camlLwt_main__run_loop_135 () at src/unix/lwt_main.ml:41
#65 0x0000557ee3ab3ece in camlLwt_main__run_176 () at src/unix/lwt_main.ml:118
#66 0x0000557ee3a446de in camlDune__exe__Main_lwt__entry () at testsuite/main_lwt.ml:67
#67 0x0000557ee3a3f419 in caml_program ()
#68 0x0000557ee3c327ed in caml_start_program ()
#69 0x0000557ee3c11554 in caml_startup_common (argv=0x7fff9b1cfa18, pooling=<optimized out>, 
    pooling@entry=0) at startup_nat.c:158
#70 0x0000557ee3c1159f in caml_startup_exn (argv=<optimized out>) at startup_nat.c:168
#71 caml_startup (argv=<optimized out>) at startup_nat.c:168
#72 0x0000557ee3a3e192 in main (argc=<optimized out>, argv=<optimized out>) at main.c:41
ygrek commented 2 years ago

ftr I am using this code extensively and didn't observe any crashes.. Anyway released 1.1.6 with this commit reverted until this is properly investigated.

paurkedal commented 2 years ago

Thanks. I had a look at it myself, without finding the source of the leak, though I'm wondering whether the slots for the bindings which are pre-allocated by prepare gets freed.

apeschar commented 2 years ago

Firstly, the problem with the original fix for the memory leak is that statement metadata can be used again after freeing.

It's fine to free the result set, since it will be repopulated on the next execution, but the metadata is retrieved only on first execution. So if a statement is reused (ie, reset is called and the statement is executed again), it will try to use metadata that has been freed.

Secondly, it should actually be fine to use mysql_free_result on the metadata result set. mysql_free_result may block only if there are still rows to be retrieved from the server. But getting the metadata happens on the client side and is non-blocking; it does not retrieve anything from the server.

https://dev.mysql.com/doc/c-api/5.7/en/mysql-stmt-result-metadata.html

I previously submitted a PR #41 which loads the metadata after each execution. This also fixes the memory leak, and should not have had the segfault issue because it avoids the use-after-free.

paurkedal commented 2 years ago

@apeschar Thanks for your investigating.

Secondly, it should actually be fine to use mysql_free_result on the metadata result set. mysql_free_result may block only if there are still rows to be retrieved from the server. But getting the metadata happens on the client side and is non-blocking; it does not retrieve anything from the server.

What if the client calls close or reset after an incomplete retrieval of rows? I think would be reasonable usage, e.g. if an error occurs during processing of the result.

apeschar commented 2 years ago

What if the client calls close or reset after an incomplete retrieval of rows?

There are two result sets:

  1. The statement result set, freed using non-blocking mysql_stmt_free_result_start.
  2. The metadata result set, currently not freed, but freeable using mysql_free_result (blocking) or mysql_free_result_start (non-blocking). This PR caused the metadata result set to be freed.

For the statement result set it is necessary to use the non-blocking APIs because of the situations you mentioned; ie, not all rows have been read.

For the metadata result set this is not applicable. Freeing the metadata should never block.