Segmentation fault in `mrb_vm_exec` when nesting calls to mruby

fourmond commented 5 years ago

I have a program, called QSoas that sometimes can nest calls to mruby, i.e. a mruby eval that runs a C++ code that runs a mruby eval. In version 1.4.1, this worked perfectly fine. In 2.0.0 and in the current HEAD, that segfaults with the following error:

Program received signal SIGSEGV, Segmentation fault.
0x00005555557d0132 in mrb_vm_exec (mrb=0x555555ef6a60, proc=0x55555602a380, pc=0x0)
    at /home/vincent/Prog/mruby-recent/src/vm.c:1471
1471            JUMP;
(gdb) bt 4
#0  0x00005555557d0132 in mrb_vm_exec (mrb=0x555555ef6a60, proc=0x55555602a380, pc=0x0)
    at /home/vincent/Prog/mruby-recent/src/vm.c:1471
#1  0x00005555557cc57b in mrb_vm_run (mrb=0x555555ef6a60, proc=0x55555602a380, self=..., stack_keep=2)
    at /home/vincent/Prog/mruby-recent/src/vm.c:949
#2  0x00005555557de631 in mrb_run (mrb=0x555555ef6a60, proc=0x55555602a380, self=...)
    at /home/vincent/Prog/mruby-recent/src/vm.c:2823
#3  0x0000555555789076 in MRuby::<lambda()>::operator() (__closure=0x7fffffffbe10) at src/mruby.cc:198
(More stack frames follow...)
(gdb)

Is that a known issue ?

Great thanks

fourmond commented 5 years ago

Any idea ? This is important for me, since it means I'm stuck with old versions of mruby in my project.

matz commented 5 years ago

Without source code, it's hard to help you. All I can say is mrb_vm_exec is called with pc=0, that means irep->iseq is NULL. So maybe there's something wrong in Proc generation.

fourmond commented 5 years ago

Thank you. The code is rather complex, I'll try to come up with a minimal non-working example.

matz commented 5 years ago

OK, sorry for the inconvenience. if you can show me the code that generates the Proc (of which irep->iseq is NULL), that would help me to identify the problem.

fourmond commented 5 years ago

Sorry for being so long to answer. While this is not strictly speaking what I'm doing, the following code used to work (at least in commit 55431b4cef95541dc07f30f52b7fad9e9638b0be), but segfaults now... Or at least, segfaulted with the current state of master when I opened that bug report... Now it seems OK, but I'm posting that for reference.

#include <mruby.h>
#include <mruby/compile.h>
#include <mruby/variable.h>
#include <mruby/string.h>

mrb_state * mrb;

const char code[] =
  "a = Test.new\n"
  "b = a.test_method('2*3')\n"
  "p b\n";
  ;

static mrb_value test_method(mrb_state * mrb, mrb_value self)
{
  mrb_sym sym;
  mrb_int nb;
  mrb_value v;
  mrb_get_args(mrb, "S", &v);

  const char * code = mrb_string_value_cstr(mrb, &v);
  fprintf(stderr, "Code: '%s'\n", code);

  mrb_load_string(mrb, code);

  return v;
}

int main()
{
  mrb = mrb_open();
  struct RClass * t = mrb_define_class(mrb, "Test",
                                       mrb->object_class);

  mrb_define_method(mrb, t, "test_method",
                    &test_method, MRB_ARGS_REQ(1));

  mrb_load_string(mrb, code);

  mrb_close(mrb);
}

fourmond commented 5 years ago

... but my complex code still segfaults unfortunately...

astral1 commented 5 years ago

I have the same problem. Running the code below will cause segfault.

#include <mruby.h>
#include <mruby/compile.h>

mrb_value hello(mrb_state *mrb, mrb_value self);

int main() {
    struct mrb_state *mrb = mrb_open();

    mrb_define_method(mrb, mrb->object_class, "hello", hello, MRB_ARGS_NONE());

    mrb_load_string(mrb, "hello");

    mrb_close(mrb);
    return 0;
}

mrb_value hello(mrb_state *mrb, mrb_value self) {
    return mrb_load_string(mrb, "puts 'hello'");
}

At some point, mrb->c->stack is set to NULL. Then segfault is raised in processing OP_SENDB.

matz commented 5 years ago

I couldn't reproduce the segv. Can you show us more info?

astral1 commented 5 years ago

Version Info

$ uname -mprsv
Darwin 18.6.0 Darwin Kernel Version 18.6.0: Thu Apr 25 23:16:27 PDT 2019; root:xnu-4903.261.4~2/RELEASE_X86_64 x86_64 i386
$ clang --version
Apple LLVM version 10.0.1 (clang-1001.0.46.4)
Target: x86_64-apple-darwin18.6.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin
$ ld -v
@(#)PROGRAM:ld  PROJECT:ld64-450.3
BUILD 18:16:53 Apr  5 2019
configured to support archs: armv6 armv7 armv7s arm64 arm64e arm64_32 i386 x86_64 x86_64h armv6m armv7k armv7m armv7em
LTO support using: LLVM version 10.0.1, (clang-1001.0.46.4) (static support for 22, runtime is 22)
TAPI support using: Apple TAPI version 10.0.1 (tapi-1001.0.4.1)
$ cmake --version
cmake version 3.15.0

CMake suite maintained and supported by Kitware (kitware.com/cmake).

CMake

cmake_minimum_required(VERSION 3.14)
project(segvtest C)

include(ExternalProject)

set(CMAKE_C_STANDARD 11)

ExternalProject_Add(
        mruby
        GIT_REPOSITORY https://github.com/mruby/mruby.git
        GIT_TAG 2.0.1
        CONFIGURE_COMMAND ""
        SOURCE_DIR mruby
        BUILD_IN_SOURCE true
        BUILD_COMMAND make
        INSTALL_COMMAND "")

ExternalProject_Get_Property(mruby source_dir)

add_executable(segvtest main.c)

include_directories(${source_dir}/include)
target_link_libraries(segvtest
        debug ${source_dir}/build/host-debug/lib/libmruby.a
        optimized ${source_dir}/build/host/lib/libmruby.a)

add_dependencies(segvtest mruby)

main.c

#include <mruby.h>
#include <mruby/compile.h>

mrb_value hello(mrb_state *mrb, mrb_value self);

int main() {
    struct mrb_state *mrb = mrb_open();

    mrb_define_method(mrb, mrb->object_class, "hello", hello, MRB_ARGS_NONE());

    mrb_load_string(mrb, "hello");

    mrb_close(mrb);
    return 0;
}

mrb_value hello(mrb_state *mrb, mrb_value self) {
    mrb_value result = mrb_load_string(mrb, "puts 'hello'");
    return result;
}

Do you need any additional information?

dearblue commented 5 years ago

I could not reproduce it on master, but I could reproduce it on 2.0.1. Is @astral1 trying a master?

astral1 commented 5 years ago

@dearblue I tried with the master. And it works. My issue also seems to be related to #4384. Thanks very much!

matz commented 5 years ago

Can I close this issue?

fourmond commented 4 years ago

Sorry to dig this report again. I've identified the commit which makes my code segfault, it is this one:

commit 55431b4cef95541dc07f30f52b7fad9e9638b0be
Author: Yukihiro "Matz" Matsumoto <matz@ruby.or.jp>
Date:   Tue Jan 30 22:55:58 2018 +0900

    Need not to pop `callinfo` on `OP_STOP`.

Does that tell something about what I may be doing wrong in my code ? My code still segfaults in master.

matz commented 4 years ago

Thank you. This is valuable input. If you can show us the backtrace of the crash, along with a commit hash, I think I can fix it.

fourmond commented 4 years ago

Thanks. I've reproduced the bug against the HEAD of stable (commit 3d46f1b62092, unsure whether that's what you wanted).

The backtrace I've collected is the following:

(gdb) bt 8
#0  0x00005555557d5d4b in mrb_vm_exec (mrb=0x555555e01f70, proc=0x555555f1aea0, pc=0x0)
    at /home/vincent/Prog/mruby-github/src/vm.c:1477
#1  0x00005555557d20cd in mrb_vm_run (mrb=0x555555e01f70, proc=0x555555f1aea0, self=..., stack_keep=2)
    at /home/vincent/Prog/mruby-github/src/vm.c:948
#2  0x00005555557e4356 in mrb_run (mrb=0x555555e01f70, proc=0x555555f1aea0, self=...)
    at /home/vincent/Prog/mruby-github/src/vm.c:2836
#3  0x00005555557876f6 in MRuby::<lambda()>::operator() (__closure=0x7fffffffcab0) at src/mruby.cc:209
#4  std::_Function_handler<mrb_value(), MRuby::eval(const QByteArray&, const QString&, int)::<lambda()> >::_M_invoke(const std::_Any_data &) (__functor=...) at /usr/include/c++/6/functional:1726
#5  0x00005555557876c1 in std::function<mrb_value ()>::operator()() const (this=<optimized out>)
    at /usr/include/c++/6/functional:2136
#6  protect_helper (helper=...) at src/mruby.cc:111
#7  0x000055555582acc7 in mrb_protect (mrb=0x555555e01f70, 
    body=0x5555557876b0 <protect_helper(mrb_state*, mrb_value)>, data=..., state=0x7fffffffca2f "")
    at /home/vincent/Prog/mruby-github/mrbgems/mruby-error/src/exception.c:17

The problem occurs when the following happens: I'm calling a Proc that goes back to C++ code and creates another code and executes it (many times). The inner calls to mrb_vm_exec succeed (many calls to mrb_vm_exec with the same arguments). The segfault occurs upon returning to the outer mrb_vm_exec. I guess somehow the return stack is messed up by the inner calls ?

I can try to make a minimal crashing example, I haven't succeeded so far, though.

fourmond commented 4 years ago

More information: I've added a small fprintf at the beginning of mrb_vm_exec to show the arguments. It gives me this (the pointers are mrb, proc and pc):

mrb_vm_exec: 0x555555e029c0 0x555555f1b800 0x555555faa2e0
mrb_vm_exec: 0x555555e029c0 0x555555f1b530 0x55555604adb0
mrb_vm_exec: 0x555555e029c0 0x555555f1b470 0x5555560f58a0
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac

Program received signal SIGSEGV, Segmentation fault.
0x00005555557d5d4b in mrb_vm_exec (mrb=0x555555e029c0, proc=0x555555f1b800, pc=0x0)

So there 4 different calls of mrb_vm_exec, all of which succeed except the outermost one at the end. I think the calls are nested this way:

mrb_vm_exec: 0x555555e029c0 0x555555f1b800 0x555555faa2e0
  mrb_vm_exec: 0x555555e029c0 0x555555f1b530 0x55555604adb0
  mrb_vm_exec: 0x555555e029c0 0x555555f1b470 0x5555560f58a0
  mrb_vm_exec: 0x555555e029c0 0x555555f23540 0x5555558a13ac
  [...]

It is clear ? Does it help ?

matz commented 4 years ago

Thank you for the info. Could you please show us two more info:

the line where SEGV occurs (line 1477 in 3d46f1b was mere else).
check how it works (or not) when you revert 55431b4cef95541dc07f30f52b7fad9e9638b0be Especially the latter. As far as I remember, I change just for optimization. If it doesn't work, I think it should be reverted.

fourmond commented 4 years ago

Ah, sorry, it is line 1475 (so JUMP;), I've got two lines more because I've introduced a couple of fprintf at the beginning of mrb_vm_exec.

Unfortunately reverting 55431b4 does not seem to be enough... I'll try to come up with a minimal crashing example.

Thanks

fourmond commented 4 years ago

Here is a simple example that crashes with the current stable:

#include <mruby.h>
#include <mruby/compile.h>
#include <mruby/variable.h>
#include <mruby/string.h>
#include <mruby/error.h>
#include <mruby/compile.h>
#include <mruby/proc.h>

mrb_state * mrb;

struct RProc * generate_proc(mrb_state * mrb, const char * code)
{
  struct mrbc_context * c = mrbc_context_new(mrb);
  struct mrb_parser_state * p = mrb_parse_string(mrb, code, c);
  struct RProc * proc = mrb_generate_code(mrb, p);
  mrbc_context_free(mrb, c);
  mrb_parser_free(p);
  return proc;
}

mrb_value run_proc(mrb_state * mrb, mrb_value h)
{
  /* Assumption: h is just a converted RProc */
  struct RProc * proc = (struct RProc *)mrb_cptr(h);
  return mrb_run(mrb, proc, mrb_top_self(mrb));
}

mrb_value run_code(mrb_state * mrb, const char * code)
{
  mrb_value p_wrap;
  mrb_bool failed;
  struct RProc * proc = generate_proc(mrb, code);
  SET_CPTR_VALUE(mrb, p_wrap, proc);
  return mrb_protect(mrb, &run_proc, p_wrap, &failed);
}

static mrb_value test_method(mrb_state * mrb, mrb_value self)
{
  mrb_value v;
  mrb_get_args(mrb, "S", &v);

  const char * cd = mrb_string_value_cstr(mrb, &v);
  return run_code(mrb, cd);
}

const char test_code[] =
  "t = Test.new\n"
  "p t\n"
  "b = t.test_method('2**8')\n"
  "p b\n"
  ;

int main()
{
  mrb = mrb_open();
  struct RClass * t = mrb_define_class(mrb, "Test",
                                       mrb->object_class);

  mrb_define_method(mrb, t, "test_method",
                    &test_method, MRB_ARGS_REQ(1));

  run_code(mrb, "p 2+2\n");
  run_code(mrb, test_code);
  mrb_close(mrb);
}

The trick here is that it crashes upon the nested calls to mrb_run. Is that because of the nested mrb_protect ? Am I doing something wrong ?

fourmond commented 4 years ago

Leaving out the mrb_protect completely also crashes (always at the same place):

mrb_value run_code(mrb_state * mrb, const char * code)
{
  mrb_value p_wrap;
  mrb_bool failed;
  struct RProc * proc = generate_proc(mrb, code);
  SET_CPTR_VALUE(mrb, p_wrap, proc);
  return run_proc(mrb, p_wrap);
}

fourmond commented 4 years ago

Additional naive question: is mruby freeing itself the generated RProc*, or should I dispose of it myself ?

matz commented 4 years ago

The generated proc will be garbage collected. I will investigate your code sample. Thank you!

matz commented 4 years ago

The good news is that I can reproduce the problem with the sample code (yey!) I found out that mrb_run function is not supposed to be called recursively. I think I can fix it soon.

matz commented 4 years ago

Use mrb_top_run instead of mrb_run. I mean replace the code:

  return mrb_run(mrb, proc, mrb_top_self(mrb));

to

  return mrb_top_run(mrb, proc, mrb_top_self(mrb), 0);

Actually, you have to manually push callinfo stack before calling mrb_run. We will make mrb_run out from API functions.

fourmond commented 4 years ago

Thanks a lot ! That indeed fixes the segmentation fault with stable. That is not enough to prevent it on older mruby releases though (for instance 2.0.0 still segfaults), but I guess it doesn't matter so much.

mruby / mruby

Segmentation fault in `mrb_vm_exec` when nesting calls to mruby #4488