jarro2783 / cxxopts

Lightweight C++ command line option parser
MIT License
4.16k stars 582 forks source link

std::bad_alloc during dlopen on integer_pattern initialization #405

Closed mxmlnkn closed 1 year ago

mxmlnkn commented 1 year ago

Hello,

I'm using cxxopts v2.2.1 for argument parsing in indexed_bzip2 and got a very weird user-reported issue.

#2  0x00007fcac0dc77ec in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6                                                                                                   [150/22615]
#3  0x00007fcac0dd2966 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00007fcac0dd29d1 in std::terminate() () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#5  0x00007fcac0dd2c65 in __cxa_throw () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#6  0x00007fcac0dc742a in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#7  0x00007fcac0e607b2 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_assign(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocato
r<char> > const&) () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#8  0x00007fc9d5eb2c65 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::assign (__str="", this=0x7ffce48d9540)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/basic_string.h:1387
#9  std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::operator= (__str="", this=0x7ffce48d9540)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/basic_string.h:681
#10 std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_M_match_token (token=std::__detail::_ScannerBase::_S_token_subexpr_begin, this=0x7ffce48d9430)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:593
#11 std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_M_atom (this=this@entry=0x7ffce48d9430) at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:340
#12 0x00007fc9d5eb30b0 in std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_M_term (this=0x7ffce48d9430)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:136
#13 std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_M_term (this=0x7ffce48d9430) at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:136
#14 std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_M_alternative (this=0x7ffce48d9430) at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:123
#15 0x00007fc9d5eb3389 in std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_M_disjunction (this=this@entry=0x7ffce48d9430)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:99
#16 0x00007fc9d5eb3c2e in std::__detail::_Compiler<std::__cxx11::regex_traits<char> >::_Compiler (this=0x7ffce48d9430, __b=<optimized out>, __e=<optimized out>, __loc=...,
    __flags=<optimized out>) at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.tcc:84
#17 0x00007fc9d5eb430f in std::__detail::__compile_nfa<std::__cxx11::regex_traits<char>, char const*> (__first=__first@entry=0x7fc9d5ed1320 "(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)",
    __last=__last@entry=0x7fc9d5ed1340 "", __loc=..., __flags=<optimized out>) at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex_compiler.h:183
#18 0x00007fc9d5eb44a6 in std::__cxx11::basic_regex<char, std::__cxx11::regex_traits<char> >::basic_regex<char const*> (__f=<optimized out>, __loc=..., __last=0x7fc9d5ed1340 "",
    __first=0x7fc9d5ed1320 "(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)", this=0x7fc9d5ef61a0 <cxxopts::values::(anonymous namespace)::integer_pattern>)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/move.h:104
#19 std::__cxx11::basic_regex<char, std::__cxx11::regex_traits<char> >::basic_regex<char const*> (__f=<optimized out>, __last=0x7fc9d5ed1340 "",
    __first=0x7fc9d5ed1320 "(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)", this=0x7fc9d5ef61a0 <cxxopts::values::(anonymous namespace)::integer_pattern>)
    at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex.h:508
#20 std::__cxx11::basic_regex<char, std::__cxx11::regex_traits<char> >::basic_regex (this=0x7fc9d5ef61a0 <cxxopts::values::(anonymous namespace)::integer_pattern>,
    __p=0x7fc9d5ed1320 "(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)", __f=<optimized out>) at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/regex.h:441
#21 0x00007fc9d5e5b8ee in __static_initialization_and_destruction_0 (__initialize_p=1, __priority=65535) at external/cxxopts/include/cxxopts.hpp:475
#22 _GLOBAL__sub_I_indexed_bzip2.cpp(void) () at indexed_bzip2.cpp:14247
#23 0x00007fcadfa28fe2 in call_init (l=<optimized out>, argc=argc@entry=3, argv=argv@entry=0x7ffce48e1e78, env=env@entry=0x556a6c0693f0) at dl-init.c:72
#24 0x00007fcadfa290e9 in call_init (env=0x556a6c0693f0, argv=0x7ffce48e1e78, argc=3, l=<optimized out>) at dl-init.c:30
#25 _dl_init (main_map=0x556a6d602950, argc=3, argv=0x7ffce48e1e78, env=0x556a6c0693f0) at dl-init.c:119
#26 0x00007fcadf646aed in __GI__dl_catch_exception (exception=<optimized out>, operate=<optimized out>, args=<optimized out>) at dl-error-skeleton.c:182
#27 0x00007fcadfa2d364 in dl_open_worker (a=a@entry=0x7ffce48d99b0) at dl-open.c:758
#28 0x00007fcadf646a90 in __GI__dl_catch_exception (exception=0x7ffce48d9990, operate=0x7fcadfa2cca0 <dl_open_worker>, args=0x7ffce48d99b0) at dl-error-skeleton.c:208
#29 0x00007fcadfa2c8fa in _dl_open (file=0x7fc9d5f65bd0 "/usr/local/lib/python3.10/site-packages/indexed_bzip2.cpython-310-x86_64-linux-gnu.so", mode=-2147483390,
    caller_dlopen=0x7fcadf8e5e12, nsid=-2, argc=3, argv=0x7ffce48d9990, env=0x556a6c0693f0) at dl-open.c:837
#30 0x00007fcadf4ea258 in dlopen_doit (a=a@entry=0x7ffce48d9bd0) at dlopen.c:66
#31 0x00007fcadf646a90 in __GI__dl_catch_exception (exception=exception@entry=0x7ffce48d9b70, operate=0x7fcadf4ea200 <dlopen_doit>, args=0x7ffce48d9bd0) at dl-error-skeleton.c:208
#32 0x00007fcadf646b4f in __GI__dl_catch_error (objname=0x556a66d36230, errstring=0x556a66d36238, mallocedp=0x556a66d36228, operate=<optimized out>, args=<optimized out>)
--Type <RET> for more, q to quit, c to continue without paging--
    at dl-error-skeleton.c:227
#33 0x00007fcadf4eaa65 in _dlerror_run (operate=operate@entry=0x7fcadf4ea200 <dlopen_doit>, args=args@entry=0x7ffce48d9bd0) at dlerror.c:170
#34 0x00007fcadf4ea2e4 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87

This seems to be a similar issue to the 6 years old #88. The solution there seems to be to ensure that something newer than GCC 4.8 is used. However, based on the line information in the backtrace /opt/rh/gcc-toolset-11/root/usr/include/c++/11/, I'm pretty sure that GCC 11 was used to compile it. The shared library is built inside manylinux2014 Docker container and then installed as a Python wheel on the user system, which is a Ubuntu 22.04 Docker container.

Do you have any idea what to do to fix this?

I doubt that updating cxxopts helps much because the regex is unchanged since v2.2.1.

Personally, I would have preferred those regexes to be initialized on a first-use basis, e.g., by implementing a singleton pattern instead of static initialization. In my case, this probably would fix the issue for most of the users of indexed_bzip2 because it probably mostly is used as a library instead of the command line interface. However, I see that it might be a major refactor. Would a pull request for something like this be welcome? (If there is no other solution).

jarro2783 commented 1 year ago

That's a bit of a nasty one. Do you know which version of libstdc++ is running? I could try and reproduce it by building a shared library that I dlopen.

I think if it is a static initialisation problem then the alternative is to either do what you said with a singleton or to wrap them up in some object that is only constructed when you construct an options parser.

mxmlnkn commented 1 year ago

Do you know which version of libstdc++ is running?

Based on the backtrace it looks like libstdc++ is not linked statically into the indexed_bzip2 shared library because /usr/lib/x86_64-linux-gnu/libstdc++.so.6 is used. This backtrace was supposedly from inside an Ubuntu 22.04 Docker image. Based on that information and the so path, which also exists on my local Ubuntu 22.04, it seems to be provided by this package libstdc++6:amd64 and links to libstdc++.so.6.0.30. I'd say that version 6.0.30 is being used.

I could try and reproduce it by building a shared library that I dlopen.

According to the user, a simple import via Python did not even trigger the issue instead it only happened deep inside some more complex software. Unfortunately there is no minimal reproducer known to me yet. It might be something that happens because of multiple dlopens, maybe even from multiple threads?

mxmlnkn commented 1 year ago

I have updated my library to use v3.1.1 and took a look into avoiding those static variables. I noticed that each regex variable is only ever used exactly inside one function, so it isn't the major refactor I thought it to be and simply making those global variables static function-scope variables should circumvent the problem without any downsides. I guess the only downside would be that they are not listed right next to each other but I guess even that could be fixed by crating static constexpr string_view or const char* global variables holding the pattern and then initialize the static function-scope regexes with those patterns.

Here is my patch, trying to be least invasive. It still contains trailing whitespace fixes because my editor removes those automatically.

diff --git a/include/cxxopts.hpp b/include/cxxopts.hpp
index b789a5c..aff0d44 100644
--- a/include/cxxopts.hpp
+++ b/include/cxxopts.hpp
@@ -55,8 +55,8 @@ THE SOFTWARE.
 #define CXXOPTS_LINKONCE_CONST __declspec(selectany) extern
 #define CXXOPTS_LINKONCE       __declspec(selectany) extern
 #else
-#define CXXOPTS_LINKONCE_CONST 
-#define CXXOPTS_LINKONCE       
+#define CXXOPTS_LINKONCE_CONST
+#define CXXOPTS_LINKONCE
 #endif

 #ifndef CXXOPTS_NO_REGEX
@@ -758,29 +758,31 @@ inline ArguDesc ParseArgument(const char *arg, bool &matched)

 namespace {
 CXXOPTS_LINKONCE
-std::basic_regex<char> integer_pattern
-  ("(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)");
+const char* const integer_pattern =
+  "(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)";
 CXXOPTS_LINKONCE
-std::basic_regex<char> truthy_pattern
-  ("(t|T)(rue)?|1");
+const char* const truthy_pattern =
+  "(t|T)(rue)?|1";
 CXXOPTS_LINKONCE
-std::basic_regex<char> falsy_pattern
-  ("(f|F)(alse)?|0");
+const char* const falsy_pattern =
+  "(f|F)(alse)?|0";
 CXXOPTS_LINKONCE
-std::basic_regex<char> option_matcher
-  ("--([[:alnum:]][-_[:alnum:]\\.]+)(=(.*))?|-([[:alnum:]].*)");
+const char* const option_pattern =
+  "--([[:alnum:]][-_[:alnum:]\\.]+)(=(.*))?|-([[:alnum:]].*)";
 CXXOPTS_LINKONCE
-std::basic_regex<char> option_specifier
-  ("([[:alnum:]][-_[:alnum:]\\.]*)(,[ ]*[[:alnum:]][-_[:alnum:]]*)*");
+const char* const option_specifier_pattern =
+  "([[:alnum:]][-_[:alnum:]\\.]*)(,[ ]*[[:alnum:]][-_[:alnum:]]*)*";
 CXXOPTS_LINKONCE
-std::basic_regex<char> option_specifier_separator(", *");
+const char* const option_specifier_separator_pattern = ", *";

 } // namespace

 inline IntegerDesc SplitInteger(const std::string &text)
 {
+  static const std::basic_regex<char> integer_matcher(integer_pattern);
+
   std::smatch match;
-  std::regex_match(text, match, integer_pattern);
+  std::regex_match(text, match, integer_matcher);

   if (match.length() == 0)
   {
@@ -804,15 +806,17 @@ inline IntegerDesc SplitInteger(const std::string &text)

 inline bool IsTrueText(const std::string &text)
 {
+  static const std::basic_regex<char> truthy_matcher(truthy_pattern);
   std::smatch result;
-  std::regex_match(text, result, truthy_pattern);
+  std::regex_match(text, result, truthy_matcher);
   return !result.empty();
 }

 inline bool IsFalseText(const std::string &text)
 {
+  static const std::basic_regex<char> falsy_matcher(falsy_pattern);
   std::smatch result;
-  std::regex_match(text, result, falsy_pattern);
+  std::regex_match(text, result, falsy_matcher);
   return !result.empty();
 }

@@ -821,22 +825,25 @@ inline bool IsFalseText(const std::string &text)
 // (without considering which or how many are single-character)
 inline OptionNames split_option_names(const std::string &text)
 {
-  if (!std::regex_match(text.c_str(), option_specifier))
+  static const std::basic_regex<char> option_specifier_matcher(option_specifier_pattern);
+  if (!std::regex_match(text.c_str(), option_specifier_matcher))
   {
     throw_or_mimic<exceptions::invalid_option_format>(text);
   }

   OptionNames split_names;

+  static const std::basic_regex<char> option_specifier_separator_matcher(option_specifier_separator_pattern);
   constexpr int use_non_matches { -1 };
   auto token_iterator = std::sregex_token_iterator(
-    text.begin(), text.end(), option_specifier_separator, use_non_matches);
+    text.begin(), text.end(), option_specifier_separator_matcher, use_non_matches);
   std::copy(token_iterator, std::sregex_token_iterator(), std::back_inserter(split_names));
   return split_names;
 }

 inline ArguDesc ParseArgument(const char *arg, bool &matched)
 {
+  static const std::basic_regex<char> option_matcher(option_pattern);
   std::match_results<const char*> result;
   std::regex_match(arg, result, option_matcher);
   matched = !result.empty();
@@ -1551,7 +1558,7 @@ class ParseResult
     Iterator(const Iterator&) = default;

 // GCC complains about m_iter not being initialised in the member
-// initializer list 
+// initializer list
 CXXOPTS_DIAGNOSTIC_PUSH
 CXXOPTS_IGNORE_WARNING("-Weffc++")
     Iterator(const ParseResult *pr, bool end=false)
mxmlnkn commented 1 year ago

Fixed with #406.