Overhead due to default std::tuple initialization in generated code

awelzel commented 11 months ago

Similar to #1592 (this is using the same netsted.spicy reproducer), patching the generated code to remove the default initialization of the std::tuple __result in the __parse functions results in significant parsing performance improvement for GCC + ibstdc++ on Debian 12.

Test2.cc removes __result from the parse stage2 functions and returns std::make_tuple directly. This results in a 14% improvement and seems a correct optimization.

Test22.cc additionally does the same in stage1 function. Though that short-cuts some of the __error assignments (see also #1592). The result is 1) over-promising and 2) probably also incorrect.

Main point of the ticket: Default initialization of the std::tuple might not come for free and at least with GCC/libstdc++ it's significant overhead. This was found by running perf annotate on the parse functions with the hottest instruction being rep stos hinting at initialization of the tuple. Using godbolt and pinging Benjamin helped. Wrapping the std::tuple in a std::optional wouldn't help with GCC, as that seems to also always fully initialize an optional.

Screenshot-20231110-161727

I don't know the right fix, mostly just want to raise awareness of missed optimization opportunity.

Command	Mean [s]	Min [s]	Max [s]	Relative
`cat ./test-data.txt \\| spicy-driver Test.hlto`	6.041 ± 0.186	5.885	6.363	1.44 ± 0.05
`cat ./test-data.txt \\| spicy-driver Test2.hlto`	5.303 ± 0.091	5.218	5.437	1.26 ± 0.03
`cat ./test-data.txt \\| spicy-driver Test22.hlto`	4.197 ± 0.074	4.129	4.303	1.00

Test2.cc patch:

--- Test.cc     2023-11-10 15:07:04.152724653 +0100
+++ Test2.cc    2023-11-10 16:04:45.822619567 +0100
@@ -174,7 +174,7 @@
     auto __self = K::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:3:10-5:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     __location__("nested.spicy:4:3");

     // Begin parsing production: Skip: x   -> skip: x;
@@ -201,8 +201,7 @@
     (void());
     __error = (*__self).__error;
     ::hilti::rt::debug::dedent(std::string("spicy"));
-    __result = std::make_tuple(__cur, __lah, __lahe, __error);
-    return __result;
+    return std::make_tuple(__cur, __lah, __lahe, __error);
 }

 auto __hlt::Test::K::__parse_stage1(::hilti::rt::ValueReference<::hilti::rt::Stream>& __data, std::optional<::hilti::rt::stream::SafeConstIterator> __begin, ::hilti::rt::stream::View __cur, ::hilti::rt::Bool __trim, ::hilti::rt::integer::safe<int64_t> __lah, ::hilti::rt::stream::SafeConstIterator __lahe, std::optional<::hilti::rt::RecoverableFailure> __error) -> std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> {
@@ -243,7 +242,7 @@
     auto __self = L::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:7:10-9:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     __location__("nested.spicy:3:10-5:2");

     // Begin parsing production: Unit: Test_K_2 -> x_2;
@@ -260,8 +259,7 @@
     (void());
     __error = (*__self).__error;
     ::hilti::rt::debug::dedent(std::string("spicy"));
-    __result = std::make_tuple(__cur, __lah, __lahe, __error);
-    return __result;
+    return std::make_tuple(__cur, __lah, __lahe, __error);
 }

 auto __hlt::Test::L::__parse_stage1(::hilti::rt::ValueReference<::hilti::rt::Stream>& __data, std::optional<::hilti::rt::stream::SafeConstIterator> __begin, ::hilti::rt::stream::View __cur, ::hilti::rt::Bool __trim, ::hilti::rt::integer::safe<int64_t> __lah, ::hilti::rt::stream::SafeConstIterator __lahe, std::optional<::hilti::rt::RecoverableFailure> __error) -> std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> {
@@ -302,7 +300,7 @@
     auto __self = M::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:11:10-13:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     __location__("nested.spicy:7:10-9:2");

     // Begin parsing production: Unit: Test_L_2 -> Test_K_3;
@@ -319,8 +317,8 @@
     (void());
     __error = (*__self).__error;
     ::hilti::rt::debug::dedent(std::string("spicy"));
-    __result = std::make_tuple(__cur, __lah, __lahe, __error);
-    return __result;
+    return std::make_tuple(__cur, __lah, __lahe, __error);
+    // return __result;
 }

 auto __hlt::Test::M::__parse_stage1(::hilti::rt::ValueReference<::hilti::rt::Stream>& __data, std::optional<::hilti::rt::stream::SafeConstIterator> __begin, ::hilti::rt::stream::View __cur, ::hilti::rt::Bool __trim, ::hilti::rt::integer::safe<int64_t> __lah, ::hilti::rt::stream::SafeConstIterator __lahe, std::optional<::hilti::rt::RecoverableFailure> __error) -> std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> {
@@ -361,7 +359,7 @@
     auto __self = N::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:15:17-17:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     ::hilti::rt::Vector<__hlt::Test::M> __transient_anon;
     __location__("nested.spicy:15:23-16:14");
     ::hilti::rt::integer::safe<uint64_t> __pre_container_offset = ::hilti::rt::integer::safe<std::uint64_t>{0U};
@@ -376,8 +374,7 @@
     (void());
     __error = (*__self).__error;
     ::hilti::rt::debug::dedent(std::string("spicy"));
-    __result = std::make_tuple(__cur, __lah, __lahe, __error);
-    return __result;
+    return std::make_tuple(__cur, __lah, __lahe, __error);
 }

 auto __hlt::Test::N::__parse_anon_stage1(::hilti::rt::ValueReference<::hilti::rt::Stream>& __data, std::optional<::hilti::rt::stream::SafeConstIterator> __begin, ::hilti::rt::stream::View __cur, ::hilti::rt::Bool __trim, ::hilti::rt::integer::safe<int64_t> __lah, ::hilti::rt::stream::SafeConstIterator __lahe, std::optional<::hilti::rt::RecoverableFailure> __error, ::hilti::rt::Vector<__hlt::Test::M>& __dst) -> std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> {

Test2.cc to Test22.cc patch:

--- Test2.cc    2023-11-10 16:04:45.822619567 +0100
+++ Test22.cc   2023-11-10 16:13:54.987417198 +0100
@@ -208,7 +208,7 @@
--- Test2.cc    2023-11-10 16:04:45.822619567 +0100
+++ Test22.cc   2023-11-10 16:13:54.987417198 +0100
@@ -208,7 +208,7 @@
     auto __self = K::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:3:10-5:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     try {
         ::hilti::rt::debug::indent(std::string("spicy"));
         std::optional<::hilti::rt::stream::SafeConstIterator> __begin = std::make_optional(__cur.begin());
@@ -218,7 +218,7 @@
         __error = (*__self).__error;
         ::hilti::rt::StrongReference<::hilti::rt::Stream> filtered = ::hilti::rt::StrongReference<::hilti::rt::Stream>();
         if ( ! (::hilti::rt::Bool(static_cast<bool>(filtered))) ) {
-            __result = (*__self).__parse_Test_K_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
+            return (*__self).__parse_Test_K_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
         }
     }
     catch ( const ::std::exception& __except ) {
@@ -235,7 +235,7 @@
       __location__("nested.spicy:3:10-5:2");
     (void());
     __error = (*__self).__error;
-    return __result;
+    return {};
 }

 auto __hlt::Test::L::__parse_Test_L_stage2(::hilti::rt::ValueReference<::hilti::rt::Stream>& __data, std::optional<::hilti::rt::stream::SafeConstIterator> __begin, ::hilti::rt::stream::View __cur, ::hilti::rt::Bool __trim, ::hilti::rt::integer::safe<int64_t> __lah, ::hilti::rt::stream::SafeConstIterator __lahe, std::optional<::hilti::rt::RecoverableFailure> __error) -> std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> {
@@ -266,7 +266,7 @@
     auto __self = L::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:7:10-9:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     try {
         ::hilti::rt::debug::indent(std::string("spicy"));
         std::optional<::hilti::rt::stream::SafeConstIterator> __begin = std::make_optional(__cur.begin());
@@ -276,7 +276,7 @@
         __error = (*__self).__error;
         ::hilti::rt::StrongReference<::hilti::rt::Stream> filtered = ::hilti::rt::StrongReference<::hilti::rt::Stream>();
         if ( ! (::hilti::rt::Bool(static_cast<bool>(filtered))) ) {
-            __result = (*__self).__parse_Test_L_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
+            return (*__self).__parse_Test_L_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
         }
     }
     catch ( const ::std::exception& __except ) {
@@ -293,7 +293,7 @@
       __location__("nested.spicy:7:10-9:2");
     (void());
     __error = (*__self).__error;
-    return __result;
+    return {};
 }

 auto __hlt::Test::M::__parse_Test_M_stage2(::hilti::rt::ValueReference<::hilti::rt::Stream>& __data, std::optional<::hilti::rt::stream::SafeConstIterator> __begin, ::hilti::rt::stream::View __cur, ::hilti::rt::Bool __trim, ::hilti::rt::integer::safe<int64_t> __lah, ::hilti::rt::stream::SafeConstIterator __lahe, std::optional<::hilti::rt::RecoverableFailure> __error) -> std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> {
@@ -325,7 +325,7 @@
     auto __self = M::__self();
     ::hilti::rt::detail::checkStack();
     __location__("nested.spicy:11:10-13:2");
-    std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
+    // std::tuple<::hilti::rt::stream::View, ::hilti::rt::integer::safe<int64_t>, ::hilti::rt::stream::SafeConstIterator, std::optional<::hilti::rt::RecoverableFailure>> __result;
     try {
         ::hilti::rt::debug::indent(std::string("spicy"));
         std::optional<::hilti::rt::stream::SafeConstIterator> __begin = std::make_optional(__cur.begin());
@@ -335,7 +335,7 @@
         __error = (*__self).__error;
         ::hilti::rt::StrongReference<::hilti::rt::Stream> filtered = ::hilti::rt::StrongReference<::hilti::rt::Stream>();
         if ( ! (::hilti::rt::Bool(static_cast<bool>(filtered))) ) {
-            __result = (*__self).__parse_Test_M_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
+            return (*__self).__parse_Test_M_stage2(__data, __begin, __cur, __trim, __lah, __lahe, __error);
         }
     }
     catch ( const ::std::exception& __except ) {
@@ -352,7 +352,7 @@
       __location__("nested.spicy:11:10-13:2");
     (void());
     __error = (*__self).__error;
-    return __result;
+    return {};
 }

bbannier commented 11 months ago

Your patched code definitely looks better, but the question is how hard it is to generate that code in general (i.e., the responsible section in the parser builder might create more state for other input; in modifications of the parser state via pushState/popState might make this hard).

Maybe a less intrusive fix would be to instead use an optional<tuple> to reduce the cost from default-initializing __result (even though you mentioned https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86173 offline). It should be possible to start it out as a default-constructed (unset) optional, assign to it like to a tuple, and then at the end unconditionally deref it.

bbannier commented 11 months ago

Like we discussed offline, one approach of not dealing with the different code paths would be to trade work on running the default constructors of the tuple elements for default initializing a tuple, e.g.,

From a36af2d6042247b7754d8f00981aebbb62972139 Mon Sep 17 00:00:00 2001
From: Benjamin Bannier <benjamin.bannier@corelight.com>
Date: Fri, 10 Nov 2023 12:40:52 +0100
Subject: [PATCH] Postphone allocating result tuple in generated code.

---
 spicy/toolchain/src/compiler/codegen/parser-builder.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spicy/toolchain/src/compiler/codegen/parser-builder.cc b/spicy/toolchain/src/compiler/codegen/parser-builder.cc
index cfcf647e3..3aafaf311 100644
--- a/spicy/toolchain/src/compiler/codegen/parser-builder.cc
+++ b/spicy/toolchain/src/compiler/codegen/parser-builder.cc
@@ -298,7 +298,7 @@ struct ProductionVisitor
                     std::vector<Type> x = {type::stream::View(), look_ahead::Type, type::stream::Iterator(),
                                            type::Optional(builder::typeByID("hilti::RecoverableFailure"))};
                     auto result_type = type::Tuple(std::move(x));
-                    auto store_result = builder()->addTmp("result", result_type);
+                    auto store_result = builder()->addTmp("result", type::Optional(result_type));

                     auto try_ = begin_try();

@@ -360,7 +360,7 @@ struct ProductionVisitor
                     run_finally();
                     popState();

-                    builder()->addReturn(store_result);
+                    builder()->addReturn(builder::deref(store_result));

                     return popBuilder()->block();
                 }; // End of build_parse_stage1()
-- 
2.42.1

In theory this could be much faster, but like is not (at least with GCC), https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86173.

I tested this in a huge internal analyzer we have and saw small throughput improvements when compiling with Clang, but unchanged or maybe even worse throughput with GCC.

zeek / spicy

Overhead due to default std::tuple initialization in generated code #1593