alpaka-group / alpaka

Abstraction Library for Parallel Kernel Acceleration :llama:
https://alpaka.readthedocs.io
Mozilla Public License 2.0
356 stars 74 forks source link

reduce register footprint for the new iterators #2382

Closed psychocoderHPC closed 2 months ago

psychocoderHPC commented 2 months ago

https://github.com/alpaka-group/alpaka/pull/2377#issuecomment-2348309816 showed that the register footprint increased significantly when the examples were switched to the new iteration scheme. The reason is that the iterator state is very large, which increases the register footprint.

This issue should be used to evaluate possible optimizations that reduce the register footprint of the new iterators.

psychocoderHPC commented 2 months ago

The following patch decreases the register footprint to 64/71 for the bufferCopy example.

diff --git a/include/alpaka/exec/UniformElements.hpp b/include/alpaka/exec/UniformElements.hpp
index b7f6cd2ee54..cf06e312a1e 100644
--- a/include/alpaka/exec/UniformElements.hpp
+++ b/include/alpaka/exec/UniformElements.hpp
@@ -134,7 +134,6 @@ namespace alpaka
                     , extent_{extent}
                     , first_{std::min(first, extent)}
                     , index_{first_}
-                    , range_{std::min(first + elements, extent)}
                 {
                 }

@@ -149,20 +148,18 @@ namespace alpaka
                 {
                     // increment the index along the elements processed by the current thread
                     ++index_;
-                    if(index_ < range_)
-                        return *this;
-
-                    // increment the thread index with the grid stride
-                    first_ += stride_;
-                    index_ = first_;
-                    range_ = std::min(first_ + elements_, extent_);
-                    if(index_ < extent_)
-                        return *this;
-
-                    // the iterator has reached or passed the end of the extent, clamp it to the extent
-                    first_ = extent_;
-                    index_ = extent_;
-                    range_ = extent_;
+                    if(index_ >= std::min(first_ + elements_, extent_))
+                    {
+                        // increment the thread index with the grid stride
+                        first_ += stride_;
+                        index_ = first_;
+                        if(index_ >= extent_)
+                        {
+                            // the iterator has reached or passed the end of the extent, clamp it to the extent
+                            first_ = extent_;
+                            index_ = extent_;
+                        }
+                    }
                     return *this;
                 }

@@ -192,7 +189,6 @@ namespace alpaka
                 // modified by the pre/post-increment operator
                 Idx first_;
                 Idx index_;
-                Idx range_;
             };

         private: