ziglang / zig

General-purpose programming language and toolchain for maintaining robust, optimal, and reusable software.
https://ziglang.org

non-optimal mem.copy on small buffers #1329

Open tiehuis opened 6 years ago

tiehuis commented 6 years ago

I've ported ryu to Zig but noticed slightly slower performance than expected. This issue refers to https://github.com/tiehuis/zig-ryu/tree/724f66d81cf598d53df8eca2a93bf2eb8c7cdb1d.

First, the benchmarks from that tree (best of 5; build.sh in the repo was changed to use clang++).

C Reference

| Bits | Ryu average | Ryu stddev |
|-----:|------------:|-----------:|
| 32   | 26.908      | 5.560      |
| 64   | 30.465      | 5.866      |

Zig Reference

| Bits | Ryu average | Ryu stddev |
|-----:|------------:|-----------:|
| 32   | 32.423      | 6.206      |
| 64   | 42.308      | 7.505      |

I checked the disassembly of the Zig benchmark and noticed that the memcpy LLVM emits (AVX on my machine) is being inlined when copying from a fixed digit table. The interesting thing here is that these std.mem.copy invocations copy only two bytes, and that length should be statically known to the compiler.
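For context, std.mem.copy is essentially just an element-wise loop, which LLVM's loop-idiom recognition can turn into a memcpy call and then inline-expand. Roughly paraphrased from the standard library of this era (details may differ slightly):

    const std = @import("std");

    // Rough paraphrase of std.mem.copy: a plain element-wise loop over the
    // source slice, with runtime safety disabled and a single up-front assert.
    pub fn copy(comptime T: type, dest: []T, source: []const T) void {
        @setRuntimeSafety(false);
        std.debug.assert(dest.len >= source.len);
        for (source) |s, i|
            dest[i] = s;
    }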

For example, the following call expands to the assembly below:

    std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c0 .. c0 + 2]);

    56a0:       c5 fc 10 84 2b 20 fe    vmovups -0x1e0(%rbx,%rbp,1),%ymm0
    56a7:       ff ff 
    56a9:       c5 fc 10 8c 2b 40 fe    vmovups -0x1c0(%rbx,%rbp,1),%ymm1
    56b0:       ff ff 
    56b2:       c5 fc 10 94 2b 60 fe    vmovups -0x1a0(%rbx,%rbp,1),%ymm2
    56b9:       ff ff 
    56bb:       c5 fc 10 9c 2b 80 fe    vmovups -0x180(%rbx,%rbp,1),%ymm3
    56c2:       ff ff 
    56c4:       c5 fc 11 84 2f 20 fe    vmovups %ymm0,-0x1e0(%rdi,%rbp,1)
    56cb:       ff ff 
    56cd:       c5 fc 11 8c 2f 40 fe    vmovups %ymm1,-0x1c0(%rdi,%rbp,1)
    56d4:       ff ff 
    56d6:       c5 fc 11 94 2f 60 fe    vmovups %ymm2,-0x1a0(%rdi,%rbp,1)
    56dd:       ff ff 
    56df:       c5 fc 11 9c 2f 80 fe    vmovups %ymm3,-0x180(%rdi,%rbp,1)
    56e6:       ff ff 
    56e8:       c5 fc 10 84 2b a0 fe    vmovups -0x160(%rbx,%rbp,1),%ymm0
    56ef:       ff ff 
    56f1:       c5 fc 10 8c 2b c0 fe    vmovups -0x140(%rbx,%rbp,1),%ymm1
    56f8:       ff ff 
    56fa:       c5 fc 10 94 2b e0 fe    vmovups -0x120(%rbx,%rbp,1),%ymm2
    5701:       ff ff 
    5703:       c5 fc 10 9c 2b 00 ff    vmovups -0x100(%rbx,%rbp,1),%ymm3
    570a:       ff ff 
    570c:       c5 fc 11 84 2f a0 fe    vmovups %ymm0,-0x160(%rdi,%rbp,1)
    5713:       ff ff 
    5715:       c5 fc 11 8c 2f c0 fe    vmovups %ymm1,-0x140(%rdi,%rbp,1)
    571c:       ff ff 
    571e:       c5 fc 11 94 2f e0 fe    vmovups %ymm2,-0x120(%rdi,%rbp,1)
    5725:       ff ff 
    5727:       c5 fc 11 9c 2f 00 ff    vmovups %ymm3,-0x100(%rdi,%rbp,1)
    572e:       ff ff 
    5730:       c5 fc 10 84 2b 20 ff    vmovups -0xe0(%rbx,%rbp,1),%ymm0
    5737:       ff ff 
    5739:       c5 fc 10 8c 2b 40 ff    vmovups -0xc0(%rbx,%rbp,1),%ymm1
    5740:       ff ff 
    5742:       c5 fc 10 94 2b 60 ff    vmovups -0xa0(%rbx,%rbp,1),%ymm2
    5749:       ff ff 
    574b:       c5 fc 10 5c 2b 80       vmovups -0x80(%rbx,%rbp,1),%ymm3
    5751:       c5 fc 11 84 2f 20 ff    vmovups %ymm0,-0xe0(%rdi,%rbp,1)
    5758:       ff ff 
    575a:       c5 fc 11 8c 2f 40 ff    vmovups %ymm1,-0xc0(%rdi,%rbp,1)
    5761:       ff ff 
    5763:       c5 fc 11 94 2f 60 ff    vmovups %ymm2,-0xa0(%rdi,%rbp,1)
    576a:       ff ff 
    576c:       c5 fc 11 5c 2f 80       vmovups %ymm3,-0x80(%rdi,%rbp,1)
    5772:       c5 fe 6f 44 2b a0       vmovdqu -0x60(%rbx,%rbp,1),%ymm0
    5778:       c5 fc 10 4c 2b c0       vmovups -0x40(%rbx,%rbp,1),%ymm1
    577e:       c5 fc 10 54 2b e0       vmovups -0x20(%rbx,%rbp,1),%ymm2
    5784:       c5 fc 10 1c 2b          vmovups (%rbx,%rbp,1),%ymm3
    5789:       c5 fe 7f 44 2f a0       vmovdqu %ymm0,-0x60(%rdi,%rbp,1)
    578f:       c5 fc 11 4c 2f c0       vmovups %ymm1,-0x40(%rdi,%rbp,1)
    5795:       c5 fc 11 54 2f e0       vmovups %ymm2,-0x20(%rdi,%rbp,1)
    579b:       c5 fc 11 1c 2f          vmovups %ymm3,(%rdi,%rbp,1)
    57a0:       48 81 c5 00 02 00 00    add    $0x200,%rbp
    57a7:       48 83 c0 04             add    $0x4,%rax
    57ab:       0f 85 ef fe ff ff       jne    56a0 <ryu64+0x8f0>

Now, we can easily see this is non-optimal: for a two-byte copy, LLVM has inlined a generic vectorized memcpy loop that moves 512 bytes per iteration. If we replace the two-byte std.mem.copy calls with explicit byte assignments via the following patch (likewise for ryu64.zig), we get much better performance:

diff --git a/src/ryu32.zig b/src/ryu32.zig
index 176a3a8..cfafad4 100644
--- a/src/ryu32.zig
+++ b/src/ryu32.zig
@@ -317,15 +317,20 @@ fn decimalToBuffer(v: Decimal32, sign: bool, result: []u8) usize {
         const c0 = (c % 100) << 1;
         const c1 = (c / 100) << 1;

-        std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c0 .. c0 + 2]);
-        std.mem.copy(u8, result[index + olength - i - 3 ..], DIGIT_TABLE[c1 .. c1 + 2]);
+        result[index + olength - i - 1 + 0] = DIGIT_TABLE[c0 + 0];
+        result[index + olength - i - 1 + 1] = DIGIT_TABLE[c0 + 1];
+        result[index + olength - i - 3 + 0] = DIGIT_TABLE[c1 + 0];
+        result[index + olength - i - 3 + 1] = DIGIT_TABLE[c1 + 1];
+
         i += 4;
     }
     if (output >= 100) {
         const c = (output % 100) << 1;
         output /= 100;

-        std.mem.copy(u8, result[index + olength - i - 1 ..], DIGIT_TABLE[c .. c + 2]);
+        result[index + olength - i - 1 + 0] = DIGIT_TABLE[c + 0];
+        result[index + olength - i - 1 + 1] = DIGIT_TABLE[c + 1];
+
         i += 2;
     }
     if (output >= 10) {
@@ -357,7 +362,9 @@ fn decimalToBuffer(v: Decimal32, sign: bool, result: []u8) usize {
     var expu = @intCast(usize, exp);

     if (exp >= 10) {
-        std.mem.copy(u8, result[index..], DIGIT_TABLE[2 * expu .. 2 * expu + 2]);
+        result[index + 0] = DIGIT_TABLE[2 * expu + 0];
+        result[index + 1] = DIGIT_TABLE[2 * expu + 1];
+
         index += 2;
     } else {
         result[index] = @intCast(u8, '0' + expu);

C Reference

| Bits | Ryu average | Ryu stddev |
|-----:|------------:|-----------:|
| 32   | 26.908      | 5.560      |
| 64   | 30.465      | 5.866      |

Zig with explicit copies

| Bits | Ryu average | Ryu stddev |
|-----:|------------:|-----------:|
| 32   | 28.352      | 5.948      |
| 64   | 36.163      | 6.815      |
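As an aside, Zig versions newer than the one this issue was filed against offer a way to keep a copy expression while making the two-byte length visible in the type system: re-slicing with comptime-known bounds yields a pointer to a fixed-size array, and assigning through `.*` copies it. A sketch of that alternative (assuming a compiler recent enough to support it; this is not what the patch above does):

    // `x[start..][0..2]` has type *[2]u8 (or *const [2]u8 for DIGIT_TABLE),
    // so each assignment below is a fixed-size two-byte array copy that the
    // compiler can lower directly, with no runtime-length memcpy involved.
    result[index + olength - i - 1 ..][0..2].* = DIGIT_TABLE[c0..][0..2].*;
    result[index + olength - i - 3 ..][0..2].* = DIGIT_TABLE[c1..][0..2].*;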

I'm not sure whether this is a missed optimization upstream in LLVM or whether we aren't emitting good enough information from Zig. I expect fixing this would yield a fair performance improvement in many tight code paths, since std.mem.copy is the recommended way to copy slices of any size, and for statically known sizes I would expect it to produce assembly as good as the hand-unrolled version.
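A minimal way to compare the two lowerings in isolation is something like the following sketch (hypothetical function names; assumes a Zig version that still provides std.mem.copy). Build it as an object file in a release mode and inspect both symbols with objdump -d:

    const std = @import("std");

    // Two-byte copy through std.mem.copy; the length is statically known
    // at the call site, so ideally this compiles to the same code as the
    // explicit version below.
    export fn copyTwoMemCopy(dest: [*]u8, src: [*]const u8) void {
        std.mem.copy(u8, dest[0..2], src[0..2]);
    }

    // Hand-written two-byte copy for comparison.
    export fn copyTwoExplicit(dest: [*]u8, src: [*]const u8) void {
        dest[0] = src[0];
        dest[1] = src[1];
    }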

scheibo commented 1 year ago

N+1. I noticed this same suboptimal codegen in my own project (https://github.com/pkmn/engine/commit/b5f000044a9a3d45322eaa25736a6886fbada884) due to small writes to std.io.FixedBufferStream.

Relevant assembly: ![](https://cdn.discordapp.com/attachments/689316506265321535/1087527951769292830/Screenshot_2023-03-20_at_17.07.36.png)
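For illustration, the pattern in question is roughly the following sketch (assuming the std.io API of recent releases; the function name is made up):

    const std = @import("std");

    // Many short, fixed-size writes through a FixedBufferStream writer;
    // each bottoms out in a small copy whose length is known at the call
    // site, which is where the suboptimal codegen shows up.
    fn writeSmall() !void {
        var buf: [64]u8 = undefined;
        var fbs = std.io.fixedBufferStream(&buf);
        const writer = fbs.writer();
        try writer.writeAll("ab"); // two-byte write
        try writer.writeByte(':');
        try writer.writeAll("cd");
    }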