ziglang / zig

General-purpose programming language and toolchain for maintaining robust, optimal, and reusable software.
https://ziglang.org
MIT License
34.65k stars 2.53k forks source link

Another array access performance issue. #13938

Open IntegratedQuantum opened 1 year ago

IntegratedQuantum commented 1 year ago

Zig Version

0.11.0-dev.753+331861161

Steps to Reproduce and Observed Behavior

I thought this was covered by #12215, but now that it is fixed I still get terrible performance. Here is a simple benchmark:

const std = @import("std");

const len = 32*32*32;

fn getIndex(i: u16) u16 {
    return i;
}

pub const Chunk = struct {
    blocks: [len]u16 = undefined,
};

pub noinline fn regenerateMainMesh(chunk: *Chunk) u32 {
    var sum: u32 = 0;
    var i: u16 = 0;
    while(i < len) : (i += 1) {
        sum += chunk.blocks[getIndex(i)]; // ← workaround: (&chunk.blocks)[...]
    }
    return sum;
}

pub fn main() void {
    var chunk: Chunk = Chunk{};
    for(chunk.blocks) |*block, i| {
        block.* = @intCast(u16, i);
    }
    const start = std.time.nanoTimestamp();
    const sum = regenerateMainMesh(&chunk);
    const end = std.time.nanoTimestamp();
    std.log.err("Time: {} Sum: {}", .{end - start, sum});
}

Even in release-fast the performance is terrible:

$ zig run test.zig -OReleaseFast
error: Time: 104980842 Sum: 536854528

[godbolt](https://godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:2,lang:zig,selection:(endColumn:31,endLineNumber:17,positionColumn:31,positionLineNumber:17,selectionStartColumn:31,selectionStartLineNumber:17,startColumn:31,startLineNumber:17),source:'const+std+%3D+@import(%22std%22)%3B%0A%0Aconst+len+%3D+32*32*32%3B%0A%0Afn+getIndex(i:+u16)+u16+%7B%0A%09return+i%3B%0A%7D%0A%0Apub+const+Chunk+%3D+struct+%7B%0A%09blocks:+%5Blen%5Du16+%3D+undefined,%0A%7D%3B%0A%0Apub+noinline+fn+regenerateMainMesh(chunk:+*Chunk)+u32+%7B%0A%09var+sum:+u32+%3D+0%3B%0A%09var+i:+u16+%3D+0%3B%0A%09while(i+%3C+len)+:+(i+%2B%3D+1)+%7B%0A%09%09sum+%2B%3D+chunk.blocks%5BgetIndex(i)%5D%3B+//+%E2%86%90+workaround:+(%26chunk.blocks)%5B...%5D%0A%09%7D%0A%09return+sum%3B%0A%7D%0A%0Apub+fn+main()+void+%7B%0A%09var+chunk:+Chunk+%3D+Chunk%7B%7D%3B%0A%09for(chunk.blocks)+%7C*block,+i%7C+%7B%0A%09%09block.*+%3D+@intCast(u16,+i)%3B%0A%09%7D%0A%09const+start+%3D+std.time.nanoTimestamp()%3B%0A%09const+sum+%3D+regenerateMainMesh(%26chunk)%3B%0A%09const+end+%3D+std.time.nanoTimestamp()%3B%0A%09std.log.err(%22Time:+%7B%7D+Sum:+%7B%7D%22,+.%7Bend+-+start,+sum%7D)%3B%0A%7D'),l:'5',n:'0',o:'Zig+source+%232',t:'0')),k:48.250460405156545,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,deviceViewOpen:'1',filters:(b:'0',binary:'1',commentOnly:'0',demangle:'0',directives:'0',execute:'0',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:zig,libs:!(),options:'-OReleaseFast',selection:(endColumn:9,endLineNumber:479,positionColumn:9,positionLineNumber:479,selectionStartColumn:9,selectionStartLineNumber:479,startColumn:9,startLineNumber:479),source:2),l:'5',n:'0',o:'+zig+trunk+(Editor+%232)',t:'0')),header:(),k:29.233691411637672,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:output,i:(compilerName:'zig+0.9.0',editorid:2,fontScale:14,fontUsePx:'0',j:1,wrap:'1'),l:'5',n:'0',o:'Output+of+zig+trunk+(Compiler+%231)',t:'0')),k:22.51584818320578
3,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4) reveals that, like in #12215, there is a memcpy when accessing the array.

Expected Behavior

When applying the workaround

- sum += chunk.blocks[getIndex(i)]; // ← workaround: (&chunk.blocks)[...]
+ sum += (&chunk.blocks)[getIndex(i)]; // ← workaround: (&chunk.blocks)[...]

the performance is significantly better:

$ zig run test.zig -OReleaseFast
error: Time: 4188 Sum: 536854528
shwqf commented 1 year ago

I guess .field_ptr should be generated in fieldAccess in this case.

working on it.

andrewrk commented 9 months ago

Fix reverted in f93a36c091a151ed02602d2f330f7206eb9f95a3

zeroZshadow commented 5 months ago

I've been running into this quite often on platforms where llvm isn't great at optimizing (powerpc).

The following snippet worked great as a test example to show this bug both happening and not happening, even though all examples index into the array.

const std = @import("std");
const E = enum(u8){
  a,
  b,
  c,
  d,
};

var array: std.enums.EnumArray(E, u32) = std.enums.EnumArray(E, u32).initFill(0);

noinline fn handle(id: u32) u32 {
    // Causes odd stack pushing and popping with 4 or less enum members, but optimized out the memcpy
    // LLVM is unable to optimize out the memcopy when the enum has more than 4 members.
    return array.get(@enumFromInt(id));

    // These both cause correct code generation, and do not generate a call to memcpy in llvm ir
    //return array.values[id];
    //return array.getPtrConst(@enumFromInt(id)).*;
}

export fn square(id: u32) u32 {
    array.set(.a, 5);
    array.set(.c, 8);
    return handle(id);
}

I'm compiling this snippet with -O ReleaseSmall -target powerpc-freestanding-eabi to see the issue easily. I hope it is of some use.

cdemirer commented 3 months ago

A good explanation can be found here: https://github.com/ziglang/zig/issues/17580#issuecomment-1951467750

Summary: When an array is indexed, the expression on the left (the array, which is a value) is evaluated before (by design) the expression on the right (the function that returns the index). If there's any possibility that the function call modifies the array (which is the default assumption), we have no choice but to make a copy of the array before calling that function.


    return array.get(@enumFromInt(id));

~I guess special cases (as in #13233) can be made for builtin functions like this.~ (Seems that's not the case as the index expression is the one at here and not the builtin one here)

For the general case, AFAIK the possible solutions are: 1) Redefine the evaluation order of the index expression and the indexed expression. 2) Somehow optimize out the copy...