dart-lang / sdk

The Dart SDK, including the VM, JS and Wasm compilers, analysis, core libraries, and more.
https://dart.dev
BSD 3-Clause "New" or "Revised" License
10.23k stars 1.58k forks source link

[vm/x64] Lack of immediate operands in binary operations #48222

Open rakudrama opened 2 years ago

rakudrama commented 2 years ago

I'm not sure why a Uint32 representation is chosen in examples like this. BinaryUint32Op is not as fleshed-out as BinaryInt64Op, and does not attempt to use immediate operands. If I grep the output of precompiler2 --disassemble for dart2js, there are 2169 occurrences of BinaryUint32Op(& vs 300 BinaryInt64Op(&, so quite a lot of masking instructions are using unnecessary temporaries.

Code for optimized function 'package:kernel/ast.dart_TreeNode_TreeNode.' (Constructor) {
        ;; B0
        ;; B1
        ;; Enter frame
        ;; PrologueOffset = 0
0x7ffa82918000    55                     push rbp
0x7ffa82918001    4889e5                 movq rbp,rsp
        ;; ParallelMove rdx <- C, rcx <- C
0x7ffa82918004    ba01000000             movl rdx,1
0x7ffa82918009    b9ffffff3f             movl rcx,0x3fffffff

๐Ÿ ‰The above two constants are used once each and could be immediate operands

        ;; ParallelMove rbx <- S+2
0x7ffa8291800e    488b5d10               movq rbx,[rbp+0x10]
        ;; StoreInstanceField(v2 . fileOffset = v3, NoStoreBarrier)
0x7ffa82918012    48c74317feffffff       movq [rbx+0x17],-2
        ;; v4 <- LoadStaticField(_hashCounter@27479794) T{int?}
0x7ffa8291801a    498bb680000000         movq rsi,[thr+0x80]
0x7ffa82918021    488bb6200d0000         movq rsi,[rsi+0xd20]
        ;; Line 143 in 'package:kernel/ast.dart_TreeNode_TreeNode.':
             final int hashCode = _hashCounter = (_hashCounter + 1) & 0x3fffffff;
        ;; CheckNull:10(v4, NoSuchMethodError) [-9223372036854775808, 9223372036854775807] T{int}
0x7ffa82918028    493bb6c8000000         cmpq rsi,[thr+0xc8]   null
0x7ffa8291802f    0f8436000000           jz 0x00007ffa8291806b
        ;; ParallelMove rsi <- rsi
        ;; v23 <- UnboxUint32([tr], [non-speculative], v4 T{int}) [0, 4294967295] T{int}
0x7ffa82918035    48d1fe                 sarq rsi,1
0x7ffa82918038    7308                   jnc 0x00007ffa82918042
0x7ffa8291803a    488b347508000000       movq rsi,[rsi*2+0x8]
        ;; ParallelMove rsi <- rsi
        ;; v7 <- BinaryUint32Op(+ [tr], v23 T{int}, v35) [0, 4294967295] T{_Smi}
0x7ffa82918042    03f2                   addl rsi,rdx

๐Ÿ ‰ here

        ;; ParallelMove rsi <- rsi
        ;; v10 <- BinaryUint32Op(& [tr], v7, v36) [0, 1073741823] T{_Smi}
0x7ffa82918044    23f1                   andl rsi,rcx

๐Ÿ ‰ and here

        ;; v25 <- BoxUint32(v10) [0, 1073741823] T{_Smi}
0x7ffa82918046    8bc6                   movl rax,rsi
0x7ffa82918048    4803c0                 addq rax,rax
        ;; ParallelMove rax <- rax
        ;; Line 143 in 'package:kernel/ast.dart_TreeNode_TreeNode.':
             final int hashCode = _hashCounter = (_hashCounter + 1) & 0x3fffffff;
        ;; StoreStaticField(_hashCounter@27479794, v25)
0x7ffa8291804b    498b8e80000000         movq rcx,[thr+0x80]
0x7ffa82918052    488981200d0000         movq [rcx+0xd20],rax
        ;; ParallelMove rsi <- rsi
        ;; v33 <- IntConverter(uint32->int64, v10)
0x7ffa82918059    8bf6                   movl rsi,rsi

๐Ÿ ‰ This seems unnecessary as andl rsi, rcx clears the high part

        ;; StoreInstanceField(v2 . hashCode = v33 T{_Smi})
        ;; NativeUnboxedStoreInstanceFieldInstr
0x7ffa8291805b    48897307               movq [rbx+0x7],rsi
        ;; ParallelMove rax <- C
0x7ffa8291805f    498b86c8000000         movq rax,[thr+0xc8]   null
        ;; Return:18(v0)
0x7ffa82918066    4889ec                 movq rsp,rbp
0x7ffa82918069    5d                     pop rbp
0x7ffa8291806a    c3                     ret
        ;; slow path check null (nsm) operation
0x7ffa8291806b    4d8b677f               movq r12,[pp+0x7f]
0x7ffa8291806f    41ff542407             call [r12+0x7]
mraleph commented 2 years ago

I have noticed this before when looking at another issue. The actual fix is relatively simple (taken from another patch I made: https://gist.github.com/mraleph/30183ef67352c6d5aec604d82f34e1be)

@@ -7100,7 +7100,7 @@ LocationSummary* BinaryUint32OpInstr::MakeLocationSummary(Zone* zone,
   LocationSummary* summary = new (zone)
       LocationSummary(zone, kNumInputs, kNumTemps, LocationSummary::kNoCall);
   summary->set_in(0, Location::RequiresRegister());
-  summary->set_in(1, Location::RequiresRegister());
+  summary->set_in(1, LocationRegisterOrConstant(right()));
   summary->set_out(0, Location::SameAsFirstInput());
   return summary;
 }
@@ -7136,6 +7136,15 @@ static void EmitIntegerArithmetic(FlowGraphCompiler* compiler,

 void BinaryUint32OpInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
   Register left = locs()->in(0).reg();
+  if (locs()->in(1).IsConstant()) {
+    int64_t value;
+    const bool ok = compiler::HasIntegerValue(locs()->in(1).constant(), &value);
+    RELEASE_ASSERT(ok);
+    EmitIntegerArithmetic(compiler, op_kind(), left,
+                        compiler::Immediate(value));
+    return;
+  }
+
   Register right = locs()->in(1).reg();
   Register out = locs()->out(0).reg();
   ASSERT(out == left);
rakudrama commented 2 years ago

@mraleph If I try something like that, I get immediate operands, but there are still a lot of unnecessary moves:

Code for optimized function 'package:kernel/ast.dart_Procedure_get_isNonNullableByDefault' (GetterFunction) {
        ;; B0
        ;; B1
        ;; ParallelMove rcx <- S+1
0x7f7bb9e9ed30    488b4c2408             movq rcx,[rsp+0x8]
        ;; v3 <- LoadField(v2 . flags) [-9223372036854775808, 9223372036854775807] T{int}
        ;; NativeUnboxedLoadFieldInstr
0x7f7bb9e9ed35    488b515f               movq rdx,[rcx+0x5f]
        ;; ParallelMove rdx <- rdx
        ;; v21 <- IntConverter(int64->uint32[tr], v3)
0x7f7bb9e9ed39    8bd2                   movl rdx,rdx
        ;; ParallelMove rdx <- rdx
        ;; v6 <- BinaryUint32Op(& [tr], v21 T{int}, v25) [0, 64] T{_Smi}
0x7f7bb9e9ed3b    83e240                 andl rdx,0x40
        ;; ParallelMove rdx <- rdx
        ;; v23 <- IntConverter(uint32->int64, v6)
0x7f7bb9e9ed3e    8bd2                   movl rdx,rdx
        ;; v9 <- EqualityCompare(v23 T{_Smi} != v18) T{bool}
0x7f7bb9e9ed40    4883fa00               cmpq rdx,0
0x7f7bb9e9ed44    7509                   jnz 0x00007f7bb9e9ed4f
0x7f7bb9e9ed46    498b86d8000000         movq rax,[thr+0xd8]   false
0x7f7bb9e9ed4d    eb07                   jmp 0x00007f7bb9e9ed56
0x7f7bb9e9ed4f    498b86d0000000         movq rax,[thr+0xd0]   true
        ;; ParallelMove rax <- rax
        ;; Return:22(v9)
0x7f7bb9e9ed56    c3                     ret

What would it take to replace the movl/andl/movl/cmpq with testb rdx,0x40?

I see there is code to generate bit-test patterns, but it is not happening here - RecognizeTestPattern in il.cc.

mraleph commented 2 years ago

We should be able to remove redundant moves caused by IntCoverter if we massage register allocator a bit, and permit it to coalesce input and output allocations for the converters that are no-ops.

We can also extend RecognizeTestPattern to handle more than just smi operations.