8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c causes more stack usage on PowerPC

Quuxplusone commented 3 years ago


Bugzilla Link	PR49610
Status	RESOLVED FIXED
Importance	P enhancement
Reported by	Nathan Chancellor (natechancellor@gmail.com)
Reported on	2021-03-16 16:00:09 -0700
Last modified on	2021-07-07 02:49:06 -0700
Version	trunk
Hardware	PC Linux
CC	arnd@linaro.org, craig.topper@gmail.com, fwage73@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, ndesaulniers@google.com, nemanja.i.ibm@gmail.com, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks	PR4068
Blocked by
See also

Forgive me if I am filing this in the wrong section or tagging the wrong
people, this is just based on my investigation and triage.

After https://github.com/llvm/llvm-
project/commit/8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c, there was a balloon in
stack usage in a particular function in the PowerPC KVM section of the Linux
kernel, which causes a build error because of -Werror.

Prior to 8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c (with Linux at 5.12-rc3 for
https://git.kernel.org/linus/97e4910232fa1f81e806aa60c25a0450276d99a2):

# legacy pass manager
$ make -skj"$(nproc)" ARCH=powerpc CC=clang CROSS_COMPILE=powerpc64-linux-gnu-
KCFLAGS="-fno-experimental-new-pass-manager -Wframe-larger-than=1024"
O=build/ppc64 distclean pseries_defconfig disable-werror.config
arch/powerpc/kvm/book3s_hv_nested.o
arch/powerpc/kvm/book3s_hv_nested.c:264:6: warning: stack frame size of 1728
bytes in function 'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
     ^
1 warning generated.

# new pass manager
$ make -skj"$(nproc)" ARCH=powerpc CC=clang CROSS_COMPILE=powerpc64-linux-gnu-
KCFLAGS="-fexperimental-new-pass-manager -Wframe-larger-than=1024"
O=build/ppc64 distclean pseries_defconfig disable-werror.config
arch/powerpc/kvm/book3s_hv_nested.o
arch/powerpc/kvm/book3s_hv_nested.c:264:6: warning: stack frame size of 1712
bytes in function 'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
     ^
1 warning generated.

After 8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c:

# legacy pass manager
$ make -skj"$(nproc)" ARCH=powerpc CC=clang CROSS_COMPILE=powerpc64-linux-gnu-
KCFLAGS="-fno-experimental-new-pass-manager -Wframe-larger-than=1024"
O=build/ppc64 distclean pseries_defconfig disable-werror.config
arch/powerpc/kvm/book3s_hv_nested.o
arch/powerpc/kvm/book3s_hv_nested.c:264:6: warning: stack frame size of 2480
bytes in function 'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
     ^
1 warning generated.

# new pass manager
$ make -skj"$(nproc)" ARCH=powerpc CC=clang CROSS_COMPILE=powerpc64-linux-gnu-
KCFLAGS="-fexperimental-new-pass-manager -Wframe-larger-than=1024"
O=build/ppc64 distclean pseries_defconfig disable-werror.config
arch/powerpc/kvm/book3s_hv_nested.o
arch/powerpc/kvm/book3s_hv_nested.c:264:6: warning: stack frame size of 2048
bytes in function 'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
     ^
1 warning generated.

For reference, GCC 10.2.0:

$ make -skj"$(nproc)" ARCH=powerpc ARCH=powerpc CROSS_COMPILE=powerpc64-linux-
KCFLAGS=-Wframe-larger-than=1024 O=build/ppc64 distclean pseries_defconfig
disable-werror.config arch/powerpc/kvm/book3s_hv_nested.o
arch/powerpc/kvm/book3s_hv_nested.c: In function 'kvmhv_enter_nested_guest':
arch/powerpc/kvm/book3s_hv_nested.c:387:1: warning: the frame size of 1280
bytes is larger than 1024 bytes [-Wframe-larger-than=]
  387 | }
      | ^

I tried to reduce this down with cvise and came up with:

struct hv_guest_state {
  int vcpu_token;
  long pcr;
  long amor;
  long dpdes;
  long hfscr;
  long tb_offset;
  long srr0;
  long srr1;
  long sprg[4];
  long pidr;
  long cfar;
  long ppr;
  long dawr1;
  long dawrx1;
} kvmhv_write_guest_state_and_regs_l2_hv;
struct pt_regs {
  struct {
    struct {
      long gpr[32];
      long nip;
      long msr;
      long orig_gpr3;
      long ctr;
      long link;
      long xer;
      long ccr;
      long softe;
      long trap;
      long dar;
      long dsisr;
      long result;
    };
    long __pad[4];
  };
} __srcu_read_unlock();
struct kvm_vcpu_arch {
  struct pt_regs regs;
  struct kvmppc_vcore *vcore;
  int trap;
};
struct kvmppc_vcore {
  long tb_offset;
  long pcr;
};
struct kvm_nested_guest {
  long l1_gr_to_hr;
};
int kvm_vcpu_read_guest();
static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) {
  struct kvm_nested_guest __trans_tmp_14 = *gp;
  __srcu_read_unlock(__trans_tmp_14);
}
void byteswap_pt_regs(struct pt_regs *regs) {
  unsigned long *addr = (long *)regs;
  for (; addr < (unsigned long *)(regs + 1); addr++)
    *addr = __builtin_bswap64(*addr);
}
int kvmhv_read_guest_state_and_regs(struct hv_guest_state *l2_hv,
                                    struct pt_regs *l2_regs) {
  return kvm_vcpu_read_guest(l2_hv) || kvm_vcpu_read_guest(l2_regs);
}
long kvmhv_enter_nested_guest(struct kvm_vcpu_arch *vcpu) {
  struct kvm_nested_guest *l2;
  struct pt_regs l2_regs, saved_l1_regs;
  struct hv_guest_state l2_hv;
  struct kvmppc_vcore *vc = vcpu->vcore;
  long regs_ptr = kvmhv_read_guest_state_and_regs(&l2_hv, &l2_regs);
  byteswap_pt_regs(&l2_regs);
  if (!l2->l1_gr_to_hr)
    kvmhv_update_ptbl_cache(l2);
  saved_l1_regs = vcpu->regs;
  vc->tb_offset += l2_hv.tb_offset;
  vcpu->regs = saved_l1_regs;
  vc = vcpu->vcore;
  vc->pcr = 0;
  struct hv_guest_state *hr = 0;
  hr->pidr = hr->dawr1 = __builtin_bswap64(hr->dawr1);
  hr->dawrx1 = __builtin_bswap64(hr->dawrx1);
  byteswap_pt_regs(&l2_regs);
  kvm_vcpu_read_guest(kvmhv_write_guest_state_and_regs_l2_hv, regs_ptr);
  return vcpu->trap;
}

which produces the same result: https://godbolt.org/z/754Men.

Prior to 8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c

# LPM
$ clang -fno-experimental-new-pass-manager --target=powerpc64-linux-gnu -O2 -
Wno-pointer-sign -Wframe-larger-than=512 -c -o /dev/null book3s_hv_nested.i
book3s_hv_nested.i:63:6: warning: stack frame size of 1136 bytes in function
'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu_arch *vcpu) {
     ^
1 warning generated.

# NPM
$ clang -fexperimental-new-pass-manager --target=powerpc64-linux-gnu -O2 -Wno-
pointer-sign -Wframe-larger-than=512 -c -o /dev/null book3s_hv_nested.i
book3s_hv_nested.i:63:6: warning: stack frame size of 640 bytes in function
'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu_arch *vcpu) {
     ^
1 warning generated.

After 8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c:

# LPM
$ clang -fno-experimental-new-pass-manager --target=powerpc64-linux-gnu -O2 -
Wno-pointer-sign -Wframe-larger-than=512 -c -o /dev/null book3s_hv_nested.i
book3s_hv_nested.i:54:6: warning: stack frame size of 944 bytes in function
'byteswap_pt_regs' [-Wframe-larger-than=]
void byteswap_pt_regs(struct pt_regs *regs) {
     ^
book3s_hv_nested.i:63:6: warning: stack frame size of 2064 bytes in function
'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu_arch *vcpu) {
     ^
2 warnings generated.

# NPM
$ clang -fexperimental-new-pass-manager --target=powerpc64-linux-gnu -O2 -Wno-
pointer-sign -Wframe-larger-than=512 -c -o /dev/null book3s_hv_nested.i
book3s_hv_nested.i:54:6: warning: stack frame size of 944 bytes in function
'byteswap_pt_regs' [-Wframe-larger-than=]
void byteswap_pt_regs(struct pt_regs *regs) {
     ^
book3s_hv_nested.i:63:6: warning: stack frame size of 1520 bytes in function
'kvmhv_enter_nested_guest' [-Wframe-larger-than=]
long kvmhv_enter_nested_guest(struct kvm_vcpu_arch *vcpu) {
     ^
2 warnings generated.

Arnd Bergmann points out that this is most likely because PowerPC does not
appear to be using optimal assembly for byte swapping (or at least would help),
which I believe should be visible from the Godbolt link above.

Quuxplusone commented 3 years ago

We're compiling for a (relatively) legacy system (no "stdbrx" -
https://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20130325/169706.html
)?

Let's reduce to the simplest load/swap/store 64-bit sequence:

define void @bs(i64* %p) {
  %x = load i64, i64* %p, align 8
  %b = call i64 @llvm.bswap.i64(i64 %x)
  store i64 %b, i64* %p, align 8
  ret void
}
declare i64 @llvm.bswap.i64(i64) #2

-----------------------------------------------------------------------

$ llc -o - bswap.ll -mtriple=powerpc64
    ld 4, 0(3)
    rotldi  5, 4, 16
    rotldi  6, 4, 8
    rldimi 6, 5, 8, 48
    rotldi  5, 4, 24
    rldimi 6, 5, 16, 40
    rotldi  5, 4, 32
    rldimi 6, 5, 24, 32
    rotldi  5, 4, 48
    rldimi 6, 5, 40, 16
    rotldi  5, 4, 56
    rldimi 6, 5, 48, 8
    rldimi 6, 4, 56, 0
    std 6, 0(3)

-----------------------------------------------------------------------

There might be a better trick, but we could at least split this into 2 lwz + 2
stwbrx?

Quuxplusone commented 3 years ago

Two other alternatives would be the following...

Single load, two swapping stores:
ld 4, 0(3)
stwbrx 4, 0(6)
srdi 4, 4, 32
stwbrx 4, 4(6)

Two swapping loads, merge, single store:
lwbrx 5, 0(3)
lwbrx 4, 4(3)
rldimi 5, 4, 32, 0
std 5, 0(6)

Quuxplusone commented 3 years ago

(In reply to Nemanja Ivanovic from comment #2)
> Two other alternatives would be the following...
>
> Single load, two swapping stores:
> ld 4, 0(3)
> stwbrx 4, 0(6)
> srdi 4, 4, 32
> stwbrx 4, 4(6)
>
> Two swapping loads, merge, single store:
> lwbrx 5, 0(3)
> lwbrx 4, 4(3)
> rldimi 5, 4, 32, 0
> std 5, 0(6)

Yes - that would reduce the scope of the problem. We just need to split one of
these basic patterns, and that hopefully solves the motivating example:

define void @split_store(i64 %x, i64* %p) {
  %b = call i64 @llvm.bswap.i64(i64 %x)
  store i64 %b, i64* %p, align 8
  ret void
}

define i64 @split_load(i64* %p) {
  %x = load i64, i64* %p, align 8
  %b = call i64 @llvm.bswap.i64(i64 %x)
  ret i64 %b
}

...but we may want to do both to cover all bases.

cc'ing Craig and Simon - does this look like something that belongs in generic
combiner or legalization (other targets might have the same issue)?

Quuxplusone commented 3 years ago

FWIW, I think it makes sense for this to be in the generic combiner. If BSWAP is legal for i32 but not for i64, we can emit

2 x (extract_element + bswap) + build_pair

Presumably the combiner will already do the right thing when the input is a load and/or the only user is a store.

Quuxplusone commented 3 years ago

(In reply to Sanjay Patel from comment #3)
> cc'ing Craig and Simon - does this look like something that belongs in
> generic combiner or legalization (other targets might have the same issue)?

Yes I think so - IIRC we do similar things in DAGCombine load/store combining
already.

Quuxplusone commented 3 years ago

(In reply to Nemanja Ivanovic from comment #4)
> FWIW, I think it makes sense for this to be in the generic combiner. If
> BSWAP is legal for i32 but not for i64, we can emit
>
> 2 x (extract_element + bswap) + build_pair
>
> Presumably the combiner will already do the right thing when the input is a
> load and/or the only user is a store.

I was hoping so, but it didn't seem to work. I just hacked this into
DAGCombiner::visitBSWAP():

  if (VT == MVT::i64) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    SDLoc DL(N);
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, N0, DAG.getIntPtrConstant(1, DL));
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, N0, DAG.getIntPtrConstant(0, DL));
    SDValue SwapHi = DAG.getNode(ISD::BSWAP, DL, HalfVT, Hi);
    SDValue SwapLo = DAG.getNode(ISD::BSWAP, DL, HalfVT, Lo);
    return DAG.getNode(ISD::BUILD_PAIR, DL, VT, SwapLo, SwapHi);
  }

And neither the load nor store patterns appear to get merged with
extract/build_pair with powerpc64 target, so we end up with the same asm.

Quuxplusone commented 3 years ago

Bumping, we're probably going to have to spin down our CI coverage of PPC over this. :(

https://github.com/ClangBuiltLinux/continuous-integration2/pull/111

Quuxplusone commented 3 years ago

I wouldn't expect that DAG combine hack to give the same asm. The original asm had an expanded 64-bit swap. I assume that DAG combine would give two expanded 32 bit swaps.

Assuming the BUILD_PAIR becomes a shift/or sequence, there is a combine for split stores, but it requires isMultiStoresCheaperThanBitsMerge to return true.

Quuxplusone commented 3 years ago

(In reply to Craig Topper from comment #8)
> I wouldn't expect that DAG combine hack to give the same asm. The original
> asm had an expanded 64-bit swap. I assume that DAG combine would give two
> expanded 32 bit swaps.

Yes, you're right. I saw the string of rotate opcodes, but they're not the same
instructions.

> Assuming the BUILD_PAIR becomes a shift/or sequence, there is a combine for
> split stores, but it requires isMultiStoresCheaperThanBitsMerge to return
> true.

I think we would need to change DAGCombiner::splitMergedValStore() to recognize
a BUILD_PAIR input. Otherwise, the smaller bswaps are going to get expanded
before we split the store.

I'm seeing a lot of unexpected diffs across several targets with that change
hacked in. It seems like there are gaps in BUILD_PAIR and/or EXTRACT_ELEMENT
lowering because I see crashes too.

This is going to take multiple changes and at least some of those are specific
to PowerPC lowering, so I think we should view this as a PowerPC-specific bug
and then see if it is worth generalizing.

Nemanja, do you want to take this?

Quuxplusone commented 3 years ago

Nemanja, this came up again on LKML:
https://lore.kernel.org/lkml/CAK8P3a37Pj24WqSvMKnwSS74W+WMAsmk+-kXX5qE7fCZ=QBL0g@mail.gmail.com/

Quuxplusone commented 3 years ago

OK, please review the fix in https://reviews.llvm.org/D104836

Sorry for the delay. I realize that the target-independent combine doesn't work because we won't have the bswap(load) or store(bswap) idiom due to the intervening EXTRACT_VALUE and BUILD_PAIR respectively.

Quuxplusone commented 3 years ago

The fix is actually in two commits since I neglected to account for big endian
targets in the initial commit.

Initial commit:
https://reviews.llvm.org/rG0464586ac515e8cfebe4c7615387fd625c8869f5

Follow-up fix for BE:
https://reviews.llvm.org/rGdcccb2f59401

Quuxplusone commented 3 years ago

(In reply to Nemanja Ivanovic from comment #12)
> The fix is actually in two commits since I neglected to account for big
> endian targets in the initial commit.
>
> Initial commit:
> https://reviews.llvm.org/rG0464586ac515e8cfebe4c7615387fd625c8869f5
>
> Follow-up fix for BE:
> https://reviews.llvm.org/rGdcccb2f59401

Ugh, unfortunately this caused failures in the bootstrap build on BE Linux and
I had to disable it temporarily. I have now fixed it in yet another follow-up
commit. So this also needs the following two commits:

https://reviews.llvm.org/rG4e22c7265d86 - disables the change
https://reviews.llvm.org/rG4e22c7265d86 - fixes and re-enables the change

Quuxplusone / LLVMBugzillaTest

8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c causes more stack usage on PowerPC #48579