Open 3989010f-bff9-4a49-8c51-8090c17aa430 opened 14 years ago
> I think vector demanded elements analysis should let us transform the vector icmp into a scalar icmp.
To be clear, if our IR canonicalization was working perfectly, the SSE-coded version of this program would be reduced to exactly the same IR as the original scalar code.
That would of course be bad for this particular case, but that's how IR is supposed to work. :)
> %4 = bitcast <4 x float> %vecinit3.i to <4 x i32>
> %5 = bitcast <2 x i64> %or.i43 to <4 x i32>
> %6 = select <4 x i1> %cmp.i, <4 x i32> %4, <4 x i32> %5
> %7 = bitcast <4 x i32> %6 to <4 x float>
>
> This is similar to bug 27925: we should transform a select/binop surrounded by bitcasts if one of the leading bitcasts is the inverse of the trailing bitcast.
https://reviews.llvm.org/rL288584
This allows us to transform the vector select into a scalar select, and that exposed another bug related to icmp+extractelement:
define float @rbetter(float %xx) local_unnamed_addr #0 {
entry:
%vecinit.i = insertelement <4 x float> undef, float %xx, i32 0
%vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%0 = bitcast <4 x float> %vecinit3.i to <2 x i64>
%and.i46 = and <2 x i64> %0, <i64 9223372034707292159, i64 9223372034707292159>
%1 = bitcast <2 x i64> %and.i46 to <4 x float>
%vecext1.i44 = extractelement <4 x float> %1, i32 0
%add.i = fadd float %vecext1.i44, 8.388608e+06
%sub.i = fadd float %add.i, -8.388608e+06
%vecins.i = insertelement <4 x float> %1, float %sub.i, i32 0
%xor.i = xor <2 x i64> %0, %and.i46
%2 = bitcast <4 x float> %vecins.i to <2 x i64>
%or.i43 = or <2 x i64> %2, %xor.i
%3 = bitcast <2 x i64> %xor.i to <4 x i32>
%cmp.i = icmp sgt <4 x i32> %3, <i32 1258291199, i32 1258291199, i32 1258291199, i32 1258291199>
%4 = bitcast <2 x i64> %or.i43 to <4 x float>
%cmp.i.elt = extractelement <4 x i1> %cmp.i, i32 0
%.elt = extractelement <4 x float> %4, i32 0
%vecext.i = select i1 %cmp.i.elt, float %xx, float %.elt
ret float %vecext.i
}
I think vector demanded elements analysis should let us transform the vector icmp into a scalar icmp.
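For this example, only lane 0 of %cmp.i is demanded (it feeds a single extractelement), so the compare could be done as a scalar. A minimal InstCombine-style sketch of that scalarization (the helper name is invented; the eventual in-tree fix may look different):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// extractelement (icmp pred V, W), idx
//   --> icmp pred (extractelement V, idx), (extractelement W, idx)
// Only lane idx of the vector compare is demanded, so compare scalars instead.
static Value *scalarizeExtractedCmp(ExtractElementInst &EE, IRBuilder<> &B) {
  auto *Cmp = dyn_cast<ICmpInst>(EE.getVectorOperand());
  if (!Cmp || !Cmp->hasOneUse() || !isa<ConstantInt>(EE.getIndexOperand()))
    return nullptr;
  Value *Idx = EE.getIndexOperand();
  Value *L = B.CreateExtractElement(Cmp->getOperand(0), Idx);
  Value *R = B.CreateExtractElement(Cmp->getOperand(1), Idx);
  return B.CreateICmp(Cmp->getPredicate(), L, R);
}

(Extracts of constant vector operands then fold to scalar constants, leaving a plain scalar icmp.)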
After https://reviews.llvm.org/rL287171:
_r: ## @r
.cfi_startproc
## BB#0: ## %entry
movd %xmm0, %eax
movl %eax, %ecx
andl $2147483647, %ecx ## imm = 0x7FFFFFFF
cmpl $1258291199, %ecx ## imm = 0x4AFFFFFF
ja LBB0_2
## BB#1: ## %cond.false
movd %ecx, %xmm1
addss LCPI0_0(%rip), %xmm1
addss LCPI0_1(%rip), %xmm1
xorl %ecx, %eax
movd %eax, %xmm0
orps %xmm1, %xmm0
LBB0_2: ## %cond.end
retq
We're still missing a fold to use an FP 'and' op and FP compare. If we solve that, we would appear to hit the basic-block-at-a-time limitation of the DAG...except that we can convert the IR to use a 'select' to create a single basic block, and it still doesn't solve the problem because we split it back up again:
define float @pr6137(float %x) {
%t0 = bitcast float %x to i32
%and = and i32 %t0, 2147483647
%cmp = icmp ugt i32 %and, 1258291199
%t1 = bitcast i32 %and to float
%add = fadd float %t1, 8.388608e+06
%sub = fadd float %add, -8.388608e+06
%t2 = bitcast float %sub to i32
%xor = xor i32 %and, %t0
%or = or i32 %t2, %xor
%t3 = bitcast i32 %or to float
%sel = select i1 %cmp, float %x, float %t3
ret float %sel
}
_pr6137: ## @pr6137
.cfi_startproc
## BB#0:
movd %xmm0, %ecx
movl %ecx, %eax
andl $2147483647, %eax ## imm = 0x7FFFFFFF
cmpl $1258291199, %eax ## imm = 0x4AFFFFFF
ja LBB1_2
## BB#1:
xorl %ecx, %eax
andps LCPI1_0(%rip), %xmm0
addss LCPI1_1(%rip), %xmm0
addss LCPI1_2(%rip), %xmm0
movd %eax, %xmm1
orps %xmm1, %xmm0
LBB1_2:
retq
We need to combine a pattern like: bitcast(and(bitcast(X), Y)) --> fand(X, bitcast(Y))
Note that this fold is very similar to https://reviews.llvm.org/D26641, except that we need a version that corresponds to the x86-specific FP-logic nodes (there are no FP bitwise-logic instructions/intrinsics in IR).
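A rough sketch of what such a combine could look like on the x86 side (illustrative only: the function name is invented, and this relies on backend-internal nodes such as X86ISD::FAND, so it is not a standalone patch):

// bitcast (and (bitcast X), Y) --> X86ISD::FAND X, (bitcast Y)
// when the outer bitcast returns to the FP type that X started with, so the
// logic op stays in the FP domain and the value never crosses into GPRs.
static SDValue combineBitcastOfAnd(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue And = N->getOperand(0);
  if (!VT.isFloatingPoint() || And.getOpcode() != ISD::AND || !And.hasOneUse())
    return SDValue();
  SDValue X = And.getOperand(0), Y = And.getOperand(1);
  if (X.getOpcode() != ISD::BITCAST || X.getOperand(0).getValueType() != VT)
    return SDValue();
  return DAG.getNode(X86ISD::FAND, SDLoc(N), VT,
                     X.getOperand(0), DAG.getBitcast(VT, Y));
}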
I think we've now solved the x86 single-basic-block patterns. Eg: https://llvm.org/bugs/show_bug.cgi?id=22428#c8
...but I haven't checked closely with this example.
I looked closer, and we have not solved the single-basic-block patterns. We need to combine a pattern like: bitcast(and(bitcast(X), Y)) --> fand(X, bitcast(Y))
But before that, we need this: https://reviews.llvm.org/D26712
%4 = bitcast <4 x float> %vecinit3.i to <4 x i32>
%5 = bitcast <2 x i64> %or.i43 to <4 x i32>
%6 = select <4 x i1> %cmp.i, <4 x i32> %4, <4 x i32> %5
%7 = bitcast <4 x i32> %6 to <4 x float>
This is similar to bug 27925: we should transform a select/binop surrounded by bitcasts if one of the leading bitcasts is the inverse of the trailing bitcast.
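InstCombine-style, the suggested fold might look roughly like this (a hypothetical helper, not an actual patch): perform the select in the trailing bitcast's type and move the one remaining bitcast onto the other operand.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// bitcast (select C, (bitcast X), Y') to Ty --> select C, X, (bitcast Y')
// when X already has type Ty, i.e. the leading bitcast inverts the trailing one.
static Value *foldSelectOfBitcasts(BitCastInst &Trailing, IRBuilder<> &B) {
  auto *Sel = dyn_cast<SelectInst>(Trailing.getOperand(0));
  if (!Sel || !Sel->hasOneUse())
    return nullptr;
  Type *Ty = Trailing.getType();
  auto *TrueBC = dyn_cast<BitCastInst>(Sel->getTrueValue());
  if (!TrueBC || TrueBC->getSrcTy() != Ty)
    return nullptr;
  Value *FalseV = B.CreateBitCast(Sel->getFalseValue(), Ty);
  return B.CreateSelect(Sel->getCondition(), TrueBC->getOperand(0), FalseV);
}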
For reference, here's the IR using r286768:
define float @r(float %x) local_unnamed_addr #0 {
entry:
%0 = bitcast float %x to i32
%and = and i32 %0, 2147483647
%cmp = icmp ugt i32 %and, 1258291199
br i1 %cmp, label %cond.end, label %cond.false
cond.false: ; preds = %entry
%1 = bitcast i32 %and to float
%add = fadd float %1, 8.388608e+06
%sub = fadd float %add, -8.388608e+06
%2 = bitcast float %sub to i32
%xor = xor i32 %and, %0
%or = or i32 %2, %xor
%3 = bitcast i32 %or to float
br label %cond.end
cond.end: ; preds = %entry, %cond.false
%cond = phi float [ %3, %cond.false ], [ %x, %entry ]
ret float %cond
}
And the IR for the hand-coded SSE:
define float @rbetter(float %xx) local_unnamed_addr #0 {
entry:
%vecinit.i = insertelement <4 x float> undef, float %xx, i32 0
%vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%0 = bitcast <4 x float> %vecinit3.i to <2 x i64>
%and.i46 = and <2 x i64> %0, <i64 9223372034707292159, i64 9223372034707292159>
%1 = bitcast <2 x i64> %and.i46 to <4 x float>
%vecext1.i44 = extractelement <4 x float> %1, i32 0
%add.i = fadd float %vecext1.i44, 8.388608e+06
%sub.i = fadd float %add.i, -8.388608e+06
%vecins.i = insertelement <4 x float> %1, float %sub.i, i32 0
%xor.i = xor <2 x i64> %0, %and.i46
%2 = bitcast <4 x float> %vecins.i to <2 x i64>
%or.i43 = or <2 x i64> %2, %xor.i
%3 = bitcast <2 x i64> %xor.i to <4 x i32>
%cmp.i = icmp sgt <4 x i32> %3, <i32 1258291199, i32 1258291199, i32 1258291199, i32 1258291199>
%4 = bitcast <4 x float> %vecinit3.i to <4 x i32>
%5 = bitcast <2 x i64> %or.i43 to <4 x i32>
%6 = select <4 x i1> %cmp.i, <4 x i32> %4, <4 x i32> %5
%7 = bitcast <4 x i32> %6 to <4 x float>
%vecext.i = extractelement <4 x float> %7, i32 0
ret float %vecext.i
}
If this was fabs():
%0 = bitcast float %x to i32
%and = and i32 %0, 2147483647
we might produce the optimal code...but I already got in trouble for that transform. :)
https://llvm.org/bugs/show_bug.cgi?id=24886#c4
So there has to be a target-specific solution, but as noted in comment 1, the basic-block limitation of the DAG may be interfering.
Side note: I think we're still missing at least one bitcast+select fold in IR based on the SSE code.
I have hit the same problem. I am surprised that this bug report is still open and the problem is still present in LLVM two years after the report.
I am using SSE-optimized 4x4 matrix multiplication in a loop, as described at http://fhtr.blogspot.com/2010/02/4x4-float-matrix-multiplication-using.html, and GCC -O3 produces code that is 2x faster than the latest Clang/LLVM 3.1. I believe this is because GCC does not move values back and forth between XMM and GP registers in every loop iteration.
Please see my little benchmark suite available at: https://github.com/nanoant/ssebench
Please run it with Clang: make TYPE=2 O=3 CC=clang
Look at the generated .s assembly file, then do the same with GCC: make TYPE=2 O=3 CC=gcc
You will see that GCC uses only XMM registers for the actual calculation, with a single GP register just for the loop index, while LLVM moves values back and forth between XMM and GP registers four times in every loop iteration!
FWIW, LLVM misses this optimization within a single block too. Multiple blocks, and an 'and' with multiple uses, do complicate this testcase further, though.
This may be because "r" has three basic blocks, and currently the code generator works one basic block at a time, which can lead to this kind of pointless operation. This is just a guess though.
Extended Description
For the following C code:
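(The C source did not survive into this copy of the report. Reconstructed from the @r/@pr6137 IR quoted in the comments above, it rounds to the nearest integer using the 2^23 magic-number trick; the helper names below are invented:)

#include <stdint.h>
#include <string.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, 4); return u; }
static float u2f(uint32_t u) { float f; memcpy(&f, &u, 4); return f; }

float r(float x) {
  uint32_t i = f2u(x);
  uint32_t abs = i & 0x7FFFFFFF;      /* drop the sign bit */
  if (abs > 0x4AFFFFFF)               /* |x| >= 2^23: no fraction to round */
    return x;
  float a = u2f(abs);
  a = (a + 8388608.0f) + -8388608.0f; /* +/- 2^23 rounds away the fraction */
  return u2f(f2u(a) | (i ^ abs));     /* restore the sign bit */
}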
The x86 back end is generating several unnecessary MOVD instructions to move data between XMM and general purpose registers. These moves are unnecessary because the argument arrives in an XMM register, the result returns in an XMM register, and all the operations performed can be done with XMM register operations.
I would like the x86 back end to generate the same code for the function above as it does for the following function, which performs the same operation entirely in XMM registers:
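(This function is also missing from this copy; the following is a plausible intrinsics version reconstructed from the @rbetter IR in the comments above, not necessarily the original attachment:)

#include <emmintrin.h>

float rbetter(float xx) {
  __m128 x = _mm_set_ss(xx);                               /* <xx,0,0,0> */
  __m128 am = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
  __m128 ax = _mm_and_ps(x, am);                           /* |x| */
  __m128 r = _mm_add_ss(ax, _mm_set_ss(8388608.0f));       /* + 2^23 */
  r = _mm_add_ss(r, _mm_set_ss(-8388608.0f));              /* - 2^23 */
  r = _mm_or_ps(r, _mm_xor_ps(x, ax));                     /* reapply sign */
  /* keep the input in lanes where |x| >= 2^23 (nothing to round) */
  __m128i big = _mm_cmpgt_epi32(_mm_castps_si128(ax),
                                _mm_set1_epi32(0x4AFFFFFF));
  __m128 m = _mm_castsi128_ps(big);
  return _mm_cvtss_f32(_mm_or_ps(_mm_and_ps(m, x), _mm_andnot_ps(m, r)));
}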
Calling one of these functions in a loop of 1G iterations (where it is probably inlined) with various compilers on a Phenom II processor, I get:
gcc 4.4.1: orig: 7.99, new: 7.32
icc 11.0: orig: 5.32, new: 20.50
clang trunk: orig: 11.53, new: 6.82