Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

LLVM generating inefficient code for simple loops #22368

Open Quuxplusone opened 9 years ago

Quuxplusone commented 9 years ago
Bugzilla Link PR22369
Status NEW
Importance P normal
Reported by Daniel Jasper (djasper@google.com)
Reported on 2015-01-28 05:50:05 -0800
Last modified on 2015-01-30 09:14:32 -0800
Version trunk
Hardware PC All
CC chandlerc@gmail.com, jsweval@arxan.com, llvm-bugs@lists.llvm.org, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also
Consider the function:

uint8 *f(uint32 value, uint8 *target) {
  while (value >= 0x80) {
    *target = static_cast<uint8>(value | 0x80);
    value >>= 7;
    ++target;
  }
  *target = static_cast<uint8>(value);
  return target + 1;
}

For, this function, "clang -O2 -S" generates:

        cmpl    $128, %edi
        jb      .LBB0_1
        .align  16, 0x90
.LBB0_2:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edi, %eax
        orl     $128, %eax
        movb    %al, (%rsi)
        movl    %edi, %eax
        shrl    $7, %eax
        incq    %rsi
        cmpl    $16383, %edi            # imm = 0x3FFF
        movl    %eax, %edi
        ja      .LBB0_2
        jmp     .LBB0_3
.LBB0_1:
        movl    %edi, %eax
.LBB0_3:                                # %while.end
        movb    %al, (%rsi)
        incq    %rsi
        movq    %rsi, %rax
        retq

Which seems quite inefficient. There are several unnecessary moves. GCC instead
translates this to:

        jmp     .L4
        .p2align 4,,10
        .p2align 3
.L3:
        movl    %edi, %eax
        addq    $1, %rsi
        shrl    $7, %edi
        orl     $-128, %eax
        movb    %al, -1(%rsi)
.L4:
        cmpl    $127, %edi
        ja      .L3
        movb    %dil, (%rsi)
        leaq    1(%rsi), %rax
        ret

The LLVM IR generated by clang here already generates code that is very similar
to the final output (with instcombine folding the shift into the compare). If I
prevent this folding by adding "if (!Shr->hasOneUse()) return nullptr;" in
InstCombiner::FoldICmpShrCst()
(lib/Transforms/InstCombine/InstCombineCompares.cpp), then LLVM's code gets
somewhat better, but I suspect that LLVM should really be able to lower the IR
better.
Quuxplusone commented 9 years ago
Massive write up from my digging into this ahead...

I'm really convinced that instcombine is doing the right thing here -- a direct
comparison of a constant is better than rinsing it through a shift first. The
key is that the backend needs to recover well here. Notably, it doesn't
*require* more instructions to do this, LLVM is just messing everything up.

If I manually run parts of the optimizer I can simulate your change by not
running instcombine on the core loop after running loop-rotate:

% ./bin/clang++ -c -S -emit-llvm -O2 -o - write.cc -mllvm -disable-llvm-optzns
| opt -S -instcombine -sroa -instcombine -loop-rotate | ./bin/llc
        .text
        .file   "<stdin>"
        .globl  _Z1fjPh
        .align  16, 0x90
        .type   _Z1fjPh,@function
_Z1fjPh:                                # @_Z1fjPh
        .cfi_startproc
# BB#0:                                 # %entry
        cmpl    $128, %edi
        jb      .LBB0_2
        .align  16, 0x90
.LBB0_1:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edi, %eax
        orl     $128, %eax
        movb    %al, (%rsi)
        shrl    $7, %edi
        incq    %rsi
        cmpl    $127, %edi
        ja      .LBB0_1
.LBB0_2:                                # %while.end
        movb    %dil, (%rsi)
        incq    %rsi
        movq    %rsi, %rax
        retq
.Ltmp0:
        .size   _Z1fjPh, .Ltmp0-_Z1fjPh
        .cfi_endproc

        .ident  "clang version 3.7.0 (trunk 227279) (llvm/trunk 227300)"
        .section        ".note.GNU-stack","",@progbits

Now, if I bracket this whole thing for IACA:
% iaca -64 -arch SNB write2.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write2.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 3.00 Cycles       Throughput Bottleneck: InterIteration

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.6    0.0  | 2.5  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.8  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x16
|   1    | 0.6       | 0.4 |           |           |     |     |    | mov eax,
edi
|   1    | 0.4       | 0.6 |           |           |     | 0.1 |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 0.4       |     |           |           |     | 0.6 |    | shr edi,
0x7
|   1    | 0.2       | 0.6 |           |           |     | 0.2 | CP | inc rsi
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x7f
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffee
|   2^   |           |     |           | 1.0       | 1.0 |     | CP | mov byte
ptr [rsi], dil
|   1    | 0.6       | 0.4 |           |           |     |     | CP | inc rsi
|   1    | 0.4       | 0.6 |           |           |     |     |    | mov rax,
rsi
Total Num Of Uops: 12

% iaca -64 -arch IVB write2.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write2.o
Binary Format - 64Bit
Architecture  - IVB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.0  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.0  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x16
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   1    | 1.0       |     |           |           |     |     |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 1.0       |     |           |           |     |     |    | shr edi,
0x7
|   1    |           | 1.0 |           |           |     |     |    | inc rsi
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x7f
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffee
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], dil
|   1    |           | 1.0 |           |           |     |     |    | inc rsi
|   0*   |           |     |           |           |     |     |    | mov rax,
rsi
Total Num Of Uops: 10

% iaca -64 -arch HSW write2.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write2.o
Binary Format - 64Bit
Architecture  - HSW
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6
|  7   |
---------------------------------------------------------------------------------------
| Cycles | 1.5    0.0  | 1.5  | 0.6    0.0  | 0.7    0.0  | 2.0  | 1.5  | 1.5
| 0.6  |
---------------------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |                    Ports pressure in cycles                     |
|
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |
|
---------------------------------------------------------------------------------
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp edi, 0x80
|   0F   |           |     |           |           |     |     |     |     |
| jb 0x16
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| or eax, 0x80
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], al
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| shr edi, 0x7
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| inc rsi
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp edi, 0x7f
|   0F   |           |     |           |           |     |     |     |     |
| jnbe 0xffffffffffffffee
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], dil
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| inc rsi
|   0*   |           |     |           |           |     |     |     |     |
| mov rax, rsi
Total Num Of Uops: 10

Pretty good. When I run instcombine one more time I immediately get your
problem:
% ./bin/clang++ -c -S -emit-llvm -O2 -o - write.cc -mllvm -disable-llvm-optzns
| opt -S -instcombine -sroa -instcombine -loop-rotate -instcombine | ./bin/llc
        .text
        .file   "<stdin>"
        .globl  _Z1fjPh
        .align  16, 0x90
        .type   _Z1fjPh,@function
_Z1fjPh:                                # @_Z1fjPh
        .cfi_startproc
# BB#0:                                 # %entry
        cmpl    $128, %edi
        jb      .LBB0_1
        .align  16, 0x90
.LBB0_2:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edi, %eax
        orl     $128, %eax
        movb    %al, (%rsi)
        movl    %edi, %eax
        shrl    $7, %eax
        incq    %rsi
        cmpl    $16383, %edi            # imm = 0x3FFF
        movl    %eax, %edi
        ja      .LBB0_2
        jmp     .LBB0_3
.LBB0_1:
        movl    %edi, %eax
.LBB0_3:                                # %while.end
        movb    %al, (%rsi)
        incq    %rsi
        movq    %rsi, %rax
        retq
.Ltmp0:
        .size   _Z1fjPh, .Ltmp0-_Z1fjPh
        .cfi_endproc

        .ident  "clang version 3.7.0 (trunk 227279) (llvm/trunk 227300)"
        .section        ".note.GNU-stack","",@progbits

IACA says that SandyBridge hates this code:
% iaca -64 -arch SNB write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 4.65 Cycles       Throughput Bottleneck: Port5

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 4.1    0.0  | 4.4  | 1.0    0.0  | 1.0    0.0  | 2.0  | 4.5  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 | CP | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x1f
|   1    | 0.3       | 0.6 |           |           |     |     |    | mov eax,
edi
|   1    | 0.6       | 0.4 |           |           |     |     |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 0.2       | 0.6 |           |           |     | 0.2 | CP | mov eax,
edi
|   1    | 0.3       |     |           |           |     | 0.6 | CP | shr eax,
0x7
|   1    | 0.3       | 0.7 |           |           |     |     |    | inc rsi
|   1    | 0.3       | 0.3 |           |           |     | 0.3 | CP | cmp edi,
0x3fff
|   1    | 0.3       | 0.4 |           |           |     | 0.3 | CP | mov edi,
eax
|   1    |           |     |           |           |     | 1.0 | CP | jnbe
0xffffffffffffffe7
|   1    |           |     |           |           |     | 1.0 | CP | jmp 0x4
|   1    | 0.6       | 0.3 |           |           |     |     |    | mov eax,
edi
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 0.3       | 0.7 |           |           |     |     |    | inc rsi
|   1    | 0.7       | 0.3 |           |           |     |     |    | mov rax,
rsi
Total Num Of Uops: 17

I really don't get this. It claims that it has to burn a uop for every single
register->register mov?? So weird. Ivybridge and Haswell show what I expect --
lots of instructions but not *too* many more microops. However, they still show
more cycles:
% iaca -64 -arch IVB write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - IVB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 4.00 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.5    0.0  | 2.5  | 1.0    0.0  | 1.0    0.0  | 2.0  | 3.0  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x1f
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   1    | 0.5       | 0.5 |           |           |     |     |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   1    | 1.0       |     |           |           |     |     |    | shr eax,
0x7
|   1    |           | 1.0 |           |           |     |     |    | inc rsi
|   1    | 0.5       | 0.5 |           |           |     |     |    | cmp edi,
0x3fff
|   0*   |           |     |           |           |     |     |    | mov edi,
eax
|   1    |           |     |           |           |     | 1.0 |    | jnbe
0xffffffffffffffe7
|   1    |           |     |           |           |     | 1.0 |    | jmp 0x4
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 0.5       | 0.5 |           |           |     |     |    | inc rsi
|   0*   |           |     |           |           |     |     |    | mov rax,
rsi
Total Num Of Uops: 12

% iaca -64 -arch HSW write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - HSW
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 4.00 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6
|  7   |
---------------------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.0  | 0.6    0.0  | 0.7    0.0  | 2.0  | 2.0  | 2.0
| 0.6  |
---------------------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |                    Ports pressure in cycles                     |
|
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |
|
---------------------------------------------------------------------------------
|   1    |           |     |           |           |     |     | 1.0 |     |
| cmp edi, 0x80
|   0F   |           |     |           |           |     |     |     |     |
| jb 0x1f
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   1    |           |     |           |           |     | 1.0 |     |     |
| or eax, 0x80
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], al
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   1    | 1.0       |     |           |           |     |     |     |     |
| shr eax, 0x7
|   1    |           | 1.0 |           |           |     |     |     |     |
| inc rsi
|   1    |           |     |           |           |     | 1.0 |     |     |
| cmp edi, 0x3fff
|   0*   |           |     |           |           |     |     |     |     |
| mov edi, eax
|   1    |           |     |           |           |     |     | 1.0 |     |
| jnbe 0xffffffffffffffe7
|   1    | 1.0       |     |           |           |     |     |     |     |
| jmp 0x4
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], al
|   1    |           | 1.0 |           |           |     |     |     |     |
| inc rsi
|   0*   |           |     |           |           |     |     |     |     |
| mov rax, rsi
Total Num Of Uops: 12

OK, but is this a combine bug? I think not. That assembly produced by LLVM is
just *BAD*. Really bad. Here is a trivial and obvious re-register allocation of
it. The key thing is to set up the accumulated register in %eax, and copy to
%edi for the comparison.
% cat write.s
        .text
        .file   "<stdin>"
        .globl  _Z1fjPh
        .align  16, 0x90
        .type   _Z1fjPh,@function
_Z1fjPh:                                # @_Z1fjPh
        .cfi_startproc
# BB#0:                                 # %entry
        movl    %edi, %eax
        cmpl    $128, %eax
        jb      .LBB0_1
        .align  16, 0x90
.LBB0_2:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        orl     $128, %eax
        movb    %al, (%rsi)
        movl    %eax, %edi
        shrl    $7, %eax
        incq    %rsi
        cmpl    $16383, %edi            # imm = 0x3FFF
        ja      .LBB0_2
.LBB0_1:
        movb    %al, (%rsi)
        incq    %rsi
        movq    %rsi, %rax
        retq
.Ltmp0:
        .size   _Z1fjPh, .Ltmp0-_Z1fjPh
        .cfi_endproc

        .ident  "clang version 3.7.0 (trunk 227279) (llvm/trunk 227300)"
        .section        ".note.GNU-stack","",@progbits

Suddenly, this looks not really worse than the un-combined variant. IACA agrees
and finds this to be exactly as good as the uncombined variant on all three
architectures:
% iaca -64 -arch SNB write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 3.00 Cycles       Throughput Bottleneck: InterIteration

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.6    0.0  | 2.5  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.8  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x18
|   1    | 0.6       | 0.4 |           |           |     |     |    | mov eax,
edi
|   1    | 0.4       | 0.6 |           |           |     | 0.1 |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 0.4       |     |           |           |     | 0.6 |    | shr edi,
0x7
|   1    | 0.2       | 0.6 |           |           |     | 0.2 | CP | inc rsi
|   1    |           |     |           |           |     | 1.0 |    | cmp eax,
0x3fff
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffec
|   2^   |           |     |           | 1.0       | 1.0 |     | CP | mov byte
ptr [rsi], dil
|   1    | 0.6       | 0.4 |           |           |     |     | CP | inc rsi
|   1    | 0.4       | 0.6 |           |           |     |     |    | mov rax,
rsi
Total Num Of Uops: 12

% iaca -64 -arch IVB write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - IVB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.0  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.0  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x18
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   1    | 1.0       |     |           |           |     |     |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 1.0       |     |           |           |     |     |    | shr edi,
0x7
|   1    |           | 1.0 |           |           |     |     |    | inc rsi
|   1    |           |     |           |           |     | 1.0 |    | cmp eax,
0x3fff
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffec
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], dil
|   1    |           | 1.0 |           |           |     |     |    | inc rsi
|   0*   |           |     |           |           |     |     |    | mov rax,
rsi
Total Num Of Uops: 10

% iaca -64 -arch HSW write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - HSW
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6
|  7   |
---------------------------------------------------------------------------------------
| Cycles | 1.5    0.0  | 1.5  | 0.6    0.0  | 0.7    0.0  | 2.0  | 1.5  | 1.5
| 0.6  |
---------------------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |                    Ports pressure in cycles                     |
|
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |
|
---------------------------------------------------------------------------------
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp edi, 0x80
|   0F   |           |     |           |           |     |     |     |     |
| jb 0x18
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| or eax, 0x80
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], al
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| shr edi, 0x7
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| inc rsi
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp eax, 0x3fff
|   0F   |           |     |           |           |     |     |     |     |
| jnbe 0xffffffffffffffec
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], dil
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| inc rsi
|   0*   |           |     |           |           |     |     |     |     |
| mov rax, rsi
Total Num Of Uops: 10

Finally, for comparison, GCC produces similar code to the un-combined variant
but rotated differently and using slightly different instructions:
% ~/bin/g++ -c -S -O2 write.cc -o -
        .file   "write.cc"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB0:
        .text
.LHOTB0:
        .p2align 4,,15
        .globl  _Z1fjPh
        .type   _Z1fjPh, @function
_Z1fjPh:
.LFB0:
        .cfi_startproc
        jmp     .L5
        .p2align 4,,10
        .p2align 3
.L3:
        movl    %edi, %eax
        addq    $1, %rsi
        shrl    $7, %edi
        orl     $-128, %eax
        movb    %al, -1(%rsi)
.L5:
        cmpl    $127, %edi
        ja      .L3
        movb    %dil, (%rsi)
        leaq    1(%rsi), %rax
        ret
        .cfi_endproc
.LFE0:
        .size   _Z1fjPh, .-_Z1fjPh
        .section        .text.unlikely
.LCOLDE0:
        .text
.LHOTE0:
        .ident  "GCC: (GNU) 4.9.x-google 20141223 (prerelease)"
        .section        .note.GNU-stack,"",@progbits

This appears the same or slightly worse in IACA for all three architectures:
% iaca -64 -arch SNB write.gcc.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.gcc.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 3.00 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.3    0.0  | 2.3  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.3  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | jmp 0x17
|   0*   |           |     |           |           |     |     |    | nop word
ptr [rax+rax*1], ax
|   1    | 0.3       | 0.7 |           |           |     |     |    | mov eax,
edi
|   1    | 0.7       | 0.3 |           |           |     |     |    | add rsi,
0x1
|   1    | 0.6       |     |           |           |     | 0.3 |    | shr edi,
0x7
|   1    |           | 1.0 |           |           |     |     |    | or eax,
0xffffff80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi-0x1], al
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x7f
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffee
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], dil
|   1    | 0.6       | 0.3 |           |           |     |     |    | lea rax,
ptr [rsi+0x1]
Total Num Of Uops: 11

% iaca -64 -arch IVB write.gcc.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.gcc.o
Binary Format - 64Bit
Architecture  - IVB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 3.00 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.0  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.0  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | jmp 0x17
|   0*   |           |     |           |           |     |     |    | nop word
ptr [rax+rax*1], ax
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   1    | 1.0       |     |           |           |     |     |    | add rsi,
0x1
|   1    | 1.0       |     |           |           |     |     |    | shr edi,
0x7
|   1    |           | 1.0 |           |           |     |     |    | or eax,
0xffffff80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi-0x1], al
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x7f
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffee
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], dil
|   1    |           | 1.0 |           |           |     |     |    | lea rax,
ptr [rsi+0x1]
Total Num Of Uops: 10

% iaca -64 -arch HSW write.gcc.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.gcc.o
Binary Format - 64Bit
Architecture  - HSW
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 3.00 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6
|  7   |
---------------------------------------------------------------------------------------
| Cycles | 1.5    0.0  | 1.5  | 0.6    0.0  | 0.7    0.0  | 2.0  | 1.5  | 1.5
| 0.6  |
---------------------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |                    Ports pressure in cycles                     |
|
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |
|
---------------------------------------------------------------------------------
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| jmp 0x17
|   0*   |           |     |           |           |     |     |     |     |
| nop word ptr [rax+rax*1], ax
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| add rsi, 0x1
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| shr edi, 0x7
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| or eax, 0xffffff80
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi-0x1], al
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp edi, 0x7f
|   0F   |           |     |           |           |     |     |     |     |
| jnbe 0xffffffffffffffee
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], dil
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| lea rax, ptr [rsi+0x1]
Total Num Of Uops: 10

However, the last instruction here really points out something that GCC is
winning at handily. Doing the inc+mov for rsi doesn't seem to make any sense
here. Replacing that instruction in my hacked up LLVM asm produces code that
IACA prefers over *all* the other archituctures, and has the most significant
impact on Sandybridge:

% cat write.s
        .text
        .file   "<stdin>"
        .globl  _Z1fjPh
        .align  16, 0x90
        .type   _Z1fjPh,@function
_Z1fjPh:                                # @_Z1fjPh
        .cfi_startproc
# BB#0:                                 # %entry
        cmpl    $128, %edi
        jb      .LBB0_1
        .align  16, 0x90
.LBB0_2:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edi, %eax
        orl     $128, %eax
        movb    %al, (%rsi)
        shrl    $7, %edi
        incq    %rsi
        cmpl    $16383, %eax            # imm = 0x3FFF
        ja      .LBB0_2
.LBB0_1:
        movb    %dil, (%rsi)
        leaq    1(%rsi), %rax
        retq
.Ltmp0:
        .size   _Z1fjPh, .Ltmp0-_Z1fjPh
        .cfi_endproc

        .ident  "clang version 3.7.0 (trunk 227279) (llvm/trunk 227300)"
        .section        ".note.GNU-stack","",@progbits

% iaca -64 -arch SNB write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.35 Cycles       Throughput Bottleneck: FrontEnd, Port0,
Port1, Port5

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.3    0.0  | 2.3  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.3  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 | CP | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x18
|   1    | 0.4       | 0.6 |           |           |     |     | CP | mov eax,
edi
|   1    | 0.6       | 0.4 |           |           |     |     | CP | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 0.8       |     |           |           |     | 0.2 | CP | shr edi,
0x7
|   1    |           | 0.9 |           |           |     | 0.1 | CP | inc rsi
|   1    |           |     |           |           |     | 1.0 | CP | cmp eax,
0x3fff
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffec
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], dil
|   1    | 0.6       | 0.4 |           |           |     |     | CP | lea rax,
ptr [rsi+0x1]
Total Num Of Uops: 11

% iaca -64 -arch IVB write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - IVB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.25 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.0  | 1.0    0.0  | 1.0    0.0  | 2.0  | 2.0  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     |           |           |     | 1.0 |    | cmp edi,
0x80
|   0F   |           |     |           |           |     |     |    | jb 0x18
|   0*   |           |     |           |           |     |     |    | mov eax,
edi
|   1    | 1.0       |     |           |           |     |     |    | or eax,
0x80
|   2^   |           |     | 1.0       |           | 1.0 |     |    | mov byte
ptr [rsi], al
|   1    | 1.0       |     |           |           |     |     |    | shr edi,
0x7
|   1    |           | 1.0 |           |           |     |     |    | inc rsi
|   1    |           |     |           |           |     | 1.0 |    | cmp eax,
0x3fff
|   0F   |           |     |           |           |     |     |    | jnbe
0xffffffffffffffec
|   2^   |           |     |           | 1.0       | 1.0 |     |    | mov byte
ptr [rsi], dil
|   1    |           | 1.0 |           |           |     |     |    | lea rax,
ptr [rsi+0x1]
Total Num Of Uops: 10

% iaca -64 -arch HSW write.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - write.o
Binary Format - 64Bit
Architecture  - HSW
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.25 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6
|  7   |
---------------------------------------------------------------------------------------
| Cycles | 1.5    0.0  | 1.5  | 0.6    0.0  | 0.7    0.0  | 2.0  | 1.5  | 1.5
| 0.6  |
---------------------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV -
Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is
expected
! - instruction not supported, was not accounted in Analysis

| Num Of |                    Ports pressure in cycles                     |
|
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |
|
---------------------------------------------------------------------------------
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp edi, 0x80
|   0F   |           |     |           |           |     |     |     |     |
| jb 0x18
|   0*   |           |     |           |           |     |     |     |     |
| mov eax, edi
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| or eax, 0x80
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], al
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| shr edi, 0x7
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| inc rsi
|   1    | 0.5       |     |           |           |     |     | 0.5 |     |
| cmp eax, 0x3fff
|   0F   |           |     |           |           |     |     |     |     |
| jnbe 0xffffffffffffffec
|   2^   |           |     | 0.3       | 0.3       | 1.0 |     |     | 0.3 |
| mov byte ptr [rsi], dil
|   1    |           | 0.5 |           |           |     | 0.5 |     |     |
| lea rax, ptr [rsi+0x1]
Total Num Of Uops: 10

So, we should be fixing the register allocator to do a *much* better job here,
and fixing our patterns to emit LEA in slightly more cases. At the least, it
seems profitable when we're going to have a reg->reg move otherwise as it helps
sandybridge's lack of a strong register renamer there.

Of course, all of this is assuming IACA is perfectly accurate. ;] We should
probably measure this too.
Quuxplusone commented 9 years ago
> OK, but is this a combine bug? I think not. That assembly produced by LLVM
> is just *BAD*. Really bad. Here is a trivial and obvious re-register
> allocation of it. The key thing is to set up the accumulated register in
> %eax, and copy to %edi for the comparison.
> % cat write.s
>         .text
>         .file   "<stdin>"
>         .globl  _Z1fjPh
>         .align  16, 0x90
>         .type   _Z1fjPh,@function
> _Z1fjPh:                                # @_Z1fjPh
>         .cfi_startproc
> # BB#0:                                 # %entry
>         movl    %edi, %eax
>         cmpl    $128, %eax
>         jb      .LBB0_1
>         .align  16, 0x90
> .LBB0_2:                                # %while.body
>                                         # =>This Inner Loop Header: Depth=1
>         orl     $128, %eax
>         movb    %al, (%rsi)
>         movl    %eax, %edi
>         shrl    $7, %eax
>         incq    %rsi
>         cmpl    $16383, %edi            # imm = 0x3FFF
>         ja      .LBB0_2
> .LBB0_1:
>         movb    %al, (%rsi)
>         incq    %rsi
>         movq    %rsi, %rax
>         retq
> .Ltmp0:
>         .size   _Z1fjPh, .Ltmp0-_Z1fjPh
>         .cfi_endproc
>
>
>         .ident  "clang version 3.7.0 (trunk 227279) (llvm/trunk 227300)"
>         .section        ".note.GNU-stack","",@progbits
>

Arrrg, this section was stale. The above code is just wrong. Ignore it. What I
actually benchmarked was as follows. It requires both better register
allocation by continuing to use %edi out of the loop, and realizing that we can
compare against %eax even after or-ing in 128 because that couldn't have
changed the result. Anyways, sorry for the confusion. Here is the code I
actually measured at this phase of the writeup:

        .text
        .file   "<stdin>"
        .globl  _Z1fjPh
        .align  16, 0x90
        .type   _Z1fjPh,@function
_Z1fjPh:                                # @_Z1fjPh
        .cfi_startproc
# BB#0:                                 # %entry
        cmpl    $128, %edi
        jb      .LBB0_1
        .align  16, 0x90
.LBB0_2:                                # %while.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edi, %eax
        orl     $128, %eax
        movb    %al, (%rsi)
        shrl    $7, %edi
        incq    %rsi
        cmpl    $16383, %eax            # imm = 0x3FFF
        ja      .LBB0_2
.LBB0_1:
        movb    %dil, (%rsi)
        incq    %rsi
        movq    %rsi, %rax
        retq
.Ltmp0:
        .size   _Z1fjPh, .Ltmp0-_Z1fjPh
        .cfi_endproc

        .ident  "clang version 3.7.0 (trunk 227279) (llvm/trunk 227300)"
        .section        ".note.GNU-stack","",@progbits