int f1(const char c)
{
switch (c) {
case ' ':
case '\0':
return 2;
case '|':
return 1;
}
return 0;
}
f1(char): # @f1(char)
mov eax, 2
test edi, edi
je .LBB1_5
cmp edi, 32
je .LBB1_5
cmp edi, 124
jne .LBB1_4
mov eax, 1
ret
.LBB1_4:
xor eax, eax
.LBB1_5:
ret
Adds compares and branches.
int f1a(const char c)
{
switch (c) {
case '|':
return 1;
case ' ':
case '\0':
return 2;
}
return 0;
}
f1a(char): # @f1a(char)
test edi, edi
je .LBB2_4
cmp edi, 124
je .LBB2_2
cmp edi, 32
jne .LBB2_5
.LBB2_4:
mov eax, 2
ret
.LBB2_2:
mov eax, 1
ret
.LBB2_5:
xor eax, eax
ret
Leads to different code than f1() since the order of the cases matters. I always thought that's why you should prefer a switch over an if-chain.
int f2(const char c)
{
switch (c) {
case ' ':
case '\0':
return 2;
}
if (c == '|')
return 1;
return 0;
}
f2(char): # @f2(char)
mov eax, 2
test edi, -33
jne .LBB3_1
ret
.LBB3_1:
xor eax, eax
cmp dil, 124
sete al
ret
Not the same as f0() but better as f1(). I would not have expected that splitting up a switch leads to more efficient code.
GCC produces generally worse code but appears to be more consistent
There's even more fun if such code is part of a bigger function. That's how I arrived at this. I had to extract that into a separate function and in some case change the code used in f0() to the one used in f2() to get better generated code. I have not yet been able to reduce those other cases yet but they are not in the scope of this ticket anyways.
All these code samples do the same and I would have expected them to generate essentially the same code but they are all different:
```cpp
int f0(const char c)
{
if (c == ' ' || c == '\0')
return 2;
if (c == '|')
return 1;
return 0;
}
```
```asm
f0(char): # @f0(char)
xor ecx, ecx
cmp dil, 124
sete cl
test dil, -33
mov eax, 2
cmovne eax, ecx
ret
```
The best code according to `llvm-mca`.
```cpp
int f1(const char c)
{
switch (c) {
case ' ':
case '\0':
return 2;
case '|':
return 1;
}
return 0;
}
```
```asm
f1(char): # @f1(char)
mov eax, 2
test edi, edi
je .LBB1_5
cmp edi, 32
je .LBB1_5
cmp edi, 124
jne .LBB1_4
mov eax, 1
ret
.LBB1_4:
xor eax, eax
.LBB1_5:
ret
```
Adds compares and branches.
```cpp
int f1a(const char c)
{
switch (c) {
case '|':
return 1;
case ' ':
case '\0':
return 2;
}
return 0;
}
```
```asm
f1a(char): # @f1a(char)
test edi, edi
je .LBB2_4
cmp edi, 124
je .LBB2_2
cmp edi, 32
jne .LBB2_5
.LBB2_4:
mov eax, 2
ret
.LBB2_2:
mov eax, 1
ret
.LBB2_5:
xor eax, eax
ret
```
Leads to different code than `f1()` since the order of the cases matters. I always thought that's why you should prefer a `switch` over an `if`-chain.
```cpp
int f2(const char c)
{
switch (c) {
case ' ':
case '\0':
return 2;
}
if (c == '|')
return 1;
return 0;
}
```
```asm
f2(char): # @f2(char)
mov eax, 2
test edi, -33
jne .LBB3_1
ret
.LBB3_1:
xor eax, eax
cmp dil, 124
sete al
ret
```
Not the same as `f0()` but better as `f1()`. I would not have expected that splitting up a `switch` leads to more efficient code.
GCC produces generally worse code but appears to be more consistent
https://godbolt.org/z/E7rso66n3
There's even more fun if such code is part of a bigger function. That's how I arrived at this. I had to extract that into a separate function and in some case change the code used in `f0()` to the one used in `f2()` to get better generated code. I have not yet been able to reduce those other cases yet but they are not in the scope of this ticket anyways.
All these code samples do the same and I would have expected them to generate essentially the same code but they are all different:
The best code according to
llvm-mca
.Adds compares and branches.
Leads to different code than
f1()
since the order of the cases matters. I always thought that's why you should prefer aswitch
over anif
-chain.Not the same as
f0()
but better asf1()
. I would not have expected that splitting up aswitch
leads to more efficient code.GCC produces generally worse code but appears to be more consistent
https://godbolt.org/z/E7rso66n3
There's even more fun if such code is part of a bigger function. That's how I arrived at this. I had to extract that into a separate function and in some case change the code used in
f0()
to the one used inf2()
to get better generated code. I have not yet been able to reduce those other cases yet but they are not in the scope of this ticket anyways.