Open llvmbot opened 9 years ago
I tried some variations of the compiler output to narrow down the largest factor. The main problem is register spilling: clang uses 4 registers plus the stack, while Visual C uses 6 registers.
// Version 1: same instruction sequence as Clang's output.
// Benchmark: ARGBToRAW_Opt (23716 ms)
//
// Loop that reads 4 bytes from the source pointer and writes 3 bytes to the
// destination pointer per iteration, in reversed order:
//   dst[0] = src[2], dst[1] = src[1], dst[2] = src[0]
// Source byte 3 is never read. This variant keeps one byte in a stack slot
// between the load and the store, and uses both %bl and %bh.
//
// NOTE(review): the %esp-relative offsets line up with src/dst/pix only if
// the function has no compiler-generated prologue (e.g. a naked cdecl
// function). No such attribute is visible here -- confirm.
// NOTE(review): the asm has no operand/clobber lists and no `ret`; confirm
// how the enclosing function is declared and how it returns.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
// Preserve ebx; the second push reserves a 4-byte scratch slot at (%esp).
"push %ebx \n"
"push %eax \n"
// eax = loop counter (presumably pix); skip the loop when it is <= 0.
"mov 0x14(%esp),%eax \n"
"test %eax,%eax \n"
"jle 2f \n"
// ecx = store pointer (presumably dst), edx = load pointer (presumably src).
"mov 0x10(%esp),%ecx \n"
"mov 0xc(%esp),%edx \n"
"1: \n"
// Load source byte 0 and spill it to the top byte of the scratch slot.
"mov (%edx),%bl \n"
"mov %bl,0x3(%esp) \n"
// Source byte 1 -> bh, source byte 2 -> bl.
"mov 0x1(%edx),%bh \n"
"mov 0x2(%edx),%bl \n"
// Store source byte 2 at dst[0] and source byte 1 at dst[1].
"mov %bl,(%ecx) \n"
"mov %bh,0x1(%ecx) \n"
// Reload the spilled source byte 0 and store it at dst[2].
"mov 0x3(%esp),%bl \n"
"mov %bl,0x2(%ecx) \n"
// Advance 4 source bytes and 3 destination bytes; loop until eax reaches 0.
"add $0x4,%edx \n"
"add $0x3,%ecx \n"
"dec %eax \n"
"jne 1b \n"
"2: \n"
// Drop the scratch slot (eax is not restored from it) and restore ebx.
"add $0x4,%esp \n"
"pop %ebx \n"
);
}
// Version 2a: same as Clang's output, but uses only %bl (no %bh) and
// slightly reorders the loads and stores. Still spills one byte to the stack.
// Benchmark: ARGBToRAW_Opt (21331 ms)
//
// Per iteration reads 4 source bytes and writes 3 destination bytes:
//   dst[0] = src[2], dst[1] = src[1], dst[2] = src[0]
// Source byte 3 is never read.
//
// NOTE(review): the %esp-relative offsets line up with src/dst/pix only if
// the function has no compiler-generated prologue (e.g. a naked cdecl
// function). No such attribute is visible here -- confirm.
// NOTE(review): the asm has no operand/clobber lists and no `ret`; confirm
// how the enclosing function is declared and how it returns.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
// Preserve ebx; the second push reserves a 4-byte scratch slot at (%esp).
"push %ebx \n"
"push %eax \n"
// eax = loop counter (presumably pix); skip the loop when it is <= 0.
"mov 0x14(%esp),%eax \n"
"test %eax,%eax \n"
"jle 2f \n"
// ecx = store pointer (presumably dst), edx = load pointer (presumably src).
"mov 0x10(%esp),%ecx \n"
"mov 0xc(%esp),%edx \n"
"1: \n"
// Load source byte 0 and spill it to the top byte of the scratch slot.
"mov (%edx),%bl \n"
"mov %bl,0x3(%esp) \n"
// Copy source byte 2 to dst[0] through bl.
"mov 0x2(%edx),%bl \n"
"mov %bl,(%ecx) \n"
// Copy source byte 1 to dst[1] through bl.
"mov 0x1(%edx),%bl \n"
"mov %bl,0x1(%ecx) \n"
// Reload the spilled source byte 0 and store it at dst[2].
"mov 0x3(%esp),%bl \n"
"mov %bl,0x2(%ecx) \n"
// Advance 4 source bytes and 3 destination bytes; loop until eax reaches 0.
"add $0x4,%edx \n"
"add $0x3,%ecx \n"
"dec %eax \n"
"jne 1b \n"
"2: \n"
// Drop the scratch slot (eax is not restored from it) and restore ebx.
"add $0x4,%esp \n"
"pop %ebx \n"
);
}
// Version 2b: same as Clang's output but without the stack spill.
// Still uses %bh alongside %bl.
// Benchmark: ARGBToRAW_Opt (11089 ms)
//
// Per iteration reads 4 source bytes and writes 3 destination bytes:
//   dst[0] = src[2], dst[1] = src[1], dst[2] = src[0]
// Source byte 3 is never read.
//
// NOTE(review): the %esp-relative offsets line up with src/dst/pix only if
// the function has no compiler-generated prologue (e.g. a naked cdecl
// function). No such attribute is visible here -- confirm.
// NOTE(review): the asm has no operand/clobber lists and no `ret`; confirm
// how the enclosing function is declared and how it returns.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
// Preserve ebx; it is the only register saved, and no scratch slot is made.
"push %ebx \n"
// eax = loop counter (presumably pix); skip the loop when it is <= 0.
"mov 0x10(%esp),%eax \n"
"test %eax,%eax \n"
"jle 2f \n"
// ecx = store pointer (presumably dst), edx = load pointer (presumably src).
"mov 0xc(%esp),%ecx \n"
"mov 0x8(%esp),%edx \n"
"1: \n"
// Source byte 1 -> bh, source byte 2 -> bl.
"mov 0x1(%edx),%bh \n"
"mov 0x2(%edx),%bl \n"
// Store source byte 2 at dst[0] and source byte 1 at dst[1].
"mov %bl,(%ecx) \n"
"mov %bh,0x1(%ecx) \n"
// Copy source byte 0 to dst[2] through bl (no stack round trip).
"mov (%edx),%bl \n"
"mov %bl,0x2(%ecx) \n"
// Advance 4 source bytes and 3 destination bytes; loop until eax reaches 0.
"add $0x4,%edx \n"
"add $0x3,%ecx \n"
"dec %eax \n"
"jne 1b \n"
"2: \n"
// Restore ebx.
"pop %ebx \n"
);
}
// Version 3b: same as 2b but without %bh; every byte goes through %bl.
// No stack spill.
// Benchmark: ARGBToRAW_Opt (9691 ms)
//
// Per iteration reads 4 source bytes and writes 3 destination bytes:
//   dst[0] = src[2], dst[1] = src[1], dst[2] = src[0]
// Source byte 3 is never read.
//
// NOTE(review): the %esp-relative offsets line up with src/dst/pix only if
// the function has no compiler-generated prologue (e.g. a naked cdecl
// function). No such attribute is visible here -- confirm.
// NOTE(review): the asm has no operand/clobber lists and no `ret`; confirm
// how the enclosing function is declared and how it returns.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
// Preserve ebx; it is the only register saved, and no scratch slot is made.
"push %ebx \n"
// eax = loop counter (presumably pix); skip the loop when it is <= 0.
"mov 0x10(%esp),%eax \n"
"test %eax,%eax \n"
"jle 2f \n"
// ecx = store pointer (presumably dst), edx = load pointer (presumably src).
"mov 0xc(%esp),%ecx \n"
"mov 0x8(%esp),%edx \n"
"1: \n"
// Copy source byte 2 to dst[0].
"mov 0x2(%edx),%bl \n"
"mov %bl,(%ecx) \n"
// Copy source byte 1 to dst[1].
"mov 0x1(%edx),%bl \n"
"mov %bl,0x1(%ecx) \n"
// Copy source byte 0 to dst[2].
"mov (%edx),%bl \n"
"mov %bl,0x2(%ecx) \n"
// Advance 4 source bytes and 3 destination bytes; loop until eax reaches 0.
"add $0x4,%edx \n"
"add $0x3,%ecx \n"
"dec %eax \n"
"jne 1b \n"
"2: \n"
// Restore ebx.
"pop %ebx \n"
);
}
// Version 4: hand-written variant.
// Benchmark: ARGBToRAW_Opt (9460 ms)
//
// Per iteration reads 4 source bytes and writes 3 destination bytes:
//   dst[0] = src[2], dst[1] = src[1], dst[2] = src[0]
// Source byte 3 is never read. Unlike the other variants, this one loads all
// three stack values before testing the count, uses %al as the byte
// temporary, keeps the load pointer in ebx, and advances pointers with lea.
//
// NOTE(review): the %esp-relative offsets line up with src/dst/pix only if
// the function has no compiler-generated prologue (e.g. a naked cdecl
// function). No such attribute is visible here -- confirm.
// NOTE(review): the asm has no operand/clobber lists and no `ret`, and eax
// is overwritten without being saved; confirm how the enclosing function is
// declared and how it returns.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
// Preserve ebx, which is reused below as the load pointer.
"push %ebx \n"
// ebx = load pointer (presumably src), edx = store pointer (presumably dst),
// ecx = loop counter (presumably pix).
"mov 0x8(%esp),%ebx \n"
"mov 0xc(%esp),%edx \n"
"mov 0x10(%esp),%ecx \n"
// Skip the loop when the count is <= 0.
"test %ecx,%ecx \n"
"jle 2f \n"
"1: \n"
// Copy source byte 2 to dst[0].
"mov 0x2(%ebx),%al \n"
"mov %al,0x0(%edx) \n"
// Copy source byte 1 to dst[1].
"mov 0x1(%ebx),%al \n"
"mov %al,0x1(%edx) \n"
// Copy source byte 0 to dst[2].
"mov 0x0(%ebx),%al \n"
"mov %al,0x2(%edx) \n"
// Advance 4 source bytes and 3 destination bytes using lea.
"lea 0x4(%ebx),%ebx \n"
"lea 0x3(%edx),%edx \n"
// Decrement the counter and loop until it reaches 0.
"sub $1,%ecx \n"
"jne 1b \n"
"2: \n"
// Restore ebx.
"pop %ebx \n"
);
}
Extended Description
For the following testcase, Visual Studio 2015 generated better code than LLVM (more than 2 times faster on Windows + Sandy Bridge).
vs2015
clang
Another testcase:
VS2015
clang