Open Quuxplusone opened 9 years ago
Bugzilla Link | PR23508 |
Status | NEW |
Importance | P normal |
Reported by | Wei Mi (wmi@google.com) |
Reported on | 2015-05-12 16:50:00 -0700 |
Last modified on | 2015-05-22 20:54:22 -0700 |
Version | trunk |
Hardware | PC Windows XP |
CC | fbarchard@google.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, rafael@espindo.la, spatel+llvm@rotateright.com |
Fixed by commit(s) | |
Attachments | |
Blocks | |
Blocked by | |
See also |
I tried some variations of the compiler output to narrow down the largest
factor. The main problem is register spilling:
- Clang uses 4 registers plus the stack.
- Visual C uses 6 registers.
// Version 1 same as Clang
// ARGBToRAW_Opt (23716 ms)
//
// For each 4-byte source pixel, stores source bytes 2, 1, 0 into destination
// bytes 0, 1, 2; the 4th source byte is never read. The source pointer
// advances by 4 and the destination pointer by 3 per iteration.
// Registers: eax = remaining count, ecx = dst, edx = src, bl/bh = byte temps.
// One byte is spilled to the stack and reloaded on every iteration.
//
// NOTE(review): the %esp offsets assume 32-bit x86 with stack-passed
// arguments and no compiler-emitted prologue ahead of this asm -- confirm
// against the build flags used for the benchmark.
// NOTE(review): this is basic asm with no operand or clobber list, and it
// contains no `ret`; returning is left to whatever the compiler emits after it.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
"push %ebx \n"  // save ebx; bl/bh are used as byte temporaries below
"push %eax \n"  // makes a 4-byte stack slot that serves as spill space
"mov 0x14(%esp),%eax \n"  // eax = loop count (presumably pix)
"test %eax,%eax \n"
"jle 2f \n"  // count <= 0: skip the loop entirely
"mov 0x10(%esp),%ecx \n"  // ecx = store pointer (presumably dst)
"mov 0xc(%esp),%edx \n"  // edx = load pointer (presumably src)
"1: \n"  // loop top: one pixel per iteration
"mov (%edx),%bl \n"  // bl = source byte 0
"mov %bl,0x3(%esp) \n"  // spill source byte 0 into the stack slot
"mov 0x1(%edx),%bh \n"  // bh = source byte 1
"mov 0x2(%edx),%bl \n"  // bl = source byte 2 (overwrites byte 0 in bl)
"mov %bl,(%ecx) \n"  // dest byte 0 = source byte 2
"mov %bh,0x1(%ecx) \n"  // dest byte 1 = source byte 1
"mov 0x3(%esp),%bl \n"  // reload spilled source byte 0
"mov %bl,0x2(%ecx) \n"  // dest byte 2 = source byte 0
"add $0x4,%edx \n"  // next 4-byte source pixel
"add $0x3,%ecx \n"  // next 3-byte destination pixel
"dec %eax \n"
"jne 1b \n"  // repeat until the count reaches zero
"2: \n"
"add $0x4,%esp \n"  // drop the spill slot created by `push %eax`
"pop %ebx \n"  // restore ebx
);
}
// Version 2a same as Clang but use bl and slight reorder.
// ARGBToRAW_Opt (21331 ms)
//
// Same byte shuffle as the Clang output (source bytes 2, 1, 0 stored to
// destination bytes 0, 1, 2; source advances 4, destination advances 3), but
// every byte moves through bl only -- bh is not used. Source byte 0 is still
// spilled to the stack and reloaded on each iteration.
// Registers: eax = remaining count, ecx = dst, edx = src, bl = byte temp.
//
// NOTE(review): the %esp offsets assume 32-bit x86 with stack-passed
// arguments and no compiler-emitted prologue ahead of this asm -- confirm
// against the build flags used for the benchmark.
// NOTE(review): this is basic asm with no operand or clobber list, and it
// contains no `ret`; returning is left to whatever the compiler emits after it.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
"push %ebx \n"  // save ebx; bl is the byte temporary below
"push %eax \n"  // makes a 4-byte stack slot that serves as spill space
"mov 0x14(%esp),%eax \n"  // eax = loop count (presumably pix)
"test %eax,%eax \n"
"jle 2f \n"  // count <= 0: skip the loop entirely
"mov 0x10(%esp),%ecx \n"  // ecx = store pointer (presumably dst)
"mov 0xc(%esp),%edx \n"  // edx = load pointer (presumably src)
"1: \n"  // loop top: one pixel per iteration
"mov (%edx),%bl \n"  // bl = source byte 0
"mov %bl,0x3(%esp) \n"  // spill source byte 0 into the stack slot
"mov 0x2(%edx),%bl \n"  // bl = source byte 2
"mov %bl,(%ecx) \n"  // dest byte 0 = source byte 2
"mov 0x1(%edx),%bl \n"  // bl = source byte 1
"mov %bl,0x1(%ecx) \n"  // dest byte 1 = source byte 1
"mov 0x3(%esp),%bl \n"  // reload spilled source byte 0
"mov %bl,0x2(%ecx) \n"  // dest byte 2 = source byte 0
"add $0x4,%edx \n"  // next 4-byte source pixel
"add $0x3,%ecx \n"  // next 3-byte destination pixel
"dec %eax \n"
"jne 1b \n"  // repeat until the count reaches zero
"2: \n"
"add $0x4,%esp \n"  // drop the spill slot created by `push %eax`
"pop %ebx \n"  // restore ebx
);
}
// Version 2b same as Clang but no stack. Still uses bh.
// ARGBToRAW_Opt (11089 ms)
//
// Same byte shuffle (source bytes 2, 1, 0 stored to destination bytes 0, 1, 2;
// source advances 4, destination advances 3) with no stack spill inside the
// loop: source byte 0 is loaded last, directly before it is stored.
// Registers: eax = remaining count, ecx = dst, edx = src, bl/bh = byte temps.
//
// NOTE(review): the %esp offsets assume 32-bit x86 with stack-passed
// arguments and no compiler-emitted prologue ahead of this asm -- confirm
// against the build flags used for the benchmark.
// NOTE(review): this is basic asm with no operand or clobber list, and it
// contains no `ret`; returning is left to whatever the compiler emits after it.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
"push %ebx \n"  // save ebx; bl/bh are used as byte temporaries below
"mov 0x10(%esp),%eax \n"  // eax = loop count (presumably pix)
"test %eax,%eax \n"
"jle 2f \n"  // count <= 0: skip the loop entirely
"mov 0xc(%esp),%ecx \n"  // ecx = store pointer (presumably dst)
"mov 0x8(%esp),%edx \n"  // edx = load pointer (presumably src)
"1: \n"  // loop top: one pixel per iteration
"mov 0x1(%edx),%bh \n"  // bh = source byte 1
"mov 0x2(%edx),%bl \n"  // bl = source byte 2
"mov %bl,(%ecx) \n"  // dest byte 0 = source byte 2
"mov %bh,0x1(%ecx) \n"  // dest byte 1 = source byte 1
"mov (%edx),%bl \n"  // bl = source byte 0
"mov %bl,0x2(%ecx) \n"  // dest byte 2 = source byte 0
"add $0x4,%edx \n"  // next 4-byte source pixel
"add $0x3,%ecx \n"  // next 3-byte destination pixel
"dec %eax \n"
"jne 1b \n"  // repeat until the count reaches zero
"2: \n"
"pop %ebx \n"  // restore ebx
);
}
// Version 3b same as 2b but no bh. And no stack.
// ARGBToRAW_Opt (9691 ms)
//
// Same byte shuffle (source bytes 2, 1, 0 stored to destination bytes 0, 1, 2;
// source advances 4, destination advances 3). Each byte is loaded into bl and
// stored immediately, so neither bh nor a stack spill is needed.
// Registers: eax = remaining count, ecx = dst, edx = src, bl = byte temp.
//
// NOTE(review): the %esp offsets assume 32-bit x86 with stack-passed
// arguments and no compiler-emitted prologue ahead of this asm -- confirm
// against the build flags used for the benchmark.
// NOTE(review): this is basic asm with no operand or clobber list, and it
// contains no `ret`; returning is left to whatever the compiler emits after it.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
"push %ebx \n"  // save ebx; bl is the byte temporary below
"mov 0x10(%esp),%eax \n"  // eax = loop count (presumably pix)
"test %eax,%eax \n"
"jle 2f \n"  // count <= 0: skip the loop entirely
"mov 0xc(%esp),%ecx \n"  // ecx = store pointer (presumably dst)
"mov 0x8(%esp),%edx \n"  // edx = load pointer (presumably src)
"1: \n"  // loop top: one pixel per iteration
"mov 0x2(%edx),%bl \n"  // bl = source byte 2
"mov %bl,(%ecx) \n"  // dest byte 0 = source byte 2
"mov 0x1(%edx),%bl \n"  // bl = source byte 1
"mov %bl,0x1(%ecx) \n"  // dest byte 1 = source byte 1
"mov (%edx),%bl \n"  // bl = source byte 0
"mov %bl,0x2(%ecx) \n"  // dest byte 2 = source byte 0
"add $0x4,%edx \n"  // next 4-byte source pixel
"add $0x3,%ecx \n"  // next 3-byte destination pixel
"dec %eax \n"
"jne 1b \n"  // repeat until the count reaches zero
"2: \n"
"pop %ebx \n"  // restore ebx
);
}
// Version 4 by hand
// ARGBToRAW_Opt (9460 ms)
//
// Same byte shuffle (source bytes 2, 1, 0 stored to destination bytes 0, 1, 2;
// source advances 4, destination advances 3), with a different register
// assignment: ebx = src, edx = dst, ecx = remaining count, al = byte temp.
// All three arguments are loaded before the count is tested, the pointers are
// advanced with `lea`, and the counter is decremented with `sub $1`.
//
// NOTE(review): the %esp offsets assume 32-bit x86 with stack-passed
// arguments and no compiler-emitted prologue ahead of this asm -- confirm
// against the build flags used for the benchmark.
// NOTE(review): this is basic asm with no operand or clobber list, and it
// contains no `ret`; returning is left to whatever the compiler emits after it.
void ARGBToRAWRow_C(const uint8* src, uint8* dst, int pix) {
asm volatile (
"push %ebx \n"  // save ebx; it holds the load pointer below
"mov 0x8(%esp),%ebx \n"  // ebx = load pointer (presumably src)
"mov 0xc(%esp),%edx \n"  // edx = store pointer (presumably dst)
"mov 0x10(%esp),%ecx \n"  // ecx = loop count (presumably pix)
"test %ecx,%ecx \n"
"jle 2f \n"  // count <= 0: skip the loop entirely
"1: \n"  // loop top: one pixel per iteration
"mov 0x2(%ebx),%al \n"  // al = source byte 2
"mov %al,0x0(%edx) \n"  // dest byte 0 = source byte 2
"mov 0x1(%ebx),%al \n"  // al = source byte 1
"mov %al,0x1(%edx) \n"  // dest byte 1 = source byte 1
"mov 0x0(%ebx),%al \n"  // al = source byte 0
"mov %al,0x2(%edx) \n"  // dest byte 2 = source byte 0
"lea 0x4(%ebx),%ebx \n"  // next 4-byte source pixel
"lea 0x3(%edx),%edx \n"  // next 3-byte destination pixel
"sub $1,%ecx \n"
"jne 1b \n"  // repeat until the count reaches zero
"2: \n"
"pop %ebx \n"  // restore ebx
);
}