Closed GoogleCodeExporter closed 9 years ago
Passes on Intel
LIBYUV_WIDTH=130 LIBYUV_HEIGHT=80 out/Release/libyuv_unittest
--gtest_filter=*ARGBToI411*
Note: Google Test filter = *ARGBToI411*
[==========] Running 4 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 4 tests from libyuvTest
[ RUN ] libyuvTest.ARGBToI411_Any
[ OK ] libyuvTest.ARGBToI411_Any (2 ms)
[ RUN ] libyuvTest.ARGBToI411_Unaligned
[ OK ] libyuvTest.ARGBToI411_Unaligned (1 ms)
[ RUN ] libyuvTest.ARGBToI411_Invert
[ OK ] libyuvTest.ARGBToI411_Invert (1 ms)
[ RUN ] libyuvTest.ARGBToI411_Opt
[ OK ] libyuvTest.ARGBToI411_Opt (1 ms)
[----------] 4 tests from libyuvTest (5 ms total)
[----------] Global test environment tear-down
[==========] 4 tests from 1 test case ran. (5 ms total)
[ PASSED ] 4 tests.
Original comment by fbarch...@google.com
on 17 Sep 2015 at 8:04
Original comment by fbarch...@google.com
on 17 Sep 2015 at 8:09
Original comment by fbarch...@chromium.org
on 12 Oct 2015 at 11:48
With default parameters, this function passes
[ RUN ] LibYUVConvertTest.ARGBToI411_Any
[ OK ] LibYUVConvertTest.ARGBToI411_Any (5 ms)
[ RUN ] LibYUVConvertTest.ARGBToI411_Unaligned
[ OK ] LibYUVConvertTest.ARGBToI411_Unaligned (4 ms)
[ RUN ] LibYUVConvertTest.ARGBToI411_Invert
[ OK ] LibYUVConvertTest.ARGBToI411_Invert (4 ms)
[ RUN ] LibYUVConvertTest.ARGBToI411_Opt
[ OK ] LibYUVConvertTest.ARGBToI411_Opt (4 ms)
[----------] 4 tests from LibYUVConvertTest (18 ms total)
Original comment by fbarch...@chromium.org
on 14 Oct 2015 at 10:33
LIBYUV_WIDTH=130 LIBYUV_HEIGHT=80 LIBYUV_REPEAT=1 LIBYUV_DISABLE_ASM=1
out/Release/libyuv_unittest
This is the Intel code
0000000000556f00 <ARGBToUV411Row_C>:
push %r13
cmp $0x3,%ecx
push %r12
push %rbp
push %rbx
↓ jle 2c3
lea -0x4(%rcx),%r12d
mov %rsi,%rbx
mov %rdi,%rax
shr $0x2,%r12d
add $0x1,%r12
lea (%rdx,%r12,1),%rbp
nop
28: movzbl (%rax),%r9d
movzbl 0x4(%rax),%r8d
add $0x1,%rdx
movzbl 0x1(%rax),%r10d
movzbl 0x2(%rax),%r11d
add $0x1,%rbx
add $0x10,%rax
add %r9d,%r8d
movzbl -0x8(%rax),%r9d
add %r9d,%r8d
movzbl -0x4(%rax),%r9d
add %r9d,%r8d
movzbl -0xb(%rax),%r9d
sar $0x2,%r8d
mov %r8d,%r13d
lea (%r8,%r8,8),%r8d
add %r10d,%r9d
movzbl -0x7(%rax),%r10d
add %r8d,%r8d
add %r10d,%r9d
movzbl -0x3(%rax),%r10d
add %r10d,%r9d
movzbl -0xa(%rax),%r10d
sar $0x2,%r9d
add %r11d,%r10d
movzbl -0x6(%rax),%r11d
add %r11d,%r10d
movzbl -0x2(%rax),%r11d
add %r11d,%r10d
mov $0x70,%r11d
imul %r11d,%r13d
sar $0x2,%r10d
imul $0xffffffb6,%r9d,%r11d
add %r11d,%r13d
imul $0xffffffda,%r10d,%r11d
lea 0x8080(%r13,%r11,1),%r11d
mov $0xffffffa2,%r13d
imul %r13d,%r9d
sar $0x8,%r11d
mov %r11b,-0x1(%rbx)
mov $0x70,%r11d
imul %r11d,%r10d
add %r10d,%r9d
sub %r8d,%r9d
add $0x8080,%r9d
sar $0x8,%r9d
mov %r9b,-0x1(%rdx)
cmp %rbp,%rdx
↑ jne 28
mov %r12,%rax
add %r12,%rsi
shl $0x4,%rax
add %rax,%rdi
104: and $0x3,%ecx
cmp $0x3,%ecx
↓ je 198
cmp $0x2,%ecx
↓ je 238
cmp $0x1,%ecx
↓ je 128
pop %rbx
pop %rbp
pop %r12
pop %r13
← retq
nop
Original comment by fbarch...@chromium.org
on 16 Oct 2015 at 6:16
I've reproduced the error... had to hard code the unittest/unittest.cc
dimensions to 130x80. 130x72 also fails... its the width not being a multiple
of 4.
411 subsampling is really not used in practice, so it may be better to remove
the functionality than spend time fixing it.
Original comment by fbarch...@google.com
on 20 Oct 2015 at 1:37
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUV411ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
ARGBToUV411Row = ARGBToUV411Row_NEON;
}
}
#endif
Original comment by fbarch...@google.com
on 20 Oct 2015 at 1:45
There are 2 issues with ARGBTo411.
1. any handles odd 422 by duplicating the last ARGB. It doesnt support
remainder of 2 or 3 bytes.
// Any 1 to 2. Outputs UV planes.
#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
SIMD_ALIGNED(uint8 temp[128 * 3]); \
memset(temp, 0, 128); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
} \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
}
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
so the 411 averages in zeros when using neon for remainder of 2 or 3.
2. The C version handles remainder differently than Neon
void ARGBToUV411Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width) {
int x;
for (x = 0; x < width - 3; x += 4) {
uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
src_argb += 16;
dst_u += 1;
dst_v += 1;
}
if ((width & 3) == 3) {
uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
} else if ((width & 3) == 2) {
uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
} else if ((width & 3) == 1) {
uint8 ab = src_argb[0];
uint8 ag = src_argb[1];
uint8 ar = src_argb[2];
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
}
}
The Any function used to use C, so it would have been self consistent, but now
it uses SIMD the difference causes failure.
2 solutions:
1. C and ANY should match behavior on odd width
2. remove 411 support. 422 is easy enough and popular but 411 is difficult and
unpopular.
Original comment by fbarch...@chromium.org
on 21 Oct 2015 at 6:49
util/android/test_runner.py gtest -t 1800 -s libyuv_unittest --verbose
--release --gtest_filter=*ARGBToI4*_Opt -a "--libyuv_width=642
--libyuv_height=360 --libyuv_repeat=1000 --libyuv_flags=0"
I 13.367s run_tests_on_device(HT4A2JT03762) [==========] Running 5 tests
from 1 test case.
I 13.367s run_tests_on_device(HT4A2JT03762) [----------] Global test
environment set-up.
I 13.367s run_tests_on_device(HT4A2JT03762) [----------] 5 tests from
LibYUVConvertTest
I 13.367s run_tests_on_device(HT4A2JT03762) [ RUN ]
LibYUVConvertTest.ARGBToI420_Opt
I 13.367s run_tests_on_device(HT4A2JT03762) [ OK ]
LibYUVConvertTest.ARGBToI420_Opt (792 ms)
I 13.367s run_tests_on_device(HT4A2JT03762) [ RUN ]
LibYUVConvertTest.ARGBToI411_Opt
I 13.367s run_tests_on_device(HT4A2JT03762) [ OK ]
LibYUVConvertTest.ARGBToI411_Opt (896 ms)
I 13.367s run_tests_on_device(HT4A2JT03762) [ RUN ]
LibYUVConvertTest.ARGBToI422_Opt
I 13.367s run_tests_on_device(HT4A2JT03762) [ OK ]
LibYUVConvertTest.ARGBToI422_Opt (661 ms)
I 13.367s run_tests_on_device(HT4A2JT03762) [ RUN ]
LibYUVConvertTest.ARGBToI444_Opt
I 13.367s run_tests_on_device(HT4A2JT03762) [ OK ]
LibYUVConvertTest.ARGBToI444_Opt (743 ms)
I 13.368s run_tests_on_device(HT4A2JT03762) [ RUN ]
LibYUVConvertTest.ARGBToI400_Opt
I 13.368s run_tests_on_device(HT4A2JT03762) [ OK ]
LibYUVConvertTest.ARGBToI400_Opt (345 ms)
I 13.368s run_tests_on_device(HT4A2JT03762) [----------] 5 tests from
LibYUVConvertTest (3440 ms total)
I 13.368s run_tests_on_device(HT4A2JT03762)
I 13.368s run_tests_on_device(HT4A2JT03762) [----------] Global test
environment tear-down
I 13.368s run_tests_on_device(HT4A2JT03762) [==========] 5 tests from 1 test
case ran. (3441 ms total)
I 13.368s run_tests_on_device(HT4A2JT03762) [ PASSED ] 5 tests.
Original comment by fbarch...@google.com
on 21 Oct 2015 at 7:53
Original issue reported on code.google.com by
zhongwei...@linaro.org
on 15 Sep 2015 at 7:55