Open GoogleCodeExporter opened 8 years ago
ScaleRowDown2Box_SSSE3 does 32x2 to 16x1
ScaleRowDown2Box_AVX2 does 64x2 to 32x1
ScaleRowDown2Box_NEON does 32x2 to 16x1
where destination width is a multiple of 16.
The Any variations support odd destination width, but the source is a multiple of 2.
Suggest doing an Odd variation of Any that does odd source width.
ScaleRowDown2Box_Odd_SSSE3 already exists for I420Blend subsampling.

For reference this is SSSE3 gcc code:

// Box-filters two source rows down by 2 in both directions.
//
// Each iteration reads 32 bytes from the row at src_ptr and 32 bytes from the
// row at src_ptr + src_stride, and writes 16 bytes to dst_ptr. Every output
// byte is the rounded average of a 2x2 block: ((sum >> 1) + 1) >> 1, which is
// the same value as (sum + 2) >> 2.
//
// The loop subtracts 16 from dst_width and repeats while the result is > 0,
// so a whole 16-byte group is always written; dst_width is expected to be a
// multiple of 16. Loads and stores use movdqu, so no alignment is required.
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  asm volatile (
    // xmm4 = 0x01 in every byte (pmaddubsw multiplier), xmm5 = 0.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Sum horizontally adjacent bytes into 16-bit words.
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    // Add the two rows, then divide the 4-pixel sum by 4 with rounding.
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "psrlw $0x1,%%xmm0 \n"
    "psrlw $0x1,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  // xmm4 is written above, so it must be declared as clobbered too.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Fixed scale down for odd source width.  Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ uint8* dst_ptr, int dst_width) { \ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ int n = dst_width - r; \ if (n > 0) { \ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ } \ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ dst_ptr + n * BPP, r); \ } SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, 2, 1, 31) SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C, 2, 1, 31) #endif
Original issue reported on code.google.com by fbarch...@google.com on 26 Jan 2016 at 2:01
fbarch...@google.com
Original issue reported on code.google.com by
fbarch...@google.com
on 26 Jan 2016 at 2:01