ARGBToA - Githubissues

GoogleCodeExporter commented 8 years ago

Write a function to extract just the 'A' channel from ARGB

Here is a quick SSE2 version
// Extracts just the A (alpha) channel from a row of ARGB pixels.
// Each loop iteration reads 32 bytes (8 four-byte pixels) from src_argb with
// unaligned loads and writes 8 bytes to dst_a.
// The loop body always runs at least once and always handles a full group of
// 8 pixels, so pix is presumably expected to be a positive multiple of 8 --
// TODO confirm against the caller.
// Naked function: no compiler-generated prologue/epilogue, so the arguments
// are read directly from the stack relative to esp and the body ends with an
// explicit ret.
__declspec(naked) 
void ARGBToARow_SSE2(const uint8* src_argb, uint8* dst_a, int pix) {  
  __asm { 
    mov        eax, [esp + 4]    // eax = src_argb (read pointer)
    mov        edx, [esp + 8]    // edx = dst_a (write pointer)
    mov        ecx, [esp + 12]   // ecx = pix (pixels remaining)

  wloop:  
    movdqu     xmm0, [eax]       // Pixels 0..3 (16 bytes, unaligned load).
    movdqu     xmm1, [eax + 16]  // Pixels 4..7.
    lea        eax, [eax + 32]   // Advance source by 8 pixels.
    psrld      xmm0, 24  // Move alpha (top byte of each dword) to bottom; upper bits become 0.
    psrld      xmm1, 24  
    packssdw   xmm0, xmm1        // 8 dwords -> 8 words; values are 0..255 so nothing saturates.
    packuswb   xmm0, xmm1        // Words -> bytes; the low 8 bytes of xmm0 hold the 8 alphas.
    movq       qword ptr [edx], xmm0  // Store only the low 8 bytes.
    lea        edx, [edx + 8]    // Advance destination by 8 bytes.
    sub        ecx, 8            // 8 pixels consumed this iteration.
    jg         wloop             // Repeat while pixels remain (count > 0).
    ret 
  } 
}

In this CL sobel switched to grey scale, but was previously extracting just the 
G channel from ARGB:
https://webrtc-codereview.appspot.com/57479004/

It was based on Bayer code which used a channel selector.  The calling code was
ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);

// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
 147 LIBYUV_API
 148 int ARGBToG(const uint8* src_argb, int src_stride_argb,
 149             uint8* dst_g, int dst_stride_g,
 150             int width, int height);

// Specialized ARGB to Bayer that just isolates G channel.  
 5879 __declspec(naked) 
 5880 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,   
 5881                            uint32 selector, int pix) {    
 5882   __asm { 
 5883     mov        eax, [esp + 4]    // src_argb  
 5884     mov        edx, [esp + 8]    // dst_bayer 
 5885                                  // selector  
 5886     mov        ecx, [esp + 16]   // pix   
 5887     pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff  
 5888     psrld      xmm5, 24   
 5889   
 5890   wloop:  
 5891     movdqu     xmm0, [eax]    
 5892     movdqu     xmm1, [eax + 16]   
 5893     lea        eax, [eax + 32]    
 5894     psrld      xmm0, 8  // Move green to bottom.  
 5895     psrld      xmm1, 8    
 5896     pand       xmm0, xmm5 
 5897     pand       xmm1, xmm5 
 5898     packssdw   xmm0, xmm1 
 5899     packuswb   xmm0, xmm1 
 5900     movq       qword ptr [edx], xmm0  
 5901     lea        edx, [edx + 8] 
 5902     sub        ecx, 8 
 5903     jg         wloop  
 5904     ret   
 5905   }   
 5906 }

#ifdef HAS_ARGBTOBAYERGGROW_SSE2    
 5034 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,   
 5035                            uint32 selector, int pix) {    
 5036   asm volatile (  
 5037     "pcmpeqb   %%xmm5,%%xmm5                   \n"    
 5038     "psrld     $0x18,%%xmm5                    \n"    
 5039     LABELALIGN    
 5040   "1:                                          \n"    
 5041     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"    
 5042     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"    
 5043     "lea       " MEMLEA(0x20,0) ",%0           \n"    
 5044     "psrld     $0x8,%%xmm0                     \n"    
 5045     "psrld     $0x8,%%xmm1                     \n"    
 5046     "pand      %%xmm5,%%xmm0                   \n"    
 5047     "pand      %%xmm5,%%xmm1                   \n"    
 5048     "packssdw  %%xmm1,%%xmm0                   \n"    
 5049     "packuswb  %%xmm1,%%xmm0                   \n"    
 5050     "movq      %%xmm0," MEMACCESS(1) "         \n"    
 5051     "lea       " MEMLEA(0x8,1) ",%1            \n"    
 5052     "sub       $0x8,%2                         \n"    
 5053     "jg        1b                              \n"    
 5054   : "+r"(src_argb),  // %0    
 5055     "+r"(dst_bayer), // %1    
 5056     "+r"(pix)        // %2    
 5057   :   
 5058   : "memory", "cc"    
 5059     , "xmm0", "xmm1", "xmm5"  
 5060   );  
 5061 } 
 5062 #endif  // HAS_ARGBTOBAYERGGROW_SSE2

1262 // Select G channels from ARGB.  e.g.  GGGGGGGG    
 1263 #ifdef HAS_ARGBTOBAYERGGROW_NEON  
 1264 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,   
 1265                            uint32 /*selector*/, int pix) {    
 1266   asm volatile (  
 1267   "1:                                          \n"    
 1268     MEMACCESS(0)  
 1269     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load row 8 pixels    
 1270     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop   
 1271     MEMACCESS(1)  
 1272     "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's.   
 1273     "b.gt       1b                             \n"    
 1274   : "+r"(src_argb),   // %0   
 1275     "+r"(dst_bayer),  // %1   
 1276     "+r"(pix)         // %2   
 1277   :   
 1278   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List   
 1279   );  
 1280 } 
 1281 #endif  // HAS_ARGBTOBAYERGGROW_NEON

1254 // Select G channels from ARGB.  e.g.  GGGGGGGG    
 1255 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,   
 1256                            uint32 /*selector*/, int pix) {    
 1257   asm volatile (  
 1258   "1:                                          \n"    
 1259     MEMACCESS(0)  
 1260     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels. 
 1261     "subs       %2, %2, #8                     \n"  // 8 processed per loop   
 1262     MEMACCESS(1)  
 1263     "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.   
 1264     "bgt        1b                             \n"    
 1265   : "+r"(src_argb),   // %0   
 1266     "+r"(dst_bayer),  // %1   
 1267     "+r"(pix)         // %2   
 1268   :   
 1269   : "cc", "memory", "q0", "q1"  // Clobber List   
 1270   );  
 1271 }

2089 // Select G channel from ARGB.  e.g.  GGGGGGGG   
 2090 void ARGBToBayerGGRow_C(const uint8* src_argb,    
 2091                         uint8* dst_bayer, uint32 selector, int pix) { 
 2092   // Copy a row of G. 
 2093   int x;  
 2094   for (x = 0; x < pix - 1; x += 2) {  
 2095     dst_bayer[0] = src_argb[1];   
 2096     dst_bayer[1] = src_argb[5];   
 2097     src_argb += 8;    
 2098     dst_bayer += 2;   
 2099   }   
 2100   if (pix & 1) {  
 2101     dst_bayer[0] = src_argb[1];   
 2102   }   
 2103 }

 281 // ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.  
 282 #define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK)        \   
 283     void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) {   \   
 284       int n = width & ~MASK;                                                   \   
 285       if (n > 0) {                                                             \   
 286         ARGBTORGB_SIMD(src, dst, selector, n);                                 \   
 287       }                                                                        \   
 288       ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK);      \   
 289     }  
 290    
 291 #if defined(HAS_ARGBTOBAYERGGROW_SSE2) 
 292 BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, 
 293          4, 1, 7)  
 294 #endif 
 295 #if defined(HAS_ARGBTOBAYERGGROW_NEON) 
 296 BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, 
 297          4, 1, 7)  
 298 #endif 
 299    
 300 #undef BAYERANY

Original issue reported on code.google.com by fbarch...@google.com on 25 Feb 2016 at 12:04

GoogleCodeExporter commented 8 years ago

Just wanted to let you know that spec discussion went through and I can use a 
ARGBtoA() function here: 
https://code.google.com/p/chromium/codesearch#chromium/src/content/renderer/medi
a/canvas_capture_handler.cc&l=212
Let me know if it makes sense to add it, or if there is any way I can help.

Original comment by emir...@google.com on 11 Mar 2016 at 2:34

GoogleCodeExporter commented 8 years ago

Original comment by mag...@google.com on 14 Mar 2016 at 12:51

GoogleCodeExporter commented 8 years ago

It would seem that we could use the same or almost the same code for extracting 
any of the channels, R, G, B, or A. It only takes adding an offset 0..3 to the 
source pointer.

Or do we want separate loops for efficiency reasons?  I.e., buffers are assumed 
to normally be 128-bit aligned and we want to give the hardware the chance to 
optimize our aligned loads (essentially execute movdqu as if they were movdqa).

Original comment by torbjo...@google.com on 14 Mar 2016 at 2:04

GoogleCodeExporter commented 8 years ago

There are 2 ways you could write this for NEON or SSE2:

1. hard coded, likely with a shift.  Likely most efficient, and can extract any 
1 byte from 4 bytes by adjusting the pointer.

2. shuffle - read 16 bytes, shuffle and write 4 bytes.  More general, but 
likely slower.

If we really need ARGBToA and BGRAToA, I would suggest shuffle.
But if it's just ARGBToA, a shift is simpler/faster.

Original comment by fbarch...@google.com on 14 Mar 2016 at 6:01

Changed state: Accepted

bmharper / libyuv

ARGBToA #572