SunnyFog commented 4 years ago

In facedetectcnn.cpp line 114, int sumarray[8]; should be: __attribute__((aligned(32))) int sumarray[8];

so that the _mm256_store_si256() call on line 131 receives a 32-byte aligned argument.

This should also be done for the int maxarray_int32x8[8]; around facedetectcnn.cpp line 390

ShiqiYu commented 4 years ago

Dear SunnyFog,

Thank you for the suggestion. It should be aligned. I tried to compile "__attribute__((aligned(32))) int sumarray[8];" using MS Visual Studio, but the compiler complained that __attribute__ is undefined. Could you push a patch that both VS and GCC can compile?

Regards, Shiqi

On Thu, Dec 5, 2019 at 6:30 AM SunnyFog notifications@github.com wrote:

In facedetectcnn.cpp line 114, int sumarray[8]; should be: __attribute__((aligned(32))) int sumarray[8];

so that the _mm256_store_si256() call on line 131 receives a 32-byte aligned argument.

This should also be done for the int maxarray_int32x8[8]; around facedetectcnn.cpp line 390

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/ShiqiYu/libfacedetection/issues/197?email_source=notifications&email_token=ABWR4HNXHRNAHSI6Z7YQRW3QXAVRDA5CNFSM4JVQGH6KYY3PNVWWK3TUL52HS4DFUVEXG43VMWVGG33NNVSW45C7NFSM4H6ERVCQ, or unsubscribe https://github.com/notifications/unsubscribe-auth/ABWR4HK6ZIQS2RAATNQTR5TQXAVRDANCNFSM4JVQGH6A .

SunnyFog commented 4 years ago

This is my first time using github. (I'm not new to git.)

I'm not sure if I'm going about this incorrectly:

    git push origin avx2_alignment
    Username for 'https://github.com': SunnyFog
    remote: Permission to ShiqiYu/libfacedetection.git denied to SunnyFog.
    fatal: unable to access 'https://github.com/ShiqiYu/libfacedetection.git/': The requested URL returned error: 403

Please note that I've experienced different results between the AVX2 code and the default non-vector code as commented below.

The following should fix alignment for both VS and g++ in facedetectcnn.cpp. It has only been tested on Linux:

[convolution_relu: Around line 400 becomes:]

#if defined(_ENABLE_NEON)

#elif defined(_ENABLE_AVX2)

__m256i maxarray_int32x8;
// Use an int* to index into maxarray_int32x8
int     *int_max;
int_max = (int *) &maxarray_int32x8;

_mm256_store_si256( &maxarray_int32x8, max_int32x8);
for(int i=0; i < 8; i++)
    nMaxValue = MAX(int_max[i], nMaxValue);

#endif

[dotProductUint8Int8 becomes:]

inline int dotProductUint8Int8(unsigned char * p1, signed char * p2, int num, int lengthInBytes)
{
int sum = 0;

#if defined(_ENABLE_NEON)

int8x8x2_t a, b;
int16x8_t result_vec;
int32x4_t d;
result_vec = vdupq_n_s16(0); //zeros

for (int i = 0; i < num; i += 16)
{
    a = vld2_s8((signed char*)p1 + i);
    b = vld2_s8(p2 + i);
    result_vec = vmlal_s8(result_vec, a.val[0], b.val[0]);
    result_vec = vmlal_s8(result_vec, a.val[1], b.val[1]);
}
d = vpaddlq_s16(result_vec);
sum += vgetq_lane_s32(d, 0);
sum += vgetq_lane_s32(d, 1);
sum += vgetq_lane_s32(d, 2);
sum += vgetq_lane_s32(d, 3);

#elif defined(_ENABLE_AVX2)

__m256i sumarray;
__m256i sum_int16x16;
__m256i tmp_int32x8;
__m256i a_uint8x32, b_int8x32;
__m256i ones16 = _mm256_set1_epi16(1);
__m256i sum_int32x8 = _mm256_setzero_si256();

// Use an int* to index into sumarray
int     *int_sum;
int_sum = (int *) &sumarray;

for (int i = 0; i < num; i += 32)
{
    a_uint8x32 = _mm256_load_si256((__m256i const *)(p1 + i));
    b_int8x32 = _mm256_load_si256((__m256i const *)(p2 + i));
    sum_int16x16 = _mm256_maddubs_epi16(a_uint8x32, b_int8x32);
    tmp_int32x8 = _mm256_madd_epi16(sum_int16x16, ones16);
    sum_int32x8 = _mm256_add_epi32(sum_int32x8, tmp_int32x8);
}
sum_int32x8 = _mm256_hadd_epi32(sum_int32x8, sum_int32x8);
sum_int32x8 = _mm256_hadd_epi32(sum_int32x8, sum_int32x8);
_mm256_store_si256(&sumarray, sum_int32x8);
sum += (int_sum[0] + int_sum[4]);

#else

#if defined(_ENABLE_OPENMP_SIMD)
#pragma omp simd reduction(+:sum)
#endif

for (int i = 0; i < num; i++)
{
    sum += (int(p1[i]) * int(p2[i]));
}

#endif

// There is a problem with the AVX2 code such that when num==27, the correct sum (produced by the for loop directly above)
// is often not produced. This feels like a failure to zero the end of the vector (or similar), as the result is sometimes
// correct. This can be checked by running both at the same time and detecting when they differ.
// This also produces different detection results.
return sum;
}

ShiqiYu / libfacedetection

AVX2 alignment issues #197

if defined(_ENABLE_NEON)

elif defined(_ENABLE_AVX2)

endif

if defined(_ENABLE_NEON)

elif defined(_ENABLE_AVX2)

else

if defined(_ENABLE_OPENMP_SIMD)

pragma omp simd reduction(+:sum)

endif

endif