Closed gfoidl closed 6 years ago
Just for reference a portion of C++:
#include <iostream>
#include <immintrin.h>
//-----------------------------------------------------------------------------
// Returns the largest of the four floats starting at `a`.
//
// The vector is rotated by one lane and max-combined with itself three times.
// After that, every lane (including lane 0, which is returned) has seen all
// four inputs.
//
// `a` must point to at least four readable floats; no particular alignment is
// required.
float max_sse(float* a)
{
    // Unaligned load: a plain float[4] is only guaranteed 4-byte alignment,
    // which an aligned __m128 dereference would not tolerate.
    __m128 maxval = _mm_loadu_ps(a);

    for (int i = 0; i < 3; ++i)
    {
        // 0x93 selects source lanes (3, 0, 1, 2) for destination lanes
        // (0, 1, 2, 3), i.e. a rotation by one lane.
        __m128 tmp = _mm_shuffle_ps(maxval, maxval, 0x93);
        maxval = _mm_max_ps(maxval, tmp);
    }

    // Lane 0 holds the overall maximum.
    return _mm_cvtss_f32(maxval);
}
//-----------------------------------------------------------------------------
// Returns the largest of the four doubles starting at `a`.
//
// The vector is rotated by one lane and max-combined with itself three times.
// After that, every lane (including lane 0, which is returned) has seen all
// four inputs.
//
// `a` must point to at least four readable doubles; no particular alignment is
// required. Requires a CPU with AVX2 (for _mm256_permute4x64_pd).
//
// The target attribute lets GCC/Clang compile this function without a global
// -mavx2 switch; compilers that do not know the attribute ignore it.
[[gnu::target("avx2")]]
double max_sse(double* a)
{
    // Unaligned load: a plain double[4] is not guaranteed 32-byte alignment,
    // which an aligned __m256d dereference would not tolerate.
    __m256d maxval = _mm256_loadu_pd(a);

    for (int i = 0; i < 3; ++i)
    {
        // 0x39 selects source lanes (1, 2, 3, 0) for destination lanes
        // (0, 1, 2, 3), i.e. a rotation by one lane.
        __m256d tmp = _mm256_permute4x64_pd(maxval, 0x39);
        maxval = _mm256_max_pd(maxval, tmp);
    }

    // Extract lane 0 only. Storing the whole 256-bit vector into a single
    // double would write 32 bytes into an 8-byte object.
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(maxval));
}
//-----------------------------------------------------------------------------
// Packs four 2-bit lane selectors into one 8-bit shuffle immediate.
// The first argument occupies bits 0-1 and the last argument bits 6-7, so the
// arguments are listed lowest lane first. This is the reverse of the argument
// order taken by _MM_SHUFFLE from <xmmintrin.h>.
#define MM_SHUFFLE(sel0, sel1, sel2, sel3) \
    ((sel0) | ((sel1) << 2) | ((sel2) << 4) | ((sel3) << 6))
//-----------------------------------------------------------------------------
// Demo driver: computes the maximum of {1, 2, 3, 4} with the float and the
// double overload of max_sse and prints both results, one per line.
int main()
{
    // Two shuffle immediates; neither value is used below.
    // NOTE(review): with this file's MM_SHUFFLE argument order,
    // MM_SHUFFLE(2, 1, 0, 3) evaluates to 0xC6, not 0x93 -- confirm which
    // encoding was intended.
    int a = 0x93;
    int b = MM_SHUFFLE(2, 1, 0, 3);

    // Aligned to the respective vector width so the arrays are also valid
    // inputs for aligned vector loads.
    alignas(16) float arr[] = {1, 2, 3, 4};
    float max = max_sse(arr);

    alignas(32) double darr[] = {1, 2, 3, 4};
    double dmax = max_sse(darr);

    // '\n' instead of std::endl: no per-line flush is needed; the stream is
    // flushed at program exit.
    std::cout << max << '\n';
    std::cout << dmax << '\n';
}
I haven't tested the double-version in C#, because in the reference-assembly _mm256_permute4x64_pd
is missing (though it's available in CoreLib).
But I believe the implemented variant is faster, because it just extracts a `__m128` out of the `__m256d` instead of rotating and applying min/max.
Fixes #43
#44 made some improvements on this, but the codegen wasn't perfect. Cf. https://github.com/gfoidl/Stochastics/pull/44#issuecomment-364503574
https://github.com/gfoidl/Stochastics/issues/43#issuecomment-382133531 opened the door for better codegen, this is the implementation.
The biggest improvement over #44 is in `ReduceMinMax`. With wonderful dasm :wink: