root-project / veccore

C++ Library for Portable SIMD Vectorization
https://root-project.github.io/veccore
Other
80 stars 21 forks source link

Mandelbrot benchmark's SIMD doesn't give the same output as the scalar version. #26

Open Panhaolin2001 opened 8 hours ago

Panhaolin2001 commented 8 hours ago
#include "VecCore/VecCore"

using namespace vecCore;

template<typename T>
void mandelbrot(T xmin, T xmax, size_t nx,
                T ymin, T ymax, size_t ny,
                size_t max_iter, unsigned char *image)
{
    T dx = (xmax - xmin) / T(nx);
    T dy = (ymax - ymin) / T(ny);

    for (size_t i = 0; i < nx; ++i) {
        for (size_t j = 0; j < ny; ++j) {
            size_t k = 0;
            T x = xmin + T(i) * dx, cr = x, zr = x;
            T y = ymin + T(j) * dy, ci = y, zi = y;

            do {
                x  = zr*zr - zi*zi + cr;
                y  = T(2.0) * zr*zi + ci;
                zr = x;
                zi = y;
            } while (++k < max_iter && (zr*zr + zi*zi < T(4.0)));

            image[ny*i + j] = k;
        }
    }
}

template<typename T>
void mandelbrot_v(Scalar<T> xmin, Scalar<T> xmax, size_t nx,
                  Scalar<T> ymin, Scalar<T> ymax, size_t ny,
                  Scalar<Index<T>> max_iter, unsigned char *image)
{
    T iota;
    for (size_t i = 0; i < VectorSize<T>(); ++i)
        Set<T>(iota, i, i);

    T dx = T(xmax - xmin) / T((Scalar<T>)nx);
    T dy = T(ymax - ymin) / T((Scalar<T>)ny), dyv = iota * dy;

    for (size_t i = 0; i < nx; ++i) {
        for (size_t j = 0; j < ny; j += VectorSize<T>()) {
            Scalar<Index<T>> k(0);
            T x = xmin + T((Scalar<T>)i) * dx,       cr = x, zr = x;
            T y = ymin + T((Scalar<T>)j) * dy + dyv, ci = y, zi = y;

            Index<T> kv(0);
            Mask<T> m(true);

            do {
                x = zr*zr - zi*zi + cr;
                y = T((Scalar<T>)2.0) * zr*zi + ci;
                MaskedAssign<T>(zr, m, x);
                MaskedAssign<T>(zi, m, y);
                MaskedAssign<Index<T>>(kv, m, ++k);
                m = zr*zr + zi*zi < T((Scalar<T>)4.0);
            } while (k < max_iter && !MaskEmpty(m));

            for (size_t k = 0; k < VectorSize<T>(); ++k)
                image[ny*i + j + k] = (unsigned char) Get(kv, k);
        }
    }
}

int main(){
    double xmin = -2.1, xmax = 1.1;
    double ymin = -1.35, ymax = 1.35;

    size_t nx = 1024, ny = 864, max_iter = 500;
    unsigned char *image_scalar = new unsigned char[nx*ny];
    unsigned char *image_simd = new unsigned char[nx*ny];

    mandelbrot_v<backend::SIMDNative::Double_v>(xmin, xmax, nx, ymin, ymax, ny,
                            max_iter, image_simd);
    mandelbrot<double>(xmin, xmax, nx, ymin, ymax, ny,
                            max_iter, image_scalar);

    for(int i=0 ; i<nx*ny; ++i){
        assert(image_simd[i] == image_scalar[i]);
    }

    delete[] image_scalar;
    delete[] image_simd;
}
Panhaolin2001 commented 8 hours ago

My compiler version is clang 16.0.6, and the compile options are -march=native -O2 -std=c++2a