halide / Halide

a language for fast, portable data-parallel computation
https://halide-lang.org

Blur implementation #2905

Open Bastacyclop opened 6 years ago

Bastacyclop commented 6 years ago

Hello, I am having some trouble implementing a fast blur with Halide. I am trying to match hand-written code that uses rotation of vertical reductions (the code is below and the technique is explained in this paper), without success. The default inline Halide schedule was surprisingly faster than both my hand-written inline version and my rotation of vertical reductions. So I tried compiling the Halide-generated C code instead of using the generated object file, and that was slower than both of my hand-written versions.

How is the generated object file that much faster than -Ofast -march=native? Can I get the rotation of vertical reductions schedule with Halide?

Codegen:

#include "Halide.h"

#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

using namespace Halide;

const Type real = Float(32);

struct Kernel {
    const char* name;
    Func f;
    std::vector<Argument> args;
};

Kernel blur_kernel(const Target& target) {
    ImageParam input(real, 2, "input");

    Func clamped_input = Halide::BoundaryConditions::repeat_edge(input);
    Func vertical("vertical"), horizontal("horizontal"), blur("blur");
    Var x("x"), y("y");

    vertical(x, y) = clamped_input(x, y-1) + 2*clamped_input(x, y) + clamped_input(x, y+1);
    horizontal(x, y) = vertical(x-1, y) + 2*vertical(x, y) + vertical(x+1, y);
    blur(x, y) = horizontal(x, y) * 1.f/16.f;

    ////

    auto vsize = target.natural_vector_size(real);
    input.set_host_alignment(vsize);
    input.dim(0).set_stride(1);
    // input.dim(0).set_min(0);
    // input.dim(1).set_min(0);

    // I tried various schedules; the following seemed closest to what I want but is incredibly slow:
    // vertical.store_at(blur, y).compute_at(blur, x);

    return {
        .name = "blur",
        .f = blur,
        .args = { input }
    };
}

void generate_kernel(Kernel& k, const Target& target, const std::string& gendir) {
    std::cout << "generating " << k.name << std::endl;
    k.f.print_loop_nest();
    k.f.compile_to_file(gendir + k.name, k.args, k.name, target);
    k.f.compile_to_lowered_stmt(gendir + k.name + ".txt", k.args, Text, target);
    k.f.compile_to_c(gendir + k.name + ".c", k.args, k.name, target);
    k.f.compile_to_assembly(gendir + k.name + ".asm", k.args, k.name, target);
}

int main(int argc, char** argv) {
    Target target = get_host_target()
        .with_feature(Target::NoRuntime);

    // Output directory prefix for the generated files, e.g. "$hw/gen/" (must end with '/').
    const std::string gendir = (argc > 1) ? argv[1] : "./";

    Kernel blur = blur_kernel(target);
    generate_kernel(blur, target, gendir);

    return EXIT_SUCCESS;
}

Benchmark:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <time.h>

#include "HalideBuffer.h"
#include "blur.h"  // header emitted by compile_to_file in the generator above

template<typename F, typename TS, typename TE>
double time_ms(int sample_count, int batch_count, F f,
               TS time_start, TE time_elapsed) {
    double min_elapsed;

    for (int s = 0; s < sample_count; s++) {
        time_start();

        for (int b = 0; b < batch_count; b++) {
            f();
        }

        double elapsed = time_elapsed();
        if (s == 0 || elapsed < min_elapsed) {
            min_elapsed = elapsed;
        }
    }

    return min_elapsed / (double)(batch_count);
}

template<typename F>
double cpu_time_ms(int sample_count, int batch_count, F f) {
    struct timespec t1, t2;
    auto time_start = [&] {
        clock_gettime(CLOCK_MONOTONIC, &t1);
    };
    auto time_elapsed = [&] {
        clock_gettime(CLOCK_MONOTONIC, &t2);

        return (double)(t2.tv_sec - t1.tv_sec) * 1000. +
            (double)(t2.tv_nsec - t1.tv_nsec) / 1000000.;
    };
    return time_ms(sample_count, batch_count, f, time_start, time_elapsed);
}

const int size = 2000;
const int sample_count = 3;
const int batch_count = 100;

template<typename F>
void bench(const char* name, F f) {
    double time = cpu_time_ms(sample_count, batch_count, f);
    double nspp = (time * 1e6) / (size * size);
    printf("%s time: %.4lf ms, %.4lf ns/px\n", name, time, nspp);
}

int main(int argc, char** argv) {
    Halide::Runtime::Buffer<float> i0(size, size);
    Halide::Runtime::Buffer<float> i1(size, size);
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            i0(x, y) = y + x;
        }
    }

    bench("blur", [&] {
        blur(i0, i1);
        i1.device_sync();
    });

    float* __restrict a = (float*) malloc(size * size * sizeof(float));
    float* __restrict b = (float*) malloc(size * size * sizeof(float));
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            a[y*size + x] = y + x;
        }
    }

    bench("blur-inline", [&] {
        int r0 = 0*size;
        for (int y = 0; y < (size-1); y++) {
            int r1 = y*size;
            int r2 = (y+1)*size;

            int x0 = 0;
            for (int x = 0; x < (size-1); x++) {
                int x1 = x;
                int x2 = x+1;

                float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
                float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
                float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

                x0 = x1;
            }

            // last column
            int x1 = size-1;
            int x2 = x1;
            float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
            float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
            float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
            b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

            r0 = r1; // rotate the row offsets so the next iteration reads the row above
        }

        // last row
        int y = size-1;
        int r1 = y*size;
        int r2 = r1;

        int x0 = 0;
        for (int x = 0; x < (size-1); x++) {
            int x1 = x;
            int x2 = x+1;

            float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
            float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
            float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
            b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

            x0 = x1;
        }

        // last column
        int x1 = size-1;
        int x2 = x1;
        float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
        float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
        float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
        b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
    });

    bench("blur-reduction", [&] {
        for (int y = 0; y < size; y++) {
            float* r0 = &a[std::max(0, y-1)*size];
            float* r1 = &a[y*size];
            float* r2 = &a[std::min(y+1, size-1)*size];

            float v0 = r0[0] + 2.f*r1[0] + r2[0];
            float v1 = v0;
            for (int x = 0; x < (size-1); x++) {
                float v2 = r0[x+1] + 2.f*r1[x+1] + r2[x+1];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

                v0 = v1;
                v1 = v2;
            }

            // last column
            float v2 = v1;
            b[y*size + (size-1)] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
        }
    });

    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            if (i1(x, y) != b[y*size + x]) {
                fprintf(stderr, "output mismatch (%i, %i) = %f or %f\n", x, y, i1(x, y), b[y*size + x]);
            }
        }
    }

    free(a);
    free(b);
    return EXIT_SUCCESS;
}

Running:

# hw=host, CC=clang++
mkdir -p $hw/bin &&
# uncomment to use the generated C++
# $CC $hw/gen/blur.c -c -o $hw/gen/blur.o -std=c++11 -Ofast -march=native &&
$CC src/bench.cpp $hw/gen/blur.o ../$hw/halide_runtime.a -Ofast -o $hw/bin/bench -I $hw/gen/ -I ../halide-headers/ -lpthread -ldl -lrt -std=c++11 -D_POSIX_C_SOURCE=200809 -march=native &&
./$hw/bin/bench

Intel i5 results:

Thank you for your help. I also planned to add parallel tiles and vectorisation on top of this.
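
For concreteness, a sketch of what such a tiled schedule could look like inside blur_kernel (xo, yo, xi, yi are new Vars introduced here for illustration; the tile sizes and vector width are untuned guesses):

    Var xo("xo"), yo("yo"), xi("xi"), yi("yi");
    blur.tile(x, y, xo, yo, xi, yi, 64, 64)  // process the output in 64x64 tiles
        .vectorize(xi, 8)                    // vectorize along x within each tile
        .parallel(yo);                       // run rows of tiles in parallel
    vertical.compute_at(blur, xo)            // compute the vertical pass once per tile
        .vectorize(x, 8);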

abadams commented 6 years ago

I would explicitly vectorize it from the start. If you don't, then in some of your benchmarks LLVM (which sits underneath Halide) will successfully autovectorize the code, and in others it won't, so you'll accidentally be comparing vector code to scalar code. In particular, your store_at/compute_at schedule is written in a way that will prevent autovectorization (because there's a serial dependence along the scanline).

Generating C is almost always going to produce slower code. Going from Halide directly to machine code uses all the same LLVM compiler machinery a C compiler would use at -Ofast -ffast-math, but we can additionally exploit information that we can't easily express in C (e.g. which branches are likely, which memory addresses may alias).

All things considered, I think you want something like:

blur.compute_root().vectorize(x, 8).parallel(y);
vertical.compute_at(blur, y).vectorize(x, 8);

abadams commented 6 years ago

Note that in my schedule above I explicitly didn't use a rotating buffer because the additional modulo-indexing code in the inner loop usually makes it not worth it at that fine granularity. You might want to unroll and rotate through a set of registers to avoid that indexing, but that's not something Halide can express right now.
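
For reference, a minimal sketch of the modulo-indexed form being described, reusing the b/r0/r1/r2/size names from the blur-reduction benchmark above (interior columns only, edge handling omitted). The three most recent vertical sums live in a small circular buffer, and the % 3 indexing is the extra inner-loop work:

float vbuf[3];
vbuf[0] = r0[0] + 2.f*r1[0] + r2[0]; // vertical sum at column 0
vbuf[1] = r0[1] + 2.f*r1[1] + r2[1]; // vertical sum at column 1
for (int x = 1; x < size - 1; x++) {
    // the vertical sum at column x+1 overwrites the slot that held column x-2
    vbuf[(x + 1) % 3] = r0[x+1] + 2.f*r1[x+1] + r2[x+1];
    b[y*size + x] = (vbuf[(x - 1) % 3] + 2.f*vbuf[x % 3] + vbuf[(x + 1) % 3]) * (1.f/16.f);
}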

Bastacyclop commented 6 years ago

Thank you for the explanation. I removed the vector features from the target (this should prevent autovectorisation, right?) and the difference is not that big, so the additional LLVM information seems to bring most of the speedup. I did expect the generated C to be slower for that reason, but I did not expect it to be that much slower (around 3 times)!
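
For reference, vector features can be removed from the host target roughly like this (a sketch, not necessarily the exact set I disabled; note that baseline SSE2 cannot be turned off on x86-64, so LLVM can still emit some vector code):

Target scalar_target = get_host_target()
    .without_feature(Target::AVX512)
    .without_feature(Target::AVX2)
    .without_feature(Target::AVX)
    .without_feature(Target::FMA)
    .without_feature(Target::SSE41);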

Do you think register rotation would be hard to add to Halide's capabilities?

Also, when I try to vectorize the code for ARM/NEON there doesn't seem to be any vector instruction generated (no instructions starting with "v"). Am I missing something?

    Target target = Target(Target::Linux, Target::ARM, 64)
        .with_feature(Target::ARMv7s)
        .with_feature(Target::CUDA)
        .with_feature(Target::CUDACapability61)
        // no performance difference with this:
        // .with_feature(Target::Feature::NoNEON)
        ;
    auto vsize = target.natural_vector_size(real);
    input.set_host_alignment(vsize);
    input.dim(0).set_stride(1);

    blur.vectorize(x, vsize, TailStrategy::RoundUp)
        .align_storage(x, vsize)
        .align_bounds(x, vsize)
        .parallel(y, 10);
    vertical.compute_at(blur, y)
        .vectorize(x, vsize, TailStrategy::RoundUp)
        .align_storage(x, vsize)
        .align_bounds(x, vsize);

Bastacyclop commented 2 years ago

Hi @abadams and @dsharletg.

Note that in my schedule above I explicitly didn't use a rotating buffer because the additional modulo-indexing code in the inner loop usually makes it not worth it at that fine granularity. You might want to unroll and rotate through a set of registers to avoid that indexing, but that's not something Halide can express right now.

I am curious: can this register rotation now be expressed in Halide after https://github.com/halide/Halide/pull/5815, and what would the blur schedule look like? More precisely, if I want the y dimension to be parallelized, the x dimension to be vectorized, and vector registers to be rotated for the sliding window, can I write the following schedule?

blur.parallel(y).vectorize(x, v);
vertical.store_at(blur, y).compute_at(blur, x).store_in(MemoryType::Register).vectorize(x, v);

The code I would like this schedule to generate is basically what I wrote at the beginning of this issue, plus parallelized and vectorized:

        for (int y = 0; y < size; y++) { // this in parallel
            // all of the following vectorized
            float* r0 = &a[std::max(0, y-1)*size];
            float* r1 = &a[y*size];
            float* r2 = &a[std::min(y+1, size-1)*size];

            float v0 = r0[0] + 2.f*r1[0] + r2[0];
            float v1 = v0;
            for (int x = 0; x < (size-1); x++) {
                float v2 = r0[x+1] + 2.f*r1[x+1] + r2[x+1];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

                v0 = v1;
                v1 = v2;
            }

            float v2 = v1;
            b[y*size + (size-1)] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
        }