DTolm / VkFFT

Vulkan/CUDA/HIP/OpenCL/Level Zero/Metal Fast Fourier Transform library
MIT License
1.48k stars 88 forks source link

[Need Help] Is it possible to reduce compiled binary size when using VkFFT.h? #136

Closed AtomicVar closed 9 months ago

AtomicVar commented 9 months ago

Hi! First, thank you for this awesome multi-platform GPU FFT library. I'm using it for an iOS App (Metal backend). The only thing that bothers me is that this library is header-only, which requires me to include all of its source code into my source code, and it makes the compiled binary have a large size.

Even for the simplest possible demo, which includes vkFFT.h and does nothing else, the resulting executable is ~312 KiB:

// Compile with:
// clang++ -Imetal-cpp -std=c++17 -framework Foundation -framework QuartzCore -framework Metal -Wno-deprecated-declarations t.cpp -o t
#include <iostream>

#define VKFFT_BACKEND 5
#include "vkFFT/vkFFT.h"

// Intentionally empty entry point: the demo only measures how much binary size
// is added by including the VkFFT header.
int main() { return 0; }

After I added my own code, which only calls a few VkFFT APIs to do a 2D FFT/iFFT, the executable grows to 1.2 MiB:

#include <unistd.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#define VKFFT_BACKEND 5
#include "vkFFT/vkFFT.h"

#define PAGE_SIZE 4096

// Rounds `sz` up to the next multiple of the system's virtual-memory page size.
//
// The page size is queried at run time instead of being hard-coded: it is
// 4 KiB on many systems but 16 KiB on arm64 Apple devices, and a buffer that
// wraps existing memory without copying needs a length that is a multiple of
// the real page size. Falls back to 4096 if the query fails.
//
// Returns 0 for sz == 0. Values within one page of UINT64_MAX wrap around.
uint64_t align_to_page_size(uint64_t sz) {
  // Queried once; the page size cannot change while the process is running.
  static const uint64_t page = [] {
    const long queried = sysconf(_SC_PAGESIZE);
    return queried > 0 ? static_cast<uint64_t>(queried) : uint64_t{4096};
  }();
  return (sz + page - 1) / page * page;
}

// Batched 2D real-to-complex FFT (and its inverse) on the default Metal
// device, implemented on top of VkFFT.
//
// The instance owns the Metal device, the command queue, two page-aligned host
// allocations and the two MTL::Buffer objects that wrap those allocations
// without copying:
//   * ibuffer - real input, h * w floats per batch; the inverse transform
//               writes its result back here (inverseReturnToInputBuffer).
//   * buffer  - complex output, h * (w / 2 + 1) float pairs per batch.
//
// If any part of the setup fails, an error is printed and the instance is left
// in a "not ready" state in which Forward()/Backward() do nothing.
//
// The class is non-copyable because it holds raw owning handles.
class MetalFFT2D {
  float *ibufferData = nullptr;
  float *bufferData = nullptr;

public:
  MTL::Buffer *ibuffer = nullptr;
  MTL::Buffer *buffer = nullptr;

  int height = 0;
  int width = 0;

  // Builds the VkFFT plan for `batchSize` transforms of size h x w.
  MetalFFT2D(int h, int w, int batchSize) : height(h), width(w) {
    configuration.FFTdim = 2;
    configuration.size[0] = w;
    configuration.size[1] = h;
    configuration.performR2C = 1;
    configuration.numberBatches = batchSize;
    configuration.inverseReturnToInputBuffer = 1;
    configuration.device = MTL::CreateSystemDefaultDevice();
    if (configuration.device == nullptr) {
      printf("[ERROR] no Metal device available\n");
      return;
    }
    configuration.queue = configuration.device->newCommandQueue();
    if (configuration.queue == nullptr) {
      printf("[ERROR] failed to create Metal command queue\n");
      return;
    }

    // Prepare Metal Buffer with aligned memory, which allows it to create new
    // Buffers without copy. Both the address and the length have to be
    // multiples of the real page size, so query it instead of assuming 4 KiB
    // (arm64 Apple devices use 16 KiB pages).
    const long queriedPage = sysconf(_SC_PAGESIZE);
    const uint64_t pageSize =
        queriedPage > 0 ? static_cast<uint64_t>(queriedPage) : PAGE_SIZE;

    const int rowStride = w / 2 + 1;
    bufferSize = RoundUpToPage((uint64_t)sizeof(float) * 2 * h * rowStride *
                                   configuration.numberBatches,
                               pageSize);
    ibufferSize = RoundUpToPage(
        (uint64_t)sizeof(float) * h * w * configuration.numberBatches,
        pageSize);

    // Only publish the pointers on success; on failure the members stay null
    // so the destructor's free() calls remain harmless.
    void *inputMem = nullptr;
    void *outputMem = nullptr;
    if (posix_memalign(&inputMem, pageSize, ibufferSize) != 0) {
      printf("[ERROR] failed to allocate page-aligned input buffer\n");
      return;
    }
    ibufferData = static_cast<float *>(inputMem);
    if (posix_memalign(&outputMem, pageSize, bufferSize) != 0) {
      printf("[ERROR] failed to allocate page-aligned output buffer\n");
      return;
    }
    bufferData = static_cast<float *>(outputMem);

    ibuffer = configuration.device->newBuffer(
        ibufferData, ibufferSize, MTL::ResourceStorageModeShared, nil);
    buffer = configuration.device->newBuffer(
        bufferData, bufferSize, MTL::ResourceStorageModeShared, nil);
    if (ibuffer == nullptr || buffer == nullptr) {
      printf("[ERROR] failed to create Metal buffers\n");
      return;
    }

    // The configuration stores the ADDRESSES of the buffer handles and of the
    // sizes, so all four must be members that live as long as `configuration`.
    configuration.buffer = &buffer;
    configuration.bufferSize = &bufferSize;

    configuration.isInputFormatted = 1;
    configuration.inputBuffer = &ibuffer;
    configuration.inputBufferSize = &ibufferSize;
    configuration.inputBufferStride[0] = configuration.size[0];
    configuration.inputBufferStride[1] =
        configuration.size[0] * configuration.size[1];

    // initialize app
    res = initializeVkFFT(&app, configuration);
    if (res != VKFFT_SUCCESS) {
      printf("[ERROR] initializeVkFFT returned %d\n", res);
      return;
    }
    ready = true;
  }

  // Copying would duplicate the raw owning handles and free them twice.
  MetalFFT2D(const MetalFFT2D &) = delete;
  MetalFFT2D &operator=(const MetalFFT2D &) = delete;

  ~MetalFFT2D() {
    deleteVkFFT(&app);
    // Release the no-copy buffers BEFORE the memory they wrap.
    if (ibuffer != nullptr) {
      ibuffer->release();
    }
    if (buffer != nullptr) {
      buffer->release();
    }
    // posix_memalign memory must be returned with free(), never delete[].
    free(ibufferData);
    free(bufferData);
    if (configuration.queue != nullptr) {
      configuration.queue->release();
    }
    if (configuration.device != nullptr) {
      configuration.device->release();
    }
  }

  // Runs the forward transform: ibuffer (real) -> buffer (complex).
  // Blocks until the GPU work has completed.
  void Forward() { Run(-1, "Forward"); }

  // Runs the inverse transform: buffer (complex) -> ibuffer (real).
  // Blocks until the GPU work has completed.
  void Backward() { Run(1, "Backward"); }

private:
  // Rounds `bytes` up to the next multiple of `page`.
  static uint64_t RoundUpToPage(uint64_t bytes, uint64_t page) {
    return (bytes + page - 1) / page * page;
  }

  // Encodes one transform into a fresh command buffer, commits it and waits.
  // `direction` is passed straight to VkFFTAppend (-1 forward, 1 inverse);
  // `label` only appears in diagnostics.
  void Run(int direction, const char *label) {
    if (!ready) {
      printf("[MetalFFT2D %s] FFT application is not initialized.\n", label);
      return;
    }

    // Prepare launch params
    VkFFTLaunchParams launchParams = {};
    MTL::CommandBuffer *commandBuffer = configuration.queue->commandBuffer();
    if (commandBuffer == nullptr) {
      printf("[MetalFFT2D %s] failed to create command buffer.\n", label);
      return;
    }
    launchParams.commandBuffer = commandBuffer;
    MTL::ComputeCommandEncoder *commandEncoder =
        commandBuffer->computeCommandEncoder();
    if (commandEncoder == nullptr) {
      printf("[MetalFFT2D %s] failed to create command encoder.\n", label);
      return;
    }
    launchParams.commandEncoder = commandEncoder;

    // encode the transform; the encoder must be ended whether or not this
    // succeeds
    res = VkFFTAppend(&app, direction, &launchParams);
    commandEncoder->endEncoding();
    if (res != VKFFT_SUCCESS) {
      printf("[MetalFFT2D %s] VkFFTAppend failed and returned %d\n", label,
             res);
      // Do not submit a partially encoded transform.
      return;
    }

    // commit and wait: callers presumably read the shared buffers from the CPU
    // right after this returns, so the GPU work has to be finished by then.
    commandBuffer->commit();
    commandBuffer->waitUntilCompleted();

    // NOTE(review): as in the original, the command buffer and encoder are not
    // released explicitly here - confirm this matches metal-cpp's ownership
    // rules for objects returned by commandBuffer()/computeCommandEncoder().
  }

  // Sizes in bytes of `buffer` and `ibuffer`; `configuration` points at them.
  uint64_t bufferSize = 0;
  uint64_t ibufferSize = 0;

  // True once initializeVkFFT has succeeded.
  bool ready = false;

  // Configuration + FFT application
  VkFFTConfiguration configuration = {};
  VkFFTApplication app = {};
  VkFFTResult res = VKFFT_SUCCESS;
};

// wrap MetalFFT2D as C interface, so that it can be called by various languages
// (C/C++/Objective-C/Objective-C++)
extern "C" {

// MetalFFT2D handle
typedef void *MetalFFT2D_t;

// Creates a MetalFFT2D for `b` batched h x w transforms and returns it as an
// opaque handle. The handle must be passed to fft2d_deinit when done.
MetalFFT2D_t fft2d_init(int h, int w, int b) {
  MetalFFT2D *instance = new MetalFFT2D(h, w, b);
  return instance;
}

// Destroys the MetalFFT2D behind `handle`. A null handle is accepted and
// ignored, since deleting a null pointer is a no-op.
void fft2d_deinit(MetalFFT2D_t handle) {
  delete reinterpret_cast<MetalFFT2D *>(handle);
}

// Runs the forward FFT on the instance behind `handle`.
// A null handle is ignored, consistent with fft2d_deinit, rather than being
// dereferenced.
void fft2d(MetalFFT2D_t handle) {
  MetalFFT2D *obj = reinterpret_cast<MetalFFT2D *>(handle);
  if (obj == nullptr) {
    return;
  }
  obj->Forward();
}

// Runs the inverse FFT on the instance behind `handle`.
// A null handle is ignored, consistent with fft2d_deinit, rather than being
// dereferenced.
void ifft2d(MetalFFT2D_t handle) {
  MetalFFT2D *obj = reinterpret_cast<MetalFFT2D *>(handle);
  if (obj == nullptr) {
    return;
  }
  obj->Backward();
}
}

// Intentionally empty entry point: the program exists only so the wrapper
// above gets compiled and linked for the binary-size measurement.
int main() {
  return 0;
}

My question: is it possible to exclude the parts I don't need so that the compiled binary is smaller? I am willing to trade some performance for a smaller binary, e.g. by being limited to a few simple algorithms rather than having every possible algorithm available.

DTolm commented 9 months ago

Hello,

Thanks for being interested in VkFFT! Sorry for the late reply. I think it would be hard to make the code able to exclude some algorithms, and even harder to maintain, so I can't promise this will ever be implemented. I didn't think ~1 MB was a big issue on most modern systems?

What I can suggest is using one of the much older versions of VkFFT, which had fewer algorithms (like 1.2.17); maybe that will suit your use case (though those versions didn't support Metal).

Best regards, Dmitrii

en-GB commented 9 months ago

The compiler will remove dead code if you tell it to. Your first example is ~10 KB when built with clang++ -s -Os (plus the rest of your flags). Hope that helps.