Learn OpenCL step by step.
TNN对mali的调度、cl&gl交互、subworkgroup的magic number

arm mali:针对buffer的特殊调度策略


下面将针对 TNN 在 ARM Mali 上的调度策略进行分析。相关回答中提到了 clFlush;根据 OpenCL 规范对 clFlush 的说明,以下阻塞调用会对命令队列隐式执行一次 flush:


  1. 带有 blocking_read=CL_TRUE 的 clEnqueueReadImage、clEnqueueReadBuffer
  2. 带有 blocking_write=CL_TRUE 的 clEnqueueWriteImage、clEnqueueWriteBuffer
  3. 带有 blocking_map=CL_TRUE 的 clEnqueueMapImage、clEnqueueMapBuffer
  4. clWaitForEvents





Avoid application processor and GPU interactions in the middle of processing

Enqueue all the kernels first, and call clFinish() at the end if possible. Call clFlush() after one or more clEnqueueNDRange() calls, and call clFinish() before checking the final result.

Avoid blocking calls in the submission thread

Avoid clFinish() or clWaitForEvents() or any other blocking calls in the submission thread. If possible, wait for an asynchronous callback if you want to check the result while computations are in progress. Try double buffering, if you are using blocking operations in your submission thread.

Batching kernels submission

From version r17p0 onwards, the OpenCL driver batches kernels that are flushed together for submission to the hardware. Batching kernels can significantly reduce the runtime overheads and cache maintenance costs. For example, this reduction is useful when the application is accessing multiple sub-buffers created from a buffer imported using clImportMemoryARM in separate kernels. The application should flush kernels in groups as large as possible. When the GPU is idle though, reaching optimal performance requires the application to flush an initial batch of kernels early so that the GPU execution overlaps the queuing of further kernels.

Execution optimizations

•   If you use callbacks to prompt the processor to continue processing data resulting from the execution of a kernel, ensure that the callbacks are set before you flush the queue. If you do not do this, the callbacks might occur at the end of a larger batch of work, later than they might have based on actual completion of work.

Wondering when I should use clFlush or clFinish.



// Excerpt of OpenCLLayerAcc::Forward; parts of the body (error handling, the flush
// call, closing braces) are elided in this listing.
//
// What the visible lines do: every entry of execute_units_ is submitted through
// RunKernel on the queue returned by ocl_context_->CommandQueue(), and NeedFlush()
// is consulted afterwards (as indented here, once after the loop).
// `inputs` and `outputs` are not referenced in the lines shown.
// Returns TNN_OK on the path shown.
Status OpenCLLayerAcc::Forward(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
    int unit_idx = 0;
    for (auto execute_unit : execute_units_) {
        // Submit this unit's kernel with its global/local work sizes; the status is stored in `ret`
        // (its declaration and any check on it are not visible in this excerpt).
        ret = RunKernel(execute_unit.ocl_kernel, execute_unit.global_work_size, execute_unit.local_work_size,
                        ocl_context_->CommandQueue(), op_name_);

    // The guarded statement is elided here; presumably it flushes the command queue
    // so that queued kernels are handed to the driver — confirm against upstream.
    if (NeedFlush()) {

    return TNN_OK;

// Decides whether the caller should flush now.
// Returns true when the value returned by OpenCLContext::AddAndGetFlushCount() is a
// multiple of 10, false otherwise. (Closing braces are elided in this excerpt.)
bool OpenCLLayerAcc::NeedFlush() {
    // The interval of 10 is an empirically chosen "magic number"; no derivation is given in this file.
    // Assuming the counter advances by one per call, this is true on every 10th call — confirm upstream.
    if (0 == ocl_context_->AddAndGetFlushCount() % 10) {
        return true;
    return false;


// https://github.com/Tencent/TNN/blob/a315d2acfb327014721b308359a6d534470289ba/source/tnn/device/opencl/opencl_context.cc
// OpenCL kernel flush strategy: some devices (especially Huawei devices) have serious latency.
// Returns flush_count_. NOTE(review): the increment implied by the name ("Add") is not
// visible in this excerpt — presumably elided; confirm against the linked source.
unsigned int OpenCLContext::AddAndGetFlushCount() {
    return flush_count_;

// https://github.com/Tencent/TNN/blob/a315d2acfb327014721b308359a6d534470289ba/source/tnn/device/opencl/opencl_context.h#L88
// Excerpt of the OpenCLContext class declaration (derives from Context).
// NOTE: access specifiers, the closing "};" and the "/**" ... "*/" delimiters of the
// original block comments are missing from this listing; lines that begin with " * "
// are remnants of those block comments and are reproduced unchanged.
class OpenCLContext : public Context {

    // @brief get tnn command queue
    // @param command_queue device command queue for forward
    Status GetCommandQueue(void **command_queue) override;

    // @brief share tnn command queue to another context
    Status ShareCommandQueue(Context* context) override;

     * @brief get CommandQueue
    cl::CommandQueue *CommandQueue();

    // Second queue accessor; presumably pairs with tune_command_queue_ declared below — confirm.
    cl::CommandQueue *TuneCommandQueue();

    // @brief load library
    virtual Status LoadLibrary(std::vector<std::string> path) override;
     * @brief befor instace forword
     * @param instance instace
    virtual Status OnInstanceForwardBegin() override;
     * @brief after instace forword
     * @param instance instace
    virtual Status OnInstanceForwardEnd() override;

     // @brief before instance Reshape
    virtual Status OnInstanceReshapeBegin() override;

    // @brief after instance Reshape
    virtual Status OnInstanceReshapeEnd() override;   

    // @brief wait for jobs in the current context to complete
    virtual Status Synchronize() override;

    // @brief advance flush_count_ and return its value; callers in this file take the
    //        result modulo 10 to decide when to flush (see OpenCLLayerAcc::NeedFlush)
    unsigned int AddAndGetFlushCount();

    // Accessor for the per-key local-work-size table; presumably returns local_size_tune_map_ — confirm.
    std::map<std::string, std::vector<uint32_t>>& GetLocalSizeTuneMap();

    // Presumably persists the tune map; the implementation is not shown in this file.
    Status StoreLocalSizeTuneMap();

     * @brief initialize opencl env
    Status Init();

    // Command queues held by shared_ptr; both default to nullptr.
    std::shared_ptr<cl::CommandQueue> command_queue_ = nullptr;
    std::shared_ptr<cl::CommandQueue> tune_command_queue_ = nullptr;
    std::shared_ptr<cl::CommandQueue> GetCommandQueue();
    OpenCLRuntime *opencl_runtime_ = nullptr;
    // Counter returned by AddAndGetFlushCount(); starts at 0.
    unsigned int flush_count_ = 0;
    cl_command_queue_properties properties_ = 0;

    bool ReadStatusCheck(std::ifstream& is);

    // Key -> list of uint32_t values; exposed through GetLocalSizeTuneMap() (key meaning not shown here).
    std::map<std::string, std::vector<uint32_t>> local_size_tune_map_;
    uint32_t tune_map_size_;

    static std::mutex s_mutex_;

magic number for workgroup


// Hard-coded ("magic number") table for Adreno GPUs: key is the GPU model number
// (looked up with gpu_info_.model_num in GetSubGroupSize), value is the sub-group
// size to use for that model. The closing "};" and any further entries are elided
// in this excerpt.
static std::map<int, int> AdrenoSubGroup{
    {640, 128}, {630, 128}, {616, 128}, {612, 64}, {610, 64}, {540, 32}, {530, 32},
    {512, 32},  {510, 32},  {509, 32},  {506, 32}, {505, 32}, {405, 32}, {330, 16},

// Returns the sub-group size to assume for `kernel` launched over `range`, or 0 when
// none is determined (non-Adreno GPU, failed query, or model missing from the table).
// With OpenCL 2.0 the size can be queried from the kernel; otherwise the hard-coded
// AdrenoSubGroup table is consulted.
// NOTE: closing braces are elided in this excerpt, and the query and the table lookup
// appear back to back; presumably the original selects one of them (e.g. by OpenCL
// target version) — confirm against the linked source.
uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range) {
    uint32_t sub_group_size = 0;

    // Only Adreno GPUs are handled; every other GPU type keeps the initial 0.
    if (ADRENO == gpu_info_.type) {
        cl_int cl_ret;
        // Runtime query of the maximum sub-group size for this kernel and ND-range.
        sub_group_size = kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(*device_, range, &cl_ret);
        if (cl_ret != CL_SUCCESS) {
            // A failed query is reported as "unknown" (0).
            sub_group_size = 0;
        // Table lookup keyed by the GPU model number.
        if (AdrenoSubGroup.find(gpu_info_.model_num) != AdrenoSubGroup.end()) {
            sub_group_size = AdrenoSubGroup[gpu_info_.model_num];

    return sub_group_size;
// https://github.com/Tencent/TNN/blob/aedc6c849e711a6386a8d2cd4ebb0bc94c7b9285/source/tnn/device/opencl/opencl_runtime.cc#L341
#include <EGL/egl.h>

// Init gets platform info, gets device info and creates the OpenCL context.
// Only the context-creation part is shown; the rest of the body, the conditions
// selecting between the two paths, the logging call around the error message and
// the closing braces are elided in this excerpt.
Status OpenCLRuntime::Init() {
// ....

        // Path 1: create a context that shares with the current OpenGL (EGL) context.
        LOGI("Create special opencl context to share with OpenGL\n");
        // NOTE(review): "%x" is used with the handle returned by eglGetCurrentContext();
        // if that handle is pointer-sized this may truncate or mismatch — consider "%p".
        LOGI("eglGetCurrentContext(): 0x%x\n", eglGetCurrentContext());
        // Zero-terminated property list naming the current EGL context and display.
        cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(),
                                                CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(), 0};
        context_ = std::shared_ptr<cl::Context>(new cl::Context(*device_, context_prop, nullptr, nullptr, &err));

        // If the shared context could not be created, fall back to a context without properties.
        if (err != CL_SUCCESS) {
                "Create special opencl context falied, Create common opencl "
                "context then.\n");
            context_ = std::shared_ptr<cl::Context>(new cl::Context(*device_, nullptr, nullptr, nullptr, &err));
        // Path 2: plain context with no GL sharing (the branch that selects it is not shown).
        LOGI("Create common opencl context\n");
        context_ = std::shared_ptr<cl::Context>(new cl::Context(*device_, nullptr, nullptr, nullptr, &err));
// https://github.com/Tencent/TNN/blob/4b9ffbecc22f5ea4ba6bc4fdacff85475a59d08d/source/tnn/device/opencl/acc/opencl_layer_acc.cc#L160

// Excerpt showing how candidate local work sizes are assembled for the first execute
// unit. Closing braces and whatever is done with each candidate are elided in this
// listing. NOTE(review): the excerpt is labelled Forward, but the visible lines only
// build and iterate candidates (presumably to time them and keep the best) — confirm
// the enclosing function and the elided steps against the linked source.
// `inputs` and `outputs` are not referenced in the lines shown.
Status OpenCLLayerAcc::Forward(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
    // Work from a copy of the first execute unit and its maximum work-group size.
    auto execute_unit_org                                 = execute_units_[0];
    auto max_wgs                                          = execute_unit_org.workgroupsize_max;
    // Fixed candidates for 3-D global work sizes; the trailing {} is an empty local size.
    std::vector<std::vector<uint32_t>> local_size_list_3d = {
        {16, 4, 1}, {8, 8, 1},   {4, 16, 1}, {2, 32, 1}, {1, 64, 1}, {2, 64, 1}, {4, 64, 1},
        {8, 64, 1}, {16, 64, 1}, {8, 64, 2}, {4, 64, 4}, {2, 64, 8}, {2, 64, 4}, {},
    // Candidates for 2-D global work sizes, derived from max_wgs so that each pair
    // multiplies to at most max_wgs; again terminated by an empty local size.
    std::vector<std::vector<uint32_t>> local_size_list_2d = {
        {2, max_wgs / 2},   {4, max_wgs / 4},   {8, max_wgs / 8},
        {16, max_wgs / 16}, {max_wgs / 2, 2},   {max_wgs / 4, 4},
        {max_wgs / 8, 8},   {max_wgs / 16, 16}, {},
    // Default local size chosen by dimensionality; stays empty for any other rank.
    std::vector<uint32_t> local_size_default;
    if (execute_unit_org.global_work_size.size() == 2) {
        local_size_default = LocalWS2DDefault(execute_unit_org);
    } else if (execute_unit_org.global_work_size.size() == 3) {
        local_size_default = LocalWS3DDefault(execute_unit_org);

    // Execute unit configured with the default local size.
    OpenCLExecuteUnit exec_unit_default = execute_unit_org;
    exec_unit_default.local_work_size   = local_size_default;

    // For each candidate, copy the original unit and override its local work size;
    // the use of exec_unit_temp is not shown in this excerpt.
    if (execute_unit_org.global_work_size.size() == 2) {
        for (auto local_size : local_size_list_2d) {
            OpenCLExecuteUnit exec_unit_temp = execute_unit_org;
            exec_unit_temp.local_work_size   = local_size;
    } else if (execute_unit_org.global_work_size.size() == 3) {
        for (auto local_size : local_size_list_3d) {
            OpenCLExecuteUnit exec_unit_temp = execute_unit_org;
            exec_unit_temp.local_work_size   = local_size;
