HSAFoundation / HSA-Runtime-AMD

The HSA-Runtime
Other
48 stars 16 forks source link

__local(or group) kernel is not functional #8

Closed 0charleschen0 closed 9 years ago

0charleschen0 commented 9 years ago

Hi, I want to test the kernel with __local memory and run with HSA runtime. I modified the example vector_copy for testing. Below are what I modified : First, it is my kernel _vectorsquare.cl:

__kernel void square(
    __global float *input,
    __global float *output,
    __local float *temp,
    const unsigned int count)
{
    int gtid = get_global_id(0);
    int ltid = get_local_id(0);
    if (gtid < count)
    {
        temp[ltid] = input[gtid];
        output[gtid] =  temp[ltid] * temp[ltid];
    }
}

and it was compiled it by cloc to .brig file. Then, I changed the file_name from

   char file_name[128] = "vector_copy.brig";

to

   char file_name[128] = "vector_square.brig";

kernel_name from

    char kernel_name[128] = "&__vector_copy_kernel";

to

    char kernel_name[128] = "&__OpenCL_square_kernel";

Then, I modified the dispatch information inspired by HSA-System-Runtime-Specification-Provisional-1.0 from

    aql.group_segment_size=0;
    aql.private_segment_size=0;

to

    aql.group_segment_size=hsaCodeDescriptor->workgroup_group_segment_byte_size;
    aql.private_segment_size=hsaCodeDescriptor->workitem_private_segment_byte_size;

Finally, I added a variable for _local argument 'temp'_

    char* temp = (char*)malloc(1024*1024*4);    
    memset(out, 0, 1024*1024*4);
    err=hsa_memory_register(out, 1024*1024*4);
    check(Registering argument memory for output parameter, err);    

and modified args structure like this :

     struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) args_t {
        void* arg0;
        void* arg1;
    void* arg2;
    int   arg3;
    } args;

and passed arguments like this :

    args.arg0=in;
    args.arg1=out;
    args.arg2=temp;
    args.arg3=1024*1024*4;

Of course, print out the result for validity.

    printf("in:\n");
    for(int i=0; i<256; i++)
        printf("%d, ",in[i]);
    printf("\nout:\n");
    for(int i=0; i<256; i++)
        printf("%d, ",out[i]);

But, unfortunately, there is the result : screenshot from 2014-12-10 23 49 49 Do I miss something need to do?

Below is the source code.

/* Copyright 2014 HSA Foundation Inc.  All Rights Reserved.
 *
 * HSAF is granting you permission to use this software and documentation (if
 * any) (collectively, the "Materials") pursuant to the terms and conditions
 * of the Software License Agreement included with the Materials.  If you do
 * not have a copy of the Software License Agreement, contact the  HSA Foundation for a copy.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "hsa.h"
#include "hsa_ext_finalize.h"
#include "elf_utils.h"

#define check(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
    printf("%s failed.\n", #msg); \
    exit(1); \
} else { \
   printf("%s succeeded.\n", #msg); \
}

#define check_build(msg, status) \
if (status != STATUS_SUCCESS) { \
    printf("%s failed.\n", #msg); \
    exit(1); \
} else { \
   printf("%s succeeded.\n", #msg); \
}

/*
 * Define required BRIG data structures.
 */

typedef uint32_t BrigCodeOffset32_t;

typedef uint32_t BrigDataOffset32_t;

typedef uint16_t BrigKinds16_t;

typedef uint8_t BrigLinkage8_t;

typedef uint8_t BrigExecutableModifier8_t;

typedef BrigDataOffset32_t BrigDataOffsetString32_t;

enum BrigKinds {
    BRIG_KIND_NONE = 0x0000,
    BRIG_KIND_DIRECTIVE_BEGIN = 0x1000,
    BRIG_KIND_DIRECTIVE_KERNEL = 0x1008,
};

typedef struct BrigBase BrigBase;
struct BrigBase {
    uint16_t byteCount;
    BrigKinds16_t kind;
};

typedef struct BrigExecutableModifier BrigExecutableModifier;
struct BrigExecutableModifier {
    BrigExecutableModifier8_t allBits;
};

typedef struct BrigDirectiveExecutable BrigDirectiveExecutable;
struct BrigDirectiveExecutable {
    uint16_t byteCount;
    BrigKinds16_t kind;
    BrigDataOffsetString32_t name;
    uint16_t outArgCount;
    uint16_t inArgCount;
    BrigCodeOffset32_t firstInArg;
    BrigCodeOffset32_t firstCodeBlockEntry;
    BrigCodeOffset32_t nextModuleEntry;
    uint32_t codeBlockEntryCount;
    BrigExecutableModifier modifier;
    BrigLinkage8_t linkage;
    uint16_t reserved;
};

typedef struct BrigData BrigData;
struct BrigData {
    uint32_t byteCount;
    uint8_t bytes[1];
};

/*
 * Determines if the given agent is of type HSA_DEVICE_TYPE_GPU
 * and sets the value of data to the agent handle if it is.
 */
static hsa_status_t find_gpu(hsa_agent_t agent, void *data) {
    if (data == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    hsa_device_type_t device_type;
    hsa_status_t stat =
    hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
    if (stat != HSA_STATUS_SUCCESS) {
        return stat;
    }
    if (device_type == HSA_DEVICE_TYPE_GPU) {
        *((hsa_agent_t *)data) = agent;
    }
    return HSA_STATUS_SUCCESS;
}

/*
 * Determines if a memory region can be used for kernarg
 * allocations.
 */
static hsa_status_t get_kernarg(hsa_region_t region, void* data) {
    hsa_region_flag_t flags;
    hsa_region_get_info(region, HSA_REGION_INFO_FLAGS, &flags);
    if (flags & HSA_REGION_FLAG_KERNARG) {
        hsa_region_t* ret = (hsa_region_t*) data;
        *ret = region;
    }
    return HSA_STATUS_SUCCESS;
}

/*
 * Finds the specified symbols offset in the specified brig_module.
 * If the symbol is found the function returns HSA_STATUS_SUCCESS, 
 * otherwise it returns HSA_STATUS_ERROR.
 */
hsa_status_t find_symbol_offset(hsa_ext_brig_module_t* brig_module, 
    char* symbol_name,
    hsa_ext_brig_code_section_offset32_t* offset) {

    /* 
     * Get the data section 
     */
    hsa_ext_brig_section_header_t* data_section_header = 
                brig_module->section[HSA_EXT_BRIG_SECTION_DATA];
    /* 
     * Get the code section
     */
    hsa_ext_brig_section_header_t* code_section_header =
             brig_module->section[HSA_EXT_BRIG_SECTION_CODE];

    /* 
     * First entry into the BRIG code section
     */
    BrigCodeOffset32_t code_offset = code_section_header->header_byte_count;
    BrigBase* code_entry = (BrigBase*) ((char*)code_section_header + code_offset);
    while (code_offset != code_section_header->byte_count) {
        if (code_entry->kind == BRIG_KIND_DIRECTIVE_KERNEL) {
            /* 
             * Now find the data in the data section
             */
            BrigDirectiveExecutable* directive_kernel = (BrigDirectiveExecutable*) (code_entry);
            BrigDataOffsetString32_t data_name_offset = directive_kernel->name;
            BrigData* data_entry = (BrigData*)((char*) data_section_header + data_name_offset);
            if (!strncmp(symbol_name, (char*) data_entry->bytes, strlen(symbol_name))) {
                *offset = code_offset;
                return HSA_STATUS_SUCCESS;
            }
        }
        code_offset += code_entry->byteCount;
        code_entry = (BrigBase*) ((char*)code_section_header + code_offset);
    }
    return HSA_STATUS_ERROR;
}

int main(int argc, char **argv) {
    hsa_status_t err;
    status_t build_err;

    err = hsa_init();
    check(Initializing the hsa runtime, err);

    /* 
     * Iterate over the agents and pick the gpu agent using 
     * the find_gpu callback.
     */
    hsa_agent_t device = 0;
    err = hsa_iterate_agents(find_gpu, &device);
    check(Calling hsa_iterate_agents, err);

    err = (device == 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
    check(Checking if the GPU device is non-zero, err);

    /*
     * Query the name of the device.
     */
    char name[64] = { 0 };
    err = hsa_agent_get_info(device, HSA_AGENT_INFO_NAME, name);
    check(Querying the device name, err);
    printf("The device name is %s.\n", name);

    /*
     * Query the maximum size of the queue.
     */
    uint32_t queue_size = 0;
    err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
    check(Querying the device maximum queue size, err);
    printf("The maximum queue size is %u.\n", (unsigned int) queue_size);

    /*
     * Create a queue using the maximum size.
     */
    hsa_queue_t* commandQueue;
    err = hsa_queue_create(device, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, &commandQueue);
    check(Creating the queue, err);

    /*
     * Load BRIG, encapsulated in an ELF container, into a BRIG module.
     */
    hsa_ext_brig_module_t* brigModule;
    char file_name[128] = "vector_square.brig";
    build_err = create_brig_module_from_brig_file(file_name, &brigModule);
    check_build(Creating the brig module from vector_copy.brig, build_err);

    /*
     * Create hsa program.
     */
    hsa_ext_program_handle_t hsaProgram;
    err = hsa_ext_program_create(&device, 1, HSA_EXT_BRIG_MACHINE_LARGE, HSA_EXT_BRIG_PROFILE_FULL, &hsaProgram);
    check(Creating the hsa program, err);

    /*
     * Add the BRIG module to hsa program.
     */
    hsa_ext_brig_module_handle_t module;
    err = hsa_ext_add_module(hsaProgram, brigModule, &module);
    check(Adding the brig module to the program, err);

    /* 
     * Construct finalization request list.
     */
    hsa_ext_finalization_request_t finalization_request_list;
    finalization_request_list.module = module;
    finalization_request_list.program_call_convention = 0;
    char kernel_name[128] = "&__OpenCL_square_kernel";
    err = find_symbol_offset(brigModule, kernel_name, &finalization_request_list.symbol);
    check(Finding the symbol offset for the kernel, err);

    /*
     * Finalize the hsa program.
     */
    err = hsa_ext_finalize_program(hsaProgram, device, 1, &finalization_request_list, NULL, NULL, 0, "-cf-dump-isa", 1);
    check(Finalizing the program, err);

    /*
     * Destroy the brig module. The program was successfully created the kernel
     * symbol was found and the program was finalized, so it is no longer needed.
     */
     destroy_brig_module(brigModule);

    /*
     * Get the hsa code descriptor address.
     */
    hsa_ext_code_descriptor_t *hsaCodeDescriptor;
    err = hsa_ext_query_kernel_descriptor_address(hsaProgram, module, finalization_request_list.symbol, &hsaCodeDescriptor);
    check(Querying the kernel descriptor address, err);

    /*
     * Create a signal to wait for the dispatch to finish.
     */ 
    hsa_signal_t signal;
    err=hsa_signal_create(1, 0, NULL, &signal);
    check(Creating a HSA signal, err);

    /*
     * Initialize the dispatch packet.
     */
    hsa_dispatch_packet_t aql;
    memset(&aql, 0, sizeof(aql));

    /*
     * Setup the dispatch information.
     */
    aql.completion_signal=signal;
    aql.dimensions=1;
    aql.workgroup_size_x=256;
    aql.workgroup_size_y=1;
    aql.workgroup_size_z=1;
    aql.grid_size_x=1024*1024;
    aql.grid_size_y=1;
    aql.grid_size_z=1;
    aql.header.type=HSA_PACKET_TYPE_DISPATCH;
    aql.header.acquire_fence_scope=2;
    aql.header.release_fence_scope=2;
    aql.header.barrier=1;
    aql.group_segment_size=hsaCodeDescriptor->workgroup_group_segment_byte_size;
    aql.private_segment_size=hsaCodeDescriptor->workitem_private_segment_byte_size;

    /*
     * Allocate and initialize the kernel arguments.
     */
    char* in=(char*)malloc(1024*1024*4);
    memset(in, 1, 1024*1024*4);
    err=hsa_memory_register(in, 1024*1024*4);
    check(Registering argument memory for input parameter, err);

    char* out=(char*)malloc(1024*1024*4);
    memset(out, 0, 1024*1024*4);
    err=hsa_memory_register(out, 1024*1024*4);
    check(Registering argument memory for output parameter, err);

    char* temp = (char*)malloc(1024*1024*4);    
    memset(out, 0, 1024*1024*4);
    err=hsa_memory_register(out, 1024*1024*4);
    check(Registering argument memory for output parameter, err);    

    struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) args_t {
        void* arg0;
        void* arg1;
    void* arg2;
    int   arg3;
    } args;

    args.arg0=in;
    args.arg1=out;
    args.arg2=temp;
    args.arg3=1024*1024*4;
    /*
     * Find a memory region that supports kernel arguments.
     */
    hsa_region_t kernarg_region = 0;
    hsa_agent_iterate_regions(device, get_kernarg, &kernarg_region);
    err = (kernarg_region == 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
    check(Finding a kernarg memory region, err);
    void* kernel_arg_buffer = NULL;

    size_t kernel_arg_buffer_size = hsaCodeDescriptor->kernarg_segment_byte_size;

    /*
     * Allocate the kernel argument buffer from the correct region.
     */   
    err = hsa_memory_allocate(kernarg_region, kernel_arg_buffer_size, 
                        &kernel_arg_buffer);
    check(Allocating kernel argument memory buffer, err);
    memcpy(kernel_arg_buffer, &args, sizeof(args));

    /*
     * Bind kernel code and the kernel argument buffer to the
     * aql packet.
     */
    aql.kernel_object_address=hsaCodeDescriptor->code.handle;
    aql.kernarg_address=(uint64_t)kernel_arg_buffer;

    /*
     * Register the memory region for the argument buffer.
     */
    err = hsa_memory_register(&args, sizeof(struct args_t));
    check(Registering the argument buffer, err);

    /*
     * Obtain the current queue write index.
     */
    uint64_t index = hsa_queue_load_write_index_relaxed(commandQueue);

    /*
     * Write the aql packet at the calculated queue index address.
     */
    const uint32_t queueMask = commandQueue->size - 1;
    ((hsa_dispatch_packet_t*)(commandQueue->base_address))[index&queueMask]=aql;

    /*
     * Increment the write index and ring the doorbell to dispatch the kernel.
     */
    hsa_queue_store_write_index_relaxed(commandQueue, index+1);
    hsa_signal_store_relaxed(commandQueue->doorbell_signal, index);
    check(Dispatching the kernel, err);

    /*
     * Wait on the dispatch signal until the kernel is finished.
     */
    hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_LT, 1, (uint64_t) -1, HSA_WAIT_EXPECTANCY_UNKNOWN);

    /*
     * Validate the data in the output buffer.
     */
    int valid=1;
    int failIndex=0;
    for(int i=0; i<1024*1024; i++) {
        if(out[i]!=in[i]*in[i]) {
            failIndex=i;
            valid=0;
            break;
        }
    }
    printf("in:\n");
    for(int i=0; i<256; i++)
        printf("%d, ",in[i]);
    printf("\nout:\n");
    for(int i=0; i<256; i++)
        printf("%d, ",out[i]);

    if(valid) {
        printf("Passed validation.\n");
    } else {
        printf("VALIDATION FAILED!\nBad index: %d\n", failIndex);
    }

    /*
     * Cleanup all allocated resources.
     */
    err=hsa_signal_destroy(signal);
    check(Destroying the signal, err);

    err=hsa_ext_program_destroy(hsaProgram);
    check(Destroying the program, err);

    err=hsa_queue_destroy(commandQueue);
    check(Destroying the queue, err);

    err=hsa_shut_down();
    check(Shutting down the runtime, err);

    free(in);
    free(out);

    return 0;
}
ogabbay commented 9 years ago

Just to make sure, which kfd version are you using ?

0charleschen0 commented 9 years ago

I am using kfd-1.0. Runtime is 1.0,too.

ogabbay commented 9 years ago

ok, thanks. That version should support local memory, at least from the kernel side.

ogabbay commented 9 years ago

Tried to recreate your steps (with the code you pasted here) I got: ogabbay@odedg-ubuntu:~/HSA-Runtime-AMD/sample$ ./vector_copy Initializing the hsa runtime succeeded. Calling hsa_iterate_agents succeeded. Checking if the GPU device is non-zero succeeded. Querying the device name succeeded. The device name is Spectre. Querying the device maximum queue size succeeded. The maximum queue size is 131072. Creating the queue succeeded. Segmentation fault (core dumped)

alienanthill commented 9 years ago

Hi,

Can you paste the hsail text. You can use the -hsail option in cloc.

Thanks, Shreyas

On Thu, Dec 11, 2014 at 6:12 AM, Oded Gabbay notifications@github.com wrote:

Tried to recreate your steps (with the code you pasted here) I got: ogabbay@odedg-ubuntu:~/HSA-Runtime-AMD/sample$ ./vector_copy Initializing the hsa runtime succeeded. Calling hsa_iterate_agents succeeded. Checking if the GPU device is non-zero succeeded. Querying the device name succeeded. The device name is Spectre. Querying the device maximum queue size succeeded. The maximum queue size is 131072. Creating the queue succeeded. Segmentation fault (core dumped)

— Reply to this email directly or view it on GitHub https://github.com/HSAFoundation/HSA-Runtime-AMD/issues/8#issuecomment-66610626 .

0charleschen0 commented 9 years ago

Hi, Shreyas, Here it is.

version 0:20140528:$full:$large;
extension "amd:gcn";
extension "IMAGE";

decl prog function &abort()();

prog kernel &__OpenCL_square_kernel(
    kernarg_u64 %input,
    kernarg_u64 %output,
    kernarg_u64 %temp,
    kernarg_u32 %count)
{
    pragma  "AMD RTI", "ARGSTART:__OpenCL_square_kernel";
    pragma  "AMD RTI", "version:3:1:104";
    pragma  "AMD RTI", "device:generic";
    pragma  "AMD RTI", "uniqueid:1024";
    pragma  "AMD RTI", "function:1:0";
    pragma  "AMD RTI", "memory:64bitABI";
    pragma  "AMD RTI", "uavid:8";
    pragma  "AMD RTI", "privateid:8";
    pragma  "AMD RTI", "ARGEND:__OpenCL_square_kernel";

@__OpenCL_square_kernel_entry:
    // BB#0:                                // %entry
    workitemabsid_u32   $s0, 0;
    ld_kernarg_align(4)_width(all)_u32  $s1, [%count];
    cmp_ge_b1_u32   $c0, $s0, $s1;
    cbr_b1  $c0, @BB0_2;
    // BB#1:                                // %if.then
    ld_kernarg_align(8)_width(all)_u64  $d2, [%temp];
    ld_kernarg_align(8)_width(all)_u64  $d0, [%output];
    ld_kernarg_align(8)_width(all)_u64  $d1, [%input];
    workitemid_u32  $s1, 0;
    cvt_s64_s32 $d3, $s1;
    shl_u64 $d3, $d3, 2;
    add_u64 $d2, $d2, $d3;
    cvt_s64_s32 $d3, $s0;
    shl_u64 $d3, $d3, 2;
    add_u64 $d1, $d1, $d3;
    ld_global_f32   $s0, [$d1];
    st_group_f32    $s0, [$d2];
    add_u64 $d0, $d0, $d3;
    mul_ftz_f32 $s0, $s0, $s0;
    st_global_f32   $s0, [$d0];

@BB0_2:
    // %if.end
    ret;
};

gabbayo, I think maybe you forgot to compile the kernel code to .brig file. My program can run successfully(but with wrong result).

Thanks for your help, gabbayo and Shreyas.

alienanthill commented 9 years ago

Hi Charles,

A couple of things.

  1. Local memory cannot be malloced.
  2. Local memory is shared between the workitems in a workgroup and its size is determined by the device.
  3. Local memory can be used in two ways a. Local memory defined within the kernel using the __local qualifier. We'll call this static local memory b. Local memory passed in as a pointer into the kernel through the arguments. We'll call this dynamic local memory

The HSAIL code generated here assumes that all pointers to local memory are offsets with base 0 (This will be different if FLAT addressing is used) . The offsets are in the range from 0 to size of Local Memory as defined in the device properties

  1. The static local memory starts at base 0 and its size is hsaCodeDescriptor->workgroup_group_segment_byte_size
  2. The dynamic local memory starts at offset hsaCodeDescriptor->workgroup_group_segment_byte_size.

In your example. change your code to

uint64_t size_of_local_temp = 1024 * 1024 4; //You probably need lesser- WORK_GROUP_SIZE 4 uint64_t static_local_size = hsaCodeDescriptor->workgroup_group_segment_byte_size; uint64_t temp_ptr = hsaCodeDescriptor->workgroup_group_segment_byte_size; uint64_t dynamic_local_size = size_of_local_temp; aql.group_segment_size= static_local_size + dynamic_local_size;

--Pass temp_ptr to the arguments--

This should work. Hope that helps.

0charleschen0 commented 9 years ago

Hi Shreyas,

It worked!!

Thank you for your help! I have struggled it for several days.

By the way, could HSA Foundation release example which uses dynamic local memory? I think others may want to know this. ^ _ ^ Thank you gabbayo.

Sincerely, Charles