HSAFoundation / HSA-Runtime-AMD

The HSA-Runtime
Other
48 stars 16 forks source link

How do I use enqueue_kernel to calculate vec_add? #19

Open ryansoq opened 8 years ago

ryansoq commented 8 years ago

Hi All, I have a question. How do I use enqueue_kernel ?

This is my environment : CLOC-0.8 /opt/amd/bin/clc2 -version LLVM version 3.2svn /opt/amd/bin/opt -version LLVM version 3.2svn HSA-Runtime-AMD-release-v1.0

.cl code

void VectorAdd_child(__global int* a, __global int* b, __global int* res);

__kernel void vectoradd(__global int* a, __global int* b, __global int* res)
{
        int gid = get_global_id(0);
        //res[gid] = a[gid] + b[gid];
        queue_t defQ = get_default_queue();
        ndrange_t ndrange1 = ndrange_1D(256);
        void (^myblock)(void)=^{VectorAdd_child(a, b, res);};
        int err_ret = enqueue_kernel(defQ, 1, ndrange1, myblock);
}

void VectorAdd_child(__global int* a, __global int* b, __global int* res)
{
        int gid = get_global_id(0);
        res[gid] = a[gid] + b[gid];
}

host code

// ref libclc code

typedef struct _HsaAqlDispatchPacket {
    uint    mix;
    ushort  workgroup_size[3];
    ushort  reserved2;
    uint    grid_size[3];
    uint    private_segment_size_bytes;
    uint    group_segment_size_bytes;
    ulong   kernel_object_address;
    ulong   kernel_arg_address;
    ulong   reserved3;
    ulong   completion_signal;
} HsaAqlDispatchPacket;

typedef struct _AmdAqlWrap {
    uint state;
    uint enqueue_flags; 
    uint command_id;
    uint child_counter;
    ulong completion;
    ulong parent_wrap;
    ulong wait_list;
    uint wait_num;
    uint reserved[5];
    HsaAqlDispatchPacket aql;
} AmdAqlWrap;

typedef struct _AmdVQueueHeader {
    uint    aql_slot_num;
    uint    event_slot_num;
    ulong   event_slot_mask;
    ulong   event_slots;
    ulong   aql_slot_mask;
    uint    command_counter;
    uint    wait_size;
    uint    arg_size;
    uint    reserved0;
    ulong   kernel_table;
    uint    reserved[2];
} AmdVQueueHeader;

int main(int argc, char **argv) { ... hsa_status_t err; status_t build_err;

err = hsa_init();
check(Initializing the hsa runtime, err);

/* 
 * Iterate over the agents and pick the gpu agent using 
 * the find_gpu callback.
 */
hsa_agent_t device = 0;
err = hsa_iterate_agents(find_gpu, &device);
check(Calling hsa_iterate_agents, err);

err = (device == 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
check(Checking if the GPU device is non-zero, err);

/*
 * Query the name of the device.
 */
char name[64] = { 0 };
err = hsa_agent_get_info(device, HSA_AGENT_INFO_NAME, name);
check(Querying the device name, err);
printf("The device name is %s.\n", name);

/*
 * Query the maximum size of the queue.
 */
uint32_t queue_size = 0;
err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
check(Querying the device maximum queue size, err);
printf("The maximum queue size is %u.\n", (unsigned int) queue_size);

/*
 * Create a queue using the maximum size.
 */
hsa_queue_t* commandQueue;
err = hsa_queue_create(device, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, &commandQueue);
check(Creating the queue, err);

/*
 * Load BRIG, encapsulated in an ELF container, into a BRIG module.
 */
hsa_ext_brig_module_t* brigModule;
char file_name[128] = "shader.brig";
build_err = create_brig_module_from_brig_file(file_name, &brigModule);
check_build(Creating the brig module from shader.brig, build_err);

/*
 * Create hsa program.
 */
hsa_ext_program_handle_t hsaProgram;
err = hsa_ext_program_create(&device, 1, HSA_EXT_BRIG_MACHINE_LARGE, HSA_EXT_BRIG_PROFILE_FULL, &hsaProgram);
check(Creating the hsa program, err);

/*
 * Add the BRIG module to hsa program.
 */
hsa_ext_brig_module_handle_t module;
err = hsa_ext_add_module(hsaProgram, brigModule, &module);
check(Adding the brig module to the program, err);

/* 
 * Construct finalization request list.
 */
hsa_ext_finalization_request_t finalization_request_list;
finalization_request_list.module = module;
finalization_request_list.program_call_convention = 0;
char kernel_name[128] = "&__OpenCL_vectoradd_kernel"; //&__OpenCL_%s_kernel
//char kernel_name[128] = "&__OpenCL___amd_blocks_func___vectoradd_block_invoke_kernel";
err = find_symbol_offset(brigModule, kernel_name, &finalization_request_list.symbol);
check(Finding the symbol offset for the kernel, err);

/*
 * Finalize the hsa program.
 */
err = hsa_ext_finalize_program(hsaProgram, device, 1, &finalization_request_list, NULL, NULL, 0, NULL, 0);
check(Finalizing the program, err);

/*
 * Destroy the brig module. The program was successfully created the kernel
 * symbol was found and the program was finalized, so it is no longer needed.
 */
 destroy_brig_module(brigModule);

/*
 * Get the hsa code descriptor address.
 */
hsa_ext_code_descriptor_t *hsaCodeDescriptor;
err = hsa_ext_query_kernel_descriptor_address(hsaProgram, module,    finalization_request_list.symbol, &hsaCodeDescriptor);
check(Querying the kernel descriptor address, err);

/*
 * Create a signal to wait for the dispatch to finish.
 */ 
hsa_signal_t signal;
err=hsa_signal_create(1, 0, NULL, &signal);
check(Creating a HSA signal, err);

/*
 * Initialize the dispatch packet.
 */
hsa_dispatch_packet_t aql;
memset(&aql, 0, sizeof(aql));

/*
 * Allocate and initialize the kernel arguments.
 */
const int DATA_SIZE = 128;

int   *a=(int*)malloc(sizeof(int)*DATA_SIZE);
int   *b=(int*)malloc(sizeof(int)*DATA_SIZE);
int *res=(int*)malloc(sizeof(int)*DATA_SIZE);

memset(a, 0, DATA_SIZE*sizeof(int));
//err=hsa_memory_register(a, DATA_SIZE*4);
//check(Registering argument memory for input parameter, err);

memset(b, 0, DATA_SIZE*sizeof(int));
//err=hsa_memory_register(b, DATA_SIZE*4);
//check(Registering argument memory for input parameter, err);

memset(res, 0, DATA_SIZE*sizeof(int));
//err=hsa_memory_register(res, DATA_SIZE*4);
//check(Registering argument memory for input parameter, err);
srand (time(NULL));
for(int i = 0; i < DATA_SIZE; i++) {
    a[i] = rand() % 10;
    b[i] = rand() % 10;
}

    /*
 * Setup the dispatch information.
 */
aql.completion_signal=signal;
aql.dimensions=(uint16_t)1;
aql.workgroup_size_x=(uint16_t)32;
aql.workgroup_size_y=(uint16_t)1;
aql.workgroup_size_z=(uint16_t)1;
aql.grid_size_x=DATA_SIZE;
aql.grid_size_y=1;
aql.grid_size_z=1;
aql.header.type=HSA_PACKET_TYPE_DISPATCH;
aql.header.acquire_fence_scope=2;
aql.header.release_fence_scope=2;
aql.header.barrier=1;
aql.group_segment_size=0;
aql.private_segment_size=0;

...


size_t kernel_arg_buffer_size = hsaCodeDescriptor->kernarg_segment_byte_size;
void* args __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) = malloc(hsaCodeDescriptor->kernarg_segment_byte_size) ;
memset(args, 0, kernel_arg_buffer_size);
void* args_pointer = args;
args_pointer += sizeof(void*) * (/*kernel->hsaArgCount*/9 - /*kernel->num_args*/3 - /* enque arg*/3);

  uint64_t   *pb=(uint64_t *)malloc(sizeof(uint64_t)*128);
  AmdVQueueHeader *vq =(AmdVQueueHeader *) malloc(sizeof(AmdVQueueHeader));
  AmdAqlWrap *aw = (AmdAqlWrap *)malloc(sizeof(AmdAqlWrap));

  memset(&pb, 0, sizeof(uint64_t)*128);
  memset(&vq, 0, sizeof(AmdVQueueHeader));
  memset(&aw, 0, sizeof(AmdAqlWrap));

  void* q1 = &pb;
  void* q2 = &vq;
  void* q3 = &aw;

   //set argument 
   //map  __vqueue_pointer, __aqlwrap_pointer

    memcpy(args_pointer, &q1, sizeof(void*));
    args_pointer += sizeof(void*);
    memcpy(args_pointer, &q2, sizeof(void*));
    args_pointer += sizeof(void*);
    memcpy(args_pointer, &q3, sizeof(void*));
    args_pointer += sizeof(void*);

    void* p1 = &vq;
    memcpy(args_pointer, &p1, sizeof(void*));
    args_pointer += sizeof(void*);

    void* p2 = &aw;
    memcpy(args_pointer, &p2, sizeof(void*));
    args_pointer += sizeof(void*);

    void* p3 = res;
    memcpy(args_pointer, &p3, sizeof(void*));
    args_pointer += sizeof(void*);

...

    printf("aw->aql.mix  : %d \n",(aw->aql).mix);
    printf("aw->aql.ws 0 : %hd \n",(aw->aql).workgroup_size[0]);
    printf("aw->aql.ws 1 : %hd \n",(aw->aql).workgroup_size[1]);
    printf("aw->aql.ws 2 : %hd \n",(aw->aql).workgroup_size[2]);
    printf("aw->aql.gs 0 : %d \n",(aw->aql).grid_size[0]);
    printf("aw->aql.gs 1 : %d \n",(aw->aql).grid_size[1]);
    printf("aw->aql.gs 2 : %d \n",(aw->aql).grid_size[2]);

// I get error data

.hsail

version 0:20140528:$full:$large; extension "amd:gcn"; extension "IMAGE";

decl prog function &abort()(); ...

prog kernel &__OpenCL_vectoradd_kernel(
        kernarg_u64 %__global_offset_0,
        kernarg_u64 %__global_offset_1,
        kernarg_u64 %__global_offset_2,
        kernarg_u64 %__printf_buffer,
        kernarg_u64 %__vqueue_pointer,
        kernarg_u64 %__aqlwrap_pointer,
        kernarg_u64 %a,
        kernarg_u64 %b,
        kernarg_u64 %res)
{

... Thanks