ROCm / hcc

HCC is an Open Source, Optimizing C++ Compiler for Heterogeneous Compute currently for the ROCm GPU Computing Platform
https://github.com/RadeonOpenCompute/hcc/wiki
Other
429 stars 107 forks source link

How to convert LDS memory address to the address can be passed into DS_READ_* and DS_WRITE_* instructions? #1455

Open terU3760 opened 2 years ago

terU3760 commented 2 years ago

Have followed here to write the following code: The hip file:

#include "hip/hip_runtime.h"
#include "hip/hcc_detail/device_library_decls.h"

__global__ void halfVec_v_pk_sts_then_lds( uint16_t * dst , uint32_t * ptr , uint16_t * val , int n )
{

    __shared__ uint16_t temp_local_share_memory_allocated[5];

    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if ( id < n )
    {

        asm volatile("DS_WRITE_B16 %0, %1;\nS_WAITCNT lgkmcnt(0);\n" : : "v"( __to_local( &temp_local_share_memory_allocated[ptr[id]] ) ), "v"(val[id]) );
        printf("Store value: %d into local shared memory address: %d.\n", val , ptr[id] );

        asm volatile("DS_READ_U16 %0, %1;\nS_WAITCNT lgkmcnt(0);\n" : "=v"(dst[id]) : "v"( __to_local( &temp_local_share_memory_allocated[ptr[id]] ) ));
        printf("Load value: %d from local shared memory address: %d.\n", dst[id] , ptr[id] );

    }

}

The cpp file:

#pragma once

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include "test_simple_kernel.hip"

int main(int argc,char**vargs)
{

    uint16_t* to_be_stored = new uint16_t[1];
    uint16_t* to_be_loaded = new uint16_t[1];
    uint32_t* local_share_memory_address = new uint32_t[1];

    uint16_t* to_be_stored_d;
    uint16_t* to_be_loaded_d;
    uint32_t* local_share_memory_address_d;

    to_be_stored[ 0 ] = 1;
    local_share_memory_address[ 0 ] = 0;
    hipSetDevice(0);
    hipMalloc(&to_be_stored_d, 2);
    hipMalloc(&to_be_loaded_d, 2);
    hipMalloc(&local_share_memory_address_d, 4);
    hipMemcpy(to_be_stored_d, to_be_stored, 2, hipMemcpyHostToDevice);
    hipMemcpy(local_share_memory_address_d, local_share_memory_address, 4, hipMemcpyHostToDevice);
    printf("Stored value: %d at local share memory address: %d.\n" , to_be_stored[ 0 ] , local_share_memory_address[ 0 ] );
    hipLaunchKernelGGL( halfVec_v_pk_sts_then_lds , dim3( 1 ) , dim3( 1 ) , sizeof(uint16_t), 0, (uint16_t*)(to_be_loaded_d), (uint32_t*)(local_share_memory_address_d) , (uint16_t*)(to_be_stored_d) , 1 );
    hipMemcpy(to_be_loaded, to_be_loaded_d, 2, hipMemcpyDeviceToHost);
    hipDeviceSynchronize();
    printf("And then load value: %d from local share memory address: %d.\n", to_be_loaded[ 0 ] , local_share_memory_address[ 0 ] );

}

The code just doesn't compile! How to correct it? Thanks in advance!

b-sumner commented 2 years ago

Accessing __shared__ memory via inline ASM is not recommended and will probably cause more harm than good. What problem are you trying to solve? Why not simply code temp_local...[i]?

terU3760 commented 2 years ago

@b-sumner I'm trying to test and let the inline assembly working. How to make this program(inline assembly) workable? Or could you please provide a program architecture to let these inline assembly workable?