Open wangzy0327 opened 7 months ago
Hi @wangzy0327! This sounds like an interesting project. Do you have some documentation on the target device that might help us understand the mapping of address spaces?
Tag @Naghasan.
This seems to be overlapping with https://github.com/intel/llvm/issues/13467. @wangzy0327 could you please clarify what the intention of the separation in discussion is here? If there isn't a strong reason for the separation, I would prefer we continue the discussion in your previous issue thread.
This seems to be overlapping with #13467. @wangzy0327 could you please clarify what the intention of the separation in discussion is here? If there isn't a strong reason for the separation, I would prefer we continue the discussion in your previous issue thread.
Yes,you can continue discussion in previous issue.
Hi @wangzy0327! This sounds like an interesting project. Do you have some documentation on the target device that might help us understand the mapping of address spaces?
Tag @Naghasan.
This is the documentation about target device driver-doc and the target device mapping of address spaces dev-doc
static const unsigned MLISAAddrSpaceMap[] = {
101, // bang_nram
102, // bang_wram
103, // bang_ldram
104, // bang_param
105, // bang_local
}
This issue is stale because it has been open 180 days with no activity. Remove stale label or comment or this will be automatically closed in 30 days.
Describe the bug
It is planned to expand new hardware based on SYCL. No relevant guidance has been found regarding the development of the address mapping part. I haved completed main development of the new device based on SYCL. But it cannot correctly execute in address space operation. I compared SYCL on cuda and SYCL on new-device using same simple source code. Can you give me some help to solve this problem?
To reproduce
There is the simple source code.
simple-add.cpp
``` #includeI tried to compile the above sample code using the cuda version and extended hardware version of sycl released in 2022-06. The device-side llvm ir code compiled by sycl-cuda is as follows.
simple-add-sm_70.ll
``` ; Function Attrs: noinline norecurse define weak_odr dso_local void @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_EUlNS0_2idILi1EEEE_(float addrspace(3)* noundef align 4 %_arg_localAccA, float add rspace(1)* noundef readonly align 4 %_arg_aA, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_aA6, float addrspace(1)* noundef align 4 %_arg_aC, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_aC9) local_unnamed_addr #0 comdat !kernel_arg_buffer_location !24 !kernel_arg_runtime_aligned !25 !kernel_arg_exclusive_ptr !25 !sycl_kernel_omit_args !26 {entry: %0 = getelementptr inbounds %"class.cl::sycl::id", %"class.cl::sycl::id"* %_arg_aA6, i64 0, i32 0, i32 0, i64 0 %1 = load i64, i64* %0, align 8 %add.ptr.i = getelementptr inbounds float, float addrspace(1)* %_arg_aA, i64 %1 %2 = getelementptr inbounds %"class.cl::sycl::id", %"class.cl::sycl::id"* %_arg_aC9, i64 0, i32 0, i32 0, i64 0 %3 = load i64, i64* %2, align 8 %add.ptr.i41 = getelementptr inbounds float, float addrspace(1)* %_arg_aC, i64 %3 %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #4 %conv.i.i.i.i.i.i.i = sext i32 %4 to i64 %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #4 %conv.i1.i.i.i.i.i.i = sext i32 %5 to i64 %mul.i.i.i.i.i.i = mul nsw i64 %conv.i1.i.i.i.i.i.i, %conv.i.i.i.i.i.i.i %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4 %conv.i2.i.i.i.i.i.i = sext i32 %6 to i64 %add.i.i.i.i.i.i = add nsw i64 %mul.i.i.i.i.i.i, %conv.i2.i.i.i.i.i.i %7 = tail call i32* @llvm.nvvm.implicit.offset() #4 %8 = load i32, i32* %7, align 4, !tbaa !14 %conv.i3.i.i.i.i.i.i = zext i32 %8 to i64 %add4.i.i.i.i.i.i = add nsw i64 %add.i.i.i.i.i.i, %conv.i3.i.i.i.i.i.i %cmp.i.i.i = icmp ult i64 %add4.i.i.i.i.i.i, 2147483648 tail call void @llvm.assume(i1 %cmp.i.i.i) #4 br label %for.body.i for.body.i: ; preds = %for.body.i, %entry %j.015.i = phi i32 [ 0, %entry ], [ %inc.i.1, %for.body.i ] %conv.i = zext i32 %j.015.i to i64 %arrayidx.i.i42 = getelementptr inbounds float, float addrspace(1)* %add.ptr.i, i64 %conv.i %arrayidx.ascast.i.i = addrspacecast float addrspace(1)* %arrayidx.i.i42 to float* %9 = load float, float* %arrayidx.ascast.i.i, align 4, !tbaa !18 %arrayidx.i3.i = getelementptr inbounds float, float addrspace(3)* %_arg_localAccA, i64 %conv.i %arrayidx.ascast.i4.i = addrspacecast float addrspace(3)* %arrayidx.i3.i to float* store float %9, float* %arrayidx.ascast.i4.i, align 4, !tbaa !18 %add.i = fadd float %9, 5.000000e-01 %arrayidx.i11.i = getelementptr inbounds float, float addrspace(1)* %add.ptr.i41, i64 %conv.i %arrayidx.ascast.i12.i = addrspacecast float addrspace(1)* %arrayidx.i11.i to float* store float %add.i, float* %arrayidx.ascast.i12.i, align 4, !tbaa !18 %inc.i = or i32 %j.015.i, 1 %conv.i.1 = zext i32 %inc.i to i64 %arrayidx.i.i42.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr.i, i64 %conv.i.1 %arrayidx.ascast.i.i.1 = addrspacecast float addrspace(1)* %arrayidx.i.i42.1 to float* %10 = load float, float* %arrayidx.ascast.i.i.1, align 4, !tbaa !18 %arrayidx.i3.i.1 = getelementptr inbounds float, float addrspace(3)* %_arg_localAccA, i64 %conv.i.1 %arrayidx.ascast.i4.i.1 = addrspacecast float addrspace(3)* %arrayidx.i3.i.1 to float* store float %10, float* %arrayidx.ascast.i4.i.1, align 4, !tbaa !18 %add.i.1 = fadd float %10, 5.000000e-01 %arrayidx.i11.i.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr.i41, i64 %conv.i.1 %arrayidx.ascast.i12.i.1 = addrspacecast float addrspace(1)* %arrayidx.i11.i.1 to float* store float %add.i.1, float* %arrayidx.ascast.i12.i.1, align 4, !tbaa !18 %inc.i.1 = add nuw nsw i32 %j.015.i, 2 %exitcond.not.i.1 = icmp eq i32 %inc.i.1, 256 br i1 %exitcond.not.i.1, label %_ZZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_ENKUlNS0_2idILi1EEEE_clES5_.exit, label %for.body.i, !llvm.loop !22 _ZZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_ENKUlNS0_2idILi1EEEE_clES5_.exit: ; preds = %for.body.i ret void ; uselistorder directives uselistorder float addrspace(3)* %_arg_localAccA, { 1, 0 } uselistorder i32 %j.015.i, { 1, 0, 2 } uselistorder i32 %inc.i.1, { 1, 0 } } ```The device-side llvm ir code compiled by the extended hardware is as follows.
simple-add-mtp_372.ll
``` ; Function Attrs: convergent noinline norecurse define weak_odr dso_local void @_ZTSN2cl4sycl6detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE(%"cl ass.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_NumWorkItems, float addrspace(101)* noundef align 4 %_arg_localAccA, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_localAccA1, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_localAccA2, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_localAccA3, float addrspace(1)* noundef readonly align 4 %_arg_aA, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aA4, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aA5, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_aA6, float addrspace(1)* noundef align 4 %_arg_aC, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aC7, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aC8, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_aC9) local_unnamed_addr #0 comdat !kernel_arg_buffer_location !9 !kernel_arg_runtime_aligned !10 !kernel_arg_exclusive_ptr !10 {entry: %0 = getelementptr inbounds %"class.cl::sycl::range", %"class.cl::sycl::range"* %_arg_NumWorkItems, i64 0, i32 0, i32 0, i64 0 %1 = load i64, i64* %0, align 8 %2 = getelementptr inbounds %"class.cl::sycl::id", %"class.cl::sycl::id"* %_arg_aA6, i64 0, i32 0, i32 0, i64 0 %3 = load i64, i64* %2, align 8 %add.ptr.i = getelementptr inbounds float, float addrspace(1)* %_arg_aA, i64 %3 %4 = getelementptr inbounds %"class.cl::sycl::id", %"class.cl::sycl::id"* %_arg_aC9, i64 0, i32 0, i32 0, i64 0 %5 = load i64, i64* %4, align 8 %add.ptr.i44 = getelementptr inbounds float, float addrspace(1)* %_arg_aC, i64 %5 %6 = tail call i32 @llvm.mlvm.read.mlu.sreg.taskidx() #5 %conv.i.i.i.i.i.i = sext i32 %6 to i64 %call.i.i.i.i.i.i = tail call i64 @_Z23__spirv_NumWorkgroups_xv() #6 %call1.i.i.i.i.i.i = tail call i64 @_Z23__spirv_WorkgroupSize_xv() #6 %call.i.i.i.i.i = tail call noundef i64 @_Z22__spirv_GlobalOffset_xv() #7 %cmp.i.i = icmp sgt i32 %6, -1 tail call void @llvm.assume(i1 %cmp.i.i) #5 %cmp.not.i = icmp ugt i64 %1, %conv.i.i.i.i.i.i br i1 %cmp.not.i, label %for.body.i.i, label %_ZNK2cl4sycl6detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idI Li1EEEE_EclES4_.exit for.body.i.i: ; preds = %entry, %for.body.i.i %indvars.iv.i.i = phi i64 [ %indvars.iv.next.i.i, %for.body.i.i ], [ 0, %entry ] %arrayidx.i.i6.i = getelementptr inbounds float, float addrspace(1)* %add.ptr.i, i64 %indvars.iv.i.i %arrayidx.ascast.i.i.i = addrspacecast float addrspace(1)* %arrayidx.i.i6.i to float* %7 = load float, float* %arrayidx.ascast.i.i.i, align 4, !tbaa !11 %arrayidx.i3.i.i = getelementptr inbounds float, float addrspace(101)* %_arg_localAccA, i64 %indvars.iv.i.i %arrayidx.ascast.i4.i.i = addrspacecast float addrspace(101)* %arrayidx.i3.i.i to float* store float %7, float* %arrayidx.ascast.i4.i.i, align 4, !tbaa !11 %add.i.i = fadd float %7, 5.000000e-01 %arrayidx.i11.i.i = getelementptr inbounds float, float addrspace(1)* %add.ptr.i44, i64 %indvars.iv.i.i %arrayidx.ascast.i12.i.i = addrspacecast float addrspace(1)* %arrayidx.i11.i.i to float* store float %add.i.i, float* %arrayidx.ascast.i12.i.i, align 4, !tbaa !11 %indvars.iv.next.i.i = add nuw nsw i64 %indvars.iv.i.i, 1 %exitcond.not.i.i = icmp eq i64 %indvars.iv.next.i.i, 256 br i1 %exitcond.not.i.i, label %_ZNK2cl4sycl6detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EclES4 _.exit, label %for.body.i.i, !llvm.loop !15 _ZNK2cl4sycl6detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EclES4_.exit: ; preds = %for.body.i.i, % entry ret void ; uselistorder directives uselistorder label %for.body.i.i, { 1, 0 } uselistorder i64 %indvars.iv.next.i.i, { 1, 0 } } ; Function Attrs: inaccessiblememonly mustprogress nocallback nofree nosync nounwind willreturn declare void @llvm.assume(i1 noundef) #1 ; Function Attrs: convergent declare dso_local noundef i64 @_Z22__spirv_GlobalOffset_xv() local_unnamed_addr #2 ; Function Attrs: convergent noinline norecurse define weak_odr dso_local void @_ZTSZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_EUlNS0_2idILi1EEEE_(float addrspace(101)* noundef align 4 %_arg_localAccA, %"class .cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_localAccA1, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_localAccA2, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_localAccA3, float addrspace(1)* noundef readonly align 4 %_arg_aA, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aA4, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aA5, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_aA6, float addrspace(1)* noundef align 4 %_arg_aC, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aC7, %"class.cl::sycl::range"* noundef byval(%"class.cl::sycl::range") align 8 %_arg_aC8, %"class.cl::sycl::id"* noundef byval(%"class.cl::sycl::id") align 8 %_arg_aC9) local_unnamed_addr #0 comdat !kernel_arg_buffer_location !17 !kernel_arg_runtime_aligned !18 !kernel_arg_exclusive_ptr !18 {entry: %0 = getelementptr inbounds %"class.cl::sycl::id", %"class.cl::sycl::id"* %_arg_aA6, i64 0, i32 0, i32 0, i64 0 %1 = load i64, i64* %0, align 8 %add.ptr.i = getelementptr inbounds float, float addrspace(1)* %_arg_aA, i64 %1 %2 = getelementptr inbounds %"class.cl::sycl::id", %"class.cl::sycl::id"* %_arg_aC9, i64 0, i32 0, i32 0, i64 0 %3 = load i64, i64* %2, align 8 %add.ptr.i41 = getelementptr inbounds float, float addrspace(1)* %_arg_aC, i64 %3 %4 = tail call i32 @llvm.mlvm.read.mlu.sreg.taskidx() #5 %call.i.i.i.i.i.i = tail call i64 @_Z23__spirv_NumWorkgroups_xv() #6 %call1.i.i.i.i.i.i = tail call i64 @_Z23__spirv_WorkgroupSize_xv() #6 %call.i.i.i.i.i = tail call noundef i64 @_Z22__spirv_GlobalOffset_xv() #7 %cmp.i.i = icmp sgt i32 %4, -1 tail call void @llvm.assume(i1 %cmp.i.i) #5 br label %for.body.i for.body.i: ; preds = %for.body.i, %entry %indvars.iv.i = phi i64 [ 0, %entry ], [ %indvars.iv.next.i, %for.body.i ] %arrayidx.i.i42 = getelementptr inbounds float, float addrspace(1)* %add.ptr.i, i64 %indvars.iv.i %arrayidx.ascast.i.i = addrspacecast float addrspace(1)* %arrayidx.i.i42 to float* %5 = load float, float* %arrayidx.ascast.i.i, align 4, !tbaa !11 %arrayidx.i3.i = getelementptr inbounds float, float addrspace(101)* %_arg_localAccA, i64 %indvars.iv.i %arrayidx.ascast.i4.i = addrspacecast float addrspace(101)* %arrayidx.i3.i to float* store float %5, float* %arrayidx.ascast.i4.i, align 4, !tbaa !11 %add.i = fadd float %5, 5.000000e-01 %arrayidx.i11.i = getelementptr inbounds float, float addrspace(1)* %add.ptr.i41, i64 %indvars.iv.i %arrayidx.ascast.i12.i = addrspacecast float addrspace(1)* %arrayidx.i11.i to float* store float %add.i, float* %arrayidx.ascast.i12.i, align 4, !tbaa !11 %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 %exitcond.not.i = icmp eq i64 %indvars.iv.next.i, 256 br i1 %exitcond.not.i, label %_ZZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_ENKUlNS0_2idILi1EEEE_clES5_.exit, label %for.body.i, !llvm.loop !15 _ZZZ4mainENKUlRN2cl4sycl7handlerEE_clES2_ENKUlNS0_2idILi1EEEE_clES5_.exit: ; preds = %for.body.i ret void ; uselistorder directives uselistorder i64 %indvars.iv.next.i, { 1, 0 } } ```It is found that the handler of the extended hardware does not have the address 1 address number. How to fix this problem? How are the variable parameters of address 1 address defined and used? reference to source code (clang/lib/Basic/Targets/NVPTX.h)NVPTXAddrSpaceMap . The relevant content I implemented is here MLISAAddrSpaceMap
Environment
Ubuntu 18.04 SYCL 2022-06 release version cuda version 11.2
Additional context
No response