Open wbrickner opened 3 weeks ago
the source gets generated and looks fine.
typedef unsigned int uint;
extern "C"
__global__ void kernel(
float input_0[], float output_0[], uint info[]
) {
int3 absoluteIdx = make_int3(
blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z
);
uint idxGlobal = (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) + (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
uint rank = info[0];
uint rank_2 = rank * 2;
uint l_0_0;
bool l_0_1;
float l_0_2;
float l_0_3;
l_0_0 = info[(2 * 2 * info[0]) + 1];
l_0_1 = idxGlobal < l_0_0;
if (l_0_1) {
l_0_2 = input_0[idxGlobal];
l_0_3 = sqrt(float(2.0));
l_0_3 = l_0_2 / l_0_3;
l_0_3 = erf(l_0_3);
l_0_3 = l_0_3 + float(1.0);
l_0_2 = l_0_2 * l_0_3;
l_0_2 = l_0_2 / float(2.0);
output_0[idxGlobal] = l_0_2;
}
}
the decoded PTX:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-30672275
// Cuda compilation tools, release 11.5, V11.5.119
// Based on NVVM 7.0.1
//
.version 7.5
.target sm_70
.address_size 64
// .globl kernel
.visible .entry kernel(
.param .u64 kernel_param_0,
.param .u64 kernel_param_1,
.param .u64 kernel_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<31>;
.reg .b32 %r<27>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [kernel_param_0];
ld.param.u64 %rd3, [kernel_param_1];
ld.param.u64 %rd4, [kernel_param_2];
cvta.to.global.u64 %rd5, %rd4;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
mad.lo.s32 %r5, %r3, %r2, %r4;
mov.u32 %r6, %ntid.y;
mov.u32 %r7, %ctaid.y;
mov.u32 %r8, %tid.y;
mad.lo.s32 %r9, %r7, %r6, %r8;
mov.u32 %r10, %ntid.z;
mov.u32 %r11, %ctaid.z;
mov.u32 %r12, %tid.z;
mad.lo.s32 %r13, %r11, %r10, %r12;
mov.u32 %r14, %nctaid.y;
mul.lo.s32 %r15, %r6, %r14;
mad.lo.s32 %r16, %r15, %r13, %r9;
mov.u32 %r17, %nctaid.x;
mul.lo.s32 %r18, %r17, %r2;
mad.lo.s32 %r1, %r18, %r16, %r5;
ld.global.u32 %r19, [%rd5];
shl.b32 %r20, %r19, 2;
or.b32 %r21, %r20, 1;
mul.wide.u32 %rd6, %r21, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.u32 %r22, [%rd7];
setp.ge.u32 %p1, %r1, %r22;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd8, %rd2;
cvt.u64.u32 %rd1, %r1;
mul.wide.u32 %rd9, %r1, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.f32 %f1, [%rd10];
div.rn.f32 %f2, %f1, 0f3FB504F3;
abs.f32 %f6, %f2;
setp.ltu.f32 %p2, %f6, 0f3F8060FE;
setp.ge.f32 %p3, %f6, 0f3F8060FE;
mul.f32 %f7, %f2, %f2;
selp.f32 %f8, %f6, %f7, %p3;
selp.f32 %f9, 0f3789CA3C, 0f38B1E96A, %p3;
selp.f32 %f10, 0fB9F560B9, 0fBA574D20, %p3;
fma.rn.f32 %f11, %f9, %f8, %f10;
selp.f32 %f12, 0f3BAC840B, 0f3BAAD5EA, %p3;
fma.rn.f32 %f13, %f11, %f8, %f12;
selp.f32 %f14, 0fBD0C8162, 0fBCDC1BE7, %p3;
fma.rn.f32 %f15, %f13, %f8, %f14;
selp.f32 %f16, 0f3E1CF906, 0f3DE718AF, %p3;
fma.rn.f32 %f17, %f15, %f8, %f16;
selp.f32 %f18, 0f3F6A937E, 0fBEC093AC, %p3;
fma.rn.f32 %f19, %f17, %f8, %f18;
selp.f32 %f20, 0f3F20D842, 0f3E0375D3, %p3;
fma.rn.f32 %f21, %f19, %f8, %f20;
neg.f32 %f22, %f6;
selp.f32 %f23, %f22, %f2, %p3;
fma.rn.f32 %f30, %f21, %f23, %f23;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f24, %f30;
mov.f32 %f25, 0f3F800000;
sub.f32 %f26, %f25, %f24;
mov.b32 %r23, %f26;
mov.b32 %r24, %f2;
and.b32 %r25, %r24, -2147483648;
or.b32 %r26, %r25, %r23;
mov.b32 %f30, %r26;
$L__BB0_3:
add.f32 %f27, %f30, 0f3F800000;
mul.f32 %f28, %f1, %f27;
mul.f32 %f29, %f28, 0f3F000000;
cvta.to.global.u64 %rd11, %rd3;
shl.b64 %rd12, %rd1, 2;
add.s64 %rd13, %rd11, %rd12;
st.global.f32 [%rd13], %f29;
$L__BB0_4:
ret;
}
we then die at cudarc::driver::result::module::load_data(ptx.as_ptr() as *const _)
Okay the issue is that the arch
argument is incorrect for my GPU (was 70).
I am running an old 1050 Ti. Editing arch to be 50
in debugger gives success.
arch
comes from self.minimum_arch_version
which is written in CudaServer::new
:
/// Create a new cuda server.
pub(crate) fn new(index: usize, init: Box<dyn Fn(usize) -> CudaContext<MM>>) -> Self {
let archs = unsafe {
let mut num_supported_arg: core::ffi::c_int = 0;
cudarc::nvrtc::sys::lib()
.nvrtcGetNumSupportedArchs(core::ptr::from_mut(&mut num_supported_arg));
let mut archs: Vec<core::ffi::c_int> = vec![0; num_supported_arg as usize];
cudarc::nvrtc::sys::lib().nvrtcGetSupportedArchs(core::ptr::from_mut(&mut archs[0]));
archs
};
println!("Supported archs: {:?}", archs);
let minimum_arch_version = archs[0];
Self {
state: CudaServerState::Uninitialized {
device_index: index,
init,
},
logger: DebugLogger::new(),
archs,
minimum_arch_version,
}
}
We see
Supported archs: [35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86, 87]
and indeed at that time minimum_arch_version = 35
.
It becomes 70
because of CudaRuntime::client
:
if let Some(wmma_minimum_version) = register_wmma_features(&mut features, &server.archs)
{
server.minimum_arch_version =
i32::max(server.minimum_arch_version, wmma_minimum_version);
}
so if i follow, you are trying to emit WMMA instructions which require a more recent gpu architecture than the real GPU.
Was working on a PR, but a kindof complicated redesign is necessary I think
Issue is we can't tell the max arch for a device without initializing. The design of CudaServer
is lazy init.
But we require construction of Features
before then. we need to be able to get device compute capability to filter archs
.
I think this part of the system needs a rethink. I vote to either eager init or relocate archs
/ minimum_arch_version
and redesign the types.
Thanks for the details. It's kinda weird that compiling using one of the supported archs generates invalid PTX.
the supported archs are just queried from the nvidia runtime compiler and are not filtered by the actual card capability. this is bc the card capability is unknown until it is initialized which is deferred in the current implementation
(the compilation is not failing, the loading / execution is failing bc any PTX arch too recent is invalid for the device)
the supported archs are just queried from the nvidia runtime compiler and are not filtered by the actual card capability. this is bc the card capability is unknown until it is initialized which is deferred in the current implementation
So maybe we should initialize the device earlier. I had deferred it due to multi-threading issues initially, but now that I think those issues are resolved, we might not need to defer it anymore.