Closed laggui closed 1 month ago
Trying to run Llama 3.1 or TinyLlama with CUDA using `f16` raises compilation errors:
thread 'main' panicked at /home/laggui/.cargo/registry/src/index.crates.io-6f17d22bba15001f/cubecl-cuda-0.2.0/src/compute/server.rs:237:17: [Compilation Error] default_program(56): error: class "__half2" has no member "i_0" l_0_8.i_0 = __half(0.0); ^ default_program(57): error: class "__half2" has no member "i_1" l_0_8.i_1 = __half(0.0); ^ default_program(66): error: class "__half2" has no member "i_0" l_0_9.i_0 = __half(0.0); ^ default_program(67): error: class "__half2" has no member "i_1" l_0_9.i_1 = __half(0.0); ^ 4 errors detected in the compilation of "default_program". [Source] #include <cuda_fp16.h> typedef unsigned int uint; extern "C" __global__ void kernel( __half2 input_0[],__half2 input_1[],uint info[] ) { int3 absoluteIdx = make_int3( blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y, blockIdx.z * blockDim.z + threadIdx.z ); uint idxGlobal = (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) + (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x; uint rank = info[0]; uint rank_2 = rank * 2; uint l_0_0; uint l_0_1; uint l_0_2; uint l_0_3; bool l_0_4; uint l_0_5; uint l_0_6; uint l_0_7; __half2 l_0_8; __half2 l_0_9; l_0_0 = idxGlobal; l_0_1 = idxGlobal; l_0_2 = idxGlobal; l_0_3 = info[(2 * 2 * info[0]) + 1] / 2; l_0_4 = l_0_0 >= l_0_3; if (l_0_4) { return;} l_0_3 = l_0_0 * uint(2); l_0_5 = uint(0); for (uint l_1_0 = uint(0); l_1_0 < rank; ++l_1_0) { l_0_6 = info[(0 * rank_2) + l_1_0 + 1]; l_0_6 = l_0_3 / l_0_6; l_0_7 = info[(1 * rank_2) + rank + l_1_0 + 1]; l_0_6 = l_0_6 % l_0_7; l_0_7 = info[(1 * rank_2) + l_1_0 + 1]; l_0_6 = l_0_6 * l_0_7; l_0_5 = l_0_5 + l_0_6; } l_0_5 = l_0_5 / uint(2); l_0_2 = l_0_5; uint l_0_10; bool l_0_11; l_0_10 = info[(2 * 2 * info[0]) + 1] / 2; l_0_11 = l_0_1 < l_0_10; if (l_0_11) { l_0_8 = input_0[l_0_1]; } else { l_0_8.i_0 = __half(0.0); l_0_8.i_1 = __half(0.0); } uint l_0_12; bool l_0_13; l_0_12 = info[(2 * 2 * info[0]) + 2] / 2; l_0_13 = l_0_2 < l_0_12; if (l_0_13) { l_0_9 = 
input_1[l_0_2]; } else { l_0_9.i_0 = __half(0.0); l_0_9.i_1 = __half(0.0); } l_0_8 = l_0_8 * l_0_9; uint l_0_14; bool l_0_15; l_0_14 = info[(2 * 2 * info[0]) + 1] / 2; l_0_15 = l_0_0 < l_0_14; if (l_0_15) { input_0[l_0_0] = l_0_8; } } note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Fixed by #92: the generated CUDA accesses nonexistent `__half2` members (`i_0`/`i_1`); the vectorized half type only exposes its two lanes as `.x`/`.y`.
Trying to run Llama 3.1 or TinyLlama with CUDA using `f16` raises compilation errors: