Some notes:
The `wgpu` backend fails due to lack of memory (even with TinyLlama):

```
// WSL
Loading record...
thread 'main' panicked at /home/laggui/.cargo/registry/src/index.crates.io-6f17d22bba15001f/cubecl-runtime-0.2.0/src/memory_management/dynamic.rs:156:9:
No memory pool big enough to reserve 262144000 bytes.
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
// Windows
Loading record...
thread 'main' panicked at C:\Users\guila\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-22.1.0\src\backend\wgpu_core.rs:3411:5:
wgpu error: Validation Error
Caused by:
In Device::create_buffer
Not enough memory left.
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
error: process didn't exit successfully: `target\release\examples\chat.exe` (exit code: 101)
```
The `cuda` backend currently uses f32 because f16 has compilation errors. The generated kernel assigns to `__half2` members `i_0`/`i_1`, which do not exist (`cuda_fp16.h` exposes the two halves as `.x` and `.y`):

```
Loading record...
Loaded in 1s
Processing prompt: How many helicopters can a human eat in one sitting?
thread 'main' panicked at /home/laggui/.cargo/registry/src/index.crates.io-6f17d22bba15001f/cubecl-cuda-0.2.0/src/compute/server.rs:237:17:
[Compilation Error]
default_program(56): error: class "__half2" has no member "i_0"
l_0_8.i_0 = __half(0.0);
^
default_program(57): error: class "__half2" has no member "i_1"
l_0_8.i_1 = __half(0.0);
^
default_program(66): error: class "__half2" has no member "i_0"
l_0_9.i_0 = __half(0.0);
^
default_program(67): error: class "__half2" has no member "i_1"
l_0_9.i_1 = __half(0.0);
^
4 errors detected in the compilation of "default_program".
[Source]
#include <cuda_fp16.h>
typedef unsigned int uint;
extern "C" __global__ void kernel(
__half2 input_0[],__half2 input_1[],uint info[]
) {
int3 absoluteIdx = make_int3(
blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z
);
uint idxGlobal = (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) + (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
uint rank = info[0];
uint rank_2 = rank * 2;
uint l_0_0;
uint l_0_1;
uint l_0_2;
uint l_0_3;
bool l_0_4;
uint l_0_5;
uint l_0_6;
uint l_0_7;
__half2 l_0_8;
__half2 l_0_9;
l_0_0 = idxGlobal;
l_0_1 = idxGlobal;
l_0_2 = idxGlobal;
l_0_3 = info[(2 * 2 * info[0]) + 1] / 2;
l_0_4 = l_0_0 >= l_0_3;
if (l_0_4) {
return;}
l_0_3 = l_0_0 * uint(2);
l_0_5 = uint(0);
for (uint l_1_0 = uint(0); l_1_0 < rank; ++l_1_0) {
l_0_6 = info[(0 * rank_2) + l_1_0 + 1];
l_0_6 = l_0_3 / l_0_6;
l_0_7 = info[(1 * rank_2) + rank + l_1_0 + 1];
l_0_6 = l_0_6 % l_0_7;
l_0_7 = info[(1 * rank_2) + l_1_0 + 1];
l_0_6 = l_0_6 * l_0_7;
l_0_5 = l_0_5 + l_0_6;
}
l_0_5 = l_0_5 / uint(2);
l_0_2 = l_0_5;
uint l_0_10;
bool l_0_11;
l_0_10 = info[(2 * 2 * info[0]) + 1] / 2;
l_0_11 = l_0_1 < l_0_10;
if (l_0_11) {
l_0_8 = input_0[l_0_1];
} else {
l_0_8.i_0 = __half(0.0);
l_0_8.i_1 = __half(0.0);
}
uint l_0_12;
bool l_0_13;
l_0_12 = info[(2 * 2 * info[0]) + 2] / 2;
l_0_13 = l_0_2 < l_0_12;
if (l_0_13) {
l_0_9 = input_1[l_0_2];
} else {
l_0_9.i_0 = __half(0.0);
l_0_9.i_1 = __half(0.0);
}
l_0_8 = l_0_8 * l_0_9;
uint l_0_14;
bool l_0_15;
l_0_14 = info[(2 * 2 * info[0]) + 1] / 2;
l_0_15 = l_0_0 < l_0_14;
if (l_0_15) {
input_0[l_0_0] = l_0_8;
}
}
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
```
Ported the new Llama weights to our implementation along with their custom RoPE frequency scaling.
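For context, the custom scaling follows the Llama 3.1 scheme. Below is a minimal Rust sketch of that scaling, assuming the reference hyperparameters (scale factor 8, low/high frequency factors 1 and 4, original context length 8192); the function name and integration point are illustrative, not the code added in this PR.

```rust
use std::f32::consts::PI;

/// Sketch of Llama 3.1-style RoPE frequency scaling, applied to the base
/// rotary frequencies before building the cos/sin cache.
pub fn scale_rope_frequencies(freqs: &[f32]) -> Vec<f32> {
    let scale_factor = 8.0;
    let low_freq_factor = 1.0;
    let high_freq_factor = 4.0;
    let old_context_len = 8192.0;

    let low_freq_wavelen = old_context_len / low_freq_factor;
    let high_freq_wavelen = old_context_len / high_freq_factor;

    freqs
        .iter()
        .map(|&freq| {
            let wavelen = 2.0 * PI / freq;
            if wavelen < high_freq_wavelen {
                // High-frequency (short wavelength) components are left untouched.
                freq
            } else if wavelen > low_freq_wavelen {
                // Low-frequency components are scaled down by the full factor.
                freq / scale_factor
            } else {
                // Frequencies in between are smoothly interpolated.
                let smooth = (old_context_len / wavelen - low_freq_factor)
                    / (high_freq_factor - low_freq_factor);
                (1.0 - smooth) * freq / scale_factor + smooth * freq
            }
        })
        .collect()
}
```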
Made a couple of additional changes to fix minor things along the way:

- `max_seq_len` is now properly propagated (the command argument value was not used before)
- Added an `import` feature flag to keep the PyTorch weights loading code optional (see the sketch after this list)

TODO:
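Regarding the `import` feature flag above: a minimal sketch of how such a flag typically gates the optional loading path. The feature name matches the description, but the manifest entries and function below are illustrative assumptions, not the PR's actual code.

```rust
// Cargo.toml (illustrative):
//
// [features]
// default = []
// import = []   # the real manifest would also enable the PyTorch-loading dependencies here

/// Compiled only when building with `--features import`.
#[cfg(feature = "import")]
pub fn load_pytorch_weights(path: &str) {
    // Conversion from the original PyTorch checkpoint would live here.
    println!("loading PyTorch weights from {path}");
}

/// Fallback when the feature is disabled, so the import dependencies are never built.
#[cfg(not(feature = "import"))]
pub fn load_pytorch_weights(_path: &str) {
    panic!("recompile with `--features import` to load PyTorch weights");
}
```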