tracel-ai / models

Models and examples built with Burn
Apache License 2.0
180 stars 24 forks source link

Llama 3.1 #38

Closed laggui closed 2 months ago

laggui commented 2 months ago

Ported the new Llama weights to our implementation along with their custom RoPE frequency scaling.

Made a couple of additional changes to fix minor things along the way:

TODO:

laggui commented 2 months ago

Some notes:

The wgpu backend fails with out-of-memory errors while loading the record (even with TinyLlama):

// WSL
Loading record...
thread 'main' panicked at /home/laggui/.cargo/registry/src/index.crates.io-6f17d22bba15001f/cubecl-runtime-0.2.0/src/memory_management/dynamic.rs:156:9:
No memory pool big enough to reserve 262144000 bytes.
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
// Windows
Loading record...
thread 'main' panicked at C:\Users\guila\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-22.1.0\src\backend\wgpu_core.rs:3411:5:
wgpu error: Validation Error

Caused by:
  In Device::create_buffer
    Not enough memory left.

note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
error: process didn't exit successfully: `target\release\examples\chat.exe` (exit code: 101)

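The cuda backend currently uses f32 because the generated f16 kernels fail to compile: the vectorized code assigns to `i_0`/`i_1` members on `__half2`, which has no such members (its components are `x` and `y`):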

Loading record...
Loaded in 1s
Processing prompt: How many helicopters can a human eat in one sitting?
thread 'main' panicked at /home/laggui/.cargo/registry/src/index.crates.io-6f17d22bba15001f/cubecl-cuda-0.2.0/src/compute/server.rs:237:17:
[Compilation Error]
    default_program(56): error: class "__half2" has no member "i_0"
      l_0_8.i_0 = __half(0.0);
            ^
    default_program(57): error: class "__half2" has no member "i_1"
      l_0_8.i_1 = __half(0.0);
            ^
    default_program(66): error: class "__half2" has no member "i_0"
      l_0_9.i_0 = __half(0.0);
            ^
    default_program(67): error: class "__half2" has no member "i_1"
      l_0_9.i_1 = __half(0.0);
            ^
    4 errors detected in the compilation of "default_program".
[Source]
#include <cuda_fp16.h>
typedef unsigned int uint;

extern "C" __global__ void kernel(
__half2 input_0[],__half2 input_1[],uint info[]
) {

    int3 absoluteIdx = make_int3(
        blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y,
        blockIdx.z * blockDim.z + threadIdx.z
    );

    uint idxGlobal = (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) + (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
uint rank = info[0];
uint rank_2 = rank * 2;
uint l_0_0;
uint l_0_1;
uint l_0_2;
uint l_0_3;
bool l_0_4;
uint l_0_5;
uint l_0_6;
uint l_0_7;
__half2 l_0_8;
__half2 l_0_9;
l_0_0 = idxGlobal;
l_0_1 = idxGlobal;
l_0_2 = idxGlobal;
l_0_3 = info[(2 * 2 * info[0]) + 1] / 2;
l_0_4 = l_0_0 >= l_0_3;
if (l_0_4) {
return;}
l_0_3 = l_0_0 * uint(2);
l_0_5 = uint(0);

for (uint l_1_0 = uint(0); l_1_0 < rank; ++l_1_0) {
l_0_6 = info[(0 * rank_2) + l_1_0 + 1];
l_0_6 = l_0_3 / l_0_6;
l_0_7 = info[(1 * rank_2) + rank + l_1_0 + 1];
l_0_6 = l_0_6 % l_0_7;
l_0_7 = info[(1 * rank_2) + l_1_0 + 1];
l_0_6 = l_0_6 * l_0_7;
l_0_5 = l_0_5 + l_0_6;
}
l_0_5 = l_0_5 / uint(2);
l_0_2 = l_0_5;
uint l_0_10;
bool l_0_11;
l_0_10 = info[(2 * 2 * info[0]) + 1] / 2;
l_0_11 = l_0_1 < l_0_10;
if (l_0_11) {
l_0_8 = input_0[l_0_1];
} else {
l_0_8.i_0 = __half(0.0);
l_0_8.i_1 = __half(0.0);
}
uint l_0_12;
bool l_0_13;
l_0_12 = info[(2 * 2 * info[0]) + 2] / 2;
l_0_13 = l_0_2 < l_0_12;
if (l_0_13) {
l_0_9 = input_1[l_0_2];
} else {
l_0_9.i_0 = __half(0.0);
l_0_9.i_1 = __half(0.0);
}
l_0_8 = l_0_8 * l_0_9;
uint l_0_14;
bool l_0_15;
l_0_14 = info[(2 * 2 * info[0]) + 1] / 2;
l_0_15 = l_0_0 < l_0_14;
if (l_0_15) {
input_0[l_0_0] = l_0_8;
}

}
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace