BlinkDL / RWKV-LM

RWKV is an RNN with transformer-level LLM performance. It can be directly trained like a GPT (parallelizable). So it combines the best of RNN and transformer: great performance, fast inference, VRAM savings, fast training, "infinite" ctx_len, and free sentence embedding.
Apache License 2.0

WSL2 Ubuntu: `__global__` in wkv_cuda_bf16 and wkv_cuda is flagged with "this declaration has no storage class or type specifier" #180

Closed. OuroborosAL closed this issue 11 months ago.

OuroborosAL commented 11 months ago

```cuda
#include <stdio.h>
#include <assert.h>
#include "ATen/ATen.h"
#define MIN_VALUE (-1e38)

typedef at::BFloat16 bf16;

__global__ void kernel_forward(const int B, const int T, const int C,
                               const float *__restrict__ const _w, const bf16 *__restrict__ const _u,
                               const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,
                               bf16 *__restrict__ const _y) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int _b = idx / C;
    const int _c = idx % C;
    const int _offset = _b * T * C + _c;

    float u = float(_u[_c]);
    float w = _w[_c];
    const bf16 *__restrict__ const k = _k + _offset;
    const bf16 *__restrict__ const v = _v + _offset;
    bf16 *__restrict__ const y = _y + _offset;

    // aa and bb are running sums divided by exp(pp) (to avoid overflow)
    float aa = 0, bb = 0, pp = MIN_VALUE;
    for (int i = 0; i < T; i++) {
        const int ii = i * C;
        const float kk = float(k[ii]);
        const float vv = float(v[ii]);

        float ww = u + kk;
        float p = max(pp, ww);
        float e1 = exp(pp - p);
        float e2 = exp(ww - p);
        y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2));

        ww = w + pp;
        p = max(ww, kk);
        e1 = exp(ww - p);
        e2 = exp(kk - p);
        aa = e1 * aa + e2 * vv;
        bb = e1 * bb + e2;
        pp = p;
    }
}
```
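For readers puzzling over the loop: in unshifted variables, each channel maintains the recurrence (my reading of the code above, with $t$ indexing the time step)

$$a_t = e^{w} a_{t-1} + e^{k_t} v_t, \qquad b_t = e^{w} b_{t-1} + e^{k_t}, \qquad y_t = \frac{a_{t-1} + e^{u + k_t}\, v_t}{b_{t-1} + e^{u + k_t}}.$$

Computing $a_t$ and $b_t$ directly would overflow float32 quickly, so the code instead stores `aa` $= a_t e^{-pp}$ and `bb` $= b_t e^{-pp}$, where `pp` is a running shift chosen (via the two `max` calls) so that every `exp()` argument is $\le 0$.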

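Not part of the error, but useful for reading the indexing: `idx = blockIdx.x * blockDim.x + threadIdx.x` means each thread owns one (batch, channel) series of length T. Below is a minimal sketch of a host-side launcher under that assumption; the name `cuda_forward` and the block size are my guesses, not necessarily the repo's exact wrapper:

```cuda
#include <algorithm>
#include <cassert>

// Hypothetical launcher: one thread per (batch, channel) pair,
// so the grid has to cover exactly B * C threads.
void cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) {
    dim3 threadsPerBlock(std::min(C, 32));   // block size is an assumption
    assert(B * C % threadsPerBlock.x == 0);  // grid must tile B * C exactly
    dim3 numBlocks(B * C / threadsPerBlock.x);
    kernel_forward<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y);
}
```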
The diagnostic VS Code reports:

```json
[{
    "resource": "/root/RWKV-LM-main/RWKV-v4neo/cuda/wkv_cuda_bf16.cu",
    "owner": "C/C++: IntelliSense",
    "code": "77",
    "severity": 8,
    "message": "this declaration has no storage class or type specifier",
    "source": "C/C++",
    "startLineNumber": 7,
    "startColumn": 1,
    "endLineNumber": 7,
    "endColumn": 11
}]
```

Columns 1–11 of line 7 are exactly the `__global__` token, so it is the CUDA keyword itself that IntelliSense does not recognize. Earlier, all of the `#include` lines were flagged the same way; I fixed those by adding the include paths to the configuration.
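For anyone hitting the same thing: the diagnostic's owner is `C/C++: IntelliSense`, so `nvcc` is typically still compiling the file fine and only the editor needs configuring. One common fix is to point the VS Code C/C++ extension at the CUDA and PyTorch headers in `.vscode/c_cpp_properties.json`. A sketch, with placeholder paths that are assumptions for a typical WSL2 setup:

```json
{
    "configurations": [
        {
            "name": "Linux",
            "includePath": [
                "${workspaceFolder}/**",
                "/usr/local/cuda/include",
                "/path/to/site-packages/torch/include",
                "/path/to/site-packages/torch/include/torch/csrc/api/include"
            ],
            "compilerPath": "/usr/local/cuda/bin/nvcc",
            "cppStandard": "c++17",
            "intelliSenseMode": "linux-gcc-x64"
        }
    ],
    "version": 4
}
```

The torch include paths are what `#include "ATen/ATen.h"` resolves against (locate yours with `python -c "import torch; print(torch.__path__)"`), and pointing `compilerPath` at `nvcc` usually gets IntelliSense to accept `__global__` and `__restrict__` as well.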