RuntimeError: Error building extension 'slstm_HS128BS8NH4NS4DBfDRbDWbDGbDSbDAfNG4SA1GRCV0GRC0d0FCV0FC0d0'

Building extension module slstm_HS128BS8NH4NS4DBfDRbDWbDGbDSbDAfNG4SA1GRCV0GRC0d0FCV0FC0d0... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) {'verbose': True, 'with_cuda': True, 'extra_ldflags': ['-L/home/songx_lab/cse12110714/cuda-12.1/lib', '-lcublas'], 'extra_cflags': ['-DSLSTM_HIDDEN_SIZE=128', '-DSLSTM_BATCH_SIZE=8', '-DSLSTM_NUM_HEADS=4', '-DSLSTM_NUM_STATES=4', '-DSLSTM_DTYPE_B=float', '-DSLSTM_DTYPE_R=nv_bfloat16', '-DSLSTM_DTYPE_W=__nv_bfloat16', '-DSLSTM_DTYPE_G=nv_bfloat16', '-DSLSTM_DTYPE_S=nv_bfloat16', '-DSLSTM_DTYPE_A=float', '-DSLSTM_NUM_GATES=4', '-DSLSTM_SIMPLE_AGG=true', '-DSLSTM_GRADIENT_RECURRENT_CLIPVAL_VALID=false', '-DSLSTM_GRADIENT_RECURRENT_CLIPVAL=0.0', '-DSLSTM_FORWARD_CLIPVAL_VALID=false', '-DSLSTM_FORWARD_CLIPVAL=0.0', '-U__CUDA_NO_HALF_OPERATORS', '-UCUDA_NO_HALF_CONVERSIONS', '-UCUDA_NO_BFLOAT16_OPERATORS', '-UCUDA_NO_BFLOAT16_CONVERSIONS', '-UCUDA_NO_BFLOAT162_OPERATORS', '-UCUDA_NO_BFLOAT162_CONVERSIONS'], 'extra_cuda_cflags': ['-Xptxas="-v"', '-gencode', 'arch=compute_80,code=compute_80', '-res-usage', '--use_fast_math', '-O3', '-Xptxas -O3', '--extra-device-vectorization', '-DSLSTM_HIDDEN_SIZE=128', '-DSLSTM_BATCH_SIZE=8', '-DSLSTM_NUM_HEADS=4', '-DSLSTM_NUM_STATES=4', '-DSLSTM_DTYPE_B=float', '-DSLSTM_DTYPE_R=nv_bfloat16', '-DSLSTM_DTYPE_W=nv_bfloat16', '-DSLSTM_DTYPE_G=nv_bfloat16', '-DSLSTM_DTYPE_S=__nv_bfloat16', '-DSLSTM_DTYPE_A=float', '-DSLSTM_NUM_GATES=4', '-DSLSTM_SIMPLE_AGG=true', '-DSLSTM_GRADIENT_RECURRENT_CLIPVAL_VALID=false', '-DSLSTM_GRADIENT_RECURRENT_CLIPVAL=0.0', '-DSLSTM_FORWARD_CLIPVAL_VALID=false', '-DSLSTM_FORWARD_CLIPVAL=0.0', '-UCUDA_NO_HALF_OPERATORS', '-UCUDA_NO_HALF_CONVERSIONS', '-UCUDA_NO_BFLOAT16_OPERATORS', '-UCUDA_NO_BFLOAT16_CONVERSIONS', '-UCUDA_NO_BFLOAT162_OPERATORS', '-UCUDA_NO_BFLOAT162_CONVERSIONS']} [1/6] /home/songx_lab/cse12110714/cuda-12.1/bin/nvcc -DTORCH_EXTENSION_NAME=slstm_HS128BS8NH4NS4DBfDRbDWbDGbDSbDAfNG4SA1GRCV0GRC0d0FCV0FC0d0 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include/TH -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include/THC -isystem /home/songx_lab/cse12110714/cuda-12.1/include -isystem /home/songx_lab/cse12110714/.conda/envs/bert/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -DCUDA_NO_HALF_OPERATORS -DCUDA_NO_HALF_CONVERSIONS -DCUDA_NO_BFLOAT16_CONVERSIONS -DCUDA_NO_HALF2_OPERATORS --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -Xptxas="-v" -gencode arch=compute_80,code=compute_80 -res-usage --use_fast_math -O3 -Xptxas -O3 --extra-device-vectorization -DSLSTM_HIDDEN_SIZE=128 -DSLSTM_BATCH_SIZE=8 -DSLSTM_NUM_HEADS=4 -DSLSTM_NUM_STATES=4 -DSLSTM_DTYPE_B=float -DSLSTM_DTYPE_R=nv_bfloat16 -DSLSTM_DTYPE_W=nv_bfloat16 -DSLSTM_DTYPE_G=__nv_bfloat16 -DSLSTM_DTYPE_S=nv_bfloat16 -DSLSTM_DTYPE_A=float -DSLSTM_NUM_GATES=4 -DSLSTM_SIMPLE_AGG=true -DSLSTM_GRADIENT_RECURRENT_CLIPVAL_VALID=false -DSLSTM_GRADIENT_RECURRENT_CLIPVAL=0.0 -DSLSTM_FORWARD_CLIPVAL_VALID=false -DSLSTM_FORWARD_CLIPVAL=0.0 -UCUDA_NO_HALF_OPERATORS -UCUDA_NO_HALF_CONVERSIONS -UCUDA_NO_BFLOAT16_OPERATORS -UCUDA_NO_BFLOAT16_CONVERSIONS -UCUDA_NO_BFLOAT162_OPERATORS -UCUDA_NO_BFLOAT162_CONVERSIONS -std=c++17 -c /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/xlstm/blocks/slstm/src/cuda/slstm_forward.cu -o slstm_forward.cuda.o FAILED: slstm_forward.cuda.o /home/songx_lab/cse12110714/cuda-12.1/bin/nvcc -DTORCH_EXTENSION_NAME=slstm_HS128BS8NH4NS4DBfDRbDWbDGbDSbDAfNG4SA1GRCV0GRC0d0FCV0FC0d0 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include/TH -isystem /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/torch/include/THC -isystem /home/songx_lab/cse12110714/cuda-12.1/include -isystem /home/songx_lab/cse12110714/.conda/envs/bert/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -DCUDA_NO_HALF_OPERATORS -DCUDA_NO_HALF_CONVERSIONS -DCUDA_NO_BFLOAT16_CONVERSIONS -DCUDA_NO_HALF2_OPERATORS --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -Xptxas="-v" -gencode arch=compute_80,code=compute_80 -res-usage --use_fast_math -O3 -Xptxas -O3 --extra-device-vectorization -DSLSTM_HIDDEN_SIZE=128 -DSLSTM_BATCH_SIZE=8 -DSLSTM_NUM_HEADS=4 -DSLSTM_NUM_STATES=4 -DSLSTM_DTYPE_B=float -DSLSTM_DTYPE_R=nv_bfloat16 -DSLSTM_DTYPE_W=__nv_bfloat16 -DSLSTM_DTYPE_G=nv_bfloat16 -DSLSTM_DTYPE_S=nv_bfloat16 -DSLSTM_DTYPE_A=float -DSLSTM_NUM_GATES=4 -DSLSTM_SIMPLE_AGG=true -DSLSTM_GRADIENT_RECURRENT_CLIPVAL_VALID=false -DSLSTM_GRADIENT_RECURRENT_CLIPVAL=0.0 -DSLSTM_FORWARD_CLIPVAL_VALID=false -DSLSTM_FORWARD_CLIPVAL=0.0 -U__CUDA_NO_HALF_OPERATORS -UCUDA_NO_HALF_CONVERSIONS -UCUDA_NO_BFLOAT16_OPERATORS -UCUDA_NO_BFLOAT16_CONVERSIONS -UCUDA_NO_BFLOAT162_OPERATORS -UCUDA_NO_BFLOAT162_CONVERSIONS -std=c++17 -c /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/xlstm/blocks/slstm/src/cuda/slstm_forward.cu -o slstm_forward.cuda.o /home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/xlstm/blocks/slstm/src/cuda/../util/inline_ops_bf16.cuh(76): error: no suitable user-defined conversion from "const nv_bfloat16" to "half" exists return __hadd_rn(a, b); ^

/home/songx_lab/cse12110714/.conda/envs/bert/lib/python3.10/site-packages/xlstm/blocks/slstm/src/cuda/../util/inline_ops_bf16.cuh(76): error: no suitable user-defined conversion from "const __nv_bfloat16" to "half" exists return hadd_rn(a, b);

I met this problem and there are hundreds of errors like "no suitable user-defined conversion from ***" How can I solve it? Is there anyone facing the same problem? I have tried install cccl and my env variables are properly set including PATH, CUDA_HOME.

NX-AI / xlstm

RuntimeError: Error building extension 'slstm_HS128BS8NH4NS4DBfDRbDWbDGbDSbDAfNG4SA1GRCV0GRC0d0FCV0FC0d0' #43