josStorer / RWKV-Runner

An RWKV management and startup tool, fully automated, only 8 MB, that also provides an interface compatible with the OpenAI API. RWKV is a large language model that is fully open source and available for commercial use.
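For reference, "compatible with the OpenAI API" means a locally running RWKV-Runner instance can be called like any OpenAI-style endpoint. A minimal sketch follows; the host, port, path, and model name are assumptions about a local setup, not values confirmed by this issue:

```python
# Minimal sketch, not an official client: base URL, path, and model name below
# are assumptions about a locally running RWKV-Runner instance.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",  # assumed local address and path
    json={
        "model": "rwkv",  # placeholder; the server answers with whichever model is loaded
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 100,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```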
https://www.rwkv.com
MIT License

Error building extension 'wkv_2048_bf16' (includes the full error output and local environment) #240

Closed damingge0 closed 10 months ago

damingge0 commented 10 months ago

RWKV_MY_TESTING
Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/2] /usr/bin/nvcc -DTORCH_EXTENSION_NAME=wkv_2048_bf16 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.8/dist-packages/torch/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.8/dist-packages/torch/include/THC -isystem /usr/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -t 4 -std=c++17 -res-usage --maxrregcount 60 --use_fast_math -O3 -Xptxas -O3 --extra-device-vectorization -DTmax=2048 -c /mnt/g/rwkv/finetune/lora/cuda/wkv_cuda_bf16.cu -o wkv_cuda_bf16.cuda.o
FAILED: wkv_cuda_bf16.cuda.o
/usr/bin/nvcc -DTORCH_EXTENSION_NAME=wkv_2048_bf16 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.8/dist-packages/torch/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.8/dist-packages/torch/include/THC -isystem /usr/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -t 4 -std=c++17 -res-usage --maxrregcount 60 --use_fast_math -O3 -Xptxas -O3 --extra-device-vectorization -DTmax=2048 -c /mnt/g/rwkv/finetune/lora/cuda/wkv_cuda_bf16.cu -o wkv_cuda_bf16.cuda.o
nvcc fatal : Unknown option 't'
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1900, in _run_ninja_build
    subprocess.run(
  File "/usr/lib/python3.8/subprocess.py", line 516, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "./finetune/lora/train.py", line 339, in <module>
    from src.trainer import train_callback, generate_init_weight
  File "/mnt/g/rwkv/finetune/lora/src/trainer.py", line 6, in <module>
    from .model import LORA_CONFIG
  File "/mnt/g/rwkv/finetune/lora/src/model.py", line 56, in <module>
    wkv_cuda = load(name=f"wkv_{T_MAX}_bf16", sources=["finetune/lora/cuda/wkv_op_bf16.cpp", "finetune/lora/cuda/wkv_cuda_bf16.cu"], verbose=True, extra_cuda_cflags=["-t 4", "-std=c++17", "-res-usage", "--maxrregcount 60", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization", f"-DTmax={T_MAX}"])
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1284, in load
    return _jit_compile(
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1508, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1623, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1916, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'wkv_2048_bf16'

--load_model models/novel-RWKV-4-World-CHNtuned-7B-v1-20230709-ctx32k.pth --data_file ./finetune/json2binidx_tool/data/source_text_document --vocab_size 65536 --ctx_len 2048 --epoch_steps 200 --epoch_count 10 --epoch_begin 0 --epoch_save 1 --micro_bsz 1 --accumulate_grad_batches 8 --pre_ffn 0 --head_qk 0 --lr_init 5e-5 --lr_final 5e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 --devices 1 --precision bf16 --grad_cp 0 --lora_r 8 --lora_alpha 32 --lora_dropout 0.01
apt cnMirror already set
gcc installed
pip installed
ninja installed
cuda 12 installed
requirements satisfied
loading models/novel-RWKV-4-World-CHNtuned-7B-v1-20230709-ctx32k.pth --n_layer 32 --n_embd 4096

########## work in progress ##########
[2023-12-14 20:40:34,559] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
############################################################################
#
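The fatal line above is nvcc rejecting the -t (--threads) option, which only newer nvcc releases (roughly CUDA 11.2 and later) understand. A small diagnostic sketch of my own, not part of the original report, to confirm which nvcc the JIT build actually picks up and whether it knows that flag; the /usr/bin/nvcc fallback simply mirrors the path in the log:

```python
# Diagnostic sketch, assuming the failure comes from an nvcc that predates -t/--threads.
# Paths mirror the build log above; nothing here changes the environment.
import shutil
import subprocess

nvcc = shutil.which("nvcc") or "/usr/bin/nvcc"  # the build log shows /usr/bin/nvcc
print("nvcc resolved to:", nvcc)
print(subprocess.run([nvcc, "--version"], capture_output=True, text=True).stdout)

# An nvcc that lists --threads in its help output should accept "-t 4";
# the one in the log does not, hence "Unknown option 't'".
help_text = subprocess.run([nvcc, "--help"], capture_output=True, text=True).stdout
print("supports --threads:", "--threads" in help_text)
```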

RWKV-4 BF16 on 1x1 GPU, bsz 1x1x1=1, deepspeed_stage_2

#

Data = ./finetune/json2binidx_tool/data/source_text_document (binidx), ProjDir = lora-models

#

Epoch = 0 to 9, save every 1 epoch

#

Each "epoch" = 200 steps, 200 samples, 409600 tokens

#

Model = 32 n_layer, 4096 n_embd, 2048 ctx_len

LoRA = enabled, 8 r, 32.0 alpha, 0.01 dropout, on att,ffn,time,ln

#

Adam = lr 5e-05 to 5e-05, warmup 0 steps, beta (0.9, 0.999), eps 1e-08

#

Found torch 1.13.1+cu117, recommend 1.13.1+cu117 or newer

Found deepspeed 0.11.2, recommend 0.7.0 (faster than newer versions)

Found pytorch_lightning 1.9.5, recommend 1.9.1 or newer

#
############################################################################
{'load_model': 'models/novel-RWKV-4-World-CHNtuned-7B-v1-20230709-ctx32k.pth', 'wandb': '', 'proj_dir': 'lora-models', 'random_seed': -1, 'data_file': './finetune/json2binidx_tool/data/source_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 2048, 'epoch_steps': 200, 'epoch_count': 10, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 1, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 5e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 8, 'lora_alpha': 32.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ffn,time,ln', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 8, 'max_epochs': 10, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-14-20-40-35', 'betas': (0.9, 0.999), 'real_bsz': 1, 'run_name': '65536 ctx2048 L32 D4096'}
RWKV_MY_TESTING
Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/2] /usr/bin/nvcc -DTORCH_EXTENSION_NAME=wkv_2048_bf16 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.8/dist-packages/torch/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.8/dist-packages/torch/include/THC -isystem /usr/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -t 4 -std=c++17 -res-usage --maxrregcount 60 --use_fast_math -O3 -Xptxas -O3 --extra-device-vectorization -DTmax=2048 -c /mnt/g/rwkv/finetune/lora/cuda/wkv_cuda_bf16.cu -o wkv_cuda_bf16.cuda.o
FAILED: wkv_cuda_bf16.cuda.o
/usr/bin/nvcc -DTORCH_EXTENSION_NAME=wkv_2048_bf16 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.8/dist-packages/torch/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.8/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.8/dist-packages/torch/include/THC -isystem /usr/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -t 4 -std=c++17 -res-usage --maxrregcount 60 --use_fast_math -O3 -Xptxas -O3 --extra-device-vectorization -DTmax=2048 -c /mnt/g/rwkv/finetune/lora/cuda/wkv_cuda_bf16.cu -o wkv_cuda_bf16.cuda.o
nvcc fatal : Unknown option 't'
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1900, in _run_ninja_build
    subprocess.run(
  File "/usr/lib/python3.8/subprocess.py", line 516, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "./finetune/lora/train.py", line 339, in <module>
    from src.trainer import train_callback, generate_init_weight
  File "/mnt/g/rwkv/finetune/lora/src/trainer.py", line 6, in <module>
    from .model import LORA_CONFIG
  File "/mnt/g/rwkv/finetune/lora/src/model.py", line 56, in <module>
    wkv_cuda = load(name=f"wkv_{T_MAX}_bf16", sources=["finetune/lora/cuda/wkv_op_bf16.cpp", "finetune/lora/cuda/wkv_cuda_bf16.cu"], verbose=True, extra_cuda_cflags=["-t 4", "-std=c++17", "-res-usage", "--maxrregcount 60", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization", f"-DTmax={T_MAX}"])
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1284, in load
    return _jit_compile(
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1508, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1623, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py", line 1916, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'wkv_2048_bf16'
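For what it's worth, the failing load() call in the traceback comes from finetune/lora/src/model.py. A hedged experiment, not the project's recommended fix (and superseded by the reporter's own solution below), would be to rebuild the extension with only the "-t 4" flag removed, since that is the one option this nvcc rejects:

```python
# Hedged sketch: same JIT build as in the traceback above, minus the "-t 4" flag.
# This is an assumption about a possible workaround, not an official fix.
from torch.utils.cpp_extension import load

T_MAX = 2048  # matches ctx_len / Tmax in the log above
wkv_cuda = load(
    name=f"wkv_{T_MAX}_bf16",
    sources=[
        "finetune/lora/cuda/wkv_op_bf16.cpp",
        "finetune/lora/cuda/wkv_cuda_bf16.cu",
    ],
    verbose=True,
    extra_cuda_cflags=[
        # "-t 4",  # removed: the nvcc at /usr/bin does not know this option
        "-std=c++17",
        "-res-usage",
        "--maxrregcount 60",
        "--use_fast_math",
        "-O3",
        "-Xptxas -O3",
        "--extra-device-vectorization",
        f"-DTmax={T_MAX}",
    ],
)
```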

nvcc v11.7.64
Python 3.8
PyTorch v1.13.1+cu117
gcc v10.5.0
g++ v9.4.0
Windows 11 Pro, OS build 22621.2715
NVIDIA GeForce RTX 3060 12G
WSL version: 2.0.9.0
Kernel version: 5.15.133.1-1
WSLg version: 1.0.59
MSRDC version: 1.2.4677
Direct3D version: 1.611.1-81528511
DXCore version: 10.0.25131.1002-220531-1700.rs-onecore-base2-hyp
Windows version: 10.0.22621.2715
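One more thing worth checking in this environment (my suggestion, not from the report): it mixes "cuda 12 installed", nvcc 11.7.64, and torch 1.13.1+cu117, so it helps to confirm which toolkit PyTorch's extension builder actually resolves:

```python
# Sanity-check sketch: compare the CUDA version PyTorch was built against with the
# toolkit torch.utils.cpp_extension will use for JIT builds. Read-only; nothing here
# modifies the machine.
import subprocess

import torch
from torch.utils.cpp_extension import CUDA_HOME

print("torch built for CUDA:", torch.version.cuda)   # expected "11.7" for 1.13.1+cu117
print("CUDA_HOME used for JIT builds:", CUDA_HOME)
if CUDA_HOME:
    out = subprocess.run([f"{CUDA_HOME}/bin/nvcc", "--version"],
                         capture_output=True, text=True)
    print(out.stdout)
```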

Could someone please take a look and help me figure out what is causing this?

damingge0 commented 10 months ago

Solved. Solution: wipe the environment completely, then follow the GUI step by step and click Train there, instead of manually installing WSL on your own.