pytorch-labs / gpt-fast

Simple and efficient PyTorch-native transformer text generation in <1000 lines of Python.
BSD 3-Clause "New" or "Revised" License
5.36k stars 485 forks

'Triton Error [CUDA]: device kernel image is invalid' while compiling #68

Open Armod-I opened 6 months ago

Armod-I commented 6 months ago

GPU: V100 CUDA: 11.8

I have changed all the `torch.bfloat16` occurrences to `torch.float16` as suggested in #49. Is there something I'm still missing?
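The bfloat16 → float16 swap matters because the V100 (compute capability 7.0) predates hardware bfloat16 support, which arrived with Ampere (8.0). A minimal sketch of that capability check — `pick_dtype` is a hypothetical helper for illustration, not part of gpt-fast:

```python
def pick_dtype(compute_capability):
    """Return a dtype name suitable for the given CUDA compute capability.

    bfloat16 needs Ampere (sm_80) or newer; the V100 is sm_70, so it must
    fall back to float16 -- the substitution described in #49.
    """
    major, _minor = compute_capability
    return "torch.bfloat16" if major >= 8 else "torch.float16"
```

In a real script the capability tuple would come from `torch.cuda.get_device_capability()`.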

Error log:

```
root@84bb9affda66:/workspace/gpt-fast/gpt-fast-main# CUDA_VISIBLE_DEVICES=1 python generate.py --compile --checkpoint_path /datasets/llm/Llama-2-7b-chat-gpt-fast/model.pth --prompt "Hello, my name is"
Loading model ...
Time to load model: 4.16 seconds
Traceback (most recent call last):
  File "/workspace/gpt-fast/gpt-fast-main/generate.py", line 410, in <module>
    main(
  File "/workspace/gpt-fast/gpt-fast-main/generate.py", line 348, in main
    y, metrics = generate(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/workspace/gpt-fast/gpt-fast-main/generate.py", line 193, in generate
    generated_tokens, _ = decode_n_tokens(model, next_token.view(1, -1), input_pos, max_new_tokens - 1, callback=callback, **sampling_kwargs)
  File "/workspace/gpt-fast/gpt-fast-main/generate.py", line 65, in decode_n_tokens
    next_token, next_prob = decode_one_token(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 417, in _fn
    return fn(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 580, in catch_errors
    return callback(frame, cache_entry, hooks, frame_state)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 384, in _convert_frame_assert
    return _compile(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 641, in _compile
    guarded_code = compile_inner(code, one_graph, hooks, transform)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 246, in time_wrapper
    r = func(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 522, in compile_inner
    out_code = transform_code_object(code, transform)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py", line 1033, in transform_code_object
    transformations(instructions, code_options)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 151, in _fn
    return fn(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 487, in transform
    tracer.run()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2093, in run
    super().run()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 780, in run
    and self.step()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 743, in step
    getattr(self, inst.opname)(inst)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2211, in RETURN_VALUE
    self.output.compile_subgraph(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 940, in compile_subgraph
    self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1085, in compile_and_call_fx_graph
    compiled_fn = self.call_user_compiler(gm)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 246, in time_wrapper
    r = func(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1157, in call_user_compiler
    raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1138, in call_user_compiler
    compiled_fn = compiler_fn(gm, self.example_inputs())
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper
    compiled_gm = compiler_fn(gm, example_inputs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper
    compiled_gm = compiler_fn(gm, example_inputs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/__init__.py", line 1697, in __call__
    return compile_fx(model_, inputs_, config_patches=self.config)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 949, in compile_fx
    return compile_fx(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1165, in compile_fx
    return aot_autograd(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/backends/common.py", line 55, in compiler_fn
    cg = aot_module_simplified(gm, example_inputs, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 887, in aot_module_simplified
    compiled_fn = create_aot_dispatcher_function(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 246, in time_wrapper
    r = func(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 600, in create_aot_dispatcher_function
    compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 427, in aot_wrapper_dedupe
    return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 632, in aot_wrapper_synthetic_base
    return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 97, in aot_dispatch_base
    compiled_fw = compiler(fw_module, updated_flat_args)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 246, in time_wrapper
    r = func(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1097, in fw_compiler_base
    return inner_compile(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper
    inner_compiled_fn = compiler_fn(gm, example_inputs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/debug.py", line 305, in inner
    return fn(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 313, in compile_fx_inner
    compiled_graph = FxGraphCache.load(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 801, in load
    compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 547, in fx_codegen_and_compile
    compiled_fn = graph.compile_to_fn()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/graph.py", line 1140, in compile_to_fn
    return self.compile_to_module().call
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 246, in time_wrapper
    r = func(*args, **kwargs)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/graph.py", line 1092, in compile_to_module
    mod = PyCodeCache.load_by_key_path(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1891, in load_by_key_path
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_root/p7/cp76nmwkua2aiamfmc4zo5xzrahbgyn65ucwwsiznboqg4vejgrv.py", line 1679, in <module>
    async_compile.wait(globals())
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 2470, in wait
    scope[key] = result.result()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 2314, in result
    kernel = self.kernel = _load_kernel(self.kernel_name, self.source_code)
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 2290, in _load_kernel
    kernel.precompile()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/triton_heuristics.py", line 189, in precompile
    compiled_binary, launcher = self._precompile_config(
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/torch/_inductor/triton_heuristics.py", line 344, in _precompile_config
    binary._init_handles()
  File "/opt/conda/envs/gpt-fast-new/lib/python3.10/site-packages/triton/compiler/compiler.py", line 683, in _init_handles
    mod, func, n_regs, n_spills = fn_load_binary(self.metadata["name"], self.asm[bin_path], self.shared, device)
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Triton Error [CUDA]: device kernel image is invalid

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information

You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
```
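The fallback the error message mentions can be applied at the top of a script; note this is torch._dynamo's generic escape hatch — it hides the compile failure by running eager mode, it does not fix the Triton kernel error itself:

```python
# Generic escape hatch quoted in the error message: compilation failures
# fall back to eager execution instead of raising. Useful to keep running
# while the underlying Triton/CUDA mismatch is still being diagnosed.
import torch._dynamo

torch._dynamo.config.suppress_errors = True
```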

pip list:

```
Package                  Version
------------------------ ------------------------
certifi                  2023.11.17
charset-normalizer       3.3.2
filelock                 3.9.0
fsspec                   2023.12.2
huggingface-hub          0.20.1
idna                     3.6
Jinja2                   3.1.2
MarkupSafe               2.1.3
mpmath                   1.2.1
networkx                 3.0rc1
numpy                    1.26.2
nvidia-cublas-cu11       11.11.3.6
nvidia-cuda-cupti-cu11   11.8.87
nvidia-cuda-nvrtc-cu11   11.8.89
nvidia-cuda-runtime-cu11 11.8.89
nvidia-cudnn-cu11        8.7.0.84
nvidia-cufft-cu11        10.9.0.58
nvidia-curand-cu11       10.3.0.86
nvidia-cusolver-cu11     11.4.1.48
nvidia-cusparse-cu11     11.7.5.86
nvidia-nccl-cu11         2.19.3
nvidia-nvtx-cu11         11.8.86
packaging                23.2
pip                      23.3.1
pytorch-triton           2.2.0+e28a256d71
PyYAML                   6.0.1
requests                 2.31.0
sentencepiece            0.1.99
setuptools               68.2.2
sympy                    1.11.1
torch                    2.3.0.dev20231225+cu118
tqdm                     4.66.1
typing_extensions        4.8.0
urllib3                  2.1.0
wheel                    0.41.2
```

Armod-I commented 6 months ago

Generation works fine if I drop the --compile flag.
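"device kernel image is invalid" generally means a compiled kernel binary does not match the GPU the process is running on. One common first step (an assumption, not a confirmed fix for this particular issue) is to clear the Inductor and Triton caches so all kernels are rebuilt for the current device; the first path below is the one that appears in the traceback, and `~/.triton/cache` is Triton's default cache location:

```shell
# Remove Inductor-generated code and compiled Triton kernels so the next
# run rebuilds them for the current GPU (path taken from the traceback,
# which runs as root; adjust for your user).
rm -rf /tmp/torchinductor_root

# Triton's own compilation cache (default location).
rm -rf ~/.triton/cache
```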

AethoceSora commented 5 months ago

@Armod-I Is there any progress on this issue? I also encountered this problem.