GBrochado11 opened 5 months ago
@GBrochado11 When did you install Unsloth? Can you check your xformers and CUDA versions?
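If it helps, a quick check script like this should print the relevant versions (just a sketch; adjust to your environment):

import sys
import torch, triton, xformers

# Print the interpreter and library versions relevant to this import failure
print("python  :", sys.version)
print("torch   :", torch.__version__, "| built for CUDA", torch.version.cuda)
print("triton  :", triton.__version__)
print("xformers:", xformers.__version__)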
I get the same thing. I'm using Python 3.12.3 with xformers 0.0.26.post1 and unsloth @ git+https://github.com/unslothai/unsloth.git@27fa021a7bb959a53667dd4e7cdb9598c207aa0d (also tried with main, which was 172219e3e76e5508e97da3e5e281597a4246dcb7).
A simple script like this is already enough to trigger the error:
from unsloth import FastLanguageModel
Full trace for me:
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Traceback (most recent call last):
File "/home/user/ML/test.py", line 1, in <module>
from unsloth import FastLanguageModel
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/__init__.py", line 121, in <module>
from .models import *
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/__init__.py", line 15, in <module>
from .loader import FastLanguageModel
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/loader.py", line 15, in <module>
from .llama import FastLlamaModel, logger
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/llama.py", line 28, in <module>
from ._utils import *
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/_utils.py", line 63, in <module>
import xformers.ops.fmha as xformers
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/__init__.py", line 8, in <module>
from .fmha import (
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/fmha/__init__.py", line 10, in <module>
from . import (
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py", line 548, in <module>
_get_splitk_kernel(num_groups)
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py", line 503, in _get_splitk_kernel
_fwd_kernel_splitK_unrolled = unroll_varargs(_fwd_kernel_splitK, N=num_groups)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/triton/vararg_kernel.py", line 166, in unroll_varargs
jitted_fn = triton.jit(fn)
^^^^^^^^^^^^^^
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/triton/runtime/jit.py", line 570, in jit
return decorator(fn)
^^^^^^^^^^^^^
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/triton/runtime/jit.py", line 561, in decorator
return JITFunction(
^^^^^^^^^^^^
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/triton/runtime/jit.py", line 440, in __init__
self.starting_line_number = inspect.getsourcelines(fn)[1]
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/inspect.py", line 1270, in getsourcelines
return getblock(lines[lnum:]), lnum + 1
^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/inspect.py", line 1237, in getblock
for _token in tokens:
File "/home/user/mambaforge/envs/tensorml/lib/python3.12/tokenize.py", line 582, in _generate_tokens_from_c_tokenizer
raise TokenError(msg, (e.lineno, e.offset)) from None
tokenize.TokenError: ('unterminated string literal (detected at line 1122)', (1122, 1))
CUDA version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0
Driver Version: 550.54.15
CUDA Version: 12.4
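For what it's worth, the failing frame is inside xformers itself (triton_splitk.py), so importing it directly, without unsloth in the picture, should reproduce the same TokenError:

# Hits the same module-level triton.jit() / inspect.getsourcelines() path shown in the trace above
import xformers.ops.fmha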
As a temporary solution, downgrading to Python 3.10 seems to have solved this.
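If anyone wants to fail fast instead of waiting for the traceback above, a guard like this works; the 3.12 cutoff is based only on the reports in this thread, not on any official version constraint:

import sys

# Reported-bad combination in this thread: Python 3.12 + xformers 0.0.26.post1 + triton
if sys.version_info >= (3, 12):
    raise RuntimeError(
        "xformers/triton import breaks on Python 3.12 in this setup; "
        "use a Python 3.10 environment instead"
    )

from unsloth import FastLanguageModel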
Oh my, that's a very, very weird problem - that seems like an xformers issue itself, hmm
I have the same issue with Python 3.12.6, using a fresh clone of the unsloth repo.
@mrheinen Is this via Conda as well?
When I try to execute "from unsloth import FastLanguageModel", the following error appears:
TokenError                                Traceback (most recent call last)
Cell In[1], line 3
      1 import torch
      2 import pandas as pd
----> 3 from unsloth import FastLanguageModel
      4 from trl import SFTTrainer
      5 from transformers import TrainingArguments

File ~/myenv/lib/python3.12/site-packages/unsloth/__init__.py:121
    111 warnings.warn(
    112     "Unsloth: CUDA is not linked properly.\n"\
    113     "Try running `python -m bitsandbytes` then `python -m xformers.info`\n"\
    (...)
    117     "Unsloth will still run for now, but maybe it might crash - let's hope it works!"
    118 )
    119 pass
--> 121 from .models import *
    122 from .save import *
    123 from .chat_templates import *

File ~/myenv/lib/python3.12/site-packages/unsloth/models/__init__.py:15
---> 15 from .loader import FastLanguageModel
     16 from .llama import FastLlamaModel
     17 from .mistral import FastMistralModel

File ~/myenv/lib/python3.12/site-packages/unsloth/models/loader.py:15
---> 15 from .llama import FastLlamaModel, logger
     16 from .mistral import FastMistralModel
     17 from .qwen2 import FastQwen2Model

File ~/myenv/lib/python3.12/site-packages/unsloth/models/llama.py:28
     27 from ..kernels import *
---> 28 from ._utils import *
     29 from ._utils import __version__
     30 from ..tokenizer_utils import *

File ~/myenv/lib/python3.12/site-packages/unsloth/models/_utils.py:63
     61     HAS_FLASH_ATTENTION = False
     62 pass
---> 63 import xformers.ops.fmha as xformers
     64 xformers_attention = xformers.memory_efficient_attention
     65 from xformers import __version__ as xformers_version

File ~/myenv/lib/python3.12/site-packages/xformers/ops/__init__.py:8
----> 8 from .fmha import (
      9     AttentionBias,
    (...)
     24 )
     25 from .indexing import index_select_cat, scaled_index_add
     26 from .ipc import init_ipc

File ~/myenv/lib/python3.12/site-packages/xformers/ops/fmha/__init__.py:10
---> 10 from . import (
     11     attn_bias,
    (...)
     19     triton_splitk,
     20 )

File ~/myenv/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py:548
    544 if sys.version_info >= (3, 9):
    545     # unroll_varargs requires Python 3.9+
    546     for num_groups in [1, 2, 4, 8]:
    547         _fwd_kernel_splitK_autotune[num_groups] = autotune_kernel(
--> 548             _get_splitk_kernel(num_groups)
    549         )

File ~/myenv/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py:503, in _get_splitk_kernel(num_groups)
--> 503 _fwd_kernel_splitK_unrolled = unroll_varargs(_fwd_kernel_splitK, N=num_groups)
    504 kernel = triton.heuristics(
    (...)
    512 )(_fwd_kernel_splitK_unrolled)
    513 return kernel

File ~/myenv/lib/python3.12/site-packages/xformers/triton/vararg_kernel.py:166, in unroll_varargs(kernel, N)
    163 linecache.getlines = _monkey_patched_getlines
    164 _FILENAME_TO_SRC[fn_filename] = new_src
--> 166 jitted_fn = triton.jit(fn)
    167 jitted_fn.src = new_src
    168 return jitted_fn

File ~/myenv/lib/python3.12/site-packages/triton/runtime/jit.py:570, in jit(fn, version, do_not_specialize, debug, noinline)
    569 if fn is not None:
--> 570     return decorator(fn)
    572 else:
    573     return decorator

File ~/myenv/lib/python3.12/site-packages/triton/runtime/jit.py:561, in jit.<locals>.decorator(fn)
    559     return InterpretedFunction(fn)
    560 else:
--> 561     return JITFunction(
    562         fn,
    563         version=version,
    564         do_not_specialize=do_not_specialize,
    565         debug=debug,
    566         noinline=noinline,
    567     )

File ~/myenv/lib/python3.12/site-packages/triton/runtime/jit.py:440, in JITFunction.__init__(self, fn, version, do_not_specialize, debug, noinline)
    438 self.signature = inspect.signature(fn)
    439 self.do_not_specialize = do_not_specialize
--> 440 self.starting_line_number = inspect.getsourcelines(fn)[1]
    442 self.params = []
    443 for i, param in enumerate(self.signature.parameters.values()):

File /usr/lib/python3.12/inspect.py:1270, in getsourcelines(object)
   1268     return lines, 0
   1269 else:
-> 1270     return getblock(lines[lnum:]), lnum + 1

File /usr/lib/python3.12/inspect.py:1237, in getblock(lines)
   1235 try:
   1236     tokens = tokenize.generate_tokens(iter(lines).__next__)
-> 1237     for _token in tokens:
   1238         blockfinder.tokeneater(*_token)
   1239 except (EndOfBlock, IndentationError):

File /usr/lib/python3.12/tokenize.py:582, in _generate_tokens_from_c_tokenizer(source, encoding, extra_tokens)
    580     raise e from None
    581 msg = _transform_msg(e.msg)
--> 582 raise TokenError(msg, (e.lineno, e.offset)) from None

TokenError: ('unterminated string literal (detected at line 1122)', (1122, 1))