unslothai / unsloth

Finetune Llama 3.1, Mistral, Phi & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

Error import unsloth #530

Open GBrochado11 opened 3 months ago

GBrochado11 commented 3 months ago

When I try to execute "from unsloth import FastLanguageModel", the following error appears.

TokenError                                Traceback (most recent call last)
Cell In[1], line 3
      1 import torch
      2 import pandas as pd
----> 3 from unsloth import FastLanguageModel
      4 from trl import SFTTrainer
      5 from transformers import TrainingArguments

File ~/myenv/lib/python3.12/site-packages/unsloth/__init__.py:121
    111 warnings.warn(
    112     "Unsloth: CUDA is not linked properly.\n"\
    113     "Try running python -m bitsandbytes then python -m xformers.info\n"\
    (...)
    117     "Unsloth will still run for now, but maybe it might crash - let's hope it works!"
    118 )
    119 pass
--> 121 from .models import *
    122 from .save import *
    123 from .chat_templates import *

File ~/myenv/lib/python3.12/site-packages/unsloth/models/__init__.py:15
      1 # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
    (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
---> 15 from .loader import FastLanguageModel
     16 from .llama import FastLlamaModel
     17 from .mistral import FastMistralModel

File ~/myenv/lib/python3.12/site-packages/unsloth/models/loader.py:15
      1 # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
    (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
---> 15 from .llama import FastLlamaModel, logger
     16 from .mistral import FastMistralModel
     17 from .qwen2 import FastQwen2Model

File ~/myenv/lib/python3.12/site-packages/unsloth/models/llama.py:28
     24 from transformers.modeling_attn_mask_utils import (
     25     _prepare_4d_causal_attention_mask_for_sdpa,
     26 )
     27 from ..kernels import *
---> 28 from ._utils import *
     29 from ._utils import __version__
     30 from ..tokenizer_utils import *

File ~/myenv/lib/python3.12/site-packages/unsloth/models/_utils.py:63
     61     HAS_FLASH_ATTENTION = False
     62     pass
---> 63 import xformers.ops.fmha as xformers
     64 xformers_attention = xformers.memory_efficient_attention
     65 from xformers import __version__ as xformers_version

File ~/myenv/lib/python3.12/site-packages/xformers/ops/__init__.py:8
      1 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
      2 #
      3 # This source code is licensed under the BSD license found in the
      4 # LICENSE file in the root directory of this source tree.
      6 import torch
----> 8 from .fmha import (
      9     AttentionBias,
     10     AttentionOp,
     11     AttentionOpBase,
     12     AttentionOpDispatch,
     13     LowerTriangularMask,
     14     MemoryEfficientAttentionCkOp,
     15     MemoryEfficientAttentionCutlassFwdFlashBwOp,
     16     MemoryEfficientAttentionCutlassOp,
     17     MemoryEfficientAttentionFlashAttentionOp,
     18     MemoryEfficientAttentionOp,
     19     MemoryEfficientAttentionSplitKCkOp,
     20     memory_efficient_attention,
     21     memory_efficient_attention_backward,
     22     memory_efficient_attention_forward,
     23     memory_efficient_attention_forward_requires_grad,
     24 )
     25 from .indexing import index_select_cat, scaled_index_add
     26 from .ipc import init_ipc

File ~/myenv/lib/python3.12/site-packages/xformers/ops/fmha/__init__.py:10
      6 from typing import Any, List, Optional, Sequence, Tuple, Type, Union, cast
      8 import torch
---> 10 from . import (
     11     attn_bias,
     12     ck,
     13     ck_decoder,
     14     ck_splitk,
     15     cutlass,
     16     decoder,
     17     flash,
     18     small_k,
     19     triton_splitk,
     20 )
     21 from .attn_bias import (
     22     AttentionBias,
     23     BlockDiagonalGappyKeysMask,
    (...)
     28     PagedBlockDiagonalPaddedKeysMask,
     29 )
     30 from .common import (
     31     AttentionBwOpBase,
     32     AttentionFwOpBase,
    (...)
     39     bmk2bmhk,
     40 )

File ~/myenv/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py:548
    544 if sys.version_info >= (3, 9):
    545     # unroll_varargs requires Python 3.9+
    546     for num_groups in [1, 2, 4, 8]:
    547         _fwd_kernel_splitK_autotune[num_groups] = autotune_kernel(
--> 548             _get_splitk_kernel(num_groups)
    549         )
    551 def get_autotuner_cache(num_groups: int) -> Dict[Tuple[int], triton.Config]:
    552     """Returns a triton.runtime.autotuner.AutoTuner.cache object, which
    553     represents mappings from kernel autotune keys (tuples describing kernel inputs)
    554     to triton.Config
    555     """

File ~/myenv/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py:503, in _get_splitk_kernel(num_groups)
    495 def _get_splitk_kernel(num_groups):
    496     """
    497     Kernel _fwd_kernel_splitK needs to be post-processed by unroll_varargs
    498     to specialize it for a given number of quantization groups N_GROUPS
    499     before we can apply triton.heuristics and triton.autotune, so we
    500     don't do them as decorators.
    501     """
--> 503     _fwd_kernel_splitK_unrolled = unroll_varargs(_fwd_kernel_splitK, N=num_groups)
    504     kernel = triton.heuristics(
    505         {
    506             "BOUNDS_CHECKS_N": lambda args: (
    (...)
    511         }
    512     )(_fwd_kernel_splitK_unrolled)
    513     return kernel

File ~/myenv/lib/python3.12/site-packages/xformers/triton/vararg_kernel.py:166, in unroll_varargs(kernel, N)
    163 linecache.getlines = _monkey_patched_getlines
    164 _FILENAME_TO_SRC[fn_filename] = new_src
--> 166 jitted_fn = triton.jit(fn)
    167 jitted_fn.src = new_src
    168 return jitted_fn

File ~/myenv/lib/python3.12/site-packages/triton/runtime/jit.py:570, in jit(fn, version, do_not_specialize, debug, noinline)
    561     return JITFunction(
    562         fn,
    563         version=version,
    (...)
    566         noinline=noinline,
    567     )
    569 if fn is not None:
--> 570     return decorator(fn)
    572 else:
    573     return decorator

File ~/myenv/lib/python3.12/site-packages/triton/runtime/jit.py:561, in jit.<locals>.decorator(fn)
    559     return InterpretedFunction(fn)
    560 else:
--> 561     return JITFunction(
    562         fn,
    563         version=version,
    564         do_not_specialize=do_not_specialize,
    565         debug=debug,
    566         noinline=noinline,
    567     )

File ~/myenv/lib/python3.12/site-packages/triton/runtime/jit.py:440, in JITFunction.__init__(self, fn, version, do_not_specialize, debug, noinline)
    438 self.signature = inspect.signature(fn)
    439 self.do_not_specialize = do_not_specialize
--> 440 self.starting_line_number = inspect.getsourcelines(fn)[1]
    442 self.params = []
    443 for i, param in enumerate(self.signature.parameters.values()):

File /usr/lib/python3.12/inspect.py:1270, in getsourcelines(object)
   1268     return lines, 0
   1269 else:
-> 1270     return getblock(lines[lnum:]), lnum + 1

File /usr/lib/python3.12/inspect.py:1237, in getblock(lines)
   1235 try:
   1236     tokens = tokenize.generate_tokens(iter(lines).__next__)
-> 1237     for _token in tokens:
   1238         blockfinder.tokeneater(*_token)
   1239 except (EndOfBlock, IndentationError):

File /usr/lib/python3.12/tokenize.py:582, in _generate_tokens_from_c_tokenizer(source, encoding, extra_tokens)
    580     raise e from None
    581 msg = _transform_msg(e.msg)
--> 582 raise TokenError(msg, (e.lineno, e.offset)) from None

TokenError: ('unterminated string literal (detected at line 1122)', (1122, 1))

danielhanchen commented 3 months ago

@GBrochado11 When did you install Unsloth? Can you check your xformers and CUDA versions?
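
Something along these lines would be enough (just a sketch using importlib.metadata, so it reports versions without actually importing xformers, which is the import that crashes in your traceback):

```python
# Version-report sketch: read package metadata instead of importing xformers,
# since `import xformers.ops` is what fails in the traceback above.
import sys
from importlib.metadata import PackageNotFoundError, version

import torch

print("Python:", sys.version)
print("torch :", torch.__version__, "| CUDA build:", torch.version.cuda)
for pkg in ("unsloth", "xformers", "triton", "bitsandbytes"):
    try:
        print(f"{pkg:12} {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg:12} not installed")
```

Running python -m xformers.info and python -m bitsandbytes (as the Unsloth warning in the traceback suggests) gives more detail.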

GuilhermeFreire commented 3 months ago

I get the same thing. I'm using Python 3.12.3 with xformers 0.0.26.post1 and unsloth @ git+https://github.com/unslothai/unsloth.git@27fa021a7bb959a53667dd4e7cdb9598c207aa0d (also tried with main, which was 172219e3e76e5508e97da3e5e281597a4246dcb7).

A simple script like this is already enough to trigger the error:

from unsloth import FastLanguageModel

Full trace for me:

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Traceback (most recent call last):
  File "/home/user/ML/test.py", line 1, in <module>
    from unsloth import FastLanguageModel
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/__init__.py", line 121, in <module>
    from .models import *
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/__init__.py", line 15, in <module>
    from .loader  import FastLanguageModel
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/loader.py", line 15, in <module>
    from .llama import FastLlamaModel, logger
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/llama.py", line 28, in <module>
    from ._utils import *
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/unsloth/models/_utils.py", line 63, in <module>
    import xformers.ops.fmha as xformers
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/__init__.py", line 8, in <module>
    from .fmha import (
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/fmha/__init__.py", line 10, in <module>
    from . import (
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py", line 548, in <module>
    _get_splitk_kernel(num_groups)
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/ops/fmha/triton_splitk.py", line 503, in _get_splitk_kernel
    _fwd_kernel_splitK_unrolled = unroll_varargs(_fwd_kernel_splitK, N=num_groups)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/xformers/triton/vararg_kernel.py", line 166, in unroll_varargs
    jitted_fn = triton.jit(fn)
                ^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/triton/runtime/jit.py", line 570, in jit
    return decorator(fn)
           ^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/triton/runtime/jit.py", line 561, in decorator
    return JITFunction(
           ^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/site-packages/triton/runtime/jit.py", line 440, in __init__
    self.starting_line_number = inspect.getsourcelines(fn)[1]
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/inspect.py", line 1270, in getsourcelines
    return getblock(lines[lnum:]), lnum + 1
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/inspect.py", line 1237, in getblock
    for _token in tokens:
  File "/home/user/mambaforge/envs/tensorml/lib/python3.12/tokenize.py", line 582, in _generate_tokens_from_c_tokenizer
    raise TokenError(msg, (e.lineno, e.offset)) from None
tokenize.TokenError: ('unterminated string literal (detected at line 1122)', (1122, 1))
These are the installed packages, if helpful:

```
Package  Version
accelerate  0.30.1
aiohttp  3.9.5
aiosignal  1.3.1
anyio  4.3.0
argon2-cffi  23.1.0
argon2-cffi-bindings  21.2.0
arrow  1.3.0
asttokens  2.4.1
async-lru  2.0.4
attrs  23.2.0
Babel  2.14.0
beautifulsoup4  4.12.3
bitsandbytes  0.43.1
bleach  6.1.0
Bottleneck  1.3.5
Brotli  1.1.0
cached-property  1.5.2
certifi  2024.2.2
cffi  1.16.0
charset-normalizer  3.3.2
click  8.1.7
comm  0.2.2
contourpy  1.2.1
cycler  0.12.1
datasets  2.19.2
debugpy  1.8.1
decorator  5.1.1
defusedxml  0.7.1
dill  0.3.8
docker-pycreds  0.4.0
docstring_parser  0.16
entrypoints  0.4
exceptiongroup  1.2.0
executing  2.0.1
fastjsonschema  2.19.1
filelock  3.14.0
fonttools  4.51.0
fqdn  1.5.1
frozenlist  1.4.1
fsspec  2024.3.1
gitdb  4.0.11
GitPython  3.1.43
h11  0.14.0
h2  4.1.0
hpack  4.0.0
httpcore  1.0.5
httpx  0.27.0
huggingface-hub  0.23.3
hyperframe  6.0.1
idna  3.7
importlib_metadata  7.1.0
importlib_resources  6.4.0
ipykernel  6.29.3
ipython  8.22.2
ipywidgets  8.1.2
isoduration  20.11.0
jedi  0.19.1
Jinja2  3.1.3
json5  0.9.25
jsonpointer  2.4
jsonschema  4.21.1
jsonschema-specifications  2023.12.1
jupyter  1.0.0
jupyter_client  8.6.1
jupyter-console  6.6.3
jupyter_core  5.7.2
jupyter-events  0.10.0
jupyter-lsp  2.2.5
jupyter_server  2.14.0
jupyter_server_terminals  0.5.3
jupyterlab  4.1.8
jupyterlab_pygments  0.3.0
jupyterlab_server  2.27.1
jupyterlab_widgets  3.0.10
kiwisolver  1.4.5
markdown-it-py  3.0.0
MarkupSafe  2.1.5
matplotlib  3.8.4
matplotlib-inline  0.1.7
mdurl  0.1.2
mistune  3.0.2
mpmath  1.3.0
multidict  6.0.5
multiprocess  0.70.16
munkres  1.1.4
nbclient  0.10.0
nbconvert  7.16.4
nbformat  5.10.4
nest_asyncio  1.6.0
networkx  3.3
notebook  7.1.3
notebook_shim  0.2.4
numexpr  2.9.0
numpy  1.26.4
overrides  7.7.0
packaging  24.0
pandas  2.1.1
pandocfilters  1.5.0
parso  0.8.4
pexpect  4.9.0
pickleshare  0.7.5
pillow  10.3.0
pip  24.0
pkgutil_resolve_name  1.3.10
platformdirs  4.2.1
ply  3.11
prometheus_client  0.20.0
prompt-toolkit  3.0.42
protobuf  3.20.3
psutil  5.9.8
ptyprocess  0.7.0
pure-eval  0.2.2
pyarrow  16.1.0
pyarrow-hotfix  0.6
pycparser  2.22
Pygments  2.17.2
pyparsing  3.1.2
PyQt5  5.15.9
PyQt5-sip  12.12.2
PySocks  1.7.1
python-dateutil  2.9.0
python-json-logger  2.0.7
pytz  2024.1
PyYAML  6.0.1
pyzmq  26.0.2
qtconsole  5.5.1
QtPy  2.4.1
referencing  0.35.0
regex  2024.5.15
requests  2.32.3
rfc3339-validator  0.1.4
rfc3986-validator  0.1.1
rich  13.7.1
rpds-py  0.18.0
safetensors  0.4.3
Send2Trash  1.8.3
sentencepiece  0.2.0
sentry-sdk  2.4.0
setproctitle  1.3.3
setuptools  69.5.1
shtab  1.7.1
sip  6.7.12
six  1.16.0
smmap  5.0.1
sniffio  1.3.1
soupsieve  2.5
stack-data  0.6.2
sympy  1.12
terminado  0.18.1
tinycss2  1.3.0
tokenizers  0.19.1
toml  0.10.2
tomli  2.0.1
torch  2.3.0
torchaudio  2.3.0
torchvision  0.18.0
tornado  6.4
tqdm  4.66.4
traitlets  5.14.3
transformers  4.41.2
triton  2.3.1
trl  0.9.3
types-python-dateutil  2.9.0.20240316
typing_extensions  4.11.0
typing-utils  0.1.0
tyro  0.8.4
tzdata  2023.3
unsloth  2024.5
uri-template  1.3.0
urllib3  2.2.1
wandb  0.17.0
wcwidth  0.2.13
webcolors  1.13
webencodings  0.5.1
websocket-client  1.8.0
wheel  0.43.0
widgetsnbextension  4.0.10
xformers  0.0.26.post1
xxhash  3.4.1
yarl  1.9.4
zipp  3.17.0
```

CUDA version:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0

Driver Version: 550.54.15
CUDA Version: 12.4

GuilhermeFreire commented 3 months ago

As a temporary solution, downgrading to Python 3.10 seems to have solved this.
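
Until a proper fix lands, a guard like this (just a sketch based on the Python 3.12 observation in this thread, not an official check) at least fails fast with a readable message instead of the deep xformers/triton traceback:

```python
# Sketch: abort early on Python 3.12, where the unsloth -> xformers -> triton
# import chain currently dies in the tokenizer; Python 3.10 worked for me.
import sys

if sys.version_info >= (3, 12):
    raise RuntimeError(
        f"Python {sys.version_info.major}.{sys.version_info.minor} detected; "
        "the xformers/triton import path fails here. Use a Python 3.10 "
        "environment until this is fixed."
    )

from unsloth import FastLanguageModel  # noqa: E402
```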

danielhanchen commented 3 months ago

Oh my, that's a very, very weird problem - that seems like an xformers issue itself, hmm.
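
To confirm that, importing the failing xformers submodule directly (a minimal isolation sketch, no Unsloth involved) should raise the same TokenError on an affected Python 3.12 setup:

```python
# Isolation sketch: the traceback dies inside `import xformers.ops.fmha`,
# so this alone should reproduce the TokenError if the bug is in
# xformers/triton rather than in unsloth.
import xformers.ops.fmha  # noqa: F401

print("xformers.ops.fmha imported without errors")
```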