unslothai / unsloth

Finetune Llama 3.2, Mistral, Phi, Qwen 2.5 & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

Your Flash Attention 2 installation seems to be broken #948

Closed · C0casio45 closed this 2 months ago

C0casio45 commented 2 months ago

Unsloth: Your Flash Attention 2 installation seems to be broken? A possible explanation is you have a new CUDA version which isn't yet compatible with FA2? Please file a ticket to Unsloth or FA2. We shall now use Xformers instead, which does not have any performance hits! We found this negligible impact by benchmarking on 1x A100.

ImportError                               Traceback (most recent call last)
File /usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py:1603, in _LazyModule._get_module(self, module_name)
   1602 try:
-> 1603     return importlib.import_module("." + module_name, self.__name__)
   1604 except Exception as e:

File /usr/lib/python3.10/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1050, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1027, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1006, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:688, in _load_unlocked(spec)

File <frozen importlib._bootstrap_external>:883, in exec_module(self, module)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)

File /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:119
    118 if is_accelerate_available():
--> 119     from accelerate.hooks import AlignDevicesHook, add_hook_to_module
    121 NEED_SETUP_CACHE_CLASSES_MAPPING = {
    122     "static": StaticCache,
    123     "sliding_window": SlidingWindowCache,
    124     "hybrid": HybridCache,
    125     "mamba": MambaCache,
    126 }

File /usr/local/lib/python3.10/dist-packages/accelerate/__init__.py:16
     14 __version__ = "0.33.0"
---> 16 from .accelerator import Accelerator
     17 from .big_modeling import (
     18     cpu_offload,
     19     cpu_offload_with_hook,
   (...)
     24     load_checkpoint_and_dispatch,
     25 )

File /usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py:36
     34 from huggingface_hub import split_torch_state_dict_into_shards
---> 36 from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
     37 from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches

File /usr/local/lib/python3.10/dist-packages/accelerate/checkpointing.py:24
     22 from torch.cuda.amp import GradScaler
---> 24 from .utils import (
     25     MODEL_NAME,
     26     OPTIMIZER_NAME,
     27     RNG_STATE_NAME,
     28     SAFE_MODEL_NAME,
     29     SAFE_WEIGHTS_NAME,
     30     SAMPLER_NAME,
     31     SCALER_NAME,
     32     SCHEDULER_NAME,
     33     WEIGHTS_NAME,
     34     get_pretty_name,
     35     is_torch_xla_available,
     36     is_xpu_available,
     37     save,
     38 )
     41 if is_torch_xla_available():

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/__init__.py:190
    181     from .deepspeed import (
    182         DeepSpeedEngineWrapper,
    183         DeepSpeedOptimizerWrapper,
   (...)
    187         HfDeepSpeedConfig,
    188     )
--> 190 from .bnb import has_4bit_bnb_layers, load_and_quantize_model
    191 from .fsdp_utils import load_fsdp_model, load_fsdp_optimizer, merge_fsdp_weights, save_fsdp_model, save_fsdp_optimizer

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/bnb.py:29
     24 from accelerate.utils.imports import (
     25     is_4bit_bnb_available,
     26     is_8bit_bnb_available,
     27 )
---> 29 from ..big_modeling import dispatch_model, init_empty_weights
     30 from .dataclasses import BnbQuantizationConfig

File /usr/local/lib/python3.10/dist-packages/accelerate/big_modeling.py:24
     22 import torch.nn as nn
---> 24 from .hooks import (
     25     AlignDevicesHook,
     26     CpuOffload,
     27     UserCpuOffloadHook,
     28     add_hook_to_module,
     29     attach_align_device_hook,
     30     attach_align_device_hook_on_blocks,
     31 )
     32 from .utils import (
     33     OffloadedWeightsLoader,
     34     check_cuda_p2p_ib_support,
   (...)
     48     retie_parameters,
     49 )

File /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:30
     29 from .utils.modeling import get_non_persistent_buffers
---> 30 from .utils.other import recursive_getattr
     33 _accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/other.py:36
     35 from .modeling import id_tensor_storage
---> 36 from .transformer_engine import convert_model
     37 from .versions import is_torch_version

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/transformer_engine.py:21
     20 if is_fp8_available():
---> 21     import transformer_engine.pytorch as te
     24 def convert_model(model, to_transformer_engine=True, _convert_linear=True, _convert_ln=True):

File /usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/__init__.py:34
     31     spec.loader.exec_module(solib)
---> 34 _load_library()
     35 from transformer_engine.pytorch.module import LayerNormLinear

File /usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/__init__.py:29, in _load_library()
     28 spec = importlib.util.spec_from_file_location(module_name, so_path)
---> 29 solib = importlib.util.module_from_spec(spec)
     30 sys.modules[module_name] = solib

ImportError: /usr/local/lib/python3.10/dist-packages/transformer_engine/transformer_engine_torch.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c105Error4whatEv

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[4], line 1
----> 1 from unsloth import FastLanguageModel
      2 import torch
      3 max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!

File /usr/local/lib/python3.10/dist-packages/unsloth/__init__.py:154
    144         warnings.warn(
    145             "Unsloth: CUDA is not linked properly[.\n](http://51.159.172.108:8889/lab/workspaces/auto-4/tree/RTC%3Atom_FT/n)"\
    146             "Try running `python -m bitsandbytes` then `python -m xformers.info`\n"\
   (...)
    150             "Unsloth will still run for now, but maybe it might crash - let's hope it works!"
    151         )
    152 pass
--> 154 from .models import *
    155 from .save import *
    156 from .chat_templates import *

File /usr/local/lib/python3.10/dist-packages/unsloth/models/__init__.py:15
      1 # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
   (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
---> 15 from .loader  import FastLanguageModel
     16 from .llama   import FastLlamaModel
     17 from .mistral import FastMistralModel

File /usr/local/lib/python3.10/dist-packages/unsloth/models/loader.py:15
      1 # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
   (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
---> 15 from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING
     16 from .llama import FastLlamaModel, logger
     17 from .mistral import FastMistralModel

File /usr/local/lib/python3.10/dist-packages/unsloth/models/_utils.py:213
    210     HAS_FLASH_ATTENTION = False
    211 pass
--> 213 from transformers.models.llama.modeling_llama import logger
    215 # =============================================
    216 # Get Xformers
    217 from xformers import __version__ as xformers_version

File /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:41
     33 from ...modeling_outputs import (
     34     BaseModelOutputWithPast,
     35     CausalLMOutputWithPast,
   (...)
     38     TokenClassifierOutput,
     39 )
     40 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
---> 41 from ...modeling_utils import PreTrainedModel
     42 from ...pytorch_utils import ALL_LAYERNORM_LAYERS
     43 from ...utils import (
     44     add_start_docstrings,
     45     add_start_docstrings_to_model_forward,
   (...)
     48     replace_return_docstrings,
     49 )

File /usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:46
     44 from .configuration_utils import PretrainedConfig
     45 from .dynamic_module_utils import custom_object_save
---> 46 from .generation import GenerationConfig, GenerationMixin
     47 from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
     48 from .pytorch_utils import (  # noqa: F401
     49     Conv1D,
     50     apply_chunking_to_forward,
   (...)
     56     prune_linear_layer,
     57 )

File <frozen importlib._bootstrap>:1075, in _handle_fromlist(module, fromlist, import_, recursive)

File /usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py:1593, in _LazyModule.__getattr__(self, name)
   1591     value = self._get_module(name)
   1592 elif name in self._class_to_module.keys():
-> 1593     module = self._get_module(self._class_to_module[name])
   1594     value = getattr(module, name)
   1595 else:

File /usr/local/lib/python3.10/dist-packages/transformers/utils/import_utils.py:1605, in _LazyModule._get_module(self, module_name)
   1603     return importlib.import_module("." + module_name, self.__name__)
   1604 except Exception as e:
-> 1605     raise RuntimeError(
   1606         f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its"
   1607         f" traceback):\n{e}"
   1608     ) from e

RuntimeError: Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
/usr/local/lib/python3.10/dist-packages/transformer_engine/transformer_engine_torch.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c105Error4whatEv

This happens when running:

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Mistral-Nemo-Base-2407",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

I am in a pytorch:24.07 Docker environment. This is the CUDA version (nvcc -V):

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
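
For comparison, the CUDA version that the installed PyTorch wheel was built against can be printed as well (a quick sketch, run in the same environment; it need not match the system nvcc above):

import torch
# torch.version.cuda is the CUDA toolkit the wheel was compiled with.
print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)
print("device:", torch.cuda.get_device_name(0))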

This is the nvidia-smi output:

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H100 PCIe               Off |   00000000:01:00.0 Off |                    0 |
| N/A   54C    P0             62W /  350W |       4MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
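
The undefined symbol _ZNK3c105Error4whatEv is the mangled name of c10::Error::what() const, which suggests the prebuilt transformer_engine wheel was compiled against a different PyTorch/libtorch ABI than the torch installed here. A rough way to check that Unsloth and FA2 are not the culprits (just a sketch, same environment assumed) is to print the installed versions and trigger the import directly:

# If the last import raises the same undefined-symbol ImportError, the problem
# is the transformer_engine <-> torch pairing, independent of unsloth/flash-attn.
import importlib.metadata as md
for pkg in ("torch", "transformers", "accelerate", "peft",
            "transformer-engine", "flash-attn", "xformers"):
    try:
        print(pkg, md.version(pkg))
    except md.PackageNotFoundError:
        print(pkg, "not installed")
import transformer_engine.pytorch  # expected to fail with the ABI error
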
ivsanro1 commented 2 months ago

Yeah, IMO it’s very tricky to install everything correctly right now; there’s some dependency hell going on.

It took me a good few hours to figure out an installation order that works for my case.

You can try this:

python3 -m pip install torch==2.2.1+cu121 torchvision --index-url https://download.pytorch.org/whl/cu121
python3 -m pip install "unsloth @ git+https://github.com/unslothai/unsloth.git@d0ca3497eb5911483339be025e9924cf73280178"
python3 -m pip install --no-deps "xformers<0.0.26" --force-reinstall
python3 -m pip install flash_attn==2.6.3

They have to be run in that specific order, and not together in the same pip command, or the dependencies get resolved into incompatible versions. With those versions, Unsloth loads correctly:

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.4.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.151 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.25.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth

Edit: you can check this Dockerfile.

C0casio45 commented 2 months ago

Hey,

So I tried to run all of this, but the code still asks me to install some dependencies (bitsandbytes, transformers, trl, peft).

And I still get an error:

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[10], line 1
----> 1 from unsloth import FastLanguageModel
      2 import torch
      3 max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!

File /usr/local/lib/python3.10/dist-packages/unsloth/__init__.py:154
    144         warnings.warn(
    145             "Unsloth: CUDA is not linked properly[.\n](http://51.159.172.108:8889/lab/tree/RTC%3Atom_FT/n)"\
    146             "Try running `python -m bitsandbytes` then `python -m xformers.info`\n"\
   (...)
    150             "Unsloth will still run for now, but maybe it might crash - let's hope it works!"
    151         )
    152 pass
--> 154 from .models import *
    155 from .save import *
    156 from .chat_templates import *

File /usr/local/lib/python3.10/dist-packages/unsloth/models/__init__.py:15
      1 # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
   (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
---> 15 from .loader  import FastLanguageModel
     16 from .llama   import FastLlamaModel
     17 from .mistral import FastMistralModel

File /usr/local/lib/python3.10/dist-packages/unsloth/models/loader.py:15
      1 # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
   (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
---> 15 from ._utils import is_bfloat16_supported, HAS_FLASH_ATTENTION, HAS_FLASH_ATTENTION_SOFTCAPPING
     16 from .llama import FastLlamaModel, logger
     17 from .mistral import FastMistralModel

File /usr/local/lib/python3.10/dist-packages/unsloth/models/_utils.py:559
    553 pass
    556 # =============================================
    557 # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16??
    558 # For mixed precision, we need it to be in float32 not float16.
--> 559 from peft import __version__ as peft_version
    560 if Version(peft_version) < Version("0.12.0"):
    561     from peft.tuners.lora.layer import LoraLayer

File /usr/local/lib/python3.10/dist-packages/peft/__init__.py:22
      1 # flake8: noqa
      2 # There's no way to ignore "F401 '...' imported but unused" warnings in this
      3 # module, but to preserve other warnings. So, don't check this module at all.
   (...)
     17 # See the License for the specific language governing permissions and
     18 # limitations under the License.
     20 __version__ = "0.12.0"
---> 22 from .auto import (
     23     AutoPeftModel,
     24     AutoPeftModelForCausalLM,
     25     AutoPeftModelForSequenceClassification,
     26     AutoPeftModelForSeq2SeqLM,
     27     AutoPeftModelForTokenClassification,
     28     AutoPeftModelForQuestionAnswering,
     29     AutoPeftModelForFeatureExtraction,
     30 )
     31 from .mapping import (
     32     MODEL_TYPE_TO_PEFT_MODEL_MAPPING,
     33     PEFT_TYPE_TO_CONFIG_MAPPING,
   (...)
     36     inject_adapter_in_model,
     37 )
     38 from .mixed_model import PeftMixedModel

File /usr/local/lib/python3.10/dist-packages/peft/auto.py:31
     19 from typing import Optional
     21 from transformers import (
     22     AutoModel,
     23     AutoModelForCausalLM,
   (...)
     28     AutoTokenizer,
     29 )
---> 31 from .config import PeftConfig
     32 from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
     33 from .peft_model import (
     34     PeftModel,
     35     PeftModelForCausalLM,
   (...)
     40     PeftModelForTokenClassification,
     41 )

File /usr/local/lib/python3.10/dist-packages/peft/config.py:24
     21 from huggingface_hub import hf_hub_download
     22 from transformers.utils import PushToHubMixin
---> 24 from .utils import CONFIG_NAME, PeftType, TaskType
     27 @dataclass
     28 class PeftConfigMixin(PushToHubMixin):
     29     r"""
     30     This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all
     31     PEFT adapter models. This class inherits from [`~transformers.utils.PushToHubMixin`] which contains the methods to
   (...)
     36         peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
     37     """

File /usr/local/lib/python3.10/dist-packages/peft/utils/__init__.py:23
     21 from .loftq_utils import replace_lora_weights_loftq
     22 from .peft_types import PeftType, TaskType
---> 23 from .other import (
     24     TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
     25     TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
     26     TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
     27     TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING,
     28     TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING,
     29     TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING,
     30     TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING,
     31     TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING,
     32     CONFIG_NAME,
     33     WEIGHTS_NAME,
     34     SAFETENSORS_WEIGHTS_NAME,
     35     INCLUDE_LINEAR_LAYERS_SHORTHAND,
     36     _set_trainable,
     37     bloom_model_postprocess_past_key_value,
     38     prepare_model_for_kbit_training,
     39     shift_tokens_right,
     40     transpose,
     41     _get_batch_size,
     42     _get_submodules,
     43     _set_adapter,
     44     _freeze_adapter,
     45     ModulesToSaveWrapper,
     46     _prepare_prompt_learning_config,
     47     _is_valid_match,
     48     infer_device,
     49     get_auto_gptq_quant_linear,
     50     get_quantization_config,
     51     id_tensor_storage,
     52     cast_mixed_precision_params,
     53 )
     54 from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights

File /usr/local/lib/python3.10/dist-packages/peft/utils/other.py:21
     18 from contextlib import nullcontext
     19 from typing import Optional, Tuple
---> 21 import accelerate
     22 import torch
     23 from accelerate.hooks import add_hook_to_module, remove_hook_from_module

File /usr/local/lib/python3.10/dist-packages/accelerate/__init__.py:16
      1 # Copyright 2020 The HuggingFace Team. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
   (...)
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 __version__ = "0.33.0"
---> 16 from .accelerator import Accelerator
     17 from .big_modeling import (
     18     cpu_offload,
     19     cpu_offload_with_hook,
   (...)
     24     load_checkpoint_and_dispatch,
     25 )
     26 from .data_loader import skip_first_batches

File /usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py:36
     33 import torch.utils.hooks as hooks
     34 from huggingface_hub import split_torch_state_dict_into_shards
---> 36 from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
     37 from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches
     38 from .hooks import AlignDevicesHook

File /usr/local/lib/python3.10/dist-packages/accelerate/checkpointing.py:24
     21 from safetensors.torch import load_model
     22 from torch.cuda.amp import GradScaler
---> 24 from .utils import (
     25     MODEL_NAME,
     26     OPTIMIZER_NAME,
     27     RNG_STATE_NAME,
     28     SAFE_MODEL_NAME,
     29     SAFE_WEIGHTS_NAME,
     30     SAMPLER_NAME,
     31     SCALER_NAME,
     32     SCHEDULER_NAME,
     33     WEIGHTS_NAME,
     34     get_pretty_name,
     35     is_torch_xla_available,
     36     is_xpu_available,
     37     save,
     38 )
     41 if is_torch_xla_available():
     42     import torch_xla.core.xla_model as xm

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/__init__.py:190
    180 if is_deepspeed_available():
    181     from .deepspeed import (
    182         DeepSpeedEngineWrapper,
    183         DeepSpeedOptimizerWrapper,
   (...)
    187         HfDeepSpeedConfig,
    188     )
--> 190 from .bnb import has_4bit_bnb_layers, load_and_quantize_model
    191 from .fsdp_utils import load_fsdp_model, load_fsdp_optimizer, merge_fsdp_weights, save_fsdp_model, save_fsdp_optimizer
    192 from .launch import (
    193     PrepareForLaunch,
    194     _filter_args,
   (...)
    199     prepare_tpu,
    200 )

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/bnb.py:29
     22 import torch.nn as nn
     24 from accelerate.utils.imports import (
     25     is_4bit_bnb_available,
     26     is_8bit_bnb_available,
     27 )
---> 29 from ..big_modeling import dispatch_model, init_empty_weights
     30 from .dataclasses import BnbQuantizationConfig
     31 from .modeling import (
     32     find_tied_parameters,
     33     get_balanced_memory,
   (...)
     37     set_module_tensor_to_device,
     38 )

File /usr/local/lib/python3.10/dist-packages/accelerate/big_modeling.py:24
     21 import torch
     22 import torch.nn as nn
---> 24 from .hooks import (
     25     AlignDevicesHook,
     26     CpuOffload,
     27     UserCpuOffloadHook,
     28     add_hook_to_module,
     29     attach_align_device_hook,
     30     attach_align_device_hook_on_blocks,
     31 )
     32 from .utils import (
     33     OffloadedWeightsLoader,
     34     check_cuda_p2p_ib_support,
   (...)
     48     retie_parameters,
     49 )
     50 from .utils.other import recursive_getattr

File /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:30
     22 from .utils import (
     23     PrefixedDataset,
     24     find_device,
   (...)
     27     set_module_tensor_to_device,
     28 )
     29 from .utils.modeling import get_non_persistent_buffers
---> 30 from .utils.other import recursive_getattr
     33 _accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]
     36 class ModelHook:

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/other.py:36
     34 from .imports import is_deepspeed_available, is_torch_distributed_available, is_torch_xla_available
     35 from .modeling import id_tensor_storage
---> 36 from .transformer_engine import convert_model
     37 from .versions import is_torch_version
     40 logger = get_logger(__name__)

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/transformer_engine.py:21
     17 from .imports import is_fp8_available
     20 if is_fp8_available():
---> 21     import transformer_engine.pytorch as te
     24 def convert_model(model, to_transformer_engine=True, _convert_linear=True, _convert_ln=True):
     25     """
     26     Recursively converts the linear and layernorm layers of a model to their `transformers_engine` counterpart.
     27     """

File /usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/__init__.py:34
     30     sys.modules[module_name] = solib
     31     spec.loader.exec_module(solib)
---> 34 _load_library()
     35 from transformer_engine.pytorch.module import LayerNormLinear
     36 from transformer_engine.pytorch.module import Linear

File /usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/__init__.py:29, in _load_library()
     27 module_name = "transformer_engine_torch"
     28 spec = importlib.util.spec_from_file_location(module_name, so_path)
---> 29 solib = importlib.util.module_from_spec(spec)
     30 sys.modules[module_name] = solib
     31 spec.loader.exec_module(solib)

ImportError: /usr/local/lib/python3.10/dist-packages/transformer_engine/transformer_engine_torch.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c105Error4whatEv
ivsanro1 commented 2 months ago

Apologies @C0casio45, I should've shared a Dockerfile instead.

I created this Dockerfile where it's working: https://github.com/zytedata/unsloth_docker/blob/main/Dockerfile

C0casio45 commented 2 months ago

It's working like a charm. Thank you, have a good day!

danielhanchen commented 2 months ago

@C0casio45 It's not necessary to install flash-attn! Xformers is on par with, or sometimes faster than, FA2! I would simply uninstall it via pip uninstall flash-attn.