NVIDIA TX2 + JetPack 5 + Ubuntu 20.04: CUDA Setup failed despite GPU being available #1154

Closed qxpBlog closed 7 months ago

qxpBlog commented 7 months ago

System Info

cuda 11.4
bitsandbytes 0.42.0
torch 2.0.0+nv23.5
transformers 4.38.2
accelerate 0.28.0


When I run the following code:

import os
import sys
import argparse
import accelerate
from accelerate.utils import BnbQuantizationConfig
import torch
import numpy as np
import time
import transformers 
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer,AutoModel,AutoTokenizer,AutoModelForCausalLM,GPTQConfig
from codecarbon import track_emissions,EmissionsTracker
from LLMPruner.utils.logger import LoggerWithDepth
from transformers.models.opt.modeling_opt import OPTAttention, OPTDecoderLayer, OPTForCausalLM
from ptflops import get_model_complexity_info
from ptflops.pytorch_ops import bn_flops_counter_hook, pool_flops_counter_hook
from LLMPruner.evaluator.ppl import PPLMetric,test_latency_energy
from LLMPruner.models.hf_llama.modeling_llama import LlamaForCausalLM, LlamaRMSNorm, LlamaAttention, LlamaMLP
from LLMPruner.peft import PeftModel
# Select the compute device: use the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU. (The `else:` was missing, which left `device`
# as "cpu" on CUDA machines and undefined everywhere else.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Second dot-separated component of the PyTorch version string, as an int.
torch_version = int(torch.__version__.split('.')[1])

def LlamaAttention_counter_hook(module, input, output):
    """Add an estimated FLOP count for one attention call to ``module.__flops__``.

    Simplifications carried over from the original author's notes: past
    key/values are ignored and no attention mask is assumed. Sizes are read
    from ``output[0]`` instead of ``input`` because ``input`` can be empty in
    some PyTorch versions (the author notes both share the same shape).

    Args:
        module: object exposing ``num_heads``, ``head_dim`` and a numeric
            ``__flops__`` accumulator.
        input: unused.
        output: indexable whose first element has a ``shape``; index 1 is used
            as the query length and index -1 as the linear dimension.
    """
    hidden_shape = output[0].shape
    seq_len = hidden_shape[1]
    hidden_dim = hidden_shape[-1]
    n_heads = module.num_heads
    per_head = module.head_dim

    # Rotary term.
    rope_cost = 2 * (seq_len * n_heads * per_head) * 2

    # Per head: QK^T + softmax + attention-weighted V.
    score_cost = seq_len * seq_len * per_head
    softmax_cost = seq_len * seq_len
    value_cost = seq_len * seq_len * per_head
    attn_cost = n_heads * (score_cost + softmax_cost + value_cost)

    # Four linear projections: q, k, v, o.
    proj_cost = 4 * (seq_len * hidden_dim * n_heads * per_head)

    module.__flops__ += int(rope_cost + attn_cost + proj_cost)

def rmsnorm_flops_counter_hook(module, input, output):
    """Add a FLOP estimate for one RMSNorm call to ``module.__flops__``.

    The estimate is two operations per element of the first positional input.

    Args:
        module: object carrying a numeric ``__flops__`` accumulator.
        input: tuple of forward inputs; only ``input[0].shape`` is read.
        output: unused.
    """
    first_input = input[0]

    # NOTE(review): the right-hand side of this assignment was missing from the
    # source; restored as the element count, matching the ptflops counter hooks
    # this file imports -- confirm against the original script.
    batch_flops = np.prod(first_input.shape)
    batch_flops *= 2
    module.__flops__ += int(batch_flops)

# @track_emissions()
def main(args):
    """Load the model selected by ``args.test_mod`` and print its evaluation figures.

    Prints the parameter count, the value returned by ``test_latency_energy``
    on the ``wikitext2`` and ``ptb`` datasets, and the CUDA memory currently
    allocated (in MiB).

    Args:
        args: parsed command-line namespace; reads ``test_mod``, ``ckpt``,
            ``lora_ckpt`` and ``max_seq_len``.

    Raises:
        ValueError: if ``args.test_mod`` is not one of ``tuned``, ``pruned``
            or ``base``.
    """
    if args.test_mod == 'tuned':
        # Evaluate latency and power consumption of the fine-tuned model.
        # NOTE: torch.load unpickles arbitrary objects; only load checkpoints
        # from trusted sources.
        pruned_dict = torch.load(args.ckpt, map_location='cpu')
        tokenizer, model = pruned_dict['tokenizer'], pruned_dict['model']
        # NOTE(review): the argument list of this call was missing from the
        # source; reconstructed from the otherwise unused --lora_ckpt flag.
        # Confirm, and re-add any extra keyword arguments (e.g. a dtype).
        model = PeftModel.from_pretrained(model, args.lora_ckpt)
    elif args.test_mod == 'pruned':
        # Evaluate latency and power consumption of the pruned model.
        # NOTE: same trusted-source caveat for torch.load as above.
        pruned_dict = torch.load(args.ckpt, map_location='cpu')
        tokenizer, model = pruned_dict['tokenizer'], pruned_dict['model']
    elif args.test_mod == 'base':
        model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B", torch_dtype="auto", trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", trust_remote_code=True)
    else:
        # Fail early with a clear message instead of a NameError on `model` below.
        raise ValueError(
            "Unknown test_mod {!r}; choose from [pruned, tuned, base]".format(args.test_mod)
        )

    # NOTE(review): the opening line of the commented-out snippet below is
    # missing from the source; presumably it was `# torch.save({`.
    #     'model': model, 
    #     'tokenizer': tokenizer,
    # }, "/home/iotsc01/xinpengq/LLM-Pruner-main/prune_log/quant/pytorch_model.bin")    

    # model.config.pad_token_id = tokenizer.pad_token_id = 0 
    # model.config.bos_token_id = 1
    # model.config.eos_token_id = 2

    after_pruning_parameters = sum(p.numel() for p in model.parameters())
    print("#parameters: {}".format(after_pruning_parameters))

    ppl = test_latency_energy(model, tokenizer, ['wikitext2', 'ptb'], args.max_seq_len, device=device)
    print("PPL after pruning: {}".format(ppl))
    # Bytes -> MiB.
    print("Memory Requirement: {} MiB\n".format(torch.cuda.memory_allocated() / 1024 / 1024))

if __name__ == "__main__":
    # Command-line entry point: parse the options, then run the evaluation.
    parser = argparse.ArgumentParser(description='Tuning Pruned LLaMA (huggingface version)')

    parser.add_argument('--base_model', type=str, default="llama-7b-hf", help='base model name')
    parser.add_argument('--ckpt', type=str, default=None)
    parser.add_argument('--lora_ckpt', type=str, default=None)
    parser.add_argument('--max_seq_len', type=int, default=128, help='max sequence length')
    parser.add_argument('--test_mod', type=str, default="tuned", help='choose from [pruned, tuned, base]')
    args = parser.parse_args()

    # The parsed arguments were never handed to main(), so the script did nothing.
    main(args)


I set the `test_mod` argument to `base`, but the following error occurred:

/home/jetson/.local/lib/python3.8/site-packages/torchvision-0.13.0-py3.8-linux-aarch64.egg/torchvision/io/ UserWarning: Failed to load image Python extension: /home/jetson/.local/lib/python3.8/site-packages/torchvision-0.13.0-py3.8-linux-aarch64.egg/torchvision/ undefined symbol: _ZNK3c1010TensorImpl36is_contiguous_nondefault_policy_implENS_12MemoryFormatE
  warn(f"Failed to load image Python extension: {e}")
/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/cuda_setup/ UserWarning: Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/cuda_setup/ UserWarning: /home/jetson/archiconda3/envs/llm did not contain ['', '', ''] as expected! Searching further paths...
/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/cuda_setup/ UserWarning: Found duplicate ['', '', ''] files: {PosixPath('/usr/local/cuda-11.4/lib64/'), PosixPath('/usr/local/cuda-11.4/lib64/')}.. We select the PyTorch default, which is {torch.version.cuda},but this might missmatch with the CUDA version that is needed for bitsandbytes.To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variableFor example, if you want to use the CUDA version 122BNB_CUDA_VERSION=122 python ...OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g.export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2
/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/cuda_setup/ UserWarning: /opt/ros/noetic/lib:/usr/local/cuda-11.4/lib64 did not contain ['', '', ''] as expected! Searching further paths...
/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/cuda_setup/ UserWarning: Found duplicate ['', '', ''] files: {PosixPath('/usr/local/cuda/lib64/'), PosixPath('/usr/local/cuda/lib64/')}.. We select the PyTorch default, which is {torch.version.cuda},but this might missmatch with the CUDA version that is needed for bitsandbytes.To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variableFor example, if you want to use the CUDA version 122BNB_CUDA_VERSION=122 python ...OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g.export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2

===================================BUG REPORT===================================
The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//')}
The following directories listed in your path were found to be non-existent: {PosixPath('//localhost'), PosixPath('http'), PosixPath('11311')}
CUDA_SETUP: WARNING! not found in any environmental path. Searching in backup paths...
DEBUG: Possible options found for {PosixPath('/usr/local/cuda/lib64/'), PosixPath('/usr/local/cuda/lib64/')}
CUDA SETUP: PyTorch settings found: CUDA_VERSION=114, Highest Compute Capability: 8.7.
CUDA SETUP: To manually override the PyTorch CUDA version please see:
CUDA SETUP: Loading binary /home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/
/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/ cannot open shared object file: No such file or directory
CUDA SETUP: Something unexpected happened. Please compile from source:
git clone
cd bitsandbytes
CUDA_VERSION=114 make cuda11x
python install
Traceback (most recent call last):
  File "/home/jetson/llm-mian/LLM-Pruner-main/", line 18, in <module>
    from LLMPruner.peft import PeftModel
  File "/home/jetson/llm-mian/LLM-Pruner-main/LLMPruner/peft/", line 22, in <module>
    from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config, get_peft_model
  File "/home/jetson/llm-mian/LLM-Pruner-main/LLMPruner/peft/", line 16, in <module>
    from .peft_model import (
  File "/home/jetson/llm-mian/LLM-Pruner-main/LLMPruner/peft/", line 31, in <module>
    from .tuners import AdaLoraModel, LoraModel, PrefixEncoder, PromptEmbedding, PromptEncoder
  File "/home/jetson/llm-mian/LLM-Pruner-main/LLMPruner/peft/tuners/", line 20, in <module>
    from .lora import LoraConfig, LoraModel
  File "/home/jetson/llm-mian/LLM-Pruner-main/LLMPruner/peft/tuners/", line 40, in <module>
    import bitsandbytes as bnb
  File "/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/", line 6, in <module>
    from . import cuda_setup, utils, research
  File "/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/research/", line 1, in <module>
    from . import nn
  File "/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/research/nn/", line 1, in <module>
    from .modules import LinearFP8Mixed, LinearFP8Global
  File "/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/research/nn/", line 8, in <module>
    from bitsandbytes.optim import GlobalOptimManager
  File "/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/optim/", line 6, in <module>
    from bitsandbytes.cextension import COMPILED_WITH_CUDA
  File "/home/jetson/archiconda3/envs/llm/lib/python3.8/site-packages/bitsandbytes/", line 20, in <module>
    raise RuntimeError('''
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at:

Expected behavior

@kashif @stephenroller @akx @jbn I want to know how to solve this problem, and whether bitsandbytes supports the TX2. Looking forward to your reply.

matthewdouglas commented 7 months ago

Duplicate of #1151. There has not been a bitsandbytes release built for aarch64 yet.

qxpBlog commented 7 months ago
