Closed najsword closed 4 days ago
Hi, installing older versions (< 0.10) is not recommended. Please run `pip install tensorrt_llm==0.11.0.dev2024061800` or `pip install tensorrt_llm==0.10.0`. Also, please use the container `nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3`, or build the container using `make -C docker release_build`.
/root/miniconda3/envs/trt/lib/python3.10/site-packages/torch/cuda/init.py:138: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 804: forward compatibility was attempted on non supported HW (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.) return torch._C._cuda_getDeviceCount() > 0 Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.18it/s] Traceback (most recent call last): File "/root/projects/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 1532, in
main()
File "/root/projects/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 1508, in main
covert_and_save(rank, convert_args)
File "/root/projects/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 1480, in covert_and_save
weights = convert_hf_llama(
File "/root/projects/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 885, in convert_hf_llama
get_tllm_linear_weight(split_v, tllm_prex + 'attention.qkv.',
File "/root/projects/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 659, in get_tllm_linear_weight
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
File "/root/miniconda3/envs/trt/lib/python3.10/site-packages/torch/_ops.py", line 692, in call
return self._op(*args, *kwargs or {})
RuntimeError: [TensorRT-LLM][ERROR] CUDA runtime error in cudaGetDevice(&device): forward compatibility was attempted on non supported HW (/home/jenkins/agent/workspace/LLM/release-0.8/L0_PostMerge/tensorrt_llm/cpp/tensorrt_llm/common/cudaUtils.h:258)
1 0x7f0d9f58ddce void tensorrt_llm::common::check(cudaError, char const , char const, int) + 94
2 0x7f0d9f5ccc0d tensorrt_llm::kernels::cutlass_kernels::getLayoutDetailsForTransform(tensorrt_llm::kernels::cutlass_kernels::QuantType) + 61
3 0x7f0d9f5cce35 tensorrt_llm::kernels::cutlass_kernels::preprocess_weights_for_mixed_gemm(signed char, signed char const, std::vector<unsigned long, std::allocator > const&, tensorrt_llm::kernels::cutlass_kernels::QuantType) + 37
4 0x7f0d9f5d33d8 void tensorrt_llm::kernels::cutlass_kernels::symmetric_quantize<half, half>(signed char , signed char, __half, half const, std::vector<unsigned long, std::allocator > const&, tensorrt_llm::kernels::cutlass_kernels::QuantType) + 1416
5 0x7f10752236ad torch_ext::symmetric_quantize_helper(at::Tensor, c10::ScalarType, bool) + 2173
6 0x7f1075223897 torch_ext::symmetric_quantize_last_axis_of_batched_matrix(at::Tensor, c10::ScalarType) + 55
7 0x7f107522724f c10::impl::make_boxed_from_unboxedfunctor<c10::impl::detail::WrapFunctionIntoRuntimeFunctor<std::vector<at::Tensor, std::allocator > ( )(at::Tensor, c10::ScalarType), std::vector<at::Tensor, std::allocator >, c10::guts::typelist::typelist<at::Tensor, c10::ScalarType> >, true>::call(c10::OperatorKernel, c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator > ) + 127
8 0x7f0f9eeb85e2 c10::Dispatcher::callBoxed(c10::OperatorHandle const&, std::vector<c10::IValue, std::allocator >*) const + 562
9 0x7f0f9ec4fb43 torch::jit::invokeOperatorFromPython(std::vector<std::shared_ptr, std::allocator<std::shared_ptr > > const&, pybind11::args, pybind11::kwargs const&, c10::optional) + 1155
10 0x7f0f9ec50438 torch::jit::_get_operation_for_overload_or_packet(std::vector<std::shared_ptr, std::allocator<std::shared_ptr > > const&, c10::Symbol, pybind11::args, pybind11::kwargs const&, bool, c10::optional) + 1448
11 0x7f0f9eb343e0 /root/miniconda3/envs/trt/lib/python3.10/site-packages/torch/lib/libtorch_python.so(+0x8023e0) [0x7f0f9eb343e0]
12 0x7f0f9e720bb4 /root/miniconda3/envs/trt/lib/python3.10/site-packages/torch/lib/libtorch_python.so(+0x3eebb4) [0x7f0f9e720bb4]
13 0x4fdc87 python3() [0x4fdc87]
14 0x50a659 PyObject_Call + 521
15 0x4f3b64 _PyEval_EvalFrameDefault + 23156
16 0x4f676d _PyObject_FastCallDictTstate + 205
17 0x507f36 _PyObject_Call_Prepend + 102
18 0x5cf883 python3() [0x5cf883]
19 0x4f741b _PyObject_MakeTpCall + 603
20 0x4f34c6 _PyEval_EvalFrameDefault + 21462
21 0x4fe0cf _PyFunction_Vectorcall + 111
22 0x4ee40f _PyEval_EvalFrameDefault + 799
23 0x4fe0cf _PyFunction_Vectorcall + 111
24 0x50a508 PyObject_Call + 184
25 0x4f0c69 _PyEval_EvalFrameDefault + 11129
26 0x4fe0cf _PyFunction_Vectorcall + 111
27 0x4ee40f _PyEval_EvalFrameDefault + 799
28 0x4fe0cf _PyFunction_Vectorcall + 111
29 0x4ee40f _PyEval_EvalFrameDefault + 799
30 0x5950f2 python3() [0x5950f2]
31 0x595037 PyEval_EvalCode + 135
32 0x5c5e67 python3() [0x5c5e67]
33 0x5c0fb0 python3() [0x5c0fb0]
34 0x45970e python3() [0x45970e]
35 0x5bb53f _PyRun_SimpleFileObject + 415
36 0x5bb2a3 _PyRun_AnyFileObject + 67
37 0x5b805d Py_RunMain + 909
38 0x588679 Py_BytesMain + 57
39 0x7f107e6c7d90 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x7f107e6c7d90]
40 0x7f107e6c7e40 libc_start_main + 128
41 0x58852e python3() [0x58852e]
Graphics card: NVIDIA GeForce 4090 | Host machine driver: NVIDIA-SMI 525.147.05, Driver Version: 525.147.05, CUDA Version: 12.0 | System: Ubuntu 22.04 | Docker image: nvidia/cuda:12.1.0-devel-ubuntu22.04 | pip tensorrt_llm package: v0.8.0 | Python: 3.10
When I use this command: python3 convert_checkpoint.py --model_dir /root/models/Meta-Llama-3-8B-Instruct \ --output_dir /root/models/output/example-2/tllm_checkpoint_1gpu_fp16_wq \ --dtype float16 \ --use_weight_only \ --weight_only_precision int8
the error shown above occurs. How can I solve it?