huggingface / optimum-nvidia


OutOfMemory - Not able to run the text-generation.py example on V100 and A10G GPUs. #146

Open yahavb opened 3 months ago

yahavb commented 3 months ago

I am trying to port my transformers-based AutoModelForCausalLM code to optimum.nvidia and I hit an OutOfMemory error. I assume I need to pass a quantization_config, as I do with transformers:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(llm_int8_threshold=200.0, load_in_8bit=True)

and pass it in:

model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.float16, quantization_config=quantization_config)
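
As a sanity check on whether 8-bit is really needed here, a rough back-of-the-envelope estimate (my own numbers, not anything from the library) of the weight memory for Llama-2-13B:

# Rough weight-memory estimate for Llama-2-13B.
# Assumption: ~13e9 parameters; ignores KV cache, activations and TensorRT workspace.
params = 13e9
fp16_gib = params * 2 / 1024**3   # 2 bytes per weight
int8_gib = params * 1 / 1024**3   # 1 byte per weight
print(f"fp16 weights: ~{fp16_gib:.1f} GiB")  # ~24.2 GiB -> does not fit a 16 GiB V100
print(f"int8 weights: ~{int8_gib:.1f} GiB")  # ~12.1 GiB -> could fit, before overheads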

The script I run:

import time

from optimum.nvidia import AutoModelForCausalLM, ExportConfig
from optimum.nvidia.utils.cli import (  # note: these helpers are not used below
    postprocess_quantization_parameters,
    register_common_model_topology_args,
    register_optimization_profiles_args,
    register_quantization_args,
)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
export = ExportConfig.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
export.max_input_len = 128
export.max_output_len = 64
export.max_num_tokens = 64
export.max_beam_width = 1

def gentext(prompt):
    start_time = time.time()
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    generated = model.generate(
        inputs["input_ids"],  # was tokens["input_ids"], which is not defined
    )
    response = tokenizer.decode(generated, skip_special_tokens=True)
    total_time = time.time() - start_time
    return str(response), str(total_time)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf",device_map="sequential", export_config=export)
print(gentext("write a poem"))
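
To confirm how much room the TensorRT builder actually has, this small sketch (plain PyTorch calls only, nothing optimum-nvidia specific) can be dropped in before the from_pretrained call; the failing build below requests a single allocation of ~15.5 GiB, essentially the whole card:

import torch

# Print free/total memory for every visible GPU before the engine build.
for i in range(torch.cuda.device_count()):
    free, total = torch.cuda.mem_get_info(i)  # bytes
    name = torch.cuda.get_device_name(i)
    print(f"GPU {i} ({name}): {free / 1024**3:.1f} GiB free / {total / 1024**3:.1f} GiB total")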
The GPUs on the host:

root@sd2-compile-nvidia-ckz65:/optimum-nvidia# nvidia-smi
Sun Aug 11 06:18:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  Tesla V100-SXM2-16GB           On  | 00000000:00:1C.0 Off |                    0 |
| N/A   37C    P0              37W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-16GB           On  | 00000000:00:1D.0 Off |                    0 |
| N/A   36C    P0              40W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
root@sd2-compile-nvidia-ckz65:/optimum-nvidia# 

The error:

[TensorRT-LLM] TensorRT-LLM version: 0.12.0.dev2024072300
tokenizer_config.json: 100%|████████████████████████████████████████████████████████████| 1.62k/1.62k [00:00<00:00, 11.3MB/s]
tokenizer.model: 100%|████████████████████████████████████████████████████████████████████| 500k/500k [00:00<00:00, 14.6MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████████| 1.84M/1.84M [00:00<00:00, 22.5MB/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████████████| 414/414 [00:00<00:00, 3.82MB/s]
config.json: 100%|██████████████████████████████████████████████████████████████████████████| 587/587 [00:00<00:00, 5.03MB/s]
model.safetensors.index.json: 100%|█████████████████████████████████████████████████████| 33.4k/33.4k [00:00<00:00, 66.4MB/s]
generation_config.json: 100%|███████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 1.25MB/s]
model-00003-of-00003.safetensors: 100%|██████████████████████████████████████████████████| 6.18G/6.18G [00:17<00:00, 346MB/s]
model-00002-of-00003.safetensors: 100%|██████████████████████████████████████████████████| 9.90G/9.90G [00:30<00:00, 324MB/s]
model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████| 9.95G/9.95G [02:19<00:00, 71.3MB/s]
Fetching 6 files: 100%|████████████████████████████████████████████████████████████████████████| 6/6 [02:19<00:00, 23.28s/it]
Fetching 6 files: 100%|██████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 5853.88it/s]
Fetching 6 files: 100%|██████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 7686.57it/s]
/opt/conda/lib/python3.10/site-packages/tensorrt_llm/models/llama/convert.py:1414: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  q, k, v = (torch.tensor(weights[t]) for t in ['q', 'k', 'v'])
[08/11/2024-06:10:36] [TRT] [E] [virtualMemoryBuffer.cpp::resizePhysical::151] Error Code 2: OutOfMemory (Requested size was 16654532608 bytes.)
[08/11/2024-06:10:36] [TRT] [E] [virtualMemoryBuffer.cpp::resizePhysical::151] Error Code 2: OutOfMemory (Requested size was 16654532608 bytes.)
[08/11/2024-06:10:36] [TRT] [E] [globWriter.cpp::makeResizableGpuMemory::435] Error Code 2: OutOfMemory (Requested size was 16654532608 bytes.)
[08/11/2024-06:10:36] [TRT-LLM] [E] Engine building failed, please check the error log.
[TensorRT-LLM][INFO] Engine version 0.12.0.dev2024072300 found in the config file, assuming engine(s) built by new builder API.
[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0
[TensorRT-LLM][INFO] Rank 0 is using GPU 0
[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 1
[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 1
[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1
[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 64
[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0
[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: 64
[TensorRT-LLM][INFO] TRTGptModel computeContextLogits: 0
[TensorRT-LLM][INFO] TRTGptModel computeGenerationLogits: 0
[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0
[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 1
[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 64
[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 63 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled
[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT
[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None
[TensorRT-LLM][INFO] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
Traceback (most recent call last):
  File "/optimum-nvidia/1.py", line 44, in <module>
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf",device_map="sequential", export_config=export)
  File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py", line 569, in from_pretrained
    instance = cls._from_pretrained(
  File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/models/auto.py", line 75, in _from_pretrained
    model = model_clazz.from_pretrained(
  File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py", line 569, in from_pretrained
    instance = cls._from_pretrained(
  File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/hub.py", line 307, in _from_pretrained
    return cls(
  File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/runtime.py", line 166, in __init__
    InferenceRuntimeBase.__init__(
  File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/runtime.py", line 93, in __init__
    self._executor = GenerationExecutor.create(
  File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/executor.py", line 404, in create
    return ExecutorBindingsWorker(**worker_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/executor.py", line 425, in __init__
    self.engine = tllm.Executor(engine_dir,
RuntimeError: [TensorRT-LLM][ERROR] Assertion failed: Error opening engine file: /root/.cache/huggingface/assets/trtllm/0.12.0.dev2024072300/meta-llama--Llama-2-13b-chat-hf/V100-SXM2-16GB/engines/rank0.engine (/home/jenkins/agent/workspace/LLM/main/L0_MergeRequest/tensorrt_llm/cpp/tensorrt_llm/runtime/tllmRuntime.cpp:66)
1       0x7feb878af7e1 tensorrt_llm::common::throwRuntimeError(char const*, int, std::string const&) + 82
2       0x7feb88dca0d0 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(tensorrt_llm::runtime::RawEngine const&, nvinfer1::ILogger*, float, bool) + 3152
3       0x7feb8902a1c9 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer1::ILogger>, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::runtime::RawEngine const&, bool, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 1017
4       0x7feb89045a03 tensorrt_llm::executor::Executor::Impl::createModel(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::executor::ExecutorConfig const&) + 419
5       0x7feb89046208 tensorrt_llm::executor::Executor::Impl::loadModel(std::optional<std::filesystem::path> const&, std::optional<std::vector<unsigned char, std::allocator<unsigned char> > > const&, tensorrt_llm::runtime::GptJsonConfig const&, tensorrt_llm::executor::ExecutorConfig const&, bool) + 1272
6       0x7feb8904c239 tensorrt_llm::executor::Executor::Impl::Impl(std::filesystem::path const&, std::optional<std::filesystem::path> const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 1785
7       0x7feb89040570 tensorrt_llm::executor::Executor::Executor(std::filesystem::path const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 64
8       0x7febe2181f92 /opt/conda/lib/python3.10/site-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0xb6f92) [0x7febe2181f92]
9       0x7febe21243ac /opt/conda/lib/python3.10/site-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x593ac) [0x7febe21243ac]
10      0x55be86842b27 python(+0x13fb27) [0x55be86842b27]
11      0x55be8683c42b _PyObject_MakeTpCall + 619
12      0x55be8684e934 python(+0x14b934) [0x55be8684e934]
13      0x55be8684f322 PyVectorcall_Call + 146
14      0x55be8684c74a python(+0x14974a) [0x55be8684c74a]
15      0x55be8683c7a0 python(+0x1397a0) [0x55be8683c7a0]
16      0x7febe21239cb /opt/conda/lib/python3.10/site-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x589cb) [0x7febe21239cb]
17      0x55be8683c42b _PyObject_MakeTpCall + 619
18      0x55be8683884e _PyEval_EvalFrameDefault + 23134
19      0x55be86842f8f _PyFunction_Vectorcall + 111
20      0x55be8683b985 _PyObject_FastCallDictTstate + 389
21      0x55be8684c34b python(+0x14934b) [0x55be8684c34b]
22      0x55be8683c7a0 python(+0x1397a0) [0x55be8683c7a0]
23      0x55be8684f139 PyObject_Call + 521
24      0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
25      0x55be86842f8f _PyFunction_Vectorcall + 111
26      0x55be868341c0 _PyEval_EvalFrameDefault + 5072
27      0x55be86842f8f _PyFunction_Vectorcall + 111
28      0x55be86837afd _PyEval_EvalFrameDefault + 19725
29      0x55be86842f8f _PyFunction_Vectorcall + 111
30      0x55be8683b985 _PyObject_FastCallDictTstate + 389
31      0x55be8684c34b python(+0x14934b) [0x55be8684c34b]
32      0x55be8683c47b _PyObject_MakeTpCall + 699
33      0x55be8683884e _PyEval_EvalFrameDefault + 23134
34      0x55be8684e641 python(+0x14b641) [0x55be8684e641]
35      0x55be8684efe8 PyObject_Call + 184
36      0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
37      0x55be86842f8f _PyFunction_Vectorcall + 111
38      0x55be8684efe8 PyObject_Call + 184
39      0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
40      0x55be8684e641 python(+0x14b641) [0x55be8684e641]
41      0x55be8684efe8 PyObject_Call + 184
42      0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
43      0x55be8684e641 python(+0x14b641) [0x55be8684e641]
44      0x55be8684efe8 PyObject_Call + 184
45      0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
46      0x55be86842f8f _PyFunction_Vectorcall + 111
47      0x55be8684efe8 PyObject_Call + 184
48      0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
49      0x55be8684e641 python(+0x14b641) [0x55be8684e641]
50      0x55be868341c0 _PyEval_EvalFrameDefault + 5072
51      0x55be868dba82 python(+0x1d8a82) [0x55be868dba82]
52      0x55be868db9c7 PyEval_EvalCode + 135
53      0x55be8690e82c python(+0x20b82c) [0x55be8690e82c]
54      0x55be86909704 python(+0x206704) [0x55be86909704]
55      0x55be8679a53c python(+0x9753c) [0x55be8679a53c]
56      0x55be86903925 _PyRun_SimpleFileObject + 437
57      0x55be869034d3 _PyRun_AnyFileObject + 67
58      0x55be869006a9 Py_RunMain + 921
59      0x55be868ce089 Py_BytesMain + 57
60      0x7fee5c293083 __libc_start_main + 243
61      0x55be868cdf81 python(+0x1caf81) [0x55be868cdf81]
Exception ignored in: <function PretrainedModel.__del__ at 0x7feb30289480>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/models/modeling_utils.py", line 394, in __del__
    self.release()
  File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/models/modeling_utils.py", line 391, in release
    release_gc()
  File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/_utils.py", line 483, in release_gc
    torch.cuda.ipc_collect()
  File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 804, in ipc_collect
    _lazy_init()
  File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 312, in _lazy_init
    raise DeferredCudaCallError(msg) from e
torch.cuda.DeferredCudaCallError: CUDA call failed lazily at initialization with error: 'NoneType' object is not iterable

CUDA call was originally invoked at:

  File "/optimum-nvidia/1.py", line 5, in <module>
    import torch
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/opt/conda/lib/python3.10/site-packages/torch/__init__.py", line 1480, in <module>
    _C._initExtension(manager_path())
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 1294, in <module>
    _lazy_call(_register_triton_kernels)
  File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 235, in _lazy_call
    _queued_calls.append((callable, traceback.format_stack()))
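
One more note: because the build failed, the cache directory referenced in the traceback is left behind without a usable rank0.engine. If optimum-nvidia treats the existing folder as a cached build (an assumption on my part), a re-run will hit the same confusing "Error opening engine file" message even after the memory problem is addressed. Deleting the cached folder before retrying forces a fresh build:

import shutil
from pathlib import Path

# Path taken from the error message above; adjust to your own cache location.
engine_dir = Path(
    "/root/.cache/huggingface/assets/trtllm/0.12.0.dev2024072300/"
    "meta-llama--Llama-2-13b-chat-hf/V100-SXM2-16GB"
)
if engine_dir.exists():
    shutil.rmtree(engine_dir)  # next run starts from a clean engine cache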