Out of Memory When Tracing Model

I am trying compile LLaVA 1.5 7B to Neuron. As far as I can tell, the way to do this is to select some specific inputs and then trace the model execution with those inputs. However, when I try to trace the model, I get the error:

2024-Oct-15 18:01:17.728750  3882:3882  ERROR  TDRV:dmem_alloc_internal                     Failed to alloc DEVICE memory: 180355072
2024-Oct-15 18:01:18.343121  3882:3882  ERROR  TDRV:dml_dump                                Wrote nrt memory alloc debug info to [/tmp/nrt_mem_log_device_0_670eaded.csv](http://localhost:8890/tmp/nrt_mem_log_device_0_670eaded.csv)
2024-Oct-15 18:01:18.347720  3882:3882  ERROR  TDRV:log_dev_mem                             Failed to allocate 172.000MB (usage: tensors) on ND 0:NC 0, current utilization:
    * total: 15.855GB
    * tensors: 15.855GB
    * runtime: 1.062KB
    * dma rings: 32.000KB

2024-Oct-15 18:01:18.356933  3882:3882  ERROR  TDRV:tensor_allocate                         Failed to allocate 180355072 bytes on DEVICE for tensor UNKNOWN.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[6], line 1
----> 1 model_neuron = torch_neuronx.trace(pipe.model, model_inputs)

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/trace.py:574](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/trace.py#line=573), in trace(func, example_inputs, input_output_aliases, compiler_workdir, compiler_args, partitioner_config, inline_weights_to_neff, *_, **kwargs)
    569     return torch_neuronx.partition(
    570         func, example_inputs, **(partitioner_config.__dict__)
    571     )
    573 with context:
--> 574     neff_filename, metaneff, flattener, packer, weights = _trace(
    575         func,
    576         example_inputs,
    577         states,
    578         input_output_aliases,
    579         compiler_workdir,
    580         compiler_args,
    581         inline_weights_to_neff,
    582     )
    583     return create_neuron_model(
    584         neff_filename,
    585         metaneff,
   (...)
    590         weights,
    591     )

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/trace.py:631](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/trace.py#line=630), in _trace(func, example_inputs, states, input_output_aliases, compiler_workdir, compiler_args, inline_weights_to_neff)
    621 def _trace(
    622     func: Union[Callable, torch.nn.Module],
    623     example_inputs: Any,
   (...)
    629 ) -> Union[str, str, structure.Flattener, structure.Packer]:
    630     # Convert the function to a HloProto message
--> 631     hlo_artifacts = generate_hlo(
    632         func,
    633         example_inputs,
    634         input_output_aliases=input_output_aliases,
    635         inline_weights_to_neff=inline_weights_to_neff,
    636     )
    638     # Call neuronx-cc to generate neff
    639     neff_artifacts = generate_neff(
    640         hlo_artifacts,
    641         compiler_workdir=compiler_workdir,
    642         compiler_args=compiler_args,
    643         inline_weights_to_neff=inline_weights_to_neff,
    644     )

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/trace.py:437](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/trace.py#line=436), in generate_hlo(func, example_inputs, input_output_aliases, inline_weights_to_neff, return_weights, output_aliased_tensor)
    419 def generate_hlo(
    420     func: Union[Callable, torch.nn.Module],
    421     example_inputs: Any,
   (...)
    425     output_aliased_tensor: bool = True,
    426 ):
    427     with torch_neuronx.contexts.mock_neuron_cores(), revert_device_placement(func):
    428         (
    429             hlo,
    430             input_parameter_names,
    431             constant_parameter_tensors,
    432             weights_param_num_to_module_path,
    433             buffers_param_num_to_module_path,
    434             flattener,
    435             packer,
    436             updated_input_output_aliases,
--> 437         ) = xla_trace(
    438             func,
    439             example_inputs,
    440             input_output_aliases=input_output_aliases,
    441             output_aliased_tensor=output_aliased_tensor
    442         )
    444     # get weight dict if in weight separation mode
    445     weight_idxs, weights, weight_name_to_idx = get_weights_and_index(
    446         inline_weights_to_neff,
    447         constant_parameter_tensors,
    448         weights_param_num_to_module_path,
    449     )

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/hlo_conversion.py:119](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/hlo_conversion.py#line=118), in xla_trace(func, example_inputs, states, input_output_aliases, output_aliased_tensor)
    117 if states is not None:
    118     for state in states:
--> 119         placement.move(state, xla_device)
    121 aliased_inputs = {}
    122 if input_output_aliases:
    123     # Get the xla_tensor for the corresponding aliased input.

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/placement.py:51](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch_neuronx/xla_impl/placement.py#line=50), in move(func, device)
     49     _move_script(func, device)
     50 elif isinstance(func, torch.nn.Module):
---> 51     func.to(device)
     52 elif isinstance(func, torch.optim.Optimizer):
     53     for param_group in func.param_groups:

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/transformers/modeling_utils.py:2905](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/transformers/modeling_utils.py#line=2904), in PreTrainedModel.to(self, *args, **kwargs)
   2900     if dtype_present_in_args:
   2901         raise ValueError(
   2902             "You cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired"
   2903             " `dtype` by passing the correct `torch_dtype` argument."
   2904         )
-> 2905 return super().to(*args, **kwargs)

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py:1160](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py#line=1159), in Module.to(self, *args, **kwargs)
   1156         return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1157                     non_blocking, memory_format=convert_to_format)
   1158     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
-> 1160 return self._apply(convert)

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py:810](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py#line=809), in Module._apply(self, fn, recurse)
    808 if recurse:
    809     for module in self.children():
--> 810         module._apply(fn)
    812 def compute_should_use_set_data(tensor, tensor_applied):
    813     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    814         # If the new tensor has compatible tensor type as the existing tensor,
    815         # the current behavior is to change the tensor in-place using `.data =`,
   (...)
    820         # global flag to let the user control whether they want the future
    821         # behavior of overwriting the existing tensor or not.

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py:810](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py#line=809), in Module._apply(self, fn, recurse)
    808 if recurse:
    809     for module in self.children():
--> 810         module._apply(fn)
    812 def compute_should_use_set_data(tensor, tensor_applied):
    813     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    814         # If the new tensor has compatible tensor type as the existing tensor,
    815         # the current behavior is to change the tensor in-place using `.data =`,
   (...)
    820         # global flag to let the user control whether they want the future
    821         # behavior of overwriting the existing tensor or not.

    [... skipping similar frames: Module._apply at line 810 (3 times)]

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py:810](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py#line=809), in Module._apply(self, fn, recurse)
    808 if recurse:
    809     for module in self.children():
--> 810         module._apply(fn)
    812 def compute_should_use_set_data(tensor, tensor_applied):
    813     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    814         # If the new tensor has compatible tensor type as the existing tensor,
    815         # the current behavior is to change the tensor in-place using `.data =`,
   (...)
    820         # global flag to let the user control whether they want the future
    821         # behavior of overwriting the existing tensor or not.

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py:833](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py#line=832), in Module._apply(self, fn, recurse)
    829 # Tensors stored in modules are graph leaves, and we don't want to
    830 # track autograd history of `param_applied`, so we have to use
    831 # `with torch.no_grad():`
    832 with torch.no_grad():
--> 833     param_applied = fn(param)
    834 should_use_set_data = compute_should_use_set_data(param, param_applied)
    835 if should_use_set_data:

File [/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py:1158](http://localhost:8890/opt/aws_neuronx_venv_transformers_neuronx/lib/python3.10/site-packages/torch/nn/modules/module.py#line=1157), in Module.to.<locals>.convert(t)
   1155 if convert_to_format is not None and t.dim() in (4, 5):
   1156     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1157                 non_blocking, memory_format=convert_to_format)
-> 1158 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)

RuntimeError: Bad StatusOr access: RESOURCE_EXHAUSTED: AllocBuffer: error condition NRT_RESOURCE == rt_status: Not enough Neuron memory on core 0 for size=180355072

I have seen this error with both trn1.2xlarge and trn1.32xlarge on the most recent Neuron DLAMI.

The source-code to reproduce my setup is:

import torch
import torch_neuronx
import transformers

import requests
from PIL import Image

from transformers import pipeline

image_url = "https://llava-vl.github.io/static/images/view.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"

model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id)

model_inputs = pipe.image_processor(image, return_tensors=pipe.framework)
model_inputs.update(pipe.tokenizer(prompt, return_tensors=pipe.framework))

model_neuron = torch_neuronx.trace(pipe.model, model_inputs)

aws-neuron / aws-neuron-sdk

Out of Memory When Tracing Model #1014