I have the same issue on T5-3B (OOM because of the clone_input function) with the "eager" backend.
It happens with beam = 1, and only if we apply dynamo to the decoder part (no problem with the encoder).
It has been reported by one of our users here: https://github.com/ELS-RD/kernl/issues/188
@williamwen42 any idea of a (dirty?) workaround if a clean fix takes time to come?
Seems to be related to https://github.com/pytorch/torchdynamo/issues/1950 CC @ezyang @voznesenskym
If it is really #1950, I can give you a dirty workaround for it.
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 843e50687a..97a78f8638 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -717,7 +717,6 @@ def wrap_fx_proxy_cls(target_cls, tx, proxy, example_value=None, **options):
# TODO(voz): Find all the callsites and burn this down.
# Flipping it to an assert fails dozens of tests.
if not isinstance(example_value, torch._subclasses.FakeTensor):
- proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
example_value = fake_wrapper(example_value)
Correct me if I'm wrong, but I think the dirty fix has already been applied in https://github.com/pytorch/torchdynamo/issues/1950; I tried testing it and am still running out of memory.
Yeah, then this is a different problem, we will need to investigate
@TheExGenesis are you trying on Whisper? Where does it crash? On the latest nightlies, the issue seems to be elsewhere.
@ezyang is it possible that dynamo's eager mode has a higher memory footprint (even slightly, e.g. from 10.4 GB to 10.6 GB of CUDA memory reserved) than "real" eager mode (i.e. without dynamo)? Also, would it be possible that the garbage collector is not called with eager+dynamo as it would be for real eager mode (later, or never)?
Dynamo eager can use more memory, but we found in our benchmark suite that memory usage typically improved, because our min-cut graph partitioner can make better choices about what to save for backwards. The other known and obvious culprits for memory usage are CUDA graphs (but this is turned off by default) and fake tensor falling back to real operations for meta usage (but this is a very slight amount of extra memory, only as much as is necessary to allocate the inputs/outputs for a particular operation). @eellison, do we have an easy log level to test for the latter?
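(For reference, a minimal sketch to compare the peak footprint with and without dynamo; the helper name is made up, and it assumes a single CUDA device and that the callable passed in stands for whatever generate() call you run:)

import torch

def report_peak_memory(label, fn, *args, **kwargs):
    # Reset the allocator's peak counters, run the workload, then read the peaks back.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    out = fn(*args, **kwargs)
    torch.cuda.synchronize()
    allocated_gb = torch.cuda.max_memory_allocated() / 1024**3
    reserved_gb = torch.cuda.max_memory_reserved() / 1024**3
    print(f"{label}: max allocated {allocated_gb:.2f} GB, max reserved {reserved_gb:.2f} GB")
    return out

# Hypothetical usage: same call, with and without dynamo applied to the decoder.
# report_peak_memory("plain eager", model.generate, inputs, max_length=25)
# optimize_model(model.model.decoder)
# report_peak_memory("dynamo eager", model.generate, inputs, max_length=25)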
I'm going to bump the priority to make sure we have someone look into this.
@ezyang we don't atm, I can add one. The one-off ops culprit was actually a red herring for other things (cudagraphs): when I landed the change to run ops in inductor with fake tensors instead of regular tensors, memory compression didn't decrease at all. I think it would be worth adding a debug mode that prints out the additional memory overhead for some of the following when it's significant.
I think the remaining sources of memory overhead, in order of likelihood, are:
The issue happens at inference time with dynamo + the "eager" backend (no CUDA graphs, no Triton involved). The fix for #1950 helps, but it seems something else is not working as expected.
Code to reproduce the OOM:
import torch
import torch._dynamo as torchdynamo
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

torch.cuda.memory._record_memory_history(True)
torchdynamo.config.cache_size_limit = 512

audio_dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large").to("cuda")


def optimize_model(original_model) -> None:
    original_model.forward2 = original_model.forward

    @torchdynamo.optimize("eager")
    def run(*args, **kwargs):
        return original_model.forward2(*args, **kwargs)

    original_model.forward = run


optimize_model(model.model.decoder)

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
speech_data = audio_dataset[0]["audio"]["array"]
inputs = processor(speech_data, return_tensors="pt", sampling_rate=16_000).input_features.to("cuda")

with torch.no_grad(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    predicted_ids = model.generate(inputs, min_length=25, max_length=25, num_beams=5, do_sample=False)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0]
    assert (
        transcription == "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
    ), transcription
    print(transcription)

print("torch.cuda.memory_allocated: %fGB" % (torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024))
print("torch.cuda.memory_reserved: %fGB" % (torch.cuda.memory_reserved(0) / 1024 / 1024 / 1024))
print("torch.cuda.max_memory_reserved: %fGB" % (torch.cuda.max_memory_reserved(0) / 1024 / 1024 / 1024))
Traceback (most recent call last):
File "/mnt/workspace/kernl/crash.py", line 31, in <module>
predicted_ids = model.generate(inputs, min_length=25, max_length=25, num_beams=5, do_sample=False)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 34, in decorate_context
return func(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/generation/utils.py", line 1608, in generate
return self.beam_search(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/generation/utils.py", line 2872, in beam_search
model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1251, in _reorder_cache
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1251, in <genexpr>
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.69 GiB total capacity; 20.52 GiB already allocated; 59.19 MiB free; 21.93 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
1/ it works without dynamo (memory reserved < 12 GB), i.e. if you comment out optimize_model(model.model.decoder)
2/ it OOMs with torch dynamo on a 3090 (24 GB)
The error is due to this line in the Whisper model: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1252
This function is called by the beam decoder.
I know it because I ran with:
CUDA_LAUNCH_BLOCKING=1
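(For reference, a sketch of enabling the same thing from Python rather than the shell; the variable has to be set before CUDA is initialized, hence before the first CUDA call:)

import os

# Make CUDA kernel launches synchronous so the Python traceback points at the
# real failing call site instead of a later asynchronous error.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # imported after setting the variable so it takes effect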
Without dynamo, it prints:
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
torch.cuda.memory_allocated: 5.880043GB
torch.cuda.memory_reserved: 11.724609GB
torch.cuda.max_memory_reserved: 11.724609GB
Moreover, the new PyTorch memory profiler (torch.cuda.memory._snapshot()) reports that this is where most of the memory is allocated.
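(For anyone who wants to look at the same data, a minimal sketch of dumping that snapshot for offline inspection, assuming the private _record_memory_history/_snapshot API of these nightlies; the output file name is made up:)

import pickle
import torch

# Must be enabled before the allocations you want to trace (as in the repro above).
torch.cuda.memory._record_memory_history(True)

# ... run the model here ...

# Grab the allocator history and save it so it can be inspected offline.
snapshot = torch.cuda.memory._snapshot()
with open("cuda_memory_snapshot.pickle", "wb") as f:
    pickle.dump(snapshot, f)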
I am under the impression that with torch dynamo the garbage collector can't delete these tensors, and then the CUDA memory can't be freed. The tensors produced by this function are output by the model (in the cache of the transformer model) and then reused as input to generate the next token. One possible issue is that, for some reason I don't know, references to those tensors are captured by dynamo and they can't be garbage collected anymore. Does that make sense to you?
Not related, but still sharing: the minifier doesn't seem to catch those OOM issues; this is at least the second time it fails for me (and it works for simpler cases).
I'm not getting OOM anymore on an 80GB A100, but I am hitting the cache limit and getting no speed improvement (strictly 0.99x of baseline). Cache limit warnings:
function: 'run' (<ipython-input-3-d766d9f4b6a6>:5)
reasons: set(kwargs.keys()) == {'output_attentions', 'return_dict', 'input_features', 'output_hidden_states'}
to diagnose recompilation issues, see https://github.com/pytorch/torchdynamo/blob/main/TROUBLESHOOTING.md.
[2022-12-22 03:54:12,536] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)
function: 'forward' (/root/anaconda3/envs/whisper_kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:861)
reasons: ___check_obj_id(past_key_values, 94636661886208)
to diagnose recompilation issues, see https://github.com/pytorch/torchdynamo/blob/main/TROUBLESHOOTING.md.
[2022-12-22 03:54:26,044] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)
function: 'forward' (/root/anaconda3/envs/whisper_kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:396)
reasons: tensor 'past_key_value[0]' strides mismatch at index 0. expected 81920, actual 84480
to diagnose recompilation issues, see https://github.com/pytorch/torchdynamo/blob/main/TROUBLESHOOTING.md.
[2022-12-22 03:54:31,065] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)
function: 'forward' (/root/anaconda3/envs/whisper_kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:166)
reasons: tensor 'past_key_value[0]' strides mismatch at index 0. expected 84480, actual 85760
to diagnose recompilation issues, see https://github.com/pytorch/torchdynamo/blob/main/TROUBLESHOOTING.md.
[2022-12-22 03:54:31,514] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)
function: '_shape' (/root/anaconda3/envs/whisper_kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:158)
reasons: ___check_obj_id(self, 140545768259696)
to diagnose recompilation issues, see https://github.com/pytorch/torchdynamo/blob/main/TROUBLESHOOTING.md.
[2022-12-22 03:54:38,637] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)
function: 'forward' (/root/anaconda3/envs/whisper_kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:120)
reasons: past_key_values_length == 64
to diagnose recompilation issues, see https://github.com/pytorch/torchdynamo/blob/main/TROUBLESHOOTING.md.
You can increase the cache limit by modifying the Dynamo config, as in the code posted just above:
torchdynamo.config.cache_size_limit = 512
Also, can you share your CUDA memory footprint after running the model? (See the code above for how to do it.)
With "eager", I can't raise the cache_size_limit above 64 without getting OOM
With "ofi", even at cache_size_limit=64, I'm getting OOM, also a bunch of these Warnings that I haven't had time to research
/root/anaconda3/envs/whisper_kernl/lib/python3.9/site-packages/torch/jit/_check.py:181: UserWarning: The TorchScript type system doesn't support instance-level annotations on empty non-base types in `__init__`. Instead, either 1) use a type annotation in the class body, or 2) wrap the type in `torch.jit.Attribute`.
@gaetansnl You are using a no-op compiler which doesn't free the inputs to the backward when they are no longer needed. This will incur significant memory overhead. Could you try the default inductor backend, i.e. torch.compile? If it doesn't succeed with batch size 64, at what number does it succeed?
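(A minimal sketch of what that would look like for the decoder, assuming the torch.compile entry point of the 2.0 nightlies and the model loaded as in the repro above; it replaces the manual forward wrapper used earlier in the thread:)

import torch

# Compile only the decoder with the default inductor backend.
model.model.decoder = torch.compile(model.model.decoder)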
@TheExGenesis if you are seeing issues different from this one, please open a new issue, thank you.
Removing high priority because this is using a non-standard backend which doesn't free inputs, so a memory regression is expected.
@eellison do you have more details on what needs to be implemented in the backend? I can't use inductor because I have a custom backend implementation.
The problem is detailed here https://github.com/pytorch/pytorch/pull/83137/#issuecomment-1211320670.
To fix it for your backend, you want to return a compiled function that takes in a list of tensors by marking _boxed_call = True, and you also want to make sure the list is cleared and the inputs are freed when they are no longer needed.
https://github.com/pytorch/pytorch/pull/83137/ is a good example of a PR to follow.
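A minimal sketch of such a backend (the compiler itself is a toy that just runs the FX graph; only the _boxed_call flag and the list-clearing pattern come from the comment and PR above, and it assumes the function is invoked through AOTAutograd's boxed calling convention):

import torch

def boxed_compiler(gm: torch.fx.GraphModule, example_inputs):
    # A real backend would lower/compile gm here; this toy version runs it as-is.
    def run(args):
        # Take ownership of the tensors and clear the caller's list so those
        # references do not keep the inputs alive longer than necessary.
        local_args = list(args)
        args.clear()
        return gm(*local_args)

    # Boxed calling convention: `run` receives a single list of tensors.
    run._boxed_call = True
    return run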
CC @SherlockNoMad for custom backend this might be a good thing to document if it's not already.
I also have out of memory with inductor:
import torch
import torch._dynamo as torchdynamo
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

torchdynamo.config.cache_size_limit = 512


def optimize_model(original_model) -> None:
    original_model.forward2 = original_model.forward

    @torchdynamo.optimize("inductor")
    def run(*args, **kwargs):
        return original_model.forward2(*args, **kwargs)

    original_model.forward = run


audio_dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large").to("cuda")
optimize_model(model.model.encoder)
optimize_model(model.model.decoder)

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
speech_data = audio_dataset[0]["audio"]["array"]
inputs = processor(speech_data, return_tensors="pt", sampling_rate=16_000).input_features.to("cuda")

with torch.inference_mode(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    predicted_ids = model.generate(inputs, min_length=25, max_length=25, num_beams=2, do_sample=False)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0]
    assert transcription == "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
I just reran the code with the eager compiler + today's nightly... and no more OOM! But 2.0.0.dev20230104+cu117 raises OOM. So basically something has been fixed since my last post.
The inductor compiler still raises OOM, but in the CUDA graph step. That's not surprising: CUDA graphs copy input tensors, and this model has a huge encoder output (which appears in the cache); if it is duplicated for each decoder sequence length, it's not surprising that it OOMs (see the sketch after the traceback below).
Traceback (most recent call last):
File "/home/geantvert/workspace/kernl/toto.py", line 30, in <module>
predicted_ids = model.generate(inputs, min_length=25, max_length=25, num_beams=2, do_sample=False)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/generation/utils.py", line 1608, in generate
return self.beam_search(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/generation/utils.py", line 2799, in beam_search
outputs = self(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1488, in _call_impl
return forward_call(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1194, in forward
outputs = self.model(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1488, in _call_impl
return forward_call(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1062, in forward
decoder_outputs = self.decoder(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1488, in _call_impl
return forward_call(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 211, in _fn
return fn(*args, **kwargs)
File "/home/geantvert/workspace/kernl/toto.py", line 14, in run
return original_model.forward2(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 767, in forward
def forward(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 211, in _fn
return fn(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 2467, in forward
return compiled_fn(full_args)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1066, in new_fn
fw_outs = call_func_with_args(compiled_fw, args, disable_amp=disable_amp)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1022, in call_func_with_args
out = normalize_as_list(f(args))
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 216, in run
return model(new_inputs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 233, in run
compiled_fn = cudagraphify_impl(model, new_inputs, static_input_idxs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 272, in cudagraphify_impl
static_inputs = [
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 273, in <listcomp>
static_input(x) if idx not in static_input_idxs else x.detach()
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 268, in static_input
buffer = torch.zeros(needed_size, dtype=x.dtype, device=x.device)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.69 GiB total capacity; 14.60 GiB already allocated; 31.06 MiB free; 21.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
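If the CUDA graph input copies are indeed the culprit here, one mitigation sketch (assuming the torch._inductor.config.triton.cudagraphs flag exists and is honored in this nightly) is to keep inductor but disable the CUDA graph capture:

import torch._inductor.config as inductor_config

# Keep inductor's codegen but skip cudagraphify, so inputs (including the large
# encoder outputs held in the cache) are not copied into static graph buffers.
inductor_config.triton.cudagraphs = False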
@gaetansnl can we close the issue?
Thanks a lot everyone!
🐛 Describe the bug
Hello! I have an out of memory error when I try to run Whisper through torchdynamo. With openai/whisper-medium, num_beams=1 and optimize_model(model.model.decoder), it works. When I set use_cache to false in generate, it segfaults instead of OOM. And I don't think the minifier is working for this case.
PyTorch: 1.14.0.dev20221130+cu117 (nightly)
Minimal reproduction
Error logs
Minified repro
No response