pytorch / TensorRT

PyTorch/TorchScript/FX compiler for NVIDIA GPUs using TensorRT
https://pytorch.org/TensorRT

🐛 [Bug] ScatterLayer API Usage Error #3304

Open · HolyWu opened this issue 1 day ago

HolyWu commented 1 day ago

To Reproduce

import os

import torch
import torch_tensorrt

os.environ["CI_BUILD"] = "1"

class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        x[:, :, 0] = 3.1415
        return x

with torch.inference_mode():
    model = MyModule().eval().cuda()
    inputs = [torch.zeros(1, 2, 3, device="cuda")]

    trt_model = torch_tensorrt.compile(model, "dynamo", inputs, debug=True, min_block_size=1)

    print(trt_model(*inputs))
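
For reference, the module behaves as expected in eager mode; a quick sanity check (not part of the original repro, added here for comparison, and assuming the same model and inputs as above) could be:

# Hypothetical eager reference check; clone the input so the in-place write
# does not disturb the tensor later fed to the compiled model.
print(model(inputs[0].clone()))
# tensor([[[3.1415, 0.0000, 0.0000],
#          [3.1415, 0.0000, 0.0000]]], device='cuda:0')

The debug log from the compilation follows: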
DEBUG:torch_tensorrt.dynamo.lowering.passes.remove_detach:Removed 0 detach nodes:
graph():
    %c_lifted_tensor_0 : [num_users=1] = placeholder[target=c_lifted_tensor_0]
    %x : [num_users=2] = placeholder[target=x]
    %lift_fresh_copy : [num_users=1] = call_function[target=torch.ops.aten.lift_fresh_copy.default](args = (%c_lifted_tensor_0,), kwargs = {})
    %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_1, 1, 0, 9223372036854775807), kwargs = {})
    %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%slice_2, 2, 0), kwargs = {})
    %fill_ : [num_users=0] = call_function[target=torch.ops.aten.fill_.Tensor](args = (%select, %lift_fresh_copy), kwargs = {})
    return (x,)
WARNING:py.warnings:/home/holywu/.local/lib/python3.12/site-packages/torch/export/_unlift.py:75: UserWarning: Attempted to insert a get_attr Node with no underlying reference in the owning GraphModule! Call GraphModule.add_submodule to add the necessary submodule, GraphModule.add_parameter to add the necessary Parameter, or nn.Module.register_buffer to add the necessary buffer
  getattr_node = gm.graph.get_attr(lifted_node)

WARNING:py.warnings:/home/holywu/.local/lib/python3.12/site-packages/torch/fx/graph.py:1800: UserWarning: Node lifted_tensor_0 target lifted_tensor_0 lifted_tensor_0 of  does not reference an nn.Module, nn.Parameter, or buffer, which is what 'get_attr' Nodes typically target
  warnings.warn(

WARNING:py.warnings:/home/holywu/.local/lib/python3.12/site-packages/torch/export/_unlift.py:75: UserWarning: Attempted to insert a get_attr Node with no underlying reference in the owning GraphModule! Call GraphModule.add_submodule to add the necessary submodule, GraphModule.add_parameter to add the necessary Parameter, or nn.Module.register_buffer to add the necessary buffer
  getattr_node = gm.graph.get_attr(lifted_node)

WARNING:py.warnings:/home/holywu/.local/lib/python3.12/site-packages/torch/fx/graph.py:1800: UserWarning: Node lifted_tensor_0 target lifted_tensor_0 lifted_tensor_0 of  does not reference an nn.Module, nn.Parameter, or buffer, which is what 'get_attr' Nodes typically target
  warnings.warn(

DEBUG:torch_tensorrt.dynamo._compiler:Input graph: graph():
    %lifted_tensor_0 : [num_users=1] = get_attr[target=lifted_tensor_0]
    %x : [num_users=3] = placeholder[target=x]
    %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_1, 1, 0, 9223372036854775807), kwargs = {})
    %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%slice_2, 2, 0), kwargs = {})
    %copy : [num_users=1] = call_function[target=torch.ops.aten.copy.default](args = (%select, %lifted_tensor_0), kwargs = {})
    %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_4 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_3, 1, 0, 9223372036854775807), kwargs = {})
    %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%copy, 2), kwargs = {})
    %full : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([1, 2], 1), kwargs = {dtype: torch.int64, layout: torch.strided, device: cpu, pin_memory: False})
    %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%full, 0), kwargs = {})
    %unsqueeze_1 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%mul, 2), kwargs = {})
    %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_1], 2), kwargs = {})
    %_to_copy : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%cat,), kwargs = {dtype: torch.int64, layout: torch.strided, device: cuda:0})
    %scatter : [num_users=1] = call_function[target=torch.ops.aten.scatter.src](args = (%slice_4, 2, %_to_copy, %unsqueeze), kwargs = {})
    %copy__default : [num_users=1] = call_function[target=torch.ops.aten.copy_.default](args = (%x, %scatter), kwargs = {})
    return (copy__default,)
DEBUG:torch_tensorrt.dynamo.lowering.passes.constant_folding:Graph after constant folding:
graph():
    %lifted_tensor_0 : [num_users=1] = get_attr[target=lifted_tensor_0]
    %x : [num_users=3] = placeholder[target=x]
    %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_1, 1, 0, 9223372036854775807), kwargs = {})
    %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%slice_2, 2, 0), kwargs = {})
    %copy : [num_users=1] = call_function[target=torch.ops.aten.copy.default](args = (%select, %lifted_tensor_0), kwargs = {})
    %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_4 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_3, 1, 0, 9223372036854775807), kwargs = {})
    %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%copy, 2), kwargs = {})
    %_frozen_param0 : [num_users=1] = get_attr[target=_frozen_param0]
    %scatter : [num_users=1] = call_function[target=torch.ops.aten.scatter.src](args = (%slice_4, 2, %_frozen_param0, %unsqueeze), kwargs = {})
    %copy__default : [num_users=1] = call_function[target=torch.ops.aten.copy_.default](args = (%x, %scatter), kwargs = {})
    return (copy__default,)
DEBUG:torch_tensorrt.dynamo.lowering.passes.remove_assert_scalar:Removed 0 assert_scalar nodes:
graph():
    %lifted_tensor_0 : [num_users=1] = get_attr[target=lifted_tensor_0]
    %x : [num_users=3] = placeholder[target=x]
    %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_1, 1, 0, 9223372036854775807), kwargs = {})
    %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%slice_2, 2, 0), kwargs = {})
    %copy : [num_users=1] = call_function[target=torch.ops.aten.copy.default](args = (%select, %lifted_tensor_0), kwargs = {})
    %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_4 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_3, 1, 0, 9223372036854775807), kwargs = {})
    %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%copy, 2), kwargs = {})
    %_frozen_param0 : [num_users=1] = get_attr[target=_frozen_param0]
    %scatter : [num_users=1] = call_function[target=torch.ops.aten.scatter.src](args = (%slice_4, 2, %_frozen_param0, %unsqueeze), kwargs = {})
    %copy__default : [num_users=1] = call_function[target=torch.ops.aten.copy_.default](args = (%x, %scatter), kwargs = {})
    return (copy__default,)
DEBUG:torch_tensorrt.dynamo.lowering.passes.accumulate_fp32_matmul:Skipping FP32 accumulation for matmul layers as use_fp32_acc is not enabled in the compilation settings
DEBUG:torch_tensorrt.dynamo._compiler:Lowered Input graph: graph():
    %lifted_tensor_0 : [num_users=1] = get_attr[target=lifted_tensor_0]
    %x : [num_users=3] = placeholder[target=x]
    %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_1, 1, 0, 9223372036854775807), kwargs = {})
    %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%slice_2, 2, 0), kwargs = {})
    %copy : [num_users=1] = call_function[target=torch.ops.aten.copy.default](args = (%select, %lifted_tensor_0), kwargs = {})
    %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_4 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_3, 1, 0, 9223372036854775807), kwargs = {})
    %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%copy, 2), kwargs = {})
    %_frozen_param0 : [num_users=1] = get_attr[target=_frozen_param0]
    %scatter : [num_users=1] = call_function[target=torch.ops.aten.scatter.src](args = (%slice_4, 2, %_frozen_param0, %unsqueeze), kwargs = {})
    %copy__default : [num_users=1] = call_function[target=torch.ops.aten.copy_.default](args = (%x, %scatter), kwargs = {})
    return (copy__default,)
DEBUG:torch_tensorrt.dynamo.partitioning._global_partitioner:
Supported Nodes:
- torch.ops.aten.slice.Tensor + Operator Count: 4
- torch.ops.aten.select.int + Operator Count: 1
- torch.ops.aten.copy.default + Operator Count: 1
- torch.ops.aten.unsqueeze.default + Operator Count: 1
- torch.ops.aten.scatter.src + Operator Count: 1

DEBUG:torch_tensorrt.dynamo.partitioning._global_partitioner:
All Nodes Supported

DEBUG:torch_tensorrt.dynamo._compiler:Detected support for 8 operators out of 9 in subgraph.
WARNING:torch_tensorrt.dynamo._compiler:Node copy__default of op type call_function does not have metadata. This could sometimes lead to undefined behavior.
WARNING:torch_tensorrt.dynamo._compiler:Some nodes do not have metadata (shape and dtype information). This could lead to problems sometimes if the graph has PyTorch and TensorRT segments.
INFO:torch_tensorrt.dynamo._compiler:Partitioning the graph via the fast partitioner
WARNING:py.warnings:/home/holywu/.local/lib/python3.12/site-packages/torch/fx/graph.py:1800: UserWarning: Node lifted_tensor_0 target lifted_tensor_0 lifted_tensor_0 of  does not reference an nn.Module, nn.Parameter, or buffer, which is what 'get_attr' Nodes typically target
  warnings.warn(

DEBUG:torch_tensorrt.dynamo.partitioning._adjacency_partitioner:
Number of TensorRT-Accelerated Engines Generated: 1
DEBUG:torch_tensorrt.dynamo.partitioning._adjacency_partitioner:
Supported Nodes:
- torch.ops.aten.slice.Tensor + Operator Count: 4
- torch.ops.aten.select.int + Operator Count: 1
- torch.ops.aten.copy.default + Operator Count: 1
- torch.ops.aten.unsqueeze.default + Operator Count: 1
- torch.ops.aten.scatter.src + Operator Count: 1

DEBUG:torch_tensorrt.dynamo.partitioning._adjacency_partitioner:
All Nodes Supported

DEBUG:torch_tensorrt.dynamo._compiler:Updated metadata for node: _run_on_acc_0 with its corresponding submodule outputs
DEBUG:torch_tensorrt.dynamo._compiler:Converting submodule: _run_on_acc_0
 Input shapes: [(1, 2, 3)]
 graph():
    %x : [num_users=2] = placeholder[target=x]
    %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_1, 1, 0, 9223372036854775807), kwargs = {})
    %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%slice_2, 2, 0), kwargs = {})
    %lifted_tensor_0 : [num_users=1] = get_attr[target=lifted_tensor_0]
    %copy : [num_users=1] = call_function[target=torch.ops.aten.copy.default](args = (%select, %lifted_tensor_0), kwargs = {})
    %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%x, 0, 0, 9223372036854775807), kwargs = {})
    %slice_4 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%slice_3, 1, 0, 9223372036854775807), kwargs = {})
    %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%copy, 2), kwargs = {})
    %_frozen_param0 : [num_users=1] = get_attr[target=_frozen_param0]
    %scatter : [num_users=1] = call_function[target=torch.ops.aten.scatter.src](args = (%slice_4, 2, %_frozen_param0, %unsqueeze), kwargs = {})
    return scatter
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node x (kind: x, args: ())
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Adding input to in-progress INetwork: x [shape=[1, 2, 3], dtype=DataType.FLOAT]
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node x [x] (Inputs: () | Outputs: (x: (1, 2, 3)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /slice_1 (kind: aten.slice.Tensor, args: ('x <Node>', '0 <int>', '0 <int>', '9223372036854775807 <int>'))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /slice_1 [aten.slice.Tensor] (Inputs: (x: (1, 2, 3)@torch.float32, 0, 0, 9223372036854775807) | Outputs: (slice_1: (1, 2, 3)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /slice_2 (kind: aten.slice.Tensor, args: ('slice_1 <Node>', '1 <int>', '0 <int>', '9223372036854775807 <int>'))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /slice_2 [aten.slice.Tensor] (Inputs: (slice_1: (1, 2, 3)@torch.float32, 1, 0, 9223372036854775807) | Outputs: (slice_2: (1, 2, 3)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /select (kind: aten.select.int, args: ('slice_2 <Node>', '2 <int>', '0 <int>'))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /select [aten.select.int] (Inputs: (slice_2: (1, 2, 3)@torch.float32, 2, 0) | Outputs: (select: (1, 2)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node lifted_tensor_0 (kind: lifted_tensor_0, args: ())
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node lifted_tensor_0 [lifted_tensor_0] (Inputs: () | Outputs: (lifted_tensor_0: ()@float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /copy (kind: aten.copy.default, args: ('select <Node>', 'lifted_tensor_0 <Node>'))
DEBUG:torch_tensorrt.dynamo.conversion.converter_utils:Freezing tensor /copy_constant_1 to TRT IConstantLayer
DEBUG:torch_tensorrt.dynamo.conversion.converter_utils:Scalar numpy detected at /copy_constant_1, adding rank (1,)
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /copy [aten.copy.default] (Inputs: (select: (1, 2)@torch.float32, lifted_tensor_0: ()@float32) | Outputs: (copy: (1, 2)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /slice_3 (kind: aten.slice.Tensor, args: ('x <Node>', '0 <int>', '0 <int>', '9223372036854775807 <int>'))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /slice_3 [aten.slice.Tensor] (Inputs: (x: (1, 2, 3)@torch.float32, 0, 0, 9223372036854775807) | Outputs: (slice_3: (1, 2, 3)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /slice_4 (kind: aten.slice.Tensor, args: ('slice_3 <Node>', '1 <int>', '0 <int>', '9223372036854775807 <int>'))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /slice_4 [aten.slice.Tensor] (Inputs: (slice_3: (1, 2, 3)@torch.float32, 1, 0, 9223372036854775807) | Outputs: (slice_4: (1, 2, 3)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /unsqueeze (kind: aten.unsqueeze.default, args: ('copy <Node>', '2 <int>'))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /unsqueeze [aten.unsqueeze.default] (Inputs: (copy: (1, 2)@torch.float32, 2) | Outputs: (unsqueeze: (1, 2, 1)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node _frozen_param0 (kind: _frozen_param0, args: ())
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node _frozen_param0 [_frozen_param0] (Inputs: () | Outputs: (_frozen_param0: (1, 2, 1)@int64))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /scatter (kind: aten.scatter.src, args: ('slice_4 <Node>', '2 <int>', '_frozen_param0 <Node>', 'unsqueeze <Node>'))
DEBUG:torch_tensorrt.dynamo.conversion.converter_utils:Freezing tensor /scatter_constant_2 to TRT IConstantLayer
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /scatter [aten.scatter.src] (Inputs: (slice_4: (1, 2, 3)@torch.float32, 2, _frozen_param0: (1, 2, 1)@int64, unsqueeze: (1, 2, 1)@torch.float32) | Outputs: (scatter: (1, 2, 3)@torch.float32))
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node output (kind: output, args: ('scatter <Node>',))
ERROR:torch_tensorrt [TensorRT Conversion Context]:ITensor::getDimensions: Error Code 4: API Usage Error ([SCATTER]-[aten_ops.scatter.src]-[/scatter_scatter_layer]: ScatterLayer in elements mode all inputs tensors rank must be same. Input 0 rank is 3, input 1 rank is 3, and input 2 rank is 2.)
DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Marking output output0 [shape=(81), dtype=DataType.FLOAT]
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node output [output] (Inputs: (scatter: (1, 2, 3)@torch.float32) | Outputs: (output: ))
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT INetwork construction elapsed time: 0:00:00.033800
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Not found cached TRT engines. Start building engine.
ERROR:torch_tensorrt [TensorRT Conversion Context]:ITensor::getDimensions: Error Code 4: API Usage Error (Output shape can not be computed for node [SCATTER]-[aten_ops.scatter.src]-[/scatter_scatter_layer].)
ERROR:torch_tensorrt [TensorRT Conversion Context]:IBuilder::buildSerializedNetwork: Error Code 4: API Usage Error ([SCATTER]-[aten_ops.scatter.src]-[/scatter_scatter_layer]: ScatterLayer in elements mode all inputs tensors rank must be same. Input 0 rank is 3, input 1 rank is 3, and input 2 rank is 2.)
Traceback (most recent call last):
  File "/home/holywu/scatter.py", line 22, in <module>
    trt_model = torch_tensorrt.compile(model, "dynamo", inputs, debug=True, min_block_size=1)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/holywu/.local/lib/python3.12/site-packages/torch_tensorrt/_compile.py", line 286, in compile
    trt_graph_module = dynamo_compile(
                       ^^^^^^^^^^^^^^^
  File "/home/holywu/.local/lib/python3.12/site-packages/torch_tensorrt/dynamo/_compiler.py", line 608, in compile
    trt_gm = compile_module(
             ^^^^^^^^^^^^^^^
  File "/home/holywu/.local/lib/python3.12/site-packages/torch_tensorrt/dynamo/_compiler.py", line 810, in compile_module
    trt_module = convert_module(
                 ^^^^^^^^^^^^^^^
  File "/home/holywu/.local/lib/python3.12/site-packages/torch_tensorrt/dynamo/conversion/_conversion.py", line 90, in convert_module
    interpreter_result = interpret_module_to_result(
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/holywu/.local/lib/python3.12/site-packages/torch_tensorrt/dynamo/conversion/_conversion.py", line 69, in interpret_module_to_result
    interpreter_result = interpreter.run()
                         ^^^^^^^^^^^^^^^^^
  File "/home/holywu/.local/lib/python3.12/site-packages/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py", line 645, in run
    assert serialized_engine
           ^^^^^^^^^^^^^^^^^
AssertionError
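
According to the TensorRT errors above, the aten.scatter.src converter builds the ScatterLayer in elements mode with an updates tensor (input 2) of rank 2, while the data (input 0) and indices (input 1) tensors are rank 3; TensorRT requires all three inputs to have the same rank in this mode. Until the converter is fixed, one possible (unverified) workaround is to exclude the scatter op from TensorRT conversion so it falls back to PyTorch:

# Workaround sketch, not verified against this bug: run aten.scatter.src in
# PyTorch instead of converting it to a TensorRT ScatterLayer.
trt_model = torch_tensorrt.compile(
    model,
    "dynamo",
    inputs,
    debug=True,
    min_block_size=1,
    torch_executed_ops={torch.ops.aten.scatter.src},
)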

Environment