facebookincubator / AITemplate

AITemplate is a Python framework that renders neural networks into high-performance CUDA/HIP C++ code, specialized for FP16 TensorCore (NVIDIA GPU) and MatrixCore (AMD GPU) inference.
Apache License 2.0

Compile issue: Tensor conv2d_bias_64_1 not in outputs for op avg_pool2d_53 #60

Open howellyoung-s opened 2 years ago

howellyoung-s commented 2 years ago

Summary

I used AITemplate to reconstruct a diffusion model that is slightly different from the one in the examples, but an error occurs when calling compile_model(). The error says a conv2d_bias tensor is not in the outputs of an avg_pool2d op, so I show the related code below; AvgPool2d is used only by the Resample module. I have reviewed the forward implementation of ResidualBlock several times but found no clue.

Code 1

nn.AvgPool2d is defined only in the Resample module and is used only in ResidualBlock.forward():

class Resample(nn.Module):
    def __init__(self, in_dim, out_dim, scale_factor, use_conv=False):
        assert scale_factor in [0.5, 1.0, 2.0]
        super(Resample, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.scale_factor = scale_factor
        self.use_conv = use_conv

        # layers
        if scale_factor == 2.0:
            self.resample = nn.Sequential(
                nn.Upsampling2d(scale_factor=scale_factor, mode='nearest'),
                nn.Conv2dBias(in_dim, out_dim, 3, 1, padding=1) if use_conv else nn.Identity())
        elif scale_factor == 0.5:
            if use_conv:
                self.resample = nn.Conv2dBias(in_dim, out_dim, 3, stride=2, padding=1) 
            else:
                self.resample = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
        else:
            self.resample = nn.Identity()

    def forward(self, x):
        return self.resample(x)

class SiLU(nn.Module):
    def __init__(self) -> None:
        super(SiLU, self).__init__()
        self.silu = ops.silu

    def forward(self, x):
        out = self.silu(x)
        return out

class ResidualBlock(nn.Module):

    def __init__(self, in_dim, embed_dim, out_dim, use_scale_shift_norm=True,
                 scale_factor=1.0, dropout=0.0):
        super(ResidualBlock, self).__init__()
        self.in_dim = in_dim
        self.embed_dim = embed_dim
        self.out_dim = out_dim
        self.use_scale_shift_norm = use_scale_shift_norm
        self.scale_factor = scale_factor

        # layers
        self.layer1 = nn.ModuleList([
            nn.GroupNorm(32, in_dim),
            SiLU(),
            nn.Conv2dBias(in_dim, out_dim, 3, 1, padding=1)])
        self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False)
        self.embedding = nn.Sequential(
            SiLU(),
            nn.Linear(embed_dim, out_dim * 2 if use_scale_shift_norm else out_dim))
        self.layer2 = nn.ModuleList([
            nn.GroupNorm(32, out_dim),
            SiLU(),
            nn.Dropout(dropout),
            nn.Conv2dBias(out_dim, out_dim, 3, 1, padding=1)])
        self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2dBias(in_dim, out_dim, 1, 1)

    def forward(self, x, e):
        hidden_states = x

        hidden_states = self.layer1[0](hidden_states)
        hidden_states_0 = self.layer1[1](hidden_states)
        x = self.resample(x)  
        hidden_states_1 = self.resample(hidden_states_0) # error may occur here ?
        hidden_states_2 = self.layer1[2](hidden_states_1) 
        e = self.embedding(e)
        bs, dim = get_shape(e)
        e = ops.reshape()(e, [bs, 1, 1, dim])

        hidden_states = hidden_states_2 + e
        hidden_states = self.layer2[0](hidden_states)
        hidden_states = self.layer2[1](hidden_states)
        hidden_states = self.layer2[2](hidden_states)
        hidden_states = self.layer2[3](hidden_states)

        x = self.shortcut(x)
        out = hidden_states + x
        return out

Code 2

Code in convert2ait_upsampler.py:

def rebuild_net(use_fp16_acc=False, convert_conv_to_gemm=False):

    ...
    net = pytorch_model().cuda().half() # use fp16
    net.eval()
    ait_net = AITUpsampler()
    ait_net.name_parameter_tensor()
    mapped_params = map_pt_params(ait_net, net)

    batch_size = 4
    hh = 256
    ww = 256
    cc = 3
    x0 = Tensor(
        [batch_size, hh, ww, cc], name="input0", is_input=True
    )
    t = Tensor([batch_size, upsampler256_config['dim']], name="input1", is_input=True)
    y = Tensor([batch_size, upsampler256_config['y_dim']], name="input2", is_input=True)
    concat = Tensor(
        [batch_size, hh, ww, cc], name="input3", is_input=True
    )

    Y_out = ait_net(x0, t, y, concat)
    target = detect_target(
        use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm
    )

    compile_model(Y_out, target, "./tmp", "AIT_UPSAMPLER256", constants=mapped_params)

Error

Traceback (most recent call last):
  File "convert2ait_upsampler.py", line 106, in <module>
    compile_net(True, True)
  File "convert2ait_upsampler.py", line 103, in compile_net
    compile_model(Y_out, target, "./tmp", "AIT_UPSAMPLER", constants=mapped_params)
  File "/home/envs/zero/lib/python3.8/site-packages/aitemplate/compiler/compiler.py", line 152, in compile_model
    compiler.transform.remove_no_ops(graph)
  File "/home/envs/zero/lib/python3.8/site-packages/aitemplate/compiler/transform/remove_no_ops.py", line 167, in remove_no_ops
    sorted_graph = f_pass(sorted_graph)
  File "/home/envs/zero/lib/python3.8/site-packages/aitemplate/compiler/transform/remove_no_ops.py", line 82, in _remove_no_op_expands
    return transform_utils.sanitize_sorted_graph(sorted_graph)
  File "/home/envs/zero/lib/python3.8/site-packages/aitemplate/compiler/transform/transform_utils.py", line 272, in sanitize_sorted_graph
    check_graph_validity(new_sorted_graph, raiseError=True)
  File "/home/envs/zero/lib/python3.8/site-packages/aitemplate/compiler/transform/transform_utils.py", line 69, in check_graph_validity
    valid = handleError(
  File "/home/envs/zero/lib/python3.8/site-packages/aitemplate/compiler/transform/transform_utils.py", line 40, in handleError
    raise RuntimeError(msg)
RuntimeError: Tensor conv2d_bias_64_1 not in outputs for op avg_pool2d_53
hlu1 commented 1 year ago

Can you try commenting out this line: https://github.com/facebookincubator/AITemplate/blob/main/python/aitemplate/compiler/transform/remove_no_ops.py#L163? I think you hit a bug in this pass. Also, are there expand ops in your model?
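A quick way to answer the expand-op question is to walk the traced graph before compiling. Below is a minimal sketch, assuming the Y_out output tensor from the report above; toposort() and the _attrs dicts are internal AITemplate APIs and may change between versions:

from aitemplate.compiler.transform import toposort

# Count expand ops in the traced graph before compile_model() runs its passes.
# toposort() returns the tensors in topological order; each tensor exposes the
# ops that produced it via src_ops().
sorted_graph = toposort(Y_out)
expand_ops = [
    op
    for tensor in sorted_graph
    for op in tensor.src_ops()
    if op._attrs["op"] == "expand"
]
print(f"expand ops in graph: {len(expand_ops)}")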

ioeddk commented 1 year ago

Is there an update on this problem? I'm facing a similar issue, and the line to be commented out above is just a parenthesis. So in the function _is_compatible_with_broadcasting, should we simply return True?
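A hedged sketch of that workaround, monkeypatching the private helper rather than editing site-packages (the helper name is taken from the source discussed above and may not exist in other versions):

import aitemplate.compiler.transform.remove_no_ops as remove_no_ops

# Debugging workaround only, not a fix: short-circuit the broadcasting check so
# the _remove_no_op_expands pass (see the traceback above) never takes the buggy path.
remove_no_ops._is_compatible_with_broadcasting = lambda *args, **kwargs: True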

My code is trying to build the backbone of YOLOX:

class YOLOPAFPN(nn.Module):
    """
    YOLOv3 model. Darknet 53 is the default backbone of this model.
    """

    def __init__(
        self,
        depth=1.0,
        width=1.0,
        in_features=("dark3", "dark4", "dark5"),
        in_channels=[256, 512, 1024],
        depthwise=False,
        act="relu",
    ):
        super().__init__()
        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, out_features=in_features, act=act)
        self.in_features = in_features
        self.in_channels = in_channels
        Conv = DWConv if depthwise else BaseConv

        self.upsample = nn.Upsampling2d(scale_factor=2, mode="nearest")
        self.lateral_conv0 = BaseConv(
            int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act
        )
        self.C3_p4 = CSPLayer(
            int(2 * in_channels[1] * width),
            int(in_channels[1] * width),
            round(3 * depth),
            False,
            depthwise=depthwise,
            act=act,
        )  # cat

        self.reduce_conv1 = BaseConv(
            int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act
        )
        self.C3_p3 = CSPLayer(
            int(2 * in_channels[0] * width),
            int(in_channels[0] * width),
            round(3 * depth),
            False,
            depthwise=depthwise,
            act=act,
        )

        # bottom-up conv
        self.bu_conv2 = Conv(
            int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act
        )
        self.C3_n3 = CSPLayer(
            int(2 * in_channels[0] * width),
            int(in_channels[1] * width),
            round(3 * depth),
            False,
            depthwise=depthwise,
            act=act,
        )

        # bottom-up conv
        self.bu_conv1 = Conv(
            int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act
        )
        self.C3_n4 = CSPLayer(
            int(2 * in_channels[1] * width),
            int(in_channels[2] * width),
            round(3 * depth),
            False,
            depthwise=depthwise,
            act=act,
        )

    def forward(self, input):
        """
        Args:
            inputs: input images.

        Returns:
            Tuple[Tensor]: FPN feature.
        """

        #  backbone
        out_features = self.backbone(input)

        features = [out_features[f] for f in self.in_features]
        [x2, x1, x0] = features

        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
        f_out0 = self.upsample(fpn_out0)  # 512/16
        f_out0 = aitcat()([f_out0, x1], 3)  # 512->1024/16
        f_out0 = self.C3_p4(f_out0)  # 1024->512/16

        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
        f_out1 = self.upsample(fpn_out1)  # 256/8
        f_out1 = aitcat()([f_out1, x2], 3)  # 256->512/8
        pan_out2 = self.C3_p3(f_out1)  # 512->256/8

        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
        p_out1 = aitcat()([p_out1, fpn_out1], 3)  # 256->512/16
        pan_out1 = self.C3_n3(p_out1)  # 512->512/16

        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
        p_out0 = aitcat()([p_out0, fpn_out0], 3)  # 512->1024/32
        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32

        # outputs = (pan_out2, pan_out1, pan_out0)
        return pan_out2

I have already replaced the convolution operations with aitemplate.frontend.nn.Conv2dBias (a sketch of this mapping follows the traceback below) and can compile successfully when the model only computes up to f_out0. My error looks like this:

...
2023-07-17 19:50:15,384 DEBUG <aitemplate.compiler.transform.name_graph> before name_graph: func_cnt=101, tensor_cnt=0, len(func_name_to_tensor_cnt)=101, len(user_provided_dim)=546
2023-07-17 19:50:15,385 DEBUG <aitemplate.compiler.transform.name_graph> after name_graph: func_cnt=101, tensor_cnt=0, len(func_name_to_tensor_cnt)=101, len(user_provided_dim)=546
2023-07-17 19:50:15,565 DEBUG <aitemplate.utils.graph_utils> Dumped dedup_symbolic_name visualization to ./tmp/test_compile_yolox_backbone_1/dedup_symbolic_name_graph_vis.html
2023-07-17 19:50:15,569 INFO <aitemplate.compiler.transform.memory_planning> Workspace shared_size=0 unique_size=0
2023-07-17 19:50:15,569 INFO <aitemplate.compiler.transform.memory_planning> max_blob=3670016 constant_offset=0
Traceback (most recent call last):
  File "/workspaces/torchsparse-misc/conversion/yolox/test_ait_model.py", line 72, in <module>
    model_compiled = compile_module(
  File "/workspaces/torchsparse-misc/conversion/yolox/test_ait_model.py", line 44, in compile_module
    module = compile_model(y, target, "./tmp", model_name)
  File "/usr/local/lib/python3.8/dist-packages/aitemplate/utils/misc.py", line 93, in inner_function
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/compiler.py", line 308, in compile_model
    _verify_outputs_still_in_graph(graph, output_tensors)
  File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/compiler.py", line 95, in _verify_outputs_still_in_graph
    raise ValueError(
ValueError: Output output_0 was not found in the graph after optimizations.
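
For reference, a minimal sketch of the conv replacement mentioned above, using the bu_conv2 channel numbers (width=1.0) as an example; the call form mirrors the Conv2dBias usage in the first report, and AITemplate convs expect NHWC layout:

from aitemplate.frontend import nn

# PyTorch's nn.Conv2d(256, 256, 3, stride=2, padding=1) plus its bias maps to a
# single fused conv+bias op in AITemplate; tensors are NHWC rather than NCHW.
bu_conv2 = nn.Conv2dBias(256, 256, 3, stride=2, padding=1)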
hlky commented 1 year ago

Ran into a similar issue while working on RRDBNet.

    def forward(self, x):
        x1 = self.lrelu(self.conv1(x))
        x2 = self.lrelu(self.conv2(self.cat((x, x1), 3)))
        x3 = self.lrelu(self.conv3(self.cat((x, x1, x2), 3)))
        x4 = self.lrelu(self.conv4(self.cat((x, x1, x2, x3), 3)))
        x5 = self.conv5(self.cat((x, x1, x2, x3, x4), 3))
        # Empirically, we use 0.2 to scale the residual for better performance
        out = x5 * 0.2 + x
        return out

When using ops.concatenate directly, it appears that the subsequent conv layers are not used; only the first one is.
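(Here "directly" means constructing the op inline in forward, roughly as in the hypothetical line below, rather than through the wrapper shown at the end of this comment.)

# Hypothetical direct form that triggers the pruning: a fresh ops.concatenate()
# built inline in forward.
x2 = self.lrelu(self.conv2(ops.concatenate()((x, x1), 3)))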

AIT dump (dump_ait_to_py):

    def model(self):
        conv2d_bias_0_0 = ops.conv2d_bias(dilate=1, group=1, pad=1, stride=1)(self.rdb_input, self.conv1_weight, self.conv1_bias)

        # Set outputs

        # End of setting outputs
        return 

Another indicator was that profiling only ran for the first conv layer.

Marking the tensors as is_input resolves the issue. Wrapping the operator also appears to resolve it:

    def cat(self, tensors, dim):
        out = ops.concatenate()(tensors, dim)
        return out
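
Usage then matches the forward snippet above, e.g. x2 = self.lrelu(self.conv2(self.cat((x, x1), 3))). Why wrapping changes the optimizer's behavior is not clear, but with the wrapper the downstream conv layers stay in the optimized graph.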