ThanatosShinji / onnx-tool

A parser, editor and profiler tool for ONNX models.
MIT License
383 stars 51 forks source link

Incorrect MACs calculation for recurrent layers #64

Open TimCJanke opened 8 months ago

TimCJanke commented 8 months ago

Describe the bug: The MACs calculation is broken for recurrent layers, i.e., if there are T time steps, the reported MACs are off by a factor of T. The code below reproduces the bug for GRU and LSTM layers; the calculation is correct when looping over a linear layer (although the MAC numbers are not perfectly identical). For LSTM and GRU layers, thop (correctly) calculates MACs for the torch model that are 100x higher than what onnx-tool reports for the exported ONNX model.

This is the result of running code below:

Layer Type: linear

Layer Type: lstm

Layer Type: gru

To Reproduce: The code below creates a torch model with a single recurrent layer, then measures MACs for the torch model using thop and for the exported ONNX model using onnx-tool. Versions: torch == 1.13.1, onnx-tool == 0.8.5.

import torch
import torch.nn as nn
from thop import profile
import onnx_tool
from onnx_tool import create_ndarray_f32
import os
import pandas as pd

# Input size
# STEPS is the sequence length (number of time steps) fed to each layer.
STEPS = 100
# NOTE(review): BATCH_SIZE, INPUT_SIZE and HIDDEN_SIZE are used below but their
# definitions are missing from this snippet. The values here are placeholders
# so the script runs -- confirm against the original reproduction.
BATCH_SIZE = 1
INPUT_SIZE = 16
HIDDEN_SIZE = 32

# Define a simple Seq2Seq model
class Seq2SeqModel(nn.Module):
    """Single-layer sequence model used to compare MAC counts.

    The wrapped layer is selected by ``layer_type``:

    * ``"linear"`` -- an ``nn.Linear`` applied to each of the ``STEPS`` time
      steps in a Python loop.
    * ``"lstm"`` / ``"gru"`` -- the corresponding recurrent layer, created
      with ``batch_first=True`` and applied to the whole input at once.

    Raises:
        ValueError: if ``layer_type`` is not one of the three names above.
    """

    def __init__(self, layer_type: str):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.layer_type = layer_type
        if self.layer_type == "linear":
            self.layer = nn.Linear(INPUT_SIZE, HIDDEN_SIZE)
        elif self.layer_type == "lstm":
            self.layer = nn.LSTM(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
        elif self.layer_type == "gru":
            self.layer = nn.GRU(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
        else:
            raise ValueError("Invalid layer type")

    def forward(self, x):
        """Apply the wrapped layer to ``x`` and return the output sequence.

        ``x`` is indexed as ``x[:, s, :]`` for ``s`` in ``range(STEPS)``, so
        its second dimension must hold at least ``STEPS`` time steps.
        """
        if self.layer_type == "linear":
            # Apply the linear layer step by step and re-assemble the
            # per-step outputs along the time dimension.
            step_outputs = [self.layer(x[:, s, :]) for s in range(STEPS)]
            out = torch.stack(step_outputs, dim=1)
        else:
            # Recurrent layers return (output, hidden state); only the output
            # sequence is needed here.
            out, _ = self.layer(x)
        return out

# Test each layer and print results
layer_types = ["linear", "lstm", "gru"]

for layer_type in layer_types:
    model = Seq2SeqModel(layer_type=layer_type)

    # Sample input
    sample_input = torch.randn(BATCH_SIZE, STEPS, INPUT_SIZE)

    # Measure MACs and FLOPs with torch
    macs, params = profile(model, inputs=(sample_input,))

    # Print results
    print("=" * 30)
    print(f"Layer Type: {layer_type}")
    print("-" * 30)
    print("torch profiler results:")
    print("-" * 30)
    print(f"Params: {int(params)}")
    print(f"MACs: {int(macs)}")
    print("-" * 30)

    # export to onnx
    tmpfile = "tmp.onnx"
    with torch.no_grad():
        # Name the graph input explicitly so that it matches the
        # "input_1" key passed to onnx-tool below.
        torch.onnx.export(
            model,
            sample_input,
            tmpfile,
            input_names=["input_1"],
            opset_version=12,
        )

    # profile onnx model with onnx-tool and save results to csv
    # NOTE(review): the profiling call was missing from this snippet; it is
    # restored here so that the CSV read below actually exists. Confirm the
    # keyword names against the installed onnx-tool version.
    profile_csv = f"{layer_type}_profile.csv"
    onnx_tool.model_profile(
        tmpfile,
        dynamic_shapes={"input_1": create_ndarray_f32(sample_input.shape)},
        save_profile=profile_csv,
    )

    # Print results from onnx-tool profiler
    # The last row of the CSV is read as the totals for the whole model.
    df = pd.read_csv(profile_csv)
    print("-" * 30)
    print("onnx-tool profiler results:")
    print("-" * 30)
    print(f"Params: {df.iloc[-1, :]['Params']}")
    print(f"MACs: {df.iloc[-1, :]['Forward_MACs']}")
    print("=" * 30)
ThanatosShinji commented 8 months ago

Thanks for your report!

ThanatosShinji commented 8 months ago

Hello, this issue has been fixed. However, the result may still differ from thop's, because onnx-tool also includes the MACs of the sigmoid and tanh activations in its count.

ThanatosShinji commented 8 months ago

You can view the change in this commit dd895b056f1eded06cf8fede9176dba0516cd37f