[Build] --external_graph_transformer_path doesn't. --test_external_transformer_example removed from build.py?

thiagocrepaldi commented 3 months ago

Describe the issue

@snnn FYI

https://github.com/microsoft/onnxruntime/pull/9478/ was introduced to allow External Graph Transformers to be compiled along with ORT through the flag --external_graph_transformer_path PATH. This PR also introduced the flag --test_external_transformer_example to test it on CI.

However, https://github.com/microsoft/onnxruntime/pull/15416/ (accidentally?!) removed the --test_external_transformer_example flag and CI doesn't test --external_graph_transformer_path anymore, which is now broken (didn't track the exact commit for the regression though)

The repro for --external_graph_transformer_path is from the original test on the PR #9478:


# run_repro.py
import sys
import threading
import time

class OutputGrabber(object):
    """
    Class used to grab standard output or another stream.
    """
    escape_char = "\b"

    def __init__(self, stream=None, threaded=False):
        self.origstream = stream
        self.threaded = threaded
        if self.origstream is None:
            self.origstream = sys.stdout
        self.origstreamfd = self.origstream.fileno()
        self.capturedtext = ""
        # Create a pipe so the stream can be captured:
        self.pipe_out, self.pipe_in = os.pipe()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        self.stop()

    def start(self):
        """
        Start capturing the stream data.
        """
        self.capturedtext = ""
        # Save a copy of the stream:
        self.streamfd = os.dup(self.origstreamfd)
        # Replace the original stream with our write pipe:
        os.dup2(self.pipe_in, self.origstreamfd)
        if self.threaded:
            # Start thread that will read the stream:
            self.workerThread = threading.Thread(target=self.readOutput)
            self.workerThread.start()
            # Make sure that the thread is running and os.read() has executed:
            time.sleep(0.01)

    def stop(self):
        """
        Stop capturing the stream data and save the text in `capturedtext`.
        """
        # Print the escape character to make the readOutput method stop:
        self.origstream.write(self.escape_char)
        # Flush the stream to make sure all our data goes in before
        # the escape character:
        self.origstream.flush()
        if self.threaded:
            # wait until the thread finishes so we are sure that
            # we have until the last character:
            self.workerThread.join()
        else:
            self.readOutput()
        # Close the pipe:
        os.close(self.pipe_in)
        os.close(self.pipe_out)
        # Restore the original stream:
        os.dup2(self.streamfd, self.origstreamfd)
        # Close the duplicate stream:
        os.close(self.streamfd)

    def readOutput(self):
        """
        Read the stream data (one byte at a time)
        and save the text in `capturedtext`.
        """
        while True:
            char = os.read(self.pipe_out,1).decode(self.origstream.encoding)
            if not char or self.escape_char in char:
                break
            self.capturedtext += char

import torch
from onnxruntime.capi import _pybind_state as torch_ort_eager
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
from onnxruntime.training import optim, orttrainer, orttrainer_options
import unittest

def my_loss(x, target):
    return F.nll_loss(F.log_softmax(x, dim=1), target)

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x, target):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return my_loss(out, target)

class OrtEPTests(unittest.TestCase):
    def test_external_graph_transformer_triggering(self):
        input_size = 784
        hidden_size = 500
        num_classes = 10
        batch_size = 128
        model = NeuralNet(input_size, hidden_size, num_classes)

        model_desc = {'inputs': [('x', [batch_size, input_size]),
                                    ('target', [batch_size,])],
                                    'outputs': [('loss', [], True)]}
        optim_config = optim.SGDConfig()
        opts =  orttrainer.ORTTrainerOptions({'device':{'id':'cpu'}})
        model = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
        # because orttrainer is lazy initialized, feed in a random data to trigger the graph transformer
        data = torch.rand(batch_size, input_size)
        target = torch.randint(0, 10, (batch_size,))

        with OutputGrabber() as out:
            loss = model.train_step(data, target)
        assert '******************Trigger Customized Graph Transformer:  MyGraphTransformer!' in out.capturedtext

if __name__ == '__main__':
    unittest.main()

# /path/to/my_external_graph_transformer.cpp
#include "core/optimizer/rewrite_rule.h"
#include "orttraining/core/optimizer/graph_transformer_registry.h"
#include "onnx/defs/schema.h"
#include <memory>
#include <iostream>

namespace onnxruntime {
namespace training {

class MyRewriteRule : public RewriteRule {
public:
  MyRewriteRule() noexcept
      : RewriteRule("MyRewriteRule") {
  }
  std::vector<std::string> TargetOpTypes() const noexcept override {
    return {};
  }

private:
  bool SatisfyCondition(const Graph& /*graph*/, const Node& /*node*/, const logging::Logger& /*logger*/) const override {
    return true;
  }

  Status Apply(Graph& /*graph*/, Node& /*node*/, RewriteRuleEffect& /*rule_effect*/, const logging::Logger& /*logger*/) const override{
    std::cout << "******************Trigger Customized Graph Transformer:  MyGraphTransformer!" << std::endl;
    return Status::OK();
  }
};

void RegisterTrainingExternalTransformers() {
  ONNX_REGISTER_EXTERNAL_REWRITE_RULE(MyRewriteRule, Level1, true);
}

}
}

Urgency

No response

Target platform

Linux

Build script

Then build with

./build.sh --config RelWithDebInfo --skip_tests --build_shared_lib --parallel --external_graph_transformer_path /home/tfernand/dev/workspace/onnxruntime_external_graph_transformer/external_graph_transformers/

Error / output

[ 47%] Building CXX object CMakeFiles/onnxruntime_optimizer.dir/home/tfernand/dev/workspace/onnxruntime_external_graph_transformer/external_graph_transformers/my_external_graph_transformer.cc.o
[ 50%] Built target onnxruntime_mlas_test
/home/tfernand/dev/workspace/onnxruntime_external_graph_transformer/external_graph_transformers/my_external_graph_transformer.cc:2:10: fatal error: orttraining/core/optimizer/graph_transformer_registry.h: No such file or directory
    2 | #include "orttraining/core/optimizer/graph_transformer_registry.h"
      |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
gmake[2]: *** [CMakeFiles/onnxruntime_optimizer.dir/build.make:1350: CMakeFiles/onnxruntime_optimizer.dir/home/tfernand/dev/workspace/onnxruntime_external_graph_transformer/external_graph_transformers/my_external_graph_transformer.cc.o] Error 1
gmake[1]: *** [CMakeFiles/Makefile2:2039: CMakeFiles/onnxruntime_optimizer.dir/all] Error 2
gmake[1]: *** Waiting for unfinished jobs....
[ 66%] Built target onnxruntime_providers
gmake: *** [Makefile:146: all] Error 2
Traceback (most recent call last):
  File "/home/tfernand/dev/github/onnxruntime/tools/ci_build/build.py", line 2950, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/tfernand/dev/github/onnxruntime/tools/ci_build/build.py", line 2842, in main
    build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, args.target)
  File "/home/tfernand/dev/github/onnxruntime/tools/ci_build/build.py", line 1731, in build_targets
    run_subprocess(cmd_args, env=env)
  File "/home/tfernand/dev/github/onnxruntime/tools/ci_build/build.py", line 861, in run_subprocess
    return run(*args, cwd=cwd, capture_stdout=capture_stdout, shell=shell, env=my_env)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tfernand/dev/github/onnxruntime/tools/python/util/run.py", line 49, in run
    completed_process = subprocess.run(
                        ^^^^^^^^^^^^^^^
  File "/home/tfernand/miniconda3/envs/dev1/lib/python3.11/subprocess.py", line 571, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['/usr/bin/cmake', '--build', '/home/tfernand/dev/github/onnxruntime/build/Linux/RelWithDebInfo', '--config', 'RelWithDebInfo', '--', '-j16']' returned non-zero exit status 2.

Visual Studio Version

No response

GCC / Compiler Version

No response

snnn commented 3 months ago

The "--test_external_transformer_example" was only used in eager mode. And the PR deleted eager mode code.

thiagocrepaldi commented 3 months ago

The "--test_external_transformer_example" was only used in eager mode. And the PR deleted eager mode code.

would be possible to readd it? For hardware vendors, it is very interesting to have external graph transformers available to experiment new fusions before proposing them as contribution to ort - if they can be publicly published

github-actions[bot] commented 2 months ago

This issue has been automatically marked as stale due to inactivity and will be closed in 30 days if no further activity occurs. If further support is needed, please provide an update and/or more details.

microsoft / onnxruntime