webmachinelearning / webnn

🧠 Web Neural Network API
https://www.w3.org/TR/webnn/

Survey graph-building APIs from native ecosystem #20

Closed huningxin closed 5 years ago

huningxin commented 5 years ago

Per the resolution on the 9 May 2019 CG call, this issue is for surveying graph-building APIs from the native ecosystem, with the aim of supporting the discussion in #16. The current foundation spec is a direct derivative of the Android NNAPI, which is a C-style API. During the CG call, the participants agreed to survey other graph-building APIs in the native ecosystem to learn from their API design patterns.

There were three APIs mentioned in the CG call: nGraph, MPSNNGraph, and TensorRT.

@walrusmcd mentioned he "would love to contribute our learnings from two of Microsoft's graph-building APIs". Feel free to add them to the list. Thanks.
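
For context, the NNAPI C-style pattern referenced above looks roughly like the sketch below. This is a minimal, illustrative sketch (not taken from the spec or from NNAPI sample code) that builds a single ADD of two float tensors with the plain C API from `<android/NeuralNetworks.h>`; the `BuildAddModel` wrapper is hypothetical, and compilation, execution and error handling are omitted.

    #include <android/NeuralNetworks.h>
    #include <cstdint>

    // Minimal sketch: one ADD of two {2, 2, 2, 2} float tensors, NNAPI C style.
    // The ANeuralNetworksCompilation/Execution steps and error checks are omitted.
    void BuildAddModel() {
      ANeuralNetworksModel* model = nullptr;
      ANeuralNetworksModel_create(&model);

      uint32_t dims[4] = {2, 2, 2, 2};
      ANeuralNetworksOperandType tensorType{};
      tensorType.type = ANEURALNETWORKS_TENSOR_FLOAT32;
      tensorType.dimensionCount = 4;
      tensorType.dimensions = dims;

      ANeuralNetworksOperandType scalarType{};
      scalarType.type = ANEURALNETWORKS_INT32;

      // Operands are identified by the order in which they are added.
      ANeuralNetworksModel_addOperand(model, &tensorType);  // 0: input a
      ANeuralNetworksModel_addOperand(model, &tensorType);  // 1: input b
      ANeuralNetworksModel_addOperand(model, &scalarType);  // 2: fused activation
      ANeuralNetworksModel_addOperand(model, &tensorType);  // 3: output

      int32_t fuseNone = ANEURALNETWORKS_FUSED_NONE;
      ANeuralNetworksModel_setOperandValue(model, 2, &fuseNone, sizeof(fuseNone));

      // Operations reference operands by index rather than by object.
      uint32_t addInputs[3] = {0, 1, 2};
      uint32_t addOutputs[1] = {3};
      ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD,
                                        3, addInputs, 1, addOutputs);

      uint32_t modelInputs[2] = {0, 1};
      uint32_t modelOutputs[1] = {3};
      ANeuralNetworksModel_identifyInputsAndOutputs(model, 2, modelInputs,
                                                    1, modelOutputs);
      ANeuralNetworksModel_finish(model);

      // ... create an ANeuralNetworksCompilation and ANeuralNetworksExecution here ...
      ANeuralNetworksModel_free(model);
    }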

huningxin commented 5 years ago
  • nGraph

The following code snippets use the nGraph C++ API to build and execute a computation graph that is the same as the one in the WebNN example (tensor0 and tensor2 are constants; tensor1 and tensor3 are user inputs):

tensor0 ---+
           +--- ADD ---> intermediateOutput0 ---+
tensor1 ---+                                    |
                                                +--- MUL---> output
tensor2 ---+                                    |
           +--- ADD ---> intermediateOutput1 ---+
tensor3 ---+

The following code builds the graph.

    // Build the graph
    const Shape shape{2, 2, 2, 2};
    const size_t size = shape_size(shape);
    const std::vector<float> constant_data(size, 0.5);

    auto tensor0 = std::make_shared<op::Constant>(element::f32, shape, constant_data);
    auto tensor1 = std::make_shared<op::Parameter>(element::f32, shape);
    auto tensor2 = std::make_shared<op::Constant>(element::f32, shape, constant_data);
    auto tensor3 = std::make_shared<op::Parameter>(element::f32, shape);

    auto add0 = std::make_shared<op::Add>(tensor0, tensor1);
    auto add1 = std::make_shared<op::Add>(tensor2, tensor3);

    auto mul = std::make_shared<op::Multiply>(add0, add1);

    // Make the function for the graph
    // The 1st argument specifies the results/outputs. 
    // The 2nd argument specifies the inputs.
    auto function = std::make_shared<Function>(NodeVector{mul},
                                               ParameterVector{tensor1, tensor3});

The following code compiles the graph.

    // Create the backend and compile the function
    auto backend = runtime::Backend::create("CPU");
    auto exec = backend->compile(function);

The following code executes the compiled graph.

    // Allocate tensors for inputs
    auto input0 = backend->create_tensor(element::f32, shape);
    auto input1 = backend->create_tensor(element::f32, shape);

    // Allocate tensor for output
    auto output = backend->create_tensor(element::f32, shape);

    // Initialize the input tensors
    const std::vector<float> input_data0(size, 1), input_data1(size, 2);
    input0->write(input_data0.data(), 0, sizeof(float)*input_data0.size());
    input1->write(input_data1.data(), 0, sizeof(float)*input_data1.size());

    // Invoke the function
    exec->call({output}, {input0, input1});

    // Get the result
    std::vector<float> output_data(size);
    output->read(output_data.data(), 0, sizeof(float)*output_data.size());

The complete example is hosted here.

huningxin commented 5 years ago
  • MPSNNGraph

My colleague @fujunwei helped create an example with the MPSNNGraph API.

The following code builds the graph.

  // Build the graph.
  const std::vector<int> shape = {2, 2, 2, 2};
  size_t length = 16;
  const std::vector<__fp16> constant_data(length, 0.5);
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
  id<MTLCommandBuffer> command_buffer = [[device newCommandQueue] commandBuffer];

  MPSImage* constant0 = CreateMPSImageWithData(device, constant_data, shape);
  MPSNNImageNode* tensor0 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:constant0]];
  MPSImage* input0 = CreateMPSImage(device, shape);
  MPSNNImageNode* tensor1 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:input0]];
  MPSImage* constant1 = CreateMPSImageWithData(device, constant_data, shape);
  MPSNNImageNode* tensor2 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:constant1]];
  MPSImage* input1 = CreateMPSImage(device, shape);
  MPSNNImageNode* tensor3 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:input1]];
  MPSNNAdditionNode* add_0 = [MPSNNAdditionNode nodeWithLeftSource:tensor0
                                                       rightSource:tensor1];
  MPSNNAdditionNode* add_1 = [MPSNNAdditionNode nodeWithLeftSource:tensor2
                                                       rightSource:tensor3];
  MPSNNMultiplicationNode* mul = [MPSNNMultiplicationNode nodeWithLeftSource:add_0.resultImage
                                                                 rightSource:add_1.resultImage];
  MPSNNGraph* graph = [MPSNNGraph graphWithDevice:device
                                      resultImage:mul.resultImage
                              resultImageIsNeeded:true];

The following code executes the graph with input data and gets output data.

  // Execute the graph.
  NSMutableArray<MPSImage*>* image_array = [NSMutableArray arrayWithCapacity:1];
  const std::vector<__fp16> input_data0(length, 1);
  const std::vector<__fp16> input_data1(length, 2);
  UploadDataToMPSImage(input0, input_data0);
  UploadDataToMPSImage(input1, input_data1);
  NSArray<MPSImageHandle*> * handles = graph.sourceImageHandles;
  for (size_t i = 0; i < handles.count; ++i) {
    [image_array addObject:handles[i].image];
  }
  MPSImage* output_image = [graph encodeToCommandBuffer:command_buffer
                                           sourceImages:image_array
                                           sourceStates:nullptr
                                           intermediateImages:nullptr
                                           destinationStates:nullptr];

  // Get output data.
  size_t size = length * sizeof(__fp16);
  id<MTLBuffer> output_buffer = CreateOutputBuffer(device, command_buffer, output_image, size);

  [command_buffer commit];
  [command_buffer waitUntilCompleted];
  std::vector<__fp16> output_data(length);
  memcpy(output_data.data(), [output_buffer contents], size);
  std::cout << "[";
  for (size_t i = 0; i < length; ++i) {
    std::cout << output_data[i] << ' ';
  }
  std::cout << ']' << std::endl;

The complete example is hosted here.

walrusmcd commented 5 years ago

Thanks @huningxin !

Here are two examples we would love to share:

  1. The ONNX Runtime

ONNX is an OSS standards body that drives opsets and schemas for cross-framework interchange in ML. GitHub here.

Microsoft has also contributed an ML runtime that works with ONNX, called the ONNX Runtime (ORT). It is a cross-platform, high-performance ML engine that works with multiple forms of hardware acceleration. We ship this runtime built into Windows starting with version 1809 (October 2018 Update). GitHub here

Execution providers are the abstraction layer between the runtime and the provider that supplies operator kernels (implementations).

Graphs are used to work with execution providers.

This is the mechanism that the provider can use to participate in graph optimization and rewriting.

Notice that the pattern here is to provide a full graph representation, with Nodes and Edges.

There is a lot of power that comes from this implementation in terms of what it allows for.

This is the mechanism for the provider all up.

The currency between the runtime and the provider is these IndexedSubGraphs and their Nodes.

The runtime will call IExecutionProvider::Compile() on the subgraphs that the provider says it can handle. This then allows the provider to build the kernels and compute functions for those Nodes (ops); the contract is sketched below.

This approach allows multiple providers to all participate in the execution of the Graph, or a single provider can handle the entire graph (in our case we have both CPU and DirectX GPU providers that can handle entire graphs). This is largely driven by our opset schema. It gives us great flexibility in that new ops can appear and not all providers have to be updated, as long as there exists one provider that can handle the new op.

This is how we tend to do our GPU and hardware-accelerated work, which allows for fallback to CPU.

We have reference implementations for many providers, including some that work with graphs (nGraph, TensorRT, etc.) here.
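
For readers unfamiliar with the ORT codebase, the provider-facing contract is roughly shaped like the following sketch. This is a simplified paraphrase, not the verbatim interface (see execution_provider.h in the ORT repo for the actual signatures and overloads); the stand-in types below mirror the ORT names but are declared locally only so the sketch is self-contained.

    #include <memory>
    #include <vector>

    // Stand-in declarations; in ORT these come from the onnxruntime headers.
    class GraphViewer;        // read-only view of the in-memory Graph
    class Node;               // a node (op) in the Graph
    struct NodeComputeInfo;   // holds the compute function for a fused node
    class ComputeCapability;  // wraps an IndexedSubGraph the provider claims
    class Status;             // onnxruntime::common::Status

    // Paraphrased shape of the execution provider contract (not the verbatim interface).
    class ExecutionProviderSketch {
     public:
      virtual ~ExecutionProviderSketch() = default;

      // The runtime hands the provider a view of the whole graph; the provider
      // answers with the IndexedSubGraphs (wrapped in ComputeCapability) it can handle.
      virtual std::vector<std::unique_ptr<ComputeCapability>> GetCapability(
          const GraphViewer& graph) const = 0;

      // The runtime then asks the provider to compile the nodes fused from those
      // subgraphs, producing a compute function (kernel) per fused node.
      virtual Status Compile(const std::vector<Node*>& fused_nodes,
                             std::vector<NodeComputeInfo>& node_compute_funcs) = 0;
    };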

  2. DirectML

New in Windows is a hardware abstraction layer designed for ML. This is the DirectX ML system, or DirectML. MSDN here.

Starting in version 1903 (May 2019 Update), DirectML has an operator-level interface. It has a pattern that works well for hardware optimization by breaking work into two stages: (1) initialization and (2) execution.

DirectML also participates underneath the ONNX Runtime. Using WDDM and MCDM, Windows allows IHVs to supply drivers that work with DirectML for accelerating ML workloads. This allows anyone to participate across multiple GPU and TPU/VPU vendors at the operator kernel level.
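
To make the two-stage, operator-level pattern concrete, here is a hedged sketch of the first (initialization) stage using the DirectML COM API: describe a tensor, describe an element-wise ADD operator, compile it, and create an operator initializer for it. This is a minimal illustration, not sample code from the DirectML team; it assumes a default D3D12 device, and binding tables, command recording, and the execution stage are omitted.

    #include <d3d12.h>
    #include <DirectML.h>
    #include <wrl/client.h>

    using Microsoft::WRL::ComPtr;

    // Sketch of DirectML's initialization stage for one element-wise ADD.
    // Stage 2 (execution via binding tables and IDMLCommandRecorder) is omitted.
    void SketchDirectMLAdd() {
      ComPtr<ID3D12Device> d3d12Device;
      D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&d3d12Device));

      ComPtr<IDMLDevice> dmlDevice;
      DMLCreateDevice(d3d12Device.Get(), DML_CREATE_DEVICE_FLAG_NONE,
                      IID_PPV_ARGS(&dmlDevice));

      // Describe a {2, 2, 2, 2} float32 buffer tensor (16 elements).
      UINT sizes[4] = {2, 2, 2, 2};
      DML_BUFFER_TENSOR_DESC bufferDesc = {};
      bufferDesc.DataType = DML_TENSOR_DATA_TYPE_FLOAT32;
      bufferDesc.Flags = DML_TENSOR_FLAG_NONE;
      bufferDesc.DimensionCount = 4;
      bufferDesc.Sizes = sizes;
      bufferDesc.Strides = nullptr;  // packed layout
      bufferDesc.TotalTensorSizeInBytes = 16 * sizeof(float);

      DML_TENSOR_DESC tensorDesc = {};
      tensorDesc.Type = DML_TENSOR_TYPE_BUFFER;
      tensorDesc.Desc = &bufferDesc;

      // Describe, create, and compile an element-wise ADD operator.
      DML_ELEMENT_WISE_ADD_OPERATOR_DESC addDesc = {};
      addDesc.ATensor = &tensorDesc;
      addDesc.BTensor = &tensorDesc;
      addDesc.OutputTensor = &tensorDesc;

      DML_OPERATOR_DESC opDesc = {};
      opDesc.Type = DML_OPERATOR_ELEMENT_WISE_ADD;
      opDesc.Desc = &addDesc;

      ComPtr<IDMLOperator> op;
      dmlDevice->CreateOperator(&opDesc, IID_PPV_ARGS(&op));

      ComPtr<IDMLCompiledOperator> compiledOp;
      dmlDevice->CompileOperator(op.Get(), DML_EXECUTION_FLAG_NONE,
                                 IID_PPV_ARGS(&compiledOp));

      // Stage 1: create an initializer for the compiled operator(s).
      IDMLCompiledOperator* ops[] = {compiledOp.Get()};
      ComPtr<IDMLOperatorInitializer> initializer;
      dmlDevice->CreateOperatorInitializer(1, ops, IID_PPV_ARGS(&initializer));
    }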

We are also working on future innovation around how those higher-level Graph and IndexedSubGraph interactions will work going forward with the ORT.

huningxin commented 5 years ago
  • TensorRT

To complete the list, here is an example with the TensorRT API.

The following code builds the graph.

    // Build the graph
    IBuilder* builder = createInferBuilder(logger);
    INetworkDefinition* network = builder->createNetwork();

    Dims4 dims({2, 2, 2, 2});
    const size_t length = 16;
    std::vector<float> buffer(length, 0.5);
    const Weights constant{DataType::kFLOAT, buffer.data(), length};
    ITensor* tensor0 = network->addInput("tensor0", DataType::kFLOAT, dims);
    IConstantLayer* constant1 = network->addConstant(dims, constant);
    ITensor* tensor2 = network->addInput("tensor2", DataType::kFLOAT, dims);
    IConstantLayer* constant3 = network->addConstant(dims, constant);
    IElementWiseLayer* add0 = network->addElementWise(
        *tensor0, *constant1->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* add1 = network->addElementWise(
        *tensor2, *constant3->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* mul = network->addElementWise(
        *add0->getOutput(0), *add1->getOutput(0), ElementWiseOperation::kPROD);
    mul->getOutput(0)->setName("output");
    network->markOutput(*mul->getOutput(0));  

The following code executes the graph with input data and gets output data.

    // Execute the graph
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    IExecutionContext* context = engine->createExecutionContext();
    const int input0Index = engine->getBindingIndex("tensor0");
    const int input1Index = engine->getBindingIndex("tensor2");
    const int outputIndex = engine->getBindingIndex("output");

    void* buffers[3];
    cudaMalloc(&buffers[input0Index], length * sizeof(float));
    cudaMalloc(&buffers[input1Index], length * sizeof(float));
    cudaMalloc(&buffers[outputIndex], length * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    std::vector<float> input0Data(length, 1);
    std::vector<float> input1Data(length, 2);
    cudaMemcpyAsync(buffers[input0Index], input0Data.data(),
                    input0Data.size() * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(buffers[input1Index], input1Data.data(),
                    input1Data.size() * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    context->enqueue(1, buffers, stream, nullptr);
    float output[length];
    cudaMemcpyAsync(output, buffers[outputIndex],
                    length * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    // Print output
    std::cout << "output: [";
    for (unsigned int i = 0; i < length; i++)
    {
        std::cout << output[i] << " ";
    }
    std::cout << "]" << std::endl;

huningxin commented 5 years ago

As https://github.com/webmachinelearning/webnn/pull/22 has been merged, closing this one.