Closed huningxin closed 5 years ago
- nGraph
The following code snippets use nGraph C++ API to build and execute a computation graph same as one in WebNN example (tensor0 and tensor2 are constants, tensor1 and tensor3 are user inputs):
tensor0 ---+
           +--- ADD ---> intermediateOutput0 ---+
tensor1 ---+                                    |
                                                +--- MUL ---> output
tensor2 ---+                                    |
           +--- ADD ---> intermediateOutput1 ---+
tensor3 ---+
The following code builds the graph.
// Build the graph
// A 2x2x2x2 shape => 16 elements per tensor.
const Shape shape{2, 2, 2, 2};
const size_t size = shape_size(shape);
// Backing data for the two constant tensors (tensor0, tensor2): all 0.5.
const std::vector<float> constant_data(size, 0.5);
auto tensor0 = std::make_shared<op::Constant>(element::f32, shape, constant_data);
// Parameters (tensor1, tensor3) are placeholders bound to user inputs at call time.
auto tensor1 = std::make_shared<op::Parameter>(element::f32, shape);
auto tensor2 = std::make_shared<op::Constant>(element::f32, shape, constant_data);
auto tensor3 = std::make_shared<op::Parameter>(element::f32, shape);
// intermediateOutput0 = tensor0 + tensor1
auto add0 = std::make_shared<op::Add>(tensor0, tensor1);
// intermediateOutput1 = tensor2 + tensor3
auto add1 = std::make_shared<op::Add>(tensor2, tensor3);
// output = intermediateOutput0 * intermediateOutput1 (element-wise)
auto mul = std::make_shared<op::Multiply>(add0, add1);
// Make the function for the graph
// The 1st argument specifies the results/outputs.
// The 2nd argument specifies the inputs.
auto function = std::make_shared<Function>(NodeVector{mul},
ParameterVector{tensor1, tensor3});
The following code compiles the graph.
// Create the backend and compile the function
// "CPU" selects nGraph's CPU backend; compile() produces an executable
// that can be invoked repeatedly with different input tensors.
auto backend = runtime::Backend::create("CPU");
auto exec = backend->compile(function);
The following code executes the compiled graph.
// Allocate tensors for inputs
auto input0 = backend->create_tensor(element::f32, shape);
auto input1 = backend->create_tensor(element::f32, shape);
// Allocate tensor for output
auto output = backend->create_tensor(element::f32, shape);
// Initialize the input tensors
// input0 (bound to tensor1) holds all 1s; input1 (bound to tensor3) all 2s.
const std::vector<float> input_data0(size, 1), input_data1(size, 2);
// write(src, byte_offset, byte_count) copies host data into the backend tensor.
input0->write(input_data0.data(), 0, sizeof(float)*input_data0.size());
input1->write(input_data1.data(), 0, sizeof(float)*input_data1.size());
// Invoke the function
// Outputs first, then inputs, in the order declared in the Function above.
exec->call({output}, {input0, input1});
// Get the result
// Every element should be (0.5 + 1) * (0.5 + 2) = 3.75.
std::vector<float> output_data(size);
output->read(output_data.data(), 0, sizeof(float)*output_data.size());
The complete example is hosted here.
- MPSNNGraph
My colleague @fujunwei helped create an example with MPSNNGraph API.
The following code builds the graph.
// Build the graph.
// 2x2x2x2 => 16 elements; the MPS images use half-precision (__fp16) data.
const std::vector<int> shape = {2, 2, 2, 2};
size_t length = 16;
const std::vector<__fp16> constant_data(length, 0.5);
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
id<MTLCommandBuffer> command_buffer = [[device newCommandQueue] commandBuffer];
// Constant inputs are pre-populated MPSImages wrapped in image nodes
// (CreateMPSImageWithData / CreateMPSImage are helpers defined in the full example).
MPSImage* constant0 = CreateMPSImageWithData(device, constant_data, shape);
MPSNNImageNode* tensor0 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
initWithImage:constant0]];
// User inputs are empty MPSImages filled later via UploadDataToMPSImage.
MPSImage* input0 = CreateMPSImage(device, shape);
MPSNNImageNode* tensor1 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
initWithImage:input0]];
MPSImage* constant1 = CreateMPSImageWithData(device, constant_data, shape);
MPSNNImageNode* tensor2 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
initWithImage:constant1]];
MPSImage* input1 = CreateMPSImage(device, shape);
MPSNNImageNode* tensor3 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
initWithImage:input1]];
// add_0 = tensor0 + tensor1; add_1 = tensor2 + tensor3
MPSNNAdditionNode* add_0 = [MPSNNAdditionNode nodeWithLeftSource:tensor0
rightSource:tensor1];
MPSNNAdditionNode* add_1 = [MPSNNAdditionNode nodeWithLeftSource:tensor2
rightSource:tensor3];
// mul = add_0 * add_1; nodes are chained via their resultImage outputs.
MPSNNMultiplicationNode* mul = [MPSNNMultiplicationNode nodeWithLeftSource:add_0.resultImage
rightSource:add_1.resultImage];
// resultImageIsNeeded:true keeps the final image readable after execution.
MPSNNGraph* graph = [MPSNNGraph graphWithDevice:device
resultImage:mul.resultImage
resultImageIsNeeded:true];
The following code executes the graph with input data and gets output data.
// Execute the graph.
// arrayWithCapacity: is only a size hint; the array grows as handles are added.
NSMutableArray<MPSImage*>* image_array = [NSMutableArray arrayWithCapacity:1];
const std::vector<__fp16> input_data0(length, 1);
const std::vector<__fp16> input_data1(length, 2);
UploadDataToMPSImage(input0, input_data0);
UploadDataToMPSImage(input1, input_data1);
// Source images must be supplied in the order the graph reports its handles.
NSArray<MPSImageHandle*> * handles = graph.sourceImageHandles;
for (size_t i = 0; i < handles.count; ++i) {
[image_array addObject:handles[i].image];
}
// Encode the whole graph onto the command buffer; work runs on commit below.
MPSImage* output_image = [graph encodeToCommandBuffer:command_buffer
sourceImages:image_array
sourceStates:nullptr
intermediateImages:nullptr
destinationStates:nullptr];
// Get output data.
size_t size = length * sizeof(__fp16);
// CreateOutputBuffer encodes a copy of output_image into a CPU-readable buffer
// on the same command buffer (helper defined in the full example).
id<MTLBuffer> output_buffer = CreateOutputBuffer(device, command_buffer, output_image, size);
// Submit all encoded work and block until the GPU finishes.
[command_buffer commit];
[command_buffer waitUntilCompleted];
std::vector<__fp16> output_data(length);
memcpy(output_data.data(), [output_buffer contents], size);
// Expected element value: (0.5 + 1) * (0.5 + 2) = 3.75.
std::cout << "[";
for (size_t i = 0; i < length; ++i) {
std::cout << output_data[i] << ' ';
}
std::cout << ']' << std::endl;
The complete example is hosted here.
Thanks @huningxin !
Here are two examples we would love to share:
ONNX is an OSS standards body that drives opsets and schema for cross framework interchange in ML. GitHub here.
Microsoft has also contributed a ML Runtime that works with ONNX. This is called the ONNX runtime (ORT). This is a cross platform, high performance ML engine that works with multiple forms of hardware acceleration. We ship this runtime built into Windows starting with version 1809 (October 2018 Update). GitHub here
Execution providers are the abstraction layer between the runtime, and the provider that supplies operator kernels (implementations).
Graphs are used to work with execution providers.
This is the mechanism that the provider can use to participate in graph optimization and rewriting.
Notice that the pattern here is to provide a full graph representation, with Nodes and Edges.
There is a lot of power that comes from this implementation around allowing for things like:
This is the mechanism for the provider all up.
The currency between the runtime and the provider are these IndexedSubGraphs and their Nodes.
The runtime will call IExecutionProvider::Compile() on the subgraph that it says it can handle. This then allows the provider to build the kernels and compute functions for those Nodes (ops).
This manner allows multiple providers to all participate in the execution of the Graph. Or, a single provider can also handle the entire graph (in our case we have both CPU and DirectX GPU providers that can handle entire graphs). This is largely driven by our opset schema. This gives us great flexibility in that new ops can appear, and all providers don't have to be updated as long as there exists one provider that can handle the new op.
This is how we tend to do our GPU and hardware accelerated work, that allows for fallback to CPU.
We have reference implementations for many providers, including some that work with graphs (ngraph, tensorRT, etc.) here.
New in Windows is a hardware abstraction layer designed for ML. This is the DirectX ML system, or DirectML. MSDN here.
Starting in version 1903 (May 2019 Update) DirectML has an operator level interface. It has a pattern that works well for hardware optimization by breaking work into 2 stages: (1) Initialization (2) Execution.
DirectML also participates underneath the ONNX Runtime. Using WDDM and MCDM Windows allows IHV vendors to supply drivers that work with DirectML for accelerating ML workloads. This allows anyone to participate across multiple GPU and TPU/VPU vendors at the operator kernel level.
We are also working on future innovation around how to have those higher level Graph and IndexedSubGraph interactions work going forward with the ORT.
- TensorRT
To complete the list, here comes an example with TensorRT API.
The following code builds the graph.
// Build the graph
IBuilder* builder = createInferBuilder(logger);
INetworkDefinition* network = builder->createNetwork();
// 2x2x2x2 => 16 elements per tensor.
Dims4 dims({2, 2, 2, 2});
const size_t length = 16;
std::vector<float> buffer(length, 0.5);
// NOTE(review): Weights stores a raw pointer into `buffer` — the vector must
// stay alive at least until the engine is built from this network.
const Weights constant{DataType::kFLOAT, buffer.data(), length};
// Named inputs; the names are used below to look up binding indices.
ITensor* tensor0 = network->addInput("tensor0", DataType::kFLOAT, dims);
IConstantLayer* constant1 = network->addConstant(dims, constant);
ITensor* tensor2 = network->addInput("tensor2", DataType::kFLOAT, dims);
IConstantLayer* constant3 = network->addConstant(dims, constant);
// add0 = tensor0 + constant1; add1 = tensor2 + constant3
IElementWiseLayer* add0 = network->addElementWise(
*tensor0, *constant1->getOutput(0), ElementWiseOperation::kSUM);
IElementWiseLayer* add1 = network->addElementWise(
*tensor2, *constant3->getOutput(0), ElementWiseOperation::kSUM);
// mul = add0 * add1 (element-wise product)
IElementWiseLayer* mul = network->addElementWise(
*add0->getOutput(0), *add1->getOutput(0), ElementWiseOperation::kPROD);
// Name and mark the output so it gets a binding index in the engine.
mul->getOutput(0)->setName("output");
network->markOutput(*mul->getOutput(0));
The following code executes the graph with input data and gets output data.
// Execute the graph
ICudaEngine* engine = builder->buildCudaEngine(*network);
IExecutionContext* context = engine->createExecutionContext();
const int input0Index = engine->getBindingIndex("tensor0");
const int input1Index = engine->getBindingIndex("tensor2");
const int outputIndex = engine->getBindingIndex("output");
void* buffers[3];
cudaMalloc(&buffers[input0Index], length * sizeof(float));
cudaMalloc(&buffers[input1Index], length * sizeof(float));
cudaMalloc(&buffers[outputIndex], length * sizeof(float));
cudaStream_t stream;
cudaStreamCreate(&stream);
std::vector<float> input0Data(length, 1);
std::vector<float> input1Data(length, 2);
cudaMemcpyAsync(buffers[input0Index], input0Data.data(),
input0Data.size() * sizeof(float),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(buffers[input1Index], input1Data.data(),
input1Data.size() * sizeof(float),
cudaMemcpyHostToDevice, stream);
context->enqueue(1, buffers, stream, nullptr);
float output[length];
cudaMemcpyAsync(output, buffers[outputIndex],
length * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// Print output
std::cout << "output: [";
for (unsigned int i = 0; i < length; i++)
{
std::cout << output[i] << " ";
}
std::cout << "]" << std::endl;
As https://github.com/webmachinelearning/webnn/pull/22 has been merged, close this one.
Per the resolution on the 9 May 2019 CG call, this issue is for surveying graph-building APIs from the native ecosystem, with the aim of supporting the discussion in #16. The current foundation spec is a direct derivative of Android NNAPI, which is a C-style API. During the CG call, the participants agreed to survey other graph-building APIs in the native ecosystem to learn API design patterns.
There were three APIs mentioned in the CG call.
@walrusmcd mentioned "would love to contribute our learnings from two Microsoft's graph-building APIs". Feel free to add them into the list. Thanks.