microsoft / DirectML

DirectML is a high-performance, hardware-accelerated DirectX 12 library for machine learning. DirectML provides GPU acceleration for common machine learning tasks across a broad range of supported hardware and drivers, including all DirectX 12-capable GPUs from vendors such as AMD, Intel, NVIDIA, and Qualcomm.

DirectMLNpuInference fails to run on the ARM64 NPU #640

Open xiaoweiChen opened 2 weeks ago

xiaoweiChen commented 2 weeks ago

Based on the DirectMLNpuInference sample and #625 (updating to the Windows 11 SDK (10.0.26100.0)).

I got DirectML NPU inference working on an Intel Lunar Lake client platform; while the program runs, Windows Task Manager shows non-zero NPU usage.

However, when I try the same sample on my Windows ARM64 machine, the program reports "No NPU device found"...

Does anyone know the reason? Does DirectML not support the NPU on the ARM64 platform?

My device info:

CPU: Snapdragon(R) X 12-core X1E80100 @ 3.40 GHz
GPU: Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Adreno(TM) GPU
NPU: Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Hexagon(TM) NPU
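As a diagnostic, here is a minimal sketch (separate from the sample; it assumes the Windows 11 SDK (10.0.26100.0) headers and links dxcore.lib directly instead of using LoadLibraryW) that enumerates every D3D12 core-compute adapter and prints whether each one reports the NPU hardware-type attribute. If the Hexagon NPU never shows the attribute here, the enumeration is failing below DirectML, in DXCore or the NPU driver:

// Diagnostic only, not part of the sample: list all D3D12 core-compute
// adapters and report which ones expose DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU.
#include <dxcore_interface.h>
#include <dxcore.h>
#include <wrl/client.h>
#include <cstdio>
#include <string>

#pragma comment(lib, "dxcore.lib")

using Microsoft::WRL::ComPtr;

int main()
{
    ComPtr<IDXCoreAdapterFactory> factory;
    if (FAILED(DXCoreCreateAdapterFactory(IID_PPV_ARGS(&factory)))) return 1;

    // Filter only on core compute so GPUs and NPUs alike appear in the list.
    const GUID attributes[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE };
    ComPtr<IDXCoreAdapterList> adapterList;
    if (FAILED(factory->CreateAdapterList(ARRAYSIZE(attributes), attributes, IID_PPV_ARGS(&adapterList)))) return 1;

    for (uint32_t i = 0; i < adapterList->GetAdapterCount(); ++i)
    {
        ComPtr<IDXCoreAdapter> adapter;
        if (FAILED(adapterList->GetAdapter(i, IID_PPV_ARGS(&adapter)))) continue;

        // DriverDescription is a variable-size ANSI string property.
        size_t size = 0;
        adapter->GetPropertySize(DXCoreAdapterProperty::DriverDescription, &size);
        std::string description(size, '\0');
        adapter->GetProperty(DXCoreAdapterProperty::DriverDescription, size, description.data());

        bool isNpu = adapter->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU);
        printf("%u: %s (NPU attribute: %s)\n", i, description.c_str(), isNpu ? "yes" : "no");
    }
    return 0;
}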

My test code:

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "pch.h"

#include <dxcore_interface.h>
#include <dxcore.h>

#include "onnxruntime_cxx_api.h"
#include "dml_provider_factory.h"

#include "TensorHelper.h"

using Microsoft::WRL::ComPtr;

void InitializeDirectML(ID3D12Device1** d3dDeviceOut, ID3D12CommandQueue** commandQueueOut, IDMLDevice** dmlDeviceOut) {
    // Whether to skip adapters which support Graphics in order to target NPU for testing
    bool forceComputeOnlyDevice = true;
    bool forceGenericMLDevice = false;

    ComPtr<IDXCoreAdapterFactory> factory;
    HMODULE dxCoreModule = LoadLibraryW(L"DXCore.dll");
    if (dxCoreModule)
    {
        auto dxcoreCreateAdapterFactory = reinterpret_cast<HRESULT(WINAPI*)(REFIID, void**)>(
            GetProcAddress(dxCoreModule, "DXCoreCreateAdapterFactory")
            );
        if (dxcoreCreateAdapterFactory)
        {
            dxcoreCreateAdapterFactory(IID_PPV_ARGS(&factory));
        }
    }
    // Create the DXCore Adapter
    ComPtr<IDXCoreAdapter> adapter;
    if (factory)
    {
#if 1
        const GUID dxGUIDs[] = { 
            DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE,
            DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU
        };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        for (uint32_t i = 0, adapterCount = adapterList->GetAdapterCount(); i < adapterCount; i++)
        {
            ComPtr<IDXCoreAdapter> currentGpuAdapter;
            THROW_IF_FAILED(adapterList->GetAdapter(static_cast<uint32_t>(i), IID_PPV_ARGS(&currentGpuAdapter)));

            if (!forceComputeOnlyDevice && !forceGenericMLDevice)
            {
                // No device restrictions
                adapter = std::move(currentGpuAdapter);
                break;
            }
            else if (forceComputeOnlyDevice && currentGpuAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE))
            {
                adapter = std::move(currentGpuAdapter);
                break;
            }
            else if (forceGenericMLDevice && currentGpuAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML))
            {
                adapter = std::move(currentGpuAdapter);
                break;
            }
        }
#else
        const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        for (uint32_t i = 0, adapterCount = adapterList->GetAdapterCount(); i < adapterCount; i++)
        {
            ComPtr<IDXCoreAdapter> nextGpuAdapter;
            THROW_IF_FAILED(adapterList->GetAdapter(static_cast<uint32_t>(i), IID_PPV_ARGS(&nextGpuAdapter)));
            if (nextGpuAdapter->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU))
            {
                adapter = std::move(nextGpuAdapter);
                break;
            }
        }
#endif
    }
    // Create the D3D12 Device
    ComPtr<ID3D12Device1> d3dDevice;
    if (adapter)
    {
        HMODULE d3d12Module = LoadLibraryW(L"d3d12.dll");
        if (d3d12Module)
        {
            auto d3d12CreateDevice = reinterpret_cast<HRESULT(WINAPI*)(IUnknown*, D3D_FEATURE_LEVEL, REFIID, void**)>(
                GetProcAddress(d3d12Module, "D3D12CreateDevice")
                );
            if (d3d12CreateDevice)
            {
                THROW_IF_FAILED(d3d12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_1_0_CORE, IID_PPV_ARGS(&d3dDevice)));
            }
        }
    }
    // Create the DML Device and D3D12 Command Queue
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    if (d3dDevice)
    {
        D3D12_COMMAND_QUEUE_DESC queueDesc = {};
        queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
        THROW_IF_FAILED(d3dDevice->CreateCommandQueue(
            &queueDesc,
            IID_PPV_ARGS(commandQueue.ReleaseAndGetAddressOf())));
        HMODULE dmlModule = LoadLibraryW(L"DirectML.dll");
        if (dmlModule)
        {
            auto dmlCreateDevice = reinterpret_cast<HRESULT(WINAPI*)(ID3D12Device*, DML_CREATE_DEVICE_FLAGS, DML_FEATURE_LEVEL, REFIID, void**)>(
                GetProcAddress(dmlModule, "DMLCreateDevice1")
                );
            if (dmlCreateDevice)
            {
                THROW_IF_FAILED(dmlCreateDevice(d3dDevice.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(dmlDevice.ReleaseAndGetAddressOf())));
            }
        }
    }

    d3dDevice.CopyTo(d3dDeviceOut);
    commandQueue.CopyTo(commandQueueOut);
    dmlDevice.CopyTo(dmlDeviceOut);
}

int main()
{
    ComPtr<ID3D12Device1> d3dDevice;
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    InitializeDirectML(d3dDevice.GetAddressOf(), commandQueue.GetAddressOf(), dmlDevice.GetAddressOf());

    // Add the DML execution provider to ORT using the DML Device and D3D12 Command Queue created above.
    if (!dmlDevice)
    {
        printf("No NPU device found\n");
        return 1;
    }

    const OrtApi& ortApi = Ort::GetApi();
    static Ort::Env s_OrtEnv{ nullptr };
    s_OrtEnv = Ort::Env(Ort::ThreadingOptions{});
    s_OrtEnv.DisableTelemetryEvents();

    auto sessionOptions = Ort::SessionOptions{};
    sessionOptions.DisableMemPattern();
    sessionOptions.DisablePerSessionThreads();
    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
    const OrtDmlApi* ortDmlApi = nullptr;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->SessionOptionsAppendExecutionProvider_DML1(sessionOptions, dmlDevice.Get(), commandQueue.Get()));

    // Create the session
    auto session = Ort::Session(s_OrtEnv, L"mobilenetv2-7-fp16.onnx", sessionOptions);
    const char* inputName = "input";
    const char* outputName = "output";

    // Create input tensor
    Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto input = CreateDmlValue(tensor_info, commandQueue.Get());
    auto inputTensor = std::move(input.first);

    const auto memoryInfo = inputTensor.GetTensorMemoryInfo();
    Ort::Allocator allocator(session, memoryInfo);

    // Get the inputResource and populate!
    ComPtr<ID3D12Resource> inputResource;
    Ort::ThrowOnError(ortDmlApi->GetD3D12ResourceFromAllocation(allocator, inputTensor.GetTensorMutableData<void*>(), &inputResource));

    // Create output tensor
    type_info = session.GetOutputTypeInfo(0);
    tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto output = CreateDmlValue(tensor_info, commandQueue.Get());
    auto outputTensor = std::move(output.first);

    // Run warmup
    session.Run(Ort::RunOptions{ nullptr }, &inputName, &inputTensor, 1, &outputName, &outputTensor, 1);

    // Queue fence, and wait for completion
    ComPtr<ID3D12Fence> fence;
    THROW_IF_FAILED(d3dDevice->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence.GetAddressOf())));
    THROW_IF_FAILED(commandQueue->Signal(fence.Get(), 1));

    wil::unique_handle fenceEvent(CreateEvent(nullptr, FALSE, FALSE, nullptr));
    THROW_IF_FAILED(fence->SetEventOnCompletion(1, fenceEvent.get()));
    THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);

    // Record start
    auto start = std::chrono::high_resolution_clock::now();

    // Run performance test
    constexpr int fenceValueStart = 2;
    constexpr int numIterations = 100;
    for (int i = fenceValueStart; i < (numIterations + fenceValueStart); i++)
    {
        session.Run(Ort::RunOptions{ nullptr }, &inputName, &inputTensor, 1, &outputName, &outputTensor, 1);

        {
            // Synchronize with CPU before queuing more inference runs
            THROW_IF_FAILED(commandQueue->Signal(fence.Get(), i));
            THROW_HR_IF(E_FAIL, ResetEvent(fenceEvent.get()) == 0);
            THROW_IF_FAILED(fence->SetEventOnCompletion(i, fenceEvent.get()));
            THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);
        }
    }

    // Record end and calculate duration
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::micro> duration = end - start;
    printf("Evaluate Took: %fus\n", float(duration.count())/100);

    // Read results
    ComPtr<ID3D12Resource> outputResource;
    Ort::ThrowOnError(ortDmlApi->GetD3D12ResourceFromAllocation(allocator, outputTensor.GetTensorMutableData<void*>(), &outputResource));
}
xiaoweiChen commented 2 weeks ago

Also, could you add a build script (such as a CMakeLists.txt) to the sample project?
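For reference, a minimal, untested CMakeLists.txt sketch might look like the following. The source file name, the packages/ restore location, and the library paths inside the NuGet packages are all assumptions that would need adjusting to the real layout:

# Minimal, untested sketch of a CMakeLists.txt for this sample.
# Assumes the sources are in main.cpp and that the NuGet packages were
# restored into a local packages/ directory.
cmake_minimum_required(VERSION 3.20)
project(DirectMLNpuInference CXX)

add_executable(DirectMLNpuInference main.cpp)
target_compile_features(DirectMLNpuInference PRIVATE cxx_std_17)

set(PKG ${CMAKE_SOURCE_DIR}/packages)
target_include_directories(DirectMLNpuInference PRIVATE
    ${PKG}/Microsoft.AI.DirectML/include
    ${PKG}/Microsoft.AI.MachineLearning/build/native/include
    ${PKG}/Microsoft.Windows.ImplementationLibrary/include)

# The sample loads d3d12.dll, DXCore.dll, and DirectML.dll at runtime via
# LoadLibraryW, so only the onnxruntime import library needs linking; its
# path inside the package is an assumption (check build/native/lib).
target_link_libraries(DirectMLNpuInference PRIVATE
    ${PKG}/Microsoft.AI.MachineLearning/build/native/lib/arm64/onnxruntime.lib)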

xiaoweiChen commented 1 week ago

Based on this blog, DirectML needs to be version 1.15.2 and onnxruntime needs to be 1.18.

However, the native demo still uses onnxruntime 1.17:

https://github.com/microsoft/DirectML/blob/master/Samples/DirectMLNpuInference/packages.config

<?xml version="1.0" encoding="utf-8"?>
<packages>
  <package id="Microsoft.AI.DirectML" version="1.15.2" targetFramework="native" />
  <package id="Microsoft.AI.MachineLearning" version="1.17.0" targetFramework="native" />
  <package id="Microsoft.Windows.ImplementationLibrary" version="1.0.220914.1" targetFramework="native" />
</packages>
mrsabhar commented 1 week ago

Seeing a similar issue on an ASUS X Elite. I updated the drivers based on the blog, but WebNN reports that the driver is not installed:

[10:05:43] [Config] Demo config updated · resnet-50 · webnn · npu
[10:05:43] [Error] UnknownError: Failed to execute 'createContext' on 'ML': DirectML: Failed to create a WebNN context.
[10:05:43] [Error] Your device probably doesn't have an AI processor (NPU) or the NPU driver is not successfully installed