microsoft / DirectXMath

DirectXMath is an all inline SIMD C++ linear algebra library for use in games and graphics apps
https://walbourn.github.io/introducing-directxmath/
MIT License
1.53k stars 235 forks source link

[MSVC][ARM64EC] Failed to build with error: cannot convert argument 1 from 'const DirectX::XMVECTOR' to '__m128' #194

Closed NEIL-smtg closed 1 month ago

NEIL-smtg commented 1 month ago

DirectXMath fails with the following error when the project is built with MSVC for the ARM64EC architecture:

error C2664: '__m128 _mm_fmadd_ps(__m128,__m128,__m128)': 
cannot convert argument 1 from 'const DirectX::XMVECTOR' to '__m128' 

This issue occurs in the header files in the Extensions/ folder.

Operating System: Windows

Steps to reproduce:

  1. Open X64 Native Tools Command Prompt
  2. git clone https://github.com/microsoft/DirectXMath.git
  3. cd DirectXMath
  4. git clone https://github.com/walbourn/directxmathtest.git Tests
  5. set VSCMD_SKIP_SENDTELEMETRY=1 & "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\VsDevCmd.bat" -host_arch=amd64 -arch=arm64
  6. cl /c /IC:\gitP\Microsoft\DirectXMath\Tests\..\Inc /IC:\gitP\Microsoft\DirectXMath\Tests\..\Extensions /nologo /Wall /WX- /diagnostics:column /O2 /Ob2 /Oy- /D AMD64 /D ARM64EC /D _UNICODE /D UNICODE /D WIN32 /D AMD64 /D AMD64 /D ARM64EC /D ARM64EC /D _WINDOWS /D NDEBUG /D _UNICODE /D UNICODE /D "CMAKE_INTDIR=\"Release\"" /Gm- /EHsc /MT /GS /arch:AVX2 /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /GR- /std:c++17 /permissive- /external:W4 /Gd /TP /wd4061 /wd4365 /wd4514 /wd4710 /wd4820 /wd4668 /wd5039 /wd5045 /wd5264 /analyze- /errorReport:queue /arm64EC /Zc:__cplusplus C:\gitP\Microsoft\DirectXMath\Tests\ext\testfma3.cpp

or simply run this:

  1. Download the preprocessed file testfma3.txt
  2. Open X64 Native Tools Command Prompt
  3. set VSCMD_SKIP_SENDTELEMETRY=1 & "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\VsDevCmd.bat" -host_arch=amd64 -arch=arm64
  4. cl /c /nologo /D ARM64EC /Gm- /EHsc /std:c++17 /permissive- /TP testfma3.txt
### Tasks
walbourn commented 1 month ago

On the ARM64EC platform, DirectXMath is implemented with ARM-NEON, NOT SSE/SSE2. Therefore, the specific extensions around Intel FMA3 are not going to work for that platform.

Note that everything in the "Extensions" folder is educational. I have folded a lot of that functionality into the core library over time. For example, when you build with /arch:AVX2, DirectXMath's XMVectorMultiplyAdd will end up using FMA3. For ARM64EC, it will use ARM-NEON vfmaq_f32.

// Pick the SIMD instruction family automatically, but only when the build has
// not already chosen one (or opted out with _XM_NO_INTRINSICS_).
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
// x86/x64 targets use SSE, except _M_HYBRID_X86_ARM64 and _M_ARM64EC builds,
// which are deliberately excluded here and handled by the ARM-NEON branch below.
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
#define _XM_SSE_INTRINSICS_
// ARM targets, including _M_HYBRID_X86_ARM64 and _M_ARM64EC, use ARM-NEON.
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
#define _XM_ARM_NEON_INTRINSICS_
// Any other target is rejected. This condition is always true at this point,
// because the enclosing #if already requires _XM_NO_INTRINSICS_ to be undefined.
#elif !defined(_XM_NO_INTRINSICS_)
#error DirectX Math does not support this target
#endif
#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
// Turn on the AVX2 code paths when the compiler reports __AVX2__, unless they
// were already enabled or intrinsics are disabled with _XM_NO_INTRINSICS_.
// NOTE(review): this test keys only off __AVX2__; it does not look at which
// SIMD family (_XM_SSE_INTRINSICS_ / _XM_ARM_NEON_INTRINSICS_) was selected.
#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX2_INTRINSICS_
#endif

// Enabling AVX2 also enables the FMA3 code paths, unless they were already
// enabled or intrinsics are disabled with _XM_NO_INTRINSICS_.
#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_FMA3_INTRINSICS_
#endif
// Multiply-add helpers operating on SSE (__m128) values:
//   XM_FMADD_PS(a, b, c)  yields  (a * b) + c
//   XM_FNMADD_PS(a, b, c) yields  c - (a * b)
// With _XM_FMA3_INTRINSICS_ they expand to the FMA3 intrinsics; otherwise they
// expand to a separate multiply followed by an add/subtract.
#if defined(_XM_FMA3_INTRINSICS_)
#define XM_FMADD_PS( a, b, c ) _mm_fmadd_ps((a), (b), (c))
#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c))
#else
#define XM_FMADD_PS( a, b, c ) _mm_add_ps(_mm_mul_ps((a), (b)), (c))
#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
#endif
// Returns V1 * V2 + V3, evaluated independently for each of the four components.
// The implementation is chosen at compile time:
//   _XM_NO_INTRINSICS_       - plain scalar arithmetic on each component
//   _XM_ARM_NEON_INTRINSICS_ - vfmaq_f32 on ARM64-class targets, vmlaq_f32 otherwise
//   _XM_SSE_INTRINSICS_      - XM_FMADD_PS
inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Work out each component on its own, then pack the four results.
    const float x = V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0];
    const float y = V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1];
    const float z = V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2];
    const float w = V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3];
    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Both NEON intrinsics take the addend first, followed by the two factors.
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    const XMVECTOR vResult = vfmaq_f32(V3, V1, V2);
#else
    const XMVECTOR vResult = vmlaq_f32(V3, V1, V2);
#endif
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // XM_FMADD_PS takes the two factors first, followed by the addend.
    const XMVECTOR vResult = XM_FMADD_PS(V1, V2, V3);
    return vResult;
#endif
}
walbourn commented 1 month ago

There was a build error built into these extensions to flag this scenario, but I never updated it when they changed _M_HYBRID_X86_ARM64 to _M_ARM64EC.