plctlab / numpy

Porting RVV to NumPy.
https://numpy.org
Other
0 stars 0 forks source link

Need to implement SIMD functions #3

Open luyahan opened 6 months ago

luyahan commented 6 months ago

Proposed new feature or change:

CMD:

gcc -DNPY_SIMD=128 -D_NPY_SIMD_H_ -D__aarch64__  -E ./neon.h -o neon.hpp

neon.patch

AVX2

gcc  -D_NPY_SIMD_H_   -E ./avx2.h -o avx2.hpp
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/avx2$ cat ./avx2.hpp | grep NPY_FINLINE | wc -l
363

AVX512

luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/avx512$ gcc  -D_NPY_SIMD_H_   -E ./avx512.h -o avx512.hpp
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/avx512$ cat ./avx512.hpp | grep NPY_FINLINE | wc -l
413
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/avx512$ 

vec

luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/vec$ gcc  -D_NPY_SIMD_H_ -DNPY_HAVE_VX   -E ./vec.h -o vec.hpp
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/vec$ cat ./vec.hpp | grep NPY_FINLINE | wc -l
343
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/vec$ 
luyahan commented 6 months ago
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/neon$ cat ./neon.hpp | grep NPY_FINLINE
NPY_FINLINE npyv_u8 npyv_load_u8(const npyv_lanetype_u8* ptr) {
NPY_FINLINE npyv_u8 npyv_loada_u8(const npyv_lanetype_u8* ptr) {
NPY_FINLINE npyv_u8 npyv_loads_u8(const npyv_lanetype_u8* ptr) {
NPY_FINLINE npyv_u8 npyv_loadl_u8(const npyv_lanetype_u8* ptr) {
NPY_FINLINE void npyv_store_u8(npyv_lanetype_u8* ptr, npyv_u8 vec) {
NPY_FINLINE void npyv_storea_u8(npyv_lanetype_u8* ptr, npyv_u8 vec) {
NPY_FINLINE void npyv_stores_u8(npyv_lanetype_u8* ptr, npyv_u8 vec) {
NPY_FINLINE void npyv_storel_u8(npyv_lanetype_u8* ptr, npyv_u8 vec) {
NPY_FINLINE void npyv_storeh_u8(npyv_lanetype_u8* ptr, npyv_u8 vec) {
NPY_FINLINE npyv_s8 npyv_load_s8(const npyv_lanetype_s8* ptr) {
NPY_FINLINE npyv_s8 npyv_loada_s8(const npyv_lanetype_s8* ptr) {
NPY_FINLINE npyv_s8 npyv_loads_s8(const npyv_lanetype_s8* ptr) {
NPY_FINLINE npyv_s8 npyv_loadl_s8(const npyv_lanetype_s8* ptr) {
NPY_FINLINE void npyv_store_s8(npyv_lanetype_s8* ptr, npyv_s8 vec) {
NPY_FINLINE void npyv_storea_s8(npyv_lanetype_s8* ptr, npyv_s8 vec) {
NPY_FINLINE void npyv_stores_s8(npyv_lanetype_s8* ptr, npyv_s8 vec) {
NPY_FINLINE void npyv_storel_s8(npyv_lanetype_s8* ptr, npyv_s8 vec) {
NPY_FINLINE void npyv_storeh_s8(npyv_lanetype_s8* ptr, npyv_s8 vec) {
NPY_FINLINE npyv_u16 npyv_load_u16(const npyv_lanetype_u16* ptr) {
NPY_FINLINE npyv_u16 npyv_loada_u16(const npyv_lanetype_u16* ptr) {
NPY_FINLINE npyv_u16 npyv_loads_u16(const npyv_lanetype_u16* ptr) {
NPY_FINLINE npyv_u16 npyv_loadl_u16(const npyv_lanetype_u16* ptr) {
NPY_FINLINE void npyv_store_u16(npyv_lanetype_u16* ptr, npyv_u16 vec) {
NPY_FINLINE void npyv_storea_u16(npyv_lanetype_u16* ptr, npyv_u16 vec) {
NPY_FINLINE void npyv_stores_u16(npyv_lanetype_u16* ptr, npyv_u16 vec) {
NPY_FINLINE void npyv_storel_u16(npyv_lanetype_u16* ptr, npyv_u16 vec) {
NPY_FINLINE void npyv_storeh_u16(npyv_lanetype_u16* ptr, npyv_u16 vec) {
NPY_FINLINE npyv_s16 npyv_load_s16(const npyv_lanetype_s16* ptr) {
NPY_FINLINE npyv_s16 npyv_loada_s16(const npyv_lanetype_s16* ptr) {
NPY_FINLINE npyv_s16 npyv_loads_s16(const npyv_lanetype_s16* ptr) {
NPY_FINLINE npyv_s16 npyv_loadl_s16(const npyv_lanetype_s16* ptr) {
NPY_FINLINE void npyv_store_s16(npyv_lanetype_s16* ptr, npyv_s16 vec) {
NPY_FINLINE void npyv_storea_s16(npyv_lanetype_s16* ptr, npyv_s16 vec) {
NPY_FINLINE void npyv_stores_s16(npyv_lanetype_s16* ptr, npyv_s16 vec) {
NPY_FINLINE void npyv_storel_s16(npyv_lanetype_s16* ptr, npyv_s16 vec) {
NPY_FINLINE void npyv_storeh_s16(npyv_lanetype_s16* ptr, npyv_s16 vec) {
NPY_FINLINE npyv_u32 npyv_load_u32(const npyv_lanetype_u32* ptr) {
NPY_FINLINE npyv_u32 npyv_loada_u32(const npyv_lanetype_u32* ptr) {
NPY_FINLINE npyv_u32 npyv_loads_u32(const npyv_lanetype_u32* ptr) {
NPY_FINLINE npyv_u32 npyv_loadl_u32(const npyv_lanetype_u32* ptr) {
NPY_FINLINE void npyv_store_u32(npyv_lanetype_u32* ptr, npyv_u32 vec) {
NPY_FINLINE void npyv_storea_u32(npyv_lanetype_u32* ptr, npyv_u32 vec) {
NPY_FINLINE void npyv_stores_u32(npyv_lanetype_u32* ptr, npyv_u32 vec) {
NPY_FINLINE void npyv_storel_u32(npyv_lanetype_u32* ptr, npyv_u32 vec) {
NPY_FINLINE void npyv_storeh_u32(npyv_lanetype_u32* ptr, npyv_u32 vec) {
NPY_FINLINE npyv_s32 npyv_load_s32(const npyv_lanetype_s32* ptr) {
NPY_FINLINE npyv_s32 npyv_loada_s32(const npyv_lanetype_s32* ptr) {
NPY_FINLINE npyv_s32 npyv_loads_s32(const npyv_lanetype_s32* ptr) {
NPY_FINLINE npyv_s32 npyv_loadl_s32(const npyv_lanetype_s32* ptr) {
NPY_FINLINE void npyv_store_s32(npyv_lanetype_s32* ptr, npyv_s32 vec) {
NPY_FINLINE void npyv_storea_s32(npyv_lanetype_s32* ptr, npyv_s32 vec) {
NPY_FINLINE void npyv_stores_s32(npyv_lanetype_s32* ptr, npyv_s32 vec) {
NPY_FINLINE void npyv_storel_s32(npyv_lanetype_s32* ptr, npyv_s32 vec) {
NPY_FINLINE void npyv_storeh_s32(npyv_lanetype_s32* ptr, npyv_s32 vec) {
NPY_FINLINE npyv_u64 npyv_load_u64(const npyv_lanetype_u64* ptr) {
NPY_FINLINE npyv_u64 npyv_loada_u64(const npyv_lanetype_u64* ptr) {
NPY_FINLINE npyv_u64 npyv_loads_u64(const npyv_lanetype_u64* ptr) {
NPY_FINLINE npyv_u64 npyv_loadl_u64(const npyv_lanetype_u64* ptr) {
NPY_FINLINE void npyv_store_u64(npyv_lanetype_u64* ptr, npyv_u64 vec) {
NPY_FINLINE void npyv_storea_u64(npyv_lanetype_u64* ptr, npyv_u64 vec) {
NPY_FINLINE void npyv_stores_u64(npyv_lanetype_u64* ptr, npyv_u64 vec) {
NPY_FINLINE void npyv_storel_u64(npyv_lanetype_u64* ptr, npyv_u64 vec) {
NPY_FINLINE void npyv_storeh_u64(npyv_lanetype_u64* ptr, npyv_u64 vec) {
NPY_FINLINE npyv_s64 npyv_load_s64(const npyv_lanetype_s64* ptr) {
NPY_FINLINE npyv_s64 npyv_loada_s64(const npyv_lanetype_s64* ptr) {
NPY_FINLINE npyv_s64 npyv_loads_s64(const npyv_lanetype_s64* ptr) {
NPY_FINLINE npyv_s64 npyv_loadl_s64(const npyv_lanetype_s64* ptr) {
NPY_FINLINE void npyv_store_s64(npyv_lanetype_s64* ptr, npyv_s64 vec) {
NPY_FINLINE void npyv_storea_s64(npyv_lanetype_s64* ptr, npyv_s64 vec) {
NPY_FINLINE void npyv_stores_s64(npyv_lanetype_s64* ptr, npyv_s64 vec) {
NPY_FINLINE void npyv_storel_s64(npyv_lanetype_s64* ptr, npyv_s64 vec) {
NPY_FINLINE void npyv_storeh_s64(npyv_lanetype_s64* ptr, npyv_s64 vec) {
NPY_FINLINE npyv_f32 npyv_load_f32(const npyv_lanetype_f32* ptr) {
NPY_FINLINE npyv_f32 npyv_loada_f32(const npyv_lanetype_f32* ptr) {
NPY_FINLINE npyv_f32 npyv_loads_f32(const npyv_lanetype_f32* ptr) {
NPY_FINLINE npyv_f32 npyv_loadl_f32(const npyv_lanetype_f32* ptr) {
NPY_FINLINE void npyv_store_f32(npyv_lanetype_f32* ptr, npyv_f32 vec) {
NPY_FINLINE void npyv_storea_f32(npyv_lanetype_f32* ptr, npyv_f32 vec) {
NPY_FINLINE void npyv_stores_f32(npyv_lanetype_f32* ptr, npyv_f32 vec) {
NPY_FINLINE void npyv_storel_f32(npyv_lanetype_f32* ptr, npyv_f32 vec) {
NPY_FINLINE void npyv_storeh_f32(npyv_lanetype_f32* ptr, npyv_f32 vec) {
NPY_FINLINE npyv_f64 npyv_load_f64(const npyv_lanetype_f64* ptr) {
NPY_FINLINE npyv_f64 npyv_loada_f64(const npyv_lanetype_f64* ptr) {
NPY_FINLINE npyv_f64 npyv_loads_f64(const npyv_lanetype_f64* ptr) {
NPY_FINLINE npyv_f64 npyv_loadl_f64(const npyv_lanetype_f64* ptr) {
NPY_FINLINE void npyv_store_f64(npyv_lanetype_f64* ptr, npyv_f64 vec) {
NPY_FINLINE void npyv_storea_f64(npyv_lanetype_f64* ptr, npyv_f64 vec) {
NPY_FINLINE void npyv_stores_f64(npyv_lanetype_f64* ptr, npyv_f64 vec) {
NPY_FINLINE void npyv_storel_f64(npyv_lanetype_f64* ptr, npyv_f64 vec) {
NPY_FINLINE void npyv_storeh_f64(npyv_lanetype_f64* ptr, npyv_f64 vec) {
NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32* ptr, npy_intp stride) {
NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32* ptr, npy_intp stride) {
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float* ptr, npy_intp stride) {
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64* ptr, npy_intp stride) {
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64* ptr, npy_intp stride) {
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double* ptr, npy_intp stride) {
NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32* ptr, npy_intp stride) {
NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32* ptr, npy_intp stride) {
NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float* ptr, npy_intp stride) {
NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64* ptr, npy_intp stride) {
NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64* ptr, npy_intp stride) {
NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double* ptr, npy_intp stride) {
NPY_FINLINE void npyv_storen_s32(npy_int32* ptr, npy_intp stride, npyv_s32 a) {
NPY_FINLINE void npyv_storen_u32(npy_uint32* ptr, npy_intp stride, npyv_u32 a) {
NPY_FINLINE void npyv_storen_f32(float* ptr, npy_intp stride, npyv_f32 a) {
NPY_FINLINE void npyv_storen_s64(npy_int64* ptr, npy_intp stride, npyv_s64 a) {
NPY_FINLINE void npyv_storen_u64(npy_uint64* ptr, npy_intp stride, npyv_u64 a) {
NPY_FINLINE void npyv_storen_f64(double* ptr, npy_intp stride, npyv_f64 a) {
NPY_FINLINE void npyv_storen2_u32(npy_uint32* ptr,
NPY_FINLINE void npyv_storen2_s32(npy_int32* ptr, npy_intp stride, npyv_s32 a) {
NPY_FINLINE void npyv_storen2_f32(float* ptr, npy_intp stride, npyv_f32 a) {
NPY_FINLINE void npyv_storen2_u64(npy_uint64* ptr,
NPY_FINLINE void npyv_storen2_s64(npy_int64* ptr, npy_intp stride, npyv_s64 a) {
NPY_FINLINE void npyv_storen2_f64(double* ptr, npy_intp stride, npyv_f64 a) {
NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s32 npyv_loadn_till_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s32 npyv_loadn_tillz_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32* ptr,
NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64* ptr,
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64* ptr,
NPY_FINLINE void npyv_store_till_s32(npy_int32* ptr,
NPY_FINLINE void npyv_store_till_s64(npy_int64* ptr,
NPY_FINLINE void npyv_store2_till_s32(npy_int32* ptr,
NPY_FINLINE void npyv_store2_till_s64(npy_int64* ptr,
NPY_FINLINE void npyv_storen_till_s32(npy_int32* ptr,
NPY_FINLINE void npyv_storen_till_s64(npy_int64* ptr,
NPY_FINLINE void npyv_storen2_till_s32(npy_int32* ptr,
NPY_FINLINE void npyv_storen2_till_s64(npy_int64* ptr,
NPY_FINLINE npyv_u32 npyv_load_till_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_u32 npyv_loadn_till_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_u32 npyv_load_tillz_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_u32 npyv_loadn_tillz_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE void npyv_store_till_u32(npyv_lanetype_u32* ptr,
NPY_FINLINE void npyv_storen_till_u32(npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_f32 npyv_load_till_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_f32 npyv_loadn_till_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_f32 npyv_load_tillz_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_f32 npyv_loadn_tillz_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE void npyv_store_till_f32(npyv_lanetype_f32* ptr,
NPY_FINLINE void npyv_storen_till_f32(npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_u64 npyv_load_till_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_u64 npyv_loadn_till_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_u64 npyv_load_tillz_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_u64 npyv_loadn_tillz_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE void npyv_store_till_u64(npyv_lanetype_u64* ptr,
NPY_FINLINE void npyv_storen_till_u64(npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_f64 npyv_load_till_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_f64 npyv_loadn_till_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_f64 npyv_load_tillz_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_f64 npyv_loadn_tillz_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE void npyv_store_till_f64(npyv_lanetype_f64* ptr,
NPY_FINLINE void npyv_storen_till_f64(npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_u32 npyv_load2_till_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_u32 npyv_loadn2_till_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_u32 npyv_load2_tillz_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_u32 npyv_loadn2_tillz_u32(const npyv_lanetype_u32* ptr,
NPY_FINLINE void npyv_store2_till_u32(npyv_lanetype_u32* ptr,
NPY_FINLINE void npyv_storen2_till_u32(npyv_lanetype_u32* ptr,
NPY_FINLINE npyv_f32 npyv_load2_till_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_f32 npyv_loadn2_till_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_f32 npyv_load2_tillz_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_f32 npyv_loadn2_tillz_f32(const npyv_lanetype_f32* ptr,
NPY_FINLINE void npyv_store2_till_f32(npyv_lanetype_f32* ptr,
NPY_FINLINE void npyv_storen2_till_f32(npyv_lanetype_f32* ptr,
NPY_FINLINE npyv_u64 npyv_load2_till_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_u64 npyv_loadn2_till_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_u64 npyv_load2_tillz_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_u64 npyv_loadn2_tillz_u64(const npyv_lanetype_u64* ptr,
NPY_FINLINE void npyv_store2_till_u64(npyv_lanetype_u64* ptr,
NPY_FINLINE void npyv_storen2_till_u64(npyv_lanetype_u64* ptr,
NPY_FINLINE npyv_f64 npyv_load2_till_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_f64 npyv_loadn2_till_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_f64 npyv_load2_tillz_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_f64 npyv_loadn2_tillz_f64(const npyv_lanetype_f64* ptr,
NPY_FINLINE void npyv_store2_till_f64(npyv_lanetype_f64* ptr,
NPY_FINLINE void npyv_storen2_till_f64(npyv_lanetype_f64* ptr,
NPY_FINLINE npyv_u8x2 npyv_load_u8x2(const npyv_lanetype_u8* ptr) {
NPY_FINLINE void npyv_store_u8x2(npyv_lanetype_u8* ptr, npyv_u8x2 v) {
NPY_FINLINE npyv_s8x2 npyv_load_s8x2(const npyv_lanetype_s8* ptr) {
NPY_FINLINE void npyv_store_s8x2(npyv_lanetype_s8* ptr, npyv_s8x2 v) {
NPY_FINLINE npyv_u16x2 npyv_load_u16x2(const npyv_lanetype_u16* ptr) {
NPY_FINLINE void npyv_store_u16x2(npyv_lanetype_u16* ptr, npyv_u16x2 v) {
NPY_FINLINE npyv_s16x2 npyv_load_s16x2(const npyv_lanetype_s16* ptr) {
NPY_FINLINE void npyv_store_s16x2(npyv_lanetype_s16* ptr, npyv_s16x2 v) {
NPY_FINLINE npyv_u32x2 npyv_load_u32x2(const npyv_lanetype_u32* ptr) {
NPY_FINLINE void npyv_store_u32x2(npyv_lanetype_u32* ptr, npyv_u32x2 v) {
NPY_FINLINE npyv_s32x2 npyv_load_s32x2(const npyv_lanetype_s32* ptr) {
NPY_FINLINE void npyv_store_s32x2(npyv_lanetype_s32* ptr, npyv_s32x2 v) {
NPY_FINLINE npyv_f32x2 npyv_load_f32x2(const npyv_lanetype_f32* ptr) {
NPY_FINLINE void npyv_store_f32x2(npyv_lanetype_f32* ptr, npyv_f32x2 v) {
NPY_FINLINE npyv_f64x2 npyv_load_f64x2(const npyv_lanetype_f64* ptr) {
NPY_FINLINE void npyv_store_f64x2(npyv_lanetype_f64* ptr, npyv_f64x2 v) {
NPY_FINLINE npyv_u64x2 npyv_load_u64x2(const npyv_lanetype_u64* ptr) {
NPY_FINLINE void npyv_store_u64x2(npyv_lanetype_u64* ptr, npyv_u64x2 v) {
NPY_FINLINE npyv_s64x2 npyv_load_s64x2(const npyv_lanetype_s64* ptr) {
NPY_FINLINE void npyv_store_s64x2(npyv_lanetype_s64* ptr, npyv_s64x2 v) {
NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32* table, npyv_u32 idx) {
NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32* table, npyv_u32 idx) {
NPY_FINLINE npyv_f32 npyv_lut32_f32(const float* table, npyv_u32 idx) {
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64* table, npyv_u64 idx) {
NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64* table, npyv_u64 idx) {
NPY_FINLINE npyv_f64 npyv_lut16_f64(const double* table, npyv_u64 idx) {
NPY_FINLINE npyv_u8x2 npyv_combine_u8(npyv_u8 a, npyv_u8 b) {
NPY_FINLINE npyv_s8x2 npyv_combine_s8(npyv_s8 a, npyv_s8 b) {
NPY_FINLINE npyv_u16x2 npyv_combine_u16(npyv_u16 a, npyv_u16 b) {
NPY_FINLINE npyv_s16x2 npyv_combine_s16(npyv_s16 a, npyv_s16 b) {
NPY_FINLINE npyv_u32x2 npyv_combine_u32(npyv_u32 a, npyv_u32 b) {
NPY_FINLINE npyv_s32x2 npyv_combine_s32(npyv_s32 a, npyv_s32 b) {
NPY_FINLINE npyv_u64x2 npyv_combine_u64(npyv_u64 a, npyv_u64 b) {
NPY_FINLINE npyv_s64x2 npyv_combine_s64(npyv_s64 a, npyv_s64 b) {
NPY_FINLINE npyv_f32x2 npyv_combine_f32(npyv_f32 a, npyv_f32 b) {
NPY_FINLINE npyv_f64x2 npyv_combine_f64(npyv_f64 a, npyv_f64 b) {
NPY_FINLINE npyv_u8x2 npyv_zip_u8(npyv_u8 a, npyv_u8 b) {
NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 a, npyv_u8 b) {
NPY_FINLINE npyv_s8x2 npyv_zip_s8(npyv_s8 a, npyv_s8 b) {
NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 a, npyv_s8 b) {
NPY_FINLINE npyv_u16x2 npyv_zip_u16(npyv_u16 a, npyv_u16 b) {
NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 a, npyv_u16 b) {
NPY_FINLINE npyv_s16x2 npyv_zip_s16(npyv_s16 a, npyv_s16 b) {
NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 a, npyv_s16 b) {
NPY_FINLINE npyv_u32x2 npyv_zip_u32(npyv_u32 a, npyv_u32 b) {
NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 a, npyv_u32 b) {
NPY_FINLINE npyv_s32x2 npyv_zip_s32(npyv_s32 a, npyv_s32 b) {
NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 a, npyv_s32 b) {
NPY_FINLINE npyv_f32x2 npyv_zip_f32(npyv_f32 a, npyv_f32 b) {
NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 a, npyv_f32 b) {
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) {
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) {
NPY_FINLINE bool npyv_any_b8(npyv_b8 a) {
NPY_FINLINE bool npyv_all_b8(npyv_b8 a) {
NPY_FINLINE bool npyv_any_b16(npyv_b16 a) {
NPY_FINLINE bool npyv_all_b16(npyv_b16 a) {
NPY_FINLINE bool npyv_any_b32(npyv_b32 a) {
NPY_FINLINE bool npyv_all_b32(npyv_b32 a) {
NPY_FINLINE bool npyv_any_u8(npyv_u8 a) {
NPY_FINLINE bool npyv_all_u8(npyv_u8 a) {
NPY_FINLINE bool npyv_any_s8(npyv_s8 a) {
NPY_FINLINE bool npyv_all_s8(npyv_s8 a) {
NPY_FINLINE bool npyv_any_u16(npyv_u16 a) {
NPY_FINLINE bool npyv_all_u16(npyv_u16 a) {
NPY_FINLINE bool npyv_any_s16(npyv_s16 a) {
NPY_FINLINE bool npyv_all_s16(npyv_s16 a) {
NPY_FINLINE bool npyv_any_u32(npyv_u32 a) {
NPY_FINLINE bool npyv_all_u32(npyv_u32 a) {
NPY_FINLINE bool npyv_any_s32(npyv_s32 a) {
NPY_FINLINE bool npyv_all_s32(npyv_s32 a) {
NPY_FINLINE bool npyv_any_b64(npyv_b64 a) {
NPY_FINLINE bool npyv_all_b64(npyv_b64 a) {
NPY_FINLINE bool npyv_all_u64(npyv_u64 a) {
NPY_FINLINE bool npyv_any_s64(npyv_s64 a) {
NPY_FINLINE bool npyv_all_s64(npyv_s64 a) {
NPY_FINLINE bool npyv_any_f32(npyv_f32 a) {
NPY_FINLINE bool npyv_all_f32(npyv_f32 a) {
NPY_FINLINE bool npyv_any_f64(npyv_f64 a) {
NPY_FINLINE bool npyv_all_f64(npyv_f64 a) {
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) {
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) {
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) {
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) {
NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) {
NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a,
NPY_FINLINE npyv_b8 npyv_pack_b8_b64(npyv_b64 a,
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) {
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) {
NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) {
NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) {
NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) {
NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) {
NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) {
NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) {
NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) {
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) {
NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) {
NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) {
NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) {
NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) {
NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) {
NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) {
NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) {
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) {
NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) {
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) {
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) {
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) {
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) {
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) {
NPY_FINLINE npyv_u64 npyv_max_u64(npyv_u64 a, npyv_u64 b) {
NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) {
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) {
NPY_FINLINE npyv_u64 npyv_min_u64(npyv_u64 a, npyv_u64 b) {
NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) {
NPY_FINLINE npy_uint64 npyv_reduce_max_u64(npyv_u64 a) {
NPY_FINLINE npy_int64 npyv_reduce_max_s64(npyv_s64 a) {
NPY_FINLINE npy_uint64 npyv_reduce_min_u64(npyv_u64 a) {
NPY_FINLINE npy_int64 npyv_reduce_min_s64(npyv_s64 a) {
NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) {
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) {
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) {
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) {
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/neon$ cat ./neon.hpp | grep NPY_FINLINE | wc -l
311
luyahan@plct-c7:~/source/numpy/numpy/_core/src/common/simd/neon$