Open a74nh opened 11 months ago
Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics See info in area-owners.md if you want to be subscribed.
Author: | a74nh |
---|---|
Assignees: | - |
Labels: | `area-System.Runtime.Intrinsics` |
Milestone: | - |
/// Full API
public abstract partial class Sve : AdvSimd /// Feature: FEAT_SVE Category: stores
{
/// Store : Non-truncating store
/// void svst1[_f32](svbool_t pg, float32_t *base, svfloat32_t data) : "ST1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<float> mask, float* base, Vector<float> data);
/// void svst1[_f64](svbool_t pg, float64_t *base, svfloat64_t data) : "ST1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "ST1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<double> mask, double* base, Vector<double> data);
/// void svst1[_s8](svbool_t pg, int8_t *base, svint8_t data) : "ST1B Zdata.B, Pg, [Xarray, Xindex]" or "ST1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<sbyte> mask, sbyte* base, Vector<sbyte> data);
/// void svst1[_s16](svbool_t pg, int16_t *base, svint16_t data) : "ST1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<short> mask, short* base, Vector<short> data);
/// void svst1[_s32](svbool_t pg, int32_t *base, svint32_t data) : "ST1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<int> mask, int* base, Vector<int> data);
/// void svst1[_s64](svbool_t pg, int64_t *base, svint64_t data) : "ST1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "ST1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<long> mask, long* base, Vector<long> data);
/// void svst1[_u8](svbool_t pg, uint8_t *base, svuint8_t data) : "ST1B Zdata.B, Pg, [Xarray, Xindex]" or "ST1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<byte> mask, byte* base, Vector<byte> data);
/// void svst1[_u16](svbool_t pg, uint16_t *base, svuint16_t data) : "ST1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<ushort> mask, ushort* base, Vector<ushort> data);
/// void svst1[_u32](svbool_t pg, uint32_t *base, svuint32_t data) : "ST1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<uint> mask, uint* base, Vector<uint> data);
/// void svst1[_u64](svbool_t pg, uint64_t *base, svuint64_t data) : "ST1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "ST1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Store(Vector<ulong> mask, ulong* base, Vector<ulong> data);
/// StoreInt16NarrowToSByte : Truncate to 8 bits and store
/// void svst1b[_s16](svbool_t pg, int8_t *base, svint16_t data) : "ST1B Zdata.H, Pg, [Xarray, Xindex]" or "ST1B Zdata.H, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreInt16NarrowToSByte(Vector<short> mask, sbyte* base, Vector<short> data);
/// StoreInt32NarrowToInt16 : Truncate to 16 bits and store
/// void svst1h[_s32](svbool_t pg, int16_t *base, svint32_t data) : "ST1H Zdata.S, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreInt32NarrowToInt16(Vector<int> mask, short* base, Vector<int> data);
/// StoreInt32NarrowToSByte : Truncate to 8 bits and store
/// void svst1b[_s32](svbool_t pg, int8_t *base, svint32_t data) : "ST1B Zdata.S, Pg, [Xarray, Xindex]" or "ST1B Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreInt32NarrowToSByte(Vector<int> mask, sbyte* base, Vector<int> data);
/// StoreInt64NarrowToInt16 : Truncate to 16 bits and store
/// void svst1h[_s64](svbool_t pg, int16_t *base, svint64_t data) : "ST1H Zdata.D, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreInt64NarrowToInt16(Vector<long> mask, short* base, Vector<long> data);
/// StoreInt64NarrowToInt32 : Truncate to 32 bits and store
/// void svst1w[_s64](svbool_t pg, int32_t *base, svint64_t data) : "ST1W Zdata.D, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreInt64NarrowToInt32(Vector<long> mask, int* base, Vector<long> data);
/// StoreInt64NarrowToSByte : Truncate to 8 bits and store
/// void svst1b[_s64](svbool_t pg, int8_t *base, svint64_t data) : "ST1B Zdata.D, Pg, [Xarray, Xindex]" or "ST1B Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreInt64NarrowToSByte(Vector<long> mask, sbyte* base, Vector<long> data);
/// StoreNonTemporal : Non-truncating store, non-temporal
/// void svstnt1[_f32](svbool_t pg, float32_t *base, svfloat32_t data) : "STNT1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "STNT1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<float> mask, float* base, Vector<float> data);
/// void svstnt1[_f64](svbool_t pg, float64_t *base, svfloat64_t data) : "STNT1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "STNT1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<double> mask, double* base, Vector<double> data);
/// void svstnt1[_s8](svbool_t pg, int8_t *base, svint8_t data) : "STNT1B Zdata.B, Pg, [Xarray, Xindex]" or "STNT1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<sbyte> mask, sbyte* base, Vector<sbyte> data);
/// void svstnt1[_s16](svbool_t pg, int16_t *base, svint16_t data) : "STNT1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "STNT1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<short> mask, short* base, Vector<short> data);
/// void svstnt1[_s32](svbool_t pg, int32_t *base, svint32_t data) : "STNT1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "STNT1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<int> mask, int* base, Vector<int> data);
/// void svstnt1[_s64](svbool_t pg, int64_t *base, svint64_t data) : "STNT1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "STNT1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<long> mask, long* base, Vector<long> data);
/// void svstnt1[_u8](svbool_t pg, uint8_t *base, svuint8_t data) : "STNT1B Zdata.B, Pg, [Xarray, Xindex]" or "STNT1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<byte> mask, byte* base, Vector<byte> data);
/// void svstnt1[_u16](svbool_t pg, uint16_t *base, svuint16_t data) : "STNT1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "STNT1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<ushort> mask, ushort* base, Vector<ushort> data);
/// void svstnt1[_u32](svbool_t pg, uint32_t *base, svuint32_t data) : "STNT1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "STNT1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<uint> mask, uint* base, Vector<uint> data);
/// void svstnt1[_u64](svbool_t pg, uint64_t *base, svuint64_t data) : "STNT1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "STNT1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreNonTemporal(Vector<ulong> mask, ulong* base, Vector<ulong> data);
/// StoreUInt16NarrowToByte : Truncate to 8 bits and store
/// void svst1b[_u16](svbool_t pg, uint8_t *base, svuint16_t data) : "ST1B Zdata.H, Pg, [Xarray, Xindex]" or "ST1B Zdata.H, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreUInt16NarrowToByte(Vector<ushort> mask, byte* base, Vector<ushort> data);
/// StoreUInt32NarrowToByte : Truncate to 8 bits and store
/// void svst1b[_u32](svbool_t pg, uint8_t *base, svuint32_t data) : "ST1B Zdata.S, Pg, [Xarray, Xindex]" or "ST1B Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreUInt32NarrowToByte(Vector<uint> mask, byte* base, Vector<uint> data);
/// StoreUInt32NarrowToUInt16 : Truncate to 16 bits and store
/// void svst1h[_u32](svbool_t pg, uint16_t *base, svuint32_t data) : "ST1H Zdata.S, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.S, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreUInt32NarrowToUInt16(Vector<uint> mask, ushort* base, Vector<uint> data);
/// StoreUInt64NarrowToByte : Truncate to 8 bits and store
/// void svst1b[_u64](svbool_t pg, uint8_t *base, svuint64_t data) : "ST1B Zdata.D, Pg, [Xarray, Xindex]" or "ST1B Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreUInt64NarrowToByte(Vector<ulong> mask, byte* base, Vector<ulong> data);
/// StoreUInt64NarrowToUInt16 : Truncate to 16 bits and store
/// void svst1h[_u64](svbool_t pg, uint16_t *base, svuint64_t data) : "ST1H Zdata.D, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreUInt64NarrowToUInt16(Vector<ulong> mask, ushort* base, Vector<ulong> data);
/// StoreUInt64NarrowToUInt32 : Truncate to 32 bits and store
/// void svst1w[_u64](svbool_t pg, uint32_t *base, svuint64_t data) : "ST1W Zdata.D, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.D, Pg, [Xbase, #0, MUL VL]"
public static unsafe void StoreUInt64NarrowToUInt32(Vector<ulong> mask, uint* base, Vector<ulong> data);
/// Storex2 : Store two vectors into two-element tuples
/// void svst2[_f32](svbool_t pg, float32_t *base, svfloat32x2_t data) : "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<float> mask, float* base, (Vector<float> data1, Vector<float> data2));
/// void svst2[_f64](svbool_t pg, float64_t *base, svfloat64x2_t data) : "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<double> mask, double* base, (Vector<double> data1, Vector<double> data2));
/// void svst2[_s8](svbool_t pg, int8_t *base, svint8x2_t data) : "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xarray, Xindex]" or "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<sbyte> mask, sbyte* base, (Vector<sbyte> data1, Vector<sbyte> data2));
/// void svst2[_s16](svbool_t pg, int16_t *base, svint16x2_t data) : "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<short> mask, short* base, (Vector<short> data1, Vector<short> data2));
/// void svst2[_s32](svbool_t pg, int32_t *base, svint32x2_t data) : "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<int> mask, int* base, (Vector<int> data1, Vector<int> data2));
/// void svst2[_s64](svbool_t pg, int64_t *base, svint64x2_t data) : "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<long> mask, long* base, (Vector<long> data1, Vector<long> data2));
/// void svst2[_u8](svbool_t pg, uint8_t *base, svuint8x2_t data) : "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xarray, Xindex]" or "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<byte> mask, byte* base, (Vector<byte> data1, Vector<byte> data2));
/// void svst2[_u16](svbool_t pg, uint16_t *base, svuint16x2_t data) : "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<ushort> mask, ushort* base, (Vector<ushort> data1, Vector<ushort> data2));
/// void svst2[_u32](svbool_t pg, uint32_t *base, svuint32x2_t data) : "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<uint> mask, uint* base, (Vector<uint> data1, Vector<uint> data2));
/// void svst2[_u64](svbool_t pg, uint64_t *base, svuint64x2_t data) : "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex2(Vector<ulong> mask, ulong* base, (Vector<ulong> data1, Vector<ulong> data2));
/// Storex3 : Store three vectors into three-element tuples
/// void svst3[_f32](svbool_t pg, float32_t *base, svfloat32x3_t data) : "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<float> mask, float* base, (Vector<float> data1, Vector<float> data2, Vector<float> data3));
/// void svst3[_f64](svbool_t pg, float64_t *base, svfloat64x3_t data) : "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<double> mask, double* base, (Vector<double> data1, Vector<double> data2, Vector<double> data3));
/// void svst3[_s8](svbool_t pg, int8_t *base, svint8x3_t data) : "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xarray, Xindex]" or "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<sbyte> mask, sbyte* base, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3));
/// void svst3[_s16](svbool_t pg, int16_t *base, svint16x3_t data) : "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<short> mask, short* base, (Vector<short> data1, Vector<short> data2, Vector<short> data3));
/// void svst3[_s32](svbool_t pg, int32_t *base, svint32x3_t data) : "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<int> mask, int* base, (Vector<int> data1, Vector<int> data2, Vector<int> data3));
/// void svst3[_s64](svbool_t pg, int64_t *base, svint64x3_t data) : "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<long> mask, long* base, (Vector<long> data1, Vector<long> data2, Vector<long> data3));
/// void svst3[_u8](svbool_t pg, uint8_t *base, svuint8x3_t data) : "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xarray, Xindex]" or "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<byte> mask, byte* base, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3));
/// void svst3[_u16](svbool_t pg, uint16_t *base, svuint16x3_t data) : "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<ushort> mask, ushort* base, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3));
/// void svst3[_u32](svbool_t pg, uint32_t *base, svuint32x3_t data) : "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<uint> mask, uint* base, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3));
/// void svst3[_u64](svbool_t pg, uint64_t *base, svuint64x3_t data) : "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex3(Vector<ulong> mask, ulong* base, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3));
/// Storex4 : Store four vectors into four-element tuples
/// void svst4[_f32](svbool_t pg, float32_t *base, svfloat32x4_t data) : "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<float> mask, float* base, (Vector<float> data1, Vector<float> data2, Vector<float> data3, Vector<float> data4));
/// void svst4[_f64](svbool_t pg, float64_t *base, svfloat64x4_t data) : "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<double> mask, double* base, (Vector<double> data1, Vector<double> data2, Vector<double> data3, Vector<double> data4));
/// void svst4[_s8](svbool_t pg, int8_t *base, svint8x4_t data) : "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xarray, Xindex]" or "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<sbyte> mask, sbyte* base, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3, Vector<sbyte> data4));
/// void svst4[_s16](svbool_t pg, int16_t *base, svint16x4_t data) : "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<short> mask, short* base, (Vector<short> data1, Vector<short> data2, Vector<short> data3, Vector<short> data4));
/// void svst4[_s32](svbool_t pg, int32_t *base, svint32x4_t data) : "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<int> mask, int* base, (Vector<int> data1, Vector<int> data2, Vector<int> data3, Vector<int> data4));
/// void svst4[_s64](svbool_t pg, int64_t *base, svint64x4_t data) : "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<long> mask, long* base, (Vector<long> data1, Vector<long> data2, Vector<long> data3, Vector<long> data4));
/// void svst4[_u8](svbool_t pg, uint8_t *base, svuint8x4_t data) : "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xarray, Xindex]" or "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<byte> mask, byte* base, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3, Vector<byte> data4));
/// void svst4[_u16](svbool_t pg, uint16_t *base, svuint16x4_t data) : "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<ushort> mask, ushort* base, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3, Vector<ushort> data4));
/// void svst4[_u32](svbool_t pg, uint32_t *base, svuint32x4_t data) : "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<uint> mask, uint* base, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3, Vector<uint> data4));
/// void svst4[_u64](svbool_t pg, uint64_t *base, svuint64x4_t data) : "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xbase, #0, MUL VL]"
public static unsafe void Storex4(Vector<ulong> mask, ulong* base, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3, Vector<ulong> data4));
/// total method signatures: 62
/// total method names: 17
}
/// Rejected:
/// public static unsafe void Store(Vector<float> mask, float* base, long vnum, Vector<float> data); // svst1_vnum[_f32]
/// public static unsafe void Store(Vector<double> mask, double* base, long vnum, Vector<double> data); // svst1_vnum[_f64]
/// public static unsafe void Store(Vector<sbyte> mask, sbyte* base, long vnum, Vector<sbyte> data); // svst1_vnum[_s8]
/// public static unsafe void Store(Vector<short> mask, short* base, long vnum, Vector<short> data); // svst1_vnum[_s16]
/// public static unsafe void Store(Vector<int> mask, int* base, long vnum, Vector<int> data); // svst1_vnum[_s32]
/// public static unsafe void Store(Vector<long> mask, long* base, long vnum, Vector<long> data); // svst1_vnum[_s64]
/// public static unsafe void Store(Vector<byte> mask, byte* base, long vnum, Vector<byte> data); // svst1_vnum[_u8]
/// public static unsafe void Store(Vector<ushort> mask, ushort* base, long vnum, Vector<ushort> data); // svst1_vnum[_u16]
/// public static unsafe void Store(Vector<uint> mask, uint* base, long vnum, Vector<uint> data); // svst1_vnum[_u32]
/// public static unsafe void Store(Vector<ulong> mask, ulong* base, long vnum, Vector<ulong> data); // svst1_vnum[_u64]
/// public static unsafe void StoreInt16NarrowToSByte(Vector<short> mask, sbyte* base, long vnum, Vector<short> data); // svst1b_vnum[_s16]
/// public static unsafe void StoreInt32NarrowToInt16(Vector<int> mask, short* base, long vnum, Vector<int> data); // svst1h_vnum[_s32]
/// public static unsafe void StoreInt32NarrowToSByte(Vector<int> mask, sbyte* base, long vnum, Vector<int> data); // svst1b_vnum[_s32]
/// public static unsafe void StoreInt64NarrowToInt16(Vector<long> mask, short* base, long vnum, Vector<long> data); // svst1h_vnum[_s64]
/// public static unsafe void StoreInt64NarrowToInt32(Vector<long> mask, int* base, long vnum, Vector<long> data); // svst1w_vnum[_s64]
/// public static unsafe void StoreInt64NarrowToSByte(Vector<long> mask, sbyte* base, long vnum, Vector<long> data); // svst1b_vnum[_s64]
/// public static unsafe void StoreNonTemporal(Vector<float> mask, float* base, long vnum, Vector<float> data); // svstnt1_vnum[_f32]
/// public static unsafe void StoreNonTemporal(Vector<double> mask, double* base, long vnum, Vector<double> data); // svstnt1_vnum[_f64]
/// public static unsafe void StoreNonTemporal(Vector<sbyte> mask, sbyte* base, long vnum, Vector<sbyte> data); // svstnt1_vnum[_s8]
/// public static unsafe void StoreNonTemporal(Vector<short> mask, short* base, long vnum, Vector<short> data); // svstnt1_vnum[_s16]
/// public static unsafe void StoreNonTemporal(Vector<int> mask, int* base, long vnum, Vector<int> data); // svstnt1_vnum[_s32]
/// public static unsafe void StoreNonTemporal(Vector<long> mask, long* base, long vnum, Vector<long> data); // svstnt1_vnum[_s64]
/// public static unsafe void StoreNonTemporal(Vector<byte> mask, byte* base, long vnum, Vector<byte> data); // svstnt1_vnum[_u8]
/// public static unsafe void StoreNonTemporal(Vector<ushort> mask, ushort* base, long vnum, Vector<ushort> data); // svstnt1_vnum[_u16]
/// public static unsafe void StoreNonTemporal(Vector<uint> mask, uint* base, long vnum, Vector<uint> data); // svstnt1_vnum[_u32]
/// public static unsafe void StoreNonTemporal(Vector<ulong> mask, ulong* base, long vnum, Vector<ulong> data); // svstnt1_vnum[_u64]
/// public static unsafe void StoreUInt16NarrowToByte(Vector<ushort> mask, byte* base, long vnum, Vector<ushort> data); // svst1b_vnum[_u16]
/// public static unsafe void StoreUInt32NarrowToByte(Vector<uint> mask, byte* base, long vnum, Vector<uint> data); // svst1b_vnum[_u32]
/// public static unsafe void StoreUInt32NarrowToUInt16(Vector<uint> mask, ushort* base, long vnum, Vector<uint> data); // svst1h_vnum[_u32]
/// public static unsafe void StoreUInt64NarrowToByte(Vector<ulong> mask, byte* base, long vnum, Vector<ulong> data); // svst1b_vnum[_u64]
/// public static unsafe void StoreUInt64NarrowToUInt16(Vector<ulong> mask, ushort* base, long vnum, Vector<ulong> data); // svst1h_vnum[_u64]
/// public static unsafe void StoreUInt64NarrowToUInt32(Vector<ulong> mask, uint* base, long vnum, Vector<ulong> data); // svst1w_vnum[_u64]
/// public static unsafe void Storex2(Vector<float> mask, float* base, long vnum, (Vector<float> data1, Vector<float> data2)); // svst2_vnum[_f32]
/// public static unsafe void Storex2(Vector<double> mask, double* base, long vnum, (Vector<double> data1, Vector<double> data2)); // svst2_vnum[_f64]
/// public static unsafe void Storex2(Vector<sbyte> mask, sbyte* base, long vnum, (Vector<sbyte> data1, Vector<sbyte> data2)); // svst2_vnum[_s8]
/// public static unsafe void Storex2(Vector<short> mask, short* base, long vnum, (Vector<short> data1, Vector<short> data2)); // svst2_vnum[_s16]
/// public static unsafe void Storex2(Vector<int> mask, int* base, long vnum, (Vector<int> data1, Vector<int> data2)); // svst2_vnum[_s32]
/// public static unsafe void Storex2(Vector<long> mask, long* base, long vnum, (Vector<long> data1, Vector<long> data2)); // svst2_vnum[_s64]
/// public static unsafe void Storex2(Vector<byte> mask, byte* base, long vnum, (Vector<byte> data1, Vector<byte> data2)); // svst2_vnum[_u8]
/// public static unsafe void Storex2(Vector<ushort> mask, ushort* base, long vnum, (Vector<ushort> data1, Vector<ushort> data2)); // svst2_vnum[_u16]
/// public static unsafe void Storex2(Vector<uint> mask, uint* base, long vnum, (Vector<uint> data1, Vector<uint> data2)); // svst2_vnum[_u32]
/// public static unsafe void Storex2(Vector<ulong> mask, ulong* base, long vnum, (Vector<ulong> data1, Vector<ulong> data2)); // svst2_vnum[_u64]
/// public static unsafe void Storex3(Vector<float> mask, float* base, long vnum, (Vector<float> data1, Vector<float> data2, Vector<float> data3)); // svst3_vnum[_f32]
/// public static unsafe void Storex3(Vector<double> mask, double* base, long vnum, (Vector<double> data1, Vector<double> data2, Vector<double> data3)); // svst3_vnum[_f64]
/// public static unsafe void Storex3(Vector<sbyte> mask, sbyte* base, long vnum, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3)); // svst3_vnum[_s8]
/// public static unsafe void Storex3(Vector<short> mask, short* base, long vnum, (Vector<short> data1, Vector<short> data2, Vector<short> data3)); // svst3_vnum[_s16]
/// public static unsafe void Storex3(Vector<int> mask, int* base, long vnum, (Vector<int> data1, Vector<int> data2, Vector<int> data3)); // svst3_vnum[_s32]
/// public static unsafe void Storex3(Vector<long> mask, long* base, long vnum, (Vector<long> data1, Vector<long> data2, Vector<long> data3)); // svst3_vnum[_s64]
/// public static unsafe void Storex3(Vector<byte> mask, byte* base, long vnum, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3)); // svst3_vnum[_u8]
/// public static unsafe void Storex3(Vector<ushort> mask, ushort* base, long vnum, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3)); // svst3_vnum[_u16]
/// public static unsafe void Storex3(Vector<uint> mask, uint* base, long vnum, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3)); // svst3_vnum[_u32]
/// public static unsafe void Storex3(Vector<ulong> mask, ulong* base, long vnum, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3)); // svst3_vnum[_u64]
/// public static unsafe void Storex4(Vector<float> mask, float* base, long vnum, (Vector<float> data1, Vector<float> data2, Vector<float> data3, Vector<float> data4)); // svst4_vnum[_f32]
/// public static unsafe void Storex4(Vector<double> mask, double* base, long vnum, (Vector<double> data1, Vector<double> data2, Vector<double> data3, Vector<double> data4)); // svst4_vnum[_f64]
/// public static unsafe void Storex4(Vector<sbyte> mask, sbyte* base, long vnum, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3, Vector<sbyte> data4)); // svst4_vnum[_s8]
/// public static unsafe void Storex4(Vector<short> mask, short* base, long vnum, (Vector<short> data1, Vector<short> data2, Vector<short> data3, Vector<short> data4)); // svst4_vnum[_s16]
/// public static unsafe void Storex4(Vector<int> mask, int* base, long vnum, (Vector<int> data1, Vector<int> data2, Vector<int> data3, Vector<int> data4)); // svst4_vnum[_s32]
/// public static unsafe void Storex4(Vector<long> mask, long* base, long vnum, (Vector<long> data1, Vector<long> data2, Vector<long> data3, Vector<long> data4)); // svst4_vnum[_s64]
/// public static unsafe void Storex4(Vector<byte> mask, byte* base, long vnum, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3, Vector<byte> data4)); // svst4_vnum[_u8]
/// public static unsafe void Storex4(Vector<ushort> mask, ushort* base, long vnum, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3, Vector<ushort> data4)); // svst4_vnum[_u16]
/// public static unsafe void Storex4(Vector<uint> mask, uint* base, long vnum, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3, Vector<uint> data4)); // svst4_vnum[_u32]
/// public static unsafe void Storex4(Vector<ulong> mask, ulong* base, long vnum, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3, Vector<ulong> data4)); // svst4_vnum[_u64]
/// Total Rejected: 62
/// Total ACLE covered across API: 124
This contributes to https://github.com/dotnet/runtime/issues/93095
It covers instructions in FEAT_SVE related to stores. Note there are more store methods in scatter stores
This list was auto-generated from the C ACLE for SVE, and is in three parts:
(1) The methods list reduced down to Vector versions — all possible variants of T are given above the method. (2) The complete list of all methods — the corresponding ACLE methods and SVE instructions are given above the method. (3) All rejected ACLE methods — these are methods we have agreed do not need including in C#. Where possible, existing C# naming conventions have been matched.
Many of the C functions include predicate argument(s), of type svbool_t as the first argument. These are missing from the C# method. It is expected that the Jit will create predicates where required, or combine with uses of conditionalSelect(). For more discussion see https://github.com/dotnet/runtime/issues/88140 comment.
Unlike the Load APIs, we don't need to differentiate by return type here (we typically just use Store, unlike vector where it is LoadVector).
However, StoreVectorTruncate32
isn't necessarily clear on the semantic. We also notably opted for the terminology Narrow
/Narrowing
in AdvSimd
. So we probably want to give some consideration on whether we can make it clear that this effectively does ExtractNarrowingLower
followed by a Store
.
This issue has been marked needs-author-action
and may be missing some important information.
However, StoreVectorTruncate32 isn't necessarily clear on the semantic. We also notably opted for the terminology Narrow/Narrowing in AdvSimd. So we probably want to give some consideration on whether we can make it clear that this effectively does ExtractNarrowingLower followed by a Store.
Updated to StoreInt32NarrowToInt16() etc.
As far as I can tell, StoreInt32NarrowToInt16
and StoreUInt32NarrowToInt16()
are identical in operation. Maybe it can be renamed and one dropped?
Also, I feel that the type at the end (the destination type) isn't clear as to whether the destination type is signed or not.
As far as I can tell, StoreInt32NarrowToInt16 and StoreUInt32NarrowToInt16() are identical in operation. Maybe it can be renamed and one dropped?
That seems fine. We only need to differentiate by behavior or return type. If the behavior is identical and the delimiter would only be the input type (which can simply be an overload), then dropping the delimiter is fine.
Also, I feel that the type at the end (the destination type) isn't clear as to whether the destination type is signed or not.
We'd use Int32
for signed destination and UInt32
for unsigned destination.
All stores are predicated ("Inactive elements are not written to memory"). How did you see this working with the use of conditionalSelect?
Predicated stores/loads likely need an explicit overload that takes a mask since the operation happens to memory and the ConditionalSelect
operation can't clearly represent that.
There are potentially patterns that could be recognized, given the memory ordering rules. But exposing a couple extra overloads for this important concept should be fine.
Something like MaskLoad
/MaskStore
or MaskedLoad
/MaskedStore
would make it clearer on how the semantics work than simply an overload that takes a mask.
Regular Load
/Store
could then be kept for simplicity and use the "all active" predicate.
Something like MaskLoad/MaskStore or MaskedLoad/MaskedStore would make it clearer on how the semantics work than simply an overload that takes a mask.
That would be fine. Warning though, it's going to bloat the API. Across load,gather,firstfaulting,store,scatter that's 890 functions just for SVE1.
Instead of writing all of them out, I might just add a note to the top of the API block.
that's 890 functions just for SVE1.
This is because every Store
API has a MaskedStore
equivalent, so it's effectively doubling the total number of exposed APIs, right?
I wonder if we could compensate for this by avoiding the vnum
overloads here... That is, as I understand it we have two forms of the instruction, where the former is just a specialized encoding of the latter when the immediate is between [-8, +7]
. We're then exposing the vnum
overload so users can specify the index explicitly:
ST1W (scalar plus immediate, single register)
ST1W (scalar plus scalar, single register)
Most notably, we could just expose public static unsafe void Store(T* destination, Vector<T> data);
and then support the offset variants implicitly by recognizing patterns such as Store(destination + index, data)
could be used to generate the optimal overload of ST1W
. This is exactly as we would with normal scalar codegen for something like *(destination + 1) = data;
or destination[i] = data
, etc.
We could then expose MaskedStore(T* destination, Vector<T> mask, Vector<T> data)
and do the same there. Giving us 2 overloads per load/store instruction, rather than increasing it to 4.
On another note, I see these APIs explicitly take long
(System.Int64
) as the index. Is SVE an Arm64 exclusive or should this be nint
so that it's 32-bits on 32-bit architectures?
The way most of the ISAs are exposed today we have:
public abstract class IsaName
{
// only true if hardware supports IsaName
public static bool IsSupported { get; }
// Methods supported by both Arm32 and Arm64
public static void MethodA();
public abstract class Arm32
{
// only true if hardware supports IsaName and we're 32-bit
public static bool IsSupported { get; }
// Methods supported by only Arm32
public static void MethodB();
}
public abstract class Arm64
{
// only true if hardware supports IsaName and we're 64-bit
public static bool IsSupported { get; }
// Methods supported by only Arm64
public static void MethodC();
}
}
If there is zero chance of SVE ever being supported on Arm32, then we could potentially consider an exception to this normal design layout. But if that (or some other size) might be a consideration in the future, we may want to slightly tweak it to take nint
/nuint
or section off the 64-bit only APIs.
that's 890 functions just for SVE1.
This is because every
Store
API has a MaskedStore
equivalent, so it's effectively doubling the total number of exposed APIs, right?
Yup. I just totalled what we had already.
I wonder if we could compensate for this by avoiding the
vnum
overloads here... That is, as I understand it we have two forms of the instruction, where the former is just a specialized encoding of the latter when the immediate is between [-8, +7]
. We're then exposing the vnum
overload so users can specify the index explicitly:ST1W (scalar plus immediate, single register) ST1W (scalar plus scalar, single register)
Most notably, we could just expose
public static unsafe void Store(T* destination, Vector<T> data);
and then support the offset variants implicitly by recognizing patterns such as Store(destination + index, data)
could be used to generate the optimal overload of ST1W
. This is exactly as we would with normal scalar codegen for something like *(destination + 1) = data;
or destination[i] = data
, etc. We could then expose
MaskedStore(T* destination, Vector<T> mask, Vector<T> data)
and do the same there. Giving us 2 overloads per load/store instruction, rather than increasing it to 4.
That should work. I'm assuming the pattern recognition is generic enough so that it can be used mostly as is without much refactoring?
Alternatively, the user can use a generic vector<T>.Store()
to store an SVE vector? These will always be unmasked. So for the Sve
class, we could only expose the masked versions of the stores. If the user doesn't care about masks they would call Vector<T>.Store()
.
Happy with either.
On another note, I see these APIs explicitly take
long
(System.Int64
) as the index. Is SVE an Arm64 exclusive or should this be nint
so that it's 32-bits on 32-bit architectures? The way most of the ISAs are exposed today we have:
public abstract class IsaName { // only true if hardware supports IsaName public static bool IsSupported { get; } // Methods supported by both Arm32 and Arm64 public static void MethodA(); public abstract class Arm32 { // only true if hardware supports IsaName and we're 32-bit public static bool IsSupported { get; } // Methods supported by only Arm32 public static void MethodB(); } public abstract class Arm64 { // only true if hardware supports IsaName and we're 64-bit public static bool IsSupported { get; } // Methods supported by only Arm64 public static void MethodC(); } }
If there is zero chance of SVE ever being supported on Arm32, then we could potentially consider an exception to this normal design layout. But if that (or some other size) might be a consideration in the future, we may want to slightly tweak it to take
nint
/nuint
or section off the 64-bit only APIs.
SVE is Arm64 only. It would take far too much silicon/power to be viable for 32bit. Closest there is for Arm32 is MVE aka Helium on M class.
Alternatively, the user can use a generic vector
.Store() to store an SVE vector? These will always be unmasked. So for the Sve class, we could only expose the masked versions of the stores. If the user doesn't care about masks they would call Vector .Store().
That sounds reasonable to me as well. They already have a "convenience" API in the form of x.Store(destination)
, so only having the predicated version exposed under Sve
is more verbose to use, but ultimately achieves the same thing, and really only needed if they need masking.
SVE is Arm64 only. It would take far too much silicon/power to be viable for 32bit. Closest there is for Arm32 is MVE aka Helium on M class.
👍. We'll probably end up discussing this a bit in API review and whether we want to be "consistent" and have it only under Sve.Arm64
or if we're fine saying "this is special enough and it's suitable to be exposed as simply Sve
". I'll push for the latter, because most of these have zero real consideration between 32-bit vs 64-bit anyways.
Updated:
vnum
variants into optional Narrow
methods. Also updated the scatter, load, gather, firstfault APIs in the same way.
For this API and all the other SVE1 APIs that haven't been reviewed yet I've updated the API proposals.
These updates take into account the changes made in the other APIs. My updated scripts have automatically applied to entries in these APIs. Eg: Any addressing modes with a long vnum
argument are rejected.
namespace System.Runtime.Intrinsics.Arm;
/// VectorT Summary
public abstract partial class Sve : AdvSimd /// Feature: FEAT_SVE Category: stores
{
/// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
public static unsafe void Store(Vector<T> mask, T* address, Vector<T> data); // ST1W or ST1D or ST1B or ST1H
/// T: [short, sbyte], [int, short], [int, sbyte], [long, short], [long, int], [long, sbyte]
/// T: [ushort, byte], [uint, ushort], [uint, byte], [ulong, ushort], [ulong, uint], [ulong, byte]
public static unsafe void StoreNarrowing(Vector<T> mask, T2* address, Vector<T> data); // ST1B or ST1H or ST1W
/// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
public static unsafe void StoreNonTemporal(Vector<T> mask, T* address, Vector<T> data); // STNT1W or STNT1D or STNT1B or STNT1H
/// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2) data); // ST2W or ST2D or ST2B or ST2H
/// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2, Vector<T> Value3) data); // ST3W or ST3D or ST3B or ST3H
/// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2, Vector<T> Value3, Vector<T> Value4) data); // ST4W or ST4D or ST4B or ST4H
/// total method signatures: 17
}
@tannergooding
/// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2) data); // ST2W or ST2D or ST2B or ST2H
ST2W
which will store only the first 32bits from each vector to memory. ST3W
which stores the first 32bits from each vector. That means for the one arg version, eg:
public static unsafe void Store(Vector<T> mask, T* address, Vector<T> data); // ST1W or ST1D or ST1B or ST1H
This will only store the first 32bits from the input vector.
How does the user store a full vector to memory (using STR
) ?
Suggestion:
StoreFirstElement()
Store(Vector<T> mask, T* address, Vector<T> value)
which uses STR. cc: @SwapnilGaikwad
Ignore the previous comment, it was wrong. st2w
stores all values from the input vectors to memory, interleaving in word sized amounts.