dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License

[API Proposal]: Arm64: FEAT_SVE: stores #94011

Open a74nh opened 11 months ago

a74nh commented 11 months ago
namespace System.Runtime.Intrinsics.Arm;

/// VectorT Summary
public abstract partial class Sve : AdvSimd /// Feature: FEAT_SVE  Category: stores
{

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Store(Vector<T> mask, T* address, Vector<T> data); // ST1W or ST1D or ST1B or ST1H

  /// T: [short, sbyte], [int, short], [int, sbyte], [long, short], [long, int], [long, sbyte]
  /// T: [ushort, byte], [uint, ushort], [uint, byte], [ulong, ushort], [ulong, uint], [ulong, byte]
  public static unsafe void StoreNarrowing(Vector<T> mask, T2* address, Vector<T> data); // ST1B or ST1H or ST1W

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreNonTemporal(Vector<T> mask, T* address, Vector<T> data); // STNT1W or STNT1D or STNT1B or STNT1H

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Storex2(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2) data); // ST2W or ST2D or ST2B or ST2H

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Storex3(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2, Vector<T> Value3) data); // ST3W or ST3D or ST3B or ST3H

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Storex4(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2, Vector<T> Value3, Vector<T> Value4) data); // ST4W or ST4D or ST4B or ST4H

  /// total method signatures: 17
}
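
As an illustration of how the proposed surface might be used, here is a minimal sketch of a predicated copy loop. It assumes the Store signature above, plus the CreateWhileLessThanMask32Bit and LoadVector helpers whose names come from the companion SVE proposals rather than this one:

```csharp
using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static unsafe void MaskedCopy(int* dst, int* src, int count)
{
    // Assumes the caller has already checked Sve.IsSupported.
    for (int i = 0; i < count; i += Vector<int>.Count)
    {
        // The predicate is active while (i + lane) < count, so the final
        // partial vector is handled without a scalar tail loop.
        Vector<int> mask = Sve.CreateWhileLessThanMask32Bit(i, count);
        Vector<int> data = Sve.LoadVector(mask, src + i); // inactive lanes load as zero
        Sve.Store(mask, dst + i, data);                   // inactive lanes are not written
    }
}
```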
ghost commented 11 months ago


Issue Details
```csharp
namespace System.Runtime.Intrinsics.Arm

/// VectorT Summary
public abstract class Sve : AdvSimd /// Feature: FEAT_SVE  Category: stores
{
  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreVector(T *base, Vector<T> data); // ST1W or ST1D or ST1B or ST1H

  /// T: float, double, sbyte, short, int, byte, ushort, uint, ulong
  public static unsafe void StoreVector(T *base, long vnum, Vector<T> data); // ST1W or ST1D or ST1B or ST1H
  public static unsafe void StoreVector(long *base, long vnum, Vector<long> data);

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreVectorNonTemporal(T *base, Vector<T> data); // STNT1W or STNT1D or STNT1B or STNT1H

  /// T: float, double, sbyte, short, int, byte, ushort, uint, ulong
  public static unsafe void StoreVectorNonTemporal(T *base, long vnum, Vector<T> data); // STNT1W or STNT1D or STNT1B or STNT1H
  public static unsafe void StoreVectorNonTemporal(long *base, long vnum, Vector<long> data);

  /// T: int, long
  public static unsafe void StoreVectorTruncate16(short *base, Vector<T> data); // ST1H

  /// T: uint, ulong
  public static unsafe void StoreVectorTruncate16(ushort *base, Vector<T> data); // ST1H

  /// T: int, long
  public static unsafe void StoreVectorTruncate16(short *base, long vnum, Vector<T> data); // ST1H

  /// T: uint, ulong
  public static unsafe void StoreVectorTruncate16(ushort *base, long vnum, Vector<T> data); // ST1H

  public static unsafe void StoreVectorTruncate32(int *base, Vector<long> data);
  public static unsafe void StoreVectorTruncate32(uint *base, Vector<ulong> data);
  public static unsafe void StoreVectorTruncate32(int *base, long vnum, Vector<long> data);
  public static unsafe void StoreVectorTruncate32(uint *base, long vnum, Vector<ulong> data);

  /// T: short, int, long
  public static unsafe void StoreVectorTruncate8(sbyte *base, Vector<T> data); // ST1B

  /// T: ushort, uint, ulong
  public static unsafe void StoreVectorTruncate8(byte *base, Vector<T> data); // ST1B

  /// T: short, int, long
  public static unsafe void StoreVectorTruncate8(sbyte *base, long vnum, Vector<T> data); // ST1B

  /// T: ushort, uint, ulong
  public static unsafe void StoreVectorTruncate8(byte *base, long vnum, Vector<T> data); // ST1B

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreVectorx2(T *base, (Vector<T> data1, Vector<T> data2)); // ST2W or ST2D or ST2B or ST2H

  /// T: float, double, sbyte, short, int, byte, ushort, uint, ulong
  public static unsafe void StoreVectorx2(T *base, long vnum, (Vector<T> data1, Vector<T> data2)); // ST2W or ST2D or ST2B or ST2H
  public static unsafe void StoreVectorx2(long *base, long vnum, (Vector<long> data1, Vector<long> data2));

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreVectorx3(T *base, (Vector<T> data1, Vector<T> data2, Vector<T> data3)); // ST3W or ST3D or ST3B or ST3H

  /// T: float, double, sbyte, short, int, byte, ushort, uint, ulong
  public static unsafe void StoreVectorx3(T *base, long vnum, (Vector<T> data1, Vector<T> data2, Vector<T> data3)); // ST3W or ST3D or ST3B or ST3H
  public static unsafe void StoreVectorx3(long *base, long vnum, (Vector<long> data1, Vector<long> data2, Vector<long> data3));

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreVectorx4(T *base, (Vector<T> data1, Vector<T> data2, Vector<T> data3, Vector<T> data4)); // ST4W or ST4D or ST4B or ST4H

  /// T: float, double, sbyte, short, int, byte, ushort, uint, ulong
  public static unsafe void StoreVectorx4(T *base, long vnum, (Vector<T> data1, Vector<T> data2, Vector<T> data3, Vector<T> data4)); // ST4W or ST4D or ST4B or ST4H
  public static unsafe void StoreVectorx4(long *base, long vnum, (Vector<long> data1, Vector<long> data2, Vector<long> data3, Vector<long> data4));

  /// total method signatures: 27
}
```
Author: a74nh
Assignees: -
Labels: `area-System.Runtime.Intrinsics`
Milestone: -
a74nh commented 11 months ago

/// Full API
public abstract partial class Sve : AdvSimd /// Feature: FEAT_SVE  Category: stores
{
    /// Store : Non-truncating store

    /// void svst1[_f32](svbool_t pg, float32_t *base, svfloat32_t data) : "ST1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<float> mask, float* base, Vector<float> data);

    /// void svst1[_f64](svbool_t pg, float64_t *base, svfloat64_t data) : "ST1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "ST1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<double> mask, double* base, Vector<double> data);

    /// void svst1[_s8](svbool_t pg, int8_t *base, svint8_t data) : "ST1B Zdata.B, Pg, [Xarray, Xindex]" or "ST1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<sbyte> mask, sbyte* base, Vector<sbyte> data);

    /// void svst1[_s16](svbool_t pg, int16_t *base, svint16_t data) : "ST1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<short> mask, short* base, Vector<short> data);

    /// void svst1[_s32](svbool_t pg, int32_t *base, svint32_t data) : "ST1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<int> mask, int* base, Vector<int> data);

    /// void svst1[_s64](svbool_t pg, int64_t *base, svint64_t data) : "ST1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "ST1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<long> mask, long* base, Vector<long> data);

    /// void svst1[_u8](svbool_t pg, uint8_t *base, svuint8_t data) : "ST1B Zdata.B, Pg, [Xarray, Xindex]" or "ST1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<byte> mask, byte* base, Vector<byte> data);

    /// void svst1[_u16](svbool_t pg, uint16_t *base, svuint16_t data) : "ST1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<ushort> mask, ushort* base, Vector<ushort> data);

    /// void svst1[_u32](svbool_t pg, uint32_t *base, svuint32_t data) : "ST1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<uint> mask, uint* base, Vector<uint> data);

    /// void svst1[_u64](svbool_t pg, uint64_t *base, svuint64_t data) : "ST1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "ST1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Store(Vector<ulong> mask, ulong* base, Vector<ulong> data);

    /// StoreInt16NarrowToSByte : Truncate to 8 bits and store

    /// void svst1b[_s16](svbool_t pg, int8_t *base, svint16_t data) : "ST1B Zdata.H, Pg, [Xarray, Xindex]" or "ST1B Zdata.H, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreInt16NarrowToSByte(Vector<short> mask, sbyte* base, Vector<short> data);

    /// StoreInt32NarrowToInt16 : Truncate to 16 bits and store

    /// void svst1h[_s32](svbool_t pg, int16_t *base, svint32_t data) : "ST1H Zdata.S, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreInt32NarrowToInt16(Vector<int> mask, short* base, Vector<int> data);

    /// StoreInt32NarrowToSByte : Truncate to 8 bits and store

    /// void svst1b[_s32](svbool_t pg, int8_t *base, svint32_t data) : "ST1B Zdata.S, Pg, [Xarray, Xindex]" or "ST1B Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreInt32NarrowToSByte(Vector<int> mask, sbyte* base, Vector<int> data);

    /// StoreInt64NarrowToInt16 : Truncate to 16 bits and store

    /// void svst1h[_s64](svbool_t pg, int16_t *base, svint64_t data) : "ST1H Zdata.D, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreInt64NarrowToInt16(Vector<long> mask, short* base, Vector<long> data);

    /// StoreInt64NarrowToInt32 : Truncate to 32 bits and store

    /// void svst1w[_s64](svbool_t pg, int32_t *base, svint64_t data) : "ST1W Zdata.D, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreInt64NarrowToInt32(Vector<long> mask, int* base, Vector<long> data);

    /// StoreInt64NarrowToSByte : Truncate to 8 bits and store

    /// void svst1b[_s64](svbool_t pg, int8_t *base, svint64_t data) : "ST1B Zdata.D, Pg, [Xarray, Xindex]" or "ST1B Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreInt64NarrowToSByte(Vector<long> mask, sbyte* base, Vector<long> data);

    /// StoreNonTemporal : Non-truncating store, non-temporal

    /// void svstnt1[_f32](svbool_t pg, float32_t *base, svfloat32_t data) : "STNT1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "STNT1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<float> mask, float* base, Vector<float> data);

    /// void svstnt1[_f64](svbool_t pg, float64_t *base, svfloat64_t data) : "STNT1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "STNT1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<double> mask, double* base, Vector<double> data);

    /// void svstnt1[_s8](svbool_t pg, int8_t *base, svint8_t data) : "STNT1B Zdata.B, Pg, [Xarray, Xindex]" or "STNT1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<sbyte> mask, sbyte* base, Vector<sbyte> data);

    /// void svstnt1[_s16](svbool_t pg, int16_t *base, svint16_t data) : "STNT1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "STNT1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<short> mask, short* base, Vector<short> data);

    /// void svstnt1[_s32](svbool_t pg, int32_t *base, svint32_t data) : "STNT1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "STNT1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<int> mask, int* base, Vector<int> data);

    /// void svstnt1[_s64](svbool_t pg, int64_t *base, svint64_t data) : "STNT1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "STNT1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<long> mask, long* base, Vector<long> data);

    /// void svstnt1[_u8](svbool_t pg, uint8_t *base, svuint8_t data) : "STNT1B Zdata.B, Pg, [Xarray, Xindex]" or "STNT1B Zdata.B, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<byte> mask, byte* base, Vector<byte> data);

    /// void svstnt1[_u16](svbool_t pg, uint16_t *base, svuint16_t data) : "STNT1H Zdata.H, Pg, [Xarray, Xindex, LSL #1]" or "STNT1H Zdata.H, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<ushort> mask, ushort* base, Vector<ushort> data);

    /// void svstnt1[_u32](svbool_t pg, uint32_t *base, svuint32_t data) : "STNT1W Zdata.S, Pg, [Xarray, Xindex, LSL #2]" or "STNT1W Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<uint> mask, uint* base, Vector<uint> data);

    /// void svstnt1[_u64](svbool_t pg, uint64_t *base, svuint64_t data) : "STNT1D Zdata.D, Pg, [Xarray, Xindex, LSL #3]" or "STNT1D Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreNonTemporal(Vector<ulong> mask, ulong* base, Vector<ulong> data);

    /// StoreUInt16NarrowToByte : Truncate to 8 bits and store

    /// void svst1b[_u16](svbool_t pg, uint8_t *base, svuint16_t data) : "ST1B Zdata.H, Pg, [Xarray, Xindex]" or "ST1B Zdata.H, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreUInt16NarrowToByte(Vector<ushort> mask, byte* base, Vector<ushort> data);

    /// StoreUInt32NarrowToByte : Truncate to 8 bits and store

    /// void svst1b[_u32](svbool_t pg, uint8_t *base, svuint32_t data) : "ST1B Zdata.S, Pg, [Xarray, Xindex]" or "ST1B Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreUInt32NarrowToByte(Vector<uint> mask, byte* base, Vector<uint> data);

    /// StoreUInt32NarrowToUInt16 : Truncate to 16 bits and store

    /// void svst1h[_u32](svbool_t pg, uint16_t *base, svuint32_t data) : "ST1H Zdata.S, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.S, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreUInt32NarrowToUInt16(Vector<uint> mask, ushort* base, Vector<uint> data);

    /// StoreUInt64NarrowToByte : Truncate to 8 bits and store

    /// void svst1b[_u64](svbool_t pg, uint8_t *base, svuint64_t data) : "ST1B Zdata.D, Pg, [Xarray, Xindex]" or "ST1B Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreUInt64NarrowToByte(Vector<ulong> mask, byte* base, Vector<ulong> data);

    /// StoreUInt64NarrowToUInt16 : Truncate to 16 bits and store

    /// void svst1h[_u64](svbool_t pg, uint16_t *base, svuint64_t data) : "ST1H Zdata.D, Pg, [Xarray, Xindex, LSL #1]" or "ST1H Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreUInt64NarrowToUInt16(Vector<ulong> mask, ushort* base, Vector<ulong> data);

    /// StoreUInt64NarrowToUInt32 : Truncate to 32 bits and store

    /// void svst1w[_u64](svbool_t pg, uint32_t *base, svuint64_t data) : "ST1W Zdata.D, Pg, [Xarray, Xindex, LSL #2]" or "ST1W Zdata.D, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void StoreUInt64NarrowToUInt32(Vector<ulong> mask, uint* base, Vector<ulong> data);

    /// Storex2 : Store two vectors into two-element tuples

    /// void svst2[_f32](svbool_t pg, float32_t *base, svfloat32x2_t data) : "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<float> mask, float* base, (Vector<float> data1, Vector<float> data2));

    /// void svst2[_f64](svbool_t pg, float64_t *base, svfloat64x2_t data) : "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<double> mask, double* base, (Vector<double> data1, Vector<double> data2));

    /// void svst2[_s8](svbool_t pg, int8_t *base, svint8x2_t data) : "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xarray, Xindex]" or "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<sbyte> mask, sbyte* base, (Vector<sbyte> data1, Vector<sbyte> data2));

    /// void svst2[_s16](svbool_t pg, int16_t *base, svint16x2_t data) : "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<short> mask, short* base, (Vector<short> data1, Vector<short> data2));

    /// void svst2[_s32](svbool_t pg, int32_t *base, svint32x2_t data) : "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<int> mask, int* base, (Vector<int> data1, Vector<int> data2));

    /// void svst2[_s64](svbool_t pg, int64_t *base, svint64x2_t data) : "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<long> mask, long* base, (Vector<long> data1, Vector<long> data2));

    /// void svst2[_u8](svbool_t pg, uint8_t *base, svuint8x2_t data) : "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xarray, Xindex]" or "ST2B {Zdata0.B, Zdata1.B}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<byte> mask, byte* base, (Vector<byte> data1, Vector<byte> data2));

    /// void svst2[_u16](svbool_t pg, uint16_t *base, svuint16x2_t data) : "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST2H {Zdata0.H, Zdata1.H}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<ushort> mask, ushort* base, (Vector<ushort> data1, Vector<ushort> data2));

    /// void svst2[_u32](svbool_t pg, uint32_t *base, svuint32x2_t data) : "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST2W {Zdata0.S, Zdata1.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<uint> mask, uint* base, (Vector<uint> data1, Vector<uint> data2));

    /// void svst2[_u64](svbool_t pg, uint64_t *base, svuint64x2_t data) : "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST2D {Zdata0.D, Zdata1.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex2(Vector<ulong> mask, ulong* base, (Vector<ulong> data1, Vector<ulong> data2));

    /// Storex3 : Store three vectors into three-element tuples

    /// void svst3[_f32](svbool_t pg, float32_t *base, svfloat32x3_t data) : "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<float> mask, float* base, (Vector<float> data1, Vector<float> data2, Vector<float> data3));

    /// void svst3[_f64](svbool_t pg, float64_t *base, svfloat64x3_t data) : "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<double> mask, double* base, (Vector<double> data1, Vector<double> data2, Vector<double> data3));

    /// void svst3[_s8](svbool_t pg, int8_t *base, svint8x3_t data) : "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xarray, Xindex]" or "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<sbyte> mask, sbyte* base, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3));

    /// void svst3[_s16](svbool_t pg, int16_t *base, svint16x3_t data) : "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<short> mask, short* base, (Vector<short> data1, Vector<short> data2, Vector<short> data3));

    /// void svst3[_s32](svbool_t pg, int32_t *base, svint32x3_t data) : "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<int> mask, int* base, (Vector<int> data1, Vector<int> data2, Vector<int> data3));

    /// void svst3[_s64](svbool_t pg, int64_t *base, svint64x3_t data) : "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<long> mask, long* base, (Vector<long> data1, Vector<long> data2, Vector<long> data3));

    /// void svst3[_u8](svbool_t pg, uint8_t *base, svuint8x3_t data) : "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xarray, Xindex]" or "ST3B {Zdata0.B - Zdata2.B}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<byte> mask, byte* base, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3));

    /// void svst3[_u16](svbool_t pg, uint16_t *base, svuint16x3_t data) : "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST3H {Zdata0.H - Zdata2.H}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<ushort> mask, ushort* base, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3));

    /// void svst3[_u32](svbool_t pg, uint32_t *base, svuint32x3_t data) : "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST3W {Zdata0.S - Zdata2.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<uint> mask, uint* base, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3));

    /// void svst3[_u64](svbool_t pg, uint64_t *base, svuint64x3_t data) : "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST3D {Zdata0.D - Zdata2.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex3(Vector<ulong> mask, ulong* base, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3));

    /// Storex4 : Store four vectors into four-element tuples

    /// void svst4[_f32](svbool_t pg, float32_t *base, svfloat32x4_t data) : "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<float> mask, float* base, (Vector<float> data1, Vector<float> data2, Vector<float> data3, Vector<float> data4));

    /// void svst4[_f64](svbool_t pg, float64_t *base, svfloat64x4_t data) : "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<double> mask, double* base, (Vector<double> data1, Vector<double> data2, Vector<double> data3, Vector<double> data4));

    /// void svst4[_s8](svbool_t pg, int8_t *base, svint8x4_t data) : "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xarray, Xindex]" or "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<sbyte> mask, sbyte* base, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3, Vector<sbyte> data4));

    /// void svst4[_s16](svbool_t pg, int16_t *base, svint16x4_t data) : "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<short> mask, short* base, (Vector<short> data1, Vector<short> data2, Vector<short> data3, Vector<short> data4));

    /// void svst4[_s32](svbool_t pg, int32_t *base, svint32x4_t data) : "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<int> mask, int* base, (Vector<int> data1, Vector<int> data2, Vector<int> data3, Vector<int> data4));

    /// void svst4[_s64](svbool_t pg, int64_t *base, svint64x4_t data) : "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<long> mask, long* base, (Vector<long> data1, Vector<long> data2, Vector<long> data3, Vector<long> data4));

    /// void svst4[_u8](svbool_t pg, uint8_t *base, svuint8x4_t data) : "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xarray, Xindex]" or "ST4B {Zdata0.B - Zdata3.B}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<byte> mask, byte* base, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3, Vector<byte> data4));

    /// void svst4[_u16](svbool_t pg, uint16_t *base, svuint16x4_t data) : "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xarray, Xindex, LSL #1]" or "ST4H {Zdata0.H - Zdata3.H}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<ushort> mask, ushort* base, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3, Vector<ushort> data4));

    /// void svst4[_u32](svbool_t pg, uint32_t *base, svuint32x4_t data) : "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xarray, Xindex, LSL #2]" or "ST4W {Zdata0.S - Zdata3.S}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<uint> mask, uint* base, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3, Vector<uint> data4));

    /// void svst4[_u64](svbool_t pg, uint64_t *base, svuint64x4_t data) : "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xarray, Xindex, LSL #3]" or "ST4D {Zdata0.D - Zdata3.D}, Pg, [Xbase, #0, MUL VL]"
  public static unsafe void Storex4(Vector<ulong> mask, ulong* base, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3, Vector<ulong> data4));

  /// total method signatures: 62
  /// total method names:      17
}
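
As a reading aid, here is a scalar model of the narrowing-store semantics above, using StoreInt32NarrowToInt16 as the example: each active 32-bit lane is truncated and written to the corresponding contiguous 16-bit slot. This is only an illustration of the ST1H behaviour described in the comments, not part of the proposed surface:

```csharp
static unsafe void StoreInt32NarrowToInt16Model(bool[] mask, short* address, int[] data)
{
    for (int i = 0; i < data.Length; i++)
    {
        if (mask[i])
        {
            address[i] = (short)data[i]; // truncating conversion to 16 bits
        }
        // inactive elements are not written to memory
    }
}
```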
a74nh commented 11 months ago

  /// Rejected:
  ///   public static unsafe void Store(Vector<float> mask, float* base, long vnum, Vector<float> data); // svst1_vnum[_f32]
  ///   public static unsafe void Store(Vector<double> mask, double* base, long vnum, Vector<double> data); // svst1_vnum[_f64]
  ///   public static unsafe void Store(Vector<sbyte> mask, sbyte* base, long vnum, Vector<sbyte> data); // svst1_vnum[_s8]
  ///   public static unsafe void Store(Vector<short> mask, short* base, long vnum, Vector<short> data); // svst1_vnum[_s16]
  ///   public static unsafe void Store(Vector<int> mask, int* base, long vnum, Vector<int> data); // svst1_vnum[_s32]
  ///   public static unsafe void Store(Vector<long> mask, long* base, long vnum, Vector<long> data); // svst1_vnum[_s64]
  ///   public static unsafe void Store(Vector<byte> mask, byte* base, long vnum, Vector<byte> data); // svst1_vnum[_u8]
  ///   public static unsafe void Store(Vector<ushort> mask, ushort* base, long vnum, Vector<ushort> data); // svst1_vnum[_u16]
  ///   public static unsafe void Store(Vector<uint> mask, uint* base, long vnum, Vector<uint> data); // svst1_vnum[_u32]
  ///   public static unsafe void Store(Vector<ulong> mask, ulong* base, long vnum, Vector<ulong> data); // svst1_vnum[_u64]
  ///   public static unsafe void StoreInt16NarrowToSByte(Vector<short> mask, sbyte* base, long vnum, Vector<short> data); // svst1b_vnum[_s16]
  ///   public static unsafe void StoreInt32NarrowToInt16(Vector<int> mask, short* base, long vnum, Vector<int> data); // svst1h_vnum[_s32]
  ///   public static unsafe void StoreInt32NarrowToSByte(Vector<int> mask, sbyte* base, long vnum, Vector<int> data); // svst1b_vnum[_s32]
  ///   public static unsafe void StoreInt64NarrowToInt16(Vector<long> mask, short* base, long vnum, Vector<long> data); // svst1h_vnum[_s64]
  ///   public static unsafe void StoreInt64NarrowToInt32(Vector<long> mask, int* base, long vnum, Vector<long> data); // svst1w_vnum[_s64]
  ///   public static unsafe void StoreInt64NarrowToSByte(Vector<long> mask, sbyte* base, long vnum, Vector<long> data); // svst1b_vnum[_s64]
  ///   public static unsafe void StoreNonTemporal(Vector<float> mask, float* base, long vnum, Vector<float> data); // svstnt1_vnum[_f32]
  ///   public static unsafe void StoreNonTemporal(Vector<double> mask, double* base, long vnum, Vector<double> data); // svstnt1_vnum[_f64]
  ///   public static unsafe void StoreNonTemporal(Vector<sbyte> mask, sbyte* base, long vnum, Vector<sbyte> data); // svstnt1_vnum[_s8]
  ///   public static unsafe void StoreNonTemporal(Vector<short> mask, short* base, long vnum, Vector<short> data); // svstnt1_vnum[_s16]
  ///   public static unsafe void StoreNonTemporal(Vector<int> mask, int* base, long vnum, Vector<int> data); // svstnt1_vnum[_s32]
  ///   public static unsafe void StoreNonTemporal(Vector<long> mask, long* base, long vnum, Vector<long> data); // svstnt1_vnum[_s64]
  ///   public static unsafe void StoreNonTemporal(Vector<byte> mask, byte* base, long vnum, Vector<byte> data); // svstnt1_vnum[_u8]
  ///   public static unsafe void StoreNonTemporal(Vector<ushort> mask, ushort* base, long vnum, Vector<ushort> data); // svstnt1_vnum[_u16]
  ///   public static unsafe void StoreNonTemporal(Vector<uint> mask, uint* base, long vnum, Vector<uint> data); // svstnt1_vnum[_u32]
  ///   public static unsafe void StoreNonTemporal(Vector<ulong> mask, ulong* base, long vnum, Vector<ulong> data); // svstnt1_vnum[_u64]
  ///   public static unsafe void StoreUInt16NarrowToByte(Vector<ushort> mask, byte* base, long vnum, Vector<ushort> data); // svst1b_vnum[_u16]
  ///   public static unsafe void StoreUInt32NarrowToByte(Vector<uint> mask, byte* base, long vnum, Vector<uint> data); // svst1b_vnum[_u32]
  ///   public static unsafe void StoreUInt32NarrowToUInt16(Vector<uint> mask, ushort* base, long vnum, Vector<uint> data); // svst1h_vnum[_u32]
  ///   public static unsafe void StoreUInt64NarrowToByte(Vector<ulong> mask, byte* base, long vnum, Vector<ulong> data); // svst1b_vnum[_u64]
  ///   public static unsafe void StoreUInt64NarrowToUInt16(Vector<ulong> mask, ushort* base, long vnum, Vector<ulong> data); // svst1h_vnum[_u64]
  ///   public static unsafe void StoreUInt64NarrowToUInt32(Vector<ulong> mask, uint* base, long vnum, Vector<ulong> data); // svst1w_vnum[_u64]
  ///   public static unsafe void Storex2(Vector<float> mask, float* base, long vnum, (Vector<float> data1, Vector<float> data2)); // svst2_vnum[_f32]
  ///   public static unsafe void Storex2(Vector<double> mask, double* base, long vnum, (Vector<double> data1, Vector<double> data2)); // svst2_vnum[_f64]
  ///   public static unsafe void Storex2(Vector<sbyte> mask, sbyte* base, long vnum, (Vector<sbyte> data1, Vector<sbyte> data2)); // svst2_vnum[_s8]
  ///   public static unsafe void Storex2(Vector<short> mask, short* base, long vnum, (Vector<short> data1, Vector<short> data2)); // svst2_vnum[_s16]
  ///   public static unsafe void Storex2(Vector<int> mask, int* base, long vnum, (Vector<int> data1, Vector<int> data2)); // svst2_vnum[_s32]
  ///   public static unsafe void Storex2(Vector<long> mask, long* base, long vnum, (Vector<long> data1, Vector<long> data2)); // svst2_vnum[_s64]
  ///   public static unsafe void Storex2(Vector<byte> mask, byte* base, long vnum, (Vector<byte> data1, Vector<byte> data2)); // svst2_vnum[_u8]
  ///   public static unsafe void Storex2(Vector<ushort> mask, ushort* base, long vnum, (Vector<ushort> data1, Vector<ushort> data2)); // svst2_vnum[_u16]
  ///   public static unsafe void Storex2(Vector<uint> mask, uint* base, long vnum, (Vector<uint> data1, Vector<uint> data2)); // svst2_vnum[_u32]
  ///   public static unsafe void Storex2(Vector<ulong> mask, ulong* base, long vnum, (Vector<ulong> data1, Vector<ulong> data2)); // svst2_vnum[_u64]
  ///   public static unsafe void Storex3(Vector<float> mask, float* base, long vnum, (Vector<float> data1, Vector<float> data2, Vector<float> data3)); // svst3_vnum[_f32]
  ///   public static unsafe void Storex3(Vector<double> mask, double* base, long vnum, (Vector<double> data1, Vector<double> data2, Vector<double> data3)); // svst3_vnum[_f64]
  ///   public static unsafe void Storex3(Vector<sbyte> mask, sbyte* base, long vnum, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3)); // svst3_vnum[_s8]
  ///   public static unsafe void Storex3(Vector<short> mask, short* base, long vnum, (Vector<short> data1, Vector<short> data2, Vector<short> data3)); // svst3_vnum[_s16]
  ///   public static unsafe void Storex3(Vector<int> mask, int* base, long vnum, (Vector<int> data1, Vector<int> data2, Vector<int> data3)); // svst3_vnum[_s32]
  ///   public static unsafe void Storex3(Vector<long> mask, long* base, long vnum, (Vector<long> data1, Vector<long> data2, Vector<long> data3)); // svst3_vnum[_s64]
  ///   public static unsafe void Storex3(Vector<byte> mask, byte* base, long vnum, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3)); // svst3_vnum[_u8]
  ///   public static unsafe void Storex3(Vector<ushort> mask, ushort* base, long vnum, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3)); // svst3_vnum[_u16]
  ///   public static unsafe void Storex3(Vector<uint> mask, uint* base, long vnum, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3)); // svst3_vnum[_u32]
  ///   public static unsafe void Storex3(Vector<ulong> mask, ulong* base, long vnum, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3)); // svst3_vnum[_u64]
  ///   public static unsafe void Storex4(Vector<float> mask, float* base, long vnum, (Vector<float> data1, Vector<float> data2, Vector<float> data3, Vector<float> data4)); // svst4_vnum[_f32]
  ///   public static unsafe void Storex4(Vector<double> mask, double* base, long vnum, (Vector<double> data1, Vector<double> data2, Vector<double> data3, Vector<double> data4)); // svst4_vnum[_f64]
  ///   public static unsafe void Storex4(Vector<sbyte> mask, sbyte* base, long vnum, (Vector<sbyte> data1, Vector<sbyte> data2, Vector<sbyte> data3, Vector<sbyte> data4)); // svst4_vnum[_s8]
  ///   public static unsafe void Storex4(Vector<short> mask, short* base, long vnum, (Vector<short> data1, Vector<short> data2, Vector<short> data3, Vector<short> data4)); // svst4_vnum[_s16]
  ///   public static unsafe void Storex4(Vector<int> mask, int* base, long vnum, (Vector<int> data1, Vector<int> data2, Vector<int> data3, Vector<int> data4)); // svst4_vnum[_s32]
  ///   public static unsafe void Storex4(Vector<long> mask, long* base, long vnum, (Vector<long> data1, Vector<long> data2, Vector<long> data3, Vector<long> data4)); // svst4_vnum[_s64]
  ///   public static unsafe void Storex4(Vector<byte> mask, byte* base, long vnum, (Vector<byte> data1, Vector<byte> data2, Vector<byte> data3, Vector<byte> data4)); // svst4_vnum[_u8]
  ///   public static unsafe void Storex4(Vector<ushort> mask, ushort* base, long vnum, (Vector<ushort> data1, Vector<ushort> data2, Vector<ushort> data3, Vector<ushort> data4)); // svst4_vnum[_u16]
  ///   public static unsafe void Storex4(Vector<uint> mask, uint* base, long vnum, (Vector<uint> data1, Vector<uint> data2, Vector<uint> data3, Vector<uint> data4)); // svst4_vnum[_u32]
  ///   public static unsafe void Storex4(Vector<ulong> mask, ulong* base, long vnum, (Vector<ulong> data1, Vector<ulong> data2, Vector<ulong> data3, Vector<ulong> data4)); // svst4_vnum[_u64]
  ///   Total Rejected: 62

  /// Total ACLE covered across API:      124
a74nh commented 11 months ago

This contributes to https://github.com/dotnet/runtime/issues/93095

It covers the instructions in FEAT_SVE related to stores. Note that there are further store methods in the scatter stores proposal.

This list was auto-generated from the C ACLE for SVE, and is in three parts:

  • The methods list reduced down to Vector versions. All possible variants of T are given above each method.
  • The complete list of all methods. The corresponding ACLE methods and SVE instructions are given above each method.
  • All rejected ACLE methods. These are methods we have agreed do not need to be included in C#.

Where possible, existing C# naming conventions have been matched.

Many of the C functions take predicate argument(s) of type svbool_t as the first argument. These are missing from the C# methods; it is expected that the JIT will create predicates where required, or combine them with uses of ConditionalSelect(). For more discussion see the comments on https://github.com/dotnet/runtime/issues/88140.
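
To make the ConditionalSelect expectation concrete: the idea discussed in that issue is that the JIT recognizes a select wrapped around an unpredicated intrinsic and folds the pair into a single predicated instruction. A minimal sketch, using Sve.Abs purely as a stand-in for any value-producing operation (the folding is an expectation placed on the JIT, not a shipped guarantee):

```csharp
using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static Vector<int> MaskedAbs(Vector<int> mask, Vector<int> value)
{
    // If the JIT recognizes this shape, it can emit one predicated ABS
    // (keeping 'value' in inactive lanes) instead of ABS plus a select.
    return Vector.ConditionalSelect(mask, Sve.Abs(value), value);
}
```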

tannergooding commented 11 months ago

Unlike the Load APIs, we don't need to differentiate by return type here (we typically just use Store, unlike Load, where it is LoadVector).

However, StoreVectorTruncate32 isn't necessarily clear on the semantics. We also notably opted for the terminology Narrow/Narrowing in AdvSimd. So we probably want to give some consideration to whether we can make it clear that this effectively does ExtractNarrowingLower followed by a Store.
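
For reference, that two-step shape already exists with fixed-width vectors in AdvSimd. A small illustration of the precedent, using only APIs AdvSimd already exposes (shown as a sketch, not part of this proposal):

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

static unsafe void NarrowThenStore(short* address, Vector128<int> data)
{
    // Truncate each 32-bit lane to 16 bits, then store the four shorts
    // contiguously. The proposed SVE narrowing stores fuse these two steps
    // into a single ST1H on the .S lanes.
    Vector64<short> narrowed = AdvSimd.ExtractNarrowingLower(data);
    AdvSimd.Store(address, narrowed);
}
```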


a74nh commented 11 months ago

However, StoreVectorTruncate32 isn't necessarily clear on the semantic. We also notably opted for the terminology Narrow/Narrowing in AdvSimd. So we probably want to give some consideration on whether we can make it clear that this effectively does ExtractNarrowingLower followed by a Store.

Updated to StoreInt32NarrowToInt16() etc.

As far as I can tell, StoreInt32NarrowToInt16() and StoreUInt32NarrowToInt16() are identical in operation. Maybe one can be dropped and the other renamed?

Also, I feel that the type at the end of the name (the destination type) doesn't make it clear whether the destination is signed or not.

tannergooding commented 11 months ago

As far as I can tell, StoreInt32NarrowToInt16 and StoreUInt32NarrowToInt16() are identical in operation. Maybe it can be renamed and one dropped?

That seems fine. We only need to differentiate by behavior or return type. If the behavior is identical and the delimiter would only be the input type (which can simply be an overload), then dropping the delimiter is fine.

Also, I feel that the type at the end (the destination type) isn't clear as to whether the destination type is signed or not.

We'd use Int32 for signed destination and UInt32 for unsigned destination.

a74nh commented 11 months ago

All stores are predicated ("inactive elements are not written to memory"). How did you see this working with the use of ConditionalSelect()?

tannergooding commented 11 months ago

Predicated stores/loads likely need an explicit overload that takes a mask since the operation happens to memory and the ConditionalSelect operation can't clearly represent that.

There are potentially patterns that could be recognized, given the memory ordering rules. But exposing a couple extra overloads for this important concept should be fine.

Something like MaskLoad/MaskStore or MaskedLoad/MaskedStore would make it clearer on how the semantics work than simply an overload that takes a mask.

Regular Load/Store could then be kept for simplicity and use the "all active" predicate.

a74nh commented 11 months ago

Something like MaskLoad/MaskStore or MaskedLoad/MaskedStore would make it clearer on how the semantics work than simply an overload that takes a mask.

That would be fine. Warning though, it's going to bloat the API. Across load,gather,firstfaulting,store,scatter that's 890 functions just for SVE1.

Instead of writing all of them out, I might just add a note to the top of the API block.

tannergooding commented 11 months ago

that's 890 functions just for SVE1.

This is because every Store API has a MaskedStore equivalent, so it's effectively doubling the total number of exposed APIs, right?

I wonder if we could compensate for this by avoiding the vnum overloads here... That is, as I understand it we have two forms of the instruction, where the former is just a specialized encoding of the latter when the immediate is between [-8, +7]. We're then exposing the vnum overload so users can specify the index explicitly:

ST1W (scalar plus immediate, single register)
ST1W (scalar plus scalar, single register)

Most notably, we could just expose public static unsafe void Store(T* destination, Vector<T> data); and then support the offset variants implicitly, by recognizing patterns such as Store(destination + index, data) and generating the optimal form of ST1W. This is exactly what we would do with normal scalar codegen for something like *(destination + 1) = data; or destination[i] = data, etc.

We could then expose MaskedStore(T* destination, Vector<T> mask, Vector<T> data) and do the same there. Giving us 2 overloads per load/store instruction, rather than increasing it to 4.
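
As a sketch of what that pattern recognition could look like at a call site, assuming the two-overload shape from this comment (the unpredicated Store(T* destination, Vector<T> data) overload is hypothetical; the approved API below kept only the masked form):

```csharp
using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static unsafe void Broadcast(float* destination, Vector<float> data, int n)
{
    for (int i = 0; i + Vector<float>.Count <= n; i += Vector<float>.Count)
    {
        // 'destination + i' is the pattern the JIT would recognize, selecting
        // the scalar-plus-scalar form of ST1W, or the immediate form when the
        // offset folds to a small constant multiple of the vector length.
        Sve.Store(destination + i, data); // hypothetical unpredicated overload
    }
}
```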


On another note, I see these APIs explicitly take long (System.Int64) as the index. Is SVE an Arm64 exclusive or should this be nint so that it's 32-bits on 32-bit architectures?

The way most of the ISAs are exposed today we have:

public abstract class IsaName
{
    // only true if hardware supports IsaName
    public static bool IsSupported { get; }

    // Methods supported by both Arm32 and Arm64
    public static void MethodA();

    public abstract class Arm32
    {
        // only true if hardware supports IsaName and we're 32-bit
        public static bool IsSupported { get; }

        // Methods supported by only Arm32
        public static void MethodB();
    }

    public abstract class Arm64
    {
        // only true if hardware supports IsaName and we're 64-bit
        public static bool IsSupported { get; }

        // Methods supported by only Arm64
        public static void MethodC();
    }
}

If there is zero chance of SVE ever being supported on Arm32, then we could potentially consider an exception to this normal design layout. But if that (or some other size) might be a consideration in the future, we may want to slightly tweak it to take nint/nuint or section off the 64-bit only APIs.

a74nh commented 11 months ago

that's 890 functions just for SVE1.

This is because every Store API has a MaskedStore equivalent, so its effectively doubling the total number of exposed APIs, right?

Yup. I just totalled what we had already.

I wonder if we could compensate for this by avoiding the vnum overloads here...

We could then expose MaskedStore(T* destination, Vector<T> mask, Vector<T> data) and do the same there. Giving us 2 overloads per load/store instruction, rather than increasing it to 4.

That should work. I'm assuming the pattern recognition is generic enough that it can be used mostly as-is without much refactoring?

Alternatively, the user can use the generic Vector<T>.Store() to store an SVE vector? These will always be unmasked, so for the Sve class we could expose only the masked versions of the stores. If the user doesn't care about masks, they would call Vector<T>.Store().

Happy with either.
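
The two styles under discussion, side by side; Vector<T>.Store refers to the generic System.Numerics extension mentioned above, while the predicated overload is the one proposed in this issue:

```csharp
using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static unsafe void TwoStyles(float* destination, Vector<float> mask, Vector<float> data)
{
    data.Store(destination);            // generic, always unmasked
    Sve.Store(mask, destination, data); // SVE-specific, predicated per element
}
```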

On another note, I see these APIs explicitly take long (System.Int64) as the index. Is SVE an Arm64 exclusive or should this be nint so that it's 32-bits on 32-bit architectures?

If there is zero chance of SVE ever being supported on Arm32, then we could potentially consider an exception to this normal design layout. But if that (or some other size) might be a consideration in the future, we may want to slightly tweak it to take nint/nuint or section off the 64-bit only APIs.

SVE is Arm64 only. It would take far too much silicon/power to be viable for 32-bit. The closest thing for Arm32 is MVE (aka Helium) on M-class cores.

tannergooding commented 11 months ago

Alternatively, the user can use the generic Vector<T>.Store() to store an SVE vector? These will always be unmasked, so for the Sve class we could expose only the masked versions of the stores. If the user doesn't care about masks, they would call Vector<T>.Store().

That sounds reasonable to me as well. They already have a "convenience" API in the form of x.Store(destination), so only having the predicated version exposed under Sve is more verbose to use, but it ultimately achieves the same thing, and is really only needed if they need masking.

SVE is Arm64 only. It would take far too much silicon/power to be viable for 32bit. Closest there is for Arm32 is MVE aka Helium on M class.

👍. We'll probably end up discussing this a bit in API review, and whether we want to be "consistent" and have it only under Sve.Arm64, or if we're fine saying "this is special enough and it's suitable to be exposed as simply Sve". I'll push for the latter, because most of these have zero real consideration between 32-bit and 64-bit anyway.

a74nh commented 11 months ago

Updated:

a74nh commented 11 months ago

Also updated the scatter, load, gather, and first-faulting APIs in the same way.

a74nh commented 8 months ago

For this API, and all the other SVE1 APIs that haven't been reviewed yet, I've updated the API proposals.

These updates take into account the changes made in the other APIs; my updated scripts have applied them automatically to the entries in these proposals. E.g., any addressing mode with a long vnum argument is rejected.

bartonjs commented 8 months ago

Video

namespace System.Runtime.Intrinsics.Arm;

/// VectorT Summary
public abstract partial class Sve : AdvSimd /// Feature: FEAT_SVE  Category: stores
{

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Store(Vector<T> mask, T* address, Vector<T> data); // ST1W or ST1D or ST1B or ST1H

  /// T: [short, sbyte], [int, short], [int, sbyte], [long, short], [long, int], [long, sbyte]
  /// T: [ushort, byte], [uint, ushort], [uint, byte], [ulong, ushort], [ulong, uint], [ulong, byte]
  public static unsafe void StoreNarrowing(Vector<T> mask, T2* address, Vector<T> data); // ST1B or ST1H or ST1W

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void StoreNonTemporal(Vector<T> mask, T* address, Vector<T> data); // STNT1W or STNT1D or STNT1B or STNT1H

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2) data); // ST2W or ST2D or ST2B or ST2H

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2, Vector<T> Value3) data); // ST3W or ST3D or ST3B or ST3H

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2, Vector<T> Value3, Vector<T> Value4) data); // ST4W or ST4D or ST4B or ST4H

  /// total method signatures: 17
}
a74nh commented 4 months ago

@tannergooding

  /// T: float, double, sbyte, short, int, long, byte, ushort, uint, ulong
  public static unsafe void Store(Vector<T> mask, T* address, (Vector<T> Value1, Vector<T> Value2) data); // ST2W or ST2D or ST2B or ST2H
  • The int version of this uses ST2W, which will store only the first 32 bits from each vector to memory.
  • The 3-arg version then uses ST3W, which stores the first 32 bits from each vector.
  • etc.

That means for the one-arg version, e.g.:

  public static unsafe void Store(Vector<T> mask, T* address, Vector<T> data); // ST1W or ST1D or ST1B or ST1H

This will only store the first 32 bits from the input vector.

How does the user store a full vector to memory (using STR)?

Suggestion:

  • Rename all the Store() methods to StoreFirstElement()
  • Add a new Store(Vector<T> mask, T* address, Vector<T> value) which uses STR

cc: @SwapnilGaikwad

a74nh commented 4 months ago

Ignore the previous comment, it was wrong. ST2W stores all values from the input vectors to memory, interleaving them in word-sized amounts.
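
A scalar model of that corrected understanding, for illustration only: each active element writes one word from each input vector, interleaved in memory:

```csharp
static unsafe void Storex2Model(bool[] mask, int* address, int[] v1, int[] v2)
{
    for (int i = 0; i < v1.Length; i++)
    {
        if (!mask[i]) continue;     // inactive elements are not written
        address[2 * i]     = v1[i]; // element i of the first vector
        address[2 * i + 1] = v2[i]; // element i of the second vector
    }
}
```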