jjjkkkjjj / Matft

Numpy-like library in swift. (Multi-dimensional Array, ndarray, matrix and vector library)
BSD 3-Clause "New" or "Revised" License
134 stars 21 forks source link

Boolean Indexing is slow #18

Open jjjkkkjjj opened 4 years ago

jjjkkkjjj commented 4 years ago

Regarding #17

Official boolean indexing code is https://github.com/numpy/numpy/blob/cf1306a842d7b1064270bd06951a485121e60816/numpy/core/src/multiarray/mapping.c#L1010

SIMD function is https://github.com/numpy/numpy/blob/45bc13e6d922690eea43b9d807d476e0f243f836/numpy/core/src/umath/loops_comparison.dispatch.c.src#L36

jjjkkkjjj commented 4 years ago

Unfortunately, Matft is 5 times slower than numpy...

Matft;

do{
            let a = Matft.arange(start: -10*10*10*10*10*5, to: 10*10*10*10*10*5, by: 1, shape: [10,10,10,10,10,10])

            self.measure {
                let _ = a[a>0]
            }
            /*
             '-[PerformanceTests.IndexingPefTests testPeformanceBooleanIndexing1]' measured [Time, seconds] average: 0.007, relative standard deviation: 17.050%, values: [0.010224, 0.007128, 0.006454, 0.007535, 0.006929, 0.006481, 0.006221, 0.006312, 0.006142, 0.006018]
            7ms
             */
        }

Numpy;

import numpy as np
#import timeit

a = np.arange(-10**6/2,10**6/2).reshape((10,10,10,10,10,10))

#timeit.timeit("b+c", repeat=10, globals=globals())
%timeit -n 10 a[a>0]
1.36 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
jjjkkkjjj commented 3 years ago

I think this parts cause above slowness

jjjkkkjjj commented 3 years ago

Gather may be useful Compress may be useful too

jjjkkkjjj commented 3 years ago

Should use boolean as storedType?

jjjkkkjjj commented 3 years ago

The commit 394d3a4 is invalid. These function must be declared outside the function like this;

public static func clip<T: MfTypable>(_ mfarray: MfArray, min: T? = nil, max: T? = nil) -> MfArray{

        switch mfarray.storedType {
        case .Float:
           return  _clip(vDSP_vclipc)
        case .Double:
            return _clip(vDSP_vclipcD)
        }
    }

fileprivate  func _clip<T: MfStorable>(_ vDSP_func: vDSP_clip_func<T>) -> MfArray{
            let min = min == nil ? -T.infinity : T.from(min!)
            let max = max == nil ? T.infinity : T.from(max!)
            return clip_by_vDSP(mfarray, min, max, vDSP_func)
        }

instead of

public static func clip<T: MfTypable>(_ mfarray: MfArray, min: T? = nil, max: T? = nil) -> MfArray{
        func _clip<T: MfStorable>(_ vDSP_func: vDSP_clip_func<T>) -> MfArray{
            let min = min == nil ? -T.infinity : T.from(min!)
            let max = max == nil ? T.infinity : T.from(max!)
            return clip_by_vDSP(mfarray, min, max, vDSP_func)
        }

        switch mfarray.storedType {
        case .Float:
           return  _clip(vDSP_vclipc)
        case .Double:
            return _clip(vDSP_vclipcD)
        }
    }
jjjkkkjjj commented 3 years ago

Use this commit‘s function and stored bool.

bool -> UInt8(cast only) -> Float(conversion) Float or Double -> UInt8(toBool_by_vDSP and conversion) -> bool(cast only)

However, vDSP can handle Floating point types only, not including UInt8. So, the last resort, extend vDSP for UInt8 such like https://forums.swift.org/t/vector-extensions-with-swift/37777/6

jjjkkkjjj commented 3 years ago

https://gain-performance.com/ume/ may be useful Call c++ from Swift? https://mike-neck.hatenadiary.com/entry/2018/12/02/080000

jjjkkkjjj commented 3 years ago

stride simd sample;

// GATHERU
        UME_FORCE_INLINE SIMDVec_u & gatheru(uint64_t const * baseAddr, uint64_t stride) {
#if defined (__AVX512DQ__)
            __m512i t0 = _mm512_set1_epi64(stride);
            __m512i t1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
            __m512i t2 = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
            __m512i t3 = _mm512_mullo_epi64(t0, t1);
            __m512i t4 = _mm512_mullo_epi64(t0, t2);
#else
            __m512i t3 = _mm512_setr_epi64(0, stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride);
            __m512i t4 = _mm512_setr_epi64(8*stride, 9*stride, 10*stride, 11*stride, 12*stride, 13*stride, 14*stride, 15*stride);
#endif
#if defined(WA_GCC_INTR_SUPPORT_6_2)
            // g++ has some interface issues.
            mVec[0] = _mm512_i64gather_epi64(t3, (const long long int*)baseAddr, 8);
            mVec[1] = _mm512_i64gather_epi64(t4, (const long long int*)baseAddr, 8);
#else
            mVec[0] = _mm512_i64gather_epi64(t3, (int64_t const*)baseAddr, 8);
            mVec[1] = _mm512_i64gather_epi64(t4, (int64_t const*)baseAddr, 8);
#endif
            return *this;
        }
        // MGATHERU
        UME_FORCE_INLINE SIMDVec_u & gatheru(SIMDVecMask<16> const & mask, uint64_t const * baseAddr, uint64_t stride) {
#if defined (__AVX512DQ__)
            __m512i t0 = _mm512_set1_epi64(stride);
            __m512i t1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
            __m512i t2 = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
            __m512i t3 = _mm512_mullo_epi64(t0, t1);
            __m512i t4 = _mm512_mullo_epi64(t0, t2);
#else
            __m512i t3 = _mm512_setr_epi64(0, stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride);
            __m512i t4 = _mm512_setr_epi64(8*stride, 9*stride, 10*stride, 11*stride, 12*stride, 13*stride, 14*stride, 15*stride);
#endif
            __mmask8 m0 = mask.mMask & 0x00FF;
            __mmask8 m1 = (mask.mMask & 0xFF00) >> 8;
#if defined(WA_GCC_INTR_SUPPORT_6_2)
            // g++ has some interface issues.
            __m512i t5 = _mm512_i64gather_epi64(t3, (const long long int*)baseAddr, 8);
            __m512i t6 = _mm512_i64gather_epi64(t4, (const long long int*)baseAddr, 8);
#else
            __m512i t5 = _mm512_i64gather_epi64(t3, (int64_t const*)baseAddr, 8);
            __m512i t6 = _mm512_i64gather_epi64(t4, (int64_t const*)baseAddr, 8);
#endif
            mVec[0] = _mm512_mask_mov_epi64(mVec[0], m0, t5);
            mVec[1] = _mm512_mask_mov_epi64(mVec[1], m1, t6);
            return *this;
        }
jjjkkkjjj commented 2 years ago

Try ‘withMemoryRebound’ in evdsp_sign like this

jjjkkkjjj commented 1 year ago

Bottleneck is the comparison operators. Use a specific vDSP function for comparison such that positive is true, negative is false

jjjkkkjjj commented 11 months ago

May use this function Then, add Boolean into StoredType

jjjkkkjjj commented 8 months ago

How about following calculation?

  1. Subtract two arrays
  2. And then, apply vDSP_cmprs with source as 1 vectors and gating vector as subtracted ones
jjjkkkjjj commented 8 months ago

I may find the solution... I'll try it!!

https://developer.apple.com/forums/thread/719117?answerId=734789022#734789022

jjjkkkjjj commented 8 months ago

I confirmed BNNS usage.

I can implement the comparison operators like this!

let a: [Float] = [1,2,3,4,5]
let b: [Float] = [1,-2,3,-4,5]
//var c: [Bool] = [0,0,0,0,0]
var c: [Bool] = [false,false,false,false,false]

let aDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: a, shape: .vector(a.count))
let bDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: b, shape: .vector(b.count))
let cDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: c, shape: .vector(c.count))

try! BNNS.compare(aDescriptor, bDescriptor, using: .equal, output: cDescriptor)

var ret = cDescriptor.makeArray(of: UInt8.self)!

var retF: [Float] = [0,0,0,0,0]

ret.withUnsafeMutableBufferPointer{
    retptr in
    retF.withUnsafeMutableBufferPointer{
        retFptr in
        vDSP_vfltu8(retptr.baseAddress!, vDSP_Stride(1), retFptr.baseAddress!, vDSP_Stride(1), vDSP_Length(5))
    }
}

print(retF) // [1.0, 0.0, 1.0, 0.0, 1.0]
jjjkkkjjj commented 8 months ago

3 times faster! 17.8ms -> 6ms However BNNS doesn't support Double