Open jjjkkkjjj opened 4 years ago
Unfortunately, Matft is 5 times slower than numpy...
Matft;
do{
let a = Matft.arange(start: -10*10*10*10*10*5, to: 10*10*10*10*10*5, by: 1, shape: [10,10,10,10,10,10])
self.measure {
let _ = a[a>0]
}
/*
'-[PerformanceTests.IndexingPefTests testPeformanceBooleanIndexing1]' measured [Time, seconds] average: 0.007, relative standard deviation: 17.050%, values: [0.010224, 0.007128, 0.006454, 0.007535, 0.006929, 0.006481, 0.006221, 0.006312, 0.006142, 0.006018]
7ms
*/
}
Numpy;
import numpy as np
#import timeit
a = np.arange(-10**6/2,10**6/2).reshape((10,10,10,10,10,10))
#timeit.timeit("b+c", repeat=10, globals=globals())
%timeit -n 10 a[a>0]
1.36 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
I think this parts cause above slowness
Should use boolean
as storedType?
The commit 394d3a4 is invalid. These function must be declared outside the function like this;
public static func clip<T: MfTypable>(_ mfarray: MfArray, min: T? = nil, max: T? = nil) -> MfArray{
switch mfarray.storedType {
case .Float:
return _clip(vDSP_vclipc)
case .Double:
return _clip(vDSP_vclipcD)
}
}
fileprivate func _clip<T: MfStorable>(_ vDSP_func: vDSP_clip_func<T>) -> MfArray{
let min = min == nil ? -T.infinity : T.from(min!)
let max = max == nil ? T.infinity : T.from(max!)
return clip_by_vDSP(mfarray, min, max, vDSP_func)
}
instead of
public static func clip<T: MfTypable>(_ mfarray: MfArray, min: T? = nil, max: T? = nil) -> MfArray{
func _clip<T: MfStorable>(_ vDSP_func: vDSP_clip_func<T>) -> MfArray{
let min = min == nil ? -T.infinity : T.from(min!)
let max = max == nil ? T.infinity : T.from(max!)
return clip_by_vDSP(mfarray, min, max, vDSP_func)
}
switch mfarray.storedType {
case .Float:
return _clip(vDSP_vclipc)
case .Double:
return _clip(vDSP_vclipcD)
}
}
Use this commit‘s function and stored bool.
bool -> UInt8(cast only) -> Float(conversion) Float or Double -> UInt8(toBool_by_vDSP and conversion) -> bool(cast only)
However, vDSP can handle Floating point types only, not including UInt8. So, the last resort, extend vDSP for UInt8 such like https://forums.swift.org/t/vector-extensions-with-swift/37777/6
https://gain-performance.com/ume/ may be useful Call c++ from Swift? https://mike-neck.hatenadiary.com/entry/2018/12/02/080000
stride simd sample;
// GATHERU
UME_FORCE_INLINE SIMDVec_u & gatheru(uint64_t const * baseAddr, uint64_t stride) {
#if defined (__AVX512DQ__)
__m512i t0 = _mm512_set1_epi64(stride);
__m512i t1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
__m512i t2 = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
__m512i t3 = _mm512_mullo_epi64(t0, t1);
__m512i t4 = _mm512_mullo_epi64(t0, t2);
#else
__m512i t3 = _mm512_setr_epi64(0, stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride);
__m512i t4 = _mm512_setr_epi64(8*stride, 9*stride, 10*stride, 11*stride, 12*stride, 13*stride, 14*stride, 15*stride);
#endif
#if defined(WA_GCC_INTR_SUPPORT_6_2)
// g++ has some interface issues.
mVec[0] = _mm512_i64gather_epi64(t3, (const long long int*)baseAddr, 8);
mVec[1] = _mm512_i64gather_epi64(t4, (const long long int*)baseAddr, 8);
#else
mVec[0] = _mm512_i64gather_epi64(t3, (int64_t const*)baseAddr, 8);
mVec[1] = _mm512_i64gather_epi64(t4, (int64_t const*)baseAddr, 8);
#endif
return *this;
}
// MGATHERU
UME_FORCE_INLINE SIMDVec_u & gatheru(SIMDVecMask<16> const & mask, uint64_t const * baseAddr, uint64_t stride) {
#if defined (__AVX512DQ__)
__m512i t0 = _mm512_set1_epi64(stride);
__m512i t1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
__m512i t2 = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
__m512i t3 = _mm512_mullo_epi64(t0, t1);
__m512i t4 = _mm512_mullo_epi64(t0, t2);
#else
__m512i t3 = _mm512_setr_epi64(0, stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride);
__m512i t4 = _mm512_setr_epi64(8*stride, 9*stride, 10*stride, 11*stride, 12*stride, 13*stride, 14*stride, 15*stride);
#endif
__mmask8 m0 = mask.mMask & 0x00FF;
__mmask8 m1 = (mask.mMask & 0xFF00) >> 8;
#if defined(WA_GCC_INTR_SUPPORT_6_2)
// g++ has some interface issues.
__m512i t5 = _mm512_i64gather_epi64(t3, (const long long int*)baseAddr, 8);
__m512i t6 = _mm512_i64gather_epi64(t4, (const long long int*)baseAddr, 8);
#else
__m512i t5 = _mm512_i64gather_epi64(t3, (int64_t const*)baseAddr, 8);
__m512i t6 = _mm512_i64gather_epi64(t4, (int64_t const*)baseAddr, 8);
#endif
mVec[0] = _mm512_mask_mov_epi64(mVec[0], m0, t5);
mVec[1] = _mm512_mask_mov_epi64(mVec[1], m1, t6);
return *this;
}
Try ‘withMemoryRebound’ in evdsp_sign like this
Bottleneck is the comparison operators. Use a specific vDSP function for comparison such that positive is true, negative is false
How about following calculation?
vDSP_cmprs
with source as 1 vectors and gating vector as subtracted ones I may find the solution... I'll try it!!
https://developer.apple.com/forums/thread/719117?answerId=734789022#734789022
I confirmed BNNS usage.
I can implement the comparison operators like this!
let a: [Float] = [1,2,3,4,5]
let b: [Float] = [1,-2,3,-4,5]
//var c: [Bool] = [0,0,0,0,0]
var c: [Bool] = [false,false,false,false,false]
let aDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: a, shape: .vector(a.count))
let bDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: b, shape: .vector(b.count))
let cDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: c, shape: .vector(c.count))
try! BNNS.compare(aDescriptor, bDescriptor, using: .equal, output: cDescriptor)
var ret = cDescriptor.makeArray(of: UInt8.self)!
var retF: [Float] = [0,0,0,0,0]
ret.withUnsafeMutableBufferPointer{
retptr in
retF.withUnsafeMutableBufferPointer{
retFptr in
vDSP_vfltu8(retptr.baseAddress!, vDSP_Stride(1), retFptr.baseAddress!, vDSP_Stride(1), vDSP_Length(5))
}
}
print(retF) // [1.0, 0.0, 1.0, 0.0, 1.0]
3 times faster! 17.8ms -> 6ms However BNNS doesn't support Double
Regarding #17
Official boolean indexing code is https://github.com/numpy/numpy/blob/cf1306a842d7b1064270bd06951a485121e60816/numpy/core/src/multiarray/mapping.c#L1010
SIMD function is https://github.com/numpy/numpy/blob/45bc13e6d922690eea43b9d807d476e0f243f836/numpy/core/src/umath/loops_comparison.dispatch.c.src#L36