Tax-Project uses AscendingNight, a CryptoNight-derived hash algorithm that is optimized for CPU-based mining and GPU resistance and aims at a fair distribution of the hashrate.
Edit: AscendingNight is similar to CryptoNight (same scratchpad implode and explode), but it uses a different main loop. Where CryptoNight performs an AES round on every second step, AscendingNight performs one AES round only every fourth step and fills the remaining steps with cheaper substitution operations, trading computation for memory intensity.
In our private testing we saw performance gains of 10% to 30% on Ryzen CPUs and up to 100% on older Intel i7s.
Since the memory utilisation is still the same, GPUs should run it at about the same speed as CryptoNight (mathematically speaking; not tested yet), so the CPU gains translate directly into a relative advantage over GPUs.
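To make the loop structure concrete, here is a minimal sketch in plain C under a simplified model of the round: the names ascendingnight_loop_sketch, aes_round and cheap_mix are illustrative placeholders rather than Tax-Project functions, and the real round logic is the intrinsics code linked and quoted further below.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define SCRATCHPAD_BYTES (1u << 21)   /* 2 MB scratchpad, as in CryptoNight */

/* Placeholder for the hardware/soft AES round (_mm_aesenc_si128 / soft_aesenc). */
static void aes_round(uint8_t block[16], const uint8_t key[16])
{
    for (int i = 0; i < 16; i++)
        block[i] = (uint8_t)(block[i] ^ key[i] ^ 0xA5);
}

/* Placeholder for the cheaper substitution/XOR steps used between AES rounds. */
static void cheap_mix(uint8_t block[16], const uint8_t other[16])
{
    for (int i = 0; i < 16; i++)
        block[i] ^= other[i];
}

/* Illustrative main loop: one AES round every fourth step instead of every
 * second, while every step still makes a data-dependent access into the
 * full 2 MB scratchpad. */
static void ascendingnight_loop_sketch(uint8_t *scratchpad,
                                       uint8_t a[16], uint8_t b[16],
                                       size_t iterations)
{
    for (size_t i = 0; i < iterations; i++) {
        uint32_t idx;
        memcpy(&idx, a, sizeof idx);                       /* address comes from the state      */
        uint8_t *line = scratchpad + (idx & (SCRATCHPAD_BYTES - 16)); /* same 0x1FFFF0 mask as below */

        if ((i & 3) == 0)
            aes_round(line, a);                            /* AES only on every fourth step     */
        else
            cheap_mix(line, b);                            /* cheaper mixing on the other steps */

        memcpy(a, line, 16);                               /* next address depends on this write */
    }
}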
The reference implementation of AscendingNight is modular, but it still uses a 2 MB scratchpad and 2^14 iterations, which keeps block validation fast.
Increasing the iteration count to the usual CryptoNight value of 2^20 would give CPUs an even bigger advantage over GPUs.
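For reference, that parameter difference boils down to a single constant; the names below are illustrative, not taken from the source tree.

#define ASC_SCRATCHPAD_BYTES (1u << 21)   /* 2 MB, unchanged from CryptoNight       */
#define ASC_ITERATIONS       (1u << 14)   /* 16,384 steps: fast block validation     */
#define CN_ITERATIONS        (1u << 20)   /* 1,048,576 steps: classic CryptoNight    */

/* CN_ITERATIONS / ASC_ITERATIONS == 64, so the reference parameters make a
 * single hash roughly 64x cheaper to verify; raising the count back to 2^20
 * would lengthen the memory-bound main loop and favour CPUs even more. */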
The important changes were made in the post_aes macro; they can be found here: https://github.com/Tax-Project/Tax/blob/master/src/crypto/slow-hash.c#L264
Links:
Algo: https://github.com/Tax-Project/Tax/blob/master/src/crypto/slow-hash.c
http://clashproject.org/tax/
https://github.com/Tax-Project
https://github.com/Tax-Project/Miner-UI
Algo:
cryptonight_single_hash
if (ALGO == xmrig::CRYPTONIGHT_ASC) {
    // Load the selected scratchpad line twice: c0x gets the AES round, c0xx stays plain.
    ptr0 = (__m128i *)&l0[idx0 & 0x1FFFF0];
    c0x  = _mm_load_si128(ptr0);
    c0xx = _mm_load_si128(ptr0);

    if (SOFT_AES) {
        c0x = soft_aesenc(c0x, ax0);
    } else {
        c0x = _mm_aesenc_si128(c0x, ax0);
    }

    _mm_store_si128((__m128i *)ptr0, c0x);

    // The AES result selects the next line, where the plain copy is stored.
    idx0 = _mm_cvtsi128_si64(c0x);
    ptr0 = (__m128i *)&l0[idx0 & 0x1FFFF0];

    if (PREFETCH) {
        _mm_prefetch((const char *)ptr0, _MM_HINT_T0);
    }

    _mm_store_si128((__m128i *)ptr0, c0xx);

    // XOR the two copies and use the result to select and overwrite a third line.
    c0xx = _mm_xor_si128(c0xx, c0x);
    idx0 = _mm_cvtsi128_si64(c0xx);
    ptr0 = (__m128i *)&l0[idx0 & 0x1FFFF0];
    _mm_store_si128((__m128i *)ptr0, c0xx);

    // Split the accumulator ax0 and the current line into 64-bit halves
    // and swap the accumulator into the scratchpad.
    uint64_t c0l, c0h;
    uint64_t al0 = _mm_cvtsi128_si64(ax0);
    uint64_t ah0 = ((uint64_t *)&ax0)[1];

    c0l = ((uint64_t *)ptr0)[0];
    c0h = ((uint64_t *)ptr0)[1];

    ((uint64_t *)ptr0)[0] = al0;
    // ... continues as in the double-hash variant below.
cryptonight_double_hash
if (ALGO == xmrig::CRYPTONIGHT_ASC) {
    // Same round as the single-hash variant, interleaved for two hashes (lanes 0 and 1).
    ptr0 = (__m128i *)&l0[idx0 & 0x1FFFF0];
    ptr1 = (__m128i *)&l1[idx1 & 0x1FFFF0];
    c0x  = _mm_load_si128(ptr0);
    c1x  = _mm_load_si128(ptr1);
    c0xx = _mm_load_si128(ptr0);
    c1xx = _mm_load_si128(ptr1);

    if (SOFT_AES) {
        c0x = soft_aesenc(c0x, ax0);
        c1x = soft_aesenc(c1x, ax1);
    } else {
        c0x = _mm_aesenc_si128(c0x, ax0);
        c1x = _mm_aesenc_si128(c1x, ax1);
    }

    _mm_store_si128((__m128i *)ptr0, c0x);
    _mm_store_si128((__m128i *)ptr1, c1x);

    idx0 = _mm_cvtsi128_si64(c0x);
    idx1 = _mm_cvtsi128_si64(c1x);
    ptr0 = (__m128i *)&l0[idx0 & 0x1FFFF0];
    ptr1 = (__m128i *)&l1[idx1 & 0x1FFFF0];

    if (PREFETCH) {
        _mm_prefetch((const char *)ptr0, _MM_HINT_T0);
        _mm_prefetch((const char *)ptr1, _MM_HINT_T0);
    }

    _mm_store_si128((__m128i *)ptr0, c0xx);
    _mm_store_si128((__m128i *)ptr1, c1xx);

    c0xx = _mm_xor_si128(c0xx, c0x);
    c1xx = _mm_xor_si128(c1xx, c1x);
    idx0 = _mm_cvtsi128_si64(c0xx);
    idx1 = _mm_cvtsi128_si64(c1xx);
    ptr0 = (__m128i *)&l0[idx0 & 0x1FFFF0];
    ptr1 = (__m128i *)&l1[idx1 & 0x1FFFF0];
    _mm_store_si128((__m128i *)ptr0, c0xx);
    _mm_store_si128((__m128i *)ptr1, c1xx);

    uint64_t c0l, c0h;
    uint64_t c1l, c1h;
    uint64_t al1 = _mm_cvtsi128_si64(ax1);
    uint64_t al0 = _mm_cvtsi128_si64(ax0);
    uint64_t ah0 = ((uint64_t *)&ax0)[1];
    uint64_t ah1 = ((uint64_t *)&ax1)[1];

    c0l = ((uint64_t *)ptr0)[0];
    c1l = ((uint64_t *)ptr1)[0];
    c0h = ((uint64_t *)ptr0)[1];
    c1h = ((uint64_t *)ptr1)[1];

    // Swap the accumulators into the scratchpad ...
    ((uint64_t *)ptr0)[0] = al0;
    ((uint64_t *)ptr1)[0] = al1;

    if (PREFETCH) {
        _mm_prefetch((const char *)ptr0, _MM_HINT_T0);
        _mm_prefetch((const char *)ptr1, _MM_HINT_T0);
    }

    ((uint64_t *)ptr0)[1] = ah0;
    ((uint64_t *)ptr1)[1] = ah1;

    // ... then fold the old line values back into the accumulators and derive the next indices.
    al0 ^= c0l;
    al1 ^= c1l;
    ah0 ^= c0h;
    ah1 ^= c1h;

    ax0 = _mm_set_epi64x(ah0, al0);
    ax1 = _mm_set_epi64x(ah1, al1);

    idx0 = al0;
    idx1 = al1;
}