sgminer-dev / sgminer

Scrypt GPU miner
GNU General Public License v3.0
632 stars 826 forks source link

[x15] Proposal to add "shabal" & "whirlpool" hashing functions to a .cl file for GPU mining #290

Closed platinum4 closed 10 years ago

platinum4 commented 10 years ago

This weekend we will see the drop of two x15 coins, which appears to be x13+shabal+whirlpool, according to this github repo https://github.com/bitblockproject/Bitblock-CPUminer/blob/master/bitblock.c

The "new" added hashing functions found here: https://github.com/bitblockproject/Bitblock-CPUminer/blob/master/shabal.c https://github.com/bitblockproject/Bitblock-CPUminer/blob/master/whirlpool.c

Currently only the CPU miner is available, of course, this was the case with x13 about 6-8 weeks ago...

platinum4 commented 10 years ago

This also requires bitblock.cl, which is found below:

/*
 * bitblock.cl — x15 chained hash (x13 + shabal + whirlpool) preamble.
 *
 * NOTE(review): the pasted copy lost every leading '#' of the preprocessor
 * directives and the '__'/'*' characters (markdown bold/italic stripping).
 * Restored below; the matching #endif for BITBLOCK_CL is at end of file.
 */
#ifndef BITBLOCK_CL
#define BITBLOCK_CL

#define DEBUG(x)

/* Pick SPH endianness from the OpenCL compiler's built-in macro. */
#if __ENDIAN_LITTLE__
  #define SPH_LITTLE_ENDIAN 1
#else
  #define SPH_BIG_ENDIAN 1
#endif

#define SPH_UPTR sph_u64

typedef unsigned int sph_u32;
typedef int sph_s32;

/* Host compile (no __OPENCL_VERSION__) uses 'long long'; device 'long' is 64-bit. */
#ifndef __OPENCL_VERSION__
typedef unsigned long long sph_u64;
typedef long long sph_s64;
#else
typedef unsigned long sph_u64;
typedef long sph_s64;
#endif

#define SPH_64 1
#define SPH_64_TRUE 1

#define SPH_C32(x)       ((sph_u32)(x ## U))
#define SPH_T32(x)       (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))

#define SPH_C64(x)       ((sph_u64)(x ## UL))
#define SPH_T64(x)       (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))

/* Per-algorithm configuration knobs consumed by the included .cl kernels. */
#define SPH_ECHO_64 1
#define SPH_KECCAK_64 1
#define SPH_JH_64 1
#define SPH_SIMD_NOCOPY 0
#define SPH_KECCAK_NOCOPY 0
#define SPH_COMPACT_BLAKE_64 0
#define SPH_LUFFA_PARALLEL 0
#define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 0
#if !defined SPH_HAMSI_EXPAND_BIG
  #define SPH_HAMSI_EXPAND_BIG 4
#endif

#include "blake.cl"
#include "bmw.cl"
#include "groestl.cl"
#include "jh.cl"
#include "keccak.cl"
#include "skein.cl"
#include "luffa.cl"
#include "cubehash.cl"
#include "shavite.cl"
#include "simd.cl"
#include "echo.cl"
#include "hamsi.cl"
#include "fugue.cl"
#include "shabal.cl"
#include "whirlpool.cl"

/* Byte-swap helpers (endianness conversion via component shuffles). */
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)

/* Load helpers: DEC64E = native 64-bit load, DEC64BE = big-endian 64-bit load,
 * DEC32LE = little-endian 32-bit load. The '*' dereferences were eaten by
 * markdown in the pasted copy; restored here. */
#if SPH_BIG_ENDIAN
  #define DEC64E(x) (x)
  #define DEC64BE(x) (*(const __global sph_u64 *) (x))
  #define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x))
#else
  #define DEC64E(x) SWAP8(x)
  #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x))
  #define DEC32LE(x) (*(const __global sph_u32 *) (x))
#endif

#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))

/* Shared subexpression of the BMW message-expansion (expand2 step). */
#define CONST_EXP2  q[i+0] + SPH_ROTL64(q[i+1], 5)  + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
                    q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
                    q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
                    q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])

/* 512-bit intermediate hash, viewable as bytes, u32 words or u64 words. */
typedef union {
  unsigned char h1[64];
  uint h4[16];
  ulong h8[8];
} hash_t;

/* Stage 0: blake512 over the 80-byte block header; nonce = gid injected into M9. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // blake — IV for BLAKE-512 plus counter preloaded for an 80-byte message
  sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
  sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
  sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
  sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179);
  sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0;
  sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;

  if ((T0 = SPH_T64(T0 + 1024)) < 1024)
    T1 = SPH_T64(T1 + 1);

  sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
  sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
  sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
  sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
  M0 = DEC64BE(block + 0);
  M1 = DEC64BE(block + 8);
  M2 = DEC64BE(block + 16);
  M3 = DEC64BE(block + 24);
  M4 = DEC64BE(block + 32);
  M5 = DEC64BE(block + 40);
  M6 = DEC64BE(block + 48);
  M7 = DEC64BE(block + 56);
  M8 = DEC64BE(block + 64);
  M9 = DEC64BE(block + 72);
  M9 &= 0xFFFFFFFF00000000;          // clear the nonce field...
  M9 ^= SWAP4(gid);                  // ...and substitute this work-item's nonce
  MA = 0x8000000000000000;           // padding for an 80-byte message
  MB = 0;
  MC = 0;
  MD = 1;
  ME = 0;
  MF = 0x280;                        // 640-bit message length

  COMPRESS64;

  hash->h8[0] = H0; hash->h8[1] = H1; hash->h8[2] = H2; hash->h8[3] = H3;
  hash->h8[4] = H4; hash->h8[5] = H5; hash->h8[6] = H6; hash->h8[7] = H7;

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 1: BMW-512 over the 64-byte blake output (message block + final block). */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search1(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // bmw
  // NOTE(review): this declaration had been swallowed into the comment in the
  // pasted copy; restored.
  sph_u64 BMW_H[16];

#pragma unroll 16
  for (unsigned u = 0; u < 16; u++)
    BMW_H[u] = BMW_IV512[u];

  sph_u64 mv[16], q[32];
  sph_u64 tmp;

  mv[0] = SWAP8(hash->h8[0]); mv[1] = SWAP8(hash->h8[1]);
  mv[2] = SWAP8(hash->h8[2]); mv[3] = SWAP8(hash->h8[3]);
  mv[4] = SWAP8(hash->h8[4]); mv[5] = SWAP8(hash->h8[5]);
  mv[6] = SWAP8(hash->h8[6]); mv[7] = SWAP8(hash->h8[7]);
  mv[8] = 0x80;                      // padding byte
  mv[9] = 0; mv[10] = 0; mv[11] = 0; mv[12] = 0; mv[13] = 0; mv[14] = 0;
  mv[15] = SPH_C64(512);             // message length in bits

  // f0: Wi quasi-group computation (first compression)
  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];

  // f1: message expansion — expand1 for q[16..17], expand2 for q[18..31]
#pragma unroll 2
  for (int i = 0; i < 2; i++) {
    q[i+16] = (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }

#pragma unroll 4
  for (int i = 2; i < 6; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }

#pragma unroll 3
  for (int i = 6; i < 9; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
  }

#pragma unroll 4
  for (int i = 9; i < 13; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

#pragma unroll 3
  for (int i = 13; i < 16; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

  // f2: folding
  sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
  sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];

  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);

  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);

  // Final compression: message = chaining value, H = CONST final constants
#pragma unroll 16
  for (int i = 0; i < 16; i++) {
    mv[i] = BMW_H[i];
    BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
  }

  tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
  q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
  tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
  q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
  tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
  q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
  tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
  q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
  q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
  q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
  tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
  q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
  tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
  q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
  tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
  q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
  tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
  q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
  q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
  tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
  q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
  tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
  q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
  tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
  q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
  tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
  q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
  tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
  q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];

#pragma unroll 2
  for (int i = 0; i < 2; i++) {
    q[i+16] = (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }

#pragma unroll 4
  for (int i = 2; i < 6; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
  }

#pragma unroll 3
  for (int i = 6; i < 9; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
  }

#pragma unroll 4
  for (int i = 9; i < 13; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

#pragma unroll 3
  for (int i = 13; i < 16; i++) {
    q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
  }

  XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
  XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
  BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
  BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
  BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
  BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
  BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
  BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
  BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
  BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);

  BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
  BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
  BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
  BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
  BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
  BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
  BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
  BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);

  // Output = upper half of the final chaining value, byte-swapped
  hash->h8[0] = SWAP8(BMW_H[8]);  hash->h8[1] = SWAP8(BMW_H[9]);
  hash->h8[2] = SWAP8(BMW_H[10]); hash->h8[3] = SWAP8(BMW_H[11]);
  hash->h8[4] = SWAP8(BMW_H[12]); hash->h8[5] = SWAP8(BMW_H[13]);
  hash->h8[6] = SWAP8(BMW_H[14]); hash->h8[7] = SWAP8(BMW_H[15]);

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 2: groestl512. Copies the S-box tables into __local memory first. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search2(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

#if !SPH_SMALL_FOOTPRINT_GROESTL
  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
  __local sph_u64 T0_C[256], T4_C[256];
#endif

  int init = get_local_id(0);
  int step = get_local_size(0);

  // Cooperative copy of the lookup tables from __constant/__global to __local
  for (int i = init; i < 256; i += step) {
    T0_C[i] = T0[i];
    T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
    T1_C[i] = T1[i];
    T2_C[i] = T2[i];
    T3_C[i] = T3[i];
    T5_C[i] = T5[i];
    T6_C[i] = T6[i];
    T7_C[i] = T7[i];
#endif
  }

  barrier(CLK_LOCAL_MEM_FENCE); // groestl

  // Redirect the table names so PERM_BIG_P/Q read the __local copies
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C

  // groestl — initial chaining value: all zero except the encoded output length
  sph_u64 H[16];

  for (unsigned int u = 0; u < 15; u++)
    H[u] = 0;

#if USE_LE
  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
  H[15] = (sph_u64)512;
#endif

  sph_u64 g[16], m[16];
  m[0] = DEC64E(hash->h8[0]);
  m[1] = DEC64E(hash->h8[1]);
  m[2] = DEC64E(hash->h8[2]);
  m[3] = DEC64E(hash->h8[3]);
  m[4] = DEC64E(hash->h8[4]);
  m[5] = DEC64E(hash->h8[5]);
  m[6] = DEC64E(hash->h8[6]);
  m[7] = DEC64E(hash->h8[7]);

  for (unsigned int u = 0; u < 16; u++)
    g[u] = m[u] ^ H[u];

  // Padding for a single 64-byte message block (0x80 ... block count = 1)
  m[8] = 0x80; g[8] = m[8] ^ H[8];
  m[9] = 0; g[9] = m[9] ^ H[9];
  m[10] = 0; g[10] = m[10] ^ H[10];
  m[11] = 0; g[11] = m[11] ^ H[11];
  m[12] = 0; g[12] = m[12] ^ H[12];
  m[13] = 0; g[13] = m[13] ^ H[13];
  m[14] = 0; g[14] = m[14] ^ H[14];
  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];

  PERM_BIG_P(g);
  PERM_BIG_Q(m);

  for (unsigned int u = 0; u < 16; u++)
    H[u] ^= g[u] ^ m[u];

  // Output transform: H ^= P(H), truncate to upper 512 bits
  sph_u64 xH[16];

  for (unsigned int u = 0; u < 16; u++)
    xH[u] = H[u];

  PERM_BIG_P(xH);

  for (unsigned int u = 0; u < 16; u++)
    H[u] ^= xH[u];

  for (unsigned int u = 0; u < 8; u++)
    hash->h8[u] = DEC64E(H[u + 8]);

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 3: skein512 — one message UBI block (tweak 480/64) + final UBI block. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // skein — Skein-512-512 IV
  sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03),
          h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1),
          h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4),
          h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33);
  sph_u64 m0, m1, m2, m3, m4, m5, m6, m7;
  sph_u64 bcount = 0;

  m0 = SWAP8(hash->h8[0]);
  m1 = SWAP8(hash->h8[1]);
  m2 = SWAP8(hash->h8[2]);
  m3 = SWAP8(hash->h8[3]);
  m4 = SWAP8(hash->h8[4]);
  m5 = SWAP8(hash->h8[5]);
  m6 = SWAP8(hash->h8[6]);
  m7 = SWAP8(hash->h8[7]);
  UBI_BIG(480, 64);                  // message block (first+final flags, 64 bytes)
  bcount = 0;
  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
  UBI_BIG(510, 8);                   // output block
  hash->h8[0] = SWAP8(h0);
  hash->h8[1] = SWAP8(h1);
  hash->h8[2] = SWAP8(h2);
  hash->h8[3] = SWAP8(h3);
  hash->h8[4] = SWAP8(h4);
  hash->h8[5] = SWAP8(h5);
  hash->h8[6] = SWAP8(h6);
  hash->h8[7] = SWAP8(h7);

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 4: jh512 — two E8 rounds: message block, then padding block. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search4(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // jh — JH-512 IV
  sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543),
          h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361),
          h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80),
          h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7);
  sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a),
          h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199),
          h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156),
          h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b);
  sph_u64 tmp;

  for (int i = 0; i < 2; i++) {
    if (i == 0) {
      // Round 1: absorb the 64-byte input into the lower half of the state
      h0h ^= DEC64E(hash->h8[0]);
      h0l ^= DEC64E(hash->h8[1]);
      h1h ^= DEC64E(hash->h8[2]);
      h1l ^= DEC64E(hash->h8[3]);
      h2h ^= DEC64E(hash->h8[4]);
      h2l ^= DEC64E(hash->h8[5]);
      h3h ^= DEC64E(hash->h8[6]);
      h3l ^= DEC64E(hash->h8[7]);
    } else if (i == 1) {
      // Round 2: XOR the message into the upper half, then absorb padding
      h4h ^= DEC64E(hash->h8[0]);
      h4l ^= DEC64E(hash->h8[1]);
      h5h ^= DEC64E(hash->h8[2]);
      h5l ^= DEC64E(hash->h8[3]);
      h6h ^= DEC64E(hash->h8[4]);
      h6l ^= DEC64E(hash->h8[5]);
      h7h ^= DEC64E(hash->h8[6]);
      h7l ^= DEC64E(hash->h8[7]);

      h0h ^= 0x80;
      h3l ^= 0x2000000000000;
    }
    E8;
  }
  h4h ^= 0x80;                       // fold padding into the output half
  h7l ^= 0x2000000000000;

  hash->h8[0] = DEC64E(h4h);
  hash->h8[1] = DEC64E(h4l);
  hash->h8[2] = DEC64E(h5h);
  hash->h8[3] = DEC64E(h5l);
  hash->h8[4] = DEC64E(h6h);
  hash->h8[5] = DEC64E(h6l);
  hash->h8[6] = DEC64E(h7h);
  hash->h8[7] = DEC64E(h7l);

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 5: keccak512 — single permutation with lane-complement optimization. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search5(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // keccak
  sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0;
  sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0;
  sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0;
  sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0;
  sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0;

  // Lane complement: these lanes are kept bitwise-inverted during the rounds
  a10 = SPH_C64(0xFFFFFFFFFFFFFFFF);
  a20 = SPH_C64(0xFFFFFFFFFFFFFFFF);
  a31 = SPH_C64(0xFFFFFFFFFFFFFFFF);
  a22 = SPH_C64(0xFFFFFFFFFFFFFFFF);
  a23 = SPH_C64(0xFFFFFFFFFFFFFFFF);
  a04 = SPH_C64(0xFFFFFFFFFFFFFFFF);

  a00 ^= SWAP8(hash->h8[0]);
  a10 ^= SWAP8(hash->h8[1]);
  a20 ^= SWAP8(hash->h8[2]);
  a30 ^= SWAP8(hash->h8[3]);
  a40 ^= SWAP8(hash->h8[4]);
  a01 ^= SWAP8(hash->h8[5]);
  a11 ^= SWAP8(hash->h8[6]);
  a21 ^= SWAP8(hash->h8[7]);
  a31 ^= 0x8000000000000001;         // padding for a 64-byte input at rate 72
  KECCAK_F_1600;
  // Finalize the "lane complement"
  // NOTE(review): these two statements had been swallowed into the comment in
  // the pasted copy; restored.
  a10 = ~a10;
  a20 = ~a20;

  hash->h8[0] = SWAP8(a00);
  hash->h8[1] = SWAP8(a10);
  hash->h8[2] = SWAP8(a20);
  hash->h8[3] = SWAP8(a30);
  hash->h8[4] = SWAP8(a40);
  hash->h8[5] = SWAP8(a01);
  hash->h8[6] = SWAP8(a11);
  hash->h8[7] = SWAP8(a21);

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 6: luffa512 — 5 iterations: 2 message blocks, padding, blank, output. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search6(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // luffa — the five 256-bit IV chains V0..V4
  sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465),
          V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb);
  sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3),
          V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581);
  sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05),
          V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7);
  sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67),
          V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce);
  sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363),
          V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea);

  DECL_TMP8(M);

  // Luffa consumes 32-bit words pairwise swapped (big-endian u64 view)
  M0 = hash->h4[1];
  M1 = hash->h4[0];
  M2 = hash->h4[3];
  M3 = hash->h4[2];
  M4 = hash->h4[5];
  M5 = hash->h4[4];
  M6 = hash->h4[7];
  M7 = hash->h4[6];

  for (uint i = 0; i < 5; i++) {
    MI5;
    LUFFA_P5;

    if (i == 0) {
      // Second 32-byte half of the input
      M0 = hash->h4[9];
      M1 = hash->h4[8];
      M2 = hash->h4[11];
      M3 = hash->h4[10];
      M4 = hash->h4[13];
      M5 = hash->h4[12];
      M6 = hash->h4[15];
      M7 = hash->h4[14];
    } else if (i == 1) {
      // Padding block
      M0 = 0x80000000;
      M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
    } else if (i == 2) {
      // Blank block before the first output extraction
      M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
    } else if (i == 3) {
      // First 256 bits of output
      hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
      hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
      hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
      hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43;
      hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44;
      hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45;
      hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46;
      hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47;
    }
  }

  // Second 256 bits of output
  hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
  hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
  hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
  hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43;
  hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44;
  hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45;
  hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46;
  hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47;

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 7: cubehash512 — 13 x SIXTEEN_ROUNDS: 2 message halves, padding,
 * finalization flag, then 10 blank iterations. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search7(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

  // cubehash.h1 — CubeHash16/32-512 IV
  sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E);
  sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695);
  sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537);
  sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE);
  sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532);
  sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9);
  sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576);
  sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44);

  // Absorb the first 32 bytes (words pair-swapped, byte-swapped)
  x0 ^= SWAP4(hash->h4[1]);
  x1 ^= SWAP4(hash->h4[0]);
  x2 ^= SWAP4(hash->h4[3]);
  x3 ^= SWAP4(hash->h4[2]);
  x4 ^= SWAP4(hash->h4[5]);
  x5 ^= SWAP4(hash->h4[4]);
  x6 ^= SWAP4(hash->h4[7]);
  x7 ^= SWAP4(hash->h4[6]);

  for (int i = 0; i < 13; i++) {
    SIXTEEN_ROUNDS;

    if (i == 0) {
      // Absorb the second 32 bytes
      x0 ^= SWAP4(hash->h4[9]);
      x1 ^= SWAP4(hash->h4[8]);
      x2 ^= SWAP4(hash->h4[11]);
      x3 ^= SWAP4(hash->h4[10]);
      x4 ^= SWAP4(hash->h4[13]);
      x5 ^= SWAP4(hash->h4[12]);
      x6 ^= SWAP4(hash->h4[15]);
      x7 ^= SWAP4(hash->h4[14]);
    } else if (i == 1) {
      x0 ^= 0x80;                    // padding block
    } else if (i == 2) {
      xv ^= SPH_C32(1);              // finalization flag
    }
  }

  hash->h4[0] = x0; hash->h4[1] = x1; hash->h4[2] = x2; hash->h4[3] = x3;
  hash->h4[4] = x4; hash->h4[5] = x5; hash->h4[6] = x6; hash->h4[7] = x7;
  hash->h4[8] = x8; hash->h4[9] = x9; hash->h4[10] = xa; hash->h4[11] = xb;
  hash->h4[12] = xc; hash->h4[13] = xd; hash->h4[14] = xe; hash->h4[15] = xf;

  barrier(CLK_GLOBAL_MEM_FENCE);
}

/* Stage 8: shavite512. Copies the AES tables into __local memory first. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search8(__global hash_t* hashes)
{
  uint gid = get_global_id(0);
  __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);
  __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];

  int init = get_local_id(0);
  int step = get_local_size(0);

  // Cooperative copy of the AES round tables to __local memory
  for (int i = init; i < 256; i += step) {
    AES0[i] = AES0_C[i];
    AES1[i] = AES1_C[i];
    AES2[i] = AES2_C[i];
    AES3[i] = AES3_C[i];
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  // shavite
  // NOTE(review): the IV and state declarations below had been swallowed into
  // comments in the pasted copy; restored.
  // IV
  sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC);
  sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC);
  sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47);
  sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A);

  // state (round keys rk00..rk1F = 128-byte message buffer)
  sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07;
  sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
  sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
  sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

  rk00 = hash->h4[0];  rk01 = hash->h4[1];  rk02 = hash->h4[2];  rk03 = hash->h4[3];
  rk04 = hash->h4[4];  rk05 = hash->h4[5];  rk06 = hash->h4[6];  rk07 = hash->h4[7];
  rk08 = hash->h4[8];  rk09 = hash->h4[9];  rk0A = hash->h4[10]; rk0B = hash->h4[11];
  rk0C = hash->h4[12]; rk0D = hash->h4[13]; rk0E = hash->h4[14]; rk0F = hash->h4[15];
  rk10 = 0x80;                        // padding + encoded lengths
  rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0;
  rk1B = 0x2000000;
  rk1C = rk1D = rk1E = 0;
  rk1F = 0x2000000;

  // NOTE(review): c512() comes from the included shavite.cl; 'buf' is expected
  // to be provided by that kernel's macro environment — confirm against it.
  c512(buf);

  hash->h4[0] = h0;  hash->h4[1] = h1;  hash->h4[2] = h2;  hash->h4[3] = h3;
  hash->h4[4] = h4;  hash->h4[5] = h5;  hash->h4[6] = h6;  hash->h4[7] = h7;
  hash->h4[8] = h8;  hash->h4[9] = h9;  hash->h4[10] = hA; hash->h4[11] = hB;
  hash->h4[12] = hC; hash->h4[13] = hD; hash->h4[14] = hE; hash->h4[15] = hF;

  barrier(CLK_GLOBAL_MEM_FENCE);
}

attribute((reqd_work_group_size(WORKSIZE, 1, 1))) kernel void search9(global hash_t* hashes) { uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

// simd s32 q[256]; unsigned char x[128];

for(unsigned int i = 0; i < 64; i++) x[i] = hash->h1[i];

for(unsigned int i = 64; i < 128; i++) x[i] = 0;

u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);

FFT256(0, 1, 0, ll1); for (int i = 0; i < 256; i ++) { s32 tq;

  tq = q[i] + yoff_b_n[i];
  tq = REDS2(tq);
  tq = REDS1(tq);
  tq = REDS1(tq);
  q[i] = (tq <= 128 ? tq : tq - 257);

}

A0 ^= hash->h4[0]; A1 ^= hash->h4[1]; A2 ^= hash->h4[2]; A3 ^= hash->h4[3]; A4 ^= hash->h4[4]; A5 ^= hash->h4[5]; A6 ^= hash->h4[6]; A7 ^= hash->h4[7]; B0 ^= hash->h4[8]; B1 ^= hash->h4[9]; B2 ^= hash->h4[10]; B3 ^= hash->h4[11]; B4 ^= hash->h4[12]; B5 ^= hash->h4[13]; B6 ^= hash->h4[14]; B7 ^= hash->h4[15];

ONE_ROUNDBIG(0, 0, 3, 23, 17, 27); ONE_ROUNDBIG(1, 1, 28, 19, 22, 7); ONE_ROUNDBIG(2, 2, 29, 9, 15, 5); ONE_ROUNDBIG(3, 3, 4, 13, 10, 25);

STEP_BIG( C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), IF, 4, 13, PP84); STEP_BIG( C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), IF, 13, 10, PP85); STEP_BIG( C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), IF, 10, 25, PP86); STEP_BIG( C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), IF, 25, 4, PP80);

u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7;

define q SIMD_Q

A0 ^= 0x200;

ONE_ROUNDBIG(0, 0, 3, 23, 17, 27); ONE_ROUNDBIG(1, 1, 28, 19, 22, 7); ONE_ROUNDBIG(2, 2, 29, 9, 15, 5); ONE_ROUNDBIG(3, 3, 4, 13, 10, 25); STEP_BIG( COPY_A0, COPY_A1, COPY_A2, COPY_A3, COPY_A4, COPY_A5, COPY_A6, COPY_A7, IF, 4, 13, PP84); STEP_BIG( COPY_B0, COPY_B1, COPY_B2, COPY_B3, COPY_B4, COPY_B5, COPY_B6, COPY_B7, IF, 13, 10, PP85); STEP_BIG( COPY_C0, COPY_C1, COPY_C2, COPY_C3, COPY_C4, COPY_C5, COPY_C6, COPY_C7, IF, 10, 25, PP86); STEP_BIG( COPY_D0, COPY_D1, COPY_D2, COPY_D3, COPY_D4, COPY_D5, COPY_D6, COPY_D7, IF, 25, 4, PP80);

undef q

hash->h4[0] = A0; hash->h4[1] = A1; hash->h4[2] = A2; hash->h4[3] = A3; hash->h4[4] = A4; hash->h4[5] = A5; hash->h4[6] = A6; hash->h4[7] = A7; hash->h4[8] = B0; hash->h4[9] = B1; hash->h4[10] = B2; hash->h4[11] = B3; hash->h4[12] = B4; hash->h4[13] = B5; hash->h4[14] = B6; hash->h4[15] = B7;

barrier(CLK_GLOBAL_MEM_FENCE); }

attribute((reqd_work_group_size(WORKSIZE, 1, 1))) kernel void search10(global hash_t* hashes) { uint gid = get_global_id(0); uint offset = get_global_offset(0); hash_t hash; __global hash_t *hashp = &(hashes[gid-offset]);

__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];

int init = get_local_id(0); int step = get_local_size(0);

for (int i = init; i < 256; i += step) { AES0[i] = AES0_C[i]; AES1[i] = AES1_C[i]; AES2[i] = AES2_C[i]; AES3[i] = AES3_C[i]; }

barrier(CLK_LOCAL_MEM_FENCE);

for (int i = 0; i < 8; i++) hash.h8[i] = hashes[gid-offset].h8[i];

// echo sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0;

sph_u32 K0 = 512; sph_u32 K1 = 0; sph_u32 K2 = 0; sph_u32 K3 = 0;

W00 = Vb00; W01 = Vb01; W10 = Vb10; W11 = Vb11; W20 = Vb20; W21 = Vb21; W30 = Vb30; W31 = Vb31; W40 = Vb40; W41 = Vb41; W50 = Vb50; W51 = Vb51; W60 = Vb60; W61 = Vb61; W70 = Vb70; W71 = Vb71; W80 = hash.h8[0]; W81 = hash.h8[1]; W90 = hash.h8[2]; W91 = hash.h8[3]; WA0 = hash.h8[4]; WA1 = hash.h8[5]; WB0 = hash.h8[6]; WB1 = hash.h8[7]; WC0 = 0x80; WC1 = 0; WD0 = 0; WD1 = 0; WE0 = 0; WE1 = 0x200000000000000; WF0 = 0x200; WF1 = 0;

for (unsigned u = 0; u < 10; u ++) BIG_ROUND;

hashp->h8[0] = hash.h8[0] ^ Vb00 ^ W00 ^ W80; hashp->h8[1] = hash.h8[1] ^ Vb01 ^ W01 ^ W81; hashp->h8[2] = hash.h8[2] ^ Vb10 ^ W10 ^ W90; hashp->h8[3] = hash.h8[3] ^ Vb11 ^ W11 ^ W91; hashp->h8[4] = hash.h8[4] ^ Vb20 ^ W20 ^ WA0; hashp->h8[5] = hash.h8[5] ^ Vb21 ^ W21 ^ WA1; hashp->h8[6] = hash.h8[6] ^ Vb30 ^ W30 ^ WB0; hashp->h8[7] = hash.h8[7] ^ Vb31 ^ W31 ^ WB1;

barrier(CLK_GLOBAL_MEM_FENCE); }

attribute((reqd_work_group_size(WORKSIZE, 1, 1))) kernel void search11(global hash_t* hashes) { uint gid = get_global_id(0); uint offset = get_global_offset(0); __global hash_t *hash = &(hashes[gid-offset]);

sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };

define buf(u) hash->h1[i + u]

for(int i = 0; i < 64; i += 8) { INPUT_BIG; P_BIG; T_BIG; }

undef buf

define buf(u) (u == 0 ? 0x80 : 0)

INPUT_BIG; P_BIG; T_BIG;

undef buf

define buf(u) (u == 6 ? 2 : 0)

INPUT_BIG; PF_BIG; T_BIG;

for(unsigned u = 0; u < 16; u ++) hash->h4[u] = h[u];

barrier(CLK_GLOBAL_MEM_FENCE); }

attribute((reqd_work_group_size(WORKSIZE, 1, 1))) kernel void search12(global hash_t* hashes) { uint gid = get_global_id(0); uint offset = get_global_offset(0); __global hash_t *hash = &(hashes[gid-offset]);

// fugue sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; sph_u32 S30, S31, S32, S33, S34, S35;

ulong fc_bit_count = (sph_u64) 64 << 3;

S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);

FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2])); FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5])); FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8])); FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB])); FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE])); FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);

// apply round shift if necessary int i;

for (i = 0; i < 32; i ++) { ROR3; CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); SMIX(S00, S01, S02, S03); }

for (i = 0; i < 13; i ++) { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; ROR9; SMIX(S00, S01, S02, S03); S04 ^= S00; S10 ^= S00; S18 ^= S00; S27 ^= S00; ROR9; SMIX(S00, S01, S02, S03); S04 ^= S00; S10 ^= S00; S19 ^= S00; S27 ^= S00; ROR9; SMIX(S00, S01, S02, S03); S04 ^= S00; S10 ^= S00; S19 ^= S00; S28 ^= S00; ROR8; SMIX(S00, S01, S02, S03); } S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00;

hash->h4[0] = SWAP4(S01); hash->h4[1] = SWAP4(S02); hash->h4[2] = SWAP4(S03); hash->h4[3] = SWAP4(S04); hash->h4[4] = SWAP4(S09); hash->h4[5] = SWAP4(S10); hash->h4[6] = SWAP4(S11); hash->h4[7] = SWAP4(S12); hash->h4[8] = SWAP4(S18); hash->h4[9] = SWAP4(S19); hash->h4[10] = SWAP4(S20); hash->h4[11] = SWAP4(S21); hash->h4[12] = SWAP4(S27); hash->h4[13] = SWAP4(S28); hash->h4[14] = SWAP4(S29); hash->h4[15] = SWAP4(S30);

// bool result = (hash->h8[3] <= target); // if (result) // output[atomic_inc(output+0xFF)] = SWAP4(gid);

barrier(CLK_GLOBAL_MEM_FENCE); }

attribute((reqd_work_group_size(WORKSIZE, 1, 1))) kernel void search13(global hash_t* hashes) { uint gid = get_global_id(0); uint offset = get_global_offset(0); __global hash_t *hash = &(hashes[gid-offset]);

// shabal

sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7],
    A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7],
    B8 = B_init_512[8], B9 = B_init_512[9], BA = B_init_512[10], BB = B_init_512[11], BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
sph_u32 C0 = C_init_512[0], C1 = C_init_512[1], C2 = C_init_512[2], C3 = C_init_512[3], C4 = C_init_512[4], C5 = C_init_512[5], C6 = C_init_512[6], C7 = C_init_512[7],
    C8 = C_init_512[8], C9 = C_init_512[9], CA = C_init_512[10], CB = C_init_512[11], CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
sph_u32 Wlow = 1, Whigh = 0;

M0 = hash->h4[0];
M1 = hash->h4[1];
M2 = hash->h4[2];
M3 = hash->h4[3];
M4 = hash->h4[4];
M5 = hash->h4[5];
M6 = hash->h4[6];
M7 = hash->h4[7];
M8 = hash->h4[8];
M9 = hash->h4[9];
MA = hash->h4[10];
MB = hash->h4[11];
MC = hash->h4[12];
MD = hash->h4[13];
ME = hash->h4[14];
MF = hash->h4[15];

INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
INPUT_BLOCK_SUB;
SWAP_BC;
INCR_W;

M0 = 0x80;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;

INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for (unsigned i = 0; i < 3; i ++) {
SWAP_BC;
XOR_W;
APPLY_P;
}

hash->h4[0] = B0;
hash->h4[1] = B1;
hash->h4[2] = B2;
hash->h4[3] = B3;
hash->h4[4] = B4;
hash->h4[5] = B5;
hash->h4[6] = B6;
hash->h4[7] = B7;
hash->h4[8] = B8;
hash->h4[9] = B9;
hash->h4[10] = BA;
hash->h4[11] = BB;
hash->h4[12] = BC;
hash->h4[13] = BD;
hash->h4[14] = BE;
hash->h4[15] = BF;

barrier(CLK_GLOBAL_MEM_FENCE); }

attribute((reqd_work_group_size(WORKSIZE, 1, 1))) kernel void search14(global hash_t* hashes, global uint* output, const ulong target) { uint gid = get_global_id(0); uint offset = get_global_offset(0); global hash_t *hash = &(hashes[gid-offset]);

// whirlpool sph_u64 n0, n1, n2, n3, n4, n5, n6, n7; sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; sph_u64 state[8];

n0 = (hash->h8[0]);
n1 = (hash->h8[1]);
n2 = (hash->h8[2]);
n3 = (hash->h8[3]);
n4 = (hash->h8[4]);
n5 = (hash->h8[5]);
n6 = (hash->h8[6]);
n7 = (hash->h8[7]);

h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;

n0 ^= h0;
n1 ^= h1;
n2 ^= h2;
n3 ^= h3;
n4 ^= h4;
n5 ^= h5;
n6 ^= h6;
n7 ^= h7;

for (unsigned r = 0; r < 10; r ++) {
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
}

state[0] = n0 ^ (hash->h8[0]);
state[1] = n1 ^ (hash->h8[1]);
state[2] = n2 ^ (hash->h8[2]);
state[3] = n3 ^ (hash->h8[3]);
state[4] = n4 ^ (hash->h8[4]);
state[5] = n5 ^ (hash->h8[5]);
state[6] = n6 ^ (hash->h8[6]);
state[7] = n7 ^ (hash->h8[7]);

n0 = 0x80;
n1 = n2 = n3 = n4 = n5 = n6 = 0;
n7 = 0x2000000000000;

h0 = state[0];
h1 = state[1];
h2 = state[2];
h3 = state[3];
h4 = state[4];
h5 = state[5];
h6 = state[6];
h7 = state[7];

n0 ^= h0;
n1 ^= h1;
n2 ^= h2;
n3 ^= h3;
n4 ^= h4;
n5 ^= h5;
n6 ^= h6;
n7 ^= h7;

for (unsigned r = 0; r < 10; r ++) {
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
}

state[0] ^= n0 ^ 0x80;
state[1] ^= n1;
state[2] ^= n2;
state[3] ^= n3;
state[4] ^= n4;
state[5] ^= n5;
state[6] ^= n6;
state[7] ^= n7 ^ 0x2000000000000;

for (unsigned i = 0; i < 8; i ++)
hash->h8[i] = state[i];

// for(uint i = 0; i < 8; i++) // output[(NUMHASH * 9) * 15 + gid * 9 + i] = hash->h8[i];

// output[(NUMHASH * 9) * 15 + gid * 9 + 8] = nonce;

bool result = (hash->h8[3] <= target); if (result) output[atomic_inc(output+0xFF)] = SWAP4(gid);

barrier(CLK_GLOBAL_MEM_FENCE); }

endif // BITBLOCK_CL

kenshirothefist commented 10 years ago

This has already been implemented in this repo: https://github.com/aznboy84/X15GPU .. somebody should contact aznboy84 (https://bitcointalk.org/index.php?action=profile;u=208678) and do the pull/merge ...

platinum4 commented 10 years ago

May or may not be necessary, I think ystarnaud is on this as we speak, but it may get added to its own new, separate branch.

Looks like aznboy84 wrapped up an x64 Windows build as well, so that repo seems to be in good order, although I'm not sure that much was changed except for adding the ability to specify bitblock.cl and then adding its corresponding .c and .h files

On 6/27/2014 6:34 AM, kenshirothefist wrote:

This has already been implemented in this repo: https://github.com/aznboy84/X15GPU .. somebody should contact aznboy84 (https://bitcointalk.org/index.php?action=profile;u=208678) and do the pull/merge ...

— Reply to this email directly or view it on GitHub https://github.com/sgminer-dev/sgminer/issues/290#issuecomment-47333454.

Elbandi commented 10 years ago

there is something with whirlpool hash, i cannot compile the kernel:

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fffec8d2700 (LWP 16307)]
0x00007ffff02aa5ac in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
(gdb) where
#0  0x00007ffff02aa5ac in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#1  0x00007ffff02ae740 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#2  0x00007ffff02aea7f in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#3  0x00007ffff0015151 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#4  0x00007ffff0013abc in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#5  0x00007ffff0014210 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#6  0x00007ffff0017b7a in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#7  0x00007ffff03543f4 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#8  0x00007ffff038653c in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#9  0x00007ffff0386e25 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#10 0x00007ffff0388d24 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#11 0x00007ffff037776c in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#12 0x00007ffff037668b in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#13 0x00007ffff034aa24 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#14 0x00007fffefdab7f2 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#15 0x00007fffefdabb3e in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#16 0x00007fffefdba106 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#17 0x00007fffefdb937a in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#18 0x00007fffefd585f5 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#19 0x00007fffefd6a8a0 in ?? () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#20 0x00007fffefd509d7 in clBuildProgram () from /opt/AMDAPP/lib/x86_64/libamdocl64.so
#21 0x000000000043ab60 in build_opencl_kernel (data=0x7fffec8d0a20, filename=<optimized out>) at ocl/build_kernel.c:126
#22 0x000000000042f89e in initCl (gpu=0, name=0x7fffec8d1890 "Tahiti", nameSize=256, algorithm=0x718a40) at ocl.c:400
#23 0x000000000042ab14 in opencl_thread_prepare (thr=<optimized out>) at driver-opencl.c:1242

My mate tested it and it works for him, so there must be something incompatible on my system. Maybe the driver version is too low; I will try to upgrade it.

platinum4 commented 10 years ago

[quote author=aznboy84 link=topic=665159.msg7546671#msg7546671 date=1403875747] [quote author=platinum4 link=topic=665159.msg7546088#msg7546088 date=1403873777] aznboy84, can you pull request with ystarnaud over at https://github.com/sgminer-dev/sgminer/tree/v5_0-new-algo ? [/quote]

Can't ... just signed up github today ... but i guess it's private [/quote]

How do we get this guy's pull request to go through?

platinum4 commented 10 years ago

Closed as it is now a duplicate, handled by Pull Request #294.