diff --git a/nano/node/openclwork.cpp b/nano/node/openclwork.cpp index e887fa83..88288d7d 100644 --- a/nano/node/openclwork.cpp +++ b/nano/node/openclwork.cpp @@ -16,379 +16,113 @@ bool nano::opencl_loaded{ false }; namespace { std::string opencl_program = R"%%%( -enum blake2b_constant -{ - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 +enum Blake2b_IV { + iv0 = 0x6a09e667f3bcc908UL, + iv1 = 0xbb67ae8584caa73bUL, + iv2 = 0x3c6ef372fe94f82bUL, + iv3 = 0xa54ff53a5f1d36f1UL, + iv4 = 0x510e527fade682d1UL, + iv5 = 0x9b05688c2b3e6c1fUL, + iv6 = 0x1f83d9abfb41bd6bUL, + iv7 = 0x5be0cd19137e2179UL, }; -typedef struct __blake2b_param -{ - uchar digest_length; // 1 - uchar key_length; // 2 - uchar fanout; // 3 - uchar depth; // 4 - uint leaf_length; // 8 - ulong node_offset; // 16 - uchar node_depth; // 17 - uchar inner_length; // 18 - uchar reserved[14]; // 32 - uchar salt[BLAKE2B_SALTBYTES]; // 48 - uchar personal[BLAKE2B_PERSONALBYTES]; // 64 -} blake2b_param; - -typedef struct __blake2b_state -{ - ulong h[8]; - ulong t[2]; - ulong f[2]; - uchar buf[2 * BLAKE2B_BLOCKBYTES]; - size_t buflen; - uchar last_node; -} blake2b_state; - -__constant static const ulong blake2b_IV[8] = -{ - 0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, - 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL, - 0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, - 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL +enum IV_Derived { + nano_xor_iv0 = 0x6a09e667f2bdc900UL, // iv1 ^ 0x1010000 ^ outlen + nano_xor_iv4 = 0x510e527fade682f9UL, // iv4 ^ inbytes + nano_xor_iv6 = 0xe07c265404be4294UL, // iv6 ^ ~0 }; -__constant static const uchar blake2b_sigma[12][16] = +#ifdef cl_amd_media_ops +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +static inline ulong rotr64(ulong x, int shift) { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - - -static inline int blake2b_set_lastnode( blake2b_state *S ) -{ - S->f[1] = ~0UL; - return 0; + uint2 x2 = as_uint2(x); + if (shift < 32) + return as_ulong(amd_bitalign(x2.s10, x2, shift)); + return as_ulong(amd_bitalign(x2, x2.s10, (shift - 32))); } - -/* Some helper functions, not necessarily useful */ -static inline int blake2b_set_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_set_lastnode( S ); - - S->f[0] = ~0UL; - return 0; -} - -static inline int blake2b_increment_counter( blake2b_state *S, const ulong inc ) -{ - S->t[0] += inc; - S->t[1] += ( S->t[0] < inc ); - return 0; -} - -static inline ulong load64( const void *src ) -{ -#if defined(__ENDIAN_LITTLE__) - return *( ulong * )( src ); #else - const uchar *p = ( uchar * )src; - ulong w = *p++; - w |= ( ulong )( *p++ ) << 8; - w |= ( ulong )( *p++ ) << 16; - w |= ( ulong )( *p++ ) << 24; - w |= ( ulong )( *p++ ) << 32; - w |= ( ulong )( *p++ ) << 40; - w |= ( ulong )( *p++ ) << 48; - w |= ( ulong )( *p++ ) << 56; - return w; +static inline ulong rotr64(ulong x, int shift) +{ + return rotate(x, 64UL - shift); +} #endif -} -static inline void store32( void *dst, uint w ) +#define G32(m0, m1, m2, m3, vva, vb1, vb2, vvc, vd1, vd2) \ + do { \ + vva += (ulong2)(vb1 + m0, vb2 + m2); \ + vd1 = rotr64(vd1 ^ vva.s0, 32); \ + vd2 = rotr64(vd2 ^ vva.s1, 32); \ + vvc += (ulong2)(vd1, vd2); \ + vb1 = rotr64(vb1 ^ vvc.s0, 24); \ + vb2 = rotr64(vb2 ^ vvc.s1, 24); \ + vva += (ulong2)(vb1 + m1, vb2 + m3); \ + vd1 = rotr64(vd1 ^ vva.s0, 16); \ + vd2 = rotr64(vd2 ^ vva.s1, 16); \ + vvc += (ulong2)(vd1, vd2); \ + vb1 = rotr64(vb1 ^ vvc.s0, 63); \ + vb2 = rotr64(vb2 ^ vvc.s1, 63); \ + } while (0) + +#define G2v(m0, m1, m2, m3, a, b, c, d) \ + G32(m0, m1, m2, m3, vv[a / 2], vv[b / 2].s0, vv[b / 2].s1, vv[c / 2], \ + vv[d / 2].s0, vv[d / 2].s1) + +#define G2v_split(m0, m1, m2, m3, a, vb1, vb2, c, vd1, vd2) \ + G32(m0, m1, m2, m3, vv[a / 2], vb1, vb2, vv[c / 2], vd1, vd2) + +#define ROUND(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, \ + m15) \ + do { \ + G2v(m0, m1, m2, m3, 0, 4, 8, 12); \ + G2v(m4, m5, m6, m7, 2, 6, 10, 14); \ + G2v_split(m8, m9, m10, m11, 0, vv[5 / 2].s1, vv[6 / 2].s0, 10, \ + vv[15 / 2].s1, vv[12 / 2].s0); \ + G2v_split(m12, m13, m14, m15, 2, vv[7 / 2].s1, vv[4 / 2].s0, 8, \ + vv[13 / 2].s1, vv[14 / 2].s0); \ + } while (0) + +static inline ulong blake2b(ulong const nonce, __constant ulong *h) { -#if defined(__ENDIAN_LITTLE__) - *( uint * )( dst ) = w; -#else - uchar *p = ( uchar * )dst; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; -#endif + ulong2 vv[8] = { + {nano_xor_iv0, iv1}, {iv2, iv3}, {iv4, iv5}, + {iv6, iv7}, {iv0, iv1}, {iv2, iv3}, + {nano_xor_iv4, iv5}, {nano_xor_iv6, iv7}, + }; + + ROUND(nonce, h[0], h[1], h[2], h[3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + ROUND(0, 0, h[3], 0, 0, 0, 0, 0, h[0], 0, nonce, h[1], 0, 0, 0, h[2]); + ROUND(0, 0, 0, nonce, 0, h[1], 0, 0, 0, 0, h[2], 0, 0, h[0], 0, h[3]); + ROUND(0, 0, h[2], h[0], 0, 0, 0, 0, h[1], 0, 0, 0, h[3], nonce, 0, 0); + ROUND(0, nonce, 0, 0, h[1], h[3], 0, 0, 0, h[0], 0, 0, 0, 0, h[2], 0); + ROUND(h[1], 0, 0, 0, nonce, 0, 0, h[2], h[3], 0, 0, 0, 0, 0, h[0], 0); + ROUND(0, 0, h[0], 0, 0, 0, h[3], 0, nonce, 0, 0, h[2], 0, h[1], 0, 0); + ROUND(0, 0, 0, 0, 0, h[0], h[2], 0, 0, nonce, 0, h[3], 0, 0, h[1], 0); + ROUND(0, 0, 0, 0, 0, h[2], nonce, 0, 0, h[1], 0, 0, h[0], h[3], 0, 0); + ROUND(0, h[1], 0, h[3], 0, 0, h[0], 0, 0, 0, 0, 0, h[2], 0, 0, nonce); + ROUND(nonce, h[0], h[1], h[2], h[3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + ROUND(0, 0, h[3], 0, 0, 0, 0, 0, h[0], 0, nonce, h[1], 0, 0, 0, h[2]); + + return nano_xor_iv0 ^ vv[0].s0 ^ vv[4].s0; } - -static inline void store64( void *dst, ulong w ) -{ -#if defined(__ENDIAN_LITTLE__) - *( ulong * )( dst ) = w; -#else - uchar *p = ( uchar * )dst; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; w >>= 8; - *p++ = ( uchar )w; -#endif -} - -static inline ulong rotr64( const ulong w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 64 - c ) ); -} - -static void ucharset (void * dest_a, int val, size_t count) -{ - uchar * dest = (uchar *)dest_a; - for (size_t i = 0; i < count; ++i) - { - *dest++ = val; - } -} - -/* init xors IV with input parameter block */ -static inline int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) -{ - uchar *p, *h; - __constant uchar *v; - v = ( __constant uchar * )( blake2b_IV ); - h = ( uchar * )( S->h ); - p = ( uchar * )( P ); - /* IV XOR ParamBlock */ - ucharset( S, 0, sizeof( blake2b_state ) ); - - for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; - - return 0; -} - -static inline int blake2b_init( blake2b_state *S, const uchar outlen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - P->digest_length = outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - ucharset( P->reserved, 0, sizeof( P->reserved ) ); - ucharset( P->salt, 0, sizeof( P->salt ) ); - ucharset( P->personal, 0, sizeof( P->personal ) ); - return blake2b_init_param( S, P ); -} - -static int blake2b_compress( blake2b_state *S, __private const uchar block[BLAKE2B_BLOCKBYTES] ) -{ - ulong m[16]; - ulong v[16]; - int i; - - for( i = 0; i < 16; ++i ) - m[i] = load64( block + i * sizeof( m[i] ) ); - - for( i = 0; i < 8; ++i ) - v[i] = S->h[i]; - - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = S->t[0] ^ blake2b_IV[4]; - v[13] = S->t[1] ^ blake2b_IV[5]; - v[14] = S->f[0] ^ blake2b_IV[6]; - v[15] = S->f[1] ^ blake2b_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - for( i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G +#undef G32 +#undef G2v +#undef G2v_split #undef ROUND - return 0; -} -static void ucharcpy (uchar * dst, uchar const * src, size_t count) +__kernel void nano_work(__constant ulong *attempt, + __global ulong *result_a, + __constant uchar *item_a, + __constant ulong *difficulty) { - for (size_t i = 0; i < count; ++i) - { - *dst++ = *src++; - } -} - -void printstate (blake2b_state * S) -{ - printf ("%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu ", S->h[0], S->h[1], S->h[2], S->h[3], S->h[4], S->h[5], S->h[6], S->h[7], S->t[0], S->t[1], S->f[0], S->f[1]); - for (int i = 0; i < 256; ++i) - { - printf ("%02x", S->buf[i]); - } - printf (" %lu %02x\n", S->buflen, S->last_node); -} - -/* inlen now in bytes */ -static int blake2b_update( blake2b_state *S, const uchar *in, ulong inlen ) -{ - while( inlen > 0 ) - { - size_t left = S->buflen; - size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; - - if( inlen > fill ) - { - ucharcpy( S->buf + left, in, fill ); // Fill buffer - S->buflen += fill; - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); // Compress - ucharcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2B_BLOCKBYTES; - in += fill; - inlen -= fill; - } - else // inlen <= fill - { - ucharcpy( S->buf + left, in, inlen ); - S->buflen += inlen; // Be lazy, do not compress - in += inlen; - inlen -= inlen; - } - } - - return 0; -} - -/* Is this correct? */ -static int blake2b_final( blake2b_state *S, uchar *out, uchar outlen ) -{ - uchar buffer[BLAKE2B_OUTBYTES]; - - if( S->buflen > BLAKE2B_BLOCKBYTES ) - { - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); - S->buflen -= BLAKE2B_BLOCKBYTES; - ucharcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); - } - - //blake2b_increment_counter( S, S->buflen ); - ulong inc = (ulong)S->buflen; - S->t[0] += inc; -// if ( S->t[0] < inc ) -// S->t[1] += 1; - // This seems to crash the opencl compiler though fortunately this is calculating size and we don't do things bigger than 2^32 - - blake2b_set_lastblock( S ); - ucharset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ - blake2b_compress( S, S->buf ); - - for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ - store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - ucharcpy( out, buffer, outlen ); - return 0; -} - -static void ucharcpyglb (uchar * dst, __global uchar const * src, size_t count) -{ - for (size_t i = 0; i < count; ++i) - { - *dst = *src; - ++dst; - ++src; - } -} - -__kernel void nano_work (__global ulong const * attempt, __global ulong * result_a, __global uchar const * item_a, __global ulong const * difficulty_a) -{ - int const thread = get_global_id (0); - uchar item_l [32]; - ucharcpyglb (item_l, item_a, 32); - ulong attempt_l = *attempt + thread; - blake2b_state state; - blake2b_init (&state, sizeof (ulong)); - blake2b_update (&state, (uchar *) &attempt_l, sizeof (ulong)); - blake2b_update (&state, item_l, 32); - ulong result; - blake2b_final (&state, (uchar *) &result, sizeof (result)); - if (result >= *difficulty_a) - { - *result_a = attempt_l; - } + const ulong attempt_l = *attempt + get_global_id(0); + if (blake2b(attempt_l, item_a) >= *difficulty) + *result_a = attempt_l; } )%%%"; } -void printstate (blake2b_state * S) -{ - std::cout << std::dec; - for (uint64_t x : { S->h[0], S->h[1], S->h[2], S->h[3], S->h[4], S->h[5], S->h[6], S->h[7], S->t[0], S->t[1], S->f[0], S->f[1] }) - { - std::cout << x << " "; - } - std::cout << std::endl; - - std::cout << std::hex; - for (uint8_t x : S->buf) - { - // print x as an integer, not as a char - std::cout << +x; - } - std::cout - << std::dec << " " << S->buflen - << std::hex << " " << S->last_node - << std::dec << std::endl; -} - nano::opencl_environment::opencl_environment (bool & error_a) { if (nano::opencl_loaded)