Use improved OpenCL kernel (#2902)

Credit and thanks go to @jserv and @PlasmaPower for the contribution.

Originally submitted as https://github.com/nanocurrency/nano-work-server/pull/18, the kernel was rewritten with the following improvements:
1. Completely remove unnecessary intermediate buffers;
2. Fully vectorize Blake2b;
3. Schedule registers in Blake2b;
4. Load all sigma constants in a single instruction and use macros for constant evaluation;
5. Assume messages never exceed 17 exabytes and apply the corresponding optimizations;
6. Implement an AMD fast path for rotr64 (see the sketch after this list);
7. Specify __constant for both optimization and error checking.
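
As a quick illustration of item 6, here is a minimal sketch that mirrors the rotr64 helpers in the kernel below: on AMD devices a 64-bit right rotation is built from two 32-bit amd_bitalign operations, with the portable rotate() builtin as the fallback.

    #ifdef cl_amd_media_ops
    #pragma OPENCL EXTENSION cl_amd_media_ops : enable
    static inline ulong rotr64(ulong x, int shift)
    {
        uint2 x2 = as_uint2(x); /* reinterpret the 64-bit word as two 32-bit halves */
        if (shift < 32)
            return as_ulong(amd_bitalign(x2.s10, x2, shift));
        return as_ulong(amd_bitalign(x2, x2.s10, (shift - 32)));
    }
    #else
    static inline ulong rotr64(ulong x, int shift)
    {
        /* rotate() is a left rotate, so rotating left by (64 - shift) gives a right rotate */
        return rotate(x, 64UL - shift);
    }
    #endif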

Co-authored-by: Jim Huang <jserv@biilabs.io>
Co-authored-by: Lee Bousfield <ljbousfield@gmail.com>
Signed-off-by: Guilherme Lawless <guilherme@nano.org>
Signed-off-by: Sergey Kroshnin <sergiysw@gmail.com>
Guilherme Lawless 2020-09-02 11:10:14 +01:00 committed by GitHub
commit ff424afa00

@@ -16,379 +16,113 @@ bool nano::opencl_loaded{ false };
namespace
{
std::string opencl_program = R"%%%(
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
enum Blake2b_IV {
iv0 = 0x6a09e667f3bcc908UL,
iv1 = 0xbb67ae8584caa73bUL,
iv2 = 0x3c6ef372fe94f82bUL,
iv3 = 0xa54ff53a5f1d36f1UL,
iv4 = 0x510e527fade682d1UL,
iv5 = 0x9b05688c2b3e6c1fUL,
iv6 = 0x1f83d9abfb41bd6bUL,
iv7 = 0x5be0cd19137e2179UL,
};
typedef struct __blake2b_param
{
uchar digest_length; // 1
uchar key_length; // 2
uchar fanout; // 3
uchar depth; // 4
uint leaf_length; // 8
ulong node_offset; // 16
uchar node_depth; // 17
uchar inner_length; // 18
uchar reserved[14]; // 32
uchar salt[BLAKE2B_SALTBYTES]; // 48
uchar personal[BLAKE2B_PERSONALBYTES]; // 64
} blake2b_param;
typedef struct __blake2b_state
{
ulong h[8];
ulong t[2];
ulong f[2];
uchar buf[2 * BLAKE2B_BLOCKBYTES];
size_t buflen;
uchar last_node;
} blake2b_state;
__constant static const ulong blake2b_IV[8] =
{
0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL,
0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL,
0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL,
0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL
};
enum IV_Derived {
nano_xor_iv0 = 0x6a09e667f2bdc900UL, // iv0 ^ 0x1010000 ^ outlen
nano_xor_iv4 = 0x510e527fade682f9UL, // iv4 ^ inbytes
nano_xor_iv6 = 0xe07c265404be4294UL, // iv6 ^ ~0
};
__constant static const uchar blake2b_sigma[12][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
static inline int blake2b_set_lastnode( blake2b_state *S )
{
S->f[1] = ~0UL;
return 0;
}
/* Some helper functions, not necessarily useful */
static inline int blake2b_set_lastblock( blake2b_state *S )
{
if( S->last_node ) blake2b_set_lastnode( S );
S->f[0] = ~0UL;
return 0;
}
static inline int blake2b_increment_counter( blake2b_state *S, const ulong inc )
{
S->t[0] += inc;
S->t[1] += ( S->t[0] < inc );
return 0;
}
static inline ulong load64( const void *src )
{
#if defined(__ENDIAN_LITTLE__)
return *( ulong * )( src );
#else
const uchar *p = ( uchar * )src;
ulong w = *p++;
w |= ( ulong )( *p++ ) << 8;
w |= ( ulong )( *p++ ) << 16;
w |= ( ulong )( *p++ ) << 24;
w |= ( ulong )( *p++ ) << 32;
w |= ( ulong )( *p++ ) << 40;
w |= ( ulong )( *p++ ) << 48;
w |= ( ulong )( *p++ ) << 56;
return w;
#endif
}
#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
static inline ulong rotr64(ulong x, int shift)
{
uint2 x2 = as_uint2(x);
if (shift < 32)
return as_ulong(amd_bitalign(x2.s10, x2, shift));
return as_ulong(amd_bitalign(x2, x2.s10, (shift - 32)));
}
#else
static inline ulong rotr64(ulong x, int shift)
{
return rotate(x, 64UL - shift);
}
#endif
static inline void store32( void *dst, uint w )
{
#if defined(__ENDIAN_LITTLE__)
*( uint * )( dst ) = w;
#else
uchar *p = ( uchar * )dst;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w;
#endif
}
#define G32(m0, m1, m2, m3, vva, vb1, vb2, vvc, vd1, vd2) \
do { \
vva += (ulong2)(vb1 + m0, vb2 + m2); \
vd1 = rotr64(vd1 ^ vva.s0, 32); \
vd2 = rotr64(vd2 ^ vva.s1, 32); \
vvc += (ulong2)(vd1, vd2); \
vb1 = rotr64(vb1 ^ vvc.s0, 24); \
vb2 = rotr64(vb2 ^ vvc.s1, 24); \
vva += (ulong2)(vb1 + m1, vb2 + m3); \
vd1 = rotr64(vd1 ^ vva.s0, 16); \
vd2 = rotr64(vd2 ^ vva.s1, 16); \
vvc += (ulong2)(vd1, vd2); \
vb1 = rotr64(vb1 ^ vvc.s0, 63); \
vb2 = rotr64(vb2 ^ vvc.s1, 63); \
} while (0)
#define G2v(m0, m1, m2, m3, a, b, c, d) \
G32(m0, m1, m2, m3, vv[a / 2], vv[b / 2].s0, vv[b / 2].s1, vv[c / 2], \
vv[d / 2].s0, vv[d / 2].s1)
#define G2v_split(m0, m1, m2, m3, a, vb1, vb2, c, vd1, vd2) \
G32(m0, m1, m2, m3, vv[a / 2], vb1, vb2, vv[c / 2], vd1, vd2)
#define ROUND(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, \
m15) \
do { \
G2v(m0, m1, m2, m3, 0, 4, 8, 12); \
G2v(m4, m5, m6, m7, 2, 6, 10, 14); \
G2v_split(m8, m9, m10, m11, 0, vv[5 / 2].s1, vv[6 / 2].s0, 10, \
vv[15 / 2].s1, vv[12 / 2].s0); \
G2v_split(m12, m13, m14, m15, 2, vv[7 / 2].s1, vv[4 / 2].s0, 8, \
vv[13 / 2].s1, vv[14 / 2].s0); \
} while (0)
static inline ulong blake2b(ulong const nonce, __constant ulong *h)
{
ulong2 vv[8] = {
{nano_xor_iv0, iv1}, {iv2, iv3}, {iv4, iv5},
{iv6, iv7}, {iv0, iv1}, {iv2, iv3},
{nano_xor_iv4, iv5}, {nano_xor_iv6, iv7},
};
ROUND(nonce, h[0], h[1], h[2], h[3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
ROUND(0, 0, h[3], 0, 0, 0, 0, 0, h[0], 0, nonce, h[1], 0, 0, 0, h[2]);
ROUND(0, 0, 0, nonce, 0, h[1], 0, 0, 0, 0, h[2], 0, 0, h[0], 0, h[3]);
ROUND(0, 0, h[2], h[0], 0, 0, 0, 0, h[1], 0, 0, 0, h[3], nonce, 0, 0);
ROUND(0, nonce, 0, 0, h[1], h[3], 0, 0, 0, h[0], 0, 0, 0, 0, h[2], 0);
ROUND(h[1], 0, 0, 0, nonce, 0, 0, h[2], h[3], 0, 0, 0, 0, 0, h[0], 0);
ROUND(0, 0, h[0], 0, 0, 0, h[3], 0, nonce, 0, 0, h[2], 0, h[1], 0, 0);
ROUND(0, 0, 0, 0, 0, h[0], h[2], 0, 0, nonce, 0, h[3], 0, 0, h[1], 0);
ROUND(0, 0, 0, 0, 0, h[2], nonce, 0, 0, h[1], 0, 0, h[0], h[3], 0, 0);
ROUND(0, h[1], 0, h[3], 0, 0, h[0], 0, 0, 0, 0, 0, h[2], 0, 0, nonce);
ROUND(nonce, h[0], h[1], h[2], h[3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
ROUND(0, 0, h[3], 0, 0, 0, 0, 0, h[0], 0, nonce, h[1], 0, 0, 0, h[2]);
return nano_xor_iv0 ^ vv[0].s0 ^ vv[4].s0;
}
#undef G32
#undef G2v
#undef G2v_split
#undef ROUND
static inline void store64( void *dst, ulong w )
{
#if defined(__ENDIAN_LITTLE__)
*( ulong * )( dst ) = w;
#else
uchar *p = ( uchar * )dst;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w; w >>= 8;
*p++ = ( uchar )w;
#endif
}
static inline ulong rotr64( const ulong w, const unsigned c )
{
return ( w >> c ) | ( w << ( 64 - c ) );
}
static void ucharset (void * dest_a, int val, size_t count)
{
uchar * dest = (uchar *)dest_a;
for (size_t i = 0; i < count; ++i)
{
*dest++ = val;
}
}
/* init xors IV with input parameter block */
static inline int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
{
uchar *p, *h;
__constant uchar *v;
v = ( __constant uchar * )( blake2b_IV );
h = ( uchar * )( S->h );
p = ( uchar * )( P );
/* IV XOR ParamBlock */
ucharset( S, 0, sizeof( blake2b_state ) );
for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
return 0;
}
static inline int blake2b_init( blake2b_state *S, const uchar outlen )
{
blake2b_param P[1];
if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
P->digest_length = outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
store32( &P->leaf_length, 0 );
store64( &P->node_offset, 0 );
P->node_depth = 0;
P->inner_length = 0;
ucharset( P->reserved, 0, sizeof( P->reserved ) );
ucharset( P->salt, 0, sizeof( P->salt ) );
ucharset( P->personal, 0, sizeof( P->personal ) );
return blake2b_init_param( S, P );
}
static int blake2b_compress( blake2b_state *S, __private const uchar block[BLAKE2B_BLOCKBYTES] )
{
ulong m[16];
ulong v[16];
int i;
for( i = 0; i < 16; ++i )
m[i] = load64( block + i * sizeof( m[i] ) );
for( i = 0; i < 8; ++i )
v[i] = S->h[i];
v[ 8] = blake2b_IV[0];
v[ 9] = blake2b_IV[1];
v[10] = blake2b_IV[2];
v[11] = blake2b_IV[3];
v[12] = S->t[0] ^ blake2b_IV[4];
v[13] = S->t[1] ^ blake2b_IV[5];
v[14] = S->f[0] ^ blake2b_IV[6];
v[15] = S->f[1] ^ blake2b_IV[7];
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2b_sigma[r][2*i+0]]; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b + m[blake2b_sigma[r][2*i+1]]; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
#define ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
ROUND( 10 );
ROUND( 11 );
for( i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
return 0;
}
static void ucharcpy (uchar * dst, uchar const * src, size_t count)
{
for (size_t i = 0; i < count; ++i)
{
*dst++ = *src++;
}
}
void printstate (blake2b_state * S)
{
printf ("%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu ", S->h[0], S->h[1], S->h[2], S->h[3], S->h[4], S->h[5], S->h[6], S->h[7], S->t[0], S->t[1], S->f[0], S->f[1]);
for (int i = 0; i < 256; ++i)
{
printf ("%02x", S->buf[i]);
}
printf (" %lu %02x\n", S->buflen, S->last_node);
}
/* inlen now in bytes */
static int blake2b_update( blake2b_state *S, const uchar *in, ulong inlen )
{
while( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = 2 * BLAKE2B_BLOCKBYTES - left;
if( inlen > fill )
{
ucharcpy( S->buf + left, in, fill ); // Fill buffer
S->buflen += fill;
blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
blake2b_compress( S, S->buf ); // Compress
ucharcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left
S->buflen -= BLAKE2B_BLOCKBYTES;
in += fill;
inlen -= fill;
}
else // inlen <= fill
{
ucharcpy( S->buf + left, in, inlen );
S->buflen += inlen; // Be lazy, do not compress
in += inlen;
inlen -= inlen;
}
}
return 0;
}
/* Is this correct? */
static int blake2b_final( blake2b_state *S, uchar *out, uchar outlen )
{
uchar buffer[BLAKE2B_OUTBYTES];
if( S->buflen > BLAKE2B_BLOCKBYTES )
{
blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
blake2b_compress( S, S->buf );
S->buflen -= BLAKE2B_BLOCKBYTES;
ucharcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
}
//blake2b_increment_counter( S, S->buflen );
ulong inc = (ulong)S->buflen;
S->t[0] += inc;
// if ( S->t[0] < inc )
// S->t[1] += 1;
// This seems to crash the opencl compiler though fortunately this is calculating size and we don't do things bigger than 2^32
blake2b_set_lastblock( S );
ucharset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
blake2b_compress( S, S->buf );
for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
ucharcpy( out, buffer, outlen );
return 0;
}
static void ucharcpyglb (uchar * dst, __global uchar const * src, size_t count)
{
for (size_t i = 0; i < count; ++i)
{
*dst = *src;
++dst;
++src;
}
}
__kernel void nano_work (__global ulong const * attempt, __global ulong * result_a, __global uchar const * item_a, __global ulong const * difficulty_a)
{
int const thread = get_global_id (0);
uchar item_l [32];
ucharcpyglb (item_l, item_a, 32);
ulong attempt_l = *attempt + thread;
blake2b_state state;
blake2b_init (&state, sizeof (ulong));
blake2b_update (&state, (uchar *) &attempt_l, sizeof (ulong));
blake2b_update (&state, item_l, 32);
ulong result;
blake2b_final (&state, (uchar *) &result, sizeof (result));
if (result >= *difficulty_a)
{
*result_a = attempt_l;
}
}
__kernel void nano_work(__constant ulong *attempt,
__global ulong *result_a,
__constant uchar *item_a,
__constant ulong *difficulty)
{
const ulong attempt_l = *attempt + get_global_id(0);
if (blake2b(attempt_l, item_a) >= *difficulty)
*result_a = attempt_l;
}
)%%%";
}
void printstate (blake2b_state * S)
{
std::cout << std::dec;
for (uint64_t x : { S->h[0], S->h[1], S->h[2], S->h[3], S->h[4], S->h[5], S->h[6], S->h[7], S->t[0], S->t[1], S->f[0], S->f[1] })
{
std::cout << x << " ";
}
std::cout << std::endl;
std::cout << std::hex;
for (uint8_t x : S->buf)
{
// print x as an integer, not as a char
std::cout << +x;
}
std::cout
<< std::dec << " " << S->buflen
<< std::hex << " " << S->last_node
<< std::dec << std::endl;
}
nano::opencl_environment::opencl_environment (bool & error_a)
{
if (nano::opencl_loaded)