diff --git a/CMakeLists.txt b/CMakeLists.txt index 6694216c..fd65953d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ else (WIN32) endif (WIN32) if (APPLE) - set (PLATFORM_LINK_FLAGS "-framework Foundation") + set (PLATFORM_LINK_FLAGS "-framework Foundation -framework OpenCL") else (APPLE) set (PLATFORM_LINK_FLAGS "") endif (APPLE) @@ -129,13 +129,16 @@ add_library (node rai/node/common.hpp rai/node/node.hpp rai/node/node.cpp + rai/node/openclwork.cpp + rai/node/openclwork.hpp rai/node/rpc.hpp rai/node/rpc.cpp rai/node/testing.hpp rai/node/testing.cpp rai/node/wallet.hpp rai/node/wallet.cpp - rai/node/working.hpp) + rai/node/working.hpp + rai/node/xorshift.hpp) add_executable (core_test rai/core_test/block.cpp diff --git a/rai/core_test/node.cpp b/rai/core_test/node.cpp index e7452920..4e5cc3ad 100644 --- a/rai/core_test/node.cpp +++ b/rai/core_test/node.cpp @@ -19,7 +19,7 @@ TEST (node, block_store_path_failure) auto service (boost::make_shared ()); rai::alarm alarm (*service); rai::logging logging; - rai::work_pool work; + rai::work_pool work (false); auto node (std::make_shared (init, *service, 0, rai::unique_path (), alarm, logging, work)); ASSERT_TRUE (node->wallets.items.empty ()); node->stop (); @@ -31,7 +31,7 @@ TEST (node, inactive_supply) auto service (boost::make_shared ()); rai::alarm alarm (*service); rai::node_config config; - rai::work_pool work; + rai::work_pool work (false); config.inactive_supply = 10; auto node (std::make_shared (init, *service, rai::unique_path (), alarm, config, work)); ASSERT_EQ (10, node->ledger.inactive_supply); @@ -43,7 +43,7 @@ TEST (node, password_fanout) auto service (boost::make_shared ()); rai::alarm alarm (*service); rai::node_config config; - rai::work_pool work; + rai::work_pool work (false); config.password_fanout = 10; auto node (std::make_shared (init, *service, rai::unique_path (), alarm, config, work)); auto wallet (node->wallets.create (100)); diff --git a/rai/core_test/work_pool.cpp b/rai/core_test/work_pool.cpp index 0bbd4a6e..a29dd4b7 100644 --- a/rai/core_test/work_pool.cpp +++ b/rai/core_test/work_pool.cpp @@ -1,9 +1,10 @@ #include #include +#include TEST (work, one) { - rai::work_pool pool; + rai::work_pool pool (false); rai::change_block block (1, 1, rai::keypair ().prv, 3, 4); block.block_work_set (pool.generate (block.root ())); ASSERT_FALSE (pool.work_validate (block)); @@ -11,7 +12,7 @@ TEST (work, one) TEST (work, validate) { - rai::work_pool pool; + rai::work_pool pool (false); rai::send_block send_block (1, 1, 2, rai::keypair ().prv, 4, 6); ASSERT_TRUE (pool.work_validate (send_block)); send_block.block_work_set (pool.generate (send_block.root ())); @@ -20,7 +21,7 @@ TEST (work, validate) TEST (work, cancel) { - rai::work_pool pool; + rai::work_pool pool (false); rai::uint256_union key (1); bool exited (false); std::thread thread ([&pool, &key, &exited] () @@ -36,4 +37,17 @@ TEST (work, cancel) } pool.cancel (key); thread.join (); +} + +TEST (work, opencl) +{ + rai::work_pool pool (true); + ASSERT_NE (nullptr, pool.opencl); + rai::uint256_union root; + for (auto i (0); i < 1; ++i) + { + rai::random_pool.GenerateBlock (root.bytes.data (), root.bytes.size ()); + auto result (pool.generate (root)); + ASSERT_FALSE (pool.work_validate (root, result)); + } } \ No newline at end of file diff --git a/rai/node/node.cpp b/rai/node/node.cpp index 265ebf30..0a0c813f 100644 --- a/rai/node/node.cpp +++ b/rai/node/node.cpp @@ -591,7 +591,8 @@ receive_minimum (rai::Mrai_ratio), inactive_supply (0), password_fanout (1024), io_threads (std::max (4, std::thread::hardware_concurrency ())), -work_threads (std::max (4, std::thread::hardware_concurrency ())) +work_threads (std::max (4, std::thread::hardware_concurrency ())), +opencl_work (false) { switch (rai::rai_network) { @@ -2955,7 +2956,8 @@ bool rai::handle_node_options (boost::program_options::variables_map & vm) rai::inactive_node::inactive_node () : service (boost::make_shared ()), -alarm (*service) +alarm (*service), +work (false) { auto working (rai::working_path ()); boost::filesystem::create_directories (working); diff --git a/rai/node/node.hpp b/rai/node/node.hpp index d1ce04bc..59017a47 100644 --- a/rai/node/node.hpp +++ b/rai/node/node.hpp @@ -307,6 +307,7 @@ public: unsigned password_fanout; unsigned io_threads; unsigned work_threads; + bool opencl_work; static std::chrono::seconds constexpr keepalive_period = std::chrono::seconds (60); static std::chrono::seconds constexpr keepalive_cutoff = keepalive_period * 5; static std::chrono::minutes constexpr wallet_backup_interval = std::chrono::minutes (5); diff --git a/rai/node/openclwork.cpp b/rai/node/openclwork.cpp new file mode 100644 index 00000000..632e36b8 --- /dev/null +++ b/rai/node/openclwork.cpp @@ -0,0 +1,594 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace +{ +std::string opencl_program = R"%%%( +enum blake2b_constant +{ + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 +}; + +typedef struct __blake2b_param +{ + uchar digest_length; // 1 + uchar key_length; // 2 + uchar fanout; // 3 + uchar depth; // 4 + uint leaf_length; // 8 + ulong node_offset; // 16 + uchar node_depth; // 17 + uchar inner_length; // 18 + uchar reserved[14]; // 32 + uchar salt[BLAKE2B_SALTBYTES]; // 48 + uchar personal[BLAKE2B_PERSONALBYTES]; // 64 +} blake2b_param; + +typedef struct __blake2b_state +{ + ulong h[8]; + ulong t[2]; + ulong f[2]; + uchar buf[2 * BLAKE2B_BLOCKBYTES]; + size_t buflen; + uchar last_node; +} blake2b_state; + +__constant static ulong blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +__constant static uchar blake2b_sigma[12][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + + +static inline int blake2b_set_lastnode( blake2b_state *S ) +{ + S->f[1] = ~0UL; + return 0; +} + +/* Some helper functions, not necessarily useful */ +static inline int blake2b_set_lastblock( blake2b_state *S ) +{ + if( S->last_node ) blake2b_set_lastnode( S ); + + S->f[0] = ~0UL; + return 0; +} + +static inline int blake2b_increment_counter( blake2b_state *S, const ulong inc ) +{ + S->t[0] += inc; + S->t[1] += ( S->t[0] < inc ); + return 0; +} + +static inline uint load32( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + return *( uint * )( src ); +#else + const uchar *p = ( uchar * )src; + uint w = *p++; + w |= ( uint )( *p++ ) << 8; + w |= ( uint )( *p++ ) << 16; + w |= ( uint )( *p++ ) << 24; + return w; +#endif +} + +static inline ulong load64( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + return *( ulong * )( src ); +#else + const uchar *p = ( uchar * )src; + ulong w = *p++; + w |= ( ulong )( *p++ ) << 8; + w |= ( ulong )( *p++ ) << 16; + w |= ( ulong )( *p++ ) << 24; + w |= ( ulong )( *p++ ) << 32; + w |= ( ulong )( *p++ ) << 40; + w |= ( ulong )( *p++ ) << 48; + w |= ( ulong )( *p++ ) << 56; + return w; +#endif +} + +static inline void store32( void *dst, uint w ) +{ +#if defined(__ENDIAN_LITTLE__) + *( uint * )( dst ) = w; +#else + uchar *p = ( uchar * )dst; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; +#endif +} + +static inline void store64( void *dst, ulong w ) +{ +#if defined(__ENDIAN_LITTLE__) + *( ulong * )( dst ) = w; +#else + uchar *p = ( uchar * )dst; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; w >>= 8; + *p++ = ( uchar )w; +#endif +} + +static inline ulong rotr64( const ulong w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* init xors IV with input parameter block */ +static inline int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) +{ + uchar *p, *h, *v; + v = ( uchar * )( blake2b_IV ); + h = ( uchar * )( S->h ); + p = ( uchar * )( P ); + /* IV XOR ParamBlock */ + memset( S, 0, sizeof( blake2b_state ) ); + + for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; + + return 0; +} + +static inline int blake2b_init( blake2b_state *S, const uchar outlen ) +{ + blake2b_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; + + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store64( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return blake2b_init_param( S, P ); +} + +static int blake2b_compress( blake2b_state *S, const uchar block[BLAKE2B_BLOCKBYTES] ) +{ + ulong m[16]; + ulong v[16]; + int i; + + for( i = 0; i < 16; ++i ) + m[i] = load64( block + i * sizeof( m[i] ) ); + + for( i = 0; i < 8; ++i ) + v[i] = S->h[i]; + + v[ 8] = blake2b_IV[0]; + v[ 9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = S->t[0] ^ blake2b_IV[4]; + v[13] = S->t[1] ^ blake2b_IV[5]; + v[14] = S->f[0] ^ blake2b_IV[6]; + v[15] = S->f[1] ^ blake2b_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2*i+0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2*i+1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + ROUND( 10 ); + ROUND( 11 ); + + for( i = 0; i < 8; ++i ) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND + return 0; +} + +static void ucharcpy (uchar * dst, uchar const * src, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + *dst++ = *src++; + } +} + +void printstate (blake2b_state * S) +{ + printf ("%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu ", S->h[0], S->h[1], S->h[2], S->h[3], S->h[4], S->h[5], S->h[6], S->h[7], S->t[0], S->t[1], S->f[0], S->f[1]); + for (int i = 0; i < 256; ++i) + { + printf ("%02x", S->buf[i]); + } + printf (" %lu %02x\n", S->buflen, S->last_node); +} + +/* inlen now in bytes */ +static int blake2b_update( blake2b_state *S, const uchar *in, ulong inlen ) +{ + while( inlen > 0 ) + { + size_t left = S->buflen; + size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; + + if( inlen > fill ) + { + ucharcpy( S->buf + left, in, fill ); // Fill buffer + S->buflen += fill; + blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); + blake2b_compress( S, S->buf ); // Compress + ucharcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left + S->buflen -= BLAKE2B_BLOCKBYTES; + in += fill; + inlen -= fill; + } + else // inlen <= fill + { + ucharcpy( S->buf + left, in, inlen ); + S->buflen += inlen; // Be lazy, do not compress + in += inlen; + inlen -= inlen; + } + } + + return 0; +} + +static void ucharset (uchar * dest, int val, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + *dest++ = val; + } +} + +/* Is this correct? */ +static int blake2b_final( blake2b_state *S, uchar *out, uchar outlen ) +{ + uchar buffer[BLAKE2B_OUTBYTES]; + + if( S->buflen > BLAKE2B_BLOCKBYTES ) + { + blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); + blake2b_compress( S, S->buf ); + S->buflen -= BLAKE2B_BLOCKBYTES; + ucharcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); + } + + //blake2b_increment_counter( S, S->buflen ); + ulong inc = (ulong)S->buflen; + S->t[0] += inc; +// if ( S->t[0] < inc ) +// S->t[1] += 1; + // This seems to crash the opencl compiler though fortunately this is calculating size and we don't do things bigger than 2^32 + + blake2b_set_lastblock( S ); + ucharset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ + blake2b_compress( S, S->buf ); + + for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ + store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); + + ucharcpy( out, buffer, outlen ); + return 0; +} + +static void ucharcpyglb (uchar * dst, __global uchar const * src, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + *dst = *src; + ++dst; + ++src; + } +} + +__kernel void raiblocks_work (__global ulong * attempt, __global ulong * result_a, __global uchar * item_a) +{ + int const thread = get_global_id (0); + uchar item_l [32]; + ucharcpyglb (item_l, item_a, 32); + ulong attempt_l = *attempt + thread; + blake2b_state state; + blake2b_init (&state, sizeof (ulong)); + blake2b_update (&state, (uchar *) &attempt_l, sizeof (ulong)); + blake2b_update (&state, item_l, 32); + ulong result; + blake2b_final (&state, (uchar *) &result, sizeof (result)); + if (result >= 0xffffffc000000000ul) + //if (result >= 0xff00000000000000ul) + { + *result_a = attempt_l; + } +} +)%%%"; +} + +void printstate (blake2b_state * S) +{ + printf ("%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", S->h[0], S->h[1], S->h[2], S->h[3], S->h[4], S->h[5], S->h[6], S->h[7], S->t[0], S->t[1], S->f[0], S->f[1]); + for (int i = 0; i < 256; ++i) + { + printf ("%02x", S->buf[i]); + } + printf (" %lu %02x\n", S->buflen, S->last_node); +} + +rai::opencl_environment::opencl_environment (bool & error_a) +{ + cl_uint platformIdCount = 0; + clGetPlatformIDs (0, nullptr, &platformIdCount); + std::vector platformIds (platformIdCount); + clGetPlatformIDs (platformIdCount, platformIds.data(), nullptr); + for (auto i (platformIds.begin ()), n (platformIds.end ()); i != n; ++i) + { + auto & devices_container (devices [*i]); + cl_uint deviceIdCount = 0; + clGetDeviceIDs (*i, CL_DEVICE_TYPE_ALL, 0, nullptr, &deviceIdCount); + std::vector deviceIds (deviceIdCount); + clGetDeviceIDs (*i, CL_DEVICE_TYPE_ALL, deviceIdCount, deviceIds.data (), nullptr); + for (auto j (deviceIds.begin ()), m (deviceIds.end ()); j != m; ++j) + { + devices_container.push_back (*j); + } + } +} + +void rai::opencl_environment::dump () +{ + auto index (0); + for (auto i (devices.begin ()), n (devices.end ()); i != n; ++i, ++index) + { + std::vector queries = {CL_PLATFORM_PROFILE, CL_PLATFORM_VERSION, CL_PLATFORM_NAME, CL_PLATFORM_VENDOR, CL_PLATFORM_EXTENSIONS}; + std::cout << "Platform: " << index << std::endl; + for (auto j (queries.begin ()), m (queries.end ()); j != m; ++j) + { + size_t platformInfoCount = 0; + clGetPlatformInfo(i->first, *j, 0, nullptr, &platformInfoCount); + std::vector info (platformInfoCount); + clGetPlatformInfo(i->first, *j, info.size (), info.data (), nullptr); + std::cout << info.data () << std::endl; + } + for (auto j (i->second.begin ()), m (i->second.end ()); j != m; ++j) + { + std::vector queries = {CL_DEVICE_NAME, CL_DEVICE_VENDOR, CL_DEVICE_PROFILE}; + std::cout << "Device: " << j - i->second.begin () << std::endl; + for (auto k (queries.begin ()), o (queries.end ()); k != o; ++k) + { + size_t platformInfoCount = 0; + clGetDeviceInfo(*j, *k, 0, nullptr, &platformInfoCount); + std::vector info (platformInfoCount); + clGetDeviceInfo(*j, *k, info.size (), info.data (), nullptr); + std::cout << '\t' << info.data () << std::endl; + } + size_t deviceTypeCount = 0; + clGetDeviceInfo(*j, CL_DEVICE_TYPE, 0, nullptr, &deviceTypeCount); + std::vector deviceTypeInfo (deviceTypeCount); + clGetDeviceInfo(*j, CL_DEVICE_TYPE, deviceTypeCount, deviceTypeInfo.data (), 0); + std::string device_type_string; + switch (deviceTypeInfo [0]) + { + case CL_DEVICE_TYPE_ACCELERATOR: + device_type_string = "ACCELERATOR"; + break; + case CL_DEVICE_TYPE_CPU: + device_type_string = "CPU"; + break; + case CL_DEVICE_TYPE_CUSTOM: + device_type_string = "CUSTOM"; + break; + case CL_DEVICE_TYPE_DEFAULT: + device_type_string = "DEFAULT"; + break; + case CL_DEVICE_TYPE_GPU: + device_type_string = "GPU"; + break; + default: + device_type_string = "Unknown"; + break; + } + std::cout << '\t' << device_type_string << std::endl; + size_t compilerAvailableCount = 0; + clGetDeviceInfo(*j, CL_DEVICE_COMPILER_AVAILABLE, 0, nullptr, &compilerAvailableCount); + std::vector compilerAvailableInfo (compilerAvailableCount); + clGetDeviceInfo(*j, CL_DEVICE_COMPILER_AVAILABLE, compilerAvailableCount, compilerAvailableInfo.data (), 0); + std::cout << '\t' << "Compiler available: " << (compilerAvailableInfo [0] ? "true" : "false") << std::endl; + size_t computeUnitsAvailableCount = 0; + clGetDeviceInfo(*j, CL_DEVICE_MAX_COMPUTE_UNITS, 0, nullptr, &computeUnitsAvailableCount); + std::vector computeUnitsAvailableInfo (computeUnitsAvailableCount); + clGetDeviceInfo(*j, CL_DEVICE_MAX_COMPUTE_UNITS, computeUnitsAvailableCount, computeUnitsAvailableInfo.data (), 0); + uint64_t computeUnits (computeUnitsAvailableInfo [0] | (computeUnitsAvailableInfo [1] << 8) | (computeUnitsAvailableInfo [2] << 16) | (computeUnitsAvailableInfo [3] << 24)); + std::cout << '\t' << "Compute units available: " << computeUnits << std::endl; + } + } +} + +rai::opencl_work::opencl_work (bool & error_a, rai::opencl_environment & environment_a, rai::work_pool & pool_a) : +context (0), +attempt_buffer (0), +result_buffer (0), +item_buffer (0), +program (0), +kernel (0), +queue (0), +pool (pool_a) +{ + rai::random_pool.GenerateBlock (reinterpret_cast (rand.s.data ()), rand.s.size () * sizeof (decltype (rand.s)::value_type)); + auto i (environment_a.devices.begin ()); + auto selected_platform (i->first); + std::array selected_devices; + selected_devices [0] = i->second [1]; + cl_context_properties contextProperties [] = + { + CL_CONTEXT_PLATFORM, + reinterpret_cast (selected_platform), + 0, 0 + }; + cl_int createContextError (0); + context = clCreateContext (contextProperties, selected_devices.size (), selected_devices.data (), nullptr, nullptr, &createContextError); + error_a |= createContextError != CL_SUCCESS; + if (!error_a) + { + cl_int queue_error (0); + queue = clCreateCommandQueue (context, selected_devices [0], 0, &queue_error); + error_a |= queue_error != CL_SUCCESS; + if (!error_a) + { + cl_int attempt_error (0); + attempt_buffer = clCreateBuffer (context, 0, sizeof (uint64_t), nullptr, &attempt_error); + error_a |= attempt_error != CL_SUCCESS; + if (!error_a) + { + cl_int result_error (0); + result_buffer = clCreateBuffer (context, 0, sizeof (uint64_t), nullptr, &result_error); + error_a |= result_error != CL_SUCCESS; + if (!error_a) + { + cl_int item_error (0); + size_t item_size (sizeof (rai::uint256_union)); + item_buffer = clCreateBuffer (context, 0, item_size, nullptr, &item_error); + error_a |= item_error != CL_SUCCESS; + if (!error_a) + { + cl_int program_error (0); + char const * program_data (opencl_program.data ()); + size_t program_length (opencl_program.size ()); + program = clCreateProgramWithSource (context, 1, &program_data, &program_length, &program_error); + error_a |= program_error != CL_SUCCESS; + if (!error_a) + { + error_a |= clBuildProgram (program, selected_devices.size (), selected_devices.data (), "-D __APPLE__", nullptr, nullptr) != CL_SUCCESS; + if (!error_a) + { + /*for (auto i (selected_devices.begin ()), n (selected_devices.end ()); i != n; ++i) + { + size_t log_size (0); + clGetProgramBuildInfo (program, *i, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size); + std::vector log (log_size); + clGetProgramBuildInfo (program, *i, CL_PROGRAM_BUILD_LOG, log.size (), log.data (), nullptr); + std::cout << log.data () << std::endl; + }*/ + cl_int kernel_error (0); + kernel = clCreateKernel (program, "raiblocks_work", &kernel_error); + error_a |= kernel_error != CL_SUCCESS; + if (!error_a) + { + cl_int arg0_error (clSetKernelArg (kernel, 0, sizeof (attempt_buffer), &attempt_buffer)); + error_a |= arg0_error != CL_SUCCESS; + if (!error_a) + { + cl_int arg1_error (clSetKernelArg (kernel, 1, sizeof (result_buffer), &result_buffer)); + error_a |= arg1_error != CL_SUCCESS; + if (!error_a) + { + cl_int arg2_error (clSetKernelArg (kernel, 2, sizeof (item_buffer), &item_buffer)); + error_a |= arg2_error != CL_SUCCESS; + } + } + } + } + } + } + } + } + } + } +} + +rai::opencl_work::~opencl_work () +{ + clReleaseKernel (kernel); + clReleaseProgram (program); + clReleaseContext (context); +} + +uint64_t rai::opencl_work::generate_work (rai::uint256_union const & root_a) +{ + std::lock_guard lock (mutex); + uint64_t result (0); + unsigned thread_count (rai::rai_network == rai::rai_networks::rai_test_network ? 128 : 1024 * 1024); + size_t work_size [] = { thread_count, 0, 0 }; + while (pool.work_validate (root_a, result)) + { + result = rand.next (); + cl_int write_error1 = clEnqueueWriteBuffer (queue, attempt_buffer, false, 0, sizeof (uint64_t), &result, 0, nullptr, nullptr); + cl_int write_error2 = clEnqueueWriteBuffer (queue, item_buffer, false, 0, sizeof (rai::uint256_union), root_a.bytes.data (), 0, nullptr, nullptr); + cl_int enqueue_error = clEnqueueNDRangeKernel (queue, kernel, 1, nullptr, work_size, nullptr, 0, nullptr, nullptr); + cl_int read_error1 = clEnqueueReadBuffer(queue, result_buffer, false, 0, sizeof (uint64_t), &result, 0, nullptr, nullptr); + cl_int finishError = clFinish (queue); + } + return result; +} \ No newline at end of file diff --git a/rai/node/openclwork.hpp b/rai/node/openclwork.hpp new file mode 100644 index 00000000..ba3c7cc0 --- /dev/null +++ b/rai/node/openclwork.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include + +#include +#include +#include + +#ifdef __APPLE__ +#include +#else +#include +#endif + +namespace rai +{ +class opencl_environment +{ +public: + opencl_environment (bool &); + void dump (); + std::map > devices; +}; +union uint256_union; +class work_pool; +class opencl_work +{ +public: + opencl_work (bool &, rai::opencl_environment &, rai::work_pool &); + ~opencl_work (); + uint64_t generate_work (rai::uint256_union const &); + std::mutex mutex; + cl_context context; + cl_mem attempt_buffer; + cl_mem result_buffer; + cl_mem item_buffer; + cl_program program; + cl_kernel kernel; + cl_command_queue queue; + rai::xorshift1024star rand; + rai::work_pool & pool; +}; +} \ No newline at end of file diff --git a/rai/node/testing.cpp b/rai/node/testing.cpp index dd6ae7f4..9824de33 100644 --- a/rai/node/testing.cpp +++ b/rai/node/testing.cpp @@ -5,7 +5,8 @@ rai::system::system (uint16_t port_a, size_t count_a) : service (new boost::asio::io_service), -alarm (*service) +alarm (*service), +work (false) { nodes.reserve (count_a); for (size_t i (0); i < count_a; ++i) diff --git a/rai/node/wallet.cpp b/rai/node/wallet.cpp index 2077e10c..8293c8af 100644 --- a/rai/node/wallet.cpp +++ b/rai/node/wallet.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -11,7 +12,7 @@ #include -rai::work_pool::work_pool () : +rai::work_pool::work_pool (bool opencl_work_a) : current (0), ticket (0), done (false) @@ -27,6 +28,12 @@ done (false) })); threads.push_back (std::move (thread)); } + if (opencl_work_a) + { + auto error (false); + rai::opencl_environment environment (error); + opencl.reset (new rai::opencl_work (error, environment, *this)); + } } rai::work_pool::~work_pool () @@ -38,32 +45,6 @@ rai::work_pool::~work_pool () } } -namespace -{ -class xorshift1024star -{ -public: - xorshift1024star (): - p (0) - { - } - std::array s; - unsigned p; - uint64_t next () - { - auto p_l (p); - auto pn ((p_l + 1) & 15); - p = pn; - uint64_t s0 = s[ p_l ]; - uint64_t s1 = s[ pn ]; - s1 ^= s1 << 31; // a - s1 ^= s1 >> 11; // b - s0 ^= s0 >> 30; // c - return ( s[ pn ] = s0 ^ s1 ) * 1181783497276652981LL; - } -}; -} - uint64_t rai::work_pool::work_value (rai::block_hash const & root_a, uint64_t work_a) { uint64_t result; @@ -183,19 +164,26 @@ boost::optional rai::work_pool::generate_maybe (rai::uint256_union co { assert (!root_a.is_zero ()); boost::optional result; - std::unique_lock lock (mutex); - pending.push_back (root_a); - producer_condition.notify_all (); - auto done (false); - while (!done) + if (opencl != nullptr) { - consumer_condition.wait (lock); - auto finish (completed.find (root_a)); - if (finish != completed.end ()) + result = opencl->generate_work (root_a); + } + else + { + std::unique_lock lock (mutex); + pending.push_back (root_a); + producer_condition.notify_all (); + auto done (false); + while (!done) { - done = true; - result = finish->second; - completed.erase (finish); + consumer_condition.wait (lock); + auto finish (completed.find (root_a)); + if (finish != completed.end ()) + { + done = true; + result = finish->second; + completed.erase (finish); + } } } return result; diff --git a/rai/node/wallet.hpp b/rai/node/wallet.hpp index 96a747c4..bae874a9 100644 --- a/rai/node/wallet.hpp +++ b/rai/node/wallet.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -15,7 +16,7 @@ namespace rai class work_pool { public: - work_pool (); + work_pool (bool); ~work_pool (); void loop (uint64_t); void stop (); @@ -34,6 +35,7 @@ public: std::mutex mutex; std::condition_variable consumer_condition; std::condition_variable producer_condition; + std::unique_ptr opencl; // Local work threshold for rate-limiting publishing blocks. ~5 seconds of work. static uint64_t const publish_test_threshold = 0xff00000000000000; static uint64_t const publish_full_threshold = 0xffffffc000000000; diff --git a/rai/node/xorshift.hpp b/rai/node/xorshift.hpp new file mode 100644 index 00000000..0beff25b --- /dev/null +++ b/rai/node/xorshift.hpp @@ -0,0 +1,28 @@ +#pragma once +#include + +namespace rai +{ +class xorshift1024star +{ +public: + xorshift1024star (): + p (0) + { + } + std::array s; + unsigned p; + uint64_t next () + { + auto p_l (p); + auto pn ((p_l + 1) & 15); + p = pn; + uint64_t s0 = s[ p_l ]; + uint64_t s1 = s[ pn ]; + s1 ^= s1 << 31; // a + s1 ^= s1 >> 11; // b + s0 ^= s0 >> 30; // c + return ( s[ pn ] = s0 ^ s1 ) * 1181783497276652981LL; + } +}; +} diff --git a/rai/rai_landing/entry.cpp b/rai/rai_landing/entry.cpp index a612788c..9cfbed00 100644 --- a/rai/rai_landing/entry.cpp +++ b/rai/rai_landing/entry.cpp @@ -101,7 +101,7 @@ int main (int argc, char * const * argv) { rai::node_init init; auto service (boost::make_shared ()); - rai::work_pool work; + rai::work_pool work (config.node.opencl_work); rai::alarm alarm (*service); auto node (std::make_shared (init, *service, working, alarm, config.node, work)); if (!init.error ()) diff --git a/rai/rai_node/daemon.cpp b/rai/rai_node/daemon.cpp index a1aaeb19..15c60ce0 100644 --- a/rai/rai_node/daemon.cpp +++ b/rai/rai_node/daemon.cpp @@ -62,7 +62,7 @@ void rai_daemon::daemon::run () if (!error) { auto service (boost::make_shared ()); - rai::work_pool work; + rai::work_pool work (config.node.opencl_work); rai::alarm alarm (*service); rai::node_init init; auto node (std::make_shared (init, *service, working, alarm, config.node, work)); diff --git a/rai/rai_node/entry.cpp b/rai/rai_node/entry.cpp index 37861065..d517c67f 100644 --- a/rai/rai_node/entry.cpp +++ b/rai/rai_node/entry.cpp @@ -148,7 +148,7 @@ int main (int argc, char * const * argv) if (!key.decode_hex (vm ["key"].as ())) { rai::keypair genesis (key.to_string ()); - rai::work_pool work; + rai::work_pool work (false); std::cout << "Genesis: " << genesis.prv.data.to_string () << std::endl << "Public: " << genesis.pub.to_string () << std::endl << "Account: " << genesis.pub.to_account () << std::endl; rai::keypair landing; std::cout << "Landing: " << landing.prv.data.to_string () << std::endl << "Public: " << landing.pub.to_string () << std::endl << "Account: " << landing.pub.to_account () << std::endl; @@ -215,7 +215,7 @@ int main (int argc, char * const * argv) } else if (vm.count ("debug_profile_generate")) { - rai::work_pool work; + rai::work_pool work (false); rai::change_block block (0, 0, rai::keypair ().prv, 0, 0); std::cerr << "Starting generation profiling\n"; for (uint64_t i (0); true; ++i) @@ -229,7 +229,7 @@ int main (int argc, char * const * argv) } else if (vm.count ("debug_profile_verify")) { - rai::work_pool work; + rai::work_pool work (false); rai::change_block block (0, 0, rai::keypair ().prv, 0, 0); std::cerr << "Starting verification profiling\n"; for (uint64_t i (0); true; ++i) diff --git a/rai/rai_wallet/entry.cpp b/rai/rai_wallet/entry.cpp index c7a567dc..821162c1 100644 --- a/rai/rai_wallet/entry.cpp +++ b/rai/rai_wallet/entry.cpp @@ -149,7 +149,7 @@ int run_wallet (int argc, char * const * argv) QApplication application (argc, const_cast (argv)); rai::set_application_icon (application); auto service (boost::make_shared ()); - rai::work_pool work; + rai::work_pool work (config.node.opencl_work); rai::alarm alarm (*service); rai::node_init init; auto node (std::make_shared (init, *service, working, alarm, config.node, work));