#include <nano/boost/asio/bind_executor.hpp>
#include <nano/node/distributed_work.hpp>
#include <nano/node/node.hpp>
#include <nano/node/websocket.hpp>

#include <boost/algorithm/string/erase.hpp>
#include <boost/format.hpp>
#include <boost/range/iterator_range.hpp>

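// Builds an HTTP/1.1 POST request targeting "/" on the peer, with a JSON content type,
// the peer address (minus any "::ffff:" IPv4-mapped prefix) as the Host header and the
// given body, ready to be sent with boost::beast::http::async_write.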
std::shared_ptr<request_type> nano::distributed_work::peer_request::get_prepared_json_request (std::string const & request_string_a) const
{
	auto http_request = std::make_shared<request_type> ();
	http_request->method (boost::beast::http::verb::post);
	http_request->set (boost::beast::http::field::content_type, "application/json");
	auto address_string = boost::algorithm::erase_first_copy (endpoint.address ().to_string (), "::ffff:");
	http_request->set (boost::beast::http::field::host, address_string);
	http_request->target ("/");
	http_request->version (11);
	http_request->body () = request_string_a;
	http_request->prepare_payload ();
	return http_request;
}

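// A request starts in the "ongoing" state; the configured work peers in need_resolve are
// not contacted or resolved until start () is called.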
nano::distributed_work::distributed_work (nano::node & node_a, nano::work_request const & request_a, std::chrono::seconds const & backoff_a) :
	node (node_a),
	node_w (node_a.shared ()),
	request (request_a),
	backoff (backoff_a),
	strand (node_a.io_ctx.get_executor ()),
	need_resolve (request_a.peers),
	elapsed (nano::timer_state::started, "distributed work generation timer")
{
	debug_assert (!finished);
	debug_assert (status == work_generation_status::ongoing);
}

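// On destruction, notify any websocket subscribers to the "work" topic of the final
// outcome (success, cancelled or failed) and stop any remaining generation.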
nano::distributed_work::~distributed_work ()
{
	debug_assert (status != work_generation_status::ongoing);
	if (auto node_l = node_w.lock ())
	{
		if (!node_l->stopped && node_l->websocket.server && node_l->websocket.server->any_subscriber (nano::websocket::topic::work))
		{
			nano::websocket::message_builder builder{ node_l->ledger };
			if (status == work_generation_status::success)
			{
				node_l->websocket.server->broadcast (builder.work_generation (request.version, request.root.as_block_hash (), work_result, request.difficulty, node_l->default_difficulty (request.version), elapsed.value (), winner, bad_peers));
			}
			else if (status == work_generation_status::cancelled)
			{
				node_l->websocket.server->broadcast (builder.work_cancelled (request.version, request.root.as_block_hash (), request.difficulty, node_l->default_difficulty (request.version), elapsed.value (), bad_peers));
			}
			else if (status == work_generation_status::failure_local || status == work_generation_status::failure_peers)
			{
				node_l->websocket.server->broadcast (builder.work_failed (request.version, request.root.as_block_hash (), request.difficulty, node_l->default_difficulty (request.version), elapsed.value (), bad_peers));
			}
		}
		stop_once (true);
	}
}

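// Entry point: starts local generation when appropriate and issues a work_generate
// request to every configured peer, resolving hostnames asynchronously when needed.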
void nano::distributed_work::start ()
{
	// Start local work generation if peers are not acting correctly, or if there are no peers configured
	if ((need_resolve.empty () || node.unresponsive_work_peers) && node.local_work_generation_enabled ())
	{
		start_local ();
	}
	// When local generation is required but not enabled, the fallback is to simply call the callback with an error
	else if (need_resolve.empty () && request.callback)
	{
		status = work_generation_status::failure_local;
		request.callback (std::nullopt);
	}
	for (auto const & peer : need_resolve)
	{
		boost::system::error_code ec;
		auto parsed_address (boost::asio::ip::make_address_v6 (peer.first, ec));
		if (!ec)
		{
			do_request (nano::tcp_endpoint (parsed_address, peer.second));
		}
		else
		{
			auto this_l (shared_from_this ());
			node.network.resolver.async_resolve (boost::asio::ip::tcp::resolver::query (peer.first, std::to_string (peer.second)), [peer, this_l, &extra = resolved_extra] (boost::system::error_code const & ec, boost::asio::ip::tcp::resolver::iterator i_a) {
				if (!ec)
				{
					this_l->do_request (nano::tcp_endpoint (i_a->endpoint ().address (), i_a->endpoint ().port ()));
					++i_a;
					for (auto & i : boost::make_iterator_range (i_a, {}))
					{
						++extra;
						this_l->do_request (nano::tcp_endpoint (i.endpoint ().address (), i.endpoint ().port ()));
					}
				}
				else
				{
					this_l->node.logger.error (nano::log::type::distributed_work, "Error resolving work peer: {}:{} ({})",
					peer.first, peer.second, ec.message ());

					this_l->failure ();
				}
			});
		}
	}
}

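// Requests work from the node's local work pool. On completion the result (or a local
// failure) is recorded, then stop_once (false) closes any outstanding peer connections
// without cancelling local generation, which has already finished.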
void nano::distributed_work::start_local ()
{
	auto this_l (shared_from_this ());
	local_generation_started = true;
	node.work.generate (request.version, request.root, request.difficulty, [this_l] (boost::optional<uint64_t> const & work_a) {
		if (work_a.is_initialized ())
		{
			this_l->set_once (*work_a);
		}
		else if (!this_l->finished.exchange (true))
		{
			this_l->status = work_generation_status::failure_local;
			if (this_l->request.callback)
			{
				this_l->request.callback (std::nullopt);
			}
		}
		this_l->stop_once (false);
	});
}

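// Sends a "work_generate" RPC to a single peer over HTTP: connect, write the JSON request,
// then read the response. Errors are logged and counted towards the overall failure tally.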
void nano::distributed_work::do_request (nano::tcp_endpoint const & endpoint_a)
{
	auto this_l (shared_from_this ());
	auto connection (std::make_shared<peer_request> (node.io_ctx, endpoint_a));
	{
		nano::lock_guard<nano::mutex> lock{ mutex };
		connections.emplace_back (connection);
	}
	connection->socket.async_connect (connection->endpoint,
	boost::asio::bind_executor (strand,
	[this_l, connection] (boost::system::error_code const & ec) {
		if (!ec && !this_l->stopped)
		{
			std::string request_string;
			{
				boost::property_tree::ptree rpc_request;
				rpc_request.put ("action", "work_generate");
				rpc_request.put ("hash", this_l->request.root.to_string ());
				rpc_request.put ("difficulty", nano::to_string_hex (this_l->request.difficulty));
				if (this_l->request.account.has_value ())
				{
					rpc_request.put ("account", this_l->request.account.value ().to_account ());
				}
				std::stringstream ostream;
				boost::property_tree::write_json (ostream, rpc_request);
				request_string = ostream.str ();
			}
			auto peer_request (connection->get_prepared_json_request (request_string));
			boost::beast::http::async_write (connection->socket, *peer_request,
			boost::asio::bind_executor (this_l->strand,
			[this_l, connection, peer_request] (boost::system::error_code const & ec, std::size_t size_a) {
				if (!ec && !this_l->stopped)
				{
					boost::beast::http::async_read (connection->socket, connection->buffer, connection->response,
					boost::asio::bind_executor (this_l->strand, [this_l, connection] (boost::system::error_code const & ec, std::size_t size_a) {
						if (!ec && !this_l->stopped)
						{
							if (connection->response.result () == boost::beast::http::status::ok)
							{
								this_l->success (connection->response.body (), connection->endpoint);
							}
							else
							{
								// ec is clear in this branch; the failure is a non-OK HTTP status from the peer
								this_l->node.logger.error (nano::log::type::distributed_work, "Work peer responded with an error {} ({})",
								connection->endpoint,
								connection->response.result_int ());

								this_l->add_bad_peer (connection->endpoint);
								this_l->failure ();
							}
						}
						else if (ec)
						{
							this_l->do_cancel (connection->endpoint);
							this_l->failure ();
						}
					}));
				}
				else if (ec && ec != boost::system::errc::operation_canceled)
				{
					this_l->node.logger.error (nano::log::type::distributed_work, "Unable to write to work peer {} ({})",
					connection->endpoint,
					ec.message ());

					this_l->add_bad_peer (connection->endpoint);
					this_l->failure ();
				}
			}));
		}
		else if (ec && ec != boost::system::errc::operation_canceled)
		{
			this_l->node.logger.error (nano::log::type::distributed_work, "Unable to connect to work peer {} ({})",
			connection->endpoint,
			ec.message ());

			this_l->add_bad_peer (connection->endpoint);
			this_l->failure ();
		}
	}));
}

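// Best-effort "work_cancel" RPC asking a peer to stop generating work for this root.
// Failure to deliver the cancel is logged but otherwise ignored.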
void nano::distributed_work::do_cancel (nano::tcp_endpoint const & endpoint_a)
{
	auto this_l (shared_from_this ());
	auto cancelling_l (std::make_shared<peer_request> (node.io_ctx, endpoint_a));
	cancelling_l->socket.async_connect (cancelling_l->endpoint,
	boost::asio::bind_executor (strand,
	[this_l, cancelling_l] (boost::system::error_code const & ec) {
		if (!ec)
		{
			std::string request_string;
			{
				boost::property_tree::ptree rpc_request;
				rpc_request.put ("action", "work_cancel");
				rpc_request.put ("hash", this_l->request.root.to_string ());
				std::stringstream ostream;
				boost::property_tree::write_json (ostream, rpc_request);
				request_string = ostream.str ();
			}
			auto peer_cancel (cancelling_l->get_prepared_json_request (request_string));
			boost::beast::http::async_write (cancelling_l->socket, *peer_cancel,
			boost::asio::bind_executor (this_l->strand,
			[this_l, peer_cancel, cancelling_l] (boost::system::error_code const & ec, std::size_t bytes_transferred) {
				if (ec && ec != boost::system::errc::operation_canceled)
				{
					this_l->node.logger.error (nano::log::type::distributed_work, "Unable to send work cancel to work peer {} ({})",
					cancelling_l->endpoint,
					ec.message ());
				}
			}));
		}
	}));
}

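// Handles a 200 response from a peer: parses the JSON body, checks that the returned
// work value meets the requested difficulty, and either accepts it via set_once () or
// records the peer as bad and counts a failure.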
void nano::distributed_work::success (std::string const & body_a, nano::tcp_endpoint const & endpoint_a)
{
	bool error = true;
	try
	{
		std::stringstream istream (body_a);
		boost::property_tree::ptree result;
		boost::property_tree::read_json (istream, result);
		auto work_text (result.get<std::string> ("work"));
		uint64_t work;
		if (!nano::from_string_hex (work_text, work))
		{
			if (nano::dev::network_params.work.difficulty (request.version, request.root, work) >= request.difficulty)
			{
				error = false;
				node.unresponsive_work_peers = false;
				set_once (work, boost::str (boost::format ("%1%:%2%") % endpoint_a.address () % endpoint_a.port ()));
				stop_once (true);
			}
			else
			{
				node.logger.error (nano::log::type::distributed_work, "Incorrect work response from {} for root {} with difficulty {}: {}",
				endpoint_a,
				request.root,
				nano::to_string_hex (request.difficulty),
				work_text);
			}
		}
		else
		{
			node.logger.error (nano::log::type::distributed_work, "Work response from {} wasn't a number: {}",
			endpoint_a,
			work_text);
		}
	}
	catch (...)
	{
		node.logger.error (nano::log::type::distributed_work, "Work response from {} wasn't parsable: {}",
		endpoint_a,
		body_a);
	}
	if (error)
	{
		add_bad_peer (endpoint_a);
		failure ();
	}
}

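// Stops generation exactly once: optionally cancels local work, then cancels and closes
// every outstanding peer connection on the strand.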
void nano::distributed_work::stop_once (bool const local_stop_a)
{
	if (!stopped.exchange (true))
	{
		nano::lock_guard<nano::mutex> guard{ mutex };
		if (local_stop_a && node.local_work_generation_enabled ())
		{
			node.work.cancel (request.root);
		}
		for (auto & connection_w : connections)
		{
			if (auto connection_l = connection_w.lock ())
			{
				auto this_l (shared_from_this ());
				boost::asio::post (strand, boost::asio::bind_executor (strand, [this_l, connection_l] {
					boost::system::error_code ec;
					if (connection_l->socket.is_open ())
					{
						connection_l->socket.cancel (ec);
						if (!ec)
						{
							connection_l->socket.close (ec);
							if (ec)
							{
								this_l->node.logger.error (nano::log::type::distributed_work, "Error closing socket with work peer: {} ({})",
								connection_l->endpoint,
								ec.message ());
							}
						}
						else
						{
							this_l->node.logger.error (nano::log::type::distributed_work, "Error cancelling operation with work peer: {} ({})",
							connection_l->endpoint,
							ec.message ());
						}
					}
				}));
			}
		}
		connections.clear ();
	}
}

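// Records the winning work value and its source the first time a result is seen, marks
// the request successful and invokes the callback. Later results are ignored.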
void nano::distributed_work::set_once (uint64_t const work_a, std::string const & source_a)
{
	if (!finished.exchange (true))
	{
		elapsed.stop ();

		node.logger.info (nano::log::type::distributed_work, "Work generation for {}, with a threshold difficulty of {} (multiplier {}x) complete: {} ms",
		request.root,
		nano::to_string_hex (request.difficulty),
		nano::to_string (nano::difficulty::to_multiplier (request.difficulty, node.default_difficulty (request.version)), 2),
		elapsed.value ().count ());

		status = work_generation_status::success;
		if (request.callback)
		{
			request.callback (work_a);
		}
		winner = source_a;
		work_result = work_a;
	}
}

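// Cancels an in-flight request: marks it cancelled, notifies the callback with no value
// and stops local and remote generation.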
void nano::distributed_work::cancel ()
{
	if (!finished.exchange (true))
	{
		elapsed.stop ();

		node.logger.info (nano::log::type::distributed_work, "Work generation for {} was cancelled after {} ms",
		request.root,
		elapsed.value ().count ());

		status = work_generation_status::cancelled;
		if (request.callback)
		{
			request.callback (std::nullopt);
		}
		stop_once (true);
	}
}

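// Counts one peer failure; once every requested peer (including extra resolved addresses)
// has failed, the overall failure is handled.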
void nano::distributed_work::failure ()
{
	if (++failures == need_resolve.size () + resolved_extra.load ())
	{
		handle_failure ();
	}
}

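// All peers have failed: mark work peers as unresponsive and, unless local generation is
// running, schedule a retry with exponential backoff (capped at 5 minutes).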
void nano::distributed_work::handle_failure ()
{
	if (!finished)
	{
		node.unresponsive_work_peers = true;
		if (!local_generation_started && !finished.exchange (true))
		{
			node.logger.info (nano::log::type::distributed_work, "Work peer(s) failed to generate work for root {}, retrying... (backoff: {}s)",
			request.root,
			backoff.count ());

			status = work_generation_status::failure_peers;

			std::weak_ptr<nano::node> node_weak (node.shared ());
			auto next_backoff (std::min (backoff * 2, std::chrono::seconds (5 * 60)));
			node.workers.post_delayed (std::chrono::seconds (backoff), [node_weak, request_l = request, next_backoff] {
				bool error_l{ true };
				if (auto node_l = node_weak.lock ())
				{
					error_l = node_l->distributed_work.make (next_backoff, request_l);
				}
				if (error_l && request_l.callback)
				{
					request_l.callback (std::nullopt);
				}
			});
		}
		else
		{
			// Wait for local work generation to complete
		}
	}
}

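// Remembers a misbehaving peer as "address:port" for later reporting, e.g. in the
// websocket work notifications sent from the destructor.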
void nano::distributed_work::add_bad_peer (nano::tcp_endpoint const & endpoint_a)
{
	nano::lock_guard<nano::mutex> guard{ mutex };
	bad_peers.emplace_back (boost::str (boost::format ("%1%:%2%") % endpoint_a.address () % endpoint_a.port ()));
}