// nano/node/bootstrap/bootstrap_connections.cpp

#include <nano/node/bootstrap/bootstrap.hpp>
#include <nano/node/bootstrap/bootstrap_attempt.hpp>
#include <nano/node/bootstrap/bootstrap_connections.hpp>
#include <nano/node/bootstrap/bootstrap_lazy.hpp>
#include <nano/node/common.hpp>
#include <nano/node/node.hpp>
#include <nano/node/transport/tcp.hpp>
#include <boost/format.hpp>
#include <memory>
constexpr double nano::bootstrap_limits::bootstrap_connection_scale_target_blocks;
constexpr double nano::bootstrap_limits::bootstrap_minimum_blocks_per_sec;
constexpr double nano::bootstrap_limits::bootstrap_minimum_termination_time_sec;
constexpr unsigned nano::bootstrap_limits::bootstrap_max_new_connections;
constexpr unsigned nano::bootstrap_limits::requeued_pulls_processed_blocks_factor;
nano::bootstrap_client::bootstrap_client (std::shared_ptr<nano::node> const & node_a, nano::bootstrap_connections & connections_a, std::shared_ptr<nano::transport::channel_tcp> const & channel_a, std::shared_ptr<nano::transport::socket> const & socket_a) :
node (node_a),
connections (connections_a),
channel (channel_a),
socket (socket_a),
receive_buffer (std::make_shared<std::vector<uint8_t>> ()),
start_time_m (std::chrono::steady_clock::now ())
{
++connections.connections_count;
receive_buffer->resize (256);
channel->set_endpoint ();
}
nano::bootstrap_client::~bootstrap_client ()
{
--connections.connections_count;
}
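// Recomputes block_rate as blocks received per second since start_time_m; elapsed time is clamped to a minimum to avoid inflated rates right after start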
double nano::bootstrap_client::sample_block_rate ()
{
auto elapsed = std::max (elapsed_seconds (), nano::bootstrap_limits::bootstrap_minimum_elapsed_seconds_blockrate);
block_rate = static_cast<double> (block_count.load ()) / elapsed;
return block_rate;
}
void nano::bootstrap_client::set_start_time (std::chrono::steady_clock::time_point start_time_a)
{
nano::lock_guard<nano::mutex> guard{ start_time_mutex };
start_time_m = start_time_a;
}
double nano::bootstrap_client::elapsed_seconds () const
{
nano::lock_guard<nano::mutex> guard{ start_time_mutex };
return std::chrono::duration_cast<std::chrono::duration<double>> (std::chrono::steady_clock::now () - start_time_m).count ();
}
void nano::bootstrap_client::stop (bool force)
{
pending_stop = true;
if (force)
{
hard_stop = true;
}
}
nano::bootstrap_connections::bootstrap_connections (nano::node & node_a) :
node (node_a)
{
}
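// Hands out an idle client for a bootstrap attempt. Waits until a client becomes idle, the pool is stopped or no new connections can be made; if no peers are available at all, the attempt is stopped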
std::shared_ptr<nano::bootstrap_client> nano::bootstrap_connections::connection (std::shared_ptr<nano::bootstrap_attempt> const & attempt_a, bool use_front_connection)
{
nano::unique_lock<nano::mutex> lock{ mutex };
condition.wait (lock, [&stopped = stopped, &idle = idle, &new_connections_empty = new_connections_empty] { return stopped || !idle.empty () || new_connections_empty; });
std::shared_ptr<nano::bootstrap_client> result;
if (!stopped && !idle.empty ())
{
if (!use_front_connection)
{
result = idle.back ();
idle.pop_back ();
}
else
{
result = idle.front ();
idle.pop_front ();
}
}
if (result == nullptr && connections_count == 0 && new_connections_empty && attempt_a != nullptr)
{
node.logger.try_log (boost::str (boost::format ("Bootstrap attempt stopped because there are no peers")));
lock.unlock ();
attempt_a->stop ();
}
return result;
}
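// Returns a client to the idle deque (front or back) for reuse, unless the pool is stopped, the client is pending stop or its peer is excluded, in which case the socket is closed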
void nano::bootstrap_connections::pool_connection (std::shared_ptr<nano::bootstrap_client> const & client_a, bool new_client, bool push_front)
{
nano::unique_lock<nano::mutex> lock{ mutex };
auto const & socket_l = client_a->socket;
if (!stopped && !client_a->pending_stop && !node.network.excluded_peers.check (client_a->channel->get_tcp_endpoint ()))
{
socket_l->set_timeout (node.network_params.network.idle_timeout);
// Push into idle deque
if (!push_front)
{
idle.push_back (client_a);
}
else
{
idle.push_front (client_a);
}
if (new_client)
{
clients.push_back (client_a);
}
}
else
{
socket_l->close ();
}
lock.unlock ();
condition.notify_all ();
}
void nano::bootstrap_connections::add_connection (nano::endpoint const & endpoint_a)
{
connect_client (nano::tcp_endpoint (endpoint_a.address (), endpoint_a.port ()), true);
}
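// Removes and returns the idle client already connected to endpoint_a, if one exists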
std::shared_ptr<nano::bootstrap_client> nano::bootstrap_connections::find_connection (nano::tcp_endpoint const & endpoint_a)
{
nano::lock_guard<nano::mutex> lock{ mutex };
std::shared_ptr<nano::bootstrap_client> result;
for (auto i (idle.begin ()), end (idle.end ()); i != end && !stopped; ++i)
{
if ((*i)->channel->get_tcp_endpoint () == endpoint_a)
{
result = *i;
idle.erase (i);
break;
}
}
return result;
}
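// Asynchronously opens a TCP connection to endpoint_a; on success the socket is wrapped in a bootstrap_client and pooled, on failure unexpected errors are logged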
void nano::bootstrap_connections::connect_client (nano::tcp_endpoint const & endpoint_a, bool push_front)
{
++connections_count;
auto socket (std::make_shared<nano::transport::client_socket> (node));
auto this_l (shared_from_this ());
socket->async_connect (endpoint_a,
[this_l, socket, endpoint_a, push_front] (boost::system::error_code const & ec) {
if (!ec)
{
if (this_l->node.config.logging.bulk_pull_logging ())
{
this_l->node.logger.try_log (boost::str (boost::format ("Connection established to %1%") % endpoint_a));
}
auto client (std::make_shared<nano::bootstrap_client> (this_l->node.shared (), *this_l, std::make_shared<nano::transport::channel_tcp> (*this_l->node.shared (), socket), socket));
this_l->pool_connection (client, true, push_front);
}
else
{
if (this_l->node.config.logging.network_logging ())
{
switch (ec.value ())
{
default:
this_l->node.logger.try_log (boost::str (boost::format ("Error initiating bootstrap connection to %1%: %2%") % endpoint_a % ec.message ()));
break;
case boost::system::errc::connection_refused:
case boost::system::errc::operation_canceled:
case boost::system::errc::timed_out:
case 995: // Windows: the I/O operation has been aborted because of either a thread exit or an application request
case 10061: // Windows: no connection could be made because the target machine actively refused it
break;
}
}
}
--this_l->connections_count;
});
}
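// Scales the connection target between bootstrap_connections (per running attempt) and bootstrap_connections_max based on the number of remaining pulls.
// Worked example with hypothetical values: if attempts_factor were 4, bootstrap_connections_max were 64 and pulls_remaining were half of bootstrap_connection_scale_target_blocks, the target would be 4 + 60 * 0.5 = 34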
unsigned nano::bootstrap_connections::target_connections (std::size_t pulls_remaining, std::size_t attempts_count) const
{
auto const attempts_factor = nano::narrow_cast<unsigned> (node.config.bootstrap_connections * attempts_count);
if (attempts_factor >= node.config.bootstrap_connections_max)
{
return std::max (1U, node.config.bootstrap_connections_max);
}
// Only scale up to bootstrap_connections_max for large pulls.
double step_scale = std::min (1.0, std::max (0.0, (double)pulls_remaining / nano::bootstrap_limits::bootstrap_connection_scale_target_blocks));
double target = (double)attempts_factor + (double)(node.config.bootstrap_connections_max - attempts_factor) * step_scale;
return std::max (1U, (unsigned)(target + 0.5f));
}
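// Comparator that makes the priority queue in populate_connections yield the client with the lowest block rate first, i.e. the best candidate to drop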
struct block_rate_cmp
{
bool operator() (std::shared_ptr<nano::bootstrap_client> const & lhs, std::shared_ptr<nano::bootstrap_client> const & rhs) const
{
return lhs->block_rate > rhs->block_rate;
}
};
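// Periodic maintenance of the connection pool: samples each client's block rate, force-stops clients below the minimum rate, drops the slowest peers when enough connections are active, opens new connections toward the current target and, when repeat is true, reschedules itself every second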
void nano::bootstrap_connections::populate_connections (bool repeat)
{
double rate_sum = 0.0;
std::size_t num_pulls = 0;
std::size_t attempts_count = node.bootstrap_initiator.attempts.size ();
std::priority_queue<std::shared_ptr<nano::bootstrap_client>, std::vector<std::shared_ptr<nano::bootstrap_client>>, block_rate_cmp> sorted_connections;
std::unordered_set<nano::tcp_endpoint> endpoints;
{
nano::unique_lock<nano::mutex> lock{ mutex };
num_pulls = pulls.size ();
std::deque<std::weak_ptr<nano::bootstrap_client>> new_clients;
for (auto & c : clients)
{
if (auto client = c.lock ())
{
new_clients.push_back (client);
endpoints.insert (client->socket->remote_endpoint ());
double elapsed_sec = client->elapsed_seconds ();
auto blocks_per_sec = client->sample_block_rate ();
rate_sum += blocks_per_sec;
if (client->elapsed_seconds () > nano::bootstrap_limits::bootstrap_connection_warmup_time_sec && client->block_count > 0)
{
sorted_connections.push (client);
}
// Force-stop the slowest peers, since they can take the whole bootstrap hostage by dribbling out blocks on the last remaining pull.
// This is roughly 1.5 kilobits/sec.
if (elapsed_sec > nano::bootstrap_limits::bootstrap_minimum_termination_time_sec && blocks_per_sec < nano::bootstrap_limits::bootstrap_minimum_blocks_per_sec)
{
if (node.config.logging.bulk_pull_logging ())
{
node.logger.try_log (boost::str (boost::format ("Stopping slow peer %1% (elapsed sec %2%s > %3%s and %4% blocks per second < %5%)") % client->channel->to_string () % elapsed_sec % nano::bootstrap_limits::bootstrap_minimum_termination_time_sec % blocks_per_sec % nano::bootstrap_limits::bootstrap_minimum_blocks_per_sec));
}
client->stop (true);
new_clients.pop_back ();
}
}
}
// Cleanup expired clients
clients.swap (new_clients);
}
auto target = target_connections (num_pulls, attempts_count);
// We only want to drop slow peers when more than 2/3 of the target connections are active; 2/3 because 1/2 is too aggressive, and 100% rarely happens.
// Probably needs more tuning.
if (sorted_connections.size () >= (target * 2) / 3 && target >= 4)
{
// 4 -> 1, 8 -> 2, 16 -> 4, arbitrary, but seems to work well.
auto drop = (int)roundf (sqrtf ((float)target - 2.0f));
if (node.config.logging.bulk_pull_logging ())
{
node.logger.try_log (boost::str (boost::format ("Dropping %1% bulk pull peers, target connections %2%") % drop % target));
}
for (int i = 0; i < drop; i++)
{
auto client = sorted_connections.top ();
if (node.config.logging.bulk_pull_logging ())
{
node.logger.try_log (boost::str (boost::format ("Dropping peer with block rate %1%, block count %2% (%3%) ") % client->block_rate % client->block_count % client->channel->to_string ()));
}
client->stop (false);
sorted_connections.pop ();
}
}
if (node.config.logging.bulk_pull_logging ())
{
node.logger.try_log (boost::str (boost::format ("Bulk pull connections: %1%, rate: %2% blocks/sec, bootstrap attempts %3%, remaining pulls: %4%") % connections_count.load () % (int)rate_sum % attempts_count % num_pulls));
}
if (connections_count < target && (attempts_count != 0 || new_connections_empty) && !stopped)
{
auto delta = std::min ((target - connections_count) * 2, nano::bootstrap_limits::bootstrap_max_new_connections);
// TODO - tune this better
// Not many peers respond, so we need to attempt more connections than we actually need.
for (auto i = 0u; i < delta; i++)
{
auto endpoint (node.network.bootstrap_peer ());
if (endpoint != nano::tcp_endpoint (boost::asio::ip::address_v6::any (), 0) && (node.flags.allow_bootstrap_peers_duplicates || endpoints.find (endpoint) == endpoints.end ()) && !node.network.excluded_peers.check (endpoint))
{
connect_client (endpoint);
endpoints.insert (endpoint);
nano::lock_guard<nano::mutex> lock{ mutex };
new_connections_empty = false;
}
else if (connections_count == 0)
{
{
nano::lock_guard<nano::mutex> lock{ mutex };
new_connections_empty = true;
}
condition.notify_all ();
}
}
}
if (!stopped && repeat)
{
std::weak_ptr<nano::bootstrap_connections> this_w (shared_from_this ());
node.workers.add_timed_task (std::chrono::steady_clock::now () + std::chrono::seconds (1), [this_w] () {
if (auto this_l = this_w.lock ())
{
this_l->populate_connections ();
}
});
}
}
void nano::bootstrap_connections::start_populate_connections ()
{
if (!populate_connections_started.exchange (true))
{
populate_connections ();
}
}
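// Updates the pull from the pull cache, queues it and wakes the run loop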
void nano::bootstrap_connections::add_pull (nano::pull_info const & pull_a)
{
nano::pull_info pull (pull_a);
node.bootstrap_initiator.cache.update_pull (pull);
{
nano::lock_guard<nano::mutex> lock{ mutex };
pulls.push_back (pull);
}
condition.notify_all ();
}
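// Called from run () with the mutex held. Briefly releases the lock to obtain an idle connection, then pops pulls until one belongs to a live attempt (finishing obsolete lazy pulls along the way) and dispatches a bulk_pull_client for it on a background thread. If the pulls deque is empty, the connection is returned to the pool instead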
void nano::bootstrap_connections::request_pull (nano::unique_lock<nano::mutex> & lock_a)
{
lock_a.unlock ();
auto connection_l (connection ());
lock_a.lock ();
if (connection_l != nullptr && !pulls.empty ())
{
std::shared_ptr<nano::bootstrap_attempt> attempt_l;
nano::pull_info pull;
// Search pulls with existing attempts
while (attempt_l == nullptr && !pulls.empty ())
{
pull = pulls.front ();
pulls.pop_front ();
attempt_l = node.bootstrap_initiator.attempts.find (pull.bootstrap_id);
// Check if the lazy pull is obsolete (its head was already processed, or the head is 0 for destination requests)
if (auto lazy = std::dynamic_pointer_cast<nano::bootstrap_attempt_lazy> (attempt_l))
{
if (!pull.head.is_zero () && lazy->lazy_processed_or_exists (pull.head))
{
attempt_l->pull_finished ();
attempt_l = nullptr;
}
}
}
if (attempt_l != nullptr)
{
// The bulk_pull_client destructor attempts to requeue_pull, which can cause a deadlock if this is the last reference
// Dispatch the request on a background thread in case the client needs to be destroyed there
node.background ([connection_l, attempt_l, pull] () {
auto client (std::make_shared<nano::bulk_pull_client> (connection_l, attempt_l, pull));
client->request ();
});
}
}
else if (connection_l != nullptr)
{
// Reuse the connection if the pulls deque has become empty
lock_a.unlock ();
pool_connection (connection_l);
lock_a.lock ();
}
}
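// Re-queues a failed pull according to the attempt type: legacy pulls with retries left go to the front of the deque, lazy pulls to the back (unless their head has already been processed); pulls that are out of retries are recorded in the lazy attempt or the legacy pull cache instead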
void nano::bootstrap_connections::requeue_pull (nano::pull_info const & pull_a, bool network_error)
{
auto pull (pull_a);
if (!network_error)
{
++pull.attempts;
}
auto attempt_l (node.bootstrap_initiator.attempts.find (pull.bootstrap_id));
if (attempt_l != nullptr)
{
auto lazy = std::dynamic_pointer_cast<nano::bootstrap_attempt_lazy> (attempt_l);
++attempt_l->requeued_pulls;
if (lazy)
{
pull.count = lazy->lazy_batch_size ();
}
if (attempt_l->mode == nano::bootstrap_mode::legacy && (pull.attempts < pull.retry_limit + (pull.processed / nano::bootstrap_limits::requeued_pulls_processed_blocks_factor)))
{
{
nano::lock_guard<nano::mutex> lock{ mutex };
pulls.push_front (pull);
}
attempt_l->pull_started ();
condition.notify_all ();
}
else if (lazy && (pull.attempts <= pull.retry_limit + (pull.processed / node.network_params.bootstrap.lazy_max_pull_blocks)))
{
debug_assert (pull.account_or_head.as_block_hash () == pull.head);
if (!lazy->lazy_processed_or_exists (pull.account_or_head.as_block_hash ()))
{
{
nano::lock_guard<nano::mutex> lock{ mutex };
pulls.push_back (pull);
}
attempt_l->pull_started ();
condition.notify_all ();
}
}
else
{
if (node.config.logging.bulk_pull_logging ())
{
node.logger.try_log (boost::str (boost::format ("Failed to pull account %1% or head block %2% down to %3% after %4% attempts and %5% blocks processed") % pull.account_or_head.to_account () % pull.account_or_head.to_string () % pull.end.to_string () % pull.attempts % pull.processed));
}
node.stats.inc (nano::stat::type::bootstrap, nano::stat::detail::bulk_pull_failed_account, nano::stat::dir::in);
if (lazy && pull.processed > 0)
{
lazy->lazy_add (pull);
}
else if (attempt_l->mode == nano::bootstrap_mode::legacy)
{
node.bootstrap_initiator.cache.add (pull);
}
}
}
}
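// Removes every queued pull belonging to bootstrap_id_a and wakes any waiters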
void nano::bootstrap_connections::clear_pulls (uint64_t bootstrap_id_a)
{
{
nano::lock_guard<nano::mutex> lock{ mutex };
auto i (pulls.begin ());
while (i != pulls.end ())
{
if (i->bootstrap_id == bootstrap_id_a)
{
i = pulls.erase (i);
}
else
{
++i;
}
}
}
condition.notify_all ();
}
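// Worker loop: starts connection population, then dispatches queued pulls to idle connections until stop () is called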
void nano::bootstrap_connections::run ()
{
start_populate_connections ();
nano::unique_lock<nano::mutex> lock{ mutex };
while (!stopped)
{
if (!pulls.empty ())
{
request_pull (lock);
}
else
{
condition.wait (lock);
}
}
stopped = true;
lock.unlock ();
condition.notify_all ();
}
void nano::bootstrap_connections::stop ()
{
nano::unique_lock<nano::mutex> lock{ mutex };
stopped = true;
lock.unlock ();
condition.notify_all ();
lock.lock ();
for (auto const & i : clients)
{
if (auto client = i.lock ())
{
client->socket->close ();
}
}
clients.clear ();
idle.clear ();
}