Fix intermittent node telemetry test failures (#2576)

2020-02-20 19:28:58 +00:00 · 2020-02-20 19:28:58 +00:00 · 8c3ae38f1c
commit 8c3ae38f1c
parent ae3c3e396a
7 changed files with 30 additions and 22 deletions
--- a/nano/core_test/node_telemetry.cpp
+++ b/nano/core_test/node_telemetry.cpp
@ -258,10 +258,11 @@ namespace nano
 {
 TEST (node_telemetry, basic)
 {
-	nano::system system (2);
-
-	auto node_client = system.nodes.front ();
-	auto node_server = system.nodes.back ();
+	nano::system system;
+	nano::node_flags node_flags;
+	node_flags.disable_ongoing_telemetry_requests = true;
+	auto node_client = system.add_node (node_flags);
+	auto node_server = system.add_node (node_flags);

 	wait_peer_connections (system);

@ -304,8 +305,6 @@ TEST (node_telemetry, basic)

 	// Wait the cache period and check cache is not used
 	std::this_thread::sleep_for (nano::telemetry_cache_cutoffs::test);
-	// Arbitrarily change something so that we can confirm different metrics were used
-	node_server->ledger.cache.block_count = 100;

 	std::atomic<bool> done{ false };
 	node_client->telemetry.get_metrics_peers_async ([&done, &all_telemetry_data_time_pairs](nano::telemetry_data_responses const & responses_a) {
@ -448,10 +447,12 @@ namespace nano
 {
 TEST (node_telemetry, single_request)
 {
-	nano::system system (2);
+	nano::system system;
+	nano::node_flags node_flags;
+	node_flags.disable_ongoing_telemetry_requests = true;

-	auto node_client = system.nodes.front ();
-	auto node_server = system.nodes.back ();
+	auto node_client = system.add_node (node_flags);
+	auto node_server = system.add_node (node_flags);

 	wait_peer_connections (system);

@ -702,10 +703,11 @@ TEST (node_telemetry, disconnects)

 TEST (node_telemetry, batch_use_single_request_cache)
 {
-	nano::system system (2);
-
-	auto node_client = system.nodes.front ();
-	auto node_server = system.nodes.back ();
+	nano::system system;
+	nano::node_flags node_flags;
+	node_flags.disable_ongoing_telemetry_requests = true;
+	auto node_client = system.add_node (node_flags);
+	auto node_server = system.add_node (node_flags);

 	wait_peer_connections (system);

@ -756,6 +758,8 @@ TEST (node_telemetry, batch_use_single_request_cache)
 		ASSERT_NO_ERROR (system.poll ());
 	}

+	std::this_thread::sleep_for (nano::telemetry_cache_cutoffs::test);
+
 	system.deadline_set (10s);
 	std::atomic<bool> done{ false };
 	node_client->telemetry.get_metrics_peers_async ([&done, &telemetry_data_time_pair](nano::telemetry_data_responses const & responses_a) {
--- a/nano/node/common.hpp
+++ b/nano/node/common.hpp
@ -456,7 +456,7 @@ public:
 class telemetry_cache_cutoffs
 {
 public:
-	static std::chrono::seconds constexpr test{ 2 };
+	static std::chrono::seconds constexpr test{ 3 };
 	static std::chrono::seconds constexpr beta{ 15 };
 	static std::chrono::seconds constexpr live{ 60 };

--- a/nano/node/node.cpp
+++ b/nano/node/node.cpp
@ -126,7 +126,7 @@ gap_cache (*this),
 ledger (store, stats, flags_a.generate_cache),
 checker (config.signature_checker_threads),
 network (*this, config.peering_port),
-telemetry (network, alarm, worker),
+telemetry (network, alarm, worker, flags.disable_ongoing_telemetry_requests),
 bootstrap_initiator (*this),
 bootstrap (config.peering_port, *this),
 application_path (application_path_a),
--- a/nano/node/nodeconfig.hpp
+++ b/nano/node/nodeconfig.hpp
@ -126,6 +126,7 @@ public:
 	bool disable_unchecked_drop{ true };
 	bool disable_providing_telemetry_metrics{ false };
 	bool disable_block_processor_unchecked_deletion{ false };
+	bool disable_ongoing_telemetry_requests{ false };
 	bool fast_bootstrap{ false };
 	bool read_only{ false };
 	nano::confirmation_height_mode confirmation_height_processor_mode{ nano::confirmation_height_mode::automatic };
--- a/nano/node/telemetry.cpp
+++ b/nano/node/telemetry.cpp
@ -14,9 +14,7 @@
 #include <numeric>
 #include <set>

-std::chrono::seconds constexpr nano::telemetry_impl::alarm_cutoff;
-
-nano::telemetry::telemetry (nano::network & network_a, nano::alarm & alarm_a, nano::worker & worker_a) :
+nano::telemetry::telemetry (nano::network & network_a, nano::alarm & alarm_a, nano::worker & worker_a, bool disable_ongoing_requests_a) :
 network (network_a),
 alarm (alarm_a),
 worker (worker_a),
@ -59,7 +57,10 @@ batch_request (std::make_shared<nano::telemetry_impl> (network, alarm, worker))
 		finished_single_requests.clear ();
 	};

-	ongoing_req_all_peers ();
+	if (!disable_ongoing_requests_a)
+	{
+		ongoing_req_all_peers ();
+	}
 }

 void nano::telemetry::stop ()
@ -286,6 +287,7 @@ size_t nano::telemetry::finished_single_requests_size ()
 }

 nano::telemetry_impl::telemetry_impl (nano::network & network_a, nano::alarm & alarm_a, nano::worker & worker_a) :
+alarm_cutoff (is_sanitizer_build || nano::running_within_valgrind () ? 6 : 3),
 network (network_a),
 alarm (alarm_a),
 worker (worker_a)
--- a/nano/node/telemetry.hpp
+++ b/nano/node/telemetry.hpp
@ -68,7 +68,7 @@ private:
 	nano::network_params network_params;
 	// Anything older than this requires requesting metrics from other nodes.
 	std::chrono::seconds const cache_cutoff{ nano::telemetry_cache_cutoffs::network_to_time (network_params.network) };
-	static std::chrono::seconds constexpr alarm_cutoff{ 3 };
+	std::chrono::seconds const alarm_cutoff;

 	// All data in this chunk is protected by this mutex
 	std::mutex mutex;
@ -116,7 +116,7 @@ std::unique_ptr<nano::container_info_component> collect_container_info (telemetr
 class telemetry
 {
 public:
-	telemetry (nano::network & network_a, nano::alarm & alarm_a, nano::worker & worker_a);
+	telemetry (nano::network & network_a, nano::alarm & alarm_a, nano::worker & worker_a, bool disable_ongoing_requests_a);

 	/*
 	 * Add telemetry metrics received from this endpoint.
@ -188,6 +188,7 @@ private:
 	void ongoing_req_all_peers ();

 	friend class node_telemetry_multiple_single_request_clearing_Test;
+	friend class node_telemetry_ongoing_requests_Test;
 	friend std::unique_ptr<container_info_component> collect_container_info (telemetry &, const std::string &);
 };

--- a/nano/slow_test/node.cpp
+++ b/nano/slow_test/node.cpp
@ -908,7 +908,7 @@ TEST (node_telemetry, ongoing_requests)

 	// Wait till the next ongoing will be called, and add a 1s buffer for the actual processing
 	auto time = std::chrono::steady_clock::now ();
-	while (std::chrono::steady_clock::now () < (time + nano::telemetry_cache_cutoffs::test + nano::telemetry_impl::alarm_cutoff + 1s))
+	while (std::chrono::steady_clock::now () < (time + nano::telemetry_cache_cutoffs::test + node_client->telemetry.batch_request->alarm_cutoff + 1s))
 	{
 		ASSERT_NO_ERROR (system.poll ());
 	}