Add some RocksDB TOML config options to tune memory usage (#2316)

* Add some rocksdb config options to control memory usage

* Fix some comments
Wesley Shillingford, 2019-09-25 09:25:22 +01:00 (committed via GitHub)
commit 58cacfe886
7 changed files with 138 additions and 51 deletions
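For orientation, all of the new keys live under [node.rocksdb] in the daemon's TOML config. The following is only a sketch assembled from the defaults and recommendations documented in this diff (io_threads assumes a 4-core host), not a snippet from the commit itself:

[node.rocksdb]
enable = true
enable_pipelined_write = true        # two write queues for memtable/WAL; recommended
cache_index_and_filter_blocks = true # keep index/filter blocks in block_cache; recommended
bloom_filter_bits = 10               # bits per key; 0 disables the filter
block_cache = 512                    # MB; at least 512 recommended
io_threads = 4                       # background compaction/flush threads (assumed host)
block_size = 16                      # KB of uncompressed data per block
num_memtables = 3                    # per column family; minimum 2
memtable_size = 128                  # MB per memtable
total_memtable_size = 1024           # MB across all memtables; 0 = unconstrained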

@@ -239,6 +239,15 @@ TEST (toml, daemon_config_deserialize_defaults)
ASSERT_EQ (conf.node.stat_config.log_samples_filename, defaults.node.stat_config.log_samples_filename);
ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_EQ (conf.node.rocksdb_config.bloom_filter_bits, defaults.node.rocksdb_config.bloom_filter_bits);
+ASSERT_EQ (conf.node.rocksdb_config.block_cache, defaults.node.rocksdb_config.block_cache);
+ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
+ASSERT_EQ (conf.node.rocksdb_config.enable_pipelined_write, defaults.node.rocksdb_config.enable_pipelined_write);
+ASSERT_EQ (conf.node.rocksdb_config.cache_index_and_filter_blocks, defaults.node.rocksdb_config.cache_index_and_filter_blocks);
+ASSERT_EQ (conf.node.rocksdb_config.block_size, defaults.node.rocksdb_config.block_size);
+ASSERT_EQ (conf.node.rocksdb_config.memtable_size, defaults.node.rocksdb_config.memtable_size);
+ASSERT_EQ (conf.node.rocksdb_config.num_memtables, defaults.node.rocksdb_config.num_memtables);
+ASSERT_EQ (conf.node.rocksdb_config.total_memtable_size, defaults.node.rocksdb_config.total_memtable_size);
}
TEST (toml, optional_child)
@@ -473,6 +482,15 @@ TEST (toml, daemon_config_deserialize_no_defaults)
[node.rocksdb]
enable = true
bloom_filter_bits = 10
+block_cache = 512
+io_threads = 99
+enable_pipelined_write = true
+cache_index_and_filter_blocks = true
+block_size = 16
+memtable_size = 128
+num_memtables = 3
+total_memtable_size = 0
[opencl]
device = 999
@@ -603,6 +621,15 @@ TEST (toml, daemon_config_deserialize_no_defaults)
ASSERT_NE (conf.node.stat_config.log_samples_filename, defaults.node.stat_config.log_samples_filename);
ASSERT_NE (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_NE (conf.node.rocksdb_config.bloom_filter_bits, defaults.node.rocksdb_config.bloom_filter_bits);
+ASSERT_NE (conf.node.rocksdb_config.block_cache, defaults.node.rocksdb_config.block_cache);
+ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
+ASSERT_NE (conf.node.rocksdb_config.enable_pipelined_write, defaults.node.rocksdb_config.enable_pipelined_write);
+ASSERT_NE (conf.node.rocksdb_config.cache_index_and_filter_blocks, defaults.node.rocksdb_config.cache_index_and_filter_blocks);
+ASSERT_NE (conf.node.rocksdb_config.block_size, defaults.node.rocksdb_config.block_size);
+ASSERT_NE (conf.node.rocksdb_config.memtable_size, defaults.node.rocksdb_config.memtable_size);
+ASSERT_NE (conf.node.rocksdb_config.num_memtables, defaults.node.rocksdb_config.num_memtables);
+ASSERT_NE (conf.node.rocksdb_config.total_memtable_size, defaults.node.rocksdb_config.total_memtable_size);
}
/** There should be no required values **/

@@ -4,11 +4,56 @@
nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
{
toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database\ntype:bool");
toml.put ("enable_pipelined_write", enable_pipelined_write, "Whether to use 2 separate write queues for memtable/WAL, true is recommended.\ntype:bool");
toml.put ("cache_index_and_filter_blocks", cache_index_and_filter_blocks, "Whether index and filter blocks are stored in block_cache, true is recommended.\ntype:bool");
toml.put ("bloom_filter_bits", bloom_filter_bits, "Number of bits to use with a bloom filter. Helps with point reads but uses more memory. 0 disables the bloom filter, 10 is recommended\ntype:uint32");
toml.put ("block_cache", block_cache, "Size (MB) of the block cache; A larger number will increase performance of read operations. At least 512MB is recommended.\ntype:uint64");
toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing. Number of hardware threads is recommended.\ntype:uint32");
toml.put ("block_size", block_size, "Uncompressed data (KBs) per block. Increasing block size decreases memory usage and space amplification, but increases read amplification. 16 is recommended.\ntype:uint32");
toml.put ("num_memtables", num_memtables, "Number of memtables to keep in memory per column family. 2 is the minimum, 3 is recommended.\ntype:uint32");
toml.put ("memtable_size", memtable_size, "Amount of memory (MB) to build up before flushing to disk for an individual column family. Large values increase performance. 64 or 128 is recommended\ntype:uint32");
toml.put ("total_memtable_size", total_memtable_size, "Total memory (MB) which can be used across all memtables, set to 0 for unconstrained.\ntype:uint32");
return toml.get_error ();
}
nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
{
toml.get_optional<bool> ("enable", enable);
toml.get_optional<bool> ("enable_pipelined_write", enable_pipelined_write);
toml.get_optional<bool> ("cache_index_and_filter_blocks", cache_index_and_filter_blocks);
toml.get_optional<unsigned> ("bloom_filter_bits", bloom_filter_bits);
toml.get_optional<uint64_t> ("block_cache", block_cache);
toml.get_optional<unsigned> ("io_threads", io_threads);
toml.get_optional<unsigned> ("block_size", block_size);
toml.get_optional<unsigned> ("num_memtables", num_memtables);
toml.get_optional<unsigned> ("memtable_size", memtable_size);
toml.get_optional<unsigned> ("total_memtable_size", total_memtable_size);
+// Validate ranges
+if (bloom_filter_bits > 100)
+{
+toml.get_error ().set ("bloom_filter_bits is too high");
+}
+if (num_memtables < 2)
+{
+toml.get_error ().set ("num_memtables must be at least 2");
+}
+if (memtable_size == 0)
+{
+toml.get_error ().set ("memtable_size must be non-zero");
+}
+if ((total_memtable_size < memtable_size * 8) && (total_memtable_size != 0))
+{
+toml.get_error ().set ("total_memtable_size must be at least 8 times memtable_size, or 0 for unconstrained");
+}
+if (io_threads == 0)
+{
+toml.get_error ().set ("io_threads must be non-zero");
+}
+if (block_size == 0)
+{
+toml.get_error ().set ("block_size must be non-zero");
+}
return toml.get_error ();
}
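To make the new validation concrete: with memtable_size = 64 (MB), total_memtable_size must be 0 (unconstrained) or at least 64 * 8 = 512, so 256 would be rejected. A sketch of values that would each trip one of the checks above:

bloom_filter_bits = 101    # rejected: > 100
num_memtables = 1          # rejected: minimum is 2 (one active, one immutable)
memtable_size = 0          # rejected: must be non-zero
io_threads = 0             # rejected: must be non-zero
block_size = 0             # rejected: must be non-zero
total_memtable_size = 256  # rejected with memtable_size = 64: neither 0 nor >= 512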

@@ -2,6 +2,8 @@
#include <nano/lib/errors.hpp>
#include <thread>
namespace nano
{
class tomlconfig;
@@ -14,5 +16,14 @@ public:
nano::error deserialize_toml (nano::tomlconfig & toml_a);
bool enable{ false };
unsigned bloom_filter_bits{ 0 };
+uint64_t block_cache{ 64 }; // MB
+unsigned io_threads{ std::thread::hardware_concurrency () };
+bool enable_pipelined_write{ false };
+bool cache_index_and_filter_blocks{ false };
+unsigned block_size{ 4 }; // KB
+unsigned memtable_size{ 32 }; // MB
+unsigned num_memtables{ 2 }; // Need a minimum of 2
+unsigned total_memtable_size{ 512 }; // MB
};
}
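The defaults above deliberately stay small (64MB block cache, 32MB memtables, 4KB blocks) so that merely switching to RocksDB does not balloon memory use; only io_threads scales with the host. A minimal sketch of what a default-constructed config guarantees, assuming only this header:

#include <nano/lib/rocksdbconfig.hpp>

#include <cassert>
#include <thread>

int main ()
{
	nano::rocksdb_config config; // all defaults
	assert (!config.enable); // LMDB stays the default backend
	assert (config.block_cache == 64); // MB
	assert (config.num_memtables == 2); // the enforced minimum
	assert (config.io_threads == std::thread::hardware_concurrency ());
}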

@@ -123,7 +123,7 @@ alarm (alarm_a),
work (work_a),
distributed_work (*this),
logger (config_a.logging.min_time_between_log_output),
-store_impl (nano::make_store (logger, application_path_a, flags.read_only, true, config_a.diagnostics_config.txn_tracking, config_a.block_processor_batch_max_time, config_a.lmdb_max_dbs, flags.sideband_batch_size, config_a.backup_before_upgrade, config_a.rocksdb_config.enable)),
+store_impl (nano::make_store (logger, application_path_a, flags.read_only, true, config_a.rocksdb_config, config_a.diagnostics_config.txn_tracking, config_a.block_processor_batch_max_time, config_a.lmdb_max_dbs, flags.sideband_batch_size, config_a.backup_before_upgrade, config_a.rocksdb_config.enable)),
store (*store_impl),
wallets_store_impl (std::make_unique<nano::mdb_wallets_store> (application_path_a / "wallets.ldb", config_a.lmdb_max_dbs)),
wallets_store (*wallets_store_impl),
@@ -1356,11 +1356,11 @@ nano::node_flags const & nano::inactive_node_flag_defaults ()
return node_flags;
}
std::unique_ptr<nano::block_store> nano::make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool read_only, bool add_db_postfix, nano::txn_tracking_config const & txn_tracking_config_a, std::chrono::milliseconds block_processor_batch_max_time_a, int lmdb_max_dbs, size_t batch_size, bool backup_before_upgrade, bool use_rocksdb_backend)
std::unique_ptr<nano::block_store> nano::make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool read_only, bool add_db_postfix, nano::rocksdb_config const & rocksdb_config, nano::txn_tracking_config const & txn_tracking_config_a, std::chrono::milliseconds block_processor_batch_max_time_a, int lmdb_max_dbs, size_t batch_size, bool backup_before_upgrade, bool use_rocksdb_backend)
{
#if NANO_ROCKSDB
-auto make_rocksdb = [&logger, add_db_postfix, &path, read_only]() {
-return std::make_unique<nano::rocksdb_store> (logger, add_db_postfix ? path / "rocksdb" : path, read_only);
+auto make_rocksdb = [&logger, add_db_postfix, &path, &rocksdb_config, read_only]() {
+return std::make_unique<nano::rocksdb_store> (logger, add_db_postfix ? path / "rocksdb" : path, rocksdb_config, read_only);
};
#endif
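Call sites that want a tuned RocksDB backend now pass a populated config ahead of the remaining defaulted arguments. A sketch (the logger and application_path variables are assumptions, not from the diff):

nano::rocksdb_config rocksdb_config;
rocksdb_config.enable = true;
rocksdb_config.block_cache = 512; // MB
rocksdb_config.memtable_size = 128; // MB
auto store (nano::make_store (logger, application_path, false /* read_only */, true /* add_db_postfix */, rocksdb_config));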

@@ -1,4 +1,5 @@
#include <nano/crypto_lib/random_pool.hpp>
#include <nano/lib/rocksdbconfig.hpp>
#include <nano/node/rocksdb/rocksdb.hpp>
#include <nano/node/rocksdb/rocksdb_iterator.hpp>
#include <nano/node/rocksdb/rocksdb_txn.hpp>
@@ -40,8 +41,9 @@ void rocksdb_val::convert_buffer_to_value ()
}
}
-nano::rocksdb_store::rocksdb_store (nano::logger_mt & logger_a, boost::filesystem::path const & path_a, bool open_read_only_a) :
-logger (logger_a)
+nano::rocksdb_store::rocksdb_store (nano::logger_mt & logger_a, boost::filesystem::path const & path_a, nano::rocksdb_config const & rocksdb_config_a, bool open_read_only_a) :
+logger (logger_a),
+rocksdb_config (rocksdb_config_a)
{
boost::system::error_code error_mkdir, error_chmod;
boost::filesystem::create_directories (path_a, error_mkdir);
@@ -466,38 +468,40 @@ rocksdb::Options nano::rocksdb_store::get_db_options () const
// Start aggressively flushing WAL files when they reach over 1GB
db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;
-if (!low_end_system ())
-{
-// Adds a separate write queue for memtable/WAL
-db_options.enable_pipelined_write = true;
+// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+db_options.IncreaseParallelism (rocksdb_config.io_threads);
+db_options.OptimizeLevelStyleCompaction ();
+// Adds a separate write queue for memtable/WAL
+db_options.enable_pipelined_write = rocksdb_config.enable_pipelined_write;
+// Total size of memtables across column families. This can be used to manage the total memory used by memtables.
+db_options.db_write_buffer_size = rocksdb_config.total_memtable_size * 1024 * 1024ULL;
-// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
-db_options.IncreaseParallelism (std::thread::hardware_concurrency ());
-db_options.OptimizeLevelStyleCompaction ();
-}
return db_options;
}
-/** As options are currently hardcoded, a heuristic is taken based off the number of cores to modify config options */
-bool nano::rocksdb_store::low_end_system () const
-{
-return (std::thread::hardware_concurrency () < 2);
-}
rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_table_options () const
{
rocksdb::BlockBasedTableOptions table_options;
-if (!low_end_system ())
-{
-// 512MB block cache
-table_options.block_cache = rocksdb::NewLRUCache (512 * 1024 * 1024LL);
-// Bloom filter to help with point reads
-table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false));
-table_options.block_size = 16 * 1024;
-table_options.cache_index_and_filter_blocks = true;
-table_options.pin_l0_filter_and_index_blocks_in_cache = true;
-}
+// Block cache for reads
+table_options.block_cache = rocksdb::NewLRUCache (rocksdb_config.block_cache * 1024 * 1024ULL);
+// Bloom filter to help with point reads
+auto bloom_filter_bits = rocksdb_config.bloom_filter_bits;
+if (bloom_filter_bits > 0)
+{
+table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (bloom_filter_bits, false));
+}
+// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
+table_options.block_size = rocksdb_config.block_size * 1024ULL;
+// Whether index and filter blocks are stored in block_cache. These settings should be synced
+table_options.cache_index_and_filter_blocks = rocksdb_config.cache_index_and_filter_blocks;
+table_options.pin_l0_filter_and_index_blocks_in_cache = rocksdb_config.cache_index_and_filter_blocks;
return table_options;
}
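The MB/KB-denominated settings are converted here into the byte values RocksDB expects; at 10 bits per key the bloom filter costs roughly 1.25 bytes per key for about a 1% false-positive rate, which is why 10 is the recommended value for point-read-heavy workloads. A standalone sketch of the same mapping with assumed values (not code from this commit):

#include <rocksdb/cache.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

rocksdb::Options tuned_options ()
{
	rocksdb::Options db_options;
	db_options.IncreaseParallelism (4); // io_threads = 4 (assumed host)
	db_options.OptimizeLevelStyleCompaction ();
	db_options.enable_pipelined_write = true; // enable_pipelined_write
	db_options.db_write_buffer_size = 1024 * 1024 * 1024ULL; // total_memtable_size = 1024 (MB)

	rocksdb::BlockBasedTableOptions table_options;
	table_options.block_cache = rocksdb::NewLRUCache (512 * 1024 * 1024ULL); // block_cache = 512 (MB)
	table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false)); // bloom_filter_bits = 10
	table_options.block_size = 16 * 1024ULL; // block_size = 16 (KB)
	table_options.cache_index_and_filter_blocks = true; // cache_index_and_filter_blocks
	table_options.pin_l0_filter_and_index_blocks_in_cache = true; // kept in sync with the above
	db_options.table_factory.reset (rocksdb::NewBlockBasedTableFactory (table_options));
	return db_options;
}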
@@ -506,31 +510,29 @@ rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options () const
rocksdb::ColumnFamilyOptions cf_options;
cf_options.table_factory = table_factory;
-if (!low_end_system ())
-{
-cf_options.level_compaction_dynamic_level_bytes = true;
-// Number of files in level which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-cf_options.level0_file_num_compaction_trigger = 4;
+// Number of files in level which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
+cf_options.level0_file_num_compaction_trigger = 4;
+// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
+cf_options.max_bytes_for_level_base = 1024ULL * 1024 * 4 * rocksdb_config.memtable_size;
-// L1 size, compaction is triggered for L0 at this size (512MB)
-cf_options.max_bytes_for_level_base = 512 * 1024 * 1024LL;
-// Each level is a multiple of the above. If L1 is 512MB. L2 will be 512 * 8 = 2GB. L3 will be 2GB * 8 = 16GB, and so on...
-cf_options.max_bytes_for_level_multiplier = 8;
+// Each level is a multiple of the above. If L1 is 512MB, L2 will be 512MB * 8 = 4GB, L3 will be 4GB * 8 = 32GB, and so on...
+cf_options.max_bytes_for_level_multiplier = 8;
-// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
-cf_options.ttl = 1 * 24 * 60 * 60;
-// Size of level 1 sst files (128MB)
-cf_options.target_file_size_base = 128 * 1024 * 1024LL;
+// Size of level 1 sst files
+cf_options.target_file_size_base = 1024ULL * 1024 * rocksdb_config.memtable_size;
-// Size of each memtable (128MB)
-cf_options.write_buffer_size = 128 * 1024 * 1024LL;
+// Size of each memtable
+cf_options.write_buffer_size = 1024ULL * 1024 * rocksdb_config.memtable_size;
-// Number of memtables to keep in memory (1 active, rest inactive/immutable)
-cf_options.max_write_buffer_number = 3;
+// Size targets of levels are changed dynamically based on the size of the last level
+cf_options.level_compaction_dynamic_level_bytes = true;
+// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
+cf_options.ttl = 1 * 24 * 60 * 60;
-}
+// Number of memtables to keep in memory (1 active, rest inactive/immutable)
+cf_options.max_write_buffer_number = rocksdb_config.num_memtables;
return cf_options;
}
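A worked example of the level sizing that falls out of these settings, assuming the recommended memtable_size = 128 (MB): each memtable and each L1 SST file targets 128MB; L1 as a whole targets 4 * 128MB = 512MB, matching the four-file L0 compaction trigger; with the multiplier of 8, L2 then targets 4GB and L3 32GB. With num_memtables = 3, up to three 128MB memtables per column family (one active, two immutable) can accumulate before flushes must catch up.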
@@ -602,7 +604,7 @@ bool nano::rocksdb_store::copy_db (boost::filesystem::path const & destination_p
// Open it so that it flushes all WAL files
if (status.ok ())
{
-nano::rocksdb_store rocksdb_store (logger, destination_path.string (), false);
+nano::rocksdb_store rocksdb_store (logger, destination_path.string (), rocksdb_config, false);
return !rocksdb_store.init_error ();
}
return false;

@@ -18,13 +18,14 @@
namespace nano
{
class logging_mt;
+class rocksdb_config;
/**
* rocksdb implementation of the block store
*/
class rocksdb_store : public block_store_partial<rocksdb::Slice, rocksdb_store>
{
public:
rocksdb_store (nano::logger_mt &, boost::filesystem::path const &, bool open_read_only = false);
rocksdb_store (nano::logger_mt &, boost::filesystem::path const &, nano::rocksdb_config const & = nano::rocksdb_config{}, bool open_read_only = false);
~rocksdb_store ();
nano::write_transaction tx_begin_write (std::vector<nano::tables> const & tables_requiring_lock = {}, std::vector<nano::tables> const & tables_no_lock = {}) override;
nano::read_transaction tx_begin_read () override;
@@ -98,10 +99,10 @@ private:
int increment (nano::write_transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a, uint64_t amount_a);
int decrement (nano::write_transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a, uint64_t amount_a);
-bool low_end_system () const;
rocksdb::ColumnFamilyOptions get_cf_options () const;
void construct_column_family_mutexes ();
rocksdb::Options get_db_options () const;
rocksdb::BlockBasedTableOptions get_table_options () const;
+nano::rocksdb_config rocksdb_config;
};
}
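Note that the store holds its nano::rocksdb_config by value, so the options captured at construction stay stable for the store's lifetime; copy_db above reuses them when re-opening the destination database.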

@@ -5,6 +5,7 @@
#include <nano/lib/diagnosticsconfig.hpp>
#include <nano/lib/logger_mt.hpp>
#include <nano/lib/memory.hpp>
#include <nano/lib/rocksdbconfig.hpp>
#include <nano/secure/common.hpp>
#include <nano/secure/versioning.hpp>
@@ -729,7 +730,7 @@ public:
virtual nano::read_transaction tx_begin_read () = 0;
};
std::unique_ptr<nano::block_store> make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool open_read_only = false, bool add_db_postfix = false, nano::txn_tracking_config const & txn_tracking_config_a = nano::txn_tracking_config{}, std::chrono::milliseconds block_processor_batch_max_time_a = std::chrono::milliseconds (5000), int lmdb_max_dbs = 128, size_t batch_size = 512, bool backup_before_upgrade = false, bool rocksdb_backend = false);
std::unique_ptr<nano::block_store> make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool open_read_only = false, bool add_db_postfix = false, nano::rocksdb_config const & rocksdb_config = nano::rocksdb_config{}, nano::txn_tracking_config const & txn_tracking_config_a = nano::txn_tracking_config{}, std::chrono::milliseconds block_processor_batch_max_time_a = std::chrono::milliseconds (5000), int lmdb_max_dbs = 128, size_t batch_size = 512, bool backup_before_upgrade = false, bool rocksdb_backend = false);
}
namespace std