Add some RocksDB TOML config options to tune memory usage (#2316)

* Add some rocksdb config options to control memory usage

* Fix some comments
Wesley Shillingford, 2019-09-25 09:25:22 +01:00 (committed via GitHub)
commit 58cacfe886
7 changed files with 138 additions and 51 deletions
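For orientation, all of the new keys live under [node.rocksdb] in the daemon's TOML config. The following is only a sketch assembled from the defaults and recommendations documented in this diff (io_threads assumes a 4-core host), not a snippet from the commit itself:

[node.rocksdb]
enable = true
enable_pipelined_write = true        # two write queues for memtable/WAL; recommended
cache_index_and_filter_blocks = true # keep index/filter blocks in block_cache; recommended
bloom_filter_bits = 10               # bits per key; 0 disables the filter
block_cache = 512                    # MB; at least 512 recommended
io_threads = 4                       # background compaction/flush threads (assumed host)
block_size = 16                      # KB of uncompressed data per block
num_memtables = 3                    # per column family; minimum 2
memtable_size = 128                  # MB per memtable
total_memtable_size = 1024           # MB across all memtables; 0 = unconstrained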

@@ -239,6 +239,15 @@ TEST (toml, daemon_config_deserialize_defaults)
ASSERT_EQ (conf.node.stat_config.log_samples_filename, defaults.node.stat_config.log_samples_filename);
ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_EQ (conf.node.rocksdb_config.bloom_filter_bits, defaults.node.rocksdb_config.bloom_filter_bits);
+ASSERT_EQ (conf.node.rocksdb_config.block_cache, defaults.node.rocksdb_config.block_cache);
+ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
+ASSERT_EQ (conf.node.rocksdb_config.enable_pipelined_write, defaults.node.rocksdb_config.enable_pipelined_write);
+ASSERT_EQ (conf.node.rocksdb_config.cache_index_and_filter_blocks, defaults.node.rocksdb_config.cache_index_and_filter_blocks);
+ASSERT_EQ (conf.node.rocksdb_config.block_size, defaults.node.rocksdb_config.block_size);
+ASSERT_EQ (conf.node.rocksdb_config.memtable_size, defaults.node.rocksdb_config.memtable_size);
+ASSERT_EQ (conf.node.rocksdb_config.num_memtables, defaults.node.rocksdb_config.num_memtables);
+ASSERT_EQ (conf.node.rocksdb_config.total_memtable_size, defaults.node.rocksdb_config.total_memtable_size);
}
TEST (toml, optional_child)
@@ -473,6 +482,15 @@ TEST (toml, daemon_config_deserialize_no_defaults)
[node.rocksdb]
enable = true
bloom_filter_bits = 10
+block_cache = 512
+io_threads = 99
+enable_pipelined_write = true
+cache_index_and_filter_blocks = true
+block_size = 16
+memtable_size = 128
+num_memtables = 3
+total_memtable_size = 0
[opencl]
device = 999
@@ -603,6 +621,15 @@ TEST (toml, daemon_config_deserialize_no_defaults)
ASSERT_NE (conf.node.stat_config.log_samples_filename, defaults.node.stat_config.log_samples_filename);
ASSERT_NE (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_NE (conf.node.rocksdb_config.bloom_filter_bits, defaults.node.rocksdb_config.bloom_filter_bits);
+ASSERT_NE (conf.node.rocksdb_config.block_cache, defaults.node.rocksdb_config.block_cache);
+ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
+ASSERT_NE (conf.node.rocksdb_config.enable_pipelined_write, defaults.node.rocksdb_config.enable_pipelined_write);
+ASSERT_NE (conf.node.rocksdb_config.cache_index_and_filter_blocks, defaults.node.rocksdb_config.cache_index_and_filter_blocks);
+ASSERT_NE (conf.node.rocksdb_config.block_size, defaults.node.rocksdb_config.block_size);
+ASSERT_NE (conf.node.rocksdb_config.memtable_size, defaults.node.rocksdb_config.memtable_size);
+ASSERT_NE (conf.node.rocksdb_config.num_memtables, defaults.node.rocksdb_config.num_memtables);
+ASSERT_NE (conf.node.rocksdb_config.total_memtable_size, defaults.node.rocksdb_config.total_memtable_size);
}
/** There should be no required values **/

@@ -4,11 +4,56 @@
nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
{
toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database\ntype:bool");
toml.put ("enable_pipelined_write", enable_pipelined_write, "Whether to use 2 separate write queues for memtable/WAL, true is recommended.\ntype:bool");
toml.put ("cache_index_and_filter_blocks", cache_index_and_filter_blocks, "Whether index and filter blocks are stored in block_cache, true is recommended.\ntype:bool");
toml.put ("bloom_filter_bits", bloom_filter_bits, "Number of bits to use with a bloom filter. Helps with point reads but uses more memory. 0 disables the bloom filter, 10 is recommended\ntype:uint32");
toml.put ("block_cache", block_cache, "Size (MB) of the block cache; A larger number will increase performance of read operations. At least 512MB is recommended.\ntype:uint64");
toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing. Number of hardware threads is recommended.\ntype:uint32");
toml.put ("block_size", block_size, "Uncompressed data (KBs) per block. Increasing block size decreases memory usage and space amplification, but increases read amplification. 16 is recommended.\ntype:uint32");
toml.put ("num_memtables", num_memtables, "Number of memtables to keep in memory per column family. 2 is the minimum, 3 is recommended.\ntype:uint32");
toml.put ("memtable_size", memtable_size, "Amount of memory (MB) to build up before flushing to disk for an individual column family. Large values increase performance. 64 or 128 is recommended\ntype:uint32");
toml.put ("total_memtable_size", total_memtable_size, "Total memory (MB) which can be used across all memtables, set to 0 for unconstrained.\ntype:uint32");
return toml.get_error ();
}
nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
{
toml.get_optional<bool> ("enable", enable);
toml.get_optional<bool> ("enable_pipelined_write", enable_pipelined_write);
toml.get_optional<bool> ("cache_index_and_filter_blocks", cache_index_and_filter_blocks);
toml.get_optional<unsigned> ("bloom_filter_bits", bloom_filter_bits);
toml.get_optional<uint64_t> ("block_cache", block_cache);
toml.get_optional<unsigned> ("io_threads", io_threads);
toml.get_optional<unsigned> ("block_size", block_size);
toml.get_optional<unsigned> ("num_memtables", num_memtables);
toml.get_optional<unsigned> ("memtable_size", memtable_size);
toml.get_optional<unsigned> ("total_memtable_size", total_memtable_size);
+// Validate ranges
+if (bloom_filter_bits > 100)
+{
+toml.get_error ().set ("bloom_filter_bits is too high");
+}
+if (num_memtables < 2)
+{
+toml.get_error ().set ("num_memtables must be at least 2");
+}
+if (memtable_size == 0)
+{
+toml.get_error ().set ("memtable_size must be non-zero");
+}
+if ((total_memtable_size < memtable_size * 8) && (total_memtable_size != 0))
+{
+toml.get_error ().set ("total_memtable_size must be at least 8 times memtable_size, or 0 for unconstrained");
+}
+if (io_threads == 0)
+{
+toml.get_error ().set ("io_threads must be non-zero");
+}
+if (block_size == 0)
+{
+toml.get_error ().set ("block_size must be non-zero");
+}
return toml.get_error ();
}
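To make the new validation concrete: with memtable_size = 64 (MB), total_memtable_size must be 0 (unconstrained) or at least 64 * 8 = 512, so 256 would be rejected. A sketch of values that would each trip one of the checks above:

bloom_filter_bits = 101    # rejected: > 100
num_memtables = 1          # rejected: minimum is 2 (one active, one immutable)
memtable_size = 0          # rejected: must be non-zero
io_threads = 0             # rejected: must be non-zero
block_size = 0             # rejected: must be non-zero
total_memtable_size = 256  # rejected with memtable_size = 64: neither 0 nor >= 512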

@@ -2,6 +2,8 @@
#include <nano/lib/errors.hpp>
#include <thread>
namespace nano
{
class tomlconfig;
@@ -14,5 +16,14 @@ public:
nano::error deserialize_toml (nano::tomlconfig & toml_a);
bool enable{ false };
unsigned bloom_filter_bits{ 0 };
+uint64_t block_cache{ 64 }; // MB
+unsigned io_threads{ std::thread::hardware_concurrency () };
+bool enable_pipelined_write{ false };
+bool cache_index_and_filter_blocks{ false };
+unsigned block_size{ 4 }; // KB
+unsigned memtable_size{ 32 }; // MB
+unsigned num_memtables{ 2 }; // Need a minimum of 2
+unsigned total_memtable_size{ 512 }; // MB
};
}
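The defaults above deliberately stay small (64MB block cache, 32MB memtables, 4KB blocks) so that merely switching to RocksDB does not balloon memory use; only io_threads scales with the host. A minimal sketch of what a default-constructed config guarantees, assuming only this header:

#include <nano/lib/rocksdbconfig.hpp>

#include <cassert>
#include <thread>

int main ()
{
	nano::rocksdb_config config; // all defaults
	assert (!config.enable); // LMDB stays the default backend
	assert (config.block_cache == 64); // MB
	assert (config.num_memtables == 2); // the enforced minimum
	assert (config.io_threads == std::thread::hardware_concurrency ());
}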

@@ -123,7 +123,7 @@ alarm (alarm_a),
work (work_a),
distributed_work (*this),
logger (config_a.logging.min_time_between_log_output),
-store_impl (nano::make_store (logger, application_path_a, flags.read_only, true, config_a.diagnostics_config.txn_tracking, config_a.block_processor_batch_max_time, config_a.lmdb_max_dbs, flags.sideband_batch_size, config_a.backup_before_upgrade, config_a.rocksdb_config.enable)),
+store_impl (nano::make_store (logger, application_path_a, flags.read_only, true, config_a.rocksdb_config, config_a.diagnostics_config.txn_tracking, config_a.block_processor_batch_max_time, config_a.lmdb_max_dbs, flags.sideband_batch_size, config_a.backup_before_upgrade, config_a.rocksdb_config.enable)),
store (*store_impl),
wallets_store_impl (std::make_unique<nano::mdb_wallets_store> (application_path_a / "wallets.ldb", config_a.lmdb_max_dbs)),
wallets_store (*wallets_store_impl),
@@ -1356,11 +1356,11 @@ nano::node_flags const & nano::inactive_node_flag_defaults ()
return node_flags;
}
std::unique_ptr<nano::block_store> nano::make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool read_only, bool add_db_postfix, nano::txn_tracking_config const & txn_tracking_config_a, std::chrono::milliseconds block_processor_batch_max_time_a, int lmdb_max_dbs, size_t batch_size, bool backup_before_upgrade, bool use_rocksdb_backend)
std::unique_ptr<nano::block_store> nano::make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool read_only, bool add_db_postfix, nano::rocksdb_config const & rocksdb_config, nano::txn_tracking_config const & txn_tracking_config_a, std::chrono::milliseconds block_processor_batch_max_time_a, int lmdb_max_dbs, size_t batch_size, bool backup_before_upgrade, bool use_rocksdb_backend)
{
#if NANO_ROCKSDB
-auto make_rocksdb = [&logger, add_db_postfix, &path, read_only]() {
-return std::make_unique<nano::rocksdb_store> (logger, add_db_postfix ? path / "rocksdb" : path, read_only);
+auto make_rocksdb = [&logger, add_db_postfix, &path, &rocksdb_config, read_only]() {
+return std::make_unique<nano::rocksdb_store> (logger, add_db_postfix ? path / "rocksdb" : path, rocksdb_config, read_only);
};
#endif
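Call sites that want a tuned RocksDB backend now pass a populated config ahead of the remaining defaulted arguments. A sketch (the logger and application_path variables are assumptions, not from the diff):

nano::rocksdb_config rocksdb_config;
rocksdb_config.enable = true;
rocksdb_config.block_cache = 512; // MB
rocksdb_config.memtable_size = 128; // MB
auto store (nano::make_store (logger, application_path, false /* read_only */, true /* add_db_postfix */, rocksdb_config));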

@@ -1,4 +1,5 @@
#include <nano/crypto_lib/random_pool.hpp>
#include <nano/lib/rocksdbconfig.hpp>
#include <nano/node/rocksdb/rocksdb.hpp>
#include <nano/node/rocksdb/rocksdb_iterator.hpp>
#include <nano/node/rocksdb/rocksdb_txn.hpp>
@@ -40,8 +41,9 @@ void rocksdb_val::convert_buffer_to_value ()
}
}
-nano::rocksdb_store::rocksdb_store (nano::logger_mt & logger_a, boost::filesystem::path const & path_a, bool open_read_only_a) :
-logger (logger_a)
+nano::rocksdb_store::rocksdb_store (nano::logger_mt & logger_a, boost::filesystem::path const & path_a, nano::rocksdb_config const & rocksdb_config_a, bool open_read_only_a) :
+logger (logger_a),
+rocksdb_config (rocksdb_config_a)
{
boost::system::error_code error_mkdir, error_chmod;
boost::filesystem::create_directories (path_a, error_mkdir);
@@ -466,38 +468,40 @@ rocksdb::Options nano::rocksdb_store::get_db_options () const
// Start aggressively flushing WAL files when they reach over 1GB
db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;
-if (!low_end_system ())
-{
-// Adds a separate write queue for memtable/WAL
-db_options.enable_pipelined_write = true;
+// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+db_options.IncreaseParallelism (rocksdb_config.io_threads);
+db_options.OptimizeLevelStyleCompaction ();
+// Adds a separate write queue for memtable/WAL
+db_options.enable_pipelined_write = rocksdb_config.enable_pipelined_write;
+// Total size of memtables across column families. This can be used to manage the total memory used by memtables.
+db_options.db_write_buffer_size = rocksdb_config.total_memtable_size * 1024 * 1024ULL;
-// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
-db_options.IncreaseParallelism (std::thread::hardware_concurrency ());
-db_options.OptimizeLevelStyleCompaction ();
-}
return db_options;
}
-/** As options are currently hardcoded, a heuristic is taken based off the number of cores to modify config options */
-bool nano::rocksdb_store::low_end_system () const
-{
-return (std::thread::hardware_concurrency () < 2);
-}
rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_table_options () const
{
rocksdb::BlockBasedTableOptions table_options;
-if (!low_end_system ())
-{
-// 512MB block cache
-table_options.block_cache = rocksdb::NewLRUCache (512 * 1024 * 1024LL);
-// Bloom filter to help with point reads
-table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false));
-table_options.block_size = 16 * 1024;
-table_options.cache_index_and_filter_blocks = true;
-table_options.pin_l0_filter_and_index_blocks_in_cache = true;
-}
+// Block cache for reads
+table_options.block_cache = rocksdb::NewLRUCache (rocksdb_config.block_cache * 1024 * 1024ULL);
+// Bloom filter to help with point reads
+auto bloom_filter_bits = rocksdb_config.bloom_filter_bits;
+if (bloom_filter_bits > 0)
+{
+table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (bloom_filter_bits, false));
+}
+// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
+table_options.block_size = rocksdb_config.block_size * 1024ULL;
+// Whether index and filter blocks are stored in block_cache. These settings should be synced
+table_options.cache_index_and_filter_blocks = rocksdb_config.cache_index_and_filter_blocks;
+table_options.pin_l0_filter_and_index_blocks_in_cache = rocksdb_config.cache_index_and_filter_blocks;
return table_options;
}
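The MB/KB-denominated settings are converted here into the byte values RocksDB expects; at 10 bits per key the bloom filter costs roughly 1.25 bytes per key for about a 1% false-positive rate, which is why 10 is the recommended value for point-read-heavy workloads. A standalone sketch of the same mapping with assumed values (not code from this commit):

#include <rocksdb/cache.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

rocksdb::Options tuned_options ()
{
	rocksdb::Options db_options;
	db_options.IncreaseParallelism (4); // io_threads = 4 (assumed host)
	db_options.OptimizeLevelStyleCompaction ();
	db_options.enable_pipelined_write = true; // enable_pipelined_write
	db_options.db_write_buffer_size = 1024 * 1024 * 1024ULL; // total_memtable_size = 1024 (MB)

	rocksdb::BlockBasedTableOptions table_options;
	table_options.block_cache = rocksdb::NewLRUCache (512 * 1024 * 1024ULL); // block_cache = 512 (MB)
	table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false)); // bloom_filter_bits = 10
	table_options.block_size = 16 * 1024ULL; // block_size = 16 (KB)
	table_options.cache_index_and_filter_blocks = true; // cache_index_and_filter_blocks
	table_options.pin_l0_filter_and_index_blocks_in_cache = true; // kept in sync with the above
	db_options.table_factory.reset (rocksdb::NewBlockBasedTableFactory (table_options));
	return db_options;
}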
@@ -506,31 +510,29 @@ rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options () const
rocksdb::ColumnFamilyOptions cf_options;
cf_options.table_factory = table_factory;
-if (!low_end_system ())
-{
-cf_options.level_compaction_dynamic_level_bytes = true;
-// Number of files in level which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-cf_options.level0_file_num_compaction_trigger = 4;
+// Number of files in level which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
+cf_options.level0_file_num_compaction_trigger = 4;
+// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
+cf_options.max_bytes_for_level_base = 1024ULL * 1024 * 4 * rocksdb_config.memtable_size;
-// L1 size, compaction is triggered for L0 at this size (512MB)
-cf_options.max_bytes_for_level_base = 512 * 1024 * 1024LL;
-// Each level is a multiple of the above. If L1 is 512MB. L2 will be 512 * 8 = 2GB. L3 will be 2GB * 8 = 16GB, and so on...
-cf_options.max_bytes_for_level_multiplier = 8;
+// Each level is a multiple of the above. If L1 is 512MB, L2 will be 512MB * 8 = 4GB, L3 will be 4GB * 8 = 32GB, and so on...
+cf_options.max_bytes_for_level_multiplier = 8;
-// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
-cf_options.ttl = 1 * 24 * 60 * 60;
-// Size of level 1 sst files (128MB)
-cf_options.target_file_size_base = 128 * 1024 * 1024LL;
+// Size of level 1 sst files
+cf_options.target_file_size_base = 1024ULL * 1024 * rocksdb_config.memtable_size;
-// Size of each memtable (128MB)
-cf_options.write_buffer_size = 128 * 1024 * 1024LL;
+// Size of each memtable
+cf_options.write_buffer_size = 1024ULL * 1024 * rocksdb_config.memtable_size;
-// Number of memtables to keep in memory (1 active, rest inactive/immutable)
-cf_options.max_write_buffer_number = 3;
+// Size targets of levels are changed dynamically based on the size of the last level
+cf_options.level_compaction_dynamic_level_bytes = true;
+// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
+cf_options.ttl = 1 * 24 * 60 * 60;
-}
+// Number of memtables to keep in memory (1 active, rest inactive/immutable)
+cf_options.max_write_buffer_number = rocksdb_config.num_memtables;
return cf_options;
}
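A worked example of the level sizing that falls out of these settings, assuming the recommended memtable_size = 128 (MB): each memtable and each L1 SST file targets 128MB; L1 as a whole targets 4 * 128MB = 512MB, matching the four-file L0 compaction trigger; with the multiplier of 8, L2 then targets 4GB and L3 32GB. With num_memtables = 3, up to three 128MB memtables per column family (one active, two immutable) can accumulate before flushes must catch up.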
@@ -602,7 +604,7 @@ bool nano::rocksdb_store::copy_db (boost::filesystem::path const & destination_p
// Open it so that it flushes all WAL files
if (status.ok ())
{
-nano::rocksdb_store rocksdb_store (logger, destination_path.string (), false);
+nano::rocksdb_store rocksdb_store (logger, destination_path.string (), rocksdb_config, false);
return !rocksdb_store.init_error ();
}
return false;

@@ -18,13 +18,14 @@
namespace nano
{
class logging_mt;
+class rocksdb_config;
/**
* rocksdb implementation of the block store
*/
class rocksdb_store : public block_store_partial<rocksdb::Slice, rocksdb_store>
{
public:
rocksdb_store (nano::logger_mt &, boost::filesystem::path const &, bool open_read_only = false);
rocksdb_store (nano::logger_mt &, boost::filesystem::path const &, nano::rocksdb_config const & = nano::rocksdb_config{}, bool open_read_only = false);
~rocksdb_store ();
nano::write_transaction tx_begin_write (std::vector<nano::tables> const & tables_requiring_lock = {}, std::vector<nano::tables> const & tables_no_lock = {}) override;
nano::read_transaction tx_begin_read () override;
@@ -98,10 +99,10 @@ private:
int increment (nano::write_transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a, uint64_t amount_a);
int decrement (nano::write_transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a, uint64_t amount_a);
-bool low_end_system () const;
rocksdb::ColumnFamilyOptions get_cf_options () const;
void construct_column_family_mutexes ();
rocksdb::Options get_db_options () const;
rocksdb::BlockBasedTableOptions get_table_options () const;
+nano::rocksdb_config rocksdb_config;
};
}
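Note that the store holds its nano::rocksdb_config by value, so the options captured at construction stay stable for the store's lifetime; copy_db above reuses them when re-opening the destination database.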

@@ -5,6 +5,7 @@
#include <nano/lib/diagnosticsconfig.hpp>
#include <nano/lib/logger_mt.hpp>
#include <nano/lib/memory.hpp>
#include <nano/lib/rocksdbconfig.hpp>
#include <nano/secure/common.hpp>
#include <nano/secure/versioning.hpp>
@@ -729,7 +730,7 @@ public:
virtual nano::read_transaction tx_begin_read () = 0;
};
std::unique_ptr<nano::block_store> make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool open_read_only = false, bool add_db_postfix = false, nano::txn_tracking_config const & txn_tracking_config_a = nano::txn_tracking_config{}, std::chrono::milliseconds block_processor_batch_max_time_a = std::chrono::milliseconds (5000), int lmdb_max_dbs = 128, size_t batch_size = 512, bool backup_before_upgrade = false, bool rocksdb_backend = false);
std::unique_ptr<nano::block_store> make_store (nano::logger_mt & logger, boost::filesystem::path const & path, bool open_read_only = false, bool add_db_postfix = false, nano::rocksdb_config const & rocksdb_config = nano::rocksdb_config{}, nano::txn_tracking_config const & txn_tracking_config_a = nano::txn_tracking_config{}, std::chrono::milliseconds block_processor_batch_max_time_a = std::chrono::milliseconds (5000), int lmdb_max_dbs = 128, size_t batch_size = 512, bool backup_before_upgrade = false, bool rocksdb_backend = false);
}
namespace std