From cf59fde14a87e9594ddf2b001fe9b7aefc648b3c Mon Sep 17 00:00:00 2001
From: Wesley Shillingford
Date: Wed, 9 Sep 2020 15:31:20 +0100
Subject: [PATCH] [RocksDB] Use different column family options and prefix extraction for unchecked table (#2856)

* [RocksDB] Use different column family options and prefix extraction for unchecked

* NewHashSkipListRepFactory can't be used with allow_concurrent_writes

* Replace hardcoded column family (Colin comment)

* add #ifdef NANO_ROCKSDB to test
---
 nano/core_test/block_store.cpp     |   2 +
 nano/node/lmdb/lmdb.cpp            |  11 ++
 nano/node/lmdb/lmdb.hpp            |   1 +
 nano/node/rocksdb/rocksdb.cpp      | 205 ++++++++++++++++++++++++++---
 nano/node/rocksdb/rocksdb.hpp      |  14 +-
 nano/secure/blockstore_partial.hpp |  11 --
 6 files changed, 210 insertions(+), 34 deletions(-)

diff --git a/nano/core_test/block_store.cpp b/nano/core_test/block_store.cpp
index bc3cb98a..6b17f5a4 100644
--- a/nano/core_test/block_store.cpp
+++ b/nano/core_test/block_store.cpp
@@ -1878,6 +1878,7 @@ namespace nano
 {
 TEST (rocksdb_block_store, tombstone_count)
 {
+#if NANO_ROCKSDB
 	if (nano::using_rocksdb_in_tests ())
 	{
 		nano::logger_mt logger;
@@ -1890,6 +1891,7 @@ TEST (rocksdb_block_store, tombstone_count)
 		store->unchecked_del (transaction, nano::unchecked_key (block1->previous (), block1->hash ()));
 		ASSERT_EQ (store->tombstone_map.at (nano::tables::unchecked).num_since_last_flush.load (), 1);
 	}
+#endif
 }
 }
diff --git a/nano/node/lmdb/lmdb.cpp b/nano/node/lmdb/lmdb.cpp
index d4bc93a0..36f063af 100644
--- a/nano/node/lmdb/lmdb.cpp
+++ b/nano/node/lmdb/lmdb.cpp
@@ -757,6 +757,17 @@ void nano::mdb_store::create_backup_file (nano::mdb_env & env_a, boost::filesyst
 	}
 }
 
+std::vector<nano::unchecked_info> nano::mdb_store::unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a)
+{
+	std::vector<nano::unchecked_info> result;
+	for (auto i (unchecked_begin (transaction_a, nano::unchecked_key (hash_a, 0))), n (unchecked_end ()); i != n && i->first.key () == hash_a; ++i)
+	{
+		nano::unchecked_info const & unchecked_info (i->second);
+		result.push_back (unchecked_info);
+	}
+	return result;
+}
+
 void nano::mdb_store::version_put (nano::write_transaction const & transaction_a, int version_a)
 {
 	nano::uint256_union version_key (1);
diff --git a/nano/node/lmdb/lmdb.hpp b/nano/node/lmdb/lmdb.hpp
index cee6edbb..d8ccee80 100644
--- a/nano/node/lmdb/lmdb.hpp
+++ b/nano/node/lmdb/lmdb.hpp
@@ -191,6 +191,7 @@ public:
 	MDB_dbi blocks{ 0 };
 
 	bool exists (nano::transaction const & transaction_a, tables table_a, nano::mdb_val const & key_a) const;
+	std::vector<nano::unchecked_info> unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a) override;
 
 	int get (nano::transaction const & transaction_a, tables table_a, nano::mdb_val const & key_a, nano::mdb_val & value_a) const;
 	int put (nano::write_transaction const & transaction_a, tables table_a, nano::mdb_val const & key_a, const nano::mdb_val & value_a) const;
diff --git a/nano/node/rocksdb/rocksdb.cpp b/nano/node/rocksdb/rocksdb.cpp
index cd52bc68..d4613c83 100644
--- a/nano/node/rocksdb/rocksdb.cpp
+++ b/nano/node/rocksdb/rocksdb.cpp
@@ -11,6 +11,7 @@
 #include <...>
 #include <...>
+#include <rocksdb/slice_transform.h>
 #include <...>
 #include <...>
 #include <...>
@@ -75,7 +76,7 @@ cf_name_table_map (create_cf_name_table_map ())
 	if (!error)
 	{
 		generate_tombstone_map ();
-		table_factory.reset (rocksdb::NewBlockBasedTableFactory (get_table_options ()));
+		small_table_factory.reset (rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));
 		if (!open_read_only_a)
 		{
 			construct_column_family_mutexes ();
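The LMDB unchecked_get added above depends only on the sort order of the unchecked table: nano::unchecked_key sorts entries by (previous, hash), so seeking to unchecked_key (hash_a, 0) and walking forward while the first key field still equals hash_a visits exactly the entries that depend on that block. The same bounded scan can be sketched against any ordered map (a simplified stand-in with 64-bit key halves instead of nano's 256-bit ones; the unchecked_key and unchecked_get below are illustrative, not the real nano types):

#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Stand-in for nano::unchecked_key: sorted first by the dependency ("previous")
// hash, then by the block hash, so all entries for one dependency are adjacent.
using unchecked_key = std::pair<uint64_t, uint64_t>;

std::vector<std::string> unchecked_get (std::map<unchecked_key, std::string> const & table_a, uint64_t previous_a)
{
	std::vector<std::string> result;
	// Seek to the lowest possible key for this dependency, then walk forward
	// until the first key field changes; this is the whole range scan.
	for (auto i = table_a.lower_bound ({ previous_a, 0 }); i != table_a.end () && i->first.first == previous_a; ++i)
	{
		result.push_back (i->second);
	}
	return result;
}

LMDB's B-tree cursor provides this ordering for free; the RocksDB store, further down in this patch, needs an explicit prefix extractor before the equivalent seek becomes cheap.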
@@ -104,12 +105,7 @@ std::unordered_map<const char *, nano::tables> nano::rocksdb_store::create_cf_na
 void nano::rocksdb_store::open (bool & error_a, boost::filesystem::path const & path_a, bool open_read_only_a)
 {
-	std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
-	for (auto & [cf_name, table] : cf_name_table_map)
-	{
-		column_families.emplace_back (cf_name, get_cf_options ());
-	}
-
+	auto column_families = create_column_families ();
 	auto options = get_db_options ();
 	rocksdb::Status s;
@@ -158,6 +154,98 @@ void nano::rocksdb_store::generate_tombstone_map ()
 	tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
 }
 
+rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options (std::string const & cf_name_a) const
+{
+	rocksdb::ColumnFamilyOptions cf_options;
+	auto const memtable_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
+	auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
+	if (cf_name_a == "unchecked")
+	{
+		// Unchecked table can have a lot of deletions, so increase compaction frequency.
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
+
+		// Create a prefix bloom filter for the memtable, sized at write_buffer_size * memtable_prefix_bloom_size_ratio
+		cf_options.memtable_prefix_bloom_size_ratio = 0.1;
+		// The prefix is the first field of the unchecked key (the root), so use its size
+		cf_options.prefix_extractor.reset (rocksdb::NewFixedPrefixTransform (sizeof (nano::root)));
+
+		// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single-threaded
+		cf_options.level0_file_num_compaction_trigger = 2;
+
+		// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
+		cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
+		cf_options.max_bytes_for_level_multiplier = 10; // Default
+	}
+	else if (cf_name_a == "blocks")
+	{
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
+	}
+	else if (cf_name_a == "confirmation_height")
+	{
+		// Entries will not be deleted in the normal case, so the memtables can be made a lot bigger
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
+	}
+	else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
+	{
+		// Meta - Contains just the version key
+		// Online weight - Periodically deleted
+		// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
+		cf_options = get_small_cf_options (small_table_factory);
+	}
+	else if (cf_name_a == "cached_counts")
+	{
+		// Really small (keys are blocks tables, value is uint64_t)
+		cf_options = get_small_cf_options (small_table_factory);
+	}
+	else if (cf_name_a == "pending")
+	{
+		// Pending can have a lot of deletions too
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
+	}
+	else if (cf_name_a == "frontiers")
+	{
+		// Frontiers is only needed during bootstrap for legacy blocks
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
+	}
+	else if (cf_name_a == "accounts")
+	{
+		// Can have deletions from rollbacks
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
+	}
+	else if (cf_name_a == "vote")
+	{
+		// No deletes it seems, only overwrites.
+		std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
+		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
+	}
+	else if (cf_name_a == rocksdb::kDefaultColumnFamilyName)
+	{
+		// Do nothing.
+	}
+	else
+	{
+		debug_assert (false);
+	}
+
+	return cf_options;
+}
+
+std::vector<rocksdb::ColumnFamilyDescriptor> nano::rocksdb_store::create_column_families ()
+{
+	std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
+	for (auto & [cf_name, table] : cf_name_table_map)
+	{
+		column_families.emplace_back (cf_name, get_cf_options (cf_name));
+	}
+	return column_families;
+}
+
 nano::write_transaction nano::rocksdb_store::tx_begin_write (std::vector<nano::tables> const & tables_requiring_locks_a, std::vector<nano::tables> const & tables_no_locks_a)
 {
 	std::unique_ptr<nano::write_rocksdb_txn> txn;
@@ -422,12 +510,40 @@ int nano::rocksdb_store::clear (rocksdb::ColumnFamilyHandle * column_family)
 		return handle.get () == column_family;
 	});
 	debug_assert (handle_it != handles.cend ());
-	status = db->CreateColumnFamily (get_cf_options (), name, &column_family);
+	status = db->CreateColumnFamily (get_cf_options (name), name, &column_family);
 	release_assert (status.ok ());
 	handle_it->reset (column_family);
 	return status.code ();
 }
 
+std::vector<nano::unchecked_info> nano::rocksdb_store::unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a)
+{
+	auto cf = table_to_column_family (tables::unchecked);
+
+	std::unique_ptr<rocksdb::Iterator> iter;
+	if (is_read (transaction_a))
+	{
+		iter.reset (db->NewIterator (snapshot_options (transaction_a), cf));
+	}
+	else
+	{
+		rocksdb::ReadOptions ropts;
+		ropts.fill_cache = false;
+		iter.reset (tx (transaction_a)->GetIterator (ropts, cf));
+	}
+
+	// Uses prefix extraction
+	std::vector<nano::unchecked_info> result;
+
+	auto prefix = nano::rocksdb_val (hash_a);
+	for (iter->Seek (prefix); iter->Valid () && iter->key ().starts_with (prefix); iter->Next ())
+	{
+		auto unchecked_info = static_cast<nano::unchecked_info> (nano::rocksdb_val (iter->value ()));
+		result.push_back (unchecked_info);
+	}
+	return result;
+}
+
 void nano::rocksdb_store::construct_column_family_mutexes ()
 {
 	for (auto table : all_tables ())
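This unchecked_get is the read side of the prefix extractor configured for the "unchecked" column family earlier in the patch: Seek () jumps to the first key whose leading bytes match the root, the prefix bloom filters make that jump cheap, and iteration stops as soon as starts_with () fails. The same pattern in isolation, against a bare RocksDB database (the path, the 32-byte key layout and the sample values are assumptions for illustration, not nano code):

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/slice_transform.h>

#include <cassert>
#include <memory>
#include <string>

int main ()
{
	rocksdb::Options options;
	options.create_if_missing = true;
	// Assumed layout: 64-byte keys, a 32-byte root followed by a 32-byte block
	// hash. Index and filter only the first 32 bytes so that all entries
	// sharing a root can be found with one bounded seek.
	options.prefix_extractor.reset (rocksdb::NewFixedPrefixTransform (32));
	// Prefix bloom for the memtable, sized at 10% of write_buffer_size
	options.memtable_prefix_bloom_size_ratio = 0.1;

	rocksdb::DB * db_l;
	auto status = rocksdb::DB::Open (options, "/tmp/prefix_example", &db_l);
	assert (status.ok ());
	std::unique_ptr<rocksdb::DB> db (db_l);

	std::string root (32, 'a');
	db->Put (rocksdb::WriteOptions (), root + std::string (32, 'x'), "info1");
	db->Put (rocksdb::WriteOptions (), root + std::string (32, 'y'), "info2");
	db->Put (rocksdb::WriteOptions (), std::string (32, 'b') + std::string (32, 'z'), "other");

	// Bounded scan: visits everything for this root and nothing else
	std::unique_ptr<rocksdb::Iterator> it (db->NewIterator (rocksdb::ReadOptions ()));
	for (it->Seek (root); it->Valid () && it->key ().starts_with (root); it->Next ())
	{
		// it->value () is "info1", then "info2"
	}
	return 0;
}

The starts_with () guard matters: once the iterator leaves the seeked prefix, RocksDB makes no ordering promises in prefix mode, so the loop must stop at the first non-matching key, exactly as unchecked_get does above.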
@@ -455,42 +571,90 @@ rocksdb::Options nano::rocksdb_store::get_db_options ()
 	// Adds a separate write queue for memtable/WAL
 	db_options.enable_pipelined_write = true;
 
+	// Default is 16; setting to -1 allows faster startup times for SSDs by allowing more files to be read in parallel.
+	db_options.max_file_opening_threads = -1;
+
+	// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
+	// Default is 1GB; lower it to 100MB to avoid replaying for too long.
+	db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;
+
 	auto event_listener_l = new event_listener ([this](rocksdb::FlushJobInfo const & flush_job_info_a) { this->on_flush (flush_job_info_a); });
 	db_options.listeners.emplace_back (event_listener_l);
 
 	return db_options;
 }
 
-rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_table_options () const
+rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_active_table_options (int lru_size) const
 {
 	rocksdb::BlockBasedTableOptions table_options;
+
+	// Improve point lookup performance by using the data block hash index (uses about 5% more space).
+	table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
+	table_options.data_block_hash_table_util_ratio = 0.75;
+
 	// Block cache for reads
-	table_options.block_cache = rocksdb::NewLRUCache (1024ULL * 1024 * base_block_cache_size * rocksdb_config.memory_multiplier);
+	table_options.block_cache = rocksdb::NewLRUCache (lru_size);
 
-	// Bloom filter to help with point reads
-	auto bloom_filter_bits = 10;
-	table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (bloom_filter_bits, false));
+	// Bloom filter to help with point reads. 10 bits gives a ~1% false positive rate.
+	table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false));
 
 	// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
 	table_options.block_size = 16 * 1024ULL;
 
-	// Whether index and filter blocks are stored in block_cache
+	// Whether level 0 index and filter blocks are stored in block_cache
 	table_options.pin_l0_filter_and_index_blocks_in_cache = true;
 
 	return table_options;
 }
 
-rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options () const
+rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_small_table_options () const
+{
+	rocksdb::BlockBasedTableOptions table_options;
+	// Improve point lookup performance by using the data block hash index (uses about 5% more space).
+	table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
+	table_options.data_block_hash_table_util_ratio = 0.75;
+	table_options.block_size = 1024ULL;
+	return table_options;
+}
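On the "10 bits gives a ~1% false positive rate" comment above: a Bloom filter with b bits per key and the optimal number of hash functions k = b * ln 2 has a false positive rate of roughly (1 - e^(-k * n/m))^k = 0.5^k, which for b = 10 works out to about 0.8%, conventionally rounded to 1%. A quick check of that arithmetic (standard Bloom filter math, not nano or RocksDB code):

#include <cmath>
#include <cstdio>

int main ()
{
	double const bits_per_key = 10.0;                // NewBloomFilterPolicy (10, false)
	double const k = bits_per_key * std::log (2.0);  // optimal hash count, ~6.93
	// Standard approximation: fp = (1 - e^(-k * n/m))^k, with n/m = 1 / bits_per_key
	double const fp = std::pow (1.0 - std::exp (-k / bits_per_key), k);
	std::printf ("%.2f%% false positives\n", fp * 100.0); // prints ~0.82%
	return 0;
}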
+
+rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_small_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a) const
 {
 	rocksdb::ColumnFamilyOptions cf_options;
-	cf_options.table_factory = table_factory;
+	cf_options.table_factory = table_factory_a;
 
-	// Number of files in level which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
+	// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single-threaded
+	cf_options.level0_file_num_compaction_trigger = 1;
+
+	auto const memtable_size_bytes = 10000;
+
+	// L1 size, compaction is triggered for L0 at this size (1 SST file in L1)
+	cf_options.max_bytes_for_level_base = memtable_size_bytes;
+
+	// Files older than this (1 day) will be scheduled for compaction when there is no other background work
+	cf_options.ttl = 1 * 24 * 60 * 60;
+
+	// Multiplier for each level
+	cf_options.target_file_size_multiplier = 10;
+
+	// Size of level 1 sst files
+	cf_options.target_file_size_base = memtable_size_bytes;
+
+	// Size of each memtable
+	cf_options.write_buffer_size = memtable_size_bytes;
+
+	return cf_options;
+}
+
+rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_active_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
+{
+	rocksdb::ColumnFamilyOptions cf_options;
+	cf_options.table_factory = table_factory_a;
+
+	// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single-threaded
 	cf_options.level0_file_num_compaction_trigger = 4;
 
 	// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
-	cf_options.max_bytes_for_level_base = 1024ULL * 1024 * 4 * rocksdb_config.memory_multiplier * base_memtable_size;
+	cf_options.max_bytes_for_level_base = memtable_size_bytes_a * 4;
 
 	// Each level is a multiple of the above. If L1 is 512MB, L2 will be 512MB * 8 = 4GB, L3 will be 4GB * 8 = 32GB, and so on...
 	cf_options.max_bytes_for_level_multiplier = 8;
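To make the "active" sizing concrete: with base_memtable_size = 16 and memory_multiplier = 1, memtable_size_bytes_a is 16MB, so L1 is capped at 4 * 16MB = 64MB and every further level is 8x the one above it. A small computation of the resulting ladder (plain arithmetic mirroring the constants above, assuming memory_multiplier = 1):

#include <cstdint>
#include <cstdio>

int main ()
{
	uint64_t const memtable_size_bytes = 16ULL * 1024 * 1024; // base_memtable_size = 16MB
	uint64_t level_bytes = memtable_size_bytes * 4;           // max_bytes_for_level_base: 4 SST files in L1
	for (int level = 1; level <= 4; ++level)
	{
		std::printf ("L%d: %llu MB\n", level, static_cast<unsigned long long> (level_bytes / (1024 * 1024)));
		level_bytes *= 8; // max_bytes_for_level_multiplier
	}
	// Prints L1: 64 MB, L2: 512 MB, L3: 4096 MB, L4: 32768 MB
	return 0;
}

Note that level_compaction_dynamic_level_bytes = true (set just below) re-derives these targets from the actual size of the last level, so the static ladder is a ceiling rather than an exact layout.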
@@ -498,11 +662,14 @@ rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options () const
 	// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
 	cf_options.ttl = 1 * 24 * 60 * 60;
 
+	// Multiplier for each level
+	cf_options.target_file_size_multiplier = 10;
+
 	// Size of level 1 sst files
-	cf_options.target_file_size_base = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
+	cf_options.target_file_size_base = memtable_size_bytes_a;
 
 	// Size of each memtable
-	cf_options.write_buffer_size = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
+	cf_options.write_buffer_size = memtable_size_bytes_a;
 
 	// Size targets of the levels are adjusted dynamically based on the size of the last level
 	cf_options.level_compaction_dynamic_level_bytes = true;
diff --git a/nano/node/rocksdb/rocksdb.hpp b/nano/node/rocksdb/rocksdb.hpp
index 6ef2edbb..8ae2eae3 100644
--- a/nano/node/rocksdb/rocksdb.hpp
+++ b/nano/node/rocksdb/rocksdb.hpp
@@ -34,6 +34,7 @@ public:
 	uint64_t count (nano::transaction const & transaction_a, tables table_a) const override;
 	void version_put (nano::write_transaction const &, int) override;
+	std::vector<nano::unchecked_info> unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a) override;
 
 	bool exists (nano::transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a) const;
 	int get (nano::transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a, nano::rocksdb_val & value_a) const;
@@ -66,7 +67,7 @@ private:
 	rocksdb::OptimisticTransactionDB * optimistic_db = nullptr;
 	std::unique_ptr<rocksdb::DB> db;
 	std::vector<std::unique_ptr<rocksdb::ColumnFamilyHandle>> handles;
-	std::shared_ptr<rocksdb::TableFactory> table_factory;
+	std::shared_ptr<rocksdb::TableFactory> small_table_factory;
 	std::unordered_map<nano::tables, std::mutex> write_lock_mutexes;
 	nano::rocksdb_config rocksdb_config;
@@ -94,10 +95,13 @@ private:
 	void open (bool & error_a, boost::filesystem::path const & path_a, bool open_read_only_a);
 
-	rocksdb::ColumnFamilyOptions get_cf_options () const;
 	void construct_column_family_mutexes ();
 	rocksdb::Options get_db_options ();
-	rocksdb::BlockBasedTableOptions get_table_options () const;
+	rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
+	rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a) const;
+	rocksdb::BlockBasedTableOptions get_active_table_options (int lru_size) const;
+	rocksdb::BlockBasedTableOptions get_small_table_options () const;
+	rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;
 	void on_flush (rocksdb::FlushJobInfo const &);
 	void flush_table (nano::tables table_a);
@@ -105,8 +109,10 @@ private:
 	void generate_tombstone_map ();
 	std::unordered_map<const char *, nano::tables> create_cf_name_table_map () const;
+	std::vector<rocksdb::ColumnFamilyDescriptor> create_column_families ();
+
 	constexpr static int base_memtable_size = 16;
-	constexpr static int base_block_cache_size = 16;
+	constexpr static int base_block_cache_size = 8;
 
 	friend class rocksdb_block_store_tombstone_count_Test;
 };
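The create_column_families () declared above plugs into RocksDB's standard multi-column-family open sequence: every family present in the database, including the default one, must be listed with its options, and Open () hands back one handle per descriptor. A condensed sketch of that sequence (the path and the single tuned "unchecked" family are illustrative; error handling reduced to asserts):

#include <rocksdb/db.h>
#include <rocksdb/options.h>

#include <cassert>
#include <vector>

int main ()
{
	std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
	// The default column family must always be present, even if unused.
	column_families.emplace_back (rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions ());
	// Each named family can carry its own tuned options, as get_cf_options does per table.
	rocksdb::ColumnFamilyOptions unchecked_options;
	unchecked_options.level0_file_num_compaction_trigger = 2;
	column_families.emplace_back ("unchecked", unchecked_options);

	rocksdb::DBOptions db_options;
	db_options.create_if_missing = true;
	db_options.create_missing_column_families = true;

	std::vector<rocksdb::ColumnFamilyHandle *> handles;
	rocksdb::DB * db;
	auto status = rocksdb::DB::Open (db_options, "/tmp/cf_example", column_families, &handles, &db);
	assert (status.ok ());
	assert (handles.size () == column_families.size ()); // one handle per descriptor

	for (auto * handle : handles)
	{
		db->DestroyColumnFamilyHandle (handle);
	}
	delete db;
	return 0;
}

create_missing_column_families spares the caller from special-casing a first run against an existing database; nano's open () builds the same descriptor-per-family list via create_column_families ().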
diff --git a/nano/secure/blockstore_partial.hpp b/nano/secure/blockstore_partial.hpp
index 04daa005..60b91bd5 100644
--- a/nano/secure/blockstore_partial.hpp
+++ b/nano/secure/blockstore_partial.hpp
@@ -92,17 +92,6 @@ public:
 		return (success (status));
 	}
 
-	std::vector<nano::unchecked_info> unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a) override
-	{
-		std::vector<nano::unchecked_info> result;
-		for (auto i (unchecked_begin (transaction_a, nano::unchecked_key (hash_a, 0))), n (unchecked_end ()); i != n && i->first.key () == hash_a; ++i)
-		{
-			nano::unchecked_info const & unchecked_info (i->second);
-			result.push_back (unchecked_info);
-		}
-		return result;
-	}
-
 	void block_put (nano::write_transaction const & transaction_a, nano::block_hash const & hash_a, nano::block const & block_a) override
 	{
 		debug_assert (block_a.sideband ().successor.is_zero () || block_exists (transaction_a, block_a.sideband ().successor));
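Taken together with the two header changes, the shape of the refactor is: blockstore_partial loses its one-size-fits-all unchecked_get, and the base declaration is now satisfied per backend, LMDB keeping the ordered-cursor scan and RocksDB switching to the prefix seek. A minimal sketch of that dispatch shape (names and types are simplified stand-ins, not the actual nano hierarchy):

#include <cstdint>
#include <vector>

struct unchecked_info
{
}; // stand-in for nano::unchecked_info

using block_hash = uint64_t; // stand-in for nano::block_hash

// Base interface: with the generic implementation gone from
// blockstore_partial, every backend supplies its own lookup.
class block_store
{
public:
	virtual ~block_store () = default;
	virtual std::vector<unchecked_info> unchecked_get (block_hash const & hash_a) = 0;
};

class mdb_store final : public block_store
{
public:
	// LMDB: bounded cursor scan, relying on the B-tree key sort order
	std::vector<unchecked_info> unchecked_get (block_hash const & hash_a) override
	{
		return {}; // stub body
	}
};

class rocksdb_store final : public block_store
{
public:
	// RocksDB: Seek () on the fixed-size key prefix, stop when the prefix changes
	std::vector<unchecked_info> unchecked_get (block_hash const & hash_a) override
	{
		return {}; // stub body
	}
};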