[RocksDB] Use different column family options and prefix extraction for unchecked table (#2856)
* [RocksDB] Use different column family options and prefix extraction for unchecked * NewHashSkipListRepFactory cant be used with allow_concurrent_writes * Replace hardcoded column family (Colin comment) * add #ifdef NANO_ROCKSDB to test
This commit is contained in:
parent
f4147c0352
commit
cf59fde14a
6 changed files with 210 additions and 34 deletions
|
@ -1878,6 +1878,7 @@ namespace nano
|
|||
{
|
||||
TEST (rocksdb_block_store, tombstone_count)
|
||||
{
|
||||
#if NANO_ROCKSDB
|
||||
if (nano::using_rocksdb_in_tests ())
|
||||
{
|
||||
nano::logger_mt logger;
|
||||
|
@ -1890,6 +1891,7 @@ TEST (rocksdb_block_store, tombstone_count)
|
|||
store->unchecked_del (transaction, nano::unchecked_key (block1->previous (), block1->hash ()));
|
||||
ASSERT_EQ (store->tombstone_map.at (nano::tables::unchecked).num_since_last_flush.load (), 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -757,6 +757,17 @@ void nano::mdb_store::create_backup_file (nano::mdb_env & env_a, boost::filesyst
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<nano::unchecked_info> nano::mdb_store::unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a)
|
||||
{
|
||||
std::vector<nano::unchecked_info> result;
|
||||
for (auto i (unchecked_begin (transaction_a, nano::unchecked_key (hash_a, 0))), n (unchecked_end ()); i != n && i->first.key () == hash_a; ++i)
|
||||
{
|
||||
nano::unchecked_info const & unchecked_info (i->second);
|
||||
result.push_back (unchecked_info);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void nano::mdb_store::version_put (nano::write_transaction const & transaction_a, int version_a)
|
||||
{
|
||||
nano::uint256_union version_key (1);
|
||||
|
|
|
@ -191,6 +191,7 @@ public:
|
|||
MDB_dbi blocks{ 0 };
|
||||
|
||||
bool exists (nano::transaction const & transaction_a, tables table_a, nano::mdb_val const & key_a) const;
|
||||
std::vector<nano::unchecked_info> unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a) override;
|
||||
|
||||
int get (nano::transaction const & transaction_a, tables table_a, nano::mdb_val const & key_a, nano::mdb_val & value_a) const;
|
||||
int put (nano::write_transaction const & transaction_a, tables table_a, nano::mdb_val const & key_a, const nano::mdb_val & value_a) const;
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
#include <rocksdb/merge_operator.h>
|
||||
#include <rocksdb/slice.h>
|
||||
#include <rocksdb/slice_transform.h>
|
||||
#include <rocksdb/utilities/backupable_db.h>
|
||||
#include <rocksdb/utilities/transaction.h>
|
||||
#include <rocksdb/utilities/transaction_db.h>
|
||||
|
@ -75,7 +76,7 @@ cf_name_table_map (create_cf_name_table_map ())
|
|||
if (!error)
|
||||
{
|
||||
generate_tombstone_map ();
|
||||
table_factory.reset (rocksdb::NewBlockBasedTableFactory (get_table_options ()));
|
||||
small_table_factory.reset (rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));
|
||||
if (!open_read_only_a)
|
||||
{
|
||||
construct_column_family_mutexes ();
|
||||
|
@ -104,12 +105,7 @@ std::unordered_map<const char *, nano::tables> nano::rocksdb_store::create_cf_na
|
|||
|
||||
void nano::rocksdb_store::open (bool & error_a, boost::filesystem::path const & path_a, bool open_read_only_a)
|
||||
{
|
||||
std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
|
||||
for (auto & [cf_name, table] : cf_name_table_map)
|
||||
{
|
||||
column_families.emplace_back (cf_name, get_cf_options ());
|
||||
}
|
||||
|
||||
auto column_families = create_column_families ();
|
||||
auto options = get_db_options ();
|
||||
rocksdb::Status s;
|
||||
|
||||
|
@ -158,6 +154,98 @@ void nano::rocksdb_store::generate_tombstone_map ()
|
|||
tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
|
||||
}
|
||||
|
||||
rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options (std::string const & cf_name_a) const
|
||||
{
|
||||
rocksdb::ColumnFamilyOptions cf_options;
|
||||
auto const memtable_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
|
||||
auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
|
||||
if (cf_name_a == "unchecked")
|
||||
{
|
||||
// Unchecked table can have a lot of deletions, so increase compaction frequency.
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
|
||||
|
||||
// Create prefix bloom for memtable with the size of write_buffer_size * memtable_prefix_bloom_size_ratio
|
||||
cf_options.memtable_prefix_bloom_size_ratio = 0.1;
|
||||
// The prefix to use is the size of the unchecked key (root)
|
||||
cf_options.prefix_extractor.reset (rocksdb::NewFixedPrefixTransform (sizeof (nano::root)));
|
||||
|
||||
// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
|
||||
cf_options.level0_file_num_compaction_trigger = 2;
|
||||
|
||||
// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
|
||||
cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
|
||||
cf_options.max_bytes_for_level_multiplier = 10; // Default
|
||||
}
|
||||
else if (cf_name_a == "blocks")
|
||||
{
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
|
||||
}
|
||||
else if (cf_name_a == "confirmation_height")
|
||||
{
|
||||
// Entries will not be deleted in the normal case, so can make memtables a lot bigger
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
|
||||
}
|
||||
else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
|
||||
{
|
||||
// Meta - It contains just version key
|
||||
// Online weight - Periodically deleted
|
||||
// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
|
||||
cf_options = get_small_cf_options (small_table_factory);
|
||||
}
|
||||
else if (cf_name_a == "cached_counts")
|
||||
{
|
||||
// Really small (keys are blocks tables, value is uint64_t)
|
||||
cf_options = get_small_cf_options (small_table_factory);
|
||||
}
|
||||
else if (cf_name_a == "pending")
|
||||
{
|
||||
// Pending can have a lot of deletions too
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
|
||||
}
|
||||
else if (cf_name_a == "frontiers")
|
||||
{
|
||||
// Frontiers is only needed during bootstrap for legacy blocks
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
|
||||
}
|
||||
else if (cf_name_a == "accounts")
|
||||
{
|
||||
// Can have deletions from rollbacks
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
|
||||
}
|
||||
else if (cf_name_a == "vote")
|
||||
{
|
||||
// No deletes it seems, only overwrites.
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory (rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
|
||||
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
|
||||
}
|
||||
else if (cf_name_a == rocksdb::kDefaultColumnFamilyName)
|
||||
{
|
||||
// Do nothing.
|
||||
}
|
||||
else
|
||||
{
|
||||
debug_assert (false);
|
||||
}
|
||||
|
||||
return cf_options;
|
||||
}
|
||||
|
||||
std::vector<rocksdb::ColumnFamilyDescriptor> nano::rocksdb_store::create_column_families ()
|
||||
{
|
||||
std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
|
||||
for (auto & [cf_name, table] : cf_name_table_map)
|
||||
{
|
||||
column_families.emplace_back (cf_name, get_cf_options (cf_name));
|
||||
}
|
||||
return column_families;
|
||||
}
|
||||
|
||||
nano::write_transaction nano::rocksdb_store::tx_begin_write (std::vector<nano::tables> const & tables_requiring_locks_a, std::vector<nano::tables> const & tables_no_locks_a)
|
||||
{
|
||||
std::unique_ptr<nano::write_rocksdb_txn> txn;
|
||||
|
@ -422,12 +510,40 @@ int nano::rocksdb_store::clear (rocksdb::ColumnFamilyHandle * column_family)
|
|||
return handle.get () == column_family;
|
||||
});
|
||||
debug_assert (handle_it != handles.cend ());
|
||||
status = db->CreateColumnFamily (get_cf_options (), name, &column_family);
|
||||
status = db->CreateColumnFamily (get_cf_options (name), name, &column_family);
|
||||
release_assert (status.ok ());
|
||||
handle_it->reset (column_family);
|
||||
return status.code ();
|
||||
}
|
||||
|
||||
std::vector<nano::unchecked_info> nano::rocksdb_store::unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a)
|
||||
{
|
||||
auto cf = table_to_column_family (tables::unchecked);
|
||||
|
||||
std::unique_ptr<rocksdb::Iterator> iter;
|
||||
if (is_read (transaction_a))
|
||||
{
|
||||
iter.reset (db->NewIterator (snapshot_options (transaction_a), cf));
|
||||
}
|
||||
else
|
||||
{
|
||||
rocksdb::ReadOptions ropts;
|
||||
ropts.fill_cache = false;
|
||||
iter.reset (tx (transaction_a)->GetIterator (ropts, cf));
|
||||
}
|
||||
|
||||
// Uses prefix extraction
|
||||
std::vector<nano::unchecked_info> result;
|
||||
|
||||
auto prefix = nano::rocksdb_val (hash_a);
|
||||
for (iter->Seek (prefix); iter->Valid () && iter->key ().starts_with (prefix); iter->Next ())
|
||||
{
|
||||
auto unchecked_info = static_cast<nano::unchecked_info> (nano::rocksdb_val (iter->value ()));
|
||||
result.push_back (unchecked_info);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void nano::rocksdb_store::construct_column_family_mutexes ()
|
||||
{
|
||||
for (auto table : all_tables ())
|
||||
|
@ -455,42 +571,90 @@ rocksdb::Options nano::rocksdb_store::get_db_options ()
|
|||
// Adds a separate write queue for memtable/WAL
|
||||
db_options.enable_pipelined_write = true;
|
||||
|
||||
// Default is 16, setting to -1 allows faster startup times for SSDs by allowings more files to be read in parallel.
|
||||
db_options.max_file_opening_threads = -1;
|
||||
|
||||
// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
|
||||
// Default is 1GB, lowering this to avoid replaying for too long (100MB)
|
||||
db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;
|
||||
|
||||
auto event_listener_l = new event_listener ([this](rocksdb::FlushJobInfo const & flush_job_info_a) { this->on_flush (flush_job_info_a); });
|
||||
db_options.listeners.emplace_back (event_listener_l);
|
||||
|
||||
return db_options;
|
||||
}
|
||||
|
||||
rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_table_options () const
|
||||
rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_active_table_options (int lru_size) const
|
||||
{
|
||||
rocksdb::BlockBasedTableOptions table_options;
|
||||
|
||||
// Improve point lookup performance be using the data block hash index (uses about 5% more space).
|
||||
table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
|
||||
table_options.data_block_hash_table_util_ratio = 0.75;
|
||||
|
||||
// Block cache for reads
|
||||
table_options.block_cache = rocksdb::NewLRUCache (1024ULL * 1024 * base_block_cache_size * rocksdb_config.memory_multiplier);
|
||||
|
||||
// Bloom filter to help with point reads
|
||||
auto bloom_filter_bits = 10;
|
||||
table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (bloom_filter_bits, false));
|
||||
// Bloom filter to help with point reads. 10bits gives 1% false positive rate.
|
||||
table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false));
|
||||
|
||||
// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
|
||||
table_options.block_size = 16 * 1024ULL;
|
||||
|
||||
// Whether index and filter blocks are stored in block_cache
|
||||
// Whether level 0 index and filter blocks are stored in block_cache
|
||||
table_options.pin_l0_filter_and_index_blocks_in_cache = true;
|
||||
|
||||
return table_options;
|
||||
}
|
||||
|
||||
rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options () const
|
||||
rocksdb::BlockBasedTableOptions nano::rocksdb_store::get_small_table_options () const
|
||||
{
|
||||
rocksdb::BlockBasedTableOptions table_options;
|
||||
// Improve point lookup performance be using the data block hash index (uses about 5% more space).
|
||||
table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
|
||||
table_options.data_block_hash_table_util_ratio = 0.75;
|
||||
table_options.block_size = 1024ULL;
|
||||
return table_options;
|
||||
}
|
||||
|
||||
rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_small_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a) const
|
||||
{
|
||||
rocksdb::ColumnFamilyOptions cf_options;
|
||||
cf_options.table_factory = table_factory;
|
||||
cf_options.table_factory = table_factory_a;
|
||||
|
||||
// Number of files in level which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
|
||||
// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
|
||||
cf_options.level0_file_num_compaction_trigger = 1;
|
||||
|
||||
auto const memtable_size_bytes = 10000;
|
||||
|
||||
// L1 size, compaction is triggered for L0 at this size (1 SST file in L1)
|
||||
cf_options.max_bytes_for_level_base = memtable_size_bytes;
|
||||
|
||||
// Files older than this (1 day) will be scheduled for compaction when there is no other background work
|
||||
cf_options.ttl = 1 * 24 * 60 * 60;
|
||||
|
||||
// Multiplier for each level
|
||||
cf_options.target_file_size_multiplier = 10;
|
||||
|
||||
// Size of level 1 sst files
|
||||
cf_options.target_file_size_base = memtable_size_bytes;
|
||||
|
||||
// Size of each memtable
|
||||
cf_options.write_buffer_size = memtable_size_bytes;
|
||||
|
||||
return cf_options;
|
||||
}
|
||||
|
||||
rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_active_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
|
||||
{
|
||||
rocksdb::ColumnFamilyOptions cf_options;
|
||||
cf_options.table_factory = table_factory_a;
|
||||
|
||||
// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
|
||||
cf_options.level0_file_num_compaction_trigger = 4;
|
||||
|
||||
// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
|
||||
cf_options.max_bytes_for_level_base = 1024ULL * 1024 * 4 * rocksdb_config.memory_multiplier * base_memtable_size;
|
||||
cf_options.max_bytes_for_level_base = memtable_size_bytes_a * 4;
|
||||
|
||||
// Each level is a multiple of the above. If L1 is 512MB. L2 will be 512 * 8 = 2GB. L3 will be 2GB * 8 = 16GB, and so on...
|
||||
cf_options.max_bytes_for_level_multiplier = 8;
|
||||
|
@ -498,11 +662,14 @@ rocksdb::ColumnFamilyOptions nano::rocksdb_store::get_cf_options () const
|
|||
// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
|
||||
cf_options.ttl = 1 * 24 * 60 * 60;
|
||||
|
||||
// Multiplier for each level
|
||||
cf_options.target_file_size_multiplier = 10;
|
||||
|
||||
// Size of level 1 sst files
|
||||
cf_options.target_file_size_base = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
|
||||
cf_options.target_file_size_base = memtable_size_bytes_a;
|
||||
|
||||
// Size of each memtable
|
||||
cf_options.write_buffer_size = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
|
||||
cf_options.write_buffer_size = memtable_size_bytes_a;
|
||||
|
||||
// Size target of levels are changed dynamically based on size of the last level
|
||||
cf_options.level_compaction_dynamic_level_bytes = true;
|
||||
|
|
|
@ -34,6 +34,7 @@ public:
|
|||
|
||||
uint64_t count (nano::transaction const & transaction_a, tables table_a) const override;
|
||||
void version_put (nano::write_transaction const &, int) override;
|
||||
std::vector<nano::unchecked_info> unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a) override;
|
||||
|
||||
bool exists (nano::transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a) const;
|
||||
int get (nano::transaction const & transaction_a, tables table_a, nano::rocksdb_val const & key_a, nano::rocksdb_val & value_a) const;
|
||||
|
@ -66,7 +67,7 @@ private:
|
|||
rocksdb::OptimisticTransactionDB * optimistic_db = nullptr;
|
||||
std::unique_ptr<rocksdb::DB> db;
|
||||
std::vector<std::unique_ptr<rocksdb::ColumnFamilyHandle>> handles;
|
||||
std::shared_ptr<rocksdb::TableFactory> table_factory;
|
||||
std::shared_ptr<rocksdb::TableFactory> small_table_factory;
|
||||
std::unordered_map<nano::tables, std::mutex> write_lock_mutexes;
|
||||
nano::rocksdb_config rocksdb_config;
|
||||
|
||||
|
@ -94,10 +95,13 @@ private:
|
|||
|
||||
void open (bool & error_a, boost::filesystem::path const & path_a, bool open_read_only_a);
|
||||
|
||||
rocksdb::ColumnFamilyOptions get_cf_options () const;
|
||||
void construct_column_family_mutexes ();
|
||||
rocksdb::Options get_db_options ();
|
||||
rocksdb::BlockBasedTableOptions get_table_options () const;
|
||||
rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
|
||||
rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<rocksdb::TableFactory> const & table_factory_a) const;
|
||||
rocksdb::BlockBasedTableOptions get_active_table_options (int lru_size) const;
|
||||
rocksdb::BlockBasedTableOptions get_small_table_options () const;
|
||||
rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;
|
||||
|
||||
void on_flush (rocksdb::FlushJobInfo const &);
|
||||
void flush_table (nano::tables table_a);
|
||||
|
@ -105,8 +109,10 @@ private:
|
|||
void generate_tombstone_map ();
|
||||
std::unordered_map<const char *, nano::tables> create_cf_name_table_map () const;
|
||||
|
||||
std::vector<rocksdb::ColumnFamilyDescriptor> create_column_families ();
|
||||
|
||||
constexpr static int base_memtable_size = 16;
|
||||
constexpr static int base_block_cache_size = 16;
|
||||
constexpr static int base_block_cache_size = 8;
|
||||
|
||||
friend class rocksdb_block_store_tombstone_count_Test;
|
||||
};
|
||||
|
|
|
@ -92,17 +92,6 @@ public:
|
|||
return (success (status));
|
||||
}
|
||||
|
||||
std::vector<nano::unchecked_info> unchecked_get (nano::transaction const & transaction_a, nano::block_hash const & hash_a) override
|
||||
{
|
||||
std::vector<nano::unchecked_info> result;
|
||||
for (auto i (unchecked_begin (transaction_a, nano::unchecked_key (hash_a, 0))), n (unchecked_end ()); i != n && i->first.key () == hash_a; ++i)
|
||||
{
|
||||
nano::unchecked_info const & unchecked_info (i->second);
|
||||
result.push_back (unchecked_info);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void block_put (nano::write_transaction const & transaction_a, nano::block_hash const & hash_a, nano::block const & block_a) override
|
||||
{
|
||||
debug_assert (block_a.sideband ().successor.is_zero () || block_exists (transaction_a, block_a.sideband ().successor));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue