From 999feb5c5bffb021287733bd34c86a45deb3735d Mon Sep 17 00:00:00 2001 From: shosseinimotlagh Date: Mon, 24 Feb 2025 12:08:57 -0800 Subject: [PATCH 1/5] add long running for remove --- src/lib/index/wb_cache.cpp | 3 +- src/tests/test_index_crash_recovery.cpp | 142 ++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 749b530d9..4ac648816 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -312,11 +312,10 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p // log new nodes and freed nodes and parent and child static uint32_t txn_id = 0; static int last_cp_id = -2; - static std::string txn = ""; + std::string txn = ""; if (last_cp_id != icp_ctx->id()) { last_cp_id = icp_ctx->id(); txn_id = 0; - txn = ""; } if (new_node_bufs.empty() && freed_node_bufs.empty()) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index e13c886c9..7aa56f090 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -728,6 +728,148 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { } } +TYPED_TEST(IndexCrashTest, long_running_remove_crash) { + + // Define the lambda function + auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >(); + auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >(); + bool load_mode = SISL_OPTIONS.count("load_from_file"); + bool save_mode = SISL_OPTIONS.count("save_to_file"); + SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); + vector< std::string > flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child"/*, + "crash_flush_on_freed_child"*/}; + + std::string flip = ""; + OperationList operations; + auto m_start_time = Clock::now(); + auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; + double elapsed_time, progress_percent, last_progress_time = 0; + bool renew_btree_after_crash = false; + auto cur_flip_idx = 0; + std::uniform_int_distribution<> dis(1, 100); + int flip_percentage = 90; // Set the desired percentage here + bool normal_execution = true; + bool clean_shutdown = true; + // if it is safe then delete all previous save files + if (save_mode) { + std::filesystem::remove_all("/tmp/operations_*.txt"); + std::filesystem::remove_all("/tmp/flips_history.txt"); + } + // init tree + LOGINFO("Step 0: Fill up the tree with {} entries", num_entries); + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); + } else { + operations = generator.generateOperations(num_entries, true /* reset */); + if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + } + auto opstr = SequenceGenerator::printOperations(operations); + LOGINFO("Lets before crash print operations\n{}", opstr); + + for (auto [k, _] : operations) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + generator.setPutFrequency(0); + generator.setRemoveFrequency(100); + + // Trigger the cp to make sure middle part is successful + LOGINFO("Step 0-1: Flush all the entries so far"); + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + this->m_shadow_map.save(this->m_shadow_filename); + // this->print_keys("reapply: after preload"); + this->visualize_keys("tree_after_preload.dot"); + + for (uint32_t round = 1; + round <= rounds && !time_to_stop() && this->tree_key_count() > num_entries_per_rounds; round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + if (load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + flip = line; + LOGINFO("Step 1-{}: Set flag {}", round, flip); + this->set_basic_flip(flip, 1, 100); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + flip = flips[cur_flip_idx++ % flips.size()]; + LOGINFO("Step 1-{}: Set flag {}", round, flip); + this->set_basic_flip(flip, 1, 100); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (save_mode) { + // save the filp name to a file for later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { file << (normal_execution ? "normal" : flip) << "\n"; } + file.close(); + } + } + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + for (auto [k, _] : operations) { + this->remove_one(k, true /* expect_success */); + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); + } + if (elapsed_time - last_progress_time > 30) { + last_progress_time = elapsed_time; + print_time = true; + } + if (print_time) { + LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, + elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, + this->tree_key_count() * 100.0 / num_entries); + } + // this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; + } +} + // Basic reverse and forward order remove with different flip points TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { vector< std::string > flip_points = { From 535b453d1e367cf01d81d979066d38b915715ddc Mon Sep 17 00:00:00 2001 From: shosseinimotlagh Date: Fri, 25 Apr 2025 13:40:24 -0700 Subject: [PATCH 2/5] Fix index crash recovery and enable prefix in memory UT --- src/include/homestore/btree/btree.hpp | 3 +- .../homestore/btree/detail/btree_common.ipp | 84 ++- .../homestore/btree/detail/btree_internal.hpp | 6 +- .../homestore/btree/detail/btree_node.hpp | 25 +- .../homestore/btree/detail/btree_node_mgr.ipp | 10 +- .../btree/detail/btree_remove_impl.ipp | 38 +- .../homestore/btree/detail/simple_node.hpp | 16 +- .../homestore/index/index_internal.hpp | 3 + src/include/homestore/index/index_table.hpp | 680 ++++++++++++++---- src/include/homestore/index_service.hpp | 1 + src/lib/common/homestore_config.fbs | 6 + src/lib/index/index_cp.cpp | 85 ++- src/lib/index/index_cp.hpp | 4 +- src/lib/index/index_service.cpp | 15 +- src/lib/index/wb_cache.cpp | 158 ++-- src/lib/index/wb_cache.hpp | 10 + src/tests/CMakeLists.txt | 2 +- src/tests/btree_helpers/btree_test_helper.hpp | 19 +- src/tests/test_index_btree.cpp | 2 + src/tests/test_index_crash_recovery.cpp | 122 ++-- src/tests/test_mem_btree.cpp | 18 +- src/tests/test_scripts/index_test.py | 25 +- 22 files changed, 1043 insertions(+), 289 deletions(-) diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 0fec83ddd..3ba74623f 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -201,9 +201,10 @@ class Btree { uint64_t get_btree_node_cnt() const; uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; void to_string(bnodeid_t bnodeid, std::string& buf) const; - void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb) const; + void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb, int nindent=-1) const; void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; + void sanity_sub_tree(bnodeid_t bnodeid=0) const; void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void print_node(const bnodeid_t& bnodeid) const; diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp index ecda7e138..43e0c7c60 100644 --- a/src/include/homestore/btree/detail/btree_common.ipp +++ b/src/include/homestore/btree/detail/btree_common.ipp @@ -148,23 +148,27 @@ void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const { template < typename K, typename V > void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, - to_string_cb_t< K, V > const& cb) const { + to_string_cb_t< K, V > const& cb, int nindent) const { BtreeNodePtr node; locktype_t acq_lock = locktype_t::READ; if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; } - fmt::format_to(std::back_inserter(buf), "{}\n", node->to_custom_string(cb)); + if(nindent <0){ + nindent = node->level(); + } + std::string tabs(3*(nindent- node->level()), ' '); + fmt::format_to(std::back_inserter(buf), "{}{}\n", tabs, node->to_custom_string(cb)); if (!node->is_leaf()) { uint32_t i = 0; while (i < node->total_entries()) { BtreeLinkInfo p; node->get_nth_value(i, &p, false); - to_custom_string_internal(p.bnode_id(), buf, cb); + to_custom_string_internal(p.bnode_id(), buf, cb, nindent); ++i; } - if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb); } + if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb, nindent); } } unlock_node(node, acq_lock); } @@ -222,6 +226,35 @@ uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const { return result; } +template < typename K, typename V > +void Btree< K, V >::sanity_sub_tree(bnodeid_t bnodeid) const { + if (bnodeid==0) { + bnodeid= m_root_node_info.bnode_id(); + } + BtreeNodePtr node; + if ( + auto ret = read_node_impl(bnodeid, node); ret!=btree_status_t::success) { + LOGINFO("reading node failed for bnodeid: {} reason: {}", bnodeid, ret); + }else{ + if(node->is_leaf()){ + return; + } + uint32_t nentries = node->has_valid_edge() ? node->total_entries() + 1 : node->total_entries(); + std::vector child_id_list; + child_id_list.reserve(nentries); + BT_REL_ASSERT_NE(node->has_valid_edge() && node->next_bnode() != empty_bnodeid, true, "node {} has valid edge and next id is not empty", node->to_string()); + for (uint32_t i = 0; i < nentries; ++i) { + validate_sanity_child(node, i); + BtreeLinkInfo child_info; + node->get_nth_value(i, &child_info, false /* copy */); + child_id_list.push_back(child_info.bnode_id()); + } + for (auto child_id: child_id_list){ + sanity_sub_tree(child_id); + } + } +} + template < typename K, typename V > void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const { BtreeLinkInfo child_info; @@ -240,26 +273,33 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3 } return; } - child_node->get_first_key(&child_first_key); - child_node->get_last_key(&child_last_key); - BT_REL_ASSERT_LE(child_first_key.compare(&child_last_key), 0); - if (ind == parent_node->total_entries()) { + BT_REL_ASSERT_NE(child_node->is_node_deleted(), true, "child node {} is deleted", child_node->to_string()); + if(ind >= parent_node->total_entries()){ BT_REL_ASSERT_EQ(parent_node->has_valid_edge(), true); - if (ind > 0) { - parent_node->get_nth_key< K >(ind - 1, &parent_key, false); - BT_REL_ASSERT_GT(child_first_key.compare(&parent_key), 0); - BT_REL_ASSERT_LT(parent_key.compare_start(&child_first_key), 0); + if( ind >0){ + parent_key = parent_node->get_nth_key< K >(ind -1, false); } - } else { - parent_node->get_nth_key< K >(ind, &parent_key, false); - BT_REL_ASSERT_LE(child_first_key.compare(&parent_key), 0) - BT_REL_ASSERT_LE(child_last_key.compare(&parent_key), 0) - BT_REL_ASSERT_GE(parent_key.compare_start(&child_first_key), 0) - BT_REL_ASSERT_GE(parent_key.compare_start(&child_first_key), 0) - if (ind != 0) { - parent_node->get_nth_key< K >(ind - 1, &parent_key, false); - BT_REL_ASSERT_GT(child_first_key.compare(&parent_key), 0) - BT_REL_ASSERT_LT(parent_key.compare_start(&child_first_key), 0) + }else + { + parent_key = parent_node->get_nth_key< K >(ind, false); + } + K previous_parent_key; + if( ind >0 && parent_node->total_entries()>0){ + previous_parent_key = parent_node->get_nth_key< K >(ind - 1, false); + } + for (uint32_t i = 0; i total_entries() ; ++i) { + K cur_child_key = child_node->get_nth_key< K >(i, false); + if(ind < parent_node->total_entries()){ + BT_REL_ASSERT_LE(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater than its parent's {} {}-th key", child_node->to_string(), i , parent_node->to_string(), ind); + if(ind>0) { + BT_REL_ASSERT_GT(cur_child_key.compare(previous_parent_key), 0, + " child {} {}-th key is less than its parent's {} {}-th key", child_node->to_string(), + i, parent_node->to_string(), ind - 1); + } + + }else + { + BT_REL_ASSERT_GT(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater than its parent {} {}-th key", child_node->to_string(), i , parent_node->to_string(), ind); } } } diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 7dbc50c0a..8f2b267ac 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -250,19 +250,19 @@ struct BtreeConfig { uint64_t m_min_keys_in_node{0}; #endif bool m_rebalance_turned_on{false}; - bool m_merge_turned_on{true}; btree_node_type m_leaf_node_type{btree_node_type::VAR_OBJECT}; btree_node_type m_int_node_type{btree_node_type::VAR_KEY}; std::string m_btree_name; // Unique name for the btree - + bool m_merge_turned_on{true}; + uint8_t m_max_merge_level{1}; private: uint32_t m_suggested_min_size; // Precomputed values uint32_t m_ideal_fill_size; public: BtreeConfig(uint32_t node_size, const std::string& btree_name = "") : - m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name} { + m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name}{ set_node_data_size(node_size - 512); // Just put estimate at this point of time. } diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 988b683cf..1c45501aa 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -86,9 +86,9 @@ struct persistent_hdr_t { auto sedge = (edge_info.m_bnodeid == empty_bnodeid) ? "" : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); - return fmt::format("id={}{}{} {} level={} nentries={}{} mod_cp={}", node_id, snext, sedge, - leaf ? "LEAF" : "INTERIOR", level, nentries, (node_deleted == 0x1) ? " Deleted" : "", - modified_cp_id); + return fmt::format("id={}{}{} {} level={} nentries={} mod_cp={}{}", node_id, snext, sedge, + leaf ? "LEAF" : "INTERIOR", level, nentries, modified_cp_id, + node_deleted == 0x1 ? " Deleted" : " LIVE"); } }; #pragma pack() @@ -119,7 +119,6 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node; m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node; #endif - } virtual ~BtreeNode() = default; @@ -368,9 +367,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { std::string to_custom_string(to_string_cb_t< K, V > const& cb) const { std::string snext = (this->next_bnode() == empty_bnodeid) ? "" : fmt::format(" next_node={}", this->next_bnode()); - auto str = fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} ", this->node_id(), this->link_version(), - this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, - this->node_gen()); + auto str = + fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} {} ", this->node_id(), this->link_version(), + this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, + this->node_gen(), this->is_node_deleted() ? " **DELETED**" : ""); if (this->has_valid_edge()) { fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -396,12 +396,6 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } - - // Should not happen - if (this->is_node_deleted()) { - fmt::format_to(std::back_inserter(str), " **DELETED** "); - } - return str; } @@ -537,10 +531,9 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); } bool is_merge_needed(const BtreeConfig& cfg) const { + if (level() > cfg.m_max_merge_level) { return false; } #ifdef _PRERELEASE - if (min_keys_in_node()) { - return total_entries() < min_keys_in_node(); - } + if (min_keys_in_node()) { return total_entries() < min_keys_in_node(); } #endif return (occupied_size() < cfg.suggested_min_size()); } diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp b/src/include/homestore/btree/detail/btree_node_mgr.ipp index a5b0317de..aa536a728 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -334,11 +334,11 @@ void Btree< K, V >::_start_of_lock(const BtreeNodePtr& node, locktype_t ltype, c info.node = node.get(); if (ltype == locktype_t::WRITE) { bt_thread_vars()->wr_locked_nodes.push_back(info); - LOGTRACEMOD(btree, "ADDING node {} to write locked nodes list, its size={}", (void*)info.node, + LOGTRACEMOD(btree, "ADDING node {} to write locked nodes list, its size={}", info.node->node_id(), bt_thread_vars()->wr_locked_nodes.size()); } else if (ltype == locktype_t::READ) { bt_thread_vars()->rd_locked_nodes.push_back(info); - LOGTRACEMOD(btree, "ADDING node {} to read locked nodes list, its size={}", (void*)info.node, + LOGTRACEMOD(btree, "ADDING node {} to read locked nodes list, its size={}", info.node->node_id(), bt_thread_vars()->rd_locked_nodes.size()); } else { DEBUG_ASSERT(false, "Invalid locktype_t {}", ltype); @@ -355,7 +355,7 @@ bool Btree< K, V >::remove_locked_node(const BtreeNodePtr& node, locktype_t ltyp if (info.node == node.get()) { *out_info = info; pnode_infos->pop_back(); - LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", (void*)info.node, + LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}",info.node->node_id(), (ltype == locktype_t::WRITE) ? "write" : "read", pnode_infos->size()); return true; } else if (pnode_infos->size() > 1) { @@ -364,7 +364,7 @@ bool Btree< K, V >::remove_locked_node(const BtreeNodePtr& node, locktype_t ltyp *out_info = info; pnode_infos->at(pnode_infos->size() - 2) = pnode_infos->back(); pnode_infos->pop_back(); - LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", (void*)info.node, + LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", info.node->node_id(), (ltype == locktype_t::WRITE) ? "write" : "read", pnode_infos->size()); return true; } @@ -390,7 +390,7 @@ template < typename K, typename V > uint64_t Btree< K, V >::end_of_lock(const BtreeNodePtr& node, locktype_t ltype) { btree_locked_node_info info; if (!remove_locked_node(node, ltype, &info)) { - DEBUG_ASSERT(false, "Expected node = {} is not there in locked_node_list", (void*)node.get()); + DEBUG_ASSERT(false, "Expected node = {} is not there in locked_node_list", node->node_id()); return 0; } // DEBUG_ASSERT_EQ(node.get(), info.node); diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 82213dcc6..ccfe0f584 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -246,6 +246,11 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const _src_cursor_info src_cursor; total_size = leftmost_node->occupied_size(); + uint32_t expected_entities = leftmost_node->total_entries(); +#ifdef _PRERELEASE + const uint64_t max_keys = leftmost_node->max_keys_in_node(); +#endif + for (auto indx = start_idx + 1; indx <= end_idx; ++indx) { if (indx == parent_node->total_entries()) { BT_NODE_LOG_ASSERT(parent_node->has_valid_edge(), parent_node, @@ -271,6 +276,10 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // Only option is to rebalance the nodes across. If we are asked not to do so, skip it. if (!m_bt_cfg.m_rebalance_turned_on) { ret = btree_status_t::merge_not_required; + BT_NODE_LOG( + DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! num_nodes {} is more than old_nodes.size() {}", + parent_node->to_string(), leftmost_node->to_string(), num_nodes, old_nodes.size()); goto out; } } @@ -279,6 +288,10 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const if (leftmost_node->occupied_size() > balanced_size) { // If for some reason balancing increases the current size, give up. // TODO: Is this a real case, isn't happening would mean some sort of bug in calculation of is_merge_needed? + BT_NODE_LOG( + DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! current size {} is more than balanced size {}", + parent_node->to_string(), leftmost_node->to_string(), leftmost_node->occupied_size(), balanced_size); BT_NODE_DBG_ASSERT(false, leftmost_node, "Didn't expect current size is more than balanced size without rebalancing"); ret = btree_status_t::merge_not_required; @@ -294,7 +307,19 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const leftmost_src.ith_nodes.push_back(i); // TODO: check whether value size of the node is greater than available_size? If so nentries is 0. Suppose if a // node contains one entry and the value size is much bigger than available size - auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size); + auto nentries = old_nodes[i]->num_entries_by_size(0, available_size); + +#ifdef _PRERELEASE + if (max_keys) { + if (expected_entities + nentries > max_keys) { + nentries = max_keys - expected_entities; + expected_entities = max_keys; + } else { + expected_entities += nentries; + } + } +#endif + if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in available_size -= old_nodes[i]->occupied_size(); if (i >= old_nodes.size() - 1) { @@ -353,13 +378,22 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // better merge next time. if (new_nodes.size() > old_nodes.size()) { ret = btree_status_t::merge_not_required; + BT_NODE_LOG( + DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! new nodes size {} is more than old nodes size {}", + parent_node->to_string(), leftmost_node->to_string(), new_nodes.size(), old_nodes.size()); goto out; } // There is a case where we are rebalancing and the second node which rebalanced didn't move any size, in that case // the first node is going to be exactly same and we will do again merge, so bail out here. - if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() >= new_nodes[0]->occupied_size())) { + if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() == new_nodes[0]->occupied_size())) { ret = btree_status_t::merge_not_required; + BT_NODE_LOG(DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! old nodes occupied size {} is more than " + "as new nodes occupied size {}", + parent_node->to_string(), leftmost_node->to_string(), old_nodes[0]->occupied_size(), + new_nodes[0]->occupied_size()); goto out; } diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index e85d1190c..25a87c1c1 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -166,6 +166,14 @@ class SimpleNode : public VariantNode< K, V > { nentries = std::min(nentries, other.total_entries() - start_idx); nentries = std::min(nentries, this->get_available_entries()); +#ifdef _PRERELEASE + const uint64_t max_keys = this->max_keys_in_node(); + if(max_keys){ + if(this->total_entries() + nentries > max_keys) { + nentries = max_keys - this->total_entries(); + } + } +#endif uint32_t sz = nentries * get_nth_obj_size(0); if (sz != 0) { std::memcpy(get_nth_obj(this->total_entries()), other.get_nth_obj_const(start_idx), sz); } this->add_entries(nentries); @@ -213,10 +221,10 @@ class SimpleNode : public VariantNode< K, V > { std::string to_string(bool print_friendly = false) const override { auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode()); - auto str = fmt::format("{}id={} level={} nEntries={} {} {} ", + auto str = fmt::format("{}id={} level={} nEntries={} {} {} {}", (print_friendly ? "------------------------------------------------------------\n" : ""), this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), snext); + (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->is_node_deleted()? " Deleted" : " LIVE"); if (this->has_valid_edge()) { fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -379,9 +387,9 @@ class SimpleNode : public VariantNode< K, V > { return (this->node_data_area_const() + (get_nth_obj_size(ind) * ind)); } - void set_nth_key(uint32_t ind, BtreeKey* key) { + void set_nth_key(uint32_t ind, const BtreeKey& key) { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind); - sisl::blob const b = key->serialize(); + sisl::blob const b = key.serialize(); memcpy(entry, b.cbytes(), b.size()); } diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 1ff444650..c411edf70 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -75,6 +75,8 @@ class IndexTableBase { virtual void stop() = 0; virtual void repair_node(IndexBufferPtr const& buf) = 0; virtual void repair_root_node(IndexBufferPtr const& buf) = 0; + virtual void delete_stale_children(IndexBufferPtr const& buf) = 0; + virtual void audit_tree() = 0; }; enum class index_buf_state_t : uint8_t { @@ -94,6 +96,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { cp_id_t m_created_cp_id{-1}; // CP id when this buffer is created. std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? uint8_t* m_bytes{nullptr}; // Actual data buffer + uint32_t m_node_level{0}; //levels of the node in the btree std::shared_ptr< IndexBuffer > m_up_buffer; // Parent buffer in the chain to persisted sisl::atomic_counter< int > m_wait_for_down_buffers{0}; // Number of children need to wait for before persisting diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index a693ddc9e..06703e41f 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -60,6 +60,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { + this->m_bt_cfg.m_merge_turned_on = HS_DYNAMIC_CONFIG(btree.merge_turned_on); + this->m_bt_cfg.m_max_merge_level = HS_DYNAMIC_CONFIG(btree.max_merge_level); // Create a superblk for the index table and create MetaIndexBuffer corresponding to that m_sb.create(sizeof(index_table_sb)); m_sb->uuid = uuid; @@ -77,6 +79,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } IndexTable(superblk< index_table_sb >&& sb, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{std::move(sb)} { + this->m_bt_cfg.m_merge_turned_on = HS_DYNAMIC_CONFIG(btree.merge_turned_on); + this->m_bt_cfg.m_max_merge_level = HS_DYNAMIC_CONFIG(btree.max_merge_level); m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb); // After recovery, we see that root node is empty, which means that after btree is created, we crashed. @@ -98,6 +102,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } + void audit_tree() override { + cp_mgr().cp_guard(); + Btree< K, V >::sanity_sub_tree(); + } + btree_status_t destroy() override { if (is_stopping()) return btree_status_t::stopping; incr_pending_request_num(); @@ -181,6 +190,22 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } + void delete_stale_children(IndexBufferPtr const& idx_buf) override { + if (!idx_buf->is_meta_buf() && idx_buf->m_created_cp_id == -1) { + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto cpg = cp_mgr().cp_guard(); + idx_buf->m_dirtied_cp_id = cpg->id(); + BtreeNodePtr bn = BtreeNodePtr{n}; + + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "delete_stale_links cp={} buf={}", cpg->id(), idx_buf->to_string()); + delete_stale_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { // We cannot repair the meta buf on its own, we need to repair the root node which modifies the @@ -230,6 +255,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { node->set_checksum(); auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY); + idx_node->m_idx_buf->m_node_level = node->level(); if (prev_state == index_buf_state_t::CLEAN) { // It was clean before, dirtying it first time, add it to the wb_cache list to flush if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) { @@ -243,6 +269,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { (int)prev_state, (int)index_buf_state_t::FLUSHING, "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); + BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), + "Writing a node which was not acquired by this cp"); } return btree_status_t::success; } @@ -294,6 +322,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void free_node_impl(const BtreeNodePtr& node, void* context) override { auto n = static_cast< IndexBtreeNode* >(node.get()); + n->m_idx_buf->m_node_level = node->level(); wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } @@ -314,175 +343,562 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return btree_status_t::success; } - btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string()); - // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs - // to be handled. Get the last key in the node - auto const last_parent_key = parent_node->get_last_key< K >(); - auto const is_parent_edge_node = parent_node->has_valid_edge(); - if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { - BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", - parent_node->node_id()); - return btree_status_t::not_found; + btree_status_t delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); + BtreeNodeList free_nodes; + auto nentries = parent_node->total_entries(); + uint32_t deleted = 0; + for (uint32_t i = nentries; i-- > 0;) { + BtreeLinkInfo cur_child_info; + BtreeNodePtr child_node; + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), child_node); ret == btree_status_t::success) { + if (child_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale child node [{}] for parent node [{}]", child_node->to_string(), + parent_node->to_string()); + child_node->set_node_deleted(); + free_node_impl(child_node, cp_ctx); + + if (i > 0) { + BtreeLinkInfo pre_child_info; + parent_node->get_nth_value(i - 1, &pre_child_info, false /* copy */); + // auto ckey = parent_node->get_nth_key< K >(i-1, true); + // parent_node->set_nth_key(i-1, ckey); + parent_node->update(i, pre_child_info); + parent_node->remove(i - 1); + } else { + parent_node->remove(i); + } + + LOGTRACEMOD(wbcache, "so far parent node [{}]", parent_node->to_string()); + // free_nodes.push_back(child_node); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + cur_child_info.bnode_id(), parent_node->to_string(), ret); + } } + if (parent_node->has_valid_edge()) { + auto edge_info = parent_node->get_edge_value(); + BtreeNodePtr edge_node; + if (auto ret = read_node_impl(edge_info.bnode_id(), edge_node); ret == btree_status_t::success) { + if (edge_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale edge node [{}] for parent node [{}]", edge_node->to_string(), + parent_node->to_string()); + edge_node->set_node_deleted(); + free_node_impl(edge_node, cp_ctx); + if (parent_node->total_entries() == 0) { + parent_node->invalidate_edge(); + } else { + BtreeLinkInfo last_child_info; + parent_node->get_nth_value(parent_node->total_entries() - 1, &last_child_info, + false /* copy */); + parent_node->set_edge_value(last_child_info); + parent_node->remove(parent_node->total_entries() - 1); + LOGTRACEMOD(wbcache, "Replacing edge with previous child node [{}] for parent node [{}]", + last_child_info.bnode_id(), parent_node->to_string()); + } - // Get all original child ids as a support to check if we are beyond the last child node - std::set< bnodeid_t > orig_child_ids; - for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { - BtreeLinkInfo link_info; - parent_node->get_nth_value(i, &link_info, true); - orig_child_ids.insert(link_info.bnode_id()); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read edge node {} for parent node [{}] reason {}", + edge_node->to_string(), parent_node->to_string(), ret); + } } - BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), - last_parent_key.to_string()); - - // Get the first child node and its link info - BtreeLinkInfo child_info; - BtreeNodePtr child_node; - auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, - locktype_t::READ, cp_ctx); - if (ret != btree_status_t::success) { - BT_LOG_ASSERT(false, "Parent node={} repair failed, because first child_node get has failed with ret={}", - parent_node->node_id(), enum_name(ret)); + if (deleted /*free_nodes.size()*/) { + btree_status_t ret = btree_status_t::success; + + if ((parent_node->total_entries() == 0) && !parent_node->has_valid_edge()) { + parent_node->set_node_deleted(); + LOGTRACEMOD(wbcache, + "Freeing parent node=[{}] because it is empty and not an edge node but had stale children", + parent_node->to_string()); + ret = write_node_impl(parent_node, cp_ctx); + free_node_impl(parent_node, cp_ctx); + LOGTRACEMOD(wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } else { + ret = write_node_impl(parent_node, cp_ctx); + if (ret != btree_status_t::success) { + LOGTRACEMOD(wbcache, "Failed to write parent node [{}] after deleting stale links", + parent_node->to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } + } + // auto ret = transact_nodes({}, free_nodes, parent_node, nullptr, cp_ctx); return ret; + } else { + LOGTRACEMOD(wbcache, "Accomplishing deleting stale links. No stale links found for parent node [{}]", + parent_node->to_string()); } + return btree_status_t::success; + } - // Keep a copy of the node buffer, in case we need to revert back - uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); + // + btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this + // needs to be handled. Get the last key in the node + + auto last_parent_key = parent_node->get_last_key< K >(); + auto const is_parent_edge_node = parent_node->has_valid_edge(); + if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { + BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", + parent_node->node_id()); + return btree_status_t::not_found; + } - // Remove all the entries in parent_node and let walk across child_nodes rebuild this node - parent_node->remove_all(this->m_bt_cfg); + // Get all original child ids as a support to check if we are beyond the last child node + std::unordered_map< bnodeid_t, K > orig_child_infos; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); + } + LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); + + // Get the first child node and its link info + BtreeLinkInfo child_info; + BtreeNodePtr child_node; + BtreeNodePtr pre_child_node; + auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, + locktype_t::READ, cp_ctx); + if (ret != btree_status_t::success) { + BT_LOG_ASSERT(false, "Parent node={} repair failed, because first child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + return ret; + } - // Walk across all child nodes until it gets the last_parent_key and keep fixing them. - auto cur_parent = parent_node; - BtreeNodeList new_parent_nodes; - do { - if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { - if (child_node->is_node_deleted()) { - // Edge node is merged, we need to set the current last entry as edge - if (cur_parent->total_entries() > 0) { - auto prev_val = V{}; - cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); - cur_parent->remove(cur_parent->total_entries() - 1); - cur_parent->set_edge_value(prev_val); - BT_LOG(INFO, "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", - cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + // update the last key of parent for issue + // 1- last key is X for parent (P) + // 2- check the non deleted last child (A) last key (here is Y) + // start from first child and store the last key of the child node, then traverse to next sibling + // 2-1- if this is greater than parent last key, traverse for sibling of parent until reaches to + //siblings which has keys more than Y or end of list (name this parent sibling node F), + // 2-2- Put last key of F to last key of P + // 2-3 - set F as Next of A + BtreeNodeList siblings; + BtreeNodePtr next_cur_child; + BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), + "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); + if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; + K last_child_last_key; + K last_child_neighbor_key; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; + + bool found_child = false; + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 1 : 0; + + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == + btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD(wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", last_child_last_key.to_string(), + cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); + found_child = true; + break; + } + found_child = true; + break; + } + LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", + cur_child->to_string()); + } + } + + if (found_child) { + LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", + last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); + if (last_child_last_key.compare(last_parent_key) > 0) { + if (next_cur_child) { + last_child_neighbor_key = next_cur_child->get_last_key< K >(); + LOGTRACEMOD(wbcache, + "Voila !! last child_key of child [{}] is greater than its parents [{}] and its next neighbor key is {}", cur_child->to_string(), + parent_node->to_string(), last_child_neighbor_key.to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Last child_key of child [{}] is greater than its parents [{}] and it has no next neighbor", cur_child->to_string(), parent_node->to_string()); + } + + // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 +// or end + // of list (put all siblings in a list, here is F) , + BtreeNodePtr sibling; + BtreeNodePtr true_sibling; + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here? + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; + } + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; + } + } + } + if (true_sibling) { + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", + true_sibling->to_string(), + parent_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + parent_node->to_string()); + } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + parent_node->get_nth_value(0, &first_child_info, false); + } } else { - BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted", - cur_parent->node_id()); + LOGTRACEMOD(wbcache, + "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", parent_node->to_string()); + next_cur_child = nullptr; } - } else { - // Update edge and finish - BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(), - child_node->to_string()); - child_node->set_next_bnode(empty_bnodeid); - write_node_impl(child_node, cp_ctx); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); } - break; } - auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), - child_node->to_string(), child_last_key.to_string()); - - // Check if we are beyond the last child node. - // - // There can be cases where the child level merge is successfully persisted but the parent level is not. - // In this case, you may have your rightmost child node with last key greater than the last_parent_key. - // That's why here we have to check if the child node is one of the original child nodes first. - if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { - if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { - // We have reached a child beyond this parent, we can stop now + // Keep a copy of the node buffer, in case we need to revert back + uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; + std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); + + // Remove all the entries in parent_node and let walk across child_nodes rebuild this node + parent_node->remove_all(this->m_bt_cfg); + + // Walk across all child nodes until it gets the last_parent_key and keep fixing them. + auto cur_parent = parent_node; + BtreeNodeList new_parent_nodes; + do { + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // Edge node is merged, we need to set the current last entry as edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + cur_parent->set_edge_value(prev_val); + LOGTRACEMOD(wbcache, + "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + LOGTRACEMOD(wbcache, "Found an empty interior node {} with maybe all childs deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + if (is_parent_edge_node) { + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), + child_node->link_version()}); + } else { + auto tsib_id = findTrueSibling(cur_parent); + if (tsib_id != empty_bnodeid) { + cur_parent->set_next_bnode(tsib_id); + LOGTRACEMOD(wbcache, + "True sibling [{}] for parent_node [{}], So don't add child [{}] here ", + tsib_id, cur_parent->to_string(), child_node->to_string()); + } else { + cur_parent->set_next_bnode(empty_bnodeid); + // if this child node previously belonged to this parent node, we need to add it but as edge o.w, not this node + if (orig_child_infos.contains(child_node->node_id())){ + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), + child_node->link_version()}); + LOGTRACEMOD(wbcache, + "Child node [{}] is an edge node and previously belong to this parent, so we need to add it as edge", + child_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + cur_parent->to_string()); + } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), + "Parent node [{}] cannot be empty", cur_parent->to_string()); + } + } + +// +// } + break; + } break; } - } - if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), - BtreeLinkInfo::get_fixed_size())) { - // No room in the parent_node, let us split the parent_node and continue - auto new_parent = this->alloc_interior_node(); - if (new_parent == nullptr) { - ret = btree_status_t::space_not_avail; - break; + auto child_last_key = child_node->get_last_key< K >(); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), + child_node->to_string(), child_last_key.to_string()); + + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is + // not. In this case, you may have your rightmost child node with last key greater than the + // last_parent_key. That's why here we have to check if the child node is one of the original child + // nodes first. + if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + if (child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + // TODO this case if child last key is less than last parent key to update the parent node. + // this case can potentially break the btree for put and remove op. + break; + } + if (child_node->total_entries() == 0) { + // this child has no entries, but maybe in the middle of the parent node, we need to update the key + // of parent as previous one and go on + LOGTRACEMOD(wbcache, + "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence loop ends", child_node->to_string()); + // now update the next of parent node by skipping all deleted siblings of this parent node + auto valid_sibling = cur_parent->next_bnode(); + while (valid_sibling != empty_bnodeid) { + BtreeNodePtr sibling; + if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { + if (sibling->is_node_deleted()) { + valid_sibling = sibling->next_bnode(); + continue; + } + // cur_parent->set_next_bnode(sibling->node_id()); + break; + } + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + valid_sibling, cur_parent->to_string(), ret); + } + if (valid_sibling != empty_bnodeid) { + cur_parent->set_next_bnode(valid_sibling); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + + } else { + cur_parent->set_next_bnode(empty_bnodeid); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + } + + break; + } } - new_parent->set_next_bnode(cur_parent->next_bnode()); - cur_parent->set_next_bnode(new_parent->node_id()); - new_parent->set_level(cur_parent->level()); - cur_parent->inc_link_version(); + if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), + BtreeLinkInfo::get_fixed_size())) { + // No room in the parent_node, let us split the parent_node and continue + auto new_parent = this->alloc_interior_node(); + if (new_parent == nullptr) { + ret = btree_status_t::space_not_avail; + break; + } - new_parent_nodes.push_back(new_parent); - cur_parent = std::move(new_parent); - } + new_parent->set_next_bnode(cur_parent->next_bnode()); + cur_parent->set_next_bnode(new_parent->node_id()); + new_parent->set_level(cur_parent->level()); + cur_parent->inc_link_version(); - // Insert the last key of the child node into parent node - if (!child_node->is_node_deleted()) { - cur_parent->insert(cur_parent->total_entries(), - child_node->total_entries() > 0 ? child_last_key : last_parent_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); - if (child_node->total_entries() == 0) { - // There should be at most one empty child node per parent - if we find one, we should stop here - BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), - child_node->to_string()); - break; + new_parent_nodes.push_back(new_parent); + cur_parent = std::move(new_parent); } - } else { - // Node deleted indicates it's freed & no longer used during recovery - BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", - cur_parent->node_id(), child_node->to_string()); - } - BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); + // Insert the last key of the child node into parent node + if (!child_node->is_node_deleted()) { + if (child_node->total_entries() == 0) { + if (orig_child_infos.contains(child_node->node_id())) { + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}], but not the end of the parent node, so we need to update the key of parent node as original one {}", + child_node->to_string(), child_last_key.to_string()); + } else { + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}] but not belonging to this parent (probably next parent sibling); Hence end loop", child_node->to_string()); + break; + } + } + cur_parent->insert(cur_parent->total_entries(), child_last_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + // Node deleted indicates it's freed & no longer used during recovery + LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + if (pre_child_node) { + // We need to update the next of the previous child node to this child node + + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] to this child node [{}]", cur_parent->node_id(), child_node->to_string(), + pre_child_node->to_string(), child_node->next_bnode()); + pre_child_node->set_next_bnode(child_node->next_bnode()); + // repairing the next of previous child node + // We need to set the state of the previous child node to clean, so that it can be flushed + IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); + idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + write_node_impl(pre_child_node, cp_ctx); + // update the key of last entry of the parent with the last key of deleted child + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), + child_last_key.to_string()); + // update it here to go to the next child node and unlock this node + LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); + child_node->set_next_bnode(child_node->next_bnode()); + } + } - // Move to the next child node - auto const next_node_id = child_node->next_bnode(); - this->unlock_node(child_node, locktype_t::READ); - if (next_node_id == empty_bnodeid) { - // This can be a deleted edge node - only check if it is still valid + LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), + cur_parent->to_string()); + + // Move to the next child node + auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); if (!child_node->is_node_deleted()) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " - "repair is partial", - child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; + // We need to free the child node + pre_child_node = child_node; + } + if (next_node_id == empty_bnodeid) { + // This can be a deleted edge node - only check if it is still valid + if (!child_node->is_node_deleted()) { + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} repair is partial", child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; + } + child_node = nullptr; + break; + } + if (next_cur_child && next_node_id == next_cur_child->node_id()) { + // We are at the last child node, we can stop now + LOGTRACEMOD( + wbcache, + "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop", child_node->node_id(), next_cur_child->to_string()); + child_node = nullptr; + break; + } + ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); + if (ret != btree_status_t::success) { + BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", + parent_node->node_id(), enum_name(ret)); + child_node = nullptr; + break; + } + + } while (true); + + if (child_node) { this->unlock_node(child_node, locktype_t::READ); } + // if last parent has the key less than the last child key, then we need to update the parent node with + // the last child key if it doesn't have edge. + auto last_parent = parent_node; + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } + if (last_parent->total_entries() && !last_parent->has_valid_edge()) { + if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { + BtreeLinkInfo child_info; + last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); + parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); + LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", + parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); + } + // if last key of children is less than the last key of parent, then we need to update the last key of non interior child + if (last_parent->level() > 1 && !last_parent->has_valid_edge()) { + // read last child + BtreeNodePtr last_child; + BtreeLinkInfo child_info; + auto total_entries = last_parent->total_entries(); + last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */); + if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) { + // get last key of cur child + auto last_child_key = last_child->get_last_key< K >(); + BtreeLinkInfo last_child_info; + last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/); + if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) { + auto cur_child_st = last_child->to_string(); + last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info); + LOGTRACEMOD(wbcache, + "Updating interior child node={} with last_parent_key={} and child_info={}", + cur_child_st, last_parent_key.to_string(), last_child_info.to_string()); + write_node_impl(last_child, cp_ctx); + } + } } - child_node = nullptr; - break; } - ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); - if (ret != btree_status_t::success) { - BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", - parent_node->node_id(), enum_name(ret)); - child_node = nullptr; - break; + if (ret == btree_status_t::success) { + // Make write_buf happy for the parent node in case of multiple write (stale repair and link repair) + IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get()); + p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); } - } while (true); - if (child_node) { this->unlock_node(child_node, locktype_t::READ); } + if (ret != btree_status_t::success) { + BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair", + enum_name(ret), parent_node->node_id()); + std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size()); + } - if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { - // We shouldn't have an empty interior node in the tree, let's delete it. - // The buf will be released by the caller - BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id()); - parent_node->set_node_deleted(); + delete[] tmp_buffer; + return ret; } - if (ret == btree_status_t::success) { - ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); + bnodeid_t findTrueSibling(BtreeNodePtr const& node) { + if (node == nullptr) return empty_bnodeid; + bnodeid_t sibling_id = empty_bnodeid; + if (node->has_valid_edge()) { + sibling_id = node->get_edge_value().bnode_id(); + } else { + sibling_id = node->next_bnode(); } + if (sibling_id == empty_bnodeid) { + return empty_bnodeid; + } else { + BtreeNodePtr sibling_node; + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } - if (ret != btree_status_t::success) { - BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair", - enum_name(ret), parent_node->node_id()); - std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size()); + if (sibling_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", sibling_node->to_string(), node->to_string()); + return findTrueSibling(sibling_node); + } else { + return sibling_id; + } } + return sibling_id; + } - delete[] tmp_buffer; - return ret; + K get_last_true_child_key(BtreeNodePtr const& parent_node) { + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 1 : 0; + BtreeLinkInfo cur_child_info; + BtreeNodePtr cur_child; + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted()) { + if (cur_child->total_entries()) { + return cur_child->get_last_key< K >(); + } else { + LOGTRACEMOD(wbcache, "Last valid child {} has no entries", cur_child->to_string()); + } + } + } + } } + }; } // namespace homestore diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 801cace13..e14f6c18f 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -89,6 +89,7 @@ class IndexService { // the following methods are used wb_cache , which will not used by upper layer. so graceful shutdown just skips // them for now. void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf); + void parent_recover(uint32_t ordinal, IndexBufferPtr const& node_buf); void update_root(uint32_t ordinal, IndexBufferPtr const& node_buf); IndexWBCacheBase& wb_cache() { diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index a661da497..cf400ee73 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -57,6 +57,12 @@ table Btree { max_nodes_to_rebalance: uint32 = 3; mem_btree_page_size: uint32 = 8192; + + /* Maximum level of btree merge operation enabled while removig keys. */ + max_merge_level: uint8 = 1; + + /* Merge enabled */ + merge_turned_on: bool = true; } table Cache { diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 122667726..fd411526a 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -58,6 +58,7 @@ void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBuffe rec->append(op_t::child_new, buf->blkid()); } for (auto const& buf : freed_bufs) { + rec->free_node_level = buf->m_node_level; rec->append(op_t::child_freed, buf->blkid()); } } @@ -235,7 +236,78 @@ std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) { cur_ptr += rec->size(); LOGTRACEMOD(wbcache, "Recovered txn record: {}: {}", t, rec->to_string()); } + auto modifyBuffer = [](IndexBufferPtr& buffer) { + IndexBufferPtr up_buf = buffer->m_up_buffer; + auto real_up_buf = up_buf; + while (real_up_buf && real_up_buf->m_node_freed) { + real_up_buf = real_up_buf->m_up_buffer; + } + if (real_up_buf != up_buf) { + up_buf->remove_down_buffer(buffer); + buffer->m_up_buffer = real_up_buf; + real_up_buf->add_down_buffer(buffer); + LOGTRACEMOD(wbcache, "Change upbuffer from {} to {}", up_buf->to_string(), + buffer->m_up_buffer->to_string()); + } + }; +#if 0 + auto dag_print = [](const std::map< BlkId, IndexBufferPtr >& dags, std::string delimiter) { + int index = 1; + for (const auto& [blkid, bufferPtr] : dags) { + LOGTRACEMOD(wbcache, "{}{} - blkid {} buffer {} ", delimiter, index++, blkid.to_integer(), + bufferPtr->to_string()); + } + }; + LOGTRACEMOD(wbcache,"Before modify : \n "); + dag_print(buf_map, "Before: "); +#endif + for (auto& [blkid, bufferPtr] : buf_map) { + modifyBuffer(bufferPtr); + } + // LOGTRACEMOD(wbcache,"\n\n\nAFTER modify : \n "); + // dag_print(buf_map, "After: "); + + auto sanityCheck = [](const std::map< BlkId, IndexBufferPtr >& dags) { + for (const auto& [blkid, bufferPtr] : dags) { + auto up_buffer = bufferPtr->m_up_buffer; + if (up_buffer) { + HS_REL_ASSERT( + !up_buffer->m_node_freed, + "Sanity check failed: Buffer {} blkdid {} has an up_buffer {} blkid that is marked as freed.", + bufferPtr->to_string(), blkid.to_integer(), up_buffer->to_string(), + up_buffer->blkid().to_integer()); + HS_REL_ASSERT_EQ(up_buffer->m_created_cp_id, -1, + "Sanity check failed: Buffer {} has an up_buffer {} that just created", + bufferPtr->to_string(), up_buffer->to_string()); + HS_REL_ASSERT_EQ(up_buffer->m_index_ordinal, bufferPtr->m_index_ordinal, + "Sanity check failed: Buffer {} has an up_buffer {} that has different index_ordinal.", + bufferPtr->to_string(), up_buffer->to_string()); + HS_REL_ASSERT(!bufferPtr->is_meta_buf(), + "Sanity check failed: down buffer {} is meta buffer of up buffer {}", + bufferPtr->to_string(), up_buffer->to_string()); + HS_REL_ASSERT( + !up_buffer->m_wait_for_down_buffers.testz(), + "Sanity check failed: Buffer {} has an up_buffer {} that has zero m_wait_for_down_buffers.", + bufferPtr->to_string(), up_buffer->to_string()); +#ifdef _PRERELEASE + HS_DBG_ASSERT(up_buffer->is_in_down_buffers(bufferPtr), + "Sanity check failed: up_buffer {} has't {} as a down_buffer.", up_buffer->to_string(), + bufferPtr->to_string()); +#endif + } + HS_REL_ASSERT(!bufferPtr->m_node_freed || bufferPtr->m_wait_for_down_buffers.testz(), + "Sanity check failed: Freed buffer {} has non-zero m_wait_for_down_buffers.", + bufferPtr->to_string()); +#ifdef _PRERELEASE + HS_DBG_ASSERT(bufferPtr->m_wait_for_down_buffers.test_eq(bufferPtr->m_down_buffers.size()), + "Sanity check failed: Buffer {} has a mismatch between down_buffers_count and " + "m_wait_for_down_buffers.", + bufferPtr->to_string()); +#endif + } + }; + sanityCheck(buf_map); return buf_map; } @@ -264,7 +336,14 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, } if (up_buf) { - auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf; + auto real_up_buf = up_buf; + if (up_buf->m_created_cp_id == cpg->id()) { + real_up_buf = up_buf->m_up_buffer; + } else if (up_buf->m_node_freed) { + real_up_buf = up_buf->m_up_buffer; + LOGTRACEMOD(wbcache, "\n\n change upbuffer from {} to {}\n\n", up_buf->to_string(), + real_up_buf->to_string()); + } #ifndef NDEBUG // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;} @@ -299,6 +378,7 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, for (uint8_t idx{0}; idx < rec->num_freed_ids; ++idx) { auto freed_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), inplace_child_buf ? inplace_child_buf : parent_buf); + freed_buf->m_node_level = rec->free_node_level; freed_buf->m_node_freed = true; } } @@ -337,6 +417,9 @@ std::string IndexCPContext::txn_record::to_string() const { fmt::format_to(std::back_inserter(str), ", freed_ids=["); add_to_string(str, idx, num_freed_ids); + if (num_freed_ids) { + fmt::format_to(std::back_inserter(str), ", freed_node_level= {}", (uint8_t)(free_node_level)); + }; fmt::format_to(std::back_inserter(str), "{}", (is_parent_meta ? ", parent is meta" : "")); return str; } diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index d7bd124df..dffb3113c 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -37,7 +37,8 @@ struct IndexCPContext : public VDevCPContext { uint8_t has_inplace_parent : 1; // Do we have parent_id in the list of ids. It will be first uint8_t has_inplace_child : 1; // Do we have child_id in the list of ids. It will be second uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer - uint8_t reserved1 : 5; + uint8_t free_node_level : 4; // Free/created node level + uint8_t reserved1 : 1; uint8_t num_new_ids; uint8_t num_freed_ids; uint8_t reserved{0}; @@ -48,6 +49,7 @@ struct IndexCPContext : public VDevCPContext { has_inplace_parent{0x0}, has_inplace_child{0x0}, is_parent_meta{0x0}, + free_node_level{0x0}, num_new_ids{0}, num_freed_ids{0}, index_ordinal{ordinal} {} diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index 8e8f47bef..76da72842 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -88,6 +88,9 @@ void IndexService::start() { std::unique_lock lg(m_index_map_mtx); for (const auto& [_, tbl] : m_index_map) { tbl->recovery_completed(); +#ifdef _PRERELEASE + tbl->audit_tree(); +#endif } // Force taking cp after recovery done. This makes sure that the index table is in consistent state and dirty buffer // after recovery can be added to dirty list for flushing in the new cp @@ -161,6 +164,16 @@ void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& nod } } +void IndexService::parent_recover(uint32_t ordinal, IndexBufferPtr const& node_buf) { + auto tbl = get_index_table(node_buf->m_index_ordinal); + if (tbl) { + tbl->delete_stale_children(node_buf); + } else { + HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", + node_buf->m_index_ordinal); + } +} + void IndexService::update_root(uint32_t ordinal, IndexBufferPtr const& node_buf) { auto tbl = get_index_table(ordinal); if (tbl) { @@ -264,7 +277,7 @@ void IndexBuffer::remove_down_buffer(const IndexBufferPtr& buf) { } } } - HS_DBG_ASSERT(found, "Down buffer is linked to up_buf, but up_buf doesn't have down_buf in its list"); + HS_DBG_ASSERT(found, "Down buffer {} is linked to up_buf, but up_buf {} doesn't have down_buf in its list", buf->to_string(), buf->m_up_buffer? buf->m_up_buffer->to_string(): std::string("nulptr")); #endif } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 4ac648816..f0818f45e 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -106,6 +106,7 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { idx_buf->m_created_cp_id = cpg->id(); idx_buf->m_dirtied_cp_id = cpg->id(); auto node = node_initializer(idx_buf); + idx_buf->m_node_level = node->level(); if (!m_in_recovery) { // Add the node to the cache. Skip if we are in recovery mode. @@ -127,6 +128,7 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } else { + LOGTRACEMOD(wbcache, "write buf [{}] in recovery mode", buf->to_string()); m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid); } } else { @@ -141,7 +143,7 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t auto const blkid = BlkId{id}; retry: - // Check if the blkid is already in cache, if not load and put it into the cache + // Check if the blkid is already in cache, if notL load and put it into the cache if (!m_in_recovery && m_cache.get(blkid, node)) { return; } // Read the buffer from virtual device @@ -179,6 +181,7 @@ bool IndexWBCache::get_writable_buf(const BtreeNodePtr& node, CPContext* context // If its not clean, we do deep copy. auto new_buf = std::make_shared< IndexBuffer >(idx_buf->m_blkid, m_node_size, m_vdev->align_size()); new_buf->m_created_cp_id = idx_buf->m_created_cp_id; + new_buf->m_node_level = idx_buf->m_node_level; std::memcpy(new_buf->raw_buffer(), idx_buf->raw_buffer(), m_node_size); node->update_phys_buf(new_buf->raw_buffer()); @@ -299,11 +302,11 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p if (new_node_bufs.empty() && freed_node_bufs.empty()) { // This is an update for meta, root transaction. - if (child_buf->m_created_cp_id != -1) { - DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(), - "Root buffer is not created by current cp (for split root), its not expected"); + if (child_buf->m_created_cp_id < icp_ctx->id()) { + icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, child_buf, {}, {}); + } else { + icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } - icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, new_node_bufs, freed_node_bufs); @@ -319,7 +322,7 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } if (new_node_bufs.empty() && freed_node_bufs.empty()) { - fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + fmt::format_to(std::back_inserter(txn), "{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, (parent_buf && parent_buf->blkid().to_integer() != 0) ? std::to_string(parent_buf->blkid().to_integer()) : "empty", @@ -340,10 +343,10 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p ? std::to_string(child_buf->blkid().to_integer()) : "empty"; - fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str, + fmt::format_to(std::back_inserter(txn), ": {} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str, child_str, new_nodes, freed_nodes); } - LOGTRACEMOD(wbcache, "\ttranasction till now: cp: {} \n{}\n", icp_ctx->id(), txn); + LOGTRACEMOD(wbcache, "tranasction till now: cp: {} {}", icp_ctx->id(), txn); txn_id++; #endif #if 0 @@ -447,15 +450,7 @@ void IndexWBCache::load_buf(IndexBufferPtr const& buf) { } } -struct DagNode { - IndexBufferPtr buffer; - std::vector< shared< DagNode > > children; -}; - -using DagPtr = std::shared_ptr< DagNode >; -using DagMap = std::map< IndexBufferPtr, DagPtr >; - -static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { +IndexWBCache::DagMap IndexWBCache::generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { std::vector< IndexBufferPtr > bufs; std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; }); @@ -497,7 +492,7 @@ static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { return generateDagMap(bufs); } -static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) { +std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) { std::string str{fmt::format("#_of_dags={}\n", dags.size())}; int cnt = 1; for (const auto& [_, dag] : dags) { @@ -508,6 +503,7 @@ static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) { stack.pop_back(); auto snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : ""; auto sfree = node->buffer->m_node_freed ? "FREED" : ""; + load_buf(node->buffer); fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index, node->buffer->to_string(), snew, sfree); int c = node->children.size(); @@ -556,10 +552,8 @@ void IndexWBCache::recover(sisl::byte_view sb) { return log; }; - std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); - LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); auto dags = generate_dag_buffers(bufs); - LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); + LOGTRACEMOD(wbcache, "before processing recovery DAGS:\n {}\n\n\n\n", to_string_dag_bufs(dags, icp_ctx->id())); #endif // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one @@ -575,27 +569,60 @@ void IndexWBCache::recover(sisl::byte_view sb) { // On the second pass, we only take part of the parents/siblings and then repair them, if needed. std::vector< IndexBufferPtr > pending_bufs; std::vector< IndexBufferPtr > deleted_bufs; + std::multiset< IndexBufferPtr, bool (*)(const IndexBufferPtr&, const IndexBufferPtr&) > + potential_parent_recovered_bufs( + [](const IndexBufferPtr& a, const IndexBufferPtr& b) { return a->m_node_level < b->m_node_level; }); + + LOGTRACEMOD(wbcache, "\n\n\nRecovery processing begins\n\n\n"); for (auto const& [_, buf] : bufs) { + load_buf(buf); + if (buf->m_node_freed) { - // Freed node - load_buf(buf); + LOGTRACEMOD(wbcache, "recovering free buf {}", buf->to_string()); if (was_node_committed(buf)) { // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true; - write_buf(nullptr, buf, icp_ctx); + write_buf(nullptr, buf, icp_ctx); // no need to write it here !! deleted_bufs.push_back(buf); pending_bufs.push_back(buf->m_up_buffer); + LOGINFOMOD(wbcache, "Freeing deleted buf {} and adding up buffer to pending {}", buf->to_string(), + buf->m_up_buffer->to_string()); } else { // (Up) buffer is not committed, node need to be kept and (potentially) repaired later - buf->m_node_freed = false; - if (buf->m_created_cp_id == icp_ctx->id()) { - // New nodes need to be commited first + if (buf->m_created_cp_id != icp_ctx->id()) { + LOGTRACEMOD(wbcache, + "NOT FREE committing buffer {} node deleted is false reason: node commited?= {} " + "up committed? {}", + buf->to_string(), was_node_committed(buf), was_node_committed(buf->m_up_buffer)); + buf->m_node_freed = false; + r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = false; m_vdev->commit_blk(buf->m_blkid); + // it can happen when children moved to one of right parent sibling and then the previous node is deleted but not commited during crash (upbuffer is not committed). but its children already committed. and freed (or changed) + if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); } + } else { + LOGINFO("deleting and creating new buf {}", buf->to_string()); + deleted_bufs.push_back(buf); + } + // 1- upbuffer was dirtied by the same cp, so it is not commited, so we don't need to repair it. + // remove it from down_waiting list (probably recursively going up) 2- upbuffer was created and + // freed at the same cp, so it is not commited, so we don't need to repair it. + if (buf->m_up_buffer) { + LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(), + buf->m_up_buffer->to_string()); + buf->m_up_buffer->remove_down_buffer(buf); + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + LOGINFOMOD(wbcache, + "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", + buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nullptr"), + buf->to_string()); + update_up_buffer_counters(buf->m_up_buffer /*,visited_bufs*/); + } + buf->m_up_buffer = nullptr; } - pending_bufs.push_back(buf); - buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency } } else if (buf->m_created_cp_id == icp_ctx->id()) { + LOGTRACEMOD(wbcache, "recovering new buf {}", buf->to_string()); // New node if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { // Both current and up buffer is commited, we can safely commit the current block @@ -604,31 +631,63 @@ void IndexWBCache::recover(sisl::byte_view sb) { } else { // Up buffer is not committed, we need to repair it first buf->m_up_buffer->remove_down_buffer(buf); - // buf->m_up_buffer = nullptr; if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + LOGINFOMOD(wbcache, "\npruning due to zero dependency of child\n up buffer {} \n buffer \n{}", + buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nullptr"), + buf->to_string()); update_up_buffer_counters(buf->m_up_buffer); } +// buf->m_up_buffer = nullptr; } } } - + LOGTRACEMOD(wbcache, "\n\n\nRecovery processing Ends\n\n\n"); #ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", pending_bufs.size(), bufs.size(), icp_ctx->id()); - LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); - LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); -#endif + // add deleted bufs to logs here as well + auto modified_dags = generate_dag_buffers(bufs); + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log({}, pending_bufs)); + LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(modified_dags, icp_ctx->id())); - for (auto const& buf : pending_bufs) { - recover_buf(buf); - if (buf->m_bytes != nullptr && r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { +#endif + uint32_t cnt = 0; + LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})", + potential_parent_recovered_bufs.size()); + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - check stale recovered buf {}", cnt++, buf->to_string()); + } + // This step is needed since there is a case where all(or some) children of an interior node is freed (after moving + // to a previous sibling parent) and after crash, this node has stale links to its children + cnt = 0; + std::vector buffers_to_repair; + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - potential parent recovered buf {}", cnt, buf->to_string()); + parent_recover(buf); + if (buf->m_bytes == nullptr || r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { // This buffer was marked as deleted during repair, so we also need to free it deleted_bufs.push_back(buf); + }else + { + // This buffer was not marked as deleted during repair, so we need to repair it + buffers_to_repair.push_back(buf); } } + // let all unfreed buffers to be repaired first. This is important to let detect and remove all stale links first + // and then repair them before actual repair (due to dependency of finding true siblings) + for (auto const& buf : buffers_to_repair) { + LOGTRACEMOD(wbcache, "recover and repairing unfreed non-stale link interior node buf {}", buf->to_string()); + index_service().repair_index_node(buf->m_index_ordinal, buf); + } + // actual recover is done here in recovery path + for (auto const& buf : pending_bufs) { + LOGTRACEMOD(wbcache, "recover and repairing up_buffer buf {}", buf->to_string()); + recover_buf(buf); + } for (auto const& buf : deleted_bufs) { + LOGTRACEMOD(wbcache, "freeing buf after repairing (last step) {}", buf->to_string()); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } @@ -636,6 +695,9 @@ void IndexWBCache::recover(sisl::byte_view sb) { m_vdev->recovery_completed(); } +void IndexWBCache::parent_recover(IndexBufferPtr const& buf) { + index_service().parent_recover(buf->m_index_ordinal, buf); +} // if buf->m_wait_for_down_buffers.testz() is true (which means that it has no dependency on any other buffer) then we // can decrement the wait_for_down_buffers of its up buffer. If the up buffer has up buffer, then we need to decrement // its wait_for_down_buffers. If the up buffer of up buffer has wait_for_down_buffers as 0, then we need to decrement @@ -643,14 +705,16 @@ void IndexWBCache::recover(sisl::byte_view sb) { // wait_for_down_buffers as 0, then we need to decrement its wait_for_down_buffers. void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) { if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) { - LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers"); + LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers\n"); return; } auto grand_buf = buf->m_up_buffer; - grand_buf->remove_down_buffer(buf); LOGINFOMOD(wbcache, - "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up", + "Decrementing wait_for_down_buffers due to zero dependency of child for grand_buffer {} up_buffer {}, " + "Keep going up", grand_buf->to_string(), buf->to_string()); + grand_buf->remove_down_buffer(buf); + buf->m_up_buffer = nullptr; update_up_buffer_counters(grand_buf); } @@ -685,7 +749,7 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { // If the node is freed, then it can be considered committed as long as its up buffer was committed if (buf->m_node_freed) { - HS_DBG_ASSERT(buf->m_up_buffer, "Buf was marked deleted, but doesn't have an up_buffer"); + HS_DBG_ASSERT(buf->m_up_buffer, "Buf {} was marked deleted, but doesn't have an up_buffer", buf->to_string()); return was_node_committed(buf->m_up_buffer); } @@ -697,8 +761,7 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), - cp_ctx->to_string()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}", cp_ctx->to_string_with_dags()); // #ifdef _PRERELEASE // static int id = 0; // auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; @@ -778,19 +841,20 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { - LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), + LOGTRACEMOD(wbcache, "cp {} Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } + } catch (const std::runtime_error& e) { + std::call_once(flag, + []() { LOGERROR("Crash simulation is ongoing; aid simulation by not flushing."); }); + } }); - if (!part_of_batch) { m_vdev->submit_batch(); } } } diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 7d10d7f54..82221ab0d 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -61,6 +61,13 @@ class IndexWBCache : public IndexWBCacheBase { folly::Future< bool > async_cp_flush(IndexCPContext* context); IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const; void recover(sisl::byte_view sb) override; + struct DagNode { + IndexBufferPtr buffer; + std::vector< shared< DagNode > > children; + }; + + using DagPtr = std::shared_ptr< DagNode >; + using DagMap = std::map< IndexBufferPtr, DagPtr >; private: void start_flush_threads(); @@ -77,6 +84,9 @@ class IndexWBCache : public IndexWBCacheBase { IndexBufferPtrList& bufs); void recover_buf(IndexBufferPtr const& buf); + void parent_recover(IndexBufferPtr const& buf); + std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0); + DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap); bool was_node_committed(IndexBufferPtr const& buf); void load_buf(IndexBufferPtr const& buf); void update_up_buffer_counters(IndexBufferPtr const& buf); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index cba159954..f255ea81b 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -34,7 +34,7 @@ if (${build_nonio_tests}) set(TEST_MEMBTREE_SOURCE_FILES test_mem_btree.cpp) add_executable(test_mem_btree ${TEST_MEMBTREE_SOURCE_FILES}) - target_link_libraries(test_mem_btree ${COMMON_TEST_DEPS} GTest::gtest) + target_link_libraries(test_mem_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME MemBtree COMMAND test_mem_btree) set_tests_properties(MemBtree PROPERTIES TIMEOUT 1200) diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 3ab8632e6..227fd0ee0 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -25,7 +25,7 @@ #include #include #include - +#include "common/homestore_config.hpp" #include "test_common/range_scheduler.hpp" #include "shadow_map.hpp" @@ -44,8 +44,17 @@ struct BtreeTestHelper { void SetUp() { m_cfg.m_leaf_node_type = T::leaf_node_type; m_cfg.m_int_node_type = T::interior_node_type; + if (SISL_OPTIONS.count("disable_merge")) { + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.btree.merge_turned_on = false; + HS_SETTINGS_FACTORY().save(); + }); + } + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.btree.max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); + HS_SETTINGS_FACTORY().save(); + }); m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >(); - if (SISL_OPTIONS.count("disable_merge")) { m_cfg.m_merge_turned_on = false; } if (m_is_multi_threaded) { std::mutex mtx; @@ -225,16 +234,14 @@ struct BtreeTestHelper { rreq.enable_route_tracing(); bool removed = (m_bt->remove(rreq) == btree_status_t::success); - if(care_success) { + if (care_success) { ASSERT_EQ(removed, m_shadow_map.exists(*pk)) << "Removal of key " << pk->key() << " status doesn't match with shadow"; if (removed) { m_shadow_map.remove_and_check(*pk, *existing_v); } - }else { + } else { // Do not care if the key is not present in the btree, just cleanup the shadow map m_shadow_map.erase(*pk); } - - } void remove_random() { diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 6083140bf..c62ce2f84 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -49,6 +49,8 @@ SISL_OPTION_GROUP( (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", ::cxxopts::value< bool >()->default_value("1"), ""), + (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("127"), + ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 7aa56f090..9121f7240 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -49,6 +49,8 @@ SISL_OPTION_GROUP( ""), (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), ""), + (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("1"), ""), + (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", @@ -329,6 +331,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } + void destroy_btree() { + hs()->index_service().remove_index_table(this->m_bt); + this->m_bt->destroy(); + this->trigger_cp(true); + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("destroy btree - erase shadow map {}", m_shadow_filename); + } + void restart_homestore(uint32_t shutdown_delay_sec = 3) override { this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this); LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n"); @@ -441,8 +452,9 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Sanity check passed for {} keys!", count); } - void crash_and_recover(OperationList& operations, std::string filename = "") { - // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") { + this->remove_flip(flip); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", this->m_shadow_map.size(), tree_key_count(), operations.size()); @@ -461,7 +473,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); } - // this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -473,7 +485,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); } - // this->print_keys("Post reapply, btree structure: "); + // this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -580,7 +592,7 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); } - this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + this->crash_and_recover(flips[i], operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); if (renew_btree_after_crash) { this->reset_btree(); }; } } @@ -709,8 +721,7 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { } } else { // remove the flips so that they do not get triggered erroneously - this->remove_flip(flip); - this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); + this->crash_and_recover(flip, operations, fmt::format("long_tree_{}", round)); } if (elapsed_time - last_progress_time > 30) { last_progress_time = elapsed_time; @@ -726,6 +737,8 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } + this->destroy_btree(); + log_obj_life_counter(); } TYPED_TEST(IndexCrashTest, long_running_remove_crash) { @@ -764,8 +777,8 @@ TYPED_TEST(IndexCrashTest, long_running_remove_crash) { operations = generator.generateOperations(num_entries, true /* reset */); if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } } - auto opstr = SequenceGenerator::printOperations(operations); - LOGINFO("Lets before crash print operations\n{}", opstr); + // auto opstr = SequenceGenerator::printOperations(operations); + // LOGINFO("Lets before crash print operations\n{}", opstr); for (auto [k, _] : operations) { this->put(k, btree_put_type::INSERT, true /* expect_success */); @@ -778,12 +791,13 @@ TYPED_TEST(IndexCrashTest, long_running_remove_crash) { test_common::HSTestHelper::trigger_cp(true); this->get_all(); this->m_shadow_map.save(this->m_shadow_filename); - // this->print_keys("reapply: after preload"); + // this->print_keys("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); - for (uint32_t round = 1; - round <= rounds && !time_to_stop() && this->tree_key_count() > num_entries_per_rounds; round++) { + for (uint32_t round = 1; round <= rounds && !time_to_stop() && this->tree_key_count() >= num_entries_per_rounds; + round++) { LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); + // this->print_keys(fmt::format("before round {}",round)); bool print_time = false; elapsed_time = get_elapsed_time_sec(m_start_time); if (load_mode) { @@ -832,7 +846,7 @@ TYPED_TEST(IndexCrashTest, long_running_remove_crash) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); } } - LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + // LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); for (auto [k, _] : operations) { this->remove_one(k, true /* expect_success */); if (!time_to_stop()) { @@ -852,7 +866,7 @@ TYPED_TEST(IndexCrashTest, long_running_remove_crash) { this->get_all(); } } else { - this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); + this->crash_and_recover(flip, operations, fmt::format("long_tree_{}", round)); } if (elapsed_time - last_progress_time > 30) { last_progress_time = elapsed_time; @@ -868,13 +882,17 @@ TYPED_TEST(IndexCrashTest, long_running_remove_crash) { // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } + this->print_keys(fmt::format("tree at end")); + this->destroy_btree(); + log_obj_life_counter(); } // Basic reverse and forward order remove with different flip points TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { vector< std::string > flip_points = { - "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", - // "crash_flush_on_freed_child", + "crash_flush_on_merge_at_parent", + "crash_flush_on_merge_at_left_child", + "crash_flush_on_freed_child", }; for (size_t i = 0; i < flip_points.size(); ++i) { @@ -884,7 +902,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); // Populate some keys [1,num_entries) and trigger cp to persist - LOGINFO("Step {}-1: Populate some keys and flush", i + 1); + LOGINFO("Step {}-0: Populate some keys and flush", i + 1); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); @@ -892,10 +910,8 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { test_common::HSTestHelper::trigger_cp(true); this->m_shadow_map.save(this->m_shadow_filename); - this->visualize_keys("tree_merge_full.dot"); - // Split keys into batches and remove the last one in reverse order - LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-1: Set crash flag {}", i + 1, flip_point); int batch_num = 4; { int n = batch_num; @@ -905,20 +921,21 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { for (auto k = r; k >= l; --k) { ops.emplace_back(k, OperationType::Remove); } - LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); - + LOGINFO("Step {}-1-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + this->print_keys(fmt::format("Print before Step {}-1-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, + batch_num, r, l)); this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } - this->visualize_keys("tree_merge_before_first_crash.dot"); - - LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); - this->crash_and_recover(ops); + LOGINFO("Step {}-1-2: Trigger cp to crash", i + 1); + this->crash_and_recover(flip_point, ops); } + this->print_keys(fmt::format("Print after recover Step {}1--3: flip {}", i + 1, flip_point)); // Remove the next batch of keys in forward order - LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-2: Set crash flag {}", i + 1, flip_point); + { int n = batch_num - 1; auto r = num_entries * n / batch_num - 1; auto l = num_entries * (n - 1) / batch_num; @@ -926,21 +943,47 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { for (auto k = l; k <= r; ++k) { ops.emplace_back(k, OperationType::Remove); } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + this->print_keys(fmt::format("Print before Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, + batch_num, l, r)); + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(flip_point, ops); + } + this->print_keys(fmt::format("Print after recover Step {}-2-3: flip {}", i + 1, flip_point)); + + // Remove the next batch of keys in random order + LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-3: Set crash flag {}", i + 1, flip_point); + { + int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } - this->visualize_keys("tree_merge_before_second_crash.dot"); + this->print_keys(fmt::format("Print before Step {}-3: Remove keys in batch {}/{} ({} to {})", i + 1, n, + batch_num, l, r)); LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); - this->crash_and_recover(ops); + this->crash_and_recover(flip_point, ops); } + this->print_keys(fmt::format("Print after recover Step {}-3-3: flip {}", i + 1, flip_point)); // Remove the next batch of keys in random order - LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { - int n = batch_num - 2; + LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-4: Set crash flag {} Remove another batch in ascending order", + i + 1, flip_point); + { + int n = batch_num - 3; auto r = num_entries * n / batch_num - 1; auto l = num_entries * (n - 1) / batch_num; SequenceGenerator generator(0, 100, l, r); @@ -948,21 +991,17 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { OperationList ops = generator.generateOperations(r - l + 1, false); LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); - + this->print_keys(fmt::format("Print before Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, + batch_num, l, r)); this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } - this->visualize_keys("tree_merge_before_third_crash.dot"); - LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); - this->crash_and_recover(ops); + this->crash_and_recover(flip_point, ops); } + this->print_keys(fmt::format("Print after recover Step {}-4-3: flip {}", i + 1, flip_point)); - LOGINFO("Step {}-5: Cleanup the tree", i + 1); - for (auto k = 0u; k < num_entries; ++k) { - this->remove_one(k, false); - } test_common::HSTestHelper::trigger_cp(true); this->get_all(); } @@ -1104,11 +1143,14 @@ int main(int argc, char* argv[]) { SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_index_crash_recovery, iomgr, test_common_setup); sisl::logging::SetLogger("test_index_crash_recovery"); spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v"); - if (SISL_OPTIONS.count("seed")) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. Using randomly generated seed: {}", seed); + g_re.seed(seed); } #ifdef _PRERELEASE diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 141fcf5e2..50f8df9cd 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -42,6 +42,8 @@ SISL_OPTION_GROUP( (num_entries, "", "num_entries", "number of entries to test with", ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), + (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("127"), + ""), (num_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), (num_fibers, "", "num_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", @@ -107,13 +109,18 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { #ifdef _PRERELEASE this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); #endif + this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); + this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); + //if TestType is PrefixIntervalBtreeTest print here something + if constexpr (std::is_same_v) { + this->m_cfg.m_merge_turned_on = false; + } this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } }; -// TODO Enable PrefixIntervalBtreeTest later -using BtreeTypes = testing::Types< /* PrefixIntervalBtreeTest, */ FixedLenBtreeTest, VarKeySizeBtreeTest, - VarValueSizeBtreeTest, VarObjSizeBtreeTest >; +using BtreeTypes = testing::Types< FixedLenBtreeTest, VarKeySizeBtreeTest, + VarValueSizeBtreeTest, VarObjSizeBtreeTest, PrefixIntervalBtreeTest >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); TYPED_TEST(BtreeTest, SequentialInsert) { @@ -308,6 +315,11 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin #ifdef _PRERELEASE this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); #endif + this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); + this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); + if constexpr (std::is_same_v) { + this->m_cfg.m_merge_turned_on = false; + } this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 564bd61c5..54059cf0a 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -21,8 +21,8 @@ def run_test(options, type): print("Test completed") -def run_crash_test(options): - cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " +def run_crash_test(options, crash_type='put', type=0): + cmd_opts = f"--gtest_filter=IndexCrashTest/{type}.long_running_{crash_type}_crash --gtest_break_on_failure --min_keys_in_node={options['min_keys_in_node']} --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " # print(f"Running test with options: {cmd_opts}") try: subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, @@ -99,7 +99,19 @@ def long_running_crash_put(options): options['run_time'] = 14400 # 4 hours options['preload_size'] = 1024 print(f"options: {options}") - run_crash_test(options) + run_crash_test(options, 'put', 0) + print("Long running crash put completed") + +def long_running_crash_remove(options): + print("Long running crash remove started") + options['num_entries'] = 1000 + options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['num_entries_per_rounds'] = 100 + options['min_keys_in_node'] = 2 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'remove', 0) print("Long running crash put completed") @@ -120,9 +132,14 @@ def main(): def long_running(*args): options = parse_arguments() + for i in range(50): + print(f"Iteration {i + 1}") + long_running_crash_remove(options) + for i in range(5): + print(f"Iteration {i + 1}") + long_running_crash_put(options) long_runnig_index(options) long_running_clean_shutdown(options) - long_running_crash_put(options) if __name__ == "__main__": From 1713af1094744338b9761a96152894f48d489070 Mon Sep 17 00:00:00 2001 From: shosseinimotlagh Date: Fri, 25 Apr 2025 15:11:32 -0700 Subject: [PATCH 3/5] bump up conan version --- src/lib/index/wb_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index f0818f45e..66abd4b37 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -816,8 +816,8 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { } void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { -#ifdef _PRERELEASE static std::once_flag flag; +#ifdef _PRERELEASE if (hs()->crash_simulator().is_crashed()) { std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); return; From 700e2dc1e02217b2c0a022de3a00d64b3f048519 Mon Sep 17 00:00:00 2001 From: shosseinimotlagh Date: Mon, 28 Apr 2025 10:06:55 -0700 Subject: [PATCH 4/5] Address comments --- src/include/homestore/index/index_table.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 06703e41f..31c793bdf 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -622,7 +622,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); } else { - auto tsib_id = findTrueSibling(cur_parent); + auto tsib_id = find_true_sibling(cur_parent); if (tsib_id != empty_bnodeid) { cur_parent->set_next_bnode(tsib_id); LOGTRACEMOD(wbcache, @@ -857,7 +857,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return ret; } - bnodeid_t findTrueSibling(BtreeNodePtr const& node) { + bnodeid_t find_true_sibling(BtreeNodePtr const& node) { if (node == nullptr) return empty_bnodeid; bnodeid_t sibling_id = empty_bnodeid; if (node->has_valid_edge()) { @@ -873,7 +873,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (sibling_node->is_node_deleted()) { LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", sibling_node->to_string(), node->to_string()); - return findTrueSibling(sibling_node); + return find_true_sibling(sibling_node); } else { return sibling_id; } From 698b6a1a624961bdfce84107e3ca693adc5535e4 Mon Sep 17 00:00:00 2001 From: shosseinimotlagh Date: Mon, 28 Apr 2025 21:43:31 -0700 Subject: [PATCH 5/5] update conan --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 26c04a6aa..0114dfd0d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.4" + version = "6.12.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine"